datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "var_opt_sketch.hpp"
|
|
21
|
+
#include "var_opt_union.hpp"
|
|
22
|
+
|
|
23
|
+
#include <pybind11/pybind11.h>
|
|
24
|
+
#include <pybind11/functional.h>
|
|
25
|
+
#include <sstream>
|
|
26
|
+
|
|
27
|
+
namespace py = pybind11;
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
namespace python {
|
|
31
|
+
|
|
32
|
+
template<typename T>
|
|
33
|
+
py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
|
|
34
|
+
py::list list;
|
|
35
|
+
for (auto& item : sk) {
|
|
36
|
+
py::tuple t = py::make_tuple(item.first, item.second);
|
|
37
|
+
list.append(t);
|
|
38
|
+
}
|
|
39
|
+
return list;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
template<typename T>
|
|
43
|
+
py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
|
|
44
|
+
subset_summary summary = sk.estimate_subset_sum(func);
|
|
45
|
+
py::dict d;
|
|
46
|
+
d["estimate"] = summary.estimate;
|
|
47
|
+
d["lower_bound"] = summary.lower_bound;
|
|
48
|
+
d["upper_bound"] = summary.upper_bound;
|
|
49
|
+
d["total_sketch_weight"] = summary.total_sketch_weight;
|
|
50
|
+
return d;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
template<typename T>
|
|
54
|
+
std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
|
|
55
|
+
if (print_items) {
|
|
56
|
+
std::ostringstream ss;
|
|
57
|
+
ss << sk.to_string();
|
|
58
|
+
ss << "### VarOpt Sketch Items" << std::endl;
|
|
59
|
+
int i = 0;
|
|
60
|
+
for (auto& item : sk) {
|
|
61
|
+
// item.second is always a double
|
|
62
|
+
// item.first is an arbitrary py::object, so get the value by
|
|
63
|
+
// using internal str() method then casting to C++ std::string
|
|
64
|
+
py::str item_pystr(item.first);
|
|
65
|
+
std::string item_str = py::cast<std::string>(item_pystr);
|
|
66
|
+
// item.second is guaranteed to be a double
|
|
67
|
+
ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
|
|
68
|
+
}
|
|
69
|
+
return ss.str();
|
|
70
|
+
} else {
|
|
71
|
+
return sk.to_string();
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
namespace dspy = datasketches::python;
|
|
79
|
+
|
|
80
|
+
template<typename T>
|
|
81
|
+
void bind_vo_sketch(py::module &m, const char* name) {
|
|
82
|
+
using namespace datasketches;
|
|
83
|
+
|
|
84
|
+
py::class_<var_opt_sketch<T>>(m, name)
|
|
85
|
+
.def(py::init<uint32_t>(), py::arg("k"))
|
|
86
|
+
.def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
|
|
87
|
+
"Produces a string summary of the sketch")
|
|
88
|
+
.def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
|
|
89
|
+
"Produces a string summary of the sketch")
|
|
90
|
+
.def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
|
|
91
|
+
"Updates the sketch with the given value and weight")
|
|
92
|
+
.def_property_readonly("k", &var_opt_sketch<T>::get_k,
|
|
93
|
+
"Returns the sketch's maximum configured sample size")
|
|
94
|
+
.def_property_readonly("n", &var_opt_sketch<T>::get_n,
|
|
95
|
+
"Returns the total stream length")
|
|
96
|
+
.def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
|
|
97
|
+
"Returns the number of samples currently in the sketch")
|
|
98
|
+
.def("get_samples", &dspy::vo_sketch_get_samples<T>,
|
|
99
|
+
"Retyrns the set of samples in the sketch")
|
|
100
|
+
.def("is_empty", &var_opt_sketch<T>::is_empty,
|
|
101
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
102
|
+
.def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
|
|
103
|
+
"Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
|
|
104
|
+
"as upper and lower bounds on the estimate and the total weight processed by the sketch")
|
|
105
|
+
// As of writing, not yet clear how to serialize arbitrary python objects,
|
|
106
|
+
// especially in any sort of language-portable way
|
|
107
|
+
//.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
|
|
108
|
+
//.def("serialize", &dspy::vo_sketch_serialize<T>)
|
|
109
|
+
//.def_static("deserialize", &dspy::vo_sketch_deserialize<T>)
|
|
110
|
+
;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
template<typename T>
|
|
114
|
+
void bind_vo_union(py::module &m, const char* name) {
|
|
115
|
+
using namespace datasketches;
|
|
116
|
+
|
|
117
|
+
py::class_<var_opt_union<T>>(m, name)
|
|
118
|
+
.def(py::init<uint32_t>(), py::arg("max_k"))
|
|
119
|
+
.def("__str__", &var_opt_union<T>::to_string,
|
|
120
|
+
"Produces a string summary of the sketch")
|
|
121
|
+
.def("to_string", &var_opt_union<T>::to_string,
|
|
122
|
+
"Produces a string summary of the sketch")
|
|
123
|
+
.def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
|
|
124
|
+
"Updates the union with the given sketch")
|
|
125
|
+
.def("get_result", &var_opt_union<T>::get_result,
|
|
126
|
+
"Returns a sketch corresponding to the union result")
|
|
127
|
+
.def("reset", &var_opt_union<T>::reset,
|
|
128
|
+
"Resets the union to the empty state")
|
|
129
|
+
// As of writing, not yet clear how to serialize arbitrary python objects,
|
|
130
|
+
// especially in any sort of language-portable way
|
|
131
|
+
//.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
|
|
132
|
+
//.def("serialize", &dspy::vo_union_serialize<T>)
|
|
133
|
+
//.def_static("deserialize", &dspy::vo_union_deserialize<T>)
|
|
134
|
+
;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
 * Adds the VarOpt classes to the module: "var_opt_sketch" and
 * "var_opt_union", both instantiated with py::object as the item type.
 *
 * @param m module that receives the two classes
 */
void init_vo(py::module &m) {
  using item_type = py::object;

  bind_vo_sketch<item_type>(m, "var_opt_sketch");
  bind_vo_union<item_type>(m, "var_opt_union");
}
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import cpc_sketch, cpc_union
|
|
20
|
+
|
|
21
|
+
class CpcTest(unittest.TestCase):
    """Walk-through of the CPC sketch and union Python bindings."""

    def test_cpc_example(self):
        """Fill two overlapping sketches, union them and round-trip the result."""
        lg_k = 12  # 2^k = 4096 rows in the table
        num_values = 1 << 18  # ~256k unique values

        # Build two sketches whose inputs share 1/4 of their values.
        first = cpc_sketch(lg_k)
        second = cpc_sketch(lg_k)
        shift = 3 * num_values // 4  # integer division keeps this an int

        # Hashing works on the bits rather than on an abstract numeric value,
        # so update(1) and update(1.0) give different results.
        for value in range(num_values):
            first.update(value)
            second.update(value + shift)

        # get_composite_estimate() exists as well, but get_estimate() always
        # gives the best available estimate and is the recommended call.
        # The bounds must bracket the estimate regardless of the exact value.
        self.assertLessEqual(first.get_lower_bound(1), first.get_estimate())
        self.assertGreaterEqual(first.get_upper_bound(1), first.get_estimate())

        # Unions live in a separate class; get_result() is needed
        # to query the unioned sketches.
        merger = cpc_union(lg_k)
        for sketch in (first, second):
            merger.update(sketch)
        result = merger.get_result()

        # The whole process (including post-union CPC) is deterministic, and
        # the exact answer is known to lie within one standard deviation of
        # the estimate.
        true_count = 7 * num_values / 4
        self.assertLessEqual(result.get_lower_bound(1), true_count)
        self.assertGreaterEqual(result.get_upper_bound(1), true_count)

        # Serialize for storage and reconstruct.
        restored = cpc_sketch.deserialize(result.serialize())
        self.assertFalse(restored.is_empty())
|
|
62
|
+
|
|
63
|
+
# Run the test cases in this file when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import frequent_strings_sketch, frequent_items_error_type
|
|
20
|
+
|
|
21
|
+
class FiTest(unittest.TestCase):
|
|
22
|
+
def test_fi_example(self):
|
|
23
|
+
k = 3 # a small value so we can easily fill the sketch
|
|
24
|
+
fi = frequent_strings_sketch(k)
|
|
25
|
+
|
|
26
|
+
# we'll use a small number of distinct items so we
|
|
27
|
+
# can use exponentially increasing weights and have
|
|
28
|
+
# some frequent items, decreasing so we have some
|
|
29
|
+
# small items inserted after a purge
|
|
30
|
+
n = 8
|
|
31
|
+
for i in range(0, n):
|
|
32
|
+
fi.update(str(i), 2 ** (n - i))
|
|
33
|
+
|
|
34
|
+
# there are two ways to extract items :
|
|
35
|
+
# * NO_FALSE_POSITIVES includes all items with a lower bound
|
|
36
|
+
# above the a posteriori error
|
|
37
|
+
# * NO_FALSE_NEGATIVES includes all items with an uper bound
|
|
38
|
+
# above the a posteriori error
|
|
39
|
+
# a more complete discussion may be found at
|
|
40
|
+
# https://datasketches.github.io/docs/Frequency/FrequentItemsOverview.html
|
|
41
|
+
items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
|
|
42
|
+
items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
|
|
43
|
+
self.assertLessEqual(len(items_no_fp), len(items_no_fn))
|
|
44
|
+
|
|
45
|
+
# the items list returns a decreasing weight-sorted list, and
|
|
46
|
+
# for each item we have (item, estimate, lower_bound, upper_bound)
|
|
47
|
+
item = items_no_fp[1]
|
|
48
|
+
self.assertLessEqual(item[2], item[1]) # lower bound vs estimate
|
|
49
|
+
self.assertLessEqual(item[1], item[3]) # estimate vs upper bound
|
|
50
|
+
|
|
51
|
+
# we can also query directly for a specific item
|
|
52
|
+
id = items_no_fn[0][0]
|
|
53
|
+
est = fi.get_estimate(id)
|
|
54
|
+
lb = fi.get_lower_bound(id)
|
|
55
|
+
ub = fi.get_upper_bound(id)
|
|
56
|
+
self.assertLessEqual(lb, est)
|
|
57
|
+
self.assertLessEqual(est, ub)
|
|
58
|
+
|
|
59
|
+
# the values are zero if the item isn't in our list
|
|
60
|
+
self.assertEqual(fi.get_estimate("NaN"), 0)
|
|
61
|
+
|
|
62
|
+
# now create a second sketch with a lot of unique
|
|
63
|
+
# values but all with equal weight (of 1) such that
|
|
64
|
+
# the total weight is much larger than the first sketch
|
|
65
|
+
fi2 = frequent_strings_sketch(k)
|
|
66
|
+
wt = fi.get_total_weight()
|
|
67
|
+
for i in range(0, 4*wt):
|
|
68
|
+
fi2.update(str(i))
|
|
69
|
+
|
|
70
|
+
# merge the second sketch into the first
|
|
71
|
+
fi.merge(fi2)
|
|
72
|
+
|
|
73
|
+
# we can see that the weight is much larger
|
|
74
|
+
self.assertEqual(5 * wt, fi.get_total_weight())
|
|
75
|
+
|
|
76
|
+
# querying with NO_FALSE_POSITIVES means we don't find anything
|
|
77
|
+
# heavy enough to return
|
|
78
|
+
items_no_fp = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_POSITIVES)
|
|
79
|
+
self.assertEqual(len(items_no_fp), 0)
|
|
80
|
+
|
|
81
|
+
# we do, however, find a few potential heavy items
|
|
82
|
+
# if querying with NO_FALSE_NEGATIVES
|
|
83
|
+
items_no_fn = fi.get_frequent_items(frequent_items_error_type.NO_FALSE_NEGATIVES)
|
|
84
|
+
self.assertGreater(len(items_no_fn), 0)
|
|
85
|
+
|
|
86
|
+
# finally, serialize and reconstruct
|
|
87
|
+
fi_bytes = fi.serialize()
|
|
88
|
+
self.assertEqual(len(fi_bytes), fi.get_serialized_size_bytes())
|
|
89
|
+
new_fi = frequent_strings_sketch.deserialize(fi_bytes)
|
|
90
|
+
|
|
91
|
+
# and now interrogate the sketch
|
|
92
|
+
self.assertFalse(new_fi.is_empty())
|
|
93
|
+
self.assertGreater(new_fi.get_num_active_items(), 0)
|
|
94
|
+
self.assertEqual(5 * wt, new_fi.get_total_weight())
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_fi_sketch(self):
    """Check the epsilon and a-priori error accessors of frequent_strings_sketch.

    Only covers calls that the example test above does not already use.
    """
    sketch_k, stream_weight = 12, 10000
    sketch = frequent_strings_sketch(sketch_k)

    # epsilon reported by a freshly constructed sketch built with k = 12
    self.assertAlmostEqual(sketch.get_sketch_epsilon(), 0.0008545, delta=1e-6)

    # epsilon scaled by the stream weight must agree with the value
    # returned by the class-level get_apriori_error(k, weight) helper
    scaled_epsilon = sketch.get_sketch_epsilon() * stream_weight
    expected_error = frequent_strings_sketch.get_apriori_error(sketch_k, stream_weight)
    self.assertAlmostEqual(scaled_epsilon, expected_error, delta=1e-6)
|
|
108
|
+
|
|
109
|
+
# Run the test cases in this file when it is executed directly as a script.
if __name__ == '__main__':
|
|
110
|
+
unittest.main()
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
import unittest
|
|
19
|
+
from datasketches import hll_sketch, hll_union, tgt_hll_type
|
|
20
|
+
|
|
21
|
+
class HllTest(unittest.TestCase):
    """Tests for the hll_sketch and hll_union Python bindings."""

    def test_hll_example(self):
        """Worked example: fill two sketches, union them, serialize, reset."""
        k = 12       # 2^k = 4096 rows in the table
        n = 1 << 18  # ~256k unique values

        # create a couple sketches and inject some values
        # we'll have 1/4 of the values overlap
        hll = hll_sketch(k, tgt_hll_type.HLL_8)
        hll2 = hll_sketch(k, tgt_hll_type.HLL_6)
        offset = 3 * n // 4  # floor division keeps this an int, no cast needed
        # because we hash on the bits, not an abstract numeric value,
        # hll.update(1) and hll.update(1.0) give different results.
        for i in range(n):
            hll.update(i)
            hll2.update(i + offset)

        # we can check that the upper and lower bounds bracket the
        # estimate, without needing to know the exact value.
        self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
        self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())

        # unioning uses a separate class, and we can either get a result
        # sketch or query the union object directly
        union = hll_union(k)
        union.update(hll)
        union.update(hll2)
        result = union.get_result()
        self.assertEqual(result.get_estimate(), union.get_estimate())

        # since our process here (including post-union HLL) is
        # deterministic, we have checked and know the exact
        # answer is within one standard deviation of the estimate
        self.assertLessEqual(union.get_lower_bound(1), 7 * n / 4)
        self.assertGreaterEqual(union.get_upper_bound(1), 7 * n / 4)

        # serialize for storage and reconstruct
        sk_bytes = result.serialize_compact()
        self.assertEqual(len(sk_bytes), result.get_compact_serialization_bytes())
        new_hll = hll_sketch.deserialize(sk_bytes)

        # the sketch can self-report its configuration and status
        self.assertEqual(new_hll.lg_config_k, k)
        self.assertEqual(new_hll.tgt_type, tgt_hll_type.HLL_4)
        self.assertFalse(new_hll.is_empty())

        # if we want to reduce some object overhead, we can also reset
        new_hll.reset()
        self.assertTrue(new_hll.is_empty())

    def test_hll_sketch(self):
        """Exercise hll_sketch methods that the example test does not touch."""
        k = 8
        n = 117
        hll = self.generate_sketch(n, k, tgt_hll_type.HLL_6)
        hll.update('string data')
        hll.update(3.14159)  # double data

        self.assertLessEqual(hll.get_lower_bound(1), hll.get_estimate())
        self.assertGreaterEqual(hll.get_upper_bound(1), hll.get_estimate())

        self.assertEqual(hll.lg_config_k, k)
        self.assertEqual(hll.tgt_type, tgt_hll_type.HLL_6)

        # both serialized forms must match the sizes the sketch reports
        bytes_compact = hll.serialize_compact()
        bytes_update = hll.serialize_updatable()
        self.assertEqual(len(bytes_compact), hll.get_compact_serialization_bytes())
        self.assertEqual(len(bytes_update), hll.get_updatable_serialization_bytes())

        self.assertFalse(hll.is_compact())
        self.assertFalse(hll.is_empty())

        # either serialized form deserializes back into an hll_sketch;
        # assertIsInstance reports the actual type if this ever fails
        self.assertIsInstance(hll_sketch.deserialize(bytes_compact), hll_sketch)
        self.assertIsInstance(hll_sketch.deserialize(bytes_update), hll_sketch)

        # the class-level helpers only need to return something here
        self.assertIsNotNone(hll_sketch.get_rel_err(True, False, 12, 1))
        self.assertIsNotNone(hll_sketch.get_max_updatable_serialization_bytes(20, tgt_hll_type.HLL_6))

        hll.reset()
        self.assertTrue(hll.is_empty())

    def test_hll_union(self):
        """Exercise hll_union with sketch inputs and raw item updates."""
        k = 7
        n = 53
        union = hll_union(k)

        # two sketches over adjacent, non-overlapping integer ranges
        sk = self.generate_sketch(n, k, tgt_hll_type.HLL_4, 0)
        union.update(sk)
        sk = self.generate_sketch(3 * n, k, tgt_hll_type.HLL_4, n)
        union.update(sk)
        # the union also accepts individual items directly
        union.update('string data')
        union.update(1.4142136)

        self.assertLessEqual(union.get_lower_bound(1), union.get_estimate())
        self.assertGreaterEqual(union.get_upper_bound(1), union.get_estimate())

        self.assertEqual(union.lg_config_k, k)
        self.assertFalse(union.is_compact())
        self.assertFalse(union.is_empty())

        sk = union.get_result()
        self.assertIsInstance(sk, hll_sketch)
        self.assertEqual(sk.tgt_type, tgt_hll_type.HLL_4)

    def generate_sketch(self, n, k, sk_type=tgt_hll_type.HLL_4, st_idx=0):
        """Return an hll_sketch(k, sk_type) updated with n consecutive integers.

        The integers fed to the sketch are st_idx, st_idx + 1, ...,
        st_idx + n - 1.
        """
        sk = hll_sketch(k, sk_type)
        for i in range(st_idx, st_idx + n):
            sk.update(i)
        return sk
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# Run the test cases in this file when it is executed directly as a script.
if __name__ == '__main__':
|
|
131
|
+
unittest.main()
|