datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <pybind11/pybind11.h>
|
|
21
|
+
|
|
22
|
+
namespace py = pybind11;
|
|
23
|
+
|
|
24
|
+
void init_hll(py::module& m);
|
|
25
|
+
void init_kll(py::module& m);
|
|
26
|
+
void init_fi(py::module& m);
|
|
27
|
+
void init_cpc(py::module& m);
|
|
28
|
+
void init_theta(py::module& m);
|
|
29
|
+
void init_vo(py::module& m);
|
|
30
|
+
void init_vector_of_kll(py::module& m);
|
|
31
|
+
|
|
32
|
+
PYBIND11_MODULE(datasketches, m) {
|
|
33
|
+
init_hll(m);
|
|
34
|
+
init_kll(m);
|
|
35
|
+
init_fi(m);
|
|
36
|
+
init_cpc(m);
|
|
37
|
+
init_theta(m);
|
|
38
|
+
init_vo(m);
|
|
39
|
+
init_vector_of_kll(m);
|
|
40
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "frequent_items_sketch.hpp"
|
|
21
|
+
|
|
22
|
+
#include <pybind11/pybind11.h>
|
|
23
|
+
#include <sstream>
|
|
24
|
+
|
|
25
|
+
namespace py = pybind11;
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
namespace python {
|
|
29
|
+
|
|
30
|
+
template<typename T>
|
|
31
|
+
frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
|
|
32
|
+
std::string skStr = skBytes; // implicit cast
|
|
33
|
+
return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
template<typename T>
|
|
37
|
+
py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
|
|
38
|
+
auto serResult = sk.serialize();
|
|
39
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// maybe possible to disambiguate the static vs method get_epsilon calls, but
|
|
43
|
+
// this is easier for now
|
|
44
|
+
template<typename T>
|
|
45
|
+
double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
|
|
46
|
+
return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
template<typename T>
|
|
50
|
+
py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
51
|
+
frequent_items_error_type err_type,
|
|
52
|
+
uint64_t threshold = 0) {
|
|
53
|
+
if (threshold == 0) { threshold = sk.get_maximum_error(); }
|
|
54
|
+
|
|
55
|
+
py::list list;
|
|
56
|
+
auto items = sk.get_frequent_items(err_type, threshold);
|
|
57
|
+
for (auto iter = items.begin(); iter != items.end(); ++iter) {
|
|
58
|
+
py::tuple t = py::make_tuple(iter->get_item(),
|
|
59
|
+
iter->get_estimate(),
|
|
60
|
+
iter->get_lower_bound(),
|
|
61
|
+
iter->get_upper_bound());
|
|
62
|
+
list.append(t);
|
|
63
|
+
}
|
|
64
|
+
return list;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
namespace dspy = datasketches::python;
|
|
71
|
+
|
|
72
|
+
template<typename T>
|
|
73
|
+
void bind_fi_sketch(py::module &m, const char* name) {
|
|
74
|
+
using namespace datasketches;
|
|
75
|
+
|
|
76
|
+
py::class_<frequent_items_sketch<T>>(m, name)
|
|
77
|
+
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
|
78
|
+
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
|
79
|
+
"Produces a string summary of the sketch")
|
|
80
|
+
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
|
81
|
+
"Produces a string summary of the sketch")
|
|
82
|
+
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
|
|
83
|
+
"Updates the sketch with the given string and, optionally, a weight")
|
|
84
|
+
.def("get_frequent_items", &dspy::fi_sketch_get_frequent_items<T>, py::arg("err_type"), py::arg("threshold")=0)
|
|
85
|
+
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
|
|
86
|
+
"Merges the given sketch into this one")
|
|
87
|
+
.def("is_empty", &frequent_items_sketch<T>::is_empty,
|
|
88
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
89
|
+
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
|
|
90
|
+
"Returns the number of active items in the sketch")
|
|
91
|
+
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
|
|
92
|
+
"Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
|
|
93
|
+
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
|
|
94
|
+
"Returns the estimate of the weight (frequency) of the given item.\n"
|
|
95
|
+
"Note: The true frequency of a item would be the sum of the counts as a result of the "
|
|
96
|
+
"two update functions.")
|
|
97
|
+
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
|
|
98
|
+
"Returns the guaranteed lower bound weight (frequency) of the given item.")
|
|
99
|
+
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
|
|
100
|
+
"Returns the guaranteed upper bound weight (frequency) of the given item.")
|
|
101
|
+
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
|
|
102
|
+
"Returns the epsilon value used by the sketch to compute error")
|
|
103
|
+
.def_static("get_epsilon_for_lg_size", &dspy::fi_sketch_get_generic_epsilon<T>, py::arg("lg_max_map_size"),
|
|
104
|
+
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
|
105
|
+
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
|
106
|
+
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
|
107
|
+
.def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes,
|
|
108
|
+
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
|
109
|
+
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
|
110
|
+
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|
|
111
|
+
;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Adds the frequent-items bindings to module `m`: the error-type enum and the
// std::string instantiation of the sketch, exposed as "frequent_strings_sketch".
void init_fi(py::module &m) {
  using namespace datasketches;

  py::enum_<frequent_items_error_type> error_type(m, "frequent_items_error_type");
  error_type.value("NO_FALSE_POSITIVES", NO_FALSE_POSITIVES);
  error_type.value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES);
  // Also publish the enumerators as attributes of the module itself.
  error_type.export_values();

  bind_fi_sketch<std::string>(m, "frequent_strings_sketch");
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "hll.hpp"
|
|
21
|
+
|
|
22
|
+
#include <pybind11/pybind11.h>
|
|
23
|
+
|
|
24
|
+
namespace py = pybind11;
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
namespace python {
|
|
28
|
+
|
|
29
|
+
// Rebuilds an hll_sketch from the contents of a Python bytes object.
hll_sketch hll_sketch_deserialize(py::bytes skBytes) {
  // py::bytes converts implicitly to std::string; the string owns the copy
  // that deserialize() reads from.
  const std::string raw = skBytes;
  return hll_sketch::deserialize(raw.data(), raw.size());
}
|
|
33
|
+
|
|
34
|
+
// Serializes the sketch with serialize_compact() and returns the buffer as Python bytes.
py::object hll_sketch_serialize_compact(const hll_sketch& sk) {
  const auto payload = sk.serialize_compact();
  // py::bytes is built from a char pointer plus an explicit length.
  const char* const first = reinterpret_cast<const char*>(payload.data());
  return py::bytes(first, payload.size());
}
|
|
38
|
+
|
|
39
|
+
// Serializes the sketch with serialize_updatable() and returns the buffer as Python bytes.
py::object hll_sketch_serialize_updatable(const hll_sketch& sk) {
  const auto payload = sk.serialize_updatable();
  // py::bytes is built from a char pointer plus an explicit length.
  const char* const first = reinterpret_cast<const char*>(payload.data());
  return py::bytes(first, payload.size());
}
|
|
43
|
+
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
namespace dspy = datasketches::python;
|
|
48
|
+
|
|
49
|
+
// Adds the HLL bindings to module `m`: the tgt_hll_type enum, the hll_sketch
// class and the hll_union class. Overloaded members (to_string, update) are
// selected with static_cast to the exact member-function-pointer type.
void init_hll(py::module &m) {
  using namespace datasketches;

  py::enum_<target_hll_type>(m, "tgt_hll_type", "Target HLL flavor")
    .value("HLL_4", HLL_4)
    .value("HLL_6", HLL_6)
    .value("HLL_8", HLL_8)
    .export_values();

  py::class_<hll_sketch>(m, "hll_sketch")
    .def(py::init<int>(), py::arg("lg_k"))
    .def(py::init<int, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
    .def(py::init<int, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
    .def_static("deserialize", &dspy::hll_sketch_deserialize,
         "Reads a bytes object and returns the corresponding hll_sketch")
    .def("serialize_compact", &dspy::hll_sketch_serialize_compact,
         "Serializes the sketch into a bytes object, compressing the exception table if HLL_4")
    .def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
         "Serializes the sketch into a bytes object")
    .def("__str__", static_cast<std::string (hll_sketch::*)(bool, bool, bool, bool) const>(&hll_sketch::to_string),
         py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
         "Produces a string summary of the sketch")
    .def("to_string", static_cast<std::string (hll_sketch::*)(bool, bool, bool, bool) const>(&hll_sketch::to_string),
         py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
         "Produces a string summary of the sketch")
    .def_property_readonly("lg_config_k", &hll_sketch::get_lg_config_k, "Configured lg_k value for the sketch")
    .def_property_readonly("tgt_type", &hll_sketch::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
    .def("get_estimate", &hll_sketch::get_estimate,
         "Estimate of the distinct count of the input stream")
    .def("get_lower_bound", &hll_sketch::get_lower_bound, py::arg("num_std_devs"),
         "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
    .def("get_upper_bound", &hll_sketch::get_upper_bound, py::arg("num_std_devs"),
         "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
    .def("is_compact", &hll_sketch::is_compact,
         "True if the sketch is compact, otherwise False")
    .def("is_empty", &hll_sketch::is_empty,
         "True if the sketch is empty, otherwise False")
    .def("get_updatable_serialization_bytes", &hll_sketch::get_updatable_serialization_bytes,
         "Returns the size of the serialized sketch")
    .def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
         "Returns the size of the serialized sketch when compressing the exception table if HLL_4")
    .def("reset", &hll_sketch::reset,
         "Resets the sketch to the empty state in coupon collection mode")
    // One Python-level "update" per C++ overload; pybind11 dispatches on argument type.
    .def("update", static_cast<void (hll_sketch::*)(int64_t)>(&hll_sketch::update), py::arg("datum"),
         "Updates the sketch with the given integral value")
    .def("update", static_cast<void (hll_sketch::*)(double)>(&hll_sketch::update), py::arg("datum"),
         "Updates the sketch with the given floating point value")
    .def("update", static_cast<void (hll_sketch::*)(const std::string&)>(&hll_sketch::update), py::arg("datum"),
         "Updates the sketch with the given string value")
    .def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
         py::arg("lg_k"), py::arg("tgt_type"),
         "Provides a likely upper bound on serialization size for the given parameters")
    .def_static("get_rel_err", &hll_sketch::get_rel_err,
         py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
         "Returns the a priori relative error bound for the given parameters")
    ;

  py::class_<hll_union>(m, "hll_union")
    .def(py::init<int>(), py::arg("lg_max_k"))
    .def_property_readonly("lg_config_k", &hll_union::get_lg_config_k, "Configured lg_k value for the union")
    .def_property_readonly("tgt_type", &hll_union::get_target_type, "Returns the HLL type (4, 6, or 8) when in estimation mode")
    .def("get_estimate", &hll_union::get_estimate,
         "Estimate of the distinct count of the input stream")
    .def("get_lower_bound", &hll_union::get_lower_bound, py::arg("num_std_devs"),
         "Returns the approximate lower error bound given the specified number of standard deviations in {1, 2, 3}")
    .def("get_upper_bound", &hll_union::get_upper_bound, py::arg("num_std_devs"),
         "Returns the approximate upper error bound given the specified number of standard deviations in {1, 2, 3}")
    .def("is_compact", &hll_union::is_compact,
         "True if the union is compact, otherwise False")
    .def("is_empty", &hll_union::is_empty,
         "True if the union is empty, otherwise False")
    .def("reset", &hll_union::reset,
         "Resets the union to the empty state")
    .def("get_result", &hll_union::get_result, py::arg("tgt_type")=HLL_4,
         "Returns a sketch of the target type representing the current union state")
    // One Python-level "update" per C++ overload; pybind11 dispatches on argument type.
    .def("update", static_cast<void (hll_union::*)(const hll_sketch&)>(&hll_union::update), py::arg("sketch"),
         "Updates the union with the given HLL sketch")
    .def("update", static_cast<void (hll_union::*)(int64_t)>(&hll_union::update), py::arg("datum"),
         "Updates the union with the given integral value")
    .def("update", static_cast<void (hll_union::*)(double)>(&hll_union::update), py::arg("datum"),
         "Updates the union with the given floating point value")
    .def("update", static_cast<void (hll_union::*)(const std::string&)>(&hll_union::update), py::arg("datum"),
         "Updates the union with the given string value")
    .def_static("get_rel_err", &hll_union::get_rel_err,
         py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
         "Returns the a priori relative error bound for the given parameters")
    ;
}
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "kll_sketch.hpp"
|
|
21
|
+
|
|
22
|
+
#include <pybind11/pybind11.h>
|
|
23
|
+
#include <pybind11/stl.h>
|
|
24
|
+
#include <pybind11/numpy.h>
|
|
25
|
+
#include <sstream>
|
|
26
|
+
#include <vector>
|
|
27
|
+
|
|
28
|
+
namespace py = pybind11;
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
namespace python {
|
|
33
|
+
|
|
34
|
+
template<typename T>
|
|
35
|
+
kll_sketch<T> kll_sketch_deserialize(py::bytes skBytes) {
|
|
36
|
+
std::string skStr = skBytes; // implicit cast
|
|
37
|
+
return kll_sketch<T>::deserialize(skStr.c_str(), skStr.length());
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
template<typename T>
|
|
41
|
+
py::object kll_sketch_serialize(const kll_sketch<T>& sk) {
|
|
42
|
+
auto serResult = sk.serialize();
|
|
43
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// maybe possible to disambiguate the static vs method rank error calls, but
|
|
47
|
+
// this is easier for now
|
|
48
|
+
template<typename T>
|
|
49
|
+
double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
50
|
+
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
template<typename T>
|
|
54
|
+
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
55
|
+
std::vector<double>& fractions) {
|
|
56
|
+
size_t nQuantiles = fractions.size();
|
|
57
|
+
auto result = sk.get_quantiles(&fractions[0], nQuantiles);
|
|
58
|
+
|
|
59
|
+
// returning as std::vector<> would copy values to a list anyway
|
|
60
|
+
py::list list(nQuantiles);
|
|
61
|
+
for (size_t i = 0; i < nQuantiles; ++i) {
|
|
62
|
+
list[i] = result[i];
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
return list;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
template<typename T>
|
|
69
|
+
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
70
|
+
std::vector<T>& split_points) {
|
|
71
|
+
size_t nPoints = split_points.size();
|
|
72
|
+
auto result = sk.get_PMF(&split_points[0], nPoints);
|
|
73
|
+
|
|
74
|
+
py::list list(nPoints + 1);
|
|
75
|
+
for (size_t i = 0; i <= nPoints; ++i) {
|
|
76
|
+
list[i] = result[i];
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
return list;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
template<typename T>
|
|
83
|
+
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
|
84
|
+
std::vector<T>& split_points) {
|
|
85
|
+
size_t nPoints = split_points.size();
|
|
86
|
+
auto result = sk.get_CDF(&split_points[0], nPoints);
|
|
87
|
+
|
|
88
|
+
py::list list(nPoints + 1);
|
|
89
|
+
for (size_t i = 0; i <= nPoints; ++i) {
|
|
90
|
+
list[i] = result[i];
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
return list;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
template<typename T>
|
|
97
|
+
void kll_sketch_update(kll_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
|
|
98
|
+
if (items.ndim() != 1) {
|
|
99
|
+
throw std::invalid_argument("input data must have only one dimension. Found: "
|
|
100
|
+
+ std::to_string(items.ndim()));
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
auto data = items.template unchecked<1>();
|
|
104
|
+
for (uint32_t i = 0; i < data.size(); ++i) {
|
|
105
|
+
sk.update(data(i));
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
namespace dspy = datasketches::python;
|
|
113
|
+
|
|
114
|
+
template<typename T>
|
|
115
|
+
void bind_kll_sketch(py::module &m, const char* name) {
|
|
116
|
+
using namespace datasketches;
|
|
117
|
+
|
|
118
|
+
py::class_<kll_sketch<T>>(m, name)
|
|
119
|
+
.def(py::init<uint16_t>(), py::arg("k")=kll_sketch<T>::DEFAULT_K)
|
|
120
|
+
.def(py::init<const kll_sketch<T>&>())
|
|
121
|
+
.def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
|
|
122
|
+
"Updates the sketch with the given value")
|
|
123
|
+
.def("update", &dspy::kll_sketch_update<T>, py::arg("array"),
|
|
124
|
+
"Updates the sketch with the values in the given array")
|
|
125
|
+
.def("merge", (void (kll_sketch<T>::*)(const kll_sketch<T>&)) &kll_sketch<T>::merge, py::arg("sketch"),
|
|
126
|
+
"Merges the provided sketch into the this one")
|
|
127
|
+
.def("__str__", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
128
|
+
"Produces a string summary of the sketch")
|
|
129
|
+
.def("to_string", &kll_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
130
|
+
"Produces a string summary of the sketch")
|
|
131
|
+
.def("is_empty", &kll_sketch<T>::is_empty,
|
|
132
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
133
|
+
.def("get_n", &kll_sketch<T>::get_n,
|
|
134
|
+
"Returns the length of the input stream")
|
|
135
|
+
.def("get_num_retained", &kll_sketch<T>::get_num_retained,
|
|
136
|
+
"Returns the number of retained items (samples) in the sketch")
|
|
137
|
+
.def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
|
|
138
|
+
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
139
|
+
.def("get_min_value", &kll_sketch<T>::get_min_value,
|
|
140
|
+
"Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
|
141
|
+
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
|
142
|
+
"Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
|
143
|
+
.def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("fraction"),
|
|
144
|
+
"Returns an approximation to the value of the data item "
|
|
145
|
+
"that would be preceded by the given fraction of a hypothetical sorted "
|
|
146
|
+
"version of the input stream so far.\n"
|
|
147
|
+
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
|
148
|
+
"so it should not be called multiple times to get different quantiles from the same "
|
|
149
|
+
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
150
|
+
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
|
151
|
+
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
152
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
|
153
|
+
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
154
|
+
"This returns an array that could have been generated by using get_quantile() for each "
|
|
155
|
+
"fractional rank separately, but would be very inefficient. "
|
|
156
|
+
"This method incurs the internal set-up overhead once and obtains multiple quantile values in "
|
|
157
|
+
"a single query. It is strongly recommend that this method be used instead of multiple calls "
|
|
158
|
+
"to get_quantile().\n"
|
|
159
|
+
"If the sketch is empty this returns an empty vector.")
|
|
160
|
+
.def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"),
|
|
161
|
+
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
162
|
+
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
163
|
+
"get_normalized_rank_error(False) function.\n"
|
|
164
|
+
"If the sketch is empty this returns nan.")
|
|
165
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
|
166
|
+
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
167
|
+
"given a set of split points (values).\n"
|
|
168
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
169
|
+
"get_normalized_rank_error(True) function.\n"
|
|
170
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
171
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
|
172
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
173
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
174
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
|
175
|
+
"the maximum value.\n"
|
|
176
|
+
"It is not necessary to include either the min or max values in these split points.")
|
|
177
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
|
178
|
+
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
|
179
|
+
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
|
180
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
181
|
+
"get_normalized_rank_error(True) function.\n"
|
|
182
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
183
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
|
184
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
185
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
186
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
|
187
|
+
"the maximum value.\n"
|
|
188
|
+
"It is not necessary to include either the min or max values in these split points.")
|
|
189
|
+
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
|
190
|
+
py::arg("as_pmf"),
|
|
191
|
+
"Gets the normalized rank error for this sketch.\n"
|
|
192
|
+
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
|
193
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
|
194
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
|
195
|
+
.def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
|
|
196
|
+
py::arg("k"), py::arg("as_pmf"),
|
|
197
|
+
"Gets the normalized rank error given parameters k and the pmf flag.\n"
|
|
198
|
+
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
|
199
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
|
200
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
|
201
|
+
.def("serialize", &dspy::kll_sketch_serialize<T>, "Serailizes the sketch into a bytes object")
|
|
202
|
+
.def_static("deserialize", &dspy::kll_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
|
|
203
|
+
;
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// Entry point for the KLL bindings: registers one Python class on module m for
// each item type instantiated below.
void init_kll(py::module &m) {
  // sketch of int items
  bind_kll_sketch<int>(m, "kll_ints_sketch");
  // sketch of float items
  bind_kll_sketch<float>(m, "kll_floats_sketch");
}
|