datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <sstream>
|
|
21
|
+
#include <pybind11/pybind11.h>
|
|
22
|
+
|
|
23
|
+
#include "theta_sketch.hpp"
|
|
24
|
+
#include "theta_union.hpp"
|
|
25
|
+
#include "theta_intersection.hpp"
|
|
26
|
+
#include "theta_a_not_b.hpp"
|
|
27
|
+
#include "common_defs.hpp"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
namespace py = pybind11;
|
|
31
|
+
|
|
32
|
+
namespace datasketches {
|
|
33
|
+
namespace python {
|
|
34
|
+
|
|
35
|
+
// Builds an update_theta_sketch from the three tuning parameters exposed to
// Python, by configuring a builder and asking it for the finished sketch.
//   lg_k - forwarded to builder.set_lg_k()
//   p    - forwarded to builder.set_p()
//   seed - forwarded to builder.set_seed()
update_theta_sketch update_theta_sketch_factory(uint8_t lg_k, double p, uint64_t seed) {
  update_theta_sketch::builder sketch_builder;
  // setters are applied in the same order as before: lg_k, p, seed
  sketch_builder.set_lg_k(lg_k);
  sketch_builder.set_p(p);
  sketch_builder.set_seed(seed);
  return sketch_builder.build();
}
|
|
42
|
+
|
|
43
|
+
// Builds a theta_union from the three tuning parameters exposed to Python,
// by configuring a theta_union::builder and asking it for the finished union.
//   lg_k - forwarded to builder.set_lg_k()
//   p    - forwarded to builder.set_p()
//   seed - forwarded to builder.set_seed()
theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
  theta_union::builder union_builder;
  // setters are applied in the same order as before: lg_k, p, seed
  union_builder.set_lg_k(lg_k);
  union_builder.set_p(p);
  union_builder.set_seed(seed);
  return union_builder.build();
}
|
|
50
|
+
|
|
51
|
+
// Reconstructs a theta_sketch from a Python bytes object.
//   skBytes - serialized sketch image
//   seed    - seed passed through to theta_sketch::deserialize()
// Returns the raw pointer obtained by release()-ing the result of
// theta_sketch::deserialize(); the caller becomes responsible for it.
// NOTE(review): presumably pybind11 takes ownership of the returned pointer
// under its default return-value policy - confirm.
theta_sketch* theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
  // py::bytes converts implicitly to std::string, giving a contiguous copy
  const std::string image = skBytes;
  auto sketch = theta_sketch::deserialize(image.c_str(), image.length(), seed);
  return sketch.release();
}
|
|
55
|
+
|
|
56
|
+
// Serializes the given sketch and wraps the resulting buffer in a Python
// bytes object.
//   sk - sketch to serialize
// Returns a py::bytes built from the data()/size() of sk.serialize().
py::object theta_sketch_serialize(const theta_sketch& sk) {
  auto image = sk.serialize();
  // py::bytes expects a char buffer, so reinterpret the serialized storage
  const char* first_byte = reinterpret_cast<const char*>(image.data());
  return py::bytes(first_byte, image.size());
}
|
|
60
|
+
|
|
61
|
+
uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
|
|
62
|
+
return sk.get_seed_hash();
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// Reconstructs an update_theta_sketch from a Python bytes object.
//   skBytes - serialized sketch image
//   seed    - seed passed through to update_theta_sketch::deserialize()
// Returns the sketch produced by update_theta_sketch::deserialize().
update_theta_sketch update_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
  // py::bytes converts implicitly to std::string, giving a contiguous copy
  const std::string image = skBytes;
  return update_theta_sketch::deserialize(image.c_str(), image.length(), seed);
}
|
|
69
|
+
|
|
70
|
+
// Reconstructs a compact_theta_sketch from a Python bytes object.
//   skBytes - serialized sketch image
//   seed    - seed passed through to compact_theta_sketch::deserialize()
// Returns the sketch produced by compact_theta_sketch::deserialize().
compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, uint64_t seed) {
  // py::bytes converts implicitly to std::string, giving a contiguous copy
  const std::string image = skBytes;
  return compact_theta_sketch::deserialize(image.c_str(), image.length(), seed);
}
|
|
74
|
+
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
namespace dspy = datasketches::python;
|
|
79
|
+
|
|
80
|
+
// Registers the theta sketch family with the given pybind11 module:
// theta_sketch (base), update_theta_sketch, compact_theta_sketch,
// theta_union, theta_intersection and theta_a_not_b.
//   m - module that receives the class bindings
// The string passed as the last argument of each .def() becomes the Python
// docstring of that method, so it must describe the bound class accurately.
void init_theta(py::module &m) {
  using namespace datasketches;

  // Base class: read-only queries plus (de)serialization helpers.
  py::class_<theta_sketch>(m, "theta_sketch")
    .def("serialize", &dspy::theta_sketch_serialize,
         "Serializes the sketch into a bytes object")
    .def_static("deserialize", &dspy::theta_sketch_deserialize, py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
         "Reads a bytes object and returns the corresponding theta_sketch")
    .def("__str__", &theta_sketch::to_string, py::arg("print_items")=false,
         "Produces a string summary of the sketch")
    .def("to_string", &theta_sketch::to_string, py::arg("print_items")=false,
         "Produces a string summary of the sketch")
    .def("is_empty", &theta_sketch::is_empty,
         "Returns True if the sketch is empty, otherwise False")
    .def("get_estimate", &theta_sketch::get_estimate,
         "Estimate of the distinct count of the input stream")
    .def("get_upper_bound", &theta_sketch::get_upper_bound, py::arg("num_std_devs"),
         "Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}")
    .def("get_lower_bound", &theta_sketch::get_lower_bound, py::arg("num_std_devs"),
         "Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}")
    .def("is_estimation_mode", &theta_sketch::is_estimation_mode,
         "Returns True if sketch is in estimation mode, otherwise False")
    .def("get_theta", &theta_sketch::get_theta,
         "Returns theta (effective sampling rate) as a fraction from 0 to 1")
    .def("get_num_retained", &theta_sketch::get_num_retained,
         "Returns the number of items currently in the sketch")
    .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
         "Returns a hash of the seed used in the sketch")
    .def("is_ordered", &theta_sketch::is_ordered,
         "Returns True if the sketch entries are sorted, otherwise False")
    ;

  // Updatable sketch: the three update() overloads are selected explicitly
  // by casting to the desired member-function-pointer type.
  py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
    .def(py::init(&dspy::update_theta_sketch_factory),
         py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
    .def(py::init<const update_theta_sketch&>())
    .def("update", static_cast<void (update_theta_sketch::*)(int64_t)>(&update_theta_sketch::update), py::arg("datum"),
         "Updates the sketch with the given integral value")
    .def("update", static_cast<void (update_theta_sketch::*)(double)>(&update_theta_sketch::update), py::arg("datum"),
         "Updates the sketch with the given floating point value")
    .def("update", static_cast<void (update_theta_sketch::*)(const std::string&)>(&update_theta_sketch::update), py::arg("datum"),
         "Updates the sketch with the given string")
    .def("compact", &update_theta_sketch::compact, py::arg("ordered")=true,
         "Returns a compacted form of the sketch, optionally sorting it")
    .def_static("deserialize", &dspy::update_theta_sketch_deserialize,
         py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
         "Reads a bytes object and returns the corresponding update_theta_sketch")
    ;

  // Compact sketch: constructible by copy or from any theta_sketch plus a bool.
  py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
    .def(py::init<const compact_theta_sketch&>())
    .def(py::init<const theta_sketch&, bool>())
    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
         py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
         "Reads a bytes object and returns the corresponding compact_theta_sketch")
    ;

  // Set operations.
  py::class_<theta_union>(m, "theta_union")
    .def(py::init(&dspy::theta_union_factory),
         py::arg("lg_k")=update_theta_sketch::builder::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
    .def("update", &theta_union::update, py::arg("sketch"),
         "Updates the union with the given sketch")
    .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
         "Returns the sketch corresponding to the union result")
    ;

  py::class_<theta_intersection>(m, "theta_intersection")
    .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
    .def(py::init<const theta_intersection&>())
    .def("update", &theta_intersection::update, py::arg("sketch"),
         "Intersects the provided sketch with the current intersection state")
    .def("get_result", &theta_intersection::get_result, py::arg("ordered")=true,
         "Returns the sketch corresponding to the intersection result")
    .def("has_result", &theta_intersection::has_result,
         "Returns True if the intersection has a valid result, otherwise False")
    ;

  py::class_<theta_a_not_b>(m, "theta_a_not_b")
    .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
    .def("compute", &theta_a_not_b::compute, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
         "Returns a sketch with the result of applying the A-not-B operation on the given inputs")
    ;
}
|
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "kll_sketch.hpp"
|
|
21
|
+
|
|
22
|
+
#include <pybind11/pybind11.h>
|
|
23
|
+
#include <pybind11/stl.h>
|
|
24
|
+
#include <pybind11/numpy.h>
|
|
25
|
+
#include <sstream>
|
|
26
|
+
#include <vector>
|
|
27
|
+
|
|
28
|
+
namespace py = pybind11;
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
// Wrapper class for Numpy compatibility
|
|
33
|
+
template <typename T, typename C = std::less<T>, typename S = serde<T>>
|
|
34
|
+
class vector_of_kll_sketches {
|
|
35
|
+
public:
|
|
36
|
+
static const uint32_t DEFAULT_K = kll_sketch<T, C, S>::DEFAULT_K;
|
|
37
|
+
static const uint32_t DEFAULT_D = 1;
|
|
38
|
+
|
|
39
|
+
explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
|
|
40
|
+
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
|
41
|
+
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
|
42
|
+
vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
|
|
43
|
+
vector_of_kll_sketches<T,C,S>& operator=(vector_of_kll_sketches&& other);
|
|
44
|
+
|
|
45
|
+
// container parameters
|
|
46
|
+
inline uint32_t get_k() const;
|
|
47
|
+
inline uint32_t get_d() const;
|
|
48
|
+
|
|
49
|
+
// sketch updates/merges
|
|
50
|
+
void update(const py::array_t<T>& items);
|
|
51
|
+
void merge(const vector_of_kll_sketches<T>& other);
|
|
52
|
+
|
|
53
|
+
// returns a single sketch combining all data in the array
|
|
54
|
+
kll_sketch<T,C,S> collapse(const py::array_t<int>& isk) const;
|
|
55
|
+
|
|
56
|
+
// sketch queries returning an array of results
|
|
57
|
+
py::array is_empty() const;
|
|
58
|
+
py::array get_n() const;
|
|
59
|
+
py::array is_estimation_mode() const;
|
|
60
|
+
py::array get_min_values() const;
|
|
61
|
+
py::array get_max_values() const;
|
|
62
|
+
py::array get_num_retained() const;
|
|
63
|
+
py::array get_quantiles(const py::array_t<double>& fractions, const py::array_t<int>& isk) const;
|
|
64
|
+
py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
|
|
65
|
+
py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
|
66
|
+
py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
|
67
|
+
|
|
68
|
+
// human-readable output
|
|
69
|
+
std::string to_string(bool print_levels = false, bool print_items = false) const;
|
|
70
|
+
|
|
71
|
+
// binary output/input
|
|
72
|
+
py::list serialize(py::array_t<uint32_t>& isk);
|
|
73
|
+
// note: deserialize() replaces the sketch at the specified
|
|
74
|
+
// index. Not a static method.
|
|
75
|
+
void deserialize(const py::bytes& sk_bytes, uint32_t idx);
|
|
76
|
+
|
|
77
|
+
private:
|
|
78
|
+
std::vector<uint32_t> get_indices(const py::array_t<int>& isk) const;
|
|
79
|
+
|
|
80
|
+
const uint32_t k_; // kll sketch k parameter
|
|
81
|
+
const uint32_t d_; // number of dimensions (here: sketches) to hold
|
|
82
|
+
std::vector<kll_sketch<T,C,S>> sketches_;
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
template<typename T, typename C, typename S>
|
|
86
|
+
vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(uint32_t k, uint32_t d):
|
|
87
|
+
k_(k),
|
|
88
|
+
d_(d)
|
|
89
|
+
{
|
|
90
|
+
// check d is valid (k is checked by kll_sketch)
|
|
91
|
+
if (d < 1) {
|
|
92
|
+
throw std::invalid_argument("D must be >= 1: " + std::to_string(d));
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
sketches_.reserve(d);
|
|
96
|
+
// spawn the sketches
|
|
97
|
+
for (uint32_t i = 0; i < d; i++) {
|
|
98
|
+
sketches_.emplace_back(k);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
template<typename T, typename C, typename S>
|
|
103
|
+
vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
|
|
104
|
+
k_(other.k_),
|
|
105
|
+
d_(other.d_),
|
|
106
|
+
sketches_(other.sketches_)
|
|
107
|
+
{}
|
|
108
|
+
|
|
109
|
+
template<typename T, typename C, typename S>
|
|
110
|
+
vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
|
|
111
|
+
k_(other.k_),
|
|
112
|
+
d_(other.d_),
|
|
113
|
+
sketches_(std::move(other.sketches_))
|
|
114
|
+
{}
|
|
115
|
+
|
|
116
|
+
template<typename T, typename C, typename S>
|
|
117
|
+
vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(const vector_of_kll_sketches& other) {
|
|
118
|
+
vector_of_kll_sketches<T,C,S> copy(other);
|
|
119
|
+
k_ = copy.k_;
|
|
120
|
+
d_ = copy.d_;
|
|
121
|
+
std::swap(sketches_, copy.sketches_);
|
|
122
|
+
return *this;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
template<typename T, typename C, typename S>
|
|
126
|
+
vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(vector_of_kll_sketches&& other) {
|
|
127
|
+
k_ = other.k_;
|
|
128
|
+
d_ = other.d_;
|
|
129
|
+
std::swap(sketches_, other.sketches_);
|
|
130
|
+
return *this;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
template<typename T, typename C, typename S>
|
|
134
|
+
uint32_t vector_of_kll_sketches<T,C,S>::get_k() const {
|
|
135
|
+
return k_;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
template<typename T, typename C, typename S>
|
|
139
|
+
uint32_t vector_of_kll_sketches<T,C,S>::get_d() const {
|
|
140
|
+
return d_;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
template<typename T, typename C, typename S>
|
|
144
|
+
std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array_t<int>& isk) const {
|
|
145
|
+
std::vector<uint32_t> indices;
|
|
146
|
+
if (isk.size() == 1) {
|
|
147
|
+
auto data = isk.unchecked();
|
|
148
|
+
if (data(0) == -1) {
|
|
149
|
+
indices.reserve(d_);
|
|
150
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
151
|
+
indices.push_back(i);
|
|
152
|
+
}
|
|
153
|
+
} else {
|
|
154
|
+
indices.push_back(static_cast<uint32_t>(data(0)));
|
|
155
|
+
}
|
|
156
|
+
} else {
|
|
157
|
+
auto data = isk.unchecked<1>();
|
|
158
|
+
indices.reserve(isk.size());
|
|
159
|
+
for (uint32_t i = 0; i < isk.size(); ++i) {
|
|
160
|
+
const uint32_t idx = static_cast<uint32_t>(data(i));
|
|
161
|
+
if (idx < d_) {
|
|
162
|
+
indices.push_back(idx);
|
|
163
|
+
} else {
|
|
164
|
+
throw std::invalid_argument("request for invalid dimenions >= d ("
|
|
165
|
+
+ std::to_string(d_) +"): "+ std::to_string(idx));
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
return indices;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// Checks if each sketch is empty or not
|
|
173
|
+
template<typename T, typename C, typename S>
|
|
174
|
+
py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
|
|
175
|
+
std::vector<bool> vals(d_);
|
|
176
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
177
|
+
vals[i] = sketches_[i].is_empty();
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
return py::cast(vals);
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// Updates each sketch with values
|
|
184
|
+
// Currently: all values must be present
|
|
185
|
+
// TODO: allow subsets of sketches to be updated
|
|
186
|
+
template<typename T, typename C, typename S>
|
|
187
|
+
void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
|
|
188
|
+
|
|
189
|
+
size_t ndim = items.ndim();
|
|
190
|
+
|
|
191
|
+
if (items.shape(ndim-1) != d_) {
|
|
192
|
+
throw std::invalid_argument("input data must have rows with " + std::to_string(d_)
|
|
193
|
+
+ " elements. Found: " + std::to_string(items.shape(ndim-1)));
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
if (ndim == 1) {
|
|
197
|
+
// 1D case: single value to update per sketch
|
|
198
|
+
auto data = items.template unchecked<1>();
|
|
199
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
200
|
+
sketches_[i].update(data(i));
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
else if (ndim == 2) {
|
|
204
|
+
// 2D case: multiple values to update per sketch
|
|
205
|
+
auto data = items.template unchecked<2>();
|
|
206
|
+
if (items.flags() & py::array::f_style) {
|
|
207
|
+
for (uint32_t j = 0; j < d_; ++j) {
|
|
208
|
+
for (uint32_t i = 0; i < items.shape(0); ++i) {
|
|
209
|
+
sketches_[j].update(data(i,j));
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
} else { // py::array::c_style or py::array::forcecast
|
|
213
|
+
for (uint32_t i = 0; i < items.shape(0); ++i) {
|
|
214
|
+
for (uint32_t j = 0; j < d_; ++j) {
|
|
215
|
+
sketches_[j].update(data(i,j));
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
else {
|
|
221
|
+
throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim));
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Merges two arrays of sketches
|
|
226
|
+
// Currently: all values must be present
|
|
227
|
+
template<typename T, typename C, typename S>
|
|
228
|
+
void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other) {
|
|
229
|
+
if (d_ != other.get_d()) {
|
|
230
|
+
throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
|
|
231
|
+
+ " vs " + std::to_string(other.d_));
|
|
232
|
+
} else {
|
|
233
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
234
|
+
sketches_[i].merge(other.sketches_[i]);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
template<typename T, typename C, typename S>
|
|
240
|
+
kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>& isk) const {
|
|
241
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
242
|
+
|
|
243
|
+
kll_sketch<T,C,S> result(k_);
|
|
244
|
+
for (auto& idx : inds) {
|
|
245
|
+
result.merge(sketches_[idx]);
|
|
246
|
+
}
|
|
247
|
+
return result;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
// Number of updates for each sketch
|
|
251
|
+
template<typename T, typename C, typename S>
|
|
252
|
+
py::array vector_of_kll_sketches<T,C,S>::get_n() const {
|
|
253
|
+
std::vector<uint64_t> vals(d_);
|
|
254
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
255
|
+
vals[i] = sketches_[i].get_n();
|
|
256
|
+
}
|
|
257
|
+
return py::cast(vals);
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Number of retained values for each sketch
|
|
261
|
+
template<typename T, typename C, typename S>
|
|
262
|
+
py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
|
|
263
|
+
std::vector<uint32_t> vals(d_);
|
|
264
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
265
|
+
vals[i] = sketches_[i].get_num_retained();
|
|
266
|
+
}
|
|
267
|
+
return py::cast(vals);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
// Gets the minimum value of each sketch
|
|
271
|
+
// TODO: allow subsets of sketches
|
|
272
|
+
template<typename T, typename C, typename S>
|
|
273
|
+
py::array vector_of_kll_sketches<T,C,S>::get_min_values() const {
|
|
274
|
+
std::vector<T> vals(d_);
|
|
275
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
276
|
+
vals[i] = sketches_[i].get_min_value();
|
|
277
|
+
}
|
|
278
|
+
return py::cast(vals);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
// Gets the maximum value of each sketch
|
|
282
|
+
// TODO: allow subsets of sketches
|
|
283
|
+
template<typename T, typename C, typename S>
|
|
284
|
+
py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
|
|
285
|
+
std::vector<T> vals(d_);
|
|
286
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
287
|
+
vals[i] = sketches_[i].get_max_value();
|
|
288
|
+
}
|
|
289
|
+
return py::cast(vals);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// Summary of each sketch as one long string
|
|
293
|
+
// Users should use .split('\n\n') when calling it to build a list of each
|
|
294
|
+
// sketch's summary
|
|
295
|
+
template<typename T, typename C, typename S>
|
|
296
|
+
std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool print_items) const {
|
|
297
|
+
std::ostringstream ss;
|
|
298
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
299
|
+
// all streams into 1 string, for compatibility with Python's str() behavior
|
|
300
|
+
// users will need to split by \n\n, e.g., str(kll).split('\n\n')
|
|
301
|
+
if (i > 0) ss << "\n";
|
|
302
|
+
ss << sketches_[i].to_string(print_levels, print_items);
|
|
303
|
+
}
|
|
304
|
+
return ss.str();
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
template<typename T, typename C, typename S>
|
|
308
|
+
py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
|
|
309
|
+
std::vector<bool> vals(d_);
|
|
310
|
+
for (uint32_t i = 0; i < d_; ++i) {
|
|
311
|
+
vals[i] = sketches_[i].is_estimation_mode();
|
|
312
|
+
}
|
|
313
|
+
return py::cast(vals);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Value of sketch(es) corresponding to some quantile(s)
|
|
317
|
+
template<typename T, typename C, typename S>
|
|
318
|
+
py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>& fractions,
|
|
319
|
+
const py::array_t<int>& isk) const {
|
|
320
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
321
|
+
size_t num_sketches = inds.size();
|
|
322
|
+
size_t num_quantiles = fractions.size();
|
|
323
|
+
|
|
324
|
+
std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
|
|
325
|
+
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
326
|
+
auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
|
|
327
|
+
for (size_t j = 0; j < num_quantiles; ++j) {
|
|
328
|
+
quants[i][j] = quant[j];
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
return py::cast(quants);
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
// Value of sketch(es) corresponding to some rank(s)
|
|
336
|
+
template<typename T, typename C, typename S>
|
|
337
|
+
py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
|
|
338
|
+
const py::array_t<int>& isk) const {
|
|
339
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
340
|
+
size_t num_sketches = inds.size();
|
|
341
|
+
size_t num_ranks = values.size();
|
|
342
|
+
auto vals = values.data();
|
|
343
|
+
|
|
344
|
+
std::vector<std::vector<float>> ranks(num_sketches, std::vector<float>(num_ranks));
|
|
345
|
+
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
346
|
+
for (size_t j = 0; j < num_ranks; ++j) {
|
|
347
|
+
ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
return py::cast(ranks);
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
// PMF(s) of sketch(es)
|
|
355
|
+
template<typename T, typename C, typename S>
|
|
356
|
+
py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_points,
|
|
357
|
+
const py::array_t<int>& isk) const {
|
|
358
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
359
|
+
size_t num_sketches = inds.size();
|
|
360
|
+
size_t num_splits = split_points.size();
|
|
361
|
+
|
|
362
|
+
std::vector<std::vector<T>> pmfs(num_sketches, std::vector<T>(num_splits + 1));
|
|
363
|
+
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
364
|
+
auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits);
|
|
365
|
+
for (size_t j = 0; j <= num_splits; ++j) {
|
|
366
|
+
pmfs[i][j] = pmf[j];
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
return py::cast(pmfs);
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
// CDF(s) of sketch(es)
|
|
374
|
+
template<typename T, typename C, typename S>
|
|
375
|
+
py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_points,
|
|
376
|
+
const py::array_t<int>& isk) const {
|
|
377
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
378
|
+
size_t num_sketches = inds.size();
|
|
379
|
+
size_t num_splits = split_points.size();
|
|
380
|
+
|
|
381
|
+
std::vector<std::vector<T>> cdfs(num_sketches, std::vector<T>(num_splits + 1));
|
|
382
|
+
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
383
|
+
auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits);
|
|
384
|
+
for (size_t j = 0; j <= num_splits; ++j) {
|
|
385
|
+
cdfs[i][j] = cdf[j];
|
|
386
|
+
}
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
return py::cast(cdfs);
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
template<typename T, typename C, typename S>
|
|
393
|
+
void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
|
|
394
|
+
uint32_t idx) {
|
|
395
|
+
if (idx >= d_) {
|
|
396
|
+
throw std::invalid_argument("request for invalid dimenions >= d ("
|
|
397
|
+
+ std::to_string(d_) +"): "+ std::to_string(idx));
|
|
398
|
+
}
|
|
399
|
+
std::string skStr = sk_bytes; // implicit cast
|
|
400
|
+
// load the sketch into the proper index
|
|
401
|
+
sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
template<typename T, typename C, typename S>
|
|
405
|
+
py::list vector_of_kll_sketches<T,C,S>::serialize(py::array_t<uint32_t>& isk) {
|
|
406
|
+
std::vector<uint32_t> inds = get_indices(isk);
|
|
407
|
+
const size_t num_sketches = inds.size();
|
|
408
|
+
|
|
409
|
+
py::list list(num_sketches);
|
|
410
|
+
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
411
|
+
auto serResult = sketches_[inds[i]].serialize();
|
|
412
|
+
list[i] = py::bytes((char*)serResult.data(), serResult.size());
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
return list;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
namespace python {
|
|
419
|
+
template<typename T>
|
|
420
|
+
double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
421
|
+
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
} // namespace datasketches::python
|
|
425
|
+
|
|
426
|
+
} // namespace datasketches
|
|
427
|
+
|
|
428
|
+
// short alias for the datasketches::python helper namespace, used by the bindings below
namespace dspy = datasketches::python;
|
|
429
|
+
|
|
430
|
+
template<typename T>
|
|
431
|
+
void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
432
|
+
using namespace datasketches;
|
|
433
|
+
|
|
434
|
+
py::class_<vector_of_kll_sketches<T>>(m, name)
|
|
435
|
+
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_sketches<T>::DEFAULT_K,
|
|
436
|
+
py::arg("d")=vector_of_kll_sketches<T>::DEFAULT_D)
|
|
437
|
+
.def(py::init<const vector_of_kll_sketches<T>&>())
|
|
438
|
+
// allow user to retrieve k or d, in case it's instantiated w/ defaults
|
|
439
|
+
.def("get_k", &vector_of_kll_sketches<T>::get_k,
|
|
440
|
+
"Returns the value of `k` of the sketch(es)")
|
|
441
|
+
.def("get_d", &vector_of_kll_sketches<T>::get_d,
|
|
442
|
+
"Returns the number of sketches")
|
|
443
|
+
.def("update", &vector_of_kll_sketches<T>::update, py::arg("items"),
|
|
444
|
+
"Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan")
|
|
445
|
+
.def("__str__", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
446
|
+
"Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
|
|
447
|
+
.def("to_string", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false,
|
|
448
|
+
py::arg("print_items")=false,
|
|
449
|
+
"Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
|
|
450
|
+
.def("is_empty", &vector_of_kll_sketches<T>::is_empty,
|
|
451
|
+
"Returns whether the sketch(es) is(are) empty of not")
|
|
452
|
+
.def("get_n", &vector_of_kll_sketches<T>::get_n,
|
|
453
|
+
"Returns the number of values seen by the sketch(es)")
|
|
454
|
+
.def("get_num_retained", &vector_of_kll_sketches<T>::get_num_retained,
|
|
455
|
+
"Returns the number of values retained by the sketch(es)")
|
|
456
|
+
.def("is_estimation_mode", &vector_of_kll_sketches<T>::is_estimation_mode,
|
|
457
|
+
"Returns whether the sketch(es) is(are) in estimation mode")
|
|
458
|
+
.def("get_min_values", &vector_of_kll_sketches<T>::get_min_values,
|
|
459
|
+
"Returns the minimum value(s) of the sketch(es)")
|
|
460
|
+
.def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
|
|
461
|
+
"Returns the maximum value(s) of the sketch(es)")
|
|
462
|
+
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("fractions"),
|
|
463
|
+
py::arg("isk")=-1,
|
|
464
|
+
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `fractions` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
|
465
|
+
.def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
|
|
466
|
+
py::arg("isk")=-1,
|
|
467
|
+
"Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
|
468
|
+
.def("get_pmf", &vector_of_kll_sketches<T>::get_pmf, py::arg("split_points"), py::arg("isk")=-1,
|
|
469
|
+
"Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)")
|
|
470
|
+
.def("get_cdf", &vector_of_kll_sketches<T>::get_cdf, py::arg("split_points"), py::arg("isk")=-1,
|
|
471
|
+
"Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)")
|
|
472
|
+
.def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
|
|
473
|
+
py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error")
|
|
474
|
+
.def("serialize", &vector_of_kll_sketches<T>::serialize, py::arg("isk")=-1,
|
|
475
|
+
"Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)")
|
|
476
|
+
.def("deserialize", &vector_of_kll_sketches<T>::deserialize, py::arg("skBytes"), py::arg("isk"),
|
|
477
|
+
"Deserializes the specified sketch. `isk` must be an int.")
|
|
478
|
+
.def("merge", &vector_of_kll_sketches<T>::merge, py::arg("array_of_sketches"),
|
|
479
|
+
"Merges the input array of KLL sketches into the existing array.")
|
|
480
|
+
.def("collapse", &vector_of_kll_sketches<T>::collapse, py::arg("isk")=-1,
|
|
481
|
+
"Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)")
|
|
482
|
+
;
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
// Registers the int and float instantiations of the vector-of-KLL binding
// with module `m`, ints first.
void init_vector_of_kll(py::module &m) {
  const char* const ints_class_name = "vector_of_kll_ints_sketches";
  const char* const floats_class_name = "vector_of_kll_floats_sketches";

  bind_vector_of_kll_sketches<int>(m, ints_class_name);
  bind_vector_of_kll_sketches<float>(m, floats_class_name);
}
|