datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -1,490 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
-
* or more contributor license agreements. See the NOTICE file
|
4
|
-
* distributed with this work for additional information
|
5
|
-
* regarding copyright ownership. The ASF licenses this file
|
6
|
-
* to you under the Apache License, Version 2.0 (the
|
7
|
-
* "License"); you may not use this file except in compliance
|
8
|
-
* with the License. You may obtain a copy of the License at
|
9
|
-
*
|
10
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
-
*
|
12
|
-
* Unless required by applicable law or agreed to in writing,
|
13
|
-
* software distributed under the License is distributed on an
|
14
|
-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
-
* KIND, either express or implied. See the License for the
|
16
|
-
* specific language governing permissions and limitations
|
17
|
-
* under the License.
|
18
|
-
*/
|
19
|
-
|
20
|
-
#include "kll_sketch.hpp"
|
21
|
-
|
22
|
-
#include <pybind11/pybind11.h>
|
23
|
-
#include <pybind11/stl.h>
|
24
|
-
#include <pybind11/numpy.h>
|
25
|
-
#include <sstream>
|
26
|
-
#include <vector>
|
27
|
-
#include <stdexcept>
|
28
|
-
|
29
|
-
namespace py = pybind11;
|
30
|
-
|
31
|
-
namespace datasketches {
|
32
|
-
|
33
|
-
namespace vector_of_kll_constants {
|
34
|
-
static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
|
35
|
-
static const uint32_t DEFAULT_D = 1;
|
36
|
-
}
|
37
|
-
|
38
|
-
// Wrapper class for Numpy compatibility
|
39
|
-
template <typename T, typename C = std::less<T>>
|
40
|
-
class vector_of_kll_sketches {
|
41
|
-
public:
|
42
|
-
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
43
|
-
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
44
|
-
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
45
|
-
vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
|
46
|
-
vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
|
47
|
-
|
48
|
-
// container parameters
|
49
|
-
inline uint32_t get_k() const;
|
50
|
-
inline uint32_t get_d() const;
|
51
|
-
|
52
|
-
// sketch updates/merges
|
53
|
-
void update(const py::array_t<T>& items);
|
54
|
-
void merge(const vector_of_kll_sketches<T>& other);
|
55
|
-
|
56
|
-
// returns a single sketch combining all data in the array
|
57
|
-
kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
|
58
|
-
|
59
|
-
// sketch queries returning an array of results
|
60
|
-
py::array is_empty() const;
|
61
|
-
py::array get_n() const;
|
62
|
-
py::array is_estimation_mode() const;
|
63
|
-
py::array get_min_values() const;
|
64
|
-
py::array get_max_values() const;
|
65
|
-
py::array get_num_retained() const;
|
66
|
-
py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
|
67
|
-
py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
|
68
|
-
py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
69
|
-
py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
70
|
-
|
71
|
-
// human-readable output
|
72
|
-
std::string to_string(bool print_levels = false, bool print_items = false) const;
|
73
|
-
|
74
|
-
// binary output/input
|
75
|
-
py::list serialize(const py::array_t<int>& isk);
|
76
|
-
// note: deserialize() replaces the sketch at the specified
|
77
|
-
// index. Not a static method.
|
78
|
-
void deserialize(const py::bytes& sk_bytes, uint32_t idx);
|
79
|
-
|
80
|
-
private:
|
81
|
-
std::vector<uint32_t> get_indices(const py::array_t<int>& isk) const;
|
82
|
-
|
83
|
-
const uint32_t k_; // kll sketch k parameter
|
84
|
-
const uint32_t d_; // number of dimensions (here: sketches) to hold
|
85
|
-
std::vector<kll_sketch<T, C>> sketches_;
|
86
|
-
};
|
87
|
-
|
88
|
-
template<typename T, typename C>
|
89
|
-
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
|
90
|
-
k_(k),
|
91
|
-
d_(d)
|
92
|
-
{
|
93
|
-
// check d is valid (k is checked by kll_sketch)
|
94
|
-
if (d < 1) {
|
95
|
-
throw std::invalid_argument("D must be >= 1: " + std::to_string(d));
|
96
|
-
}
|
97
|
-
|
98
|
-
sketches_.reserve(d);
|
99
|
-
// spawn the sketches
|
100
|
-
for (uint32_t i = 0; i < d; i++) {
|
101
|
-
sketches_.emplace_back(k);
|
102
|
-
}
|
103
|
-
}
|
104
|
-
|
105
|
-
template<typename T, typename C>
|
106
|
-
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
|
107
|
-
k_(other.k_),
|
108
|
-
d_(other.d_),
|
109
|
-
sketches_(other.sketches_)
|
110
|
-
{}
|
111
|
-
|
112
|
-
template<typename T, typename C>
|
113
|
-
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
|
114
|
-
k_(other.k_),
|
115
|
-
d_(other.d_),
|
116
|
-
sketches_(std::move(other.sketches_))
|
117
|
-
{}
|
118
|
-
|
119
|
-
template<typename T, typename C>
|
120
|
-
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
|
121
|
-
vector_of_kll_sketches<T, C> copy(other);
|
122
|
-
k_ = copy.k_;
|
123
|
-
d_ = copy.d_;
|
124
|
-
std::swap(sketches_, copy.sketches_);
|
125
|
-
return *this;
|
126
|
-
}
|
127
|
-
|
128
|
-
template<typename T, typename C>
|
129
|
-
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
|
130
|
-
k_ = other.k_;
|
131
|
-
d_ = other.d_;
|
132
|
-
std::swap(sketches_, other.sketches_);
|
133
|
-
return *this;
|
134
|
-
}
|
135
|
-
|
136
|
-
template<typename T, typename C>
|
137
|
-
uint32_t vector_of_kll_sketches<T, C>::get_k() const {
|
138
|
-
return k_;
|
139
|
-
}
|
140
|
-
|
141
|
-
template<typename T, typename C>
|
142
|
-
uint32_t vector_of_kll_sketches<T, C>::get_d() const {
|
143
|
-
return d_;
|
144
|
-
}
|
145
|
-
|
146
|
-
template<typename T, typename C>
|
147
|
-
std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
|
148
|
-
std::vector<uint32_t> indices;
|
149
|
-
if (isk.size() == 1) {
|
150
|
-
auto data = isk.unchecked();
|
151
|
-
if (data(0) == -1) {
|
152
|
-
indices.reserve(d_);
|
153
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
154
|
-
indices.push_back(i);
|
155
|
-
}
|
156
|
-
} else {
|
157
|
-
indices.push_back(static_cast<uint32_t>(data(0)));
|
158
|
-
}
|
159
|
-
} else {
|
160
|
-
auto data = isk.unchecked<1>();
|
161
|
-
indices.reserve(isk.size());
|
162
|
-
for (uint32_t i = 0; i < isk.size(); ++i) {
|
163
|
-
const uint32_t idx = static_cast<uint32_t>(data(i));
|
164
|
-
if (idx < d_) {
|
165
|
-
indices.push_back(idx);
|
166
|
-
} else {
|
167
|
-
throw std::invalid_argument("request for invalid dimenions >= d ("
|
168
|
-
+ std::to_string(d_) +"): "+ std::to_string(idx));
|
169
|
-
}
|
170
|
-
}
|
171
|
-
}
|
172
|
-
return indices;
|
173
|
-
}
|
174
|
-
|
175
|
-
// Checks if each sketch is empty or not
|
176
|
-
template<typename T, typename C>
|
177
|
-
py::array vector_of_kll_sketches<T, C>::is_empty() const {
|
178
|
-
std::vector<bool> vals(d_);
|
179
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
180
|
-
vals[i] = sketches_[i].is_empty();
|
181
|
-
}
|
182
|
-
|
183
|
-
return py::cast(vals);
|
184
|
-
}
|
185
|
-
|
186
|
-
// Updates each sketch with values
|
187
|
-
// Currently: all values must be present
|
188
|
-
// TODO: allow subsets of sketches to be updated
|
189
|
-
template<typename T, typename C>
|
190
|
-
void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
|
191
|
-
|
192
|
-
size_t ndim = items.ndim();
|
193
|
-
|
194
|
-
if (items.shape(ndim-1) != d_) {
|
195
|
-
throw std::invalid_argument("input data must have rows with " + std::to_string(d_)
|
196
|
-
+ " elements. Found: " + std::to_string(items.shape(ndim-1)));
|
197
|
-
}
|
198
|
-
|
199
|
-
if (ndim == 1) {
|
200
|
-
// 1D case: single value to update per sketch
|
201
|
-
auto data = items.template unchecked<1>();
|
202
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
203
|
-
sketches_[i].update(data(i));
|
204
|
-
}
|
205
|
-
}
|
206
|
-
else if (ndim == 2) {
|
207
|
-
// 2D case: multiple values to update per sketch
|
208
|
-
auto data = items.template unchecked<2>();
|
209
|
-
if (items.flags() & py::array::f_style) {
|
210
|
-
for (uint32_t j = 0; j < d_; ++j) {
|
211
|
-
for (uint32_t i = 0; i < items.shape(0); ++i) {
|
212
|
-
sketches_[j].update(data(i,j));
|
213
|
-
}
|
214
|
-
}
|
215
|
-
} else { // py::array::c_style or py::array::forcecast
|
216
|
-
for (uint32_t i = 0; i < items.shape(0); ++i) {
|
217
|
-
for (uint32_t j = 0; j < d_; ++j) {
|
218
|
-
sketches_[j].update(data(i,j));
|
219
|
-
}
|
220
|
-
}
|
221
|
-
}
|
222
|
-
}
|
223
|
-
else {
|
224
|
-
throw std::invalid_argument("Update input must be 2 or fewer dimensions : " + std::to_string(ndim));
|
225
|
-
}
|
226
|
-
}
|
227
|
-
|
228
|
-
// Merges two arrays of sketches
|
229
|
-
// Currently: all values must be present
|
230
|
-
template<typename T, typename C>
|
231
|
-
void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
|
232
|
-
if (d_ != other.get_d()) {
|
233
|
-
throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
|
234
|
-
+ " vs " + std::to_string(other.d_));
|
235
|
-
} else {
|
236
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
237
|
-
sketches_[i].merge(other.sketches_[i]);
|
238
|
-
}
|
239
|
-
}
|
240
|
-
}
|
241
|
-
|
242
|
-
template<typename T, typename C>
|
243
|
-
kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
|
244
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
245
|
-
|
246
|
-
kll_sketch<T, C> result(k_);
|
247
|
-
for (auto& idx : inds) {
|
248
|
-
result.merge(sketches_[idx]);
|
249
|
-
}
|
250
|
-
return result;
|
251
|
-
}
|
252
|
-
|
253
|
-
// Number of updates for each sketch
|
254
|
-
template<typename T, typename C>
|
255
|
-
py::array vector_of_kll_sketches<T, C>::get_n() const {
|
256
|
-
std::vector<uint64_t> vals(d_);
|
257
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
258
|
-
vals[i] = sketches_[i].get_n();
|
259
|
-
}
|
260
|
-
return py::cast(vals);
|
261
|
-
}
|
262
|
-
|
263
|
-
// Number of retained values for each sketch
|
264
|
-
template<typename T, typename C>
|
265
|
-
py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
|
266
|
-
std::vector<uint32_t> vals(d_);
|
267
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
268
|
-
vals[i] = sketches_[i].get_num_retained();
|
269
|
-
}
|
270
|
-
return py::cast(vals);
|
271
|
-
}
|
272
|
-
|
273
|
-
// Gets the minimum value of each sketch
|
274
|
-
// TODO: allow subsets of sketches
|
275
|
-
template<typename T, typename C>
|
276
|
-
py::array vector_of_kll_sketches<T, C>::get_min_values() const {
|
277
|
-
std::vector<T> vals(d_);
|
278
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
279
|
-
vals[i] = sketches_[i].get_min_item();
|
280
|
-
}
|
281
|
-
return py::cast(vals);
|
282
|
-
}
|
283
|
-
|
284
|
-
// Gets the maximum value of each sketch
|
285
|
-
// TODO: allow subsets of sketches
|
286
|
-
template<typename T, typename C>
|
287
|
-
py::array vector_of_kll_sketches<T, C>::get_max_values() const {
|
288
|
-
std::vector<T> vals(d_);
|
289
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
290
|
-
vals[i] = sketches_[i].get_max_item();
|
291
|
-
}
|
292
|
-
return py::cast(vals);
|
293
|
-
}
|
294
|
-
|
295
|
-
// Summary of each sketch as one long string
|
296
|
-
// Users should use .split('\n\n') when calling it to build a list of each
|
297
|
-
// sketch's summary
|
298
|
-
template<typename T, typename C>
|
299
|
-
std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
|
300
|
-
std::ostringstream ss;
|
301
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
302
|
-
// all streams into 1 string, for compatibility with Python's str() behavior
|
303
|
-
// users will need to split by \n\n, e.g., str(kll).split('\n\n')
|
304
|
-
if (i > 0) ss << "\n";
|
305
|
-
ss << sketches_[i].to_string(print_levels, print_items);
|
306
|
-
}
|
307
|
-
return ss.str();
|
308
|
-
}
|
309
|
-
|
310
|
-
template<typename T, typename C>
|
311
|
-
py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
|
312
|
-
std::vector<bool> vals(d_);
|
313
|
-
for (uint32_t i = 0; i < d_; ++i) {
|
314
|
-
vals[i] = sketches_[i].is_estimation_mode();
|
315
|
-
}
|
316
|
-
return py::cast(vals);
|
317
|
-
}
|
318
|
-
|
319
|
-
// Value of sketch(es) corresponding to some quantile(s)
|
320
|
-
template<typename T, typename C>
|
321
|
-
py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
|
322
|
-
const py::array_t<int>& isk) const {
|
323
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
324
|
-
size_t num_sketches = inds.size();
|
325
|
-
size_t num_quantiles = ranks.size();
|
326
|
-
|
327
|
-
std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
|
328
|
-
for (uint32_t i = 0; i < num_sketches; ++i) {
|
329
|
-
for (size_t j = 0; j < num_quantiles; ++j) {
|
330
|
-
quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
|
331
|
-
}
|
332
|
-
}
|
333
|
-
|
334
|
-
return py::cast(quants);
|
335
|
-
}
|
336
|
-
|
337
|
-
// Value of sketch(es) corresponding to some rank(s)
|
338
|
-
template<typename T, typename C>
|
339
|
-
py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
|
340
|
-
const py::array_t<int>& isk) const {
|
341
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
342
|
-
size_t num_sketches = inds.size();
|
343
|
-
size_t num_ranks = values.size();
|
344
|
-
auto vals = values.data();
|
345
|
-
|
346
|
-
std::vector<std::vector<float>> ranks(num_sketches, std::vector<float>(num_ranks));
|
347
|
-
for (uint32_t i = 0; i < num_sketches; ++i) {
|
348
|
-
for (size_t j = 0; j < num_ranks; ++j) {
|
349
|
-
ranks[i][j] = sketches_[inds[i]].get_rank(vals[j]);
|
350
|
-
}
|
351
|
-
}
|
352
|
-
|
353
|
-
return py::cast(ranks);
|
354
|
-
}
|
355
|
-
|
356
|
-
// PMF(s) of sketch(es)
|
357
|
-
template<typename T, typename C>
|
358
|
-
py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
|
359
|
-
const py::array_t<int>& isk) const {
|
360
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
361
|
-
size_t num_sketches = inds.size();
|
362
|
-
size_t num_splits = split_points.size();
|
363
|
-
|
364
|
-
std::vector<std::vector<T>> pmfs(num_sketches, std::vector<T>(num_splits + 1));
|
365
|
-
for (uint32_t i = 0; i < num_sketches; ++i) {
|
366
|
-
auto pmf = sketches_[inds[i]].get_PMF(split_points.data(), num_splits);
|
367
|
-
for (size_t j = 0; j <= num_splits; ++j) {
|
368
|
-
pmfs[i][j] = pmf[j];
|
369
|
-
}
|
370
|
-
}
|
371
|
-
|
372
|
-
return py::cast(pmfs);
|
373
|
-
}
|
374
|
-
|
375
|
-
// CDF(s) of sketch(es)
|
376
|
-
template<typename T, typename C>
|
377
|
-
py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
|
378
|
-
const py::array_t<int>& isk) const {
|
379
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
380
|
-
size_t num_sketches = inds.size();
|
381
|
-
size_t num_splits = split_points.size();
|
382
|
-
|
383
|
-
std::vector<std::vector<T>> cdfs(num_sketches, std::vector<T>(num_splits + 1));
|
384
|
-
for (uint32_t i = 0; i < num_sketches; ++i) {
|
385
|
-
auto cdf = sketches_[inds[i]].get_CDF(split_points.data(), num_splits);
|
386
|
-
for (size_t j = 0; j <= num_splits; ++j) {
|
387
|
-
cdfs[i][j] = cdf[j];
|
388
|
-
}
|
389
|
-
}
|
390
|
-
|
391
|
-
return py::cast(cdfs);
|
392
|
-
}
|
393
|
-
|
394
|
-
template<typename T, typename C>
|
395
|
-
void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
|
396
|
-
uint32_t idx) {
|
397
|
-
if (idx >= d_) {
|
398
|
-
throw std::invalid_argument("request for invalid dimenions >= d ("
|
399
|
-
+ std::to_string(d_) +"): "+ std::to_string(idx));
|
400
|
-
}
|
401
|
-
std::string skStr = sk_bytes; // implicit cast
|
402
|
-
// load the sketch into the proper index
|
403
|
-
sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
|
404
|
-
}
|
405
|
-
|
406
|
-
template<typename T, typename C>
|
407
|
-
py::list vector_of_kll_sketches<T, C>::serialize(const py::array_t<int>& isk) {
|
408
|
-
std::vector<uint32_t> inds = get_indices(isk);
|
409
|
-
const size_t num_sketches = inds.size();
|
410
|
-
|
411
|
-
py::list list(num_sketches);
|
412
|
-
for (uint32_t i = 0; i < num_sketches; ++i) {
|
413
|
-
auto serResult = sketches_[inds[i]].serialize();
|
414
|
-
list[i] = py::bytes((char*)serResult.data(), serResult.size());
|
415
|
-
}
|
416
|
-
|
417
|
-
return list;
|
418
|
-
}
|
419
|
-
|
420
|
-
namespace python {
|
421
|
-
template<typename T>
|
422
|
-
double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
423
|
-
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
424
|
-
}
|
425
|
-
|
426
|
-
} // namespace datasketches::python
|
427
|
-
|
428
|
-
} // namespace datasketches
|
429
|
-
|
430
|
-
namespace dspy = datasketches::python;
|
431
|
-
|
432
|
-
template<typename T>
|
433
|
-
void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
434
|
-
using namespace datasketches;
|
435
|
-
|
436
|
-
py::class_<vector_of_kll_sketches<T>>(m, name)
|
437
|
-
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
|
438
|
-
py::arg("d")=vector_of_kll_constants::DEFAULT_D)
|
439
|
-
.def(py::init<const vector_of_kll_sketches<T>&>())
|
440
|
-
// allow user to retrieve k or d, in case it's instantiated w/ defaults
|
441
|
-
.def("get_k", &vector_of_kll_sketches<T>::get_k,
|
442
|
-
"Returns the value of `k` of the sketch(es)")
|
443
|
-
.def("get_d", &vector_of_kll_sketches<T>::get_d,
|
444
|
-
"Returns the number of sketches")
|
445
|
-
.def("update", &vector_of_kll_sketches<T>::update, py::arg("items"),
|
446
|
-
"Updates the sketch(es) with value(s). Must be a 1D array of size equal to the number of sketches. Can also be 2D array of shape (n_updates, n_sketches). If a sketch does not have a value to update, use np.nan")
|
447
|
-
.def("__str__", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
448
|
-
"Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
|
449
|
-
.def("to_string", &vector_of_kll_sketches<T>::to_string, py::arg("print_levels")=false,
|
450
|
-
py::arg("print_items")=false,
|
451
|
-
"Produces a string summary of all sketches. Users should split the returned string by '\n\n'")
|
452
|
-
.def("is_empty", &vector_of_kll_sketches<T>::is_empty,
|
453
|
-
"Returns whether the sketch(es) is(are) empty of not")
|
454
|
-
.def("get_n", &vector_of_kll_sketches<T>::get_n,
|
455
|
-
"Returns the number of values seen by the sketch(es)")
|
456
|
-
.def("get_num_retained", &vector_of_kll_sketches<T>::get_num_retained,
|
457
|
-
"Returns the number of values retained by the sketch(es)")
|
458
|
-
.def("is_estimation_mode", &vector_of_kll_sketches<T>::is_estimation_mode,
|
459
|
-
"Returns whether the sketch(es) is(are) in estimation mode")
|
460
|
-
.def("get_min_values", &vector_of_kll_sketches<T>::get_min_values,
|
461
|
-
"Returns the minimum value(s) of the sketch(es)")
|
462
|
-
.def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
|
463
|
-
"Returns the maximum value(s) of the sketch(es)")
|
464
|
-
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
|
465
|
-
py::arg("isk")=-1,
|
466
|
-
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
467
|
-
.def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
|
468
|
-
py::arg("isk")=-1,
|
469
|
-
"Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
470
|
-
.def("get_pmf", &vector_of_kll_sketches<T>::get_pmf, py::arg("split_points"), py::arg("isk")=-1,
|
471
|
-
"Returns the probability mass function (PMF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the PMF for (default: all sketches)")
|
472
|
-
.def("get_cdf", &vector_of_kll_sketches<T>::get_cdf, py::arg("split_points"), py::arg("isk")=-1,
|
473
|
-
"Returns the cumulative distribution function (CDF) at `split_points` of the specified sketch(es). `split_points` should be a list/array of floats between 0 and 1 (inclusive). `isk` specifies which sketch(es) to return the CDF for (default: all sketches)")
|
474
|
-
.def_static("get_normalized_rank_error", &dspy::kll_sketch_generic_normalized_rank_error<T>,
|
475
|
-
py::arg("k"), py::arg("as_pmf"), "Returns the normalized rank error")
|
476
|
-
.def("serialize", &vector_of_kll_sketches<T>::serialize, py::arg("isk")=-1,
|
477
|
-
"Serializes the specified sketch(es). `isk` can be an int or a list/array of ints (default: all sketches)")
|
478
|
-
.def("deserialize", &vector_of_kll_sketches<T>::deserialize, py::arg("skBytes"), py::arg("isk"),
|
479
|
-
"Deserializes the specified sketch. `isk` must be an int.")
|
480
|
-
.def("merge", &vector_of_kll_sketches<T>::merge, py::arg("array_of_sketches"),
|
481
|
-
"Merges the input array of KLL sketches into the existing array.")
|
482
|
-
.def("collapse", &vector_of_kll_sketches<T>::collapse, py::arg("isk")=-1,
|
483
|
-
"Returns the result of collapsing all sketches in the array into a single sketch. 'isk' can be an int or a list/array of ints (default: all sketches)")
|
484
|
-
;
|
485
|
-
}
|
486
|
-
|
487
|
-
// Adds the vector-of-KLL classes to module m: one Python class per supported
// item type, the int flavour first and the float flavour second.
void init_vector_of_kll(py::module &m) {
  bind_vector_of_kll_sketches<int>(m, "vector_of_kll_ints_sketches");
  bind_vector_of_kll_sketches<float>(m, "vector_of_kll_floats_sketches");
}
|
@@ -1,173 +0,0 @@
|
|
1
|
-
/*
|
2
|
-
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
-
* or more contributor license agreements. See the NOTICE file
|
4
|
-
* distributed with this work for additional information
|
5
|
-
* regarding copyright ownership. The ASF licenses this file
|
6
|
-
* to you under the Apache License, Version 2.0 (the
|
7
|
-
* "License"); you may not use this file except in compliance
|
8
|
-
* with the License. You may obtain a copy of the License at
|
9
|
-
*
|
10
|
-
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
-
*
|
12
|
-
* Unless required by applicable law or agreed to in writing,
|
13
|
-
* software distributed under the License is distributed on an
|
14
|
-
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
-
* KIND, either express or implied. See the License for the
|
16
|
-
* specific language governing permissions and limitations
|
17
|
-
* under the License.
|
18
|
-
*/
|
19
|
-
|
20
|
-
#include "var_opt_sketch.hpp"
|
21
|
-
#include "var_opt_union.hpp"
|
22
|
-
#include "py_serde.hpp"
|
23
|
-
|
24
|
-
#include <pybind11/pybind11.h>
|
25
|
-
|
26
|
-
namespace py = pybind11;
|
27
|
-
|
28
|
-
namespace datasketches {
|
29
|
-
|
30
|
-
namespace python {
|
31
|
-
|
32
|
-
template<typename T>
|
33
|
-
var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
|
34
|
-
std::string skStr = skBytes; // implicit cast
|
35
|
-
return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
|
36
|
-
}
|
37
|
-
|
38
|
-
template<typename T>
|
39
|
-
py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
40
|
-
auto serResult = sk.serialize(0, sd);
|
41
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
42
|
-
}
|
43
|
-
|
44
|
-
template<typename T>
|
45
|
-
size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
46
|
-
return sk.get_serialized_size_bytes(sd);
|
47
|
-
}
|
48
|
-
|
49
|
-
template<typename T>
|
50
|
-
var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
|
51
|
-
std::string uStr = uBytes; // implicit cast
|
52
|
-
return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
|
53
|
-
}
|
54
|
-
|
55
|
-
template<typename T>
|
56
|
-
py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
|
57
|
-
auto serResult = u.serialize(0, sd);
|
58
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
59
|
-
}
|
60
|
-
|
61
|
-
template<typename T>
|
62
|
-
size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
|
63
|
-
return u.get_serialized_size_bytes(sd);
|
64
|
-
}
|
65
|
-
|
66
|
-
template<typename T>
|
67
|
-
py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
|
68
|
-
py::list list;
|
69
|
-
for (auto item : sk) {
|
70
|
-
py::tuple t = py::make_tuple(item.first, item.second);
|
71
|
-
list.append(t);
|
72
|
-
}
|
73
|
-
return list;
|
74
|
-
}
|
75
|
-
|
76
|
-
template<typename T>
|
77
|
-
py::dict vo_sketch_estimate_subset_sum(const var_opt_sketch<T>& sk, const std::function<bool(T)> func) {
|
78
|
-
subset_summary summary = sk.estimate_subset_sum(func);
|
79
|
-
py::dict d;
|
80
|
-
d["estimate"] = summary.estimate;
|
81
|
-
d["lower_bound"] = summary.lower_bound;
|
82
|
-
d["upper_bound"] = summary.upper_bound;
|
83
|
-
d["total_sketch_weight"] = summary.total_sketch_weight;
|
84
|
-
return d;
|
85
|
-
}
|
86
|
-
|
87
|
-
template<typename T>
|
88
|
-
std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
|
89
|
-
if (print_items) {
|
90
|
-
std::ostringstream ss;
|
91
|
-
ss << sk.to_string();
|
92
|
-
ss << "### VarOpt Sketch Items" << std::endl;
|
93
|
-
int i = 0;
|
94
|
-
for (auto item : sk) {
|
95
|
-
// item.second is always a double
|
96
|
-
// item.first is an arbitrary py::object, so get the value by
|
97
|
-
// using internal str() method then casting to C++ std::string
|
98
|
-
py::str item_pystr(item.first);
|
99
|
-
std::string item_str = py::cast<std::string>(item_pystr);
|
100
|
-
ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
|
101
|
-
}
|
102
|
-
return ss.str();
|
103
|
-
} else {
|
104
|
-
return sk.to_string();
|
105
|
-
}
|
106
|
-
}
|
107
|
-
|
108
|
-
}
|
109
|
-
}
|
110
|
-
|
111
|
-
namespace dspy = datasketches::python;
|
112
|
-
|
113
|
-
template<typename T>
|
114
|
-
void bind_vo_sketch(py::module &m, const char* name) {
|
115
|
-
using namespace datasketches;
|
116
|
-
|
117
|
-
py::class_<var_opt_sketch<T>>(m, name)
|
118
|
-
.def(py::init<uint32_t>(), py::arg("k"))
|
119
|
-
.def("__str__", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
|
120
|
-
"Produces a string summary of the sketch")
|
121
|
-
.def("to_string", &dspy::vo_sketch_to_string<T>, py::arg("print_items")=false,
|
122
|
-
"Produces a string summary of the sketch")
|
123
|
-
.def("update", (void (var_opt_sketch<T>::*)(const T&, double)) &var_opt_sketch<T>::update, py::arg("item"), py::arg("weight")=1.0,
|
124
|
-
"Updates the sketch with the given value and weight")
|
125
|
-
.def_property_readonly("k", &var_opt_sketch<T>::get_k,
|
126
|
-
"Returns the sketch's maximum configured sample size")
|
127
|
-
.def_property_readonly("n", &var_opt_sketch<T>::get_n,
|
128
|
-
"Returns the total stream length")
|
129
|
-
.def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
|
130
|
-
"Returns the number of samples currently in the sketch")
|
131
|
-
.def("get_samples", &dspy::vo_sketch_get_samples<T>,
|
132
|
-
"Returns the set of samples in the sketch")
|
133
|
-
.def("is_empty", &var_opt_sketch<T>::is_empty,
|
134
|
-
"Returns True if the sketch is empty, otherwise False")
|
135
|
-
.def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
|
136
|
-
"Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
|
137
|
-
"as upper and lower bounds on the estimate and the total weight processed by the sketch")
|
138
|
-
.def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
|
139
|
-
"Computes the size in bytes needed to serialize the current sketch")
|
140
|
-
.def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
|
141
|
-
.def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
142
|
-
"Constructs a var opt sketch from the given bytes using the provided serde")
|
143
|
-
.def("__iter__", [](const var_opt_sketch<T>& sk) { return py::make_iterator(sk.begin(), sk.end()); });
|
144
|
-
}
|
145
|
-
|
146
|
-
template<typename T>
|
147
|
-
void bind_vo_union(py::module &m, const char* name) {
|
148
|
-
using namespace datasketches;
|
149
|
-
|
150
|
-
py::class_<var_opt_union<T>>(m, name)
|
151
|
-
.def(py::init<uint32_t>(), py::arg("max_k"))
|
152
|
-
.def("__str__", &var_opt_union<T>::to_string,
|
153
|
-
"Produces a string summary of the sketch")
|
154
|
-
.def("to_string", &var_opt_union<T>::to_string,
|
155
|
-
"Produces a string summary of the sketch")
|
156
|
-
.def("update", (void (var_opt_union<T>::*)(const var_opt_sketch<T>& sk)) &var_opt_union<T>::update, py::arg("sketch"),
|
157
|
-
"Updates the union with the given sketch")
|
158
|
-
.def("get_result", &var_opt_union<T>::get_result,
|
159
|
-
"Returns a sketch corresponding to the union result")
|
160
|
-
.def("reset", &var_opt_union<T>::reset,
|
161
|
-
"Resets the union to the empty state")
|
162
|
-
.def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
|
163
|
-
"Computes the size in bytes needed to serialize the current sketch")
|
164
|
-
.def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
|
165
|
-
.def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
166
|
-
"Constructs a var opt union from the given bytes using the provided serde")
|
167
|
-
;
|
168
|
-
}
|
169
|
-
|
170
|
-
// Adds the VarOpt classes to module m; both are instantiated for py::object
// items, the sketch first and the union second.
void init_vo(py::module &m) {
  bind_vo_sketch<py::object>(m, "var_opt_sketch");
  bind_vo_union<py::object>(m, "var_opt_union");
}
|
@@ -1,16 +0,0 @@
|
|
1
|
-
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
-
# or more contributor license agreements. See the NOTICE file
|
3
|
-
# distributed with this work for additional information
|
4
|
-
# regarding copyright ownership. The ASF licenses this file
|
5
|
-
# to you under the Apache License, Version 2.0 (the
|
6
|
-
# "License"); you may not use this file except in compliance
|
7
|
-
# with the License. You may obtain a copy of the License at
|
8
|
-
#
|
9
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
-
#
|
11
|
-
# Unless required by applicable law or agreed to in writing,
|
12
|
-
# software distributed under the License is distributed on an
|
13
|
-
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
-
# KIND, either express or implied. See the License for the
|
15
|
-
# specific language governing permissions and limitations
|
16
|
-
# under the License.
|