datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _QUANTILE_CONDITIONAL_HPP_
|
|
21
|
+
#define _QUANTILE_CONDITIONAL_HPP_
|
|
22
|
+
|
|
23
|
+
/*
|
|
24
|
+
This header defines conditionally compiled functions shared
|
|
25
|
+
across the set of quantile family sketches.
|
|
26
|
+
*/
|
|
27
|
+
|
|
28
|
+
#include "common_defs.hpp"
|
|
29
|
+
#include "py_serde.hpp"
|
|
30
|
+
|
|
31
|
+
#include <pybind11/pybind11.h>
|
|
32
|
+
#include <pybind11/numpy.h>
|
|
33
|
+
|
|
34
|
+
namespace py = pybind11;
|
|
35
|
+
|
|
36
|
+
// Serialization
|
|
37
|
+
// std::string and arithmetic types, where we don't need a separate serde
|
|
38
|
+
template<typename T, typename SK, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
|
|
39
|
+
void add_serialization(py::class_<SK>& clazz) {
|
|
40
|
+
clazz.def(
|
|
41
|
+
"serialize",
|
|
42
|
+
[](const SK& sk) {
|
|
43
|
+
auto bytes = sk.serialize();
|
|
44
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
45
|
+
},
|
|
46
|
+
"Serializes the sketch into a bytes object."
|
|
47
|
+
)
|
|
48
|
+
.def_static(
|
|
49
|
+
"deserialize",
|
|
50
|
+
[](const std::string& bytes) { return SK::deserialize(bytes.data(), bytes.size()); },
|
|
51
|
+
py::arg("bytes"),
|
|
52
|
+
"Deserializes the sketch from a bytes object."
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
// py::object and other types where the caller must provide a serde
|
|
57
|
+
template<typename T, typename SK, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
|
|
58
|
+
void add_serialization(py::class_<SK>& clazz) {
|
|
59
|
+
clazz.def(
|
|
60
|
+
"serialize",
|
|
61
|
+
[](const SK& sk, datasketches::py_object_serde& serde) {
|
|
62
|
+
auto bytes = sk.serialize(0, serde);
|
|
63
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
64
|
+
}, py::arg("serde"),
|
|
65
|
+
"Serializes the sketch into a bytes object using the provided serde."
|
|
66
|
+
)
|
|
67
|
+
.def_static(
|
|
68
|
+
"deserialize",
|
|
69
|
+
[](const std::string& bytes, datasketches::py_object_serde& serde) {
|
|
70
|
+
return SK::deserialize(bytes.data(), bytes.size(), serde);
|
|
71
|
+
}, py::arg("bytes"), py::arg("serde"),
|
|
72
|
+
"Deserializes the sketch from a bytes object using the provided serde."
|
|
73
|
+
);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// Vector Updates
|
|
77
|
+
// * Only allowed for POD types based on numpy restriction, which
|
|
78
|
+
// is equivalent to both std::is_trivial and std::is_standard_layout.
|
|
79
|
+
// * Nothing is added to types that are not PODs.
|
|
80
|
+
// POD type
|
|
81
|
+
template<typename T, typename SK, typename std::enable_if<std::is_trivial<T>::value && std::is_standard_layout<T>::value, bool>::type = 0>
|
|
82
|
+
void add_vector_update(py::class_<SK>& clazz) {
|
|
83
|
+
clazz.def(
|
|
84
|
+
"update",
|
|
85
|
+
[](SK& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
|
|
86
|
+
if (items.ndim() != 1) {
|
|
87
|
+
throw std::invalid_argument("input data must have only one dimension. Found: "
|
|
88
|
+
+ std::to_string(items.ndim()));
|
|
89
|
+
}
|
|
90
|
+
auto array = items.template unchecked<1>();
|
|
91
|
+
for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
|
|
92
|
+
},
|
|
93
|
+
py::arg("array"),
|
|
94
|
+
"Updates the sketch with the values in the given array"
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// non-POD type
|
|
99
|
+
template<typename T, typename SK, typename std::enable_if<!std::is_trivial<T>::value || !std::is_standard_layout<T>::value, bool>::type = 0>
|
|
100
|
+
void add_vector_update(py::class_<SK>& clazz) {
|
|
101
|
+
unused(clazz);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
#endif // _QUANTILE_CONDITIONAL_HPP_
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <memory>
|
|
21
|
+
#include <pybind11/pybind11.h>
|
|
22
|
+
|
|
23
|
+
#ifndef _TUPLE_POLICY_HPP_
|
|
24
|
+
#define _TUPLE_POLICY_HPP_
|
|
25
|
+
|
|
26
|
+
namespace py = pybind11;
|
|
27
|
+
|
|
28
|
+
namespace datasketches {
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* @brief tuple_policy provides the underlying base class from
|
|
32
|
+
* which native Python policies ultimately inherit. The actual
|
|
33
|
+
* policies implement TuplePolicy, as shown in TuplePolicy.py
|
|
34
|
+
*/
|
|
35
|
+
struct tuple_policy {
|
|
36
|
+
virtual py::object create_summary() const = 0;
|
|
37
|
+
virtual py::object update_summary(py::object& summary, const py::object& update) const = 0;
|
|
38
|
+
virtual py::object operator()(py::object& summary, const py::object& update) const = 0;
|
|
39
|
+
virtual ~tuple_policy() = default;
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* @brief TuplePolicy provides the "trampoline" class for pybind11
|
|
44
|
+
* that allows for a native Python implementation of tuple
|
|
45
|
+
* sketch policies.
|
|
46
|
+
*/
|
|
47
|
+
struct TuplePolicy : public tuple_policy {
|
|
48
|
+
using tuple_policy::tuple_policy;
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* @brief Create a summary object
|
|
52
|
+
*
|
|
53
|
+
* @return py::object representing a new summary
|
|
54
|
+
*/
|
|
55
|
+
py::object create_summary() const override {
|
|
56
|
+
PYBIND11_OVERRIDE_PURE(
|
|
57
|
+
py::object, // Return type
|
|
58
|
+
tuple_policy, // Parent class
|
|
59
|
+
create_summary, // Name of function in C++ (must match Python name)
|
|
60
|
+
// Argument(s) -- if any
|
|
61
|
+
);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* @brief Update a summary object using this policy
|
|
66
|
+
*
|
|
67
|
+
* @param summary The current summary to update
|
|
68
|
+
* @param update The new value with which to update the summary
|
|
69
|
+
* @return py::object The updated summary
|
|
70
|
+
*/
|
|
71
|
+
py::object update_summary(py::object& summary, const py::object& update) const override {
|
|
72
|
+
PYBIND11_OVERRIDE_PURE(
|
|
73
|
+
py::object, // Return type
|
|
74
|
+
tuple_policy, // Parent class
|
|
75
|
+
update_summary, // Name of function in C++ (must match Python name)
|
|
76
|
+
summary, update // Arguments
|
|
77
|
+
);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* @brief Applies this policy to summary with the provided update
|
|
82
|
+
*
|
|
83
|
+
* @param summary The current summary on which to apply the policy
|
|
84
|
+
* @param update An update to apply to the current summary
|
|
85
|
+
* @return py::object The potentially modified summary
|
|
86
|
+
*/
|
|
87
|
+
py::object operator()(py::object& summary, const py::object& update) const override {
|
|
88
|
+
PYBIND11_OVERRIDE_PURE_NAME(
|
|
89
|
+
py::object, // Return type
|
|
90
|
+
tuple_policy, // Parent class
|
|
91
|
+
"__call__", // Name of function in python
|
|
92
|
+
operator(), // Name of function in C++
|
|
93
|
+
summary, update // Arguemnts
|
|
94
|
+
);
|
|
95
|
+
}
|
|
96
|
+
};
|
|
97
|
+
|
|
98
|
+
/* The tuple_policy_holder provides a concrete class that dispatches calls
|
|
99
|
+
* from the sketch to the tuple_policy. This class is needed to provide a
|
|
100
|
+
* concrete object to produce a compiled library, but library users should
|
|
101
|
+
* never need to use this directly.
|
|
102
|
+
*/
|
|
103
|
+
struct tuple_policy_holder {
|
|
104
|
+
explicit tuple_policy_holder(std::shared_ptr<tuple_policy> policy) : _policy(policy) {}
|
|
105
|
+
tuple_policy_holder(const tuple_policy_holder& other) : _policy(other._policy) {}
|
|
106
|
+
tuple_policy_holder(tuple_policy_holder&& other) : _policy(std::move(other._policy)) {}
|
|
107
|
+
tuple_policy_holder& operator=(const tuple_policy_holder& other) { _policy = other._policy; return *this; }
|
|
108
|
+
tuple_policy_holder& operator=(tuple_policy_holder&& other) { std::swap(_policy, other._policy); return *this; }
|
|
109
|
+
|
|
110
|
+
py::object create() const { return _policy->create_summary(); }
|
|
111
|
+
|
|
112
|
+
void update(py::object& summary, const py::object& update) const {
|
|
113
|
+
summary = _policy->update_summary(summary, update);
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
void operator()(py::object& summary, const py::object& update) const {
|
|
117
|
+
summary = _policy->operator()(summary, update);
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
private:
|
|
121
|
+
std::shared_ptr<tuple_policy> _policy;
|
|
122
|
+
};
|
|
123
|
+
|
|
124
|
+
/* A degenerate policy used to enable Jaccard Similarity on tuple sketches,
|
|
125
|
+
* where the computation requires a union and intersection over the keys but
|
|
126
|
+
* does not need to observe the summaries.
|
|
127
|
+
*/
|
|
128
|
+
struct dummy_jaccard_policy {
|
|
129
|
+
void operator()(py::object&, const py::object&) const {
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
132
|
+
};
|
|
133
|
+
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
#endif // _TUPLE_POLICY_HPP_
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <pybind11/pybind11.h>
|
|
21
|
+
|
|
22
|
+
#include "count_min.hpp"
|
|
23
|
+
#include "common_defs.hpp"
|
|
24
|
+
|
|
25
|
+
namespace py = pybind11;
|
|
26
|
+
|
|
27
|
+
template<typename W>
|
|
28
|
+
void bind_count_min_sketch(py::module &m, const char* name) {
|
|
29
|
+
using namespace datasketches;
|
|
30
|
+
|
|
31
|
+
py::class_<count_min_sketch<W>>(m, name)
|
|
32
|
+
.def(py::init<uint8_t, uint32_t, uint64_t>(), py::arg("num_hashes"), py::arg("num_buckets"), py::arg("seed")=DEFAULT_SEED)
|
|
33
|
+
.def(py::init<const count_min_sketch<W>&>())
|
|
34
|
+
.def_static("suggest_num_buckets", &count_min_sketch<W>::suggest_num_buckets, py::arg("relative_error"),
|
|
35
|
+
"Suggests the number of buckets needed to achieve an accuracy within the provided "
|
|
36
|
+
"relative_error. For example, when relative_error = 0.05, the returned frequency estimates "
|
|
37
|
+
"satisfy the 'relative_error' guarantee that never overestimates the weights but may "
|
|
38
|
+
"underestimate the weights by 5% of the total weight in the sketch. "
|
|
39
|
+
"Returns the number of hash buckets at every level of the sketch required in order to obtain "
|
|
40
|
+
"the specified relative error.")
|
|
41
|
+
.def_static("suggest_num_hashes", &count_min_sketch<W>::suggest_num_hashes, py::arg("confidence"),
|
|
42
|
+
"Suggests the number of hashes needed to achieve the provided confidence. For example, "
|
|
43
|
+
"with 95% confidence, frequency estimates satisfy the 'relative_error' guarantee. "
|
|
44
|
+
"Returns the number of hash functions that are required in order to achieve the specified "
|
|
45
|
+
"confidence of the sketch. confidence = 1 - delta, with delta denoting the sketch failure probability.")
|
|
46
|
+
.def("__str__", &count_min_sketch<W>::to_string,
|
|
47
|
+
"Produces a string summary of the sketch")
|
|
48
|
+
.def("to_string", &count_min_sketch<W>::to_string,
|
|
49
|
+
"Produces a string summary of the sketch")
|
|
50
|
+
.def("is_empty", &count_min_sketch<W>::is_empty,
|
|
51
|
+
"Returns True if the sketch has seen no items, otherwise False")
|
|
52
|
+
.def("get_num_hashes", &count_min_sketch<W>::get_num_hashes,
|
|
53
|
+
"Returns the configured number of hashes for the sketch")
|
|
54
|
+
.def("get_num_buckets", &count_min_sketch<W>::get_num_buckets,
|
|
55
|
+
"Returns the configured number of buckets for the sketch")
|
|
56
|
+
.def("get_seed", &count_min_sketch<W>::get_seed,
|
|
57
|
+
"Returns the base hash seed for the sketch")
|
|
58
|
+
.def("get_relative_error", &count_min_sketch<W>::get_relative_error,
|
|
59
|
+
"Returns the maximum permissible error for any frequency estimate query")
|
|
60
|
+
.def("get_total_weight", &count_min_sketch<W>::get_total_weight,
|
|
61
|
+
"Returns the total weight currently inserted into the stream")
|
|
62
|
+
.def("update", static_cast<void (count_min_sketch<W>::*)(int64_t, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0,
|
|
63
|
+
"Updates the sketch with the given 64-bit integer value")
|
|
64
|
+
.def("update", static_cast<void (count_min_sketch<W>::*)(const std::string&, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0,
|
|
65
|
+
"Updates the sketch with the given string")
|
|
66
|
+
.def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
|
|
67
|
+
"Returns an estimate of the frequency of the provided 64-bit integer value")
|
|
68
|
+
.def("get_estimate", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
|
|
69
|
+
"Returns an estimate of the frequency of the provided string")
|
|
70
|
+
.def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
|
|
71
|
+
"Returns an upper bound on the estimate for the given 64-bit integer value")
|
|
72
|
+
.def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
|
|
73
|
+
"Returns an upper bound on the estimate for the provided string")
|
|
74
|
+
.def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
|
|
75
|
+
"Returns an lower bound on the estimate for the given 64-bit integer value")
|
|
76
|
+
.def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
|
|
77
|
+
"Returns an lower bound on the estimate for the provided string")
|
|
78
|
+
.def("merge", &count_min_sketch<W>::merge, py::arg("other"),
|
|
79
|
+
"Merges the provided other sketch into this one")
|
|
80
|
+
.def("get_serialized_size_bytes", &count_min_sketch<W>::get_serialized_size_bytes,
|
|
81
|
+
"Returns the size in bytes of the serialized image of the sketch")
|
|
82
|
+
.def(
|
|
83
|
+
"serialize",
|
|
84
|
+
[](const count_min_sketch<W>& sk) {
|
|
85
|
+
auto bytes = sk.serialize();
|
|
86
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
87
|
+
},
|
|
88
|
+
"Serializes the sketch into a bytes object"
|
|
89
|
+
)
|
|
90
|
+
.def_static(
|
|
91
|
+
"deserialize",
|
|
92
|
+
[](const std::string& bytes) { return count_min_sketch<W>::deserialize(bytes.data(), bytes.size()); },
|
|
93
|
+
py::arg("bytes"),
|
|
94
|
+
"Reads a bytes object and returns the corresponding count_min_sketch"
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Adds the count-min sketch bindings to module m: the sketch is instantiated
// with double weights and exposed to Python as "count_min_sketch".
void init_count_min(py::module &m) {
  using weight_type = double;
  bind_count_min_sketch<weight_type>(m, "count_min_sketch");
}
|
|
101
|
+
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#include <sstream>
|
|
21
20
|
#include <pybind11/pybind11.h>
|
|
22
21
|
|
|
23
22
|
#include "cpc_sketch.hpp"
|
|
@@ -27,28 +26,6 @@
|
|
|
27
26
|
|
|
28
27
|
namespace py = pybind11;
|
|
29
28
|
|
|
30
|
-
namespace datasketches {
|
|
31
|
-
namespace python {
|
|
32
|
-
|
|
33
|
-
cpc_sketch* cpc_sketch_deserialize(py::bytes skBytes) {
|
|
34
|
-
std::string skStr = skBytes; // implicit cast
|
|
35
|
-
return new cpc_sketch(cpc_sketch::deserialize(skStr.c_str(), skStr.length()));
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
py::object cpc_sketch_serialize(const cpc_sketch& sk) {
|
|
39
|
-
auto serResult = sk.serialize();
|
|
40
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
cpc_sketch* cpc_union_get_result(const cpc_union& u) {
|
|
44
|
-
return new cpc_sketch(u.get_result());
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
namespace dspy = datasketches::python;
|
|
51
|
-
|
|
52
29
|
void init_cpc(py::module &m) {
|
|
53
30
|
using namespace datasketches;
|
|
54
31
|
|
|
@@ -59,10 +36,6 @@ void init_cpc(py::module &m) {
|
|
|
59
36
|
"Produces a string summary of the sketch")
|
|
60
37
|
.def("to_string", &cpc_sketch::to_string,
|
|
61
38
|
"Produces a string summary of the sketch")
|
|
62
|
-
.def("serialize", &dspy::cpc_sketch_serialize,
|
|
63
|
-
"Serializes the sketch into a bytes object")
|
|
64
|
-
.def_static("deserialize", &dspy::cpc_sketch_deserialize,
|
|
65
|
-
"Reads a bytes object and returns the corresponding cpc_sketch")
|
|
66
39
|
.def<void (cpc_sketch::*)(uint64_t)>("update", &cpc_sketch::update, py::arg("datum"),
|
|
67
40
|
"Updates the sketch with the given 64-bit integer value")
|
|
68
41
|
.def<void (cpc_sketch::*)(double)>("update", &cpc_sketch::update, py::arg("datum"),
|
|
@@ -70,21 +43,34 @@ void init_cpc(py::module &m) {
|
|
|
70
43
|
.def<void (cpc_sketch::*)(const std::string&)>("update", &cpc_sketch::update, py::arg("datum"),
|
|
71
44
|
"Updates the sketch with the given string")
|
|
72
45
|
.def("is_empty", &cpc_sketch::is_empty,
|
|
73
|
-
"Returns True if the sketch is empty, otherwise
|
|
46
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
74
47
|
.def("get_estimate", &cpc_sketch::get_estimate,
|
|
75
48
|
"Estimate of the distinct count of the input stream")
|
|
76
49
|
.def("get_lower_bound", &cpc_sketch::get_lower_bound, py::arg("kappa"),
|
|
77
50
|
"Returns an approximate lower bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
|
|
78
51
|
.def("get_upper_bound", &cpc_sketch::get_upper_bound, py::arg("kappa"),
|
|
79
52
|
"Returns an approximate upper bound on the estimate for kappa values in {1, 2, 3}, roughly corresponding to standard deviations")
|
|
80
|
-
|
|
53
|
+
.def(
|
|
54
|
+
"serialize",
|
|
55
|
+
[](const cpc_sketch& sk) {
|
|
56
|
+
auto bytes = sk.serialize();
|
|
57
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
58
|
+
},
|
|
59
|
+
"Serializes the sketch into a bytes object"
|
|
60
|
+
)
|
|
61
|
+
.def_static(
|
|
62
|
+
"deserialize",
|
|
63
|
+
[](const std::string& bytes) { return cpc_sketch::deserialize(bytes.data(), bytes.size()); },
|
|
64
|
+
py::arg("bytes"),
|
|
65
|
+
"Reads a bytes object and returns the corresponding cpc_sketch"
|
|
66
|
+
);
|
|
81
67
|
|
|
82
68
|
py::class_<cpc_union>(m, "cpc_union")
|
|
83
69
|
.def(py::init<uint8_t, uint64_t>(), py::arg("lg_k"), py::arg("seed")=DEFAULT_SEED)
|
|
84
70
|
.def(py::init<const cpc_union&>())
|
|
85
71
|
.def("update", (void (cpc_union::*)(const cpc_sketch&)) &cpc_union::update, py::arg("sketch"),
|
|
86
72
|
"Updates the union with the provided CPC sketch")
|
|
87
|
-
.def("get_result", &
|
|
73
|
+
.def("get_result", &cpc_union::get_result,
|
|
88
74
|
"Returns a CPC sketch with the result of the union")
|
|
89
75
|
;
|
|
90
76
|
}
|
|
@@ -27,9 +27,12 @@ void init_kll(py::module& m);
|
|
|
27
27
|
void init_fi(py::module& m);
|
|
28
28
|
void init_cpc(py::module& m);
|
|
29
29
|
void init_theta(py::module& m);
|
|
30
|
+
void init_tuple(py::module& m);
|
|
30
31
|
void init_vo(py::module& m);
|
|
31
32
|
void init_req(py::module& m);
|
|
32
33
|
void init_quantiles(py::module& m);
|
|
34
|
+
void init_count_min(py::module& m);
|
|
35
|
+
void init_density(py::module& m);
|
|
33
36
|
void init_vector_of_kll(py::module& m);
|
|
34
37
|
|
|
35
38
|
// supporting objects
|
|
@@ -42,9 +45,12 @@ PYBIND11_MODULE(_datasketches, m) {
|
|
|
42
45
|
init_fi(m);
|
|
43
46
|
init_cpc(m);
|
|
44
47
|
init_theta(m);
|
|
48
|
+
init_tuple(m);
|
|
45
49
|
init_vo(m);
|
|
46
50
|
init_req(m);
|
|
47
51
|
init_quantiles(m);
|
|
52
|
+
init_count_min(m);
|
|
53
|
+
init_density(m);
|
|
48
54
|
init_vector_of_kll(m);
|
|
49
55
|
|
|
50
56
|
init_kolmogorov_smirnov(m);
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <pybind11/pybind11.h>
|
|
21
|
+
#include <pybind11/stl.h>
|
|
22
|
+
#include <pybind11/numpy.h>
|
|
23
|
+
#include <vector>
|
|
24
|
+
|
|
25
|
+
#include "kernel_function.hpp"
|
|
26
|
+
#include "density_sketch.hpp"
|
|
27
|
+
|
|
28
|
+
namespace py = pybind11;
|
|
29
|
+
|
|
30
|
+
template<typename T, typename K>
|
|
31
|
+
void bind_density_sketch(py::module &m, const char* name) {
|
|
32
|
+
using namespace datasketches;
|
|
33
|
+
|
|
34
|
+
py::class_<density_sketch<T, K>>(m, name)
|
|
35
|
+
.def(
|
|
36
|
+
py::init([](uint16_t k, uint32_t dim, std::shared_ptr<kernel_function> kernel) {
|
|
37
|
+
kernel_function_holder holder(kernel);
|
|
38
|
+
return density_sketch<T, K>(k, dim, holder);
|
|
39
|
+
}),
|
|
40
|
+
py::arg("k"), py::arg("dim"), py::arg("kernel"))
|
|
41
|
+
.def("update", static_cast<void (density_sketch<T, K>::*)(const std::vector<T>&)>(&density_sketch<T, K>::update),
|
|
42
|
+
"Updates the sketch with the given vector")
|
|
43
|
+
.def("merge", static_cast<void (density_sketch<T, K>::*)(const density_sketch<T, K>&)>(&density_sketch<T, K>::merge), py::arg("sketch"),
|
|
44
|
+
"Merges the provided sketch into this one")
|
|
45
|
+
.def("is_empty", &density_sketch<T, K>::is_empty,
|
|
46
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
47
|
+
.def("get_k", &density_sketch<T, K>::get_k,
|
|
48
|
+
"Returns the configured parameter k")
|
|
49
|
+
.def("get_dim", &density_sketch<T, K>::get_dim,
|
|
50
|
+
"Returns the configured parameter dim")
|
|
51
|
+
.def("get_n", &density_sketch<T, K>::get_n,
|
|
52
|
+
"Returns the length of the input stream")
|
|
53
|
+
.def("get_num_retained", &density_sketch<T, K>::get_num_retained,
|
|
54
|
+
"Returns the number of retained items (samples) in the sketch")
|
|
55
|
+
.def("is_estimation_mode", &density_sketch<T, K>::is_estimation_mode,
|
|
56
|
+
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
57
|
+
.def("get_estimate", &density_sketch<T, K>::get_estimate, py::arg("point"),
|
|
58
|
+
"Returns an approximate density at the given point")
|
|
59
|
+
.def("__str__", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
60
|
+
"Produces a string summary of the sketch")
|
|
61
|
+
.def("to_string", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
62
|
+
"Produces a string summary of the sketch")
|
|
63
|
+
.def("__iter__", [](const density_sketch<T, K>& s){ return py::make_iterator(s.begin(), s.end()); })
|
|
64
|
+
.def("serialize",
|
|
65
|
+
[](const density_sketch<T, K>& sk) {
|
|
66
|
+
auto bytes = sk.serialize();
|
|
67
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
68
|
+
},
|
|
69
|
+
"Serializes the sketch into a bytes object"
|
|
70
|
+
)
|
|
71
|
+
.def_static(
|
|
72
|
+
"deserialize",
|
|
73
|
+
[](const std::string& bytes, std::shared_ptr<kernel_function> kernel) {
|
|
74
|
+
kernel_function_holder holder(kernel);
|
|
75
|
+
return density_sketch<T, K>::deserialize(bytes.data(), bytes.size(), holder);
|
|
76
|
+
},
|
|
77
|
+
py::arg("bytes"), py::arg("kernel"),
|
|
78
|
+
"Reads a bytes object and returns the corresponding density_sketch"
|
|
79
|
+
);;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Adds the density sketch bindings to module m: the KernelFunction base class
// that Python kernels derive from, and the double-valued sketch using a
// kernel_function_holder, exposed under the private name "_density_sketch".
void init_density(py::module &m) {
  using namespace datasketches;

  // generic kernel function; KernelFunction is the trampoline type and
  // instances are held through std::shared_ptr
  py::class_<kernel_function, KernelFunction, std::shared_ptr<kernel_function>> kernel_class(m, "KernelFunction");
  kernel_class.def(py::init<>());
  kernel_class.def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b"));

  // the old sketch names can almost be defined, but the kernel_function_holder won't work in init()
  //bind_density_sketch<float, gaussian_kernel<float>>(m, "density_floats_sketch");
  //bind_density_sketch<double, gaussian_kernel<double>>(m, "density_doubles_sketch");
  bind_density_sketch<double, kernel_function_holder>(m, "_density_sketch");
}
|