datasketches 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
|
@@ -17,105 +17,158 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
+
|
|
21
|
+
#include "py_serde.hpp"
|
|
22
|
+
#include "py_object_ostream.hpp"
|
|
20
23
|
#include "frequent_items_sketch.hpp"
|
|
21
24
|
|
|
22
25
|
#include <pybind11/pybind11.h>
|
|
23
|
-
#include <sstream>
|
|
24
|
-
|
|
25
|
-
namespace py = pybind11;
|
|
26
|
-
|
|
27
|
-
namespace datasketches {
|
|
28
|
-
namespace python {
|
|
29
|
-
|
|
30
|
-
template<typename T>
|
|
31
|
-
frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
|
|
32
|
-
std::string skStr = skBytes; // implicit cast
|
|
33
|
-
return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
|
|
34
|
-
}
|
|
35
26
|
|
|
36
|
-
|
|
37
|
-
py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
|
|
38
|
-
auto serResult = sk.serialize();
|
|
39
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
// maybe possible to disambiguate the static vs method get_epsilon calls, but
|
|
43
|
-
// this is easier for now
|
|
44
|
-
template<typename T>
|
|
45
|
-
double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
|
|
46
|
-
return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
|
|
47
|
-
}
|
|
48
|
-
|
|
49
|
-
template<typename T>
|
|
50
|
-
py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
51
|
-
frequent_items_error_type err_type,
|
|
52
|
-
uint64_t threshold = 0) {
|
|
53
|
-
if (threshold == 0) { threshold = sk.get_maximum_error(); }
|
|
54
|
-
|
|
55
|
-
py::list list;
|
|
56
|
-
auto items = sk.get_frequent_items(err_type, threshold);
|
|
57
|
-
for (auto iter = items.begin(); iter != items.end(); ++iter) {
|
|
58
|
-
py::tuple t = py::make_tuple(iter->get_item(),
|
|
59
|
-
iter->get_estimate(),
|
|
60
|
-
iter->get_lower_bound(),
|
|
61
|
-
iter->get_upper_bound());
|
|
62
|
-
list.append(t);
|
|
63
|
-
}
|
|
64
|
-
return list;
|
|
65
|
-
}
|
|
27
|
+
#include <ostream>
|
|
66
28
|
|
|
67
|
-
|
|
68
|
-
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
|
69
|
-
return sk.get_serialized_size_bytes();
|
|
70
|
-
}
|
|
29
|
+
namespace py = pybind11;
|
|
71
30
|
|
|
72
|
-
|
|
73
|
-
|
|
31
|
+
// forward declarations
|
|
32
|
+
// std::string and arithmetic types, where we don't need a separate serde
|
|
33
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
|
|
34
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
|
|
74
35
|
|
|
75
|
-
|
|
36
|
+
// py::object and other types where the caller must provide a serde
|
|
37
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
|
|
38
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
|
|
76
39
|
|
|
77
|
-
template<typename T>
|
|
40
|
+
template<typename T, typename W, typename H, typename E>
|
|
78
41
|
void bind_fi_sketch(py::module &m, const char* name) {
|
|
79
42
|
using namespace datasketches;
|
|
80
43
|
|
|
81
|
-
py::class_<frequent_items_sketch<T>>(m, name)
|
|
44
|
+
auto fi_class = py::class_<frequent_items_sketch<T, W, H, E>>(m, name)
|
|
82
45
|
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
|
83
|
-
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
|
46
|
+
.def("__str__", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
|
|
84
47
|
"Produces a string summary of the sketch")
|
|
85
|
-
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
|
48
|
+
.def("to_string", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
|
|
86
49
|
"Produces a string summary of the sketch")
|
|
87
|
-
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
|
|
50
|
+
.def("update", (void (frequent_items_sketch<T, W, H, E>::*)(const T&, uint64_t)) &frequent_items_sketch<T, W, H, E>::update, py::arg("item"), py::arg("weight")=1,
|
|
88
51
|
"Updates the sketch with the given string and, optionally, a weight")
|
|
89
|
-
.def("
|
|
90
|
-
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
|
|
52
|
+
.def("merge", (void (frequent_items_sketch<T, W, H, E>::*)(const frequent_items_sketch<T, W, H, E>&)) &frequent_items_sketch<T, W, H, E>::merge,
|
|
91
53
|
"Merges the given sketch into this one")
|
|
92
|
-
.def("is_empty", &frequent_items_sketch<T>::is_empty,
|
|
54
|
+
.def("is_empty", &frequent_items_sketch<T, W, H, E>::is_empty,
|
|
93
55
|
"Returns True if the sketch is empty, otherwise False")
|
|
94
|
-
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
|
|
56
|
+
.def("get_num_active_items", &frequent_items_sketch<T, W, H, E>::get_num_active_items,
|
|
95
57
|
"Returns the number of active items in the sketch")
|
|
96
|
-
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
|
|
58
|
+
.def("get_total_weight", &frequent_items_sketch<T, W, H, E>::get_total_weight,
|
|
97
59
|
"Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
|
|
98
|
-
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
|
|
60
|
+
.def("get_estimate", &frequent_items_sketch<T, W, H, E>::get_estimate, py::arg("item"),
|
|
99
61
|
"Returns the estimate of the weight (frequency) of the given item.\n"
|
|
100
62
|
"Note: The true frequency of a item would be the sum of the counts as a result of the "
|
|
101
63
|
"two update functions.")
|
|
102
|
-
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
|
|
64
|
+
.def("get_lower_bound", &frequent_items_sketch<T, W, H, E>::get_lower_bound, py::arg("item"),
|
|
103
65
|
"Returns the guaranteed lower bound weight (frequency) of the given item.")
|
|
104
|
-
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
|
|
66
|
+
.def("get_upper_bound", &frequent_items_sketch<T, W, H, E>::get_upper_bound, py::arg("item"),
|
|
105
67
|
"Returns the guaranteed upper bound weight (frequency) of the given item.")
|
|
106
|
-
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
|
|
68
|
+
.def("get_sketch_epsilon", (double (frequent_items_sketch<T, W, H, E>::*)(void) const) &frequent_items_sketch<T, W, H, E>::get_epsilon,
|
|
107
69
|
"Returns the epsilon value used by the sketch to compute error")
|
|
108
|
-
.
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
70
|
+
.def(
|
|
71
|
+
"get_frequent_items",
|
|
72
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, frequent_items_error_type err_type, uint64_t threshold) {
|
|
73
|
+
if (threshold == 0) threshold = sk.get_maximum_error();
|
|
74
|
+
py::list list;
|
|
75
|
+
auto rows = sk.get_frequent_items(err_type, threshold);
|
|
76
|
+
for (auto row: rows) {
|
|
77
|
+
list.append(py::make_tuple(
|
|
78
|
+
row.get_item(),
|
|
79
|
+
row.get_estimate(),
|
|
80
|
+
row.get_lower_bound(),
|
|
81
|
+
row.get_upper_bound())
|
|
82
|
+
);
|
|
83
|
+
}
|
|
84
|
+
return list;
|
|
85
|
+
},
|
|
86
|
+
py::arg("err_type"), py::arg("threshold")=0
|
|
87
|
+
)
|
|
88
|
+
.def_static(
|
|
89
|
+
"get_epsilon_for_lg_size",
|
|
90
|
+
[](uint8_t lg_max_map_size) { return frequent_items_sketch<T, W, H, E>::get_epsilon(lg_max_map_size); },
|
|
91
|
+
py::arg("lg_max_map_size"),
|
|
92
|
+
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)"
|
|
93
|
+
)
|
|
94
|
+
.def_static(
|
|
95
|
+
"get_apriori_error",
|
|
96
|
+
&frequent_items_sketch<T, W, H, E>::get_apriori_error,
|
|
97
|
+
py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
|
98
|
+
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight."
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
// serialization may need a caller-provided serde depending on the sketch type, so
|
|
102
|
+
// we use a separate method to handle that appropriately based on type T.
|
|
103
|
+
add_serialization(fi_class);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// std::string or arithmetic types, for which we have a built-in serde
|
|
107
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type>
|
|
108
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
|
|
109
|
+
using namespace datasketches;
|
|
110
|
+
clazz.def(
|
|
111
|
+
"get_serialized_size_bytes",
|
|
112
|
+
[](const frequent_items_sketch<T, W, H, E>& sk) { return sk.get_serialized_size_bytes(); },
|
|
113
|
+
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at."
|
|
114
|
+
)
|
|
115
|
+
.def(
|
|
116
|
+
"serialize",
|
|
117
|
+
[](const frequent_items_sketch<T, W, H, E>& sk) {
|
|
118
|
+
auto bytes = sk.serialize();
|
|
119
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
120
|
+
},
|
|
121
|
+
"Serializes the sketch into a bytes object."
|
|
122
|
+
)
|
|
123
|
+
.def_static(
|
|
124
|
+
"deserialize",
|
|
125
|
+
[](const std::string& bytes) { return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size()); },
|
|
126
|
+
py::arg("bytes"),
|
|
127
|
+
"Reads a bytes object and returns the corresponding frequent_strings_sketch."
|
|
128
|
+
);
|
|
117
129
|
}
|
|
118
130
|
|
|
131
|
+
// py::object or any other type that requires a provided serde
|
|
132
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type>
|
|
133
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
|
|
134
|
+
using namespace datasketches;
|
|
135
|
+
clazz.def(
|
|
136
|
+
"get_serialized_size_bytes",
|
|
137
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) { return sk.get_serialized_size_bytes(serde); },
|
|
138
|
+
py::arg("serde"),
|
|
139
|
+
"Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at."
|
|
140
|
+
)
|
|
141
|
+
.def(
|
|
142
|
+
"serialize",
|
|
143
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) {
|
|
144
|
+
auto bytes = sk.serialize(0, serde);
|
|
145
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
146
|
+
}, py::arg("serde"),
|
|
147
|
+
"Serializes the sketch into a bytes object using the provided serde."
|
|
148
|
+
)
|
|
149
|
+
.def_static(
|
|
150
|
+
"deserialize",
|
|
151
|
+
[](const std::string& bytes, py_object_serde& serde) {
|
|
152
|
+
return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size(), serde);
|
|
153
|
+
}, py::arg("bytes"), py::arg("serde"),
|
|
154
|
+
"Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch."
|
|
155
|
+
);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// calls class __hash__ method
|
|
159
|
+
struct py_hash_caller {
|
|
160
|
+
size_t operator()(const py::object& a) const {
|
|
161
|
+
return py::hash(a);
|
|
162
|
+
}
|
|
163
|
+
};
|
|
164
|
+
|
|
165
|
+
// calls class __eq__ method
|
|
166
|
+
struct py_equal_caller {
|
|
167
|
+
bool operator()(const py::object& a, const py::object& b) const {
|
|
168
|
+
return a.equal(b);
|
|
169
|
+
}
|
|
170
|
+
};
|
|
171
|
+
|
|
119
172
|
void init_fi(py::module &m) {
|
|
120
173
|
using namespace datasketches;
|
|
121
174
|
|
|
@@ -124,5 +177,6 @@ void init_fi(py::module &m) {
|
|
|
124
177
|
.value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES)
|
|
125
178
|
.export_values();
|
|
126
179
|
|
|
127
|
-
bind_fi_sketch<std::string>(m, "frequent_strings_sketch");
|
|
180
|
+
bind_fi_sketch<std::string, uint64_t, std::hash<std::string>, std::equal_to<std::string>>(m, "frequent_strings_sketch");
|
|
181
|
+
bind_fi_sketch<py::object, uint64_t, py_hash_caller, py_equal_caller>(m, "frequent_items_sketch");
|
|
128
182
|
}
|
|
@@ -17,34 +17,11 @@
|
|
|
17
17
|
* under the License.
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
|
-
#include "hll.hpp"
|
|
21
|
-
|
|
22
20
|
#include <pybind11/pybind11.h>
|
|
23
21
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
namespace datasketches {
|
|
27
|
-
namespace python {
|
|
28
|
-
|
|
29
|
-
hll_sketch hll_sketch_deserialize(py::bytes skBytes) {
|
|
30
|
-
std::string skStr = skBytes; // implicit cast
|
|
31
|
-
return hll_sketch::deserialize(skStr.c_str(), skStr.length());
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
py::object hll_sketch_serialize_compact(const hll_sketch& sk) {
|
|
35
|
-
auto serResult = sk.serialize_compact();
|
|
36
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
py::object hll_sketch_serialize_updatable(const hll_sketch& sk) {
|
|
40
|
-
auto serResult = sk.serialize_updatable();
|
|
41
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
}
|
|
45
|
-
}
|
|
22
|
+
#include "hll.hpp"
|
|
46
23
|
|
|
47
|
-
namespace dspy = datasketches::python;
|
|
24
|
+
namespace py = pybind11;
|
|
48
25
|
|
|
49
26
|
void init_hll(py::module &m) {
|
|
50
27
|
using namespace datasketches;
|
|
@@ -59,12 +36,6 @@ void init_hll(py::module &m) {
|
|
|
59
36
|
.def(py::init<uint8_t>(), py::arg("lg_k"))
|
|
60
37
|
.def(py::init<uint8_t, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
|
|
61
38
|
.def(py::init<uint8_t, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
|
|
62
|
-
.def_static("deserialize", &dspy::hll_sketch_deserialize,
|
|
63
|
-
"Reads a bytes object and returns the corresponding hll_sketch")
|
|
64
|
-
.def("serialize_compact", &dspy::hll_sketch_serialize_compact,
|
|
65
|
-
"Serializes the sketch into a bytes object, compressiong the exception table if HLL_4")
|
|
66
|
-
.def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
|
|
67
|
-
"Serializes the sketch into a bytes object")
|
|
68
39
|
.def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
|
|
69
40
|
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
|
|
70
41
|
"Produces a string summary of the sketch")
|
|
@@ -88,7 +59,7 @@ void init_hll(py::module &m) {
|
|
|
88
59
|
.def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
|
|
89
60
|
"Returns the size of the serialized sketch when compressing the exception table if HLL_4")
|
|
90
61
|
.def("reset", &hll_sketch::reset,
|
|
91
|
-
"Resets the sketch to the empty state in coupon
|
|
62
|
+
"Resets the sketch to the empty state in coupon collection mode")
|
|
92
63
|
.def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
|
|
93
64
|
"Updates the sketch with the given integral value")
|
|
94
65
|
.def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
|
|
@@ -97,11 +68,32 @@ void init_hll(py::module &m) {
|
|
|
97
68
|
"Updates the sketch with the given string value")
|
|
98
69
|
.def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
|
|
99
70
|
py::arg("lg_k"), py::arg("tgt_type"),
|
|
100
|
-
"Provides a likely upper bound on serialization size for the given
|
|
71
|
+
"Provides a likely upper bound on serialization size for the given parameters")
|
|
101
72
|
.def_static("get_rel_err", &hll_sketch::get_rel_err,
|
|
102
73
|
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
|
|
103
|
-
"
|
|
104
|
-
|
|
74
|
+
"Returns the a priori relative error bound for the given parameters")
|
|
75
|
+
.def(
|
|
76
|
+
"serialize_compact",
|
|
77
|
+
[](const hll_sketch& sk) {
|
|
78
|
+
auto bytes = sk.serialize_compact();
|
|
79
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
80
|
+
},
|
|
81
|
+
"Serializes the sketch into a bytes object, compressing the exception table if HLL_4"
|
|
82
|
+
)
|
|
83
|
+
.def(
|
|
84
|
+
"serialize_updatable",
|
|
85
|
+
[](const hll_sketch& sk) {
|
|
86
|
+
auto bytes = sk.serialize_updatable();
|
|
87
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
|
88
|
+
},
|
|
89
|
+
"Serializes the sketch into a bytes object"
|
|
90
|
+
)
|
|
91
|
+
.def_static(
|
|
92
|
+
"deserialize",
|
|
93
|
+
[](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); },
|
|
94
|
+
py::arg("bytes"),
|
|
95
|
+
"Reads a bytes object and returns the corresponding hll_sketch"
|
|
96
|
+
);
|
|
105
97
|
|
|
106
98
|
py::class_<hll_union>(m, "hll_union")
|
|
107
99
|
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
|
@@ -129,6 +121,6 @@ void init_hll(py::module &m) {
|
|
|
129
121
|
"Updates the union with the given string value")
|
|
130
122
|
.def_static("get_rel_err", &hll_union::get_rel_err,
|
|
131
123
|
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
|
|
132
|
-
"
|
|
124
|
+
"Returns the a priori relative error bound for the given parameters")
|
|
133
125
|
;
|
|
134
126
|
}
|