datasketches 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -17,105 +17,158 @@
|
|
17
17
|
* under the License.
|
18
18
|
*/
|
19
19
|
|
20
|
+
|
21
|
+
#include "py_serde.hpp"
|
22
|
+
#include "py_object_ostream.hpp"
|
20
23
|
#include "frequent_items_sketch.hpp"
|
21
24
|
|
22
25
|
#include <pybind11/pybind11.h>
|
23
|
-
#include <sstream>
|
24
|
-
|
25
|
-
namespace py = pybind11;
|
26
|
-
|
27
|
-
namespace datasketches {
|
28
|
-
namespace python {
|
29
|
-
|
30
|
-
template<typename T>
|
31
|
-
frequent_items_sketch<T> fi_sketch_deserialize(py::bytes skBytes) {
|
32
|
-
std::string skStr = skBytes; // implicit cast
|
33
|
-
return frequent_items_sketch<T>::deserialize(skStr.c_str(), skStr.length());
|
34
|
-
}
|
35
26
|
|
36
|
-
|
37
|
-
py::object fi_sketch_serialize(const frequent_items_sketch<T>& sk) {
|
38
|
-
auto serResult = sk.serialize();
|
39
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
40
|
-
}
|
41
|
-
|
42
|
-
// maybe possible to disambiguate the static vs method get_epsilon calls, but
|
43
|
-
// this is easier for now
|
44
|
-
template<typename T>
|
45
|
-
double fi_sketch_get_generic_epsilon(uint8_t lg_max_map_size) {
|
46
|
-
return frequent_items_sketch<T>::get_epsilon(lg_max_map_size);
|
47
|
-
}
|
48
|
-
|
49
|
-
template<typename T>
|
50
|
-
py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
51
|
-
frequent_items_error_type err_type,
|
52
|
-
uint64_t threshold = 0) {
|
53
|
-
if (threshold == 0) { threshold = sk.get_maximum_error(); }
|
54
|
-
|
55
|
-
py::list list;
|
56
|
-
auto items = sk.get_frequent_items(err_type, threshold);
|
57
|
-
for (auto iter = items.begin(); iter != items.end(); ++iter) {
|
58
|
-
py::tuple t = py::make_tuple(iter->get_item(),
|
59
|
-
iter->get_estimate(),
|
60
|
-
iter->get_lower_bound(),
|
61
|
-
iter->get_upper_bound());
|
62
|
-
list.append(t);
|
63
|
-
}
|
64
|
-
return list;
|
65
|
-
}
|
27
|
+
#include <ostream>
|
66
28
|
|
67
|
-
|
68
|
-
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
69
|
-
return sk.get_serialized_size_bytes();
|
70
|
-
}
|
29
|
+
namespace py = pybind11;
|
71
30
|
|
72
|
-
|
73
|
-
|
31
|
+
// forward declarations
|
32
|
+
// std::string and arithmetic types, where we don't need a separate serde
|
33
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type = 0>
|
34
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
|
74
35
|
|
75
|
-
|
36
|
+
// py::object and other types where the caller must provide a serde
|
37
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type = 0>
|
38
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz);
|
76
39
|
|
77
|
-
template<typename T>
|
40
|
+
template<typename T, typename W, typename H, typename E>
|
78
41
|
void bind_fi_sketch(py::module &m, const char* name) {
|
79
42
|
using namespace datasketches;
|
80
43
|
|
81
|
-
py::class_<frequent_items_sketch<T>>(m, name)
|
44
|
+
auto fi_class = py::class_<frequent_items_sketch<T, W, H, E>>(m, name)
|
82
45
|
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
83
|
-
.def("__str__", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
46
|
+
.def("__str__", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
|
84
47
|
"Produces a string summary of the sketch")
|
85
|
-
.def("to_string", &frequent_items_sketch<T>::to_string, py::arg("print_items")=false,
|
48
|
+
.def("to_string", &frequent_items_sketch<T, W, H, E>::to_string, py::arg("print_items")=false,
|
86
49
|
"Produces a string summary of the sketch")
|
87
|
-
.def("update", (void (frequent_items_sketch<T>::*)(const T&, uint64_t)) &frequent_items_sketch<T>::update, py::arg("item"), py::arg("weight")=1,
|
50
|
+
.def("update", (void (frequent_items_sketch<T, W, H, E>::*)(const T&, uint64_t)) &frequent_items_sketch<T, W, H, E>::update, py::arg("item"), py::arg("weight")=1,
|
88
51
|
"Updates the sketch with the given string and, optionally, a weight")
|
89
|
-
.def("
|
90
|
-
.def("merge", (void (frequent_items_sketch<T>::*)(const frequent_items_sketch<T>&)) &frequent_items_sketch<T>::merge,
|
52
|
+
.def("merge", (void (frequent_items_sketch<T, W, H, E>::*)(const frequent_items_sketch<T, W, H, E>&)) &frequent_items_sketch<T, W, H, E>::merge,
|
91
53
|
"Merges the given sketch into this one")
|
92
|
-
.def("is_empty", &frequent_items_sketch<T>::is_empty,
|
54
|
+
.def("is_empty", &frequent_items_sketch<T, W, H, E>::is_empty,
|
93
55
|
"Returns True if the sketch is empty, otherwise False")
|
94
|
-
.def("get_num_active_items", &frequent_items_sketch<T>::get_num_active_items,
|
56
|
+
.def("get_num_active_items", &frequent_items_sketch<T, W, H, E>::get_num_active_items,
|
95
57
|
"Returns the number of active items in the sketch")
|
96
|
-
.def("get_total_weight", &frequent_items_sketch<T>::get_total_weight,
|
58
|
+
.def("get_total_weight", &frequent_items_sketch<T, W, H, E>::get_total_weight,
|
97
59
|
"Returns the sum of the weights (frequencies) in the stream seen so far by the sketch")
|
98
|
-
.def("get_estimate", &frequent_items_sketch<T>::get_estimate, py::arg("item"),
|
60
|
+
.def("get_estimate", &frequent_items_sketch<T, W, H, E>::get_estimate, py::arg("item"),
|
99
61
|
"Returns the estimate of the weight (frequency) of the given item.\n"
|
100
62
|
"Note: The true frequency of a item would be the sum of the counts as a result of the "
|
101
63
|
"two update functions.")
|
102
|
-
.def("get_lower_bound", &frequent_items_sketch<T>::get_lower_bound, py::arg("item"),
|
64
|
+
.def("get_lower_bound", &frequent_items_sketch<T, W, H, E>::get_lower_bound, py::arg("item"),
|
103
65
|
"Returns the guaranteed lower bound weight (frequency) of the given item.")
|
104
|
-
.def("get_upper_bound", &frequent_items_sketch<T>::get_upper_bound, py::arg("item"),
|
66
|
+
.def("get_upper_bound", &frequent_items_sketch<T, W, H, E>::get_upper_bound, py::arg("item"),
|
105
67
|
"Returns the guaranteed upper bound weight (frequency) of the given item.")
|
106
|
-
.def("get_sketch_epsilon", (double (frequent_items_sketch<T>::*)(void) const) &frequent_items_sketch<T>::get_epsilon,
|
68
|
+
.def("get_sketch_epsilon", (double (frequent_items_sketch<T, W, H, E>::*)(void) const) &frequent_items_sketch<T, W, H, E>::get_epsilon,
|
107
69
|
"Returns the epsilon value used by the sketch to compute error")
|
108
|
-
.
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
70
|
+
.def(
|
71
|
+
"get_frequent_items",
|
72
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, frequent_items_error_type err_type, uint64_t threshold) {
|
73
|
+
if (threshold == 0) threshold = sk.get_maximum_error();
|
74
|
+
py::list list;
|
75
|
+
auto rows = sk.get_frequent_items(err_type, threshold);
|
76
|
+
for (auto row: rows) {
|
77
|
+
list.append(py::make_tuple(
|
78
|
+
row.get_item(),
|
79
|
+
row.get_estimate(),
|
80
|
+
row.get_lower_bound(),
|
81
|
+
row.get_upper_bound())
|
82
|
+
);
|
83
|
+
}
|
84
|
+
return list;
|
85
|
+
},
|
86
|
+
py::arg("err_type"), py::arg("threshold")=0
|
87
|
+
)
|
88
|
+
.def_static(
|
89
|
+
"get_epsilon_for_lg_size",
|
90
|
+
[](uint8_t lg_max_map_size) { return frequent_items_sketch<T, W, H, E>::get_epsilon(lg_max_map_size); },
|
91
|
+
py::arg("lg_max_map_size"),
|
92
|
+
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)"
|
93
|
+
)
|
94
|
+
.def_static(
|
95
|
+
"get_apriori_error",
|
96
|
+
&frequent_items_sketch<T, W, H, E>::get_apriori_error,
|
97
|
+
py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
98
|
+
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight."
|
99
|
+
);
|
100
|
+
|
101
|
+
// serialization may need a caller-provided serde depending on the sketch type, so
|
102
|
+
// we use a separate method to handle that appropriately based on type T.
|
103
|
+
add_serialization(fi_class);
|
104
|
+
}
|
105
|
+
|
106
|
+
// std::string or arithmetic types, for which we have a built-in serde
|
107
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<std::is_arithmetic<T>::value || std::is_same<std::string, T>::value, bool>::type>
|
108
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
|
109
|
+
using namespace datasketches;
|
110
|
+
clazz.def(
|
111
|
+
"get_serialized_size_bytes",
|
112
|
+
[](const frequent_items_sketch<T, W, H, E>& sk) { return sk.get_serialized_size_bytes(); },
|
113
|
+
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at."
|
114
|
+
)
|
115
|
+
.def(
|
116
|
+
"serialize",
|
117
|
+
[](const frequent_items_sketch<T, W, H, E>& sk) {
|
118
|
+
auto bytes = sk.serialize();
|
119
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
120
|
+
},
|
121
|
+
"Serializes the sketch into a bytes object."
|
122
|
+
)
|
123
|
+
.def_static(
|
124
|
+
"deserialize",
|
125
|
+
[](const std::string& bytes) { return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size()); },
|
126
|
+
py::arg("bytes"),
|
127
|
+
"Reads a bytes object and returns the corresponding frequent_strings_sketch."
|
128
|
+
);
|
117
129
|
}
|
118
130
|
|
131
|
+
// py::object or any other type that requires a provided serde
|
132
|
+
template<typename T, typename W, typename H, typename E, typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<std::string, T>::value, bool>::type>
|
133
|
+
void add_serialization(py::class_<datasketches::frequent_items_sketch<T, W, H, E>>& clazz) {
|
134
|
+
using namespace datasketches;
|
135
|
+
clazz.def(
|
136
|
+
"get_serialized_size_bytes",
|
137
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) { return sk.get_serialized_size_bytes(serde); },
|
138
|
+
py::arg("serde"),
|
139
|
+
"Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at."
|
140
|
+
)
|
141
|
+
.def(
|
142
|
+
"serialize",
|
143
|
+
[](const frequent_items_sketch<T, W, H, E>& sk, py_object_serde& serde) {
|
144
|
+
auto bytes = sk.serialize(0, serde);
|
145
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
146
|
+
}, py::arg("serde"),
|
147
|
+
"Serializes the sketch into a bytes object using the provided serde."
|
148
|
+
)
|
149
|
+
.def_static(
|
150
|
+
"deserialize",
|
151
|
+
[](const std::string& bytes, py_object_serde& serde) {
|
152
|
+
return frequent_items_sketch<T, W, H, E>::deserialize(bytes.data(), bytes.size(), serde);
|
153
|
+
}, py::arg("bytes"), py::arg("serde"),
|
154
|
+
"Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch."
|
155
|
+
);
|
156
|
+
}
|
157
|
+
|
158
|
+
// calls class __hash__ method
|
159
|
+
struct py_hash_caller {
|
160
|
+
size_t operator()(const py::object& a) const {
|
161
|
+
return py::hash(a);
|
162
|
+
}
|
163
|
+
};
|
164
|
+
|
165
|
+
// calls class __eq__ method
|
166
|
+
struct py_equal_caller {
|
167
|
+
bool operator()(const py::object& a, const py::object& b) const {
|
168
|
+
return a.equal(b);
|
169
|
+
}
|
170
|
+
};
|
171
|
+
|
119
172
|
void init_fi(py::module &m) {
|
120
173
|
using namespace datasketches;
|
121
174
|
|
@@ -124,5 +177,6 @@ void init_fi(py::module &m) {
|
|
124
177
|
.value("NO_FALSE_NEGATIVES", NO_FALSE_NEGATIVES)
|
125
178
|
.export_values();
|
126
179
|
|
127
|
-
bind_fi_sketch<std::string
|
180
|
+
bind_fi_sketch<std::string, uint64_t, std::hash<std::string>, std::equal_to<std::string>>(m, "frequent_strings_sketch");
|
181
|
+
bind_fi_sketch<py::object, uint64_t, py_hash_caller, py_equal_caller>(m, "frequent_items_sketch");
|
128
182
|
}
|
@@ -17,34 +17,11 @@
|
|
17
17
|
* under the License.
|
18
18
|
*/
|
19
19
|
|
20
|
-
#include "hll.hpp"
|
21
|
-
|
22
20
|
#include <pybind11/pybind11.h>
|
23
21
|
|
24
|
-
|
25
|
-
|
26
|
-
namespace datasketches {
|
27
|
-
namespace python {
|
28
|
-
|
29
|
-
hll_sketch hll_sketch_deserialize(py::bytes skBytes) {
|
30
|
-
std::string skStr = skBytes; // implicit cast
|
31
|
-
return hll_sketch::deserialize(skStr.c_str(), skStr.length());
|
32
|
-
}
|
33
|
-
|
34
|
-
py::object hll_sketch_serialize_compact(const hll_sketch& sk) {
|
35
|
-
auto serResult = sk.serialize_compact();
|
36
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
37
|
-
}
|
38
|
-
|
39
|
-
py::object hll_sketch_serialize_updatable(const hll_sketch& sk) {
|
40
|
-
auto serResult = sk.serialize_updatable();
|
41
|
-
return py::bytes((char*)serResult.data(), serResult.size());
|
42
|
-
}
|
43
|
-
|
44
|
-
}
|
45
|
-
}
|
22
|
+
#include "hll.hpp"
|
46
23
|
|
47
|
-
namespace
|
24
|
+
namespace py = pybind11;
|
48
25
|
|
49
26
|
void init_hll(py::module &m) {
|
50
27
|
using namespace datasketches;
|
@@ -59,12 +36,6 @@ void init_hll(py::module &m) {
|
|
59
36
|
.def(py::init<uint8_t>(), py::arg("lg_k"))
|
60
37
|
.def(py::init<uint8_t, target_hll_type>(), py::arg("lg_k"), py::arg("tgt_type"))
|
61
38
|
.def(py::init<uint8_t, target_hll_type, bool>(), py::arg("lg_k"), py::arg("tgt_type"), py::arg("start_max_size")=false)
|
62
|
-
.def_static("deserialize", &dspy::hll_sketch_deserialize,
|
63
|
-
"Reads a bytes object and returns the corresponding hll_sketch")
|
64
|
-
.def("serialize_compact", &dspy::hll_sketch_serialize_compact,
|
65
|
-
"Serializes the sketch into a bytes object, compressiong the exception table if HLL_4")
|
66
|
-
.def("serialize_updatable", &dspy::hll_sketch_serialize_updatable,
|
67
|
-
"Serializes the sketch into a bytes object")
|
68
39
|
.def("__str__", (std::string (hll_sketch::*)(bool,bool,bool,bool) const) &hll_sketch::to_string,
|
69
40
|
py::arg("summary")=true, py::arg("detail")=false, py::arg("aux_detail")=false, py::arg("all")=false,
|
70
41
|
"Produces a string summary of the sketch")
|
@@ -88,7 +59,7 @@ void init_hll(py::module &m) {
|
|
88
59
|
.def("get_compact_serialization_bytes", &hll_sketch::get_compact_serialization_bytes,
|
89
60
|
"Returns the size of the serialized sketch when compressing the exception table if HLL_4")
|
90
61
|
.def("reset", &hll_sketch::reset,
|
91
|
-
"Resets the sketch to the empty state in coupon
|
62
|
+
"Resets the sketch to the empty state in coupon collection mode")
|
92
63
|
.def("update", (void (hll_sketch::*)(int64_t)) &hll_sketch::update, py::arg("datum"),
|
93
64
|
"Updates the sketch with the given integral value")
|
94
65
|
.def("update", (void (hll_sketch::*)(double)) &hll_sketch::update, py::arg("datum"),
|
@@ -97,11 +68,32 @@ void init_hll(py::module &m) {
|
|
97
68
|
"Updates the sketch with the given string value")
|
98
69
|
.def_static("get_max_updatable_serialization_bytes", &hll_sketch::get_max_updatable_serialization_bytes,
|
99
70
|
py::arg("lg_k"), py::arg("tgt_type"),
|
100
|
-
"Provides a likely upper bound on serialization size for the given
|
71
|
+
"Provides a likely upper bound on serialization size for the given parameters")
|
101
72
|
.def_static("get_rel_err", &hll_sketch::get_rel_err,
|
102
73
|
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
|
103
|
-
"
|
104
|
-
|
74
|
+
"Returns the a priori relative error bound for the given parameters")
|
75
|
+
.def(
|
76
|
+
"serialize_compact",
|
77
|
+
[](const hll_sketch& sk) {
|
78
|
+
auto bytes = sk.serialize_compact();
|
79
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
80
|
+
},
|
81
|
+
"Serializes the sketch into a bytes object, compressing the exception table if HLL_4"
|
82
|
+
)
|
83
|
+
.def(
|
84
|
+
"serialize_updatable",
|
85
|
+
[](const hll_sketch& sk) {
|
86
|
+
auto bytes = sk.serialize_updatable();
|
87
|
+
return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size());
|
88
|
+
},
|
89
|
+
"Serializes the sketch into a bytes object"
|
90
|
+
)
|
91
|
+
.def_static(
|
92
|
+
"deserialize",
|
93
|
+
[](const std::string& bytes) { return hll_sketch::deserialize(bytes.data(), bytes.size()); },
|
94
|
+
py::arg("bytes"),
|
95
|
+
"Reads a bytes object and returns the corresponding hll_sketch"
|
96
|
+
);
|
105
97
|
|
106
98
|
py::class_<hll_union>(m, "hll_union")
|
107
99
|
.def(py::init<uint8_t>(), py::arg("lg_max_k"))
|
@@ -129,6 +121,6 @@ void init_hll(py::module &m) {
|
|
129
121
|
"Updates the union with the given string value")
|
130
122
|
.def_static("get_rel_err", &hll_union::get_rel_err,
|
131
123
|
py::arg("upper_bound"), py::arg("unioned"), py::arg("lg_k"), py::arg("num_std_devs"),
|
132
|
-
"
|
124
|
+
"Returns the a priori relative error bound for the given parameters")
|
133
125
|
;
|
134
126
|
}
|