datasketches 0.2.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE +4 -6
- data/NOTICE +6 -5
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/LICENSE +4 -6
- data/vendor/datasketches-cpp/MANIFEST.in +21 -4
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/{test/test_runner.cpp → include/version.hpp.in} +15 -8
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +37 -7
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +17 -10
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +55 -42
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +4 -4
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +197 -233
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +42 -32
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +17 -13
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +1 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +19 -1
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +20 -19
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +241 -233
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +117 -104
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +3 -3
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +5 -5
- data/vendor/datasketches-cpp/setup.py +14 -3
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +3 -2
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +41 -35
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +27 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -7
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
- data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
@@ -36,18 +36,14 @@ namespace vector_of_kll_constants {
|
|
36
36
|
}
|
37
37
|
|
38
38
|
// Wrapper class for Numpy compatibility
|
39
|
-
template <typename T, typename C = std::less<T
|
39
|
+
template <typename T, typename C = std::less<T>>
|
40
40
|
class vector_of_kll_sketches {
|
41
41
|
public:
|
42
|
-
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
43
|
-
static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
|
44
|
-
static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
|
45
|
-
|
46
42
|
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
47
43
|
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
48
44
|
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
49
|
-
vector_of_kll_sketches<T,C
|
50
|
-
vector_of_kll_sketches<T,C
|
45
|
+
vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
|
46
|
+
vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
|
51
47
|
|
52
48
|
// container parameters
|
53
49
|
inline uint32_t get_k() const;
|
@@ -58,7 +54,7 @@ class vector_of_kll_sketches {
|
|
58
54
|
void merge(const vector_of_kll_sketches<T>& other);
|
59
55
|
|
60
56
|
// returns a single sketch combining all data in the array
|
61
|
-
kll_sketch<T,C
|
57
|
+
kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
|
62
58
|
|
63
59
|
// sketch queries returning an array of results
|
64
60
|
py::array is_empty() const;
|
@@ -67,7 +63,7 @@ class vector_of_kll_sketches {
|
|
67
63
|
py::array get_min_values() const;
|
68
64
|
py::array get_max_values() const;
|
69
65
|
py::array get_num_retained() const;
|
70
|
-
py::array get_quantiles(const py::array_t<double>&
|
66
|
+
py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
|
71
67
|
py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
|
72
68
|
py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
73
69
|
py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
@@ -86,11 +82,11 @@ class vector_of_kll_sketches {
|
|
86
82
|
|
87
83
|
const uint32_t k_; // kll sketch k parameter
|
88
84
|
const uint32_t d_; // number of dimensions (here: sketches) to hold
|
89
|
-
std::vector<kll_sketch<T,C
|
85
|
+
std::vector<kll_sketch<T, C>> sketches_;
|
90
86
|
};
|
91
87
|
|
92
|
-
template<typename T, typename C
|
93
|
-
vector_of_kll_sketches<T,C
|
88
|
+
template<typename T, typename C>
|
89
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
|
94
90
|
k_(k),
|
95
91
|
d_(d)
|
96
92
|
{
|
@@ -106,49 +102,49 @@ d_(d)
|
|
106
102
|
}
|
107
103
|
}
|
108
104
|
|
109
|
-
template<typename T, typename C
|
110
|
-
vector_of_kll_sketches<T,C
|
105
|
+
template<typename T, typename C>
|
106
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
|
111
107
|
k_(other.k_),
|
112
108
|
d_(other.d_),
|
113
109
|
sketches_(other.sketches_)
|
114
110
|
{}
|
115
111
|
|
116
|
-
template<typename T, typename C
|
117
|
-
vector_of_kll_sketches<T,C
|
112
|
+
template<typename T, typename C>
|
113
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
|
118
114
|
k_(other.k_),
|
119
115
|
d_(other.d_),
|
120
116
|
sketches_(std::move(other.sketches_))
|
121
117
|
{}
|
122
118
|
|
123
|
-
template<typename T, typename C
|
124
|
-
vector_of_kll_sketches<T,C
|
125
|
-
vector_of_kll_sketches<T,C
|
119
|
+
template<typename T, typename C>
|
120
|
+
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
|
121
|
+
vector_of_kll_sketches<T, C> copy(other);
|
126
122
|
k_ = copy.k_;
|
127
123
|
d_ = copy.d_;
|
128
124
|
std::swap(sketches_, copy.sketches_);
|
129
125
|
return *this;
|
130
126
|
}
|
131
127
|
|
132
|
-
template<typename T, typename C
|
133
|
-
vector_of_kll_sketches<T,C
|
128
|
+
template<typename T, typename C>
|
129
|
+
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
|
134
130
|
k_ = other.k_;
|
135
131
|
d_ = other.d_;
|
136
132
|
std::swap(sketches_, other.sketches_);
|
137
133
|
return *this;
|
138
134
|
}
|
139
135
|
|
140
|
-
template<typename T, typename C
|
141
|
-
uint32_t vector_of_kll_sketches<T,C
|
136
|
+
template<typename T, typename C>
|
137
|
+
uint32_t vector_of_kll_sketches<T, C>::get_k() const {
|
142
138
|
return k_;
|
143
139
|
}
|
144
140
|
|
145
|
-
template<typename T, typename C
|
146
|
-
uint32_t vector_of_kll_sketches<T,C
|
141
|
+
template<typename T, typename C>
|
142
|
+
uint32_t vector_of_kll_sketches<T, C>::get_d() const {
|
147
143
|
return d_;
|
148
144
|
}
|
149
145
|
|
150
|
-
template<typename T, typename C
|
151
|
-
std::vector<uint32_t> vector_of_kll_sketches<T,C
|
146
|
+
template<typename T, typename C>
|
147
|
+
std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
|
152
148
|
std::vector<uint32_t> indices;
|
153
149
|
if (isk.size() == 1) {
|
154
150
|
auto data = isk.unchecked();
|
@@ -177,8 +173,8 @@ std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array
|
|
177
173
|
}
|
178
174
|
|
179
175
|
// Checks if each sketch is empty or not
|
180
|
-
template<typename T, typename C
|
181
|
-
py::array vector_of_kll_sketches<T,C
|
176
|
+
template<typename T, typename C>
|
177
|
+
py::array vector_of_kll_sketches<T, C>::is_empty() const {
|
182
178
|
std::vector<bool> vals(d_);
|
183
179
|
for (uint32_t i = 0; i < d_; ++i) {
|
184
180
|
vals[i] = sketches_[i].is_empty();
|
@@ -190,8 +186,8 @@ py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
|
|
190
186
|
// Updates each sketch with values
|
191
187
|
// Currently: all values must be present
|
192
188
|
// TODO: allow subsets of sketches to be updated
|
193
|
-
template<typename T, typename C
|
194
|
-
void vector_of_kll_sketches<T,C
|
189
|
+
template<typename T, typename C>
|
190
|
+
void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
|
195
191
|
|
196
192
|
size_t ndim = items.ndim();
|
197
193
|
|
@@ -231,8 +227,8 @@ void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
|
|
231
227
|
|
232
228
|
// Merges two arrays of sketches
|
233
229
|
// Currently: all values must be present
|
234
|
-
template<typename T, typename C
|
235
|
-
void vector_of_kll_sketches<T,C
|
230
|
+
template<typename T, typename C>
|
231
|
+
void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
|
236
232
|
if (d_ != other.get_d()) {
|
237
233
|
throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
|
238
234
|
+ " vs " + std::to_string(other.d_));
|
@@ -243,11 +239,11 @@ void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other
|
|
243
239
|
}
|
244
240
|
}
|
245
241
|
|
246
|
-
template<typename T, typename C
|
247
|
-
kll_sketch<T,C
|
242
|
+
template<typename T, typename C>
|
243
|
+
kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
|
248
244
|
std::vector<uint32_t> inds = get_indices(isk);
|
249
245
|
|
250
|
-
kll_sketch<T,C
|
246
|
+
kll_sketch<T, C> result(k_);
|
251
247
|
for (auto& idx : inds) {
|
252
248
|
result.merge(sketches_[idx]);
|
253
249
|
}
|
@@ -255,8 +251,8 @@ kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>
|
|
255
251
|
}
|
256
252
|
|
257
253
|
// Number of updates for each sketch
|
258
|
-
template<typename T, typename C
|
259
|
-
py::array vector_of_kll_sketches<T,C
|
254
|
+
template<typename T, typename C>
|
255
|
+
py::array vector_of_kll_sketches<T, C>::get_n() const {
|
260
256
|
std::vector<uint64_t> vals(d_);
|
261
257
|
for (uint32_t i = 0; i < d_; ++i) {
|
262
258
|
vals[i] = sketches_[i].get_n();
|
@@ -265,8 +261,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_n() const {
|
|
265
261
|
}
|
266
262
|
|
267
263
|
// Number of retained values for each sketch
|
268
|
-
template<typename T, typename C
|
269
|
-
py::array vector_of_kll_sketches<T,C
|
264
|
+
template<typename T, typename C>
|
265
|
+
py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
|
270
266
|
std::vector<uint32_t> vals(d_);
|
271
267
|
for (uint32_t i = 0; i < d_; ++i) {
|
272
268
|
vals[i] = sketches_[i].get_num_retained();
|
@@ -276,22 +272,22 @@ py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
|
|
276
272
|
|
277
273
|
// Gets the minimum value of each sketch
|
278
274
|
// TODO: allow subsets of sketches
|
279
|
-
template<typename T, typename C
|
280
|
-
py::array vector_of_kll_sketches<T,C
|
275
|
+
template<typename T, typename C>
|
276
|
+
py::array vector_of_kll_sketches<T, C>::get_min_values() const {
|
281
277
|
std::vector<T> vals(d_);
|
282
278
|
for (uint32_t i = 0; i < d_; ++i) {
|
283
|
-
vals[i] = sketches_[i].
|
279
|
+
vals[i] = sketches_[i].get_min_item();
|
284
280
|
}
|
285
281
|
return py::cast(vals);
|
286
282
|
}
|
287
283
|
|
288
284
|
// Gets the maximum value of each sketch
|
289
285
|
// TODO: allow subsets of sketches
|
290
|
-
template<typename T, typename C
|
291
|
-
py::array vector_of_kll_sketches<T,C
|
286
|
+
template<typename T, typename C>
|
287
|
+
py::array vector_of_kll_sketches<T, C>::get_max_values() const {
|
292
288
|
std::vector<T> vals(d_);
|
293
289
|
for (uint32_t i = 0; i < d_; ++i) {
|
294
|
-
vals[i] = sketches_[i].
|
290
|
+
vals[i] = sketches_[i].get_max_item();
|
295
291
|
}
|
296
292
|
return py::cast(vals);
|
297
293
|
}
|
@@ -299,8 +295,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
|
|
299
295
|
// Summary of each sketch as one long string
|
300
296
|
// Users should use .split('\n\n') when calling it to build a list of each
|
301
297
|
// sketch's summary
|
302
|
-
template<typename T, typename C
|
303
|
-
std::string vector_of_kll_sketches<T,C
|
298
|
+
template<typename T, typename C>
|
299
|
+
std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
|
304
300
|
std::ostringstream ss;
|
305
301
|
for (uint32_t i = 0; i < d_; ++i) {
|
306
302
|
// all streams into 1 string, for compatibility with Python's str() behavior
|
@@ -311,8 +307,8 @@ std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool pri
|
|
311
307
|
return ss.str();
|
312
308
|
}
|
313
309
|
|
314
|
-
template<typename T, typename C
|
315
|
-
py::array vector_of_kll_sketches<T,C
|
310
|
+
template<typename T, typename C>
|
311
|
+
py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
|
316
312
|
std::vector<bool> vals(d_);
|
317
313
|
for (uint32_t i = 0; i < d_; ++i) {
|
318
314
|
vals[i] = sketches_[i].is_estimation_mode();
|
@@ -321,18 +317,17 @@ py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
|
|
321
317
|
}
|
322
318
|
|
323
319
|
// Value of sketch(es) corresponding to some quantile(s)
|
324
|
-
template<typename T, typename C
|
325
|
-
py::array vector_of_kll_sketches<T,C
|
320
|
+
template<typename T, typename C>
|
321
|
+
py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
|
326
322
|
const py::array_t<int>& isk) const {
|
327
323
|
std::vector<uint32_t> inds = get_indices(isk);
|
328
324
|
size_t num_sketches = inds.size();
|
329
|
-
size_t num_quantiles =
|
325
|
+
size_t num_quantiles = ranks.size();
|
330
326
|
|
331
327
|
std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
|
332
328
|
for (uint32_t i = 0; i < num_sketches; ++i) {
|
333
|
-
auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
|
334
329
|
for (size_t j = 0; j < num_quantiles; ++j) {
|
335
|
-
quants[i][j] =
|
330
|
+
quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
|
336
331
|
}
|
337
332
|
}
|
338
333
|
|
@@ -340,8 +335,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>
|
|
340
335
|
}
|
341
336
|
|
342
337
|
// Value of sketch(es) corresponding to some rank(s)
|
343
|
-
template<typename T, typename C
|
344
|
-
py::array vector_of_kll_sketches<T,C
|
338
|
+
template<typename T, typename C>
|
339
|
+
py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
|
345
340
|
const py::array_t<int>& isk) const {
|
346
341
|
std::vector<uint32_t> inds = get_indices(isk);
|
347
342
|
size_t num_sketches = inds.size();
|
@@ -359,8 +354,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
|
|
359
354
|
}
|
360
355
|
|
361
356
|
// PMF(s) of sketch(es)
|
362
|
-
template<typename T, typename C
|
363
|
-
py::array vector_of_kll_sketches<T,C
|
357
|
+
template<typename T, typename C>
|
358
|
+
py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
|
364
359
|
const py::array_t<int>& isk) const {
|
365
360
|
std::vector<uint32_t> inds = get_indices(isk);
|
366
361
|
size_t num_sketches = inds.size();
|
@@ -378,8 +373,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_poi
|
|
378
373
|
}
|
379
374
|
|
380
375
|
// CDF(s) of sketch(es)
|
381
|
-
template<typename T, typename C
|
382
|
-
py::array vector_of_kll_sketches<T,C
|
376
|
+
template<typename T, typename C>
|
377
|
+
py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
|
383
378
|
const py::array_t<int>& isk) const {
|
384
379
|
std::vector<uint32_t> inds = get_indices(isk);
|
385
380
|
size_t num_sketches = inds.size();
|
@@ -396,8 +391,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_poi
|
|
396
391
|
return py::cast(cdfs);
|
397
392
|
}
|
398
393
|
|
399
|
-
template<typename T, typename C
|
400
|
-
void vector_of_kll_sketches<T,C
|
394
|
+
template<typename T, typename C>
|
395
|
+
void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
|
401
396
|
uint32_t idx) {
|
402
397
|
if (idx >= d_) {
|
403
398
|
throw std::invalid_argument("request for invalid dimenions >= d ("
|
@@ -408,8 +403,8 @@ void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
|
|
408
403
|
sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
|
409
404
|
}
|
410
405
|
|
411
|
-
template<typename T, typename C
|
412
|
-
py::list vector_of_kll_sketches<T,C
|
406
|
+
template<typename T, typename C>
|
407
|
+
py::list vector_of_kll_sketches<T, C>::serialize(py::array_t<uint32_t>& isk) {
|
413
408
|
std::vector<uint32_t> inds = get_indices(isk);
|
414
409
|
const size_t num_sketches = inds.size();
|
415
410
|
|
@@ -466,9 +461,9 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
466
461
|
"Returns the minimum value(s) of the sketch(es)")
|
467
462
|
.def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
|
468
463
|
"Returns the maximum value(s) of the sketch(es)")
|
469
|
-
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("
|
464
|
+
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
|
470
465
|
py::arg("isk")=-1,
|
471
|
-
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `
|
466
|
+
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
472
467
|
.def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
|
473
468
|
py::arg("isk")=-1,
|
474
469
|
"Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
@@ -19,16 +19,50 @@
|
|
19
19
|
|
20
20
|
#include "var_opt_sketch.hpp"
|
21
21
|
#include "var_opt_union.hpp"
|
22
|
+
#include "py_serde.hpp"
|
22
23
|
|
23
24
|
#include <pybind11/pybind11.h>
|
24
|
-
#include <pybind11/functional.h>
|
25
|
-
#include <sstream>
|
26
25
|
|
27
26
|
namespace py = pybind11;
|
28
27
|
|
29
28
|
namespace datasketches {
|
29
|
+
|
30
30
|
namespace python {
|
31
31
|
|
32
|
+
template<typename T>
|
33
|
+
var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
|
34
|
+
std::string skStr = skBytes; // implicit cast
|
35
|
+
return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
|
36
|
+
}
|
37
|
+
|
38
|
+
template<typename T>
|
39
|
+
py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
40
|
+
auto serResult = sk.serialize(0, sd);
|
41
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
42
|
+
}
|
43
|
+
|
44
|
+
template<typename T>
|
45
|
+
size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
46
|
+
return sk.get_serialized_size_bytes(sd);
|
47
|
+
}
|
48
|
+
|
49
|
+
template<typename T>
|
50
|
+
var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
|
51
|
+
std::string uStr = uBytes; // implicit cast
|
52
|
+
return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
|
53
|
+
}
|
54
|
+
|
55
|
+
template<typename T>
|
56
|
+
py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
|
57
|
+
auto serResult = u.serialize(0, sd);
|
58
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
59
|
+
}
|
60
|
+
|
61
|
+
template<typename T>
|
62
|
+
size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
|
63
|
+
return u.get_serialized_size_bytes(sd);
|
64
|
+
}
|
65
|
+
|
32
66
|
template<typename T>
|
33
67
|
py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
|
34
68
|
py::list list;
|
@@ -63,7 +97,6 @@ std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
|
|
63
97
|
// using internal str() method then casting to C++ std::string
|
64
98
|
py::str item_pystr(item.first);
|
65
99
|
std::string item_str = py::cast<std::string>(item_pystr);
|
66
|
-
// item.second is guaranteed to be a double
|
67
100
|
ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
|
68
101
|
}
|
69
102
|
return ss.str();
|
@@ -96,17 +129,17 @@ void bind_vo_sketch(py::module &m, const char* name) {
|
|
96
129
|
.def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
|
97
130
|
"Returns the number of samples currently in the sketch")
|
98
131
|
.def("get_samples", &dspy::vo_sketch_get_samples<T>,
|
99
|
-
"
|
132
|
+
"Returns the set of samples in the sketch")
|
100
133
|
.def("is_empty", &var_opt_sketch<T>::is_empty,
|
101
134
|
"Returns True if the sketch is empty, otherwise False")
|
102
135
|
.def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
|
103
136
|
"Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
|
104
137
|
"as upper and lower bounds on the estimate and the total weight processed by the sketch")
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
138
|
+
.def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
|
139
|
+
"Computes the size in bytes needed to serialize the current sketch")
|
140
|
+
.def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
|
141
|
+
.def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
142
|
+
"Constructs a var opt sketch from the given bytes using the provided serde")
|
110
143
|
;
|
111
144
|
}
|
112
145
|
|
@@ -126,11 +159,11 @@ void bind_vo_union(py::module &m, const char* name) {
|
|
126
159
|
"Returns a sketch corresponding to the union result")
|
127
160
|
.def("reset", &var_opt_union<T>::reset,
|
128
161
|
"Resets the union to the empty state")
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
162
|
+
.def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
|
163
|
+
"Computes the size in bytes needed to serialize the current sketch")
|
164
|
+
.def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
|
165
|
+
.def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
166
|
+
"Constructs a var opt union from the given bytes using the provided serde")
|
134
167
|
;
|
135
168
|
}
|
136
169
|
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
@@ -50,7 +50,7 @@ class reqTest(unittest.TestCase):
|
|
50
50
|
pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
|
51
51
|
cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
|
52
52
|
self.assertEqual(len(cdf), len(pts)+1)
|
53
|
-
|
53
|
+
|
54
54
|
# For relative error quantiles, the error depends on the actual rank
|
55
55
|
# so we need to use that to determine the bounds
|
56
56
|
est = req.get_rank(0.999, True)
|
@@ -16,7 +16,7 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import var_opt_sketch, var_opt_union
|
19
|
+
from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe
|
20
20
|
|
21
21
|
class VoTest(unittest.TestCase):
|
22
22
|
def test_vo_example(self):
|
@@ -97,5 +97,29 @@ class VoTest(unittest.TestCase):
|
|
97
97
|
# calls to __str__() with parameters.
|
98
98
|
print(result.to_string(True))
|
99
99
|
|
100
|
+
# finally, we can serialize the sketch by providing an
|
101
|
+
# appropriate serde class.
|
102
|
+
expected_size = result.get_serialized_size_bytes(PyIntsSerDe())
|
103
|
+
b = result.serialize(PyIntsSerDe())
|
104
|
+
self.assertEqual(expected_size, len(b))
|
105
|
+
|
106
|
+
# if we try to deserialize with the wrong serde, things break
|
107
|
+
try:
|
108
|
+
var_opt_sketch.deserialize(b, PyStringsSerDe())
|
109
|
+
self.fail()
|
110
|
+
except:
|
111
|
+
# expected; do nothing
|
112
|
+
self.assertTrue(True)
|
113
|
+
|
114
|
+
# using the correct serde gives us back a copy of the original
|
115
|
+
rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe())
|
116
|
+
self.assertEqual(result.k, rebuilt.k)
|
117
|
+
self.assertEqual(result.num_samples, rebuilt.num_samples)
|
118
|
+
self.assertEqual(result.n, rebuilt.n)
|
119
|
+
summary1 = result.estimate_subset_sum(geq_zero)
|
120
|
+
summary2 = rebuilt.estimate_subset_sum(geq_zero)
|
121
|
+
self.assertEqual(summary1['estimate'], summary2['estimate'])
|
122
|
+
self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight'])
|
123
|
+
|
100
124
|
if __name__ == '__main__':
|
101
125
|
unittest.main()
|