datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -36,18 +36,14 @@ namespace vector_of_kll_constants {
|
|
|
36
36
|
}
|
|
37
37
|
|
|
38
38
|
// Wrapper class for Numpy compatibility
|
|
39
|
-
template <typename T, typename C = std::less<T
|
|
39
|
+
template <typename T, typename C = std::less<T>>
|
|
40
40
|
class vector_of_kll_sketches {
|
|
41
41
|
public:
|
|
42
|
-
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
43
|
-
static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
|
|
44
|
-
static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
|
|
45
|
-
|
|
46
42
|
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
|
47
43
|
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
|
48
44
|
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
|
49
|
-
vector_of_kll_sketches<T,C
|
|
50
|
-
vector_of_kll_sketches<T,C
|
|
45
|
+
vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
|
|
46
|
+
vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
|
|
51
47
|
|
|
52
48
|
// container parameters
|
|
53
49
|
inline uint32_t get_k() const;
|
|
@@ -58,7 +54,7 @@ class vector_of_kll_sketches {
|
|
|
58
54
|
void merge(const vector_of_kll_sketches<T>& other);
|
|
59
55
|
|
|
60
56
|
// returns a single sketch combining all data in the array
|
|
61
|
-
kll_sketch<T,C
|
|
57
|
+
kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
|
|
62
58
|
|
|
63
59
|
// sketch queries returning an array of results
|
|
64
60
|
py::array is_empty() const;
|
|
@@ -67,7 +63,7 @@ class vector_of_kll_sketches {
|
|
|
67
63
|
py::array get_min_values() const;
|
|
68
64
|
py::array get_max_values() const;
|
|
69
65
|
py::array get_num_retained() const;
|
|
70
|
-
py::array get_quantiles(const py::array_t<double>&
|
|
66
|
+
py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
|
|
71
67
|
py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
|
|
72
68
|
py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
|
73
69
|
py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
|
|
@@ -86,11 +82,11 @@ class vector_of_kll_sketches {
|
|
|
86
82
|
|
|
87
83
|
const uint32_t k_; // kll sketch k parameter
|
|
88
84
|
const uint32_t d_; // number of dimensions (here: sketches) to hold
|
|
89
|
-
std::vector<kll_sketch<T,C
|
|
85
|
+
std::vector<kll_sketch<T, C>> sketches_;
|
|
90
86
|
};
|
|
91
87
|
|
|
92
|
-
template<typename T, typename C
|
|
93
|
-
vector_of_kll_sketches<T,C
|
|
88
|
+
template<typename T, typename C>
|
|
89
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
|
|
94
90
|
k_(k),
|
|
95
91
|
d_(d)
|
|
96
92
|
{
|
|
@@ -106,49 +102,49 @@ d_(d)
|
|
|
106
102
|
}
|
|
107
103
|
}
|
|
108
104
|
|
|
109
|
-
template<typename T, typename C
|
|
110
|
-
vector_of_kll_sketches<T,C
|
|
105
|
+
template<typename T, typename C>
|
|
106
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
|
|
111
107
|
k_(other.k_),
|
|
112
108
|
d_(other.d_),
|
|
113
109
|
sketches_(other.sketches_)
|
|
114
110
|
{}
|
|
115
111
|
|
|
116
|
-
template<typename T, typename C
|
|
117
|
-
vector_of_kll_sketches<T,C
|
|
112
|
+
template<typename T, typename C>
|
|
113
|
+
vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
|
|
118
114
|
k_(other.k_),
|
|
119
115
|
d_(other.d_),
|
|
120
116
|
sketches_(std::move(other.sketches_))
|
|
121
117
|
{}
|
|
122
118
|
|
|
123
|
-
template<typename T, typename C
|
|
124
|
-
vector_of_kll_sketches<T,C
|
|
125
|
-
vector_of_kll_sketches<T,C
|
|
119
|
+
template<typename T, typename C>
|
|
120
|
+
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
|
|
121
|
+
vector_of_kll_sketches<T, C> copy(other);
|
|
126
122
|
k_ = copy.k_;
|
|
127
123
|
d_ = copy.d_;
|
|
128
124
|
std::swap(sketches_, copy.sketches_);
|
|
129
125
|
return *this;
|
|
130
126
|
}
|
|
131
127
|
|
|
132
|
-
template<typename T, typename C
|
|
133
|
-
vector_of_kll_sketches<T,C
|
|
128
|
+
template<typename T, typename C>
|
|
129
|
+
vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
|
|
134
130
|
k_ = other.k_;
|
|
135
131
|
d_ = other.d_;
|
|
136
132
|
std::swap(sketches_, other.sketches_);
|
|
137
133
|
return *this;
|
|
138
134
|
}
|
|
139
135
|
|
|
140
|
-
template<typename T, typename C
|
|
141
|
-
uint32_t vector_of_kll_sketches<T,C
|
|
136
|
+
template<typename T, typename C>
|
|
137
|
+
uint32_t vector_of_kll_sketches<T, C>::get_k() const {
|
|
142
138
|
return k_;
|
|
143
139
|
}
|
|
144
140
|
|
|
145
|
-
template<typename T, typename C
|
|
146
|
-
uint32_t vector_of_kll_sketches<T,C
|
|
141
|
+
template<typename T, typename C>
|
|
142
|
+
uint32_t vector_of_kll_sketches<T, C>::get_d() const {
|
|
147
143
|
return d_;
|
|
148
144
|
}
|
|
149
145
|
|
|
150
|
-
template<typename T, typename C
|
|
151
|
-
std::vector<uint32_t> vector_of_kll_sketches<T,C
|
|
146
|
+
template<typename T, typename C>
|
|
147
|
+
std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
|
|
152
148
|
std::vector<uint32_t> indices;
|
|
153
149
|
if (isk.size() == 1) {
|
|
154
150
|
auto data = isk.unchecked();
|
|
@@ -177,8 +173,8 @@ std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array
|
|
|
177
173
|
}
|
|
178
174
|
|
|
179
175
|
// Checks if each sketch is empty or not
|
|
180
|
-
template<typename T, typename C
|
|
181
|
-
py::array vector_of_kll_sketches<T,C
|
|
176
|
+
template<typename T, typename C>
|
|
177
|
+
py::array vector_of_kll_sketches<T, C>::is_empty() const {
|
|
182
178
|
std::vector<bool> vals(d_);
|
|
183
179
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
184
180
|
vals[i] = sketches_[i].is_empty();
|
|
@@ -190,8 +186,8 @@ py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
|
|
|
190
186
|
// Updates each sketch with values
|
|
191
187
|
// Currently: all values must be present
|
|
192
188
|
// TODO: allow subsets of sketches to be updated
|
|
193
|
-
template<typename T, typename C
|
|
194
|
-
void vector_of_kll_sketches<T,C
|
|
189
|
+
template<typename T, typename C>
|
|
190
|
+
void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
|
|
195
191
|
|
|
196
192
|
size_t ndim = items.ndim();
|
|
197
193
|
|
|
@@ -231,8 +227,8 @@ void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
|
|
|
231
227
|
|
|
232
228
|
// Merges two arrays of sketches
|
|
233
229
|
// Currently: all values must be present
|
|
234
|
-
template<typename T, typename C
|
|
235
|
-
void vector_of_kll_sketches<T,C
|
|
230
|
+
template<typename T, typename C>
|
|
231
|
+
void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
|
|
236
232
|
if (d_ != other.get_d()) {
|
|
237
233
|
throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
|
|
238
234
|
+ " vs " + std::to_string(other.d_));
|
|
@@ -243,11 +239,11 @@ void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other
|
|
|
243
239
|
}
|
|
244
240
|
}
|
|
245
241
|
|
|
246
|
-
template<typename T, typename C
|
|
247
|
-
kll_sketch<T,C
|
|
242
|
+
template<typename T, typename C>
|
|
243
|
+
kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
|
|
248
244
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
249
245
|
|
|
250
|
-
kll_sketch<T,C
|
|
246
|
+
kll_sketch<T, C> result(k_);
|
|
251
247
|
for (auto& idx : inds) {
|
|
252
248
|
result.merge(sketches_[idx]);
|
|
253
249
|
}
|
|
@@ -255,8 +251,8 @@ kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>
|
|
|
255
251
|
}
|
|
256
252
|
|
|
257
253
|
// Number of updates for each sketch
|
|
258
|
-
template<typename T, typename C
|
|
259
|
-
py::array vector_of_kll_sketches<T,C
|
|
254
|
+
template<typename T, typename C>
|
|
255
|
+
py::array vector_of_kll_sketches<T, C>::get_n() const {
|
|
260
256
|
std::vector<uint64_t> vals(d_);
|
|
261
257
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
262
258
|
vals[i] = sketches_[i].get_n();
|
|
@@ -265,8 +261,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_n() const {
|
|
|
265
261
|
}
|
|
266
262
|
|
|
267
263
|
// Number of retained values for each sketch
|
|
268
|
-
template<typename T, typename C
|
|
269
|
-
py::array vector_of_kll_sketches<T,C
|
|
264
|
+
template<typename T, typename C>
|
|
265
|
+
py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
|
|
270
266
|
std::vector<uint32_t> vals(d_);
|
|
271
267
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
272
268
|
vals[i] = sketches_[i].get_num_retained();
|
|
@@ -276,22 +272,22 @@ py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
|
|
|
276
272
|
|
|
277
273
|
// Gets the minimum value of each sketch
|
|
278
274
|
// TODO: allow subsets of sketches
|
|
279
|
-
template<typename T, typename C
|
|
280
|
-
py::array vector_of_kll_sketches<T,C
|
|
275
|
+
template<typename T, typename C>
|
|
276
|
+
py::array vector_of_kll_sketches<T, C>::get_min_values() const {
|
|
281
277
|
std::vector<T> vals(d_);
|
|
282
278
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
283
|
-
vals[i] = sketches_[i].
|
|
279
|
+
vals[i] = sketches_[i].get_min_item();
|
|
284
280
|
}
|
|
285
281
|
return py::cast(vals);
|
|
286
282
|
}
|
|
287
283
|
|
|
288
284
|
// Gets the maximum value of each sketch
|
|
289
285
|
// TODO: allow subsets of sketches
|
|
290
|
-
template<typename T, typename C
|
|
291
|
-
py::array vector_of_kll_sketches<T,C
|
|
286
|
+
template<typename T, typename C>
|
|
287
|
+
py::array vector_of_kll_sketches<T, C>::get_max_values() const {
|
|
292
288
|
std::vector<T> vals(d_);
|
|
293
289
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
294
|
-
vals[i] = sketches_[i].
|
|
290
|
+
vals[i] = sketches_[i].get_max_item();
|
|
295
291
|
}
|
|
296
292
|
return py::cast(vals);
|
|
297
293
|
}
|
|
@@ -299,8 +295,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
|
|
|
299
295
|
// Summary of each sketch as one long string
|
|
300
296
|
// Users should use .split('\n\n') when calling it to build a list of each
|
|
301
297
|
// sketch's summary
|
|
302
|
-
template<typename T, typename C
|
|
303
|
-
std::string vector_of_kll_sketches<T,C
|
|
298
|
+
template<typename T, typename C>
|
|
299
|
+
std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
|
|
304
300
|
std::ostringstream ss;
|
|
305
301
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
306
302
|
// all streams into 1 string, for compatibility with Python's str() behavior
|
|
@@ -311,8 +307,8 @@ std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool pri
|
|
|
311
307
|
return ss.str();
|
|
312
308
|
}
|
|
313
309
|
|
|
314
|
-
template<typename T, typename C
|
|
315
|
-
py::array vector_of_kll_sketches<T,C
|
|
310
|
+
template<typename T, typename C>
|
|
311
|
+
py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
|
|
316
312
|
std::vector<bool> vals(d_);
|
|
317
313
|
for (uint32_t i = 0; i < d_; ++i) {
|
|
318
314
|
vals[i] = sketches_[i].is_estimation_mode();
|
|
@@ -321,18 +317,17 @@ py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
|
|
|
321
317
|
}
|
|
322
318
|
|
|
323
319
|
// Value of sketch(es) corresponding to some quantile(s)
|
|
324
|
-
template<typename T, typename C
|
|
325
|
-
py::array vector_of_kll_sketches<T,C
|
|
320
|
+
template<typename T, typename C>
|
|
321
|
+
py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
|
|
326
322
|
const py::array_t<int>& isk) const {
|
|
327
323
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
328
324
|
size_t num_sketches = inds.size();
|
|
329
|
-
size_t num_quantiles =
|
|
325
|
+
size_t num_quantiles = ranks.size();
|
|
330
326
|
|
|
331
327
|
std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
|
|
332
328
|
for (uint32_t i = 0; i < num_sketches; ++i) {
|
|
333
|
-
auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
|
|
334
329
|
for (size_t j = 0; j < num_quantiles; ++j) {
|
|
335
|
-
quants[i][j] =
|
|
330
|
+
quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
|
|
336
331
|
}
|
|
337
332
|
}
|
|
338
333
|
|
|
@@ -340,8 +335,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>
|
|
|
340
335
|
}
|
|
341
336
|
|
|
342
337
|
// Value of sketch(es) corresponding to some rank(s)
|
|
343
|
-
template<typename T, typename C
|
|
344
|
-
py::array vector_of_kll_sketches<T,C
|
|
338
|
+
template<typename T, typename C>
|
|
339
|
+
py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
|
|
345
340
|
const py::array_t<int>& isk) const {
|
|
346
341
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
347
342
|
size_t num_sketches = inds.size();
|
|
@@ -359,8 +354,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
|
|
|
359
354
|
}
|
|
360
355
|
|
|
361
356
|
// PMF(s) of sketch(es)
|
|
362
|
-
template<typename T, typename C
|
|
363
|
-
py::array vector_of_kll_sketches<T,C
|
|
357
|
+
template<typename T, typename C>
|
|
358
|
+
py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
|
|
364
359
|
const py::array_t<int>& isk) const {
|
|
365
360
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
366
361
|
size_t num_sketches = inds.size();
|
|
@@ -378,8 +373,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_poi
|
|
|
378
373
|
}
|
|
379
374
|
|
|
380
375
|
// CDF(s) of sketch(es)
|
|
381
|
-
template<typename T, typename C
|
|
382
|
-
py::array vector_of_kll_sketches<T,C
|
|
376
|
+
template<typename T, typename C>
|
|
377
|
+
py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
|
|
383
378
|
const py::array_t<int>& isk) const {
|
|
384
379
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
385
380
|
size_t num_sketches = inds.size();
|
|
@@ -396,8 +391,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_poi
|
|
|
396
391
|
return py::cast(cdfs);
|
|
397
392
|
}
|
|
398
393
|
|
|
399
|
-
template<typename T, typename C
|
|
400
|
-
void vector_of_kll_sketches<T,C
|
|
394
|
+
template<typename T, typename C>
|
|
395
|
+
void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
|
|
401
396
|
uint32_t idx) {
|
|
402
397
|
if (idx >= d_) {
|
|
403
398
|
throw std::invalid_argument("request for invalid dimenions >= d ("
|
|
@@ -408,8 +403,8 @@ void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
|
|
|
408
403
|
sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
|
|
409
404
|
}
|
|
410
405
|
|
|
411
|
-
template<typename T, typename C
|
|
412
|
-
py::list vector_of_kll_sketches<T,C
|
|
406
|
+
template<typename T, typename C>
|
|
407
|
+
py::list vector_of_kll_sketches<T, C>::serialize(py::array_t<uint32_t>& isk) {
|
|
413
408
|
std::vector<uint32_t> inds = get_indices(isk);
|
|
414
409
|
const size_t num_sketches = inds.size();
|
|
415
410
|
|
|
@@ -466,9 +461,9 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
|
466
461
|
"Returns the minimum value(s) of the sketch(es)")
|
|
467
462
|
.def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
|
|
468
463
|
"Returns the maximum value(s) of the sketch(es)")
|
|
469
|
-
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("
|
|
464
|
+
.def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
|
|
470
465
|
py::arg("isk")=-1,
|
|
471
|
-
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `
|
|
466
|
+
"Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
|
472
467
|
.def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
|
|
473
468
|
py::arg("isk")=-1,
|
|
474
469
|
"Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
|
|
@@ -19,16 +19,50 @@
|
|
|
19
19
|
|
|
20
20
|
#include "var_opt_sketch.hpp"
|
|
21
21
|
#include "var_opt_union.hpp"
|
|
22
|
+
#include "py_serde.hpp"
|
|
22
23
|
|
|
23
24
|
#include <pybind11/pybind11.h>
|
|
24
|
-
#include <pybind11/functional.h>
|
|
25
|
-
#include <sstream>
|
|
26
25
|
|
|
27
26
|
namespace py = pybind11;
|
|
28
27
|
|
|
29
28
|
namespace datasketches {
|
|
29
|
+
|
|
30
30
|
namespace python {
|
|
31
31
|
|
|
32
|
+
template<typename T>
|
|
33
|
+
var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
|
|
34
|
+
std::string skStr = skBytes; // implicit cast
|
|
35
|
+
return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
template<typename T>
|
|
39
|
+
py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
|
40
|
+
auto serResult = sk.serialize(0, sd);
|
|
41
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
template<typename T>
|
|
45
|
+
size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
|
|
46
|
+
return sk.get_serialized_size_bytes(sd);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
template<typename T>
|
|
50
|
+
var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
|
|
51
|
+
std::string uStr = uBytes; // implicit cast
|
|
52
|
+
return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
template<typename T>
|
|
56
|
+
py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
|
|
57
|
+
auto serResult = u.serialize(0, sd);
|
|
58
|
+
return py::bytes((char*)serResult.data(), serResult.size());
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
template<typename T>
|
|
62
|
+
size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
|
|
63
|
+
return u.get_serialized_size_bytes(sd);
|
|
64
|
+
}
|
|
65
|
+
|
|
32
66
|
template<typename T>
|
|
33
67
|
py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
|
|
34
68
|
py::list list;
|
|
@@ -63,7 +97,6 @@ std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
|
|
|
63
97
|
// using internal str() method then casting to C++ std::string
|
|
64
98
|
py::str item_pystr(item.first);
|
|
65
99
|
std::string item_str = py::cast<std::string>(item_pystr);
|
|
66
|
-
// item.second is guaranteed to be a double
|
|
67
100
|
ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
|
|
68
101
|
}
|
|
69
102
|
return ss.str();
|
|
@@ -96,17 +129,17 @@ void bind_vo_sketch(py::module &m, const char* name) {
|
|
|
96
129
|
.def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
|
|
97
130
|
"Returns the number of samples currently in the sketch")
|
|
98
131
|
.def("get_samples", &dspy::vo_sketch_get_samples<T>,
|
|
99
|
-
"
|
|
132
|
+
"Returns the set of samples in the sketch")
|
|
100
133
|
.def("is_empty", &var_opt_sketch<T>::is_empty,
|
|
101
134
|
"Returns True if the sketch is empty, otherwise False")
|
|
102
135
|
.def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
|
|
103
136
|
"Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
|
|
104
137
|
"as upper and lower bounds on the estimate and the total weight processed by the sketch")
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
138
|
+
.def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
|
|
139
|
+
"Computes the size in bytes needed to serialize the current sketch")
|
|
140
|
+
.def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
|
|
141
|
+
.def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
|
142
|
+
"Constructs a var opt sketch from the given bytes using the provided serde")
|
|
110
143
|
;
|
|
111
144
|
}
|
|
112
145
|
|
|
@@ -126,11 +159,11 @@ void bind_vo_union(py::module &m, const char* name) {
|
|
|
126
159
|
"Returns a sketch corresponding to the union result")
|
|
127
160
|
.def("reset", &var_opt_union<T>::reset,
|
|
128
161
|
"Resets the union to the empty state")
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
162
|
+
.def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
|
|
163
|
+
"Computes the size in bytes needed to serialize the current sketch")
|
|
164
|
+
.def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
|
|
165
|
+
.def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
|
|
166
|
+
"Constructs a var opt union from the given bytes using the provided serde")
|
|
134
167
|
;
|
|
135
168
|
}
|
|
136
169
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
@@ -50,7 +50,7 @@ class reqTest(unittest.TestCase):
|
|
|
50
50
|
pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
|
|
51
51
|
cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
|
|
52
52
|
self.assertEqual(len(cdf), len(pts)+1)
|
|
53
|
-
|
|
53
|
+
|
|
54
54
|
# For relative error quantiles, the error depends on the actual rank
|
|
55
55
|
# so we need to use that to determine the bounds
|
|
56
56
|
est = req.get_rank(0.999, True)
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
18
|
import unittest
|
|
19
|
-
from datasketches import var_opt_sketch, var_opt_union
|
|
19
|
+
from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe
|
|
20
20
|
|
|
21
21
|
class VoTest(unittest.TestCase):
|
|
22
22
|
def test_vo_example(self):
|
|
@@ -97,5 +97,29 @@ class VoTest(unittest.TestCase):
|
|
|
97
97
|
# calls to __str__() with parameters.
|
|
98
98
|
print(result.to_string(True))
|
|
99
99
|
|
|
100
|
+
# finally, we can serialize the sketch by providing an
|
|
101
|
+
# appropriate serde class.
|
|
102
|
+
expected_size = result.get_serialized_size_bytes(PyIntsSerDe())
|
|
103
|
+
b = result.serialize(PyIntsSerDe())
|
|
104
|
+
self.assertEqual(expected_size, len(b))
|
|
105
|
+
|
|
106
|
+
# if we try to deserialize with the wrong serde, things break
|
|
107
|
+
try:
|
|
108
|
+
var_opt_sketch.deserialize(b, PyStringsSerDe())
|
|
109
|
+
self.fail()
|
|
110
|
+
except:
|
|
111
|
+
# expected; do nothing
|
|
112
|
+
self.assertTrue(True)
|
|
113
|
+
|
|
114
|
+
# using the correct serde gives us back a copy of the original
|
|
115
|
+
rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe())
|
|
116
|
+
self.assertEqual(result.k, rebuilt.k)
|
|
117
|
+
self.assertEqual(result.num_samples, rebuilt.num_samples)
|
|
118
|
+
self.assertEqual(result.n, rebuilt.n)
|
|
119
|
+
summary1 = result.estimate_subset_sum(geq_zero)
|
|
120
|
+
summary2 = rebuilt.estimate_subset_sum(geq_zero)
|
|
121
|
+
self.assertEqual(summary1['estimate'], summary2['estimate'])
|
|
122
|
+
self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight'])
|
|
123
|
+
|
|
100
124
|
if __name__ == '__main__':
|
|
101
125
|
unittest.main()
|