datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -36,18 +36,14 @@ namespace vector_of_kll_constants {
36
36
  }
37
37
 
38
38
  // Wrapper class for Numpy compatibility
39
- template <typename T, typename C = std::less<T>, typename S = serde<T>>
39
+ template <typename T, typename C = std::less<T>>
40
40
  class vector_of_kll_sketches {
41
41
  public:
42
- // TODO: Redundant and deprecated. Will be removed in next major version release.
43
- static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
44
- static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
45
-
46
42
  explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
47
43
  vector_of_kll_sketches(const vector_of_kll_sketches& other);
48
44
  vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
49
- vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
50
- vector_of_kll_sketches<T,C,S>& operator=(vector_of_kll_sketches&& other);
45
+ vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
46
+ vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
51
47
 
52
48
  // container parameters
53
49
  inline uint32_t get_k() const;
@@ -58,7 +54,7 @@ class vector_of_kll_sketches {
58
54
  void merge(const vector_of_kll_sketches<T>& other);
59
55
 
60
56
  // returns a single sketch combining all data in the array
61
- kll_sketch<T,C,S> collapse(const py::array_t<int>& isk) const;
57
+ kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
62
58
 
63
59
  // sketch queries returning an array of results
64
60
  py::array is_empty() const;
@@ -67,7 +63,7 @@ class vector_of_kll_sketches {
67
63
  py::array get_min_values() const;
68
64
  py::array get_max_values() const;
69
65
  py::array get_num_retained() const;
70
- py::array get_quantiles(const py::array_t<double>& fractions, const py::array_t<int>& isk) const;
66
+ py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
71
67
  py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
72
68
  py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
73
69
  py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
@@ -86,11 +82,11 @@ class vector_of_kll_sketches {
86
82
 
87
83
  const uint32_t k_; // kll sketch k parameter
88
84
  const uint32_t d_; // number of dimensions (here: sketches) to hold
89
- std::vector<kll_sketch<T,C,S>> sketches_;
85
+ std::vector<kll_sketch<T, C>> sketches_;
90
86
  };
91
87
 
92
- template<typename T, typename C, typename S>
93
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(uint32_t k, uint32_t d):
88
+ template<typename T, typename C>
89
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
94
90
  k_(k),
95
91
  d_(d)
96
92
  {
@@ -106,49 +102,49 @@ d_(d)
106
102
  }
107
103
  }
108
104
 
109
- template<typename T, typename C, typename S>
110
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
105
+ template<typename T, typename C>
106
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
111
107
  k_(other.k_),
112
108
  d_(other.d_),
113
109
  sketches_(other.sketches_)
114
110
  {}
115
111
 
116
- template<typename T, typename C, typename S>
117
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
112
+ template<typename T, typename C>
113
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
118
114
  k_(other.k_),
119
115
  d_(other.d_),
120
116
  sketches_(std::move(other.sketches_))
121
117
  {}
122
118
 
123
- template<typename T, typename C, typename S>
124
- vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(const vector_of_kll_sketches& other) {
125
- vector_of_kll_sketches<T,C,S> copy(other);
119
+ template<typename T, typename C>
120
+ vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
121
+ vector_of_kll_sketches<T, C> copy(other);
126
122
  k_ = copy.k_;
127
123
  d_ = copy.d_;
128
124
  std::swap(sketches_, copy.sketches_);
129
125
  return *this;
130
126
  }
131
127
 
132
- template<typename T, typename C, typename S>
133
- vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(vector_of_kll_sketches&& other) {
128
+ template<typename T, typename C>
129
+ vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
134
130
  k_ = other.k_;
135
131
  d_ = other.d_;
136
132
  std::swap(sketches_, other.sketches_);
137
133
  return *this;
138
134
  }
139
135
 
140
- template<typename T, typename C, typename S>
141
- uint32_t vector_of_kll_sketches<T,C,S>::get_k() const {
136
+ template<typename T, typename C>
137
+ uint32_t vector_of_kll_sketches<T, C>::get_k() const {
142
138
  return k_;
143
139
  }
144
140
 
145
- template<typename T, typename C, typename S>
146
- uint32_t vector_of_kll_sketches<T,C,S>::get_d() const {
141
+ template<typename T, typename C>
142
+ uint32_t vector_of_kll_sketches<T, C>::get_d() const {
147
143
  return d_;
148
144
  }
149
145
 
150
- template<typename T, typename C, typename S>
151
- std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array_t<int>& isk) const {
146
+ template<typename T, typename C>
147
+ std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
152
148
  std::vector<uint32_t> indices;
153
149
  if (isk.size() == 1) {
154
150
  auto data = isk.unchecked();
@@ -177,8 +173,8 @@ std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array
177
173
  }
178
174
 
179
175
  // Checks if each sketch is empty or not
180
- template<typename T, typename C, typename S>
181
- py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
176
+ template<typename T, typename C>
177
+ py::array vector_of_kll_sketches<T, C>::is_empty() const {
182
178
  std::vector<bool> vals(d_);
183
179
  for (uint32_t i = 0; i < d_; ++i) {
184
180
  vals[i] = sketches_[i].is_empty();
@@ -190,8 +186,8 @@ py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
190
186
  // Updates each sketch with values
191
187
  // Currently: all values must be present
192
188
  // TODO: allow subsets of sketches to be updated
193
- template<typename T, typename C, typename S>
194
- void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
189
+ template<typename T, typename C>
190
+ void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
195
191
 
196
192
  size_t ndim = items.ndim();
197
193
 
@@ -231,8 +227,8 @@ void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
231
227
 
232
228
  // Merges two arrays of sketches
233
229
  // Currently: all values must be present
234
- template<typename T, typename C, typename S>
235
- void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other) {
230
+ template<typename T, typename C>
231
+ void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
236
232
  if (d_ != other.get_d()) {
237
233
  throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
238
234
  + " vs " + std::to_string(other.d_));
@@ -243,11 +239,11 @@ void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other
243
239
  }
244
240
  }
245
241
 
246
- template<typename T, typename C, typename S>
247
- kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>& isk) const {
242
+ template<typename T, typename C>
243
+ kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
248
244
  std::vector<uint32_t> inds = get_indices(isk);
249
245
 
250
- kll_sketch<T,C,S> result(k_);
246
+ kll_sketch<T, C> result(k_);
251
247
  for (auto& idx : inds) {
252
248
  result.merge(sketches_[idx]);
253
249
  }
@@ -255,8 +251,8 @@ kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>
255
251
  }
256
252
 
257
253
  // Number of updates for each sketch
258
- template<typename T, typename C, typename S>
259
- py::array vector_of_kll_sketches<T,C,S>::get_n() const {
254
+ template<typename T, typename C>
255
+ py::array vector_of_kll_sketches<T, C>::get_n() const {
260
256
  std::vector<uint64_t> vals(d_);
261
257
  for (uint32_t i = 0; i < d_; ++i) {
262
258
  vals[i] = sketches_[i].get_n();
@@ -265,8 +261,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_n() const {
265
261
  }
266
262
 
267
263
  // Number of retained values for each sketch
268
- template<typename T, typename C, typename S>
269
- py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
264
+ template<typename T, typename C>
265
+ py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
270
266
  std::vector<uint32_t> vals(d_);
271
267
  for (uint32_t i = 0; i < d_; ++i) {
272
268
  vals[i] = sketches_[i].get_num_retained();
@@ -276,22 +272,22 @@ py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
276
272
 
277
273
  // Gets the minimum value of each sketch
278
274
  // TODO: allow subsets of sketches
279
- template<typename T, typename C, typename S>
280
- py::array vector_of_kll_sketches<T,C,S>::get_min_values() const {
275
+ template<typename T, typename C>
276
+ py::array vector_of_kll_sketches<T, C>::get_min_values() const {
281
277
  std::vector<T> vals(d_);
282
278
  for (uint32_t i = 0; i < d_; ++i) {
283
- vals[i] = sketches_[i].get_min_value();
279
+ vals[i] = sketches_[i].get_min_item();
284
280
  }
285
281
  return py::cast(vals);
286
282
  }
287
283
 
288
284
  // Gets the maximum value of each sketch
289
285
  // TODO: allow subsets of sketches
290
- template<typename T, typename C, typename S>
291
- py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
286
+ template<typename T, typename C>
287
+ py::array vector_of_kll_sketches<T, C>::get_max_values() const {
292
288
  std::vector<T> vals(d_);
293
289
  for (uint32_t i = 0; i < d_; ++i) {
294
- vals[i] = sketches_[i].get_max_value();
290
+ vals[i] = sketches_[i].get_max_item();
295
291
  }
296
292
  return py::cast(vals);
297
293
  }
@@ -299,8 +295,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
299
295
  // Summary of each sketch as one long string
300
296
  // Users should use .split('\n\n') when calling it to build a list of each
301
297
  // sketch's summary
302
- template<typename T, typename C, typename S>
303
- std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool print_items) const {
298
+ template<typename T, typename C>
299
+ std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
304
300
  std::ostringstream ss;
305
301
  for (uint32_t i = 0; i < d_; ++i) {
306
302
  // all streams into 1 string, for compatibility with Python's str() behavior
@@ -311,8 +307,8 @@ std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool pri
311
307
  return ss.str();
312
308
  }
313
309
 
314
- template<typename T, typename C, typename S>
315
- py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
310
+ template<typename T, typename C>
311
+ py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
316
312
  std::vector<bool> vals(d_);
317
313
  for (uint32_t i = 0; i < d_; ++i) {
318
314
  vals[i] = sketches_[i].is_estimation_mode();
@@ -321,18 +317,17 @@ py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
321
317
  }
322
318
 
323
319
  // Value of sketch(es) corresponding to some quantile(s)
324
- template<typename T, typename C, typename S>
325
- py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>& fractions,
320
+ template<typename T, typename C>
321
+ py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
326
322
  const py::array_t<int>& isk) const {
327
323
  std::vector<uint32_t> inds = get_indices(isk);
328
324
  size_t num_sketches = inds.size();
329
- size_t num_quantiles = fractions.size();
325
+ size_t num_quantiles = ranks.size();
330
326
 
331
327
  std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
332
328
  for (uint32_t i = 0; i < num_sketches; ++i) {
333
- auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
334
329
  for (size_t j = 0; j < num_quantiles; ++j) {
335
- quants[i][j] = quant[j];
330
+ quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
336
331
  }
337
332
  }
338
333
 
@@ -340,8 +335,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>
340
335
  }
341
336
 
342
337
  // Rank(s) in sketch(es) corresponding to some value(s)
343
- template<typename T, typename C, typename S>
344
- py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
338
+ template<typename T, typename C>
339
+ py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
345
340
  const py::array_t<int>& isk) const {
346
341
  std::vector<uint32_t> inds = get_indices(isk);
347
342
  size_t num_sketches = inds.size();
@@ -359,8 +354,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
359
354
  }
360
355
 
361
356
  // PMF(s) of sketch(es)
362
- template<typename T, typename C, typename S>
363
- py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_points,
357
+ template<typename T, typename C>
358
+ py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
364
359
  const py::array_t<int>& isk) const {
365
360
  std::vector<uint32_t> inds = get_indices(isk);
366
361
  size_t num_sketches = inds.size();
@@ -378,8 +373,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_poi
378
373
  }
379
374
 
380
375
  // CDF(s) of sketch(es)
381
- template<typename T, typename C, typename S>
382
- py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_points,
376
+ template<typename T, typename C>
377
+ py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
383
378
  const py::array_t<int>& isk) const {
384
379
  std::vector<uint32_t> inds = get_indices(isk);
385
380
  size_t num_sketches = inds.size();
@@ -396,8 +391,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_poi
396
391
  return py::cast(cdfs);
397
392
  }
398
393
 
399
- template<typename T, typename C, typename S>
400
- void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
394
+ template<typename T, typename C>
395
+ void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
401
396
  uint32_t idx) {
402
397
  if (idx >= d_) {
403
398
  throw std::invalid_argument("request for invalid dimenions >= d ("
@@ -408,8 +403,8 @@ void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
408
403
  sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
409
404
  }
410
405
 
411
- template<typename T, typename C, typename S>
412
- py::list vector_of_kll_sketches<T,C,S>::serialize(py::array_t<uint32_t>& isk) {
406
+ template<typename T, typename C>
407
+ py::list vector_of_kll_sketches<T, C>::serialize(py::array_t<uint32_t>& isk) {
413
408
  std::vector<uint32_t> inds = get_indices(isk);
414
409
  const size_t num_sketches = inds.size();
415
410
 
@@ -466,9 +461,9 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
466
461
  "Returns the minimum value(s) of the sketch(es)")
467
462
  .def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
468
463
  "Returns the maximum value(s) of the sketch(es)")
469
- .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("fractions"),
464
+ .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
470
465
  py::arg("isk")=-1,
471
- "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `fractions` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
466
+ "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
472
467
  .def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
473
468
  py::arg("isk")=-1,
474
469
  "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
@@ -19,16 +19,50 @@
19
19
 
20
20
  #include "var_opt_sketch.hpp"
21
21
  #include "var_opt_union.hpp"
22
+ #include "py_serde.hpp"
22
23
 
23
24
  #include <pybind11/pybind11.h>
24
- #include <pybind11/functional.h>
25
- #include <sstream>
26
25
 
27
26
  namespace py = pybind11;
28
27
 
29
28
  namespace datasketches {
29
+
30
30
  namespace python {
31
31
 
32
+ template<typename T>
33
+ var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
34
+ std::string skStr = skBytes; // implicit cast
35
+ return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
36
+ }
37
+
38
+ template<typename T>
39
+ py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
40
+ auto serResult = sk.serialize(0, sd);
41
+ return py::bytes((char*)serResult.data(), serResult.size());
42
+ }
43
+
44
+ template<typename T>
45
+ size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
46
+ return sk.get_serialized_size_bytes(sd);
47
+ }
48
+
49
+ template<typename T>
50
+ var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
51
+ std::string uStr = uBytes; // implicit cast
52
+ return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
53
+ }
54
+
55
+ template<typename T>
56
+ py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
57
+ auto serResult = u.serialize(0, sd);
58
+ return py::bytes((char*)serResult.data(), serResult.size());
59
+ }
60
+
61
+ template<typename T>
62
+ size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
63
+ return u.get_serialized_size_bytes(sd);
64
+ }
65
+
32
66
  template<typename T>
33
67
  py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
34
68
  py::list list;
@@ -63,7 +97,6 @@ std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
63
97
  // using internal str() method then casting to C++ std::string
64
98
  py::str item_pystr(item.first);
65
99
  std::string item_str = py::cast<std::string>(item_pystr);
66
- // item.second is guaranteed to be a double
67
100
  ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
68
101
  }
69
102
  return ss.str();
@@ -96,17 +129,17 @@ void bind_vo_sketch(py::module &m, const char* name) {
96
129
  .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
97
130
  "Returns the number of samples currently in the sketch")
98
131
  .def("get_samples", &dspy::vo_sketch_get_samples<T>,
99
- "Retyrns the set of samples in the sketch")
132
+ "Returns the set of samples in the sketch")
100
133
  .def("is_empty", &var_opt_sketch<T>::is_empty,
101
134
  "Returns True if the sketch is empty, otherwise False")
102
135
  .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
103
136
  "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
104
137
  "as upper and lower bounds on the estimate and the total weight processed by the sketch")
105
- // As of writing, not yet clear how to serialize arbitrary python objects,
106
- // especially in any sort of language-portable way
107
- //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
108
- //.def("serialize", &dspy::vo_sketch_serialize<T>)
109
- //.def_static("deserialize", &dspy::vo_sketch_deserialize<T>)
138
+ .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
139
+ "Computes the size in bytes needed to serialize the current sketch")
140
+ .def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
141
+ .def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
142
+ "Constructs a var opt sketch from the given bytes using the provided serde")
110
143
  ;
111
144
  }
112
145
 
@@ -126,11 +159,11 @@ void bind_vo_union(py::module &m, const char* name) {
126
159
  "Returns a sketch corresponding to the union result")
127
160
  .def("reset", &var_opt_union<T>::reset,
128
161
  "Resets the union to the empty state")
129
- // As of writing, not yet clear how to serialize arbitrary python objects,
130
- // especially in any sort of language-portable way
131
- //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
132
- //.def("serialize", &dspy::vo_union_serialize<T>)
133
- //.def_static("deserialize", &dspy::vo_union_deserialize<T>)
162
+ .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
163
+ "Computes the size in bytes needed to serialize the current sketch")
164
+ .def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
165
+ .def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
166
+ "Constructs a var opt union from the given bytes using the provided serde")
134
167
  ;
135
168
  }
136
169
 
@@ -0,0 +1,16 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
@@ -50,7 +50,7 @@ class reqTest(unittest.TestCase):
50
50
  pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
51
51
  cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
52
52
  self.assertEqual(len(cdf), len(pts)+1)
53
-
53
+
54
54
  # For relative error quantiles, the error depends on the actual rank
55
55
  # so we need to use that to determine the bounds
56
56
  est = req.get_rank(0.999, True)
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import var_opt_sketch, var_opt_union
19
+ from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe
20
20
 
21
21
  class VoTest(unittest.TestCase):
22
22
  def test_vo_example(self):
@@ -97,5 +97,29 @@ class VoTest(unittest.TestCase):
97
97
  # calls to __str__() with parameters.
98
98
  print(result.to_string(True))
99
99
 
100
+ # finally, we can serialize the sketch by providing an
101
+ # appropriate serde class.
102
+ expected_size = result.get_serialized_size_bytes(PyIntsSerDe())
103
+ b = result.serialize(PyIntsSerDe())
104
+ self.assertEqual(expected_size, len(b))
105
+
106
+ # if we try to deserialize with the wrong serde, things break
107
+ try:
108
+ var_opt_sketch.deserialize(b, PyStringsSerDe())
109
+ self.fail()
110
+ except:
111
+ # expected; do nothing
112
+ self.assertTrue(True)
113
+
114
+ # using the correct serde gives us back a copy of the original
115
+ rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe())
116
+ self.assertEqual(result.k, rebuilt.k)
117
+ self.assertEqual(result.num_samples, rebuilt.num_samples)
118
+ self.assertEqual(result.n, rebuilt.n)
119
+ summary1 = result.estimate_subset_sum(geq_zero)
120
+ summary2 = rebuilt.estimate_subset_sum(geq_zero)
121
+ self.assertEqual(summary1['estimate'], summary2['estimate'])
122
+ self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight'])
123
+
100
124
  if __name__ == '__main__':
101
125
  unittest.main()