datasketches 0.2.7 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/NOTICE +1 -1
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  10. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  11. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  12. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  13. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  15. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  16. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  17. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -1
  19. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  22. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  23. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  24. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  27. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  28. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  29. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  30. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  31. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  32. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  34. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  36. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  37. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  38. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  39. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  40. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  41. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  42. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  43. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  44. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  45. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  46. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  47. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  48. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  49. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +63 -68
  50. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  51. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  52. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  53. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  54. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  55. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  56. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  57. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  58. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  59. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  60. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  61. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  62. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  63. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  64. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  65. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  68. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  69. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  70. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  72. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  73. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  74. data/vendor/datasketches-cpp/setup.py +14 -2
  75. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  76. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  77. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  78. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  79. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  80. data/vendor/datasketches-cpp/tox.ini +26 -0
  81. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  82. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  83. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  84. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  85. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  86. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  87. metadata +15 -6
  88. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -36,18 +36,14 @@ namespace vector_of_kll_constants {
36
36
  }
37
37
 
38
38
  // Wrapper class for Numpy compatibility
39
- template <typename T, typename C = std::less<T>, typename S = serde<T>>
39
+ template <typename T, typename C = std::less<T>>
40
40
  class vector_of_kll_sketches {
41
41
  public:
42
- // TODO: Redundant and deprecated. Will be removed in next major version release.
43
- static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
44
- static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
45
-
46
42
  explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
47
43
  vector_of_kll_sketches(const vector_of_kll_sketches& other);
48
44
  vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
49
- vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
50
- vector_of_kll_sketches<T,C,S>& operator=(vector_of_kll_sketches&& other);
45
+ vector_of_kll_sketches<T, C>& operator=(const vector_of_kll_sketches& other);
46
+ vector_of_kll_sketches<T, C>& operator=(vector_of_kll_sketches&& other);
51
47
 
52
48
  // container parameters
53
49
  inline uint32_t get_k() const;
@@ -58,7 +54,7 @@ class vector_of_kll_sketches {
58
54
  void merge(const vector_of_kll_sketches<T>& other);
59
55
 
60
56
  // returns a single sketch combining all data in the array
61
- kll_sketch<T,C,S> collapse(const py::array_t<int>& isk) const;
57
+ kll_sketch<T, C> collapse(const py::array_t<int>& isk) const;
62
58
 
63
59
  // sketch queries returning an array of results
64
60
  py::array is_empty() const;
@@ -67,7 +63,7 @@ class vector_of_kll_sketches {
67
63
  py::array get_min_values() const;
68
64
  py::array get_max_values() const;
69
65
  py::array get_num_retained() const;
70
- py::array get_quantiles(const py::array_t<double>& fractions, const py::array_t<int>& isk) const;
66
+ py::array get_quantiles(const py::array_t<double>& ranks, const py::array_t<int>& isk) const;
71
67
  py::array get_ranks(const py::array_t<T>& values, const py::array_t<int>& isk) const;
72
68
  py::array get_pmf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
73
69
  py::array get_cdf(const py::array_t<T>& split_points, const py::array_t<int>& isk) const;
@@ -76,7 +72,7 @@ class vector_of_kll_sketches {
76
72
  std::string to_string(bool print_levels = false, bool print_items = false) const;
77
73
 
78
74
  // binary output/input
79
- py::list serialize(py::array_t<uint32_t>& isk);
75
+ py::list serialize(const py::array_t<int>& isk);
80
76
  // note: deserialize() replaces the sketch at the specified
81
77
  // index. Not a static method.
82
78
  void deserialize(const py::bytes& sk_bytes, uint32_t idx);
@@ -86,11 +82,11 @@ class vector_of_kll_sketches {
86
82
 
87
83
  const uint32_t k_; // kll sketch k parameter
88
84
  const uint32_t d_; // number of dimensions (here: sketches) to hold
89
- std::vector<kll_sketch<T,C,S>> sketches_;
85
+ std::vector<kll_sketch<T, C>> sketches_;
90
86
  };
91
87
 
92
- template<typename T, typename C, typename S>
93
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(uint32_t k, uint32_t d):
88
+ template<typename T, typename C>
89
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(uint32_t k, uint32_t d):
94
90
  k_(k),
95
91
  d_(d)
96
92
  {
@@ -106,49 +102,49 @@ d_(d)
106
102
  }
107
103
  }
108
104
 
109
- template<typename T, typename C, typename S>
110
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
105
+ template<typename T, typename C>
106
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(const vector_of_kll_sketches& other) :
111
107
  k_(other.k_),
112
108
  d_(other.d_),
113
109
  sketches_(other.sketches_)
114
110
  {}
115
111
 
116
- template<typename T, typename C, typename S>
117
- vector_of_kll_sketches<T,C,S>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
112
+ template<typename T, typename C>
113
+ vector_of_kll_sketches<T, C>::vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept :
118
114
  k_(other.k_),
119
115
  d_(other.d_),
120
116
  sketches_(std::move(other.sketches_))
121
117
  {}
122
118
 
123
- template<typename T, typename C, typename S>
124
- vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(const vector_of_kll_sketches& other) {
125
- vector_of_kll_sketches<T,C,S> copy(other);
119
+ template<typename T, typename C>
120
+ vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(const vector_of_kll_sketches& other) {
121
+ vector_of_kll_sketches<T, C> copy(other);
126
122
  k_ = copy.k_;
127
123
  d_ = copy.d_;
128
124
  std::swap(sketches_, copy.sketches_);
129
125
  return *this;
130
126
  }
131
127
 
132
- template<typename T, typename C, typename S>
133
- vector_of_kll_sketches<T,C,S>& vector_of_kll_sketches<T,C,S>::operator=(vector_of_kll_sketches&& other) {
128
+ template<typename T, typename C>
129
+ vector_of_kll_sketches<T, C>& vector_of_kll_sketches<T, C>::operator=(vector_of_kll_sketches&& other) {
134
130
  k_ = other.k_;
135
131
  d_ = other.d_;
136
132
  std::swap(sketches_, other.sketches_);
137
133
  return *this;
138
134
  }
139
135
 
140
- template<typename T, typename C, typename S>
141
- uint32_t vector_of_kll_sketches<T,C,S>::get_k() const {
136
+ template<typename T, typename C>
137
+ uint32_t vector_of_kll_sketches<T, C>::get_k() const {
142
138
  return k_;
143
139
  }
144
140
 
145
- template<typename T, typename C, typename S>
146
- uint32_t vector_of_kll_sketches<T,C,S>::get_d() const {
141
+ template<typename T, typename C>
142
+ uint32_t vector_of_kll_sketches<T, C>::get_d() const {
147
143
  return d_;
148
144
  }
149
145
 
150
- template<typename T, typename C, typename S>
151
- std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array_t<int>& isk) const {
146
+ template<typename T, typename C>
147
+ std::vector<uint32_t> vector_of_kll_sketches<T, C>::get_indices(const py::array_t<int>& isk) const {
152
148
  std::vector<uint32_t> indices;
153
149
  if (isk.size() == 1) {
154
150
  auto data = isk.unchecked();
@@ -177,8 +173,8 @@ std::vector<uint32_t> vector_of_kll_sketches<T,C,S>::get_indices(const py::array
177
173
  }
178
174
 
179
175
  // Checks if each sketch is empty or not
180
- template<typename T, typename C, typename S>
181
- py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
176
+ template<typename T, typename C>
177
+ py::array vector_of_kll_sketches<T, C>::is_empty() const {
182
178
  std::vector<bool> vals(d_);
183
179
  for (uint32_t i = 0; i < d_; ++i) {
184
180
  vals[i] = sketches_[i].is_empty();
@@ -190,8 +186,8 @@ py::array vector_of_kll_sketches<T,C,S>::is_empty() const {
190
186
  // Updates each sketch with values
191
187
  // Currently: all values must be present
192
188
  // TODO: allow subsets of sketches to be updated
193
- template<typename T, typename C, typename S>
194
- void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
189
+ template<typename T, typename C>
190
+ void vector_of_kll_sketches<T, C>::update(const py::array_t<T>& items) {
195
191
 
196
192
  size_t ndim = items.ndim();
197
193
 
@@ -231,8 +227,8 @@ void vector_of_kll_sketches<T,C,S>::update(const py::array_t<T>& items) {
231
227
 
232
228
  // Merges two arrays of sketches
233
229
  // Currently: all values must be present
234
- template<typename T, typename C, typename S>
235
- void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other) {
230
+ template<typename T, typename C>
231
+ void vector_of_kll_sketches<T, C>::merge(const vector_of_kll_sketches<T>& other) {
236
232
  if (d_ != other.get_d()) {
237
233
  throw std::invalid_argument("Must have same number of dimensions to merge: " + std::to_string(d_)
238
234
  + " vs " + std::to_string(other.d_));
@@ -243,11 +239,11 @@ void vector_of_kll_sketches<T,C,S>::merge(const vector_of_kll_sketches<T>& other
243
239
  }
244
240
  }
245
241
 
246
- template<typename T, typename C, typename S>
247
- kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>& isk) const {
242
+ template<typename T, typename C>
243
+ kll_sketch<T, C> vector_of_kll_sketches<T, C>::collapse(const py::array_t<int>& isk) const {
248
244
  std::vector<uint32_t> inds = get_indices(isk);
249
245
 
250
- kll_sketch<T,C,S> result(k_);
246
+ kll_sketch<T, C> result(k_);
251
247
  for (auto& idx : inds) {
252
248
  result.merge(sketches_[idx]);
253
249
  }
@@ -255,8 +251,8 @@ kll_sketch<T,C,S> vector_of_kll_sketches<T,C,S>::collapse(const py::array_t<int>
255
251
  }
256
252
 
257
253
  // Number of updates for each sketch
258
- template<typename T, typename C, typename S>
259
- py::array vector_of_kll_sketches<T,C,S>::get_n() const {
254
+ template<typename T, typename C>
255
+ py::array vector_of_kll_sketches<T, C>::get_n() const {
260
256
  std::vector<uint64_t> vals(d_);
261
257
  for (uint32_t i = 0; i < d_; ++i) {
262
258
  vals[i] = sketches_[i].get_n();
@@ -265,8 +261,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_n() const {
265
261
  }
266
262
 
267
263
  // Number of retained values for each sketch
268
- template<typename T, typename C, typename S>
269
- py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
264
+ template<typename T, typename C>
265
+ py::array vector_of_kll_sketches<T, C>::get_num_retained() const {
270
266
  std::vector<uint32_t> vals(d_);
271
267
  for (uint32_t i = 0; i < d_; ++i) {
272
268
  vals[i] = sketches_[i].get_num_retained();
@@ -276,22 +272,22 @@ py::array vector_of_kll_sketches<T,C,S>::get_num_retained() const {
276
272
 
277
273
  // Gets the minimum value of each sketch
278
274
  // TODO: allow subsets of sketches
279
- template<typename T, typename C, typename S>
280
- py::array vector_of_kll_sketches<T,C,S>::get_min_values() const {
275
+ template<typename T, typename C>
276
+ py::array vector_of_kll_sketches<T, C>::get_min_values() const {
281
277
  std::vector<T> vals(d_);
282
278
  for (uint32_t i = 0; i < d_; ++i) {
283
- vals[i] = sketches_[i].get_min_value();
279
+ vals[i] = sketches_[i].get_min_item();
284
280
  }
285
281
  return py::cast(vals);
286
282
  }
287
283
 
288
284
  // Gets the maximum value of each sketch
289
285
  // TODO: allow subsets of sketches
290
- template<typename T, typename C, typename S>
291
- py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
286
+ template<typename T, typename C>
287
+ py::array vector_of_kll_sketches<T, C>::get_max_values() const {
292
288
  std::vector<T> vals(d_);
293
289
  for (uint32_t i = 0; i < d_; ++i) {
294
- vals[i] = sketches_[i].get_max_value();
290
+ vals[i] = sketches_[i].get_max_item();
295
291
  }
296
292
  return py::cast(vals);
297
293
  }
@@ -299,8 +295,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_max_values() const {
299
295
  // Summary of each sketch as one long string
300
296
  // Users should use .split('\n\n') when calling it to build a list of each
301
297
  // sketch's summary
302
- template<typename T, typename C, typename S>
303
- std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool print_items) const {
298
+ template<typename T, typename C>
299
+ std::string vector_of_kll_sketches<T, C>::to_string(bool print_levels, bool print_items) const {
304
300
  std::ostringstream ss;
305
301
  for (uint32_t i = 0; i < d_; ++i) {
306
302
  // all streams into 1 string, for compatibility with Python's str() behavior
@@ -311,8 +307,8 @@ std::string vector_of_kll_sketches<T,C,S>::to_string(bool print_levels, bool pri
311
307
  return ss.str();
312
308
  }
313
309
 
314
- template<typename T, typename C, typename S>
315
- py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
310
+ template<typename T, typename C>
311
+ py::array vector_of_kll_sketches<T, C>::is_estimation_mode() const {
316
312
  std::vector<bool> vals(d_);
317
313
  for (uint32_t i = 0; i < d_; ++i) {
318
314
  vals[i] = sketches_[i].is_estimation_mode();
@@ -321,18 +317,17 @@ py::array vector_of_kll_sketches<T,C,S>::is_estimation_mode() const {
321
317
  }
322
318
 
323
319
  // Value of sketch(es) corresponding to some quantile(s)
324
- template<typename T, typename C, typename S>
325
- py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>& fractions,
320
+ template<typename T, typename C>
321
+ py::array vector_of_kll_sketches<T, C>::get_quantiles(const py::array_t<double>& ranks,
326
322
  const py::array_t<int>& isk) const {
327
323
  std::vector<uint32_t> inds = get_indices(isk);
328
324
  size_t num_sketches = inds.size();
329
- size_t num_quantiles = fractions.size();
325
+ size_t num_quantiles = ranks.size();
330
326
 
331
327
  std::vector<std::vector<T>> quants(num_sketches, std::vector<T>(num_quantiles));
332
328
  for (uint32_t i = 0; i < num_sketches; ++i) {
333
- auto quant = sketches_[inds[i]].get_quantiles(fractions.data(), num_quantiles);
334
329
  for (size_t j = 0; j < num_quantiles; ++j) {
335
- quants[i][j] = quant[j];
330
+ quants[i][j] = sketches_[inds[i]].get_quantile(ranks.data()[j]);
336
331
  }
337
332
  }
338
333
 
@@ -340,8 +335,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_quantiles(const py::array_t<double>
340
335
  }
341
336
 
342
337
  // Value of sketch(es) corresponding to some rank(s)
343
- template<typename T, typename C, typename S>
344
- py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
338
+ template<typename T, typename C>
339
+ py::array vector_of_kll_sketches<T, C>::get_ranks(const py::array_t<T>& values,
345
340
  const py::array_t<int>& isk) const {
346
341
  std::vector<uint32_t> inds = get_indices(isk);
347
342
  size_t num_sketches = inds.size();
@@ -359,8 +354,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_ranks(const py::array_t<T>& values,
359
354
  }
360
355
 
361
356
  // PMF(s) of sketch(es)
362
- template<typename T, typename C, typename S>
363
- py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_points,
357
+ template<typename T, typename C>
358
+ py::array vector_of_kll_sketches<T, C>::get_pmf(const py::array_t<T>& split_points,
364
359
  const py::array_t<int>& isk) const {
365
360
  std::vector<uint32_t> inds = get_indices(isk);
366
361
  size_t num_sketches = inds.size();
@@ -378,8 +373,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_pmf(const py::array_t<T>& split_poi
378
373
  }
379
374
 
380
375
  // CDF(s) of sketch(es)
381
- template<typename T, typename C, typename S>
382
- py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_points,
376
+ template<typename T, typename C>
377
+ py::array vector_of_kll_sketches<T, C>::get_cdf(const py::array_t<T>& split_points,
383
378
  const py::array_t<int>& isk) const {
384
379
  std::vector<uint32_t> inds = get_indices(isk);
385
380
  size_t num_sketches = inds.size();
@@ -396,8 +391,8 @@ py::array vector_of_kll_sketches<T,C,S>::get_cdf(const py::array_t<T>& split_poi
396
391
  return py::cast(cdfs);
397
392
  }
398
393
 
399
- template<typename T, typename C, typename S>
400
- void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
394
+ template<typename T, typename C>
395
+ void vector_of_kll_sketches<T, C>::deserialize(const py::bytes& sk_bytes,
401
396
  uint32_t idx) {
402
397
  if (idx >= d_) {
403
398
  throw std::invalid_argument("request for invalid dimenions >= d ("
@@ -408,8 +403,8 @@ void vector_of_kll_sketches<T,C,S>::deserialize(const py::bytes& sk_bytes,
408
403
  sketches_[idx] = std::move(kll_sketch<T>::deserialize(skStr.c_str(), skStr.length()));
409
404
  }
410
405
 
411
- template<typename T, typename C, typename S>
412
- py::list vector_of_kll_sketches<T,C,S>::serialize(py::array_t<uint32_t>& isk) {
406
+ template<typename T, typename C>
407
+ py::list vector_of_kll_sketches<T, C>::serialize(const py::array_t<int>& isk) {
413
408
  std::vector<uint32_t> inds = get_indices(isk);
414
409
  const size_t num_sketches = inds.size();
415
410
 
@@ -466,9 +461,9 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
466
461
  "Returns the minimum value(s) of the sketch(es)")
467
462
  .def("get_max_values", &vector_of_kll_sketches<T>::get_max_values,
468
463
  "Returns the maximum value(s) of the sketch(es)")
469
- .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("fractions"),
464
+ .def("get_quantiles", &vector_of_kll_sketches<T>::get_quantiles, py::arg("ranks"),
470
465
  py::arg("isk")=-1,
471
- "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `fractions` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
466
+ "Returns the value(s) associated with the specified quantile(s) for the specified sketch(es). `ranks` can be a float between 0 and 1 (inclusive), or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
472
467
  .def("get_ranks", &vector_of_kll_sketches<T>::get_ranks, py::arg("values"),
473
468
  py::arg("isk")=-1,
474
469
  "Returns the value(s) associated with the specified ranks(s) for the specified sketch(es). `values` can be an int between 0 and the number of values retained, or a list/array of values. `isk` specifies which sketch(es) to return the value(s) for (default: all sketches)")
@@ -19,16 +19,50 @@
19
19
 
20
20
  #include "var_opt_sketch.hpp"
21
21
  #include "var_opt_union.hpp"
22
+ #include "py_serde.hpp"
22
23
 
23
24
  #include <pybind11/pybind11.h>
24
- #include <pybind11/functional.h>
25
- #include <sstream>
26
25
 
27
26
  namespace py = pybind11;
28
27
 
29
28
  namespace datasketches {
29
+
30
30
  namespace python {
31
31
 
32
+ template<typename T>
33
+ var_opt_sketch<T> vo_sketch_deserialize(py::bytes& skBytes, py_object_serde& sd) {
34
+ std::string skStr = skBytes; // implicit cast
35
+ return var_opt_sketch<T>::deserialize(skStr.c_str(), skStr.length(), sd);
36
+ }
37
+
38
+ template<typename T>
39
+ py::object vo_sketch_serialize(const var_opt_sketch<T>& sk, py_object_serde& sd) {
40
+ auto serResult = sk.serialize(0, sd);
41
+ return py::bytes((char*)serResult.data(), serResult.size());
42
+ }
43
+
44
+ template<typename T>
45
+ size_t vo_sketch_size_bytes(const var_opt_sketch<T>& sk, py_object_serde& sd) {
46
+ return sk.get_serialized_size_bytes(sd);
47
+ }
48
+
49
+ template<typename T>
50
+ var_opt_union<T> vo_union_deserialize(py::bytes& uBytes, py_object_serde& sd) {
51
+ std::string uStr = uBytes; // implicit cast
52
+ return var_opt_union<T>::deserialize(uStr.c_str(), uStr.length(), sd);
53
+ }
54
+
55
+ template<typename T>
56
+ py::object vo_union_serialize(const var_opt_union<T>& u, py_object_serde& sd) {
57
+ auto serResult = u.serialize(0, sd);
58
+ return py::bytes((char*)serResult.data(), serResult.size());
59
+ }
60
+
61
+ template<typename T>
62
+ size_t vo_union_size_bytes(const var_opt_union<T>& u, py_object_serde& sd) {
63
+ return u.get_serialized_size_bytes(sd);
64
+ }
65
+
32
66
  template<typename T>
33
67
  py::list vo_sketch_get_samples(const var_opt_sketch<T>& sk) {
34
68
  py::list list;
@@ -63,7 +97,6 @@ std::string vo_sketch_to_string(const var_opt_sketch<T>& sk, bool print_items) {
63
97
  // using internal str() method then casting to C++ std::string
64
98
  py::str item_pystr(item.first);
65
99
  std::string item_str = py::cast<std::string>(item_pystr);
66
- // item.second is guaranteed to be a double
67
100
  ss << i++ << ": " << item_str << "\twt = " << item.second << std::endl;
68
101
  }
69
102
  return ss.str();
@@ -96,17 +129,17 @@ void bind_vo_sketch(py::module &m, const char* name) {
96
129
  .def_property_readonly("num_samples", &var_opt_sketch<T>::get_num_samples,
97
130
  "Returns the number of samples currently in the sketch")
98
131
  .def("get_samples", &dspy::vo_sketch_get_samples<T>,
99
- "Retyrns the set of samples in the sketch")
132
+ "Returns the set of samples in the sketch")
100
133
  .def("is_empty", &var_opt_sketch<T>::is_empty,
101
134
  "Returns True if the sketch is empty, otherwise False")
102
135
  .def("estimate_subset_sum", &dspy::vo_sketch_estimate_subset_sum<T>,
103
136
  "Applies a provided predicate to the sketch and returns the estimated total weight matching the predicate, as well "
104
137
  "as upper and lower bounds on the estimate and the total weight processed by the sketch")
105
- // As of writing, not yet clear how to serialize arbitrary python objects,
106
- // especially in any sort of language-portable way
107
- //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
108
- //.def("serialize", &dspy::vo_sketch_serialize<T>)
109
- //.def_static("deserialize", &dspy::vo_sketch_deserialize<T>)
138
+ .def("get_serialized_size_bytes", &dspy::vo_sketch_size_bytes<T>, py::arg("serde"),
139
+ "Computes the size in bytes needed to serialize the current sketch")
140
+ .def("serialize", &dspy::vo_sketch_serialize<T>, py::arg("serde"), "Serialize the var opt sketch using the provided serde")
141
+ .def_static("deserialize", &dspy::vo_sketch_deserialize<T>, py::arg("bytes"), py::arg("serde"),
142
+ "Constructs a var opt sketch from the given bytes using the provided serde")
110
143
  ;
111
144
  }
112
145
 
@@ -126,11 +159,11 @@ void bind_vo_union(py::module &m, const char* name) {
126
159
  "Returns a sketch corresponding to the union result")
127
160
  .def("reset", &var_opt_union<T>::reset,
128
161
  "Resets the union to the empty state")
129
- // As of writing, not yet clear how to serialize arbitrary python objects,
130
- // especially in any sort of language-portable way
131
- //.def("get_serialized_size_bytes", &var_opt_sketch<T>::get_serialized_size_bytes)
132
- //.def("serialize", &dspy::vo_union_serialize<T>)
133
- //.def_static("deserialize", &dspy::vo_union_deserialize<T>)
162
+ .def("get_serialized_size_bytes", &dspy::vo_union_size_bytes<T>, py::arg("serde"),
163
+ "Computes the size in bytes needed to serialize the current sketch")
164
+ .def("serialize", &dspy::vo_union_serialize<T>, py::arg("serde"), "Serialize the var opt union using the provided serde")
165
+ .def_static("deserialize", &dspy::vo_union_deserialize<T>, py::arg("bytes"), py::arg("serde"),
166
+ "Constructs a var opt union from the given bytes using the provided serde")
134
167
  ;
135
168
  }
136
169
 
@@ -0,0 +1,16 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
@@ -50,7 +50,7 @@ class reqTest(unittest.TestCase):
50
50
  pts = req.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
51
51
  cdf = req.get_cdf(pts) # include 1.0 at end to account for all probability mass
52
52
  self.assertEqual(len(cdf), len(pts)+1)
53
-
53
+
54
54
  # For relative error quantiles, the error depends on the actual rank
55
55
  # so we need to use that to determine the bounds
56
56
  est = req.get_rank(0.999, True)
@@ -16,7 +16,7 @@
16
16
  # under the License.
17
17
 
18
18
  import unittest
19
- from datasketches import var_opt_sketch, var_opt_union
19
+ from datasketches import var_opt_sketch, var_opt_union, PyIntsSerDe, PyStringsSerDe
20
20
 
21
21
  class VoTest(unittest.TestCase):
22
22
  def test_vo_example(self):
@@ -97,5 +97,29 @@ class VoTest(unittest.TestCase):
97
97
  # calls to __str__() with parameters.
98
98
  print(result.to_string(True))
99
99
 
100
+ # finally, we can serialize the sketch by providing an
101
+ # appropriate serde class.
102
+ expected_size = result.get_serialized_size_bytes(PyIntsSerDe())
103
+ b = result.serialize(PyIntsSerDe())
104
+ self.assertEqual(expected_size, len(b))
105
+
106
+ # if we try to deserialize with the wrong serde, things break
107
+ try:
108
+ var_opt_sketch.deserialize(b, PyStringsSerDe())
109
+ self.fail()
110
+ except:
111
+ # expected; do nothing
112
+ self.assertTrue(True)
113
+
114
+ # using the correct serde gives us back a copy of the original
115
+ rebuilt = var_opt_sketch.deserialize(b, PyIntsSerDe())
116
+ self.assertEqual(result.k, rebuilt.k)
117
+ self.assertEqual(result.num_samples, rebuilt.num_samples)
118
+ self.assertEqual(result.n, rebuilt.n)
119
+ summary1 = result.estimate_subset_sum(geq_zero)
120
+ summary2 = rebuilt.estimate_subset_sum(geq_zero)
121
+ self.assertEqual(summary1['estimate'], summary2['estimate'])
122
+ self.assertEqual(summary1['total_sketch_weight'], summary2['total_sketch_weight'])
123
+
100
124
  if __name__ == '__main__':
101
125
  unittest.main()