datasketches 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -25,6 +25,7 @@
25
25
  #include <cmath>
26
26
  #include <random>
27
27
  #include <algorithm>
28
+ #include <stdexcept>
28
29
 
29
30
  #include "var_opt_sketch.hpp"
30
31
  #include "serde.hpp"
@@ -311,8 +312,8 @@ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other)
311
312
 
312
313
  // implementation for fixed-size arithmetic types (integral and floating point)
313
314
  template<typename T, typename S, typename A>
314
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
315
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
315
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
316
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe&) const {
316
317
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
317
318
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
318
319
  num_bytes += h_ * sizeof(double); // weights
@@ -325,8 +326,8 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
325
326
 
326
327
  // implementation for all other types
327
328
  template<typename T, typename S, typename A>
328
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
329
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
329
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
330
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
330
331
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
331
332
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
332
333
  num_bytes += h_ * sizeof(double); // weights
@@ -335,13 +336,14 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
335
336
  }
336
337
  // must iterate over the items
337
338
  for (auto it: *this)
338
- num_bytes += S().size_of_item(it.first);
339
+ num_bytes += sd.size_of_item(it.first);
339
340
  return num_bytes;
340
341
  }
341
342
 
342
343
  template<typename T, typename S, typename A>
343
- std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
344
- const size_t size = header_size_bytes + get_serialized_size_bytes();
344
+ template<typename SerDe>
345
+ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
346
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
345
347
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, allocator_);
346
348
  uint8_t* ptr = bytes.data() + header_size_bytes;
347
349
  uint8_t* end_ptr = ptr + size;
@@ -400,8 +402,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
400
402
  }
401
403
 
402
404
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
403
- ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
404
- ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
+ ptr += sd.serialize(ptr, end_ptr - ptr, data_, h_);
406
+ ptr += sd.serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
407
  }
406
408
 
407
409
  size_t bytes_written = ptr - bytes.data();
@@ -413,7 +415,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
413
415
  }
414
416
 
415
417
  template<typename T, typename S, typename A>
416
- void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
418
+ template<typename SerDe>
419
+ void var_opt_sketch<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
417
420
  const bool empty = (h_ == 0) && (r_ == 0);
418
421
 
419
422
  const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
@@ -469,13 +472,19 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
469
472
  }
470
473
 
471
474
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
472
- S().serialize(os, data_, h_);
473
- S().serialize(os, &data_[h_ + 1], r_);
475
+ sd.serialize(os, data_, h_);
476
+ sd.serialize(os, &data_[h_ + 1], r_);
474
477
  }
475
478
  }
476
479
 
477
480
  template<typename T, typename S, typename A>
478
481
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
482
+ return deserialize(bytes, size, S(), allocator);
483
+ }
484
+
485
+ template<typename T, typename S, typename A>
486
+ template<typename SerDe>
487
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
479
488
  ensure_minimum_memory(size, 8);
480
489
  const char* ptr = static_cast<const char*>(bytes);
481
490
  const char* base = ptr;
@@ -559,10 +568,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
559
568
  items_deleter deleter(array_size, allocator);
560
569
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
561
570
 
562
- ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
571
+ ptr += sd.deserialize(ptr, end_ptr - ptr, items.get(), h);
563
572
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
564
573
 
565
- ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
574
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
566
575
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
567
576
 
568
577
  return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
@@ -571,6 +580,12 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
571
580
 
572
581
  template<typename T, typename S, typename A>
573
582
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
583
+ return deserialize(is, S(), allocator);
584
+ }
585
+
586
+ template<typename T, typename S, typename A>
587
+ template<typename SerDe>
588
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
574
589
  const auto first_byte = read<uint8_t>(is);
575
590
  uint8_t preamble_longs = first_byte & 0x3f;
576
591
  const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
@@ -640,10 +655,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
640
655
  items_deleter deleter(array_size, allocator);
641
656
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
642
657
 
643
- S().deserialize(is, items.get(), h); // aka &data_[0]
658
+ sd.deserialize(is, items.get(), h); // aka &data_[0]
644
659
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
645
660
 
646
- S().deserialize(is, &(items.get()[h + 1]), r);
661
+ sd.deserialize(is, &(items.get()[h + 1]), r);
647
662
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
648
663
 
649
664
  if (!is.good())
@@ -1683,16 +1698,6 @@ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
1683
1698
  return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1684
1699
  }
1685
1700
 
1686
-
1687
-
1688
- // ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
1689
-
1690
- namespace random_utils {
1691
- static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
1692
- static std::mt19937_64 rand(rd());
1693
- static std::uniform_real_distribution<> next_double(0.0, 1.0);
1694
- }
1695
-
1696
1701
  /**
1697
1702
  * Checks if target sampling allocation is more than 50% of max sampling size.
1698
1703
  * If so, returns max sampling size, otherwise passes through target size.
@@ -45,7 +45,11 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
45
45
  * author Kevin Lang
46
46
  * author Jon Malkin
47
47
  */
48
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
48
+ template<
49
+ typename T,
50
+ typename S = serde<T>, // deprecated, to be removed in the next major version
51
+ typename A = std::allocator<T>
52
+ >
49
53
  class var_opt_union {
50
54
 
51
55
  public:
@@ -88,14 +92,16 @@ public:
88
92
  /**
89
93
  * Computes size needed to serialize the current state of the union.
90
94
  * This version is for all other types and can be expensive since every item needs to be looked at.
95
+ * @param sd instance of a SerDe
91
96
  * @return size in bytes needed to serialize this sketch
92
97
  */
93
- size_t get_serialized_size_bytes() const;
94
-
98
+ template<typename SerDe = S>
99
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
100
+
95
101
  // This is a convenience alias for users
96
102
  // The type returned by the following serialize method
97
103
  typedef vector_u8<A> vector_bytes;
98
-
104
+
99
105
  /**
100
106
  * NOTE: This method may be deprecated in a future version.
101
107
  * This method serializes the sketch as a vector of bytes.
@@ -103,33 +109,62 @@ public:
103
109
  * It is a blank space of a given size.
104
110
  * This header is used in Datasketches PostgreSQL extension.
105
111
  * @param header_size_bytes space to reserve in front of the sketch
112
+ * @param sd instance of a SerDe
106
113
  */
107
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
114
+ template<typename SerDe = S>
115
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
108
116
 
109
117
  /**
110
118
  * NOTE: This method may be deprecated in a future version.
111
119
  * This method serializes the sketch into a given stream in a binary form
112
120
  * @param os output stream
121
+ * @param sd instance of a SerDe
113
122
  */
114
- void serialize(std::ostream& os) const;
123
+ template<typename SerDe = S>
124
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
115
125
 
116
126
  /**
117
127
  * NOTE: This method may be deprecated in a future version.
118
128
  * This method deserializes a union from a given stream.
119
129
  * @param is input stream
130
+ * @param allocator instance of an Allocator
120
131
  * @return an instance of a union
121
132
  */
122
133
  static var_opt_union deserialize(std::istream& is, const A& allocator = A());
123
134
 
135
+ /**
136
+ * NOTE: This method may be deprecated in a future version.
137
+ * This method deserializes a union from a given stream.
138
+ * @param is input stream
139
+ * @param sd instance of a SerDe
140
+ * @param allocator instance of an Allocator
141
+ * @return an instance of a union
142
+ */
143
+ template<typename SerDe = S>
144
+ static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
145
+
124
146
  /**
125
147
  * NOTE: This method may be deprecated in a future version.
126
148
  * This method deserializes a union from a given array of bytes.
127
149
  * @param bytes pointer to the array of bytes
128
150
  * @param size the size of the array
151
+ * @param allocator instance of an Allocator
129
152
  * @return an instance of a union
130
153
  */
131
154
  static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
132
155
 
156
+ /**
157
+ * NOTE: This method may be deprecated in a future version.
158
+ * This method deserializes a union from a given array of bytes.
159
+ * @param bytes pointer to the array of bytes
160
+ * @param size the size of the array
161
+ * @param sd instance of a SerDe
162
+ * @param allocator instance of an Allocator
163
+ * @return an instance of a union
164
+ */
165
+ template<typename SerDe = S>
166
+ static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
167
+
133
168
  /**
134
169
  * Prints a summary of the union as a string.
135
170
  * @return the summary as a string
@@ -24,6 +24,7 @@
24
24
 
25
25
  #include <cmath>
26
26
  #include <sstream>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -129,6 +130,12 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
130
 
130
131
  template<typename T, typename S, typename A>
131
132
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
133
+ return deserialize(is, S(), allocator);
134
+ }
135
+
136
+ template<typename T, typename S, typename A>
137
+ template<typename SerDe>
138
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
132
139
  const auto preamble_longs = read<uint8_t>(is);
133
140
  const auto serial_version = read<uint8_t>(is);
134
141
  const auto family_id = read<uint8_t>(is);
@@ -155,7 +162,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
155
162
  const auto outer_tau_numer = read<double>(is);
156
163
  const auto outer_tau_denom = read<uint64_t>(is);
157
164
 
158
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
165
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, sd, allocator);
159
166
 
160
167
  if (!is.good())
161
168
  throw std::runtime_error("error reading from std::istream");
@@ -165,6 +172,12 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
165
172
 
166
173
  template<typename T, typename S, typename A>
167
174
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
175
+ return deserialize(bytes, size, S(), allocator);
176
+ }
177
+
178
+ template<typename T, typename S, typename A>
179
+ template<typename SerDe>
180
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
168
181
  ensure_minimum_memory(size, 8);
169
182
  const char* ptr = static_cast<const char*>(bytes);
170
183
  uint8_t preamble_longs;
@@ -199,22 +212,24 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
199
212
  ptr += copy_from_mem(ptr, outer_tau_denom);
200
213
 
201
214
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
202
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
215
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, sd, allocator);
203
216
 
204
217
  return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
205
218
  }
206
219
 
207
220
  template<typename T, typename S, typename A>
208
- size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
221
+ template<typename SerDe>
222
+ size_t var_opt_union<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
209
223
  if (n_ == 0) {
210
224
  return PREAMBLE_LONGS_EMPTY << 3;
211
225
  } else {
212
- return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
226
+ return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes(sd);
213
227
  }
214
228
  }
215
229
 
216
230
  template<typename T, typename S, typename A>
217
- void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
231
+ template<typename SerDe>
232
+ void var_opt_union<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
218
233
  bool empty = (n_ == 0);
219
234
 
220
235
  const uint8_t serialization_version(SER_VER);
@@ -240,13 +255,14 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
240
255
  write(os, n_);
241
256
  write(os, outer_tau_numer_);
242
257
  write(os, outer_tau_denom_);
243
- gadget_.serialize(os);
258
+ gadget_.serialize(os, sd);
244
259
  }
245
260
  }
246
261
 
247
262
  template<typename T, typename S, typename A>
248
- std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
249
- const size_t size = header_size_bytes + get_serialized_size_bytes();
263
+ template<typename SerDe>
264
+ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
265
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
250
266
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
251
267
  uint8_t* ptr = bytes.data() + header_size_bytes;
252
268
 
@@ -278,7 +294,7 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
278
294
  ptr += copy_to_mem(outer_tau_numer_, ptr);
279
295
  ptr += copy_to_mem(outer_tau_denom_, ptr);
280
296
 
281
- auto gadget_bytes = gadget_.serialize();
297
+ auto gadget_bytes = gadget_.serialize(0, sd);
282
298
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
283
299
  }
284
300
 
@@ -39,7 +39,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
39
39
  var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
40
40
  for (int i = 0; i < 100; ++i) sk1.update(i);
41
41
  auto bytes1 = sk1.serialize();
42
- auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
42
+ auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
43
43
 
44
44
  std::stringstream ss;
45
45
  sk1.serialize(ss);
@@ -51,7 +51,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
51
51
  u1.update(sk3);
52
52
 
53
53
  auto bytes2 = u1.serialize();
54
- auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
54
+ auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
55
55
  }
56
56
  REQUIRE(test_allocator_total_bytes == 0);
57
57
  REQUIRE(test_allocator_net_allocations == 0);
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -81,7 +81,7 @@ class CMakeBuild(build_ext):
81
81
 
82
82
  setup(
83
83
  name='datasketches',
84
- version='3.3.0',
84
+ version='3.4.0',
85
85
  author='Apache Software Foundation',
86
86
  author_email='dev@datasketches.apache.org',
87
87
  description='The Apache DataSketches Library for Python',
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <cstdint>
24
24
  #include <string>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "bounds_binomial_proportions.hpp"
27
28
 
@@ -22,12 +22,13 @@
22
22
 
23
23
  #include <iostream>
24
24
  #include <iomanip>
25
+ #include <stdexcept>
25
26
 
26
27
  namespace datasketches {
27
28
 
28
29
  template<bool dummy>
29
30
  auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
- if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
31
32
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
33
 
33
34
  uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
@@ -43,10 +44,11 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
43
44
  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
44
45
  const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
45
46
  if (has_theta) {
46
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
47
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
47
48
  theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
48
49
  }
49
50
  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
51
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
50
52
  return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
51
53
  }
52
54
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
@@ -54,7 +56,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
54
56
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
55
57
  const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
56
58
  if (size < expected_size_bytes) {
57
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
59
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
58
60
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
59
61
  }
60
62
  const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
@@ -72,7 +74,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
72
74
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
73
75
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
74
76
  if (size < expected_size_bytes) {
75
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
77
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
76
78
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
77
79
  }
78
80
  return {false, true, seed_hash, num_entries, theta, entries};
@@ -91,7 +93,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
91
93
  } else {
92
94
  const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
93
95
  if (size < expected_size_bytes) {
94
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
96
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
95
97
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
96
98
  }
97
99
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
@@ -107,7 +109,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
107
109
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
108
110
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
109
111
  if (size < expected_size_bytes) {
110
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
112
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
111
113
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
112
114
  }
113
115
  return {false, true, seed_hash, num_entries, theta, entries};
@@ -20,6 +20,7 @@
20
20
  #include <iostream>
21
21
  #include <sstream>
22
22
  #include <algorithm>
23
+ #include <stdexcept>
23
24
 
24
25
  #include "conditional_forward.hpp"
25
26
 
@@ -21,6 +21,7 @@
21
21
  #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_back_inserter.hpp"
26
27
  #include "conditional_forward.hpp"
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <sstream>
24
24
  #include <vector>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "serde.hpp"
27
28
  #include "binomial_bounds.hpp"
@@ -453,7 +454,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
453
454
  const auto num_entries = read<uint32_t>(is);
454
455
  read<uint32_t>(is); //unused
455
456
  const auto theta = read<uint64_t>(is);
456
- std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
457
458
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
459
  if (!is_empty)
459
460
  read(is, entries.data(), sizeof(uint64_t) * entries.size());
@@ -470,12 +471,12 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
470
471
  if (preamble_longs == 1) {
471
472
  if (!is.good())
472
473
  throw std::runtime_error("error reading from std::istream");
473
- std::vector<uint64_t> entries(0, 0, allocator);
474
+ std::vector<uint64_t, A> entries(0, 0, allocator);
474
475
  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
476
  } else if (preamble_longs == 2) {
476
477
  const uint32_t num_entries = read<uint32_t>(is);
477
478
  read<uint32_t>(is); // unused
478
- std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
479
480
  if (num_entries == 0) {
480
481
  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
482
  }
@@ -488,7 +489,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
488
489
  read<uint32_t>(is); // unused
489
490
  const auto theta = read<uint64_t>(is);
490
491
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
- std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
492
493
  if (is_empty) {
493
494
  if (!is.good())
494
495
  throw std::runtime_error("error reading from std::istream");
@@ -514,47 +515,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
514
515
 
515
516
  template<typename A>
516
517
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
517
- ensure_minimum_memory(size, 8);
518
- const char* ptr = static_cast<const char*>(bytes);
519
- const char* base = ptr;
520
- uint8_t preamble_longs;
521
- ptr += copy_from_mem(ptr, preamble_longs);
522
- uint8_t serial_version;
523
- ptr += copy_from_mem(ptr, serial_version);
524
- uint8_t type;
525
- ptr += copy_from_mem(ptr, type);
526
- ptr += sizeof(uint16_t); // unused
527
- uint8_t flags_byte;
528
- ptr += copy_from_mem(ptr, flags_byte);
529
- uint16_t seed_hash;
530
- ptr += copy_from_mem(ptr, seed_hash);
531
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
532
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
533
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
534
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
535
-
536
- uint64_t theta = theta_constants::MAX_THETA;
537
- uint32_t num_entries = 0;
538
- if (!is_empty) {
539
- if (preamble_longs == 1) {
540
- num_entries = 1;
541
- } else {
542
- ensure_minimum_memory(size, 8); // read the first prelong before this method
543
- ptr += copy_from_mem(ptr, num_entries);
544
- ptr += sizeof(uint32_t); // unused
545
- if (preamble_longs > 2) {
546
- ensure_minimum_memory(size, (preamble_longs - 1) << 3);
547
- ptr += copy_from_mem(ptr, theta);
548
- }
549
- }
550
- }
551
- const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
552
- check_memory_size(ptr - base + entries_size_bytes, size);
553
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
554
- if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
555
-
556
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
557
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
518
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
519
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
558
520
  }
559
521
 
560
522
  // wrapped compact sketch
@@ -21,6 +21,7 @@
21
21
  #define THETA_UNION_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_forward.hpp"
26
27
 
@@ -23,6 +23,7 @@
23
23
  #include <iostream>
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
+ #include <stdexcept>
26
27
 
27
28
  #include "theta_helpers.hpp"
28
29
 
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_a_not_b.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_intersection.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta intersection: invalid", "[theta_intersection]") {
@@ -18,6 +18,7 @@
18
18
  */
19
19
 
20
20
  #include <sstream>
21
+ #include <stdexcept>
21
22
 
22
23
  #include <catch.hpp>
23
24
  #include <theta_union.hpp>