datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -25,6 +25,7 @@
25
25
  #include <cmath>
26
26
  #include <random>
27
27
  #include <algorithm>
28
+ #include <stdexcept>
28
29
 
29
30
  #include "var_opt_sketch.hpp"
30
31
  #include "serde.hpp"
@@ -311,8 +312,8 @@ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other)
311
312
 
312
313
  // implementation for fixed-size arithmetic types (integral and floating point)
313
314
  template<typename T, typename S, typename A>
314
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
315
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
315
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
316
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe&) const {
316
317
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
317
318
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
318
319
  num_bytes += h_ * sizeof(double); // weights
@@ -325,8 +326,8 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
325
326
 
326
327
  // implementation for all other types
327
328
  template<typename T, typename S, typename A>
328
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
329
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
329
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
330
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
330
331
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
331
332
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
332
333
  num_bytes += h_ * sizeof(double); // weights
@@ -335,13 +336,14 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
335
336
  }
336
337
  // must iterate over the items
337
338
  for (auto it: *this)
338
- num_bytes += S().size_of_item(it.first);
339
+ num_bytes += sd.size_of_item(it.first);
339
340
  return num_bytes;
340
341
  }
341
342
 
342
343
  template<typename T, typename S, typename A>
343
- std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
344
- const size_t size = header_size_bytes + get_serialized_size_bytes();
344
+ template<typename SerDe>
345
+ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
346
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
345
347
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, allocator_);
346
348
  uint8_t* ptr = bytes.data() + header_size_bytes;
347
349
  uint8_t* end_ptr = ptr + size;
@@ -400,8 +402,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
400
402
  }
401
403
 
402
404
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
403
- ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
404
- ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
+ ptr += sd.serialize(ptr, end_ptr - ptr, data_, h_);
406
+ ptr += sd.serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
407
  }
406
408
 
407
409
  size_t bytes_written = ptr - bytes.data();
@@ -413,7 +415,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
413
415
  }
414
416
 
415
417
  template<typename T, typename S, typename A>
416
- void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
418
+ template<typename SerDe>
419
+ void var_opt_sketch<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
417
420
  const bool empty = (h_ == 0) && (r_ == 0);
418
421
 
419
422
  const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
@@ -469,13 +472,19 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
469
472
  }
470
473
 
471
474
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
472
- S().serialize(os, data_, h_);
473
- S().serialize(os, &data_[h_ + 1], r_);
475
+ sd.serialize(os, data_, h_);
476
+ sd.serialize(os, &data_[h_ + 1], r_);
474
477
  }
475
478
  }
476
479
 
477
480
  template<typename T, typename S, typename A>
478
481
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
482
+ return deserialize(bytes, size, S(), allocator);
483
+ }
484
+
485
+ template<typename T, typename S, typename A>
486
+ template<typename SerDe>
487
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
479
488
  ensure_minimum_memory(size, 8);
480
489
  const char* ptr = static_cast<const char*>(bytes);
481
490
  const char* base = ptr;
@@ -559,10 +568,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
559
568
  items_deleter deleter(array_size, allocator);
560
569
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
561
570
 
562
- ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
571
+ ptr += sd.deserialize(ptr, end_ptr - ptr, items.get(), h);
563
572
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
564
573
 
565
- ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
574
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
566
575
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
567
576
 
568
577
  return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
@@ -571,6 +580,12 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
571
580
 
572
581
  template<typename T, typename S, typename A>
573
582
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
583
+ return deserialize(is, S(), allocator);
584
+ }
585
+
586
+ template<typename T, typename S, typename A>
587
+ template<typename SerDe>
588
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
574
589
  const auto first_byte = read<uint8_t>(is);
575
590
  uint8_t preamble_longs = first_byte & 0x3f;
576
591
  const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
@@ -640,10 +655,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
640
655
  items_deleter deleter(array_size, allocator);
641
656
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
642
657
 
643
- S().deserialize(is, items.get(), h); // aka &data_[0]
658
+ sd.deserialize(is, items.get(), h); // aka &data_[0]
644
659
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
645
660
 
646
- S().deserialize(is, &(items.get()[h + 1]), r);
661
+ sd.deserialize(is, &(items.get()[h + 1]), r);
647
662
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
648
663
 
649
664
  if (!is.good())
@@ -1683,16 +1698,6 @@ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
1683
1698
  return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1684
1699
  }
1685
1700
 
1686
-
1687
-
1688
- // ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
1689
-
1690
- namespace random_utils {
1691
- static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
1692
- static std::mt19937_64 rand(rd());
1693
- static std::uniform_real_distribution<> next_double(0.0, 1.0);
1694
- }
1695
-
1696
1701
  /**
1697
1702
  * Checks if target sampling allocation is more than 50% of max sampling size.
1698
1703
  * If so, returns max sampling size, otherwise passes through target size.
@@ -45,7 +45,11 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
45
45
  * author Kevin Lang
46
46
  * author Jon Malkin
47
47
  */
48
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
48
+ template<
49
+ typename T,
50
+ typename S = serde<T>, // deprecated, to be removed in the next major version
51
+ typename A = std::allocator<T>
52
+ >
49
53
  class var_opt_union {
50
54
 
51
55
  public:
@@ -88,14 +92,16 @@ public:
88
92
  /**
89
93
  * Computes size needed to serialize the current state of the union.
90
94
  * This version is for all other types and can be expensive since every item needs to be looked at.
95
+ * @param instance of a SerDe
91
96
  * @return size in bytes needed to serialize this sketch
92
97
  */
93
- size_t get_serialized_size_bytes() const;
94
-
98
+ template<typename SerDe = S>
99
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
100
+
95
101
  // This is a convenience alias for users
96
102
  // The type returned by the following serialize method
97
103
  typedef vector_u8<A> vector_bytes;
98
-
104
+
99
105
  /**
100
106
  * NOTE: This method may be deprecated in a future version.
101
107
  * This method serializes the sketch as a vector of bytes.
@@ -103,33 +109,62 @@ public:
103
109
  * It is a blank space of a given size.
104
110
  * This header is used in Datasketches PostgreSQL extension.
105
111
  * @param header_size_bytes space to reserve in front of the sketch
112
+ * @param instance of a SerDe
106
113
  */
107
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
114
+ template<typename SerDe = S>
115
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
108
116
 
109
117
  /**
110
118
  * NOTE: This method may be deprecated in a future version.
111
119
  * This method serializes the sketch into a given stream in a binary form
112
120
  * @param os output stream
121
+ * @param instance of a SerDe
113
122
  */
114
- void serialize(std::ostream& os) const;
123
+ template<typename SerDe = S>
124
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
115
125
 
116
126
  /**
117
127
  * NOTE: This method may be deprecated in a future version.
118
128
  * This method deserializes a union from a given stream.
119
129
  * @param is input stream
130
+ * @param instance of an Allocator
120
131
  * @return an instance of a union
121
132
  */
122
133
  static var_opt_union deserialize(std::istream& is, const A& allocator = A());
123
134
 
135
+ /**
136
+ * NOTE: This method may be deprecated in a future version.
137
+ * This method deserializes a union from a given stream.
138
+ * @param is input stream
139
+ * @param instance of a SerDe
140
+ * @param instance of an Allocator
141
+ * @return an instance of a union
142
+ */
143
+ template<typename SerDe = S>
144
+ static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
145
+
124
146
  /**
125
147
  * NOTE: This method may be deprecated in a future version.
126
148
  * This method deserializes a union from a given array of bytes.
127
149
  * @param bytes pointer to the array of bytes
128
150
  * @param size the size of the array
151
+ * @param instance of an Allocator
129
152
  * @return an instance of a union
130
153
  */
131
154
  static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
132
155
 
156
+ /**
157
+ * NOTE: This method may be deprecated in a future version.
158
+ * This method deserializes a union from a given array of bytes.
159
+ * @param bytes pointer to the array of bytes
160
+ * @param size the size of the array
161
+ * @param instance of a SerDe
162
+ * @param instance of an Allocator
163
+ * @return an instance of a union
164
+ */
165
+ template<typename SerDe = S>
166
+ static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
167
+
133
168
  /**
134
169
  * Prints a summary of the union as a string.
135
170
  * @return the summary as a string
@@ -24,6 +24,7 @@
24
24
 
25
25
  #include <cmath>
26
26
  #include <sstream>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -129,6 +130,12 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
130
 
130
131
  template<typename T, typename S, typename A>
131
132
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
133
+ return deserialize(is, S(), allocator);
134
+ }
135
+
136
+ template<typename T, typename S, typename A>
137
+ template<typename SerDe>
138
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
132
139
  const auto preamble_longs = read<uint8_t>(is);
133
140
  const auto serial_version = read<uint8_t>(is);
134
141
  const auto family_id = read<uint8_t>(is);
@@ -155,7 +162,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
155
162
  const auto outer_tau_numer = read<double>(is);
156
163
  const auto outer_tau_denom = read<uint64_t>(is);
157
164
 
158
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
165
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, sd, allocator);
159
166
 
160
167
  if (!is.good())
161
168
  throw std::runtime_error("error reading from std::istream");
@@ -165,6 +172,12 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
165
172
 
166
173
  template<typename T, typename S, typename A>
167
174
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
175
+ return deserialize(bytes, size, S(), allocator);
176
+ }
177
+
178
+ template<typename T, typename S, typename A>
179
+ template<typename SerDe>
180
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
168
181
  ensure_minimum_memory(size, 8);
169
182
  const char* ptr = static_cast<const char*>(bytes);
170
183
  uint8_t preamble_longs;
@@ -199,22 +212,24 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
199
212
  ptr += copy_from_mem(ptr, outer_tau_denom);
200
213
 
201
214
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
202
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
215
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, sd, allocator);
203
216
 
204
217
  return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
205
218
  }
206
219
 
207
220
  template<typename T, typename S, typename A>
208
- size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
221
+ template<typename SerDe>
222
+ size_t var_opt_union<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
209
223
  if (n_ == 0) {
210
224
  return PREAMBLE_LONGS_EMPTY << 3;
211
225
  } else {
212
- return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
226
+ return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes(sd);
213
227
  }
214
228
  }
215
229
 
216
230
  template<typename T, typename S, typename A>
217
- void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
231
+ template<typename SerDe>
232
+ void var_opt_union<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
218
233
  bool empty = (n_ == 0);
219
234
 
220
235
  const uint8_t serialization_version(SER_VER);
@@ -240,13 +255,14 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
240
255
  write(os, n_);
241
256
  write(os, outer_tau_numer_);
242
257
  write(os, outer_tau_denom_);
243
- gadget_.serialize(os);
258
+ gadget_.serialize(os, sd);
244
259
  }
245
260
  }
246
261
 
247
262
  template<typename T, typename S, typename A>
248
- std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
249
- const size_t size = header_size_bytes + get_serialized_size_bytes();
263
+ template<typename SerDe>
264
+ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
265
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
250
266
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
251
267
  uint8_t* ptr = bytes.data() + header_size_bytes;
252
268
 
@@ -278,7 +294,7 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
278
294
  ptr += copy_to_mem(outer_tau_numer_, ptr);
279
295
  ptr += copy_to_mem(outer_tau_denom_, ptr);
280
296
 
281
- auto gadget_bytes = gadget_.serialize();
297
+ auto gadget_bytes = gadget_.serialize(0, sd);
282
298
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
283
299
  }
284
300
 
@@ -39,7 +39,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
39
39
  var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
40
40
  for (int i = 0; i < 100; ++i) sk1.update(i);
41
41
  auto bytes1 = sk1.serialize();
42
- auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
42
+ auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
43
43
 
44
44
  std::stringstream ss;
45
45
  sk1.serialize(ss);
@@ -51,7 +51,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
51
51
  u1.update(sk3);
52
52
 
53
53
  auto bytes2 = u1.serialize();
54
- auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
54
+ auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
55
55
  }
56
56
  REQUIRE(test_allocator_total_bytes == 0);
57
57
  REQUIRE(test_allocator_net_allocations == 0);
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -81,7 +81,7 @@ class CMakeBuild(build_ext):
81
81
 
82
82
  setup(
83
83
  name='datasketches',
84
- version='3.3.0',
84
+ version='3.4.0',
85
85
  author='Apache Software Foundation',
86
86
  author_email='dev@datasketches.apache.org',
87
87
  description='The Apache DataSketches Library for Python',
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <cstdint>
24
24
  #include <string>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "bounds_binomial_proportions.hpp"
27
28
 
@@ -22,12 +22,13 @@
22
22
 
23
23
  #include <iostream>
24
24
  #include <iomanip>
25
+ #include <stdexcept>
25
26
 
26
27
  namespace datasketches {
27
28
 
28
29
  template<bool dummy>
29
30
  auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
- if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
31
32
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
33
 
33
34
  uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
@@ -43,10 +44,11 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
43
44
  checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
44
45
  const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
45
46
  if (has_theta) {
46
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
47
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
47
48
  theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
48
49
  }
49
50
  if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
51
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
50
52
  return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
51
53
  }
52
54
  const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
@@ -54,7 +56,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
54
56
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
55
57
  const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
56
58
  if (size < expected_size_bytes) {
57
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
59
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
58
60
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
59
61
  }
60
62
  const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
@@ -72,7 +74,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
72
74
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
73
75
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
74
76
  if (size < expected_size_bytes) {
75
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
77
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
76
78
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
77
79
  }
78
80
  return {false, true, seed_hash, num_entries, theta, entries};
@@ -91,7 +93,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
91
93
  } else {
92
94
  const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
93
95
  if (size < expected_size_bytes) {
94
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
96
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
95
97
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
96
98
  }
97
99
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
@@ -107,7 +109,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
107
109
  const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
108
110
  const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
109
111
  if (size < expected_size_bytes) {
110
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
112
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
111
113
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
112
114
  }
113
115
  return {false, true, seed_hash, num_entries, theta, entries};
@@ -20,6 +20,7 @@
20
20
  #include <iostream>
21
21
  #include <sstream>
22
22
  #include <algorithm>
23
+ #include <stdexcept>
23
24
 
24
25
  #include "conditional_forward.hpp"
25
26
 
@@ -21,6 +21,7 @@
21
21
  #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_back_inserter.hpp"
26
27
  #include "conditional_forward.hpp"
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <sstream>
24
24
  #include <vector>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "serde.hpp"
27
28
  #include "binomial_bounds.hpp"
@@ -453,7 +454,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
453
454
  const auto num_entries = read<uint32_t>(is);
454
455
  read<uint32_t>(is); //unused
455
456
  const auto theta = read<uint64_t>(is);
456
- std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
457
458
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
459
  if (!is_empty)
459
460
  read(is, entries.data(), sizeof(uint64_t) * entries.size());
@@ -470,12 +471,12 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
470
471
  if (preamble_longs == 1) {
471
472
  if (!is.good())
472
473
  throw std::runtime_error("error reading from std::istream");
473
- std::vector<uint64_t> entries(0, 0, allocator);
474
+ std::vector<uint64_t, A> entries(0, 0, allocator);
474
475
  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
476
  } else if (preamble_longs == 2) {
476
477
  const uint32_t num_entries = read<uint32_t>(is);
477
478
  read<uint32_t>(is); // unused
478
- std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
479
480
  if (num_entries == 0) {
480
481
  return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
482
  }
@@ -488,7 +489,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
488
489
  read<uint32_t>(is); // unused
489
490
  const auto theta = read<uint64_t>(is);
490
491
  bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
- std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
492
493
  if (is_empty) {
493
494
  if (!is.good())
494
495
  throw std::runtime_error("error reading from std::istream");
@@ -514,47 +515,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
514
515
 
515
516
  template<typename A>
516
517
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
517
- ensure_minimum_memory(size, 8);
518
- const char* ptr = static_cast<const char*>(bytes);
519
- const char* base = ptr;
520
- uint8_t preamble_longs;
521
- ptr += copy_from_mem(ptr, preamble_longs);
522
- uint8_t serial_version;
523
- ptr += copy_from_mem(ptr, serial_version);
524
- uint8_t type;
525
- ptr += copy_from_mem(ptr, type);
526
- ptr += sizeof(uint16_t); // unused
527
- uint8_t flags_byte;
528
- ptr += copy_from_mem(ptr, flags_byte);
529
- uint16_t seed_hash;
530
- ptr += copy_from_mem(ptr, seed_hash);
531
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
532
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
533
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
534
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
535
-
536
- uint64_t theta = theta_constants::MAX_THETA;
537
- uint32_t num_entries = 0;
538
- if (!is_empty) {
539
- if (preamble_longs == 1) {
540
- num_entries = 1;
541
- } else {
542
- ensure_minimum_memory(size, 8); // read the first prelong before this method
543
- ptr += copy_from_mem(ptr, num_entries);
544
- ptr += sizeof(uint32_t); // unused
545
- if (preamble_longs > 2) {
546
- ensure_minimum_memory(size, (preamble_longs - 1) << 3);
547
- ptr += copy_from_mem(ptr, theta);
548
- }
549
- }
550
- }
551
- const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
552
- check_memory_size(ptr - base + entries_size_bytes, size);
553
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
554
- if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
555
-
556
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
557
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
518
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
519
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
558
520
  }
559
521
 
560
522
  // wrapped compact sketch
@@ -21,6 +21,7 @@
21
21
  #define THETA_UNION_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_forward.hpp"
26
27
 
@@ -23,6 +23,7 @@
23
23
  #include <iostream>
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
+ #include <stdexcept>
26
27
 
27
28
  #include "theta_helpers.hpp"
28
29
 
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_a_not_b.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_intersection.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta intersection: invalid", "[theta_intersection]") {
@@ -18,6 +18,7 @@
18
18
  */
19
19
 
20
20
  #include <sstream>
21
+ #include <stdexcept>
21
22
 
22
23
  #include <catch.hpp>
23
24
  #include <theta_union.hpp>