datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -26,68 +26,77 @@
26
26
  #include "serde.hpp"
27
27
  #include "binomial_bounds.hpp"
28
28
  #include "theta_helpers.hpp"
29
+ #include "compact_theta_sketch_parser.hpp"
29
30
 
30
31
  namespace datasketches {
31
32
 
32
33
  template<typename A>
33
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
34
+ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
34
35
  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
35
36
  }
36
37
 
37
38
  template<typename A>
38
- double theta_sketch_alloc<A>::get_theta() const {
39
+ double base_theta_sketch_alloc<A>::get_theta() const {
39
40
  return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
40
41
  }
41
42
 
42
43
  template<typename A>
43
- double theta_sketch_alloc<A>::get_estimate() const {
44
+ double base_theta_sketch_alloc<A>::get_estimate() const {
44
45
  return get_num_retained() / get_theta();
45
46
  }
46
47
 
47
48
  template<typename A>
48
- double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
+ double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
50
  if (!is_estimation_mode()) return get_num_retained();
50
51
  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
51
52
  }
52
53
 
53
54
  template<typename A>
54
- double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
+ double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
56
  if (!is_estimation_mode()) return get_num_retained();
56
57
  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
57
58
  }
58
59
 
59
60
  template<typename A>
60
- string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
61
- ostrstream os;
61
+ string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
62
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
63
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
64
+ std::ostringstream os;
62
65
  os << "### Theta sketch summary:" << std::endl;
63
- os << " num retained entries : " << get_num_retained() << std::endl;
64
- os << " seed hash : " << get_seed_hash() << std::endl;
65
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
66
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
67
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
68
- os << " theta (fraction) : " << get_theta() << std::endl;
69
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
66
+ os << " num retained entries : " << this->get_num_retained() << std::endl;
67
+ os << " seed hash : " << this->get_seed_hash() << std::endl;
68
+ os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
69
+ os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
70
+ os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
71
+ os << " theta (fraction) : " << this->get_theta() << std::endl;
72
+ os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
70
73
  os << " estimate : " << this->get_estimate() << std::endl;
71
74
  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
72
75
  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
73
76
  print_specifics(os);
74
77
  os << "### End sketch summary" << std::endl;
75
- if (detail) {
78
+ if (print_details) {
79
+ print_items(os);
80
+ }
81
+ return string<A>(os.str().c_str(), this->get_allocator());
82
+ }
83
+
84
+ template<typename A>
85
+ void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
76
86
  os << "### Retained entries" << std::endl;
77
87
  for (const auto& hash: *this) {
78
88
  os << hash << std::endl;
79
89
  }
80
90
  os << "### End retained entries" << std::endl;
81
- }
82
- return os.str();
83
91
  }
84
92
 
93
+
85
94
  // update sketch
86
95
 
87
96
  template<typename A>
88
97
  update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
89
- uint64_t theta, uint64_t seed, const A& allocator):
90
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
98
+ float p, uint64_t theta, uint64_t seed, const A& allocator):
99
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
91
100
  {}
92
101
 
93
102
  template<typename A>
@@ -102,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
102
111
 
103
112
  template<typename A>
104
113
  bool update_theta_sketch_alloc<A>::is_ordered() const {
105
- return false;
114
+ return table_.num_entries_ > 1 ? false : true;
106
115
  }
107
116
 
108
117
  template<typename A>
109
118
  uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
110
- return table_.theta_;
119
+ return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
111
120
  }
112
121
 
113
122
  template<typename A>
@@ -201,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
201
210
  table_.trim();
202
211
  }
203
212
 
213
+ template<typename A>
214
+ void update_theta_sketch_alloc<A>::reset() {
215
+ table_.reset();
216
+ }
217
+
204
218
  template<typename A>
205
219
  auto update_theta_sketch_alloc<A>::begin() -> iterator {
206
220
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
@@ -227,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
227
241
  }
228
242
 
229
243
  template<typename A>
230
- void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
244
+ void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
231
245
  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
232
246
  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
233
247
  os << " resize factor : " << (1 << table_.rf_) << std::endl;
@@ -240,29 +254,32 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
240
254
 
241
255
  template<typename A>
242
256
  update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
243
- return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
257
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
244
258
  }
245
259
 
246
260
  // compact sketch
247
261
 
248
262
  template<typename A>
249
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
263
+ template<typename Other>
264
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
250
265
  is_empty_(other.is_empty()),
251
266
  is_ordered_(other.is_ordered() || ordered),
252
267
  seed_hash_(other.get_seed_hash()),
253
268
  theta_(other.get_theta64()),
254
269
  entries_(other.get_allocator())
255
270
  {
256
- entries_.reserve(other.get_num_retained());
257
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
258
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
271
+ if (!other.is_empty()) {
272
+ entries_.reserve(other.get_num_retained());
273
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
274
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
275
+ }
259
276
  }
260
277
 
261
278
  template<typename A>
262
279
  compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
263
280
  std::vector<uint64_t, A>&& entries):
264
281
  is_empty_(is_empty),
265
- is_ordered_(is_ordered),
282
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
266
283
  seed_hash_(seed_hash),
267
284
  theta_(theta),
268
285
  entries_(std::move(entries))
@@ -290,7 +307,7 @@ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
290
307
 
291
308
  template<typename A>
292
309
  uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
293
- return entries_.size();
310
+ return static_cast<uint32_t>(entries_.size());
294
311
  }
295
312
 
296
313
  template<typename A>
@@ -300,58 +317,58 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
300
317
 
301
318
  template<typename A>
302
319
  auto compact_theta_sketch_alloc<A>::begin() -> iterator {
303
- return iterator(entries_.data(), entries_.size(), 0);
320
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
304
321
  }
305
322
 
306
323
  template<typename A>
307
324
  auto compact_theta_sketch_alloc<A>::end() -> iterator {
308
- return iterator(nullptr, 0, entries_.size());
325
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
309
326
  }
310
327
 
311
328
  template<typename A>
312
329
  auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
313
- return const_iterator(entries_.data(), entries_.size(), 0);
330
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
314
331
  }
315
332
 
316
333
  template<typename A>
317
334
  auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
318
- return const_iterator(nullptr, 0, entries_.size());
335
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
319
336
  }
320
337
 
321
338
  template<typename A>
322
- void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
339
+ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
323
340
 
324
341
  template<typename A>
325
342
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
326
343
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
327
344
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
328
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
345
+ write(os, preamble_longs);
329
346
  const uint8_t serial_version = SERIAL_VERSION;
330
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
347
+ write(os, serial_version);
331
348
  const uint8_t type = SKETCH_TYPE;
332
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
349
+ write(os, type);
333
350
  const uint16_t unused16 = 0;
334
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
351
+ write(os, unused16);
335
352
  const uint8_t flags_byte(
336
353
  (1 << flags::IS_COMPACT) |
337
354
  (1 << flags::IS_READ_ONLY) |
338
355
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
339
356
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
340
357
  );
341
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
358
+ write(os, flags_byte);
342
359
  const uint16_t seed_hash = get_seed_hash();
343
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
360
+ write(os, seed_hash);
344
361
  if (!this->is_empty()) {
345
362
  if (!is_single_item) {
346
- const uint32_t num_entries = entries_.size();
347
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
363
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
364
+ write(os, num_entries);
348
365
  const uint32_t unused32 = 0;
349
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
366
+ write(os, unused32);
350
367
  if (this->is_estimation_mode()) {
351
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
368
+ write(os, this->theta_);
352
369
  }
353
370
  }
354
- os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
371
+ write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
355
372
  }
356
373
  }
357
374
 
@@ -364,30 +381,28 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
364
381
  vector_bytes bytes(size, 0, entries_.get_allocator());
365
382
  uint8_t* ptr = bytes.data() + header_size_bytes;
366
383
 
367
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
384
+ ptr += copy_to_mem(preamble_longs, ptr);
368
385
  const uint8_t serial_version = SERIAL_VERSION;
369
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
386
+ ptr += copy_to_mem(serial_version, ptr);
370
387
  const uint8_t type = SKETCH_TYPE;
371
- ptr += copy_to_mem(&type, ptr, sizeof(type));
372
- const uint16_t unused16 = 0;
373
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
388
+ ptr += copy_to_mem(type, ptr);
389
+ ptr += sizeof(uint16_t); // unused
374
390
  const uint8_t flags_byte(
375
391
  (1 << flags::IS_COMPACT) |
376
392
  (1 << flags::IS_READ_ONLY) |
377
393
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
378
394
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
379
395
  );
380
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
396
+ ptr += copy_to_mem(flags_byte, ptr);
381
397
  const uint16_t seed_hash = get_seed_hash();
382
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
398
+ ptr += copy_to_mem(seed_hash, ptr);
383
399
  if (!this->is_empty()) {
384
400
  if (!is_single_item) {
385
- const uint32_t num_entries = entries_.size();
386
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
387
- const uint32_t unused32 = 0;
388
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
401
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
402
+ ptr += copy_to_mem(num_entries, ptr);
403
+ ptr += sizeof(uint32_t);
389
404
  if (this->is_estimation_mode()) {
390
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
405
+ ptr += copy_to_mem(theta_, ptr);
391
406
  }
392
407
  }
393
408
  ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
@@ -397,43 +412,104 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
397
412
 
398
413
  template<typename A>
399
414
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
400
- uint8_t preamble_longs;
401
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
402
- uint8_t serial_version;
403
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
404
- uint8_t type;
405
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
406
- uint16_t unused16;
407
- is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
408
- uint8_t flags_byte;
409
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
410
- uint16_t seed_hash;
411
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
412
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
413
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
414
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
415
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
415
+ const auto preamble_longs = read<uint8_t>(is);
416
+ const auto serial_version = read<uint8_t>(is);
417
+ const auto type = read<uint8_t>(is);
418
+ switch (serial_version) {
419
+ case SERIAL_VERSION: {
420
+ read<uint16_t>(is); // unused
421
+ const auto flags_byte = read<uint8_t>(is);
422
+ const auto seed_hash = read<uint16_t>(is);
423
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
424
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
425
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
426
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
427
+
428
+ uint64_t theta = theta_constants::MAX_THETA;
429
+ uint32_t num_entries = 0;
430
+ if (!is_empty) {
431
+ if (preamble_longs == 1) {
432
+ num_entries = 1;
433
+ } else {
434
+ num_entries = read<uint32_t>(is);
435
+ read<uint32_t>(is); // unused
436
+ if (preamble_longs > 2) {
437
+ theta = read<uint64_t>(is);
438
+ }
439
+ }
440
+ }
441
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
442
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
416
443
 
417
- uint64_t theta = theta_constants::MAX_THETA;
418
- uint32_t num_entries = 0;
419
- if (!is_empty) {
420
- if (preamble_longs == 1) {
421
- num_entries = 1;
422
- } else {
423
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
424
- uint32_t unused32;
425
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
426
- if (preamble_longs > 2) {
427
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
444
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
445
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
446
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
447
+ }
448
+ case 1: {
449
+ const auto seed_hash = compute_seed_hash(seed);
450
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
451
+ read<uint8_t>(is); // unused
452
+ read<uint32_t>(is); // unused
453
+ const auto num_entries = read<uint32_t>(is);
454
+ read<uint32_t>(is); //unused
455
+ const auto theta = read<uint64_t>(is);
456
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
+ if (!is_empty)
459
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
460
+ if (!is.good())
461
+ throw std::runtime_error("error reading from std::istream");
462
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
463
+ }
464
+ case 2: {
465
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
466
+ read<uint8_t>(is); // unused
467
+ read<uint16_t>(is); // unused
468
+ const uint16_t seed_hash = read<uint16_t>(is);
469
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
470
+ if (preamble_longs == 1) {
471
+ if (!is.good())
472
+ throw std::runtime_error("error reading from std::istream");
473
+ std::vector<uint64_t> entries(0, 0, allocator);
474
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
+ } else if (preamble_longs == 2) {
476
+ const uint32_t num_entries = read<uint32_t>(is);
477
+ read<uint32_t>(is); // unused
478
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ if (num_entries == 0) {
480
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
+ }
482
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
483
+ if (!is.good())
484
+ throw std::runtime_error("error reading from std::istream");
485
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
486
+ } else if (preamble_longs == 3) {
487
+ const uint32_t num_entries = read<uint32_t>(is);
488
+ read<uint32_t>(is); // unused
489
+ const auto theta = read<uint64_t>(is);
490
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ if (is_empty) {
493
+ if (!is.good())
494
+ throw std::runtime_error("error reading from std::istream");
495
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
496
+ } else {
497
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
498
+ if (!is.good())
499
+ throw std::runtime_error("error reading from std::istream");
500
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
501
+ }
502
+ } else {
503
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
428
504
  }
429
- }
430
505
  }
431
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
432
- if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
433
-
434
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
435
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
436
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
506
+ default:
507
+ // this should always fail since the valid cases are handled above
508
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
509
+ // this throw is never reached, because check_serial_version will throw an informative exception.
510
+ // This is only here to avoid a compiler warning about a path without a return value.
511
+ throw std::invalid_argument("unexpected sketch serialization version");
512
+ }
437
513
  }
438
514
 
439
515
  template<typename A>
@@ -442,17 +518,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
442
518
  const char* ptr = static_cast<const char*>(bytes);
443
519
  const char* base = ptr;
444
520
  uint8_t preamble_longs;
445
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
521
+ ptr += copy_from_mem(ptr, preamble_longs);
446
522
  uint8_t serial_version;
447
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
523
+ ptr += copy_from_mem(ptr, serial_version);
448
524
  uint8_t type;
449
- ptr += copy_from_mem(ptr, &type, sizeof(type));
450
- uint16_t unused16;
451
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
525
+ ptr += copy_from_mem(ptr, type);
526
+ ptr += sizeof(uint16_t); // unused
452
527
  uint8_t flags_byte;
453
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
528
+ ptr += copy_from_mem(ptr, flags_byte);
454
529
  uint16_t seed_hash;
455
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
530
+ ptr += copy_from_mem(ptr, seed_hash);
456
531
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
532
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
458
533
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -465,12 +540,11 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
465
540
  num_entries = 1;
466
541
  } else {
467
542
  ensure_minimum_memory(size, 8); // read the first prelong before this method
468
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
469
- uint32_t unused32;
470
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
543
+ ptr += copy_from_mem(ptr, num_entries);
544
+ ptr += sizeof(uint32_t); // unused
471
545
  if (preamble_longs > 2) {
472
546
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
473
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
547
+ ptr += copy_from_mem(ptr, theta);
474
548
  }
475
549
  }
476
550
  }
@@ -483,7 +557,77 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
483
557
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
484
558
  }
485
559
 
560
+ // wrapped compact sketch
561
+
562
+ template<typename A>
563
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
564
+ uint64_t theta, const uint64_t* entries):
565
+ is_empty_(is_empty),
566
+ is_ordered_(is_ordered),
567
+ seed_hash_(seed_hash),
568
+ num_entries_(num_entries),
569
+ theta_(theta),
570
+ entries_(entries)
571
+ {}
572
+
573
+ template<typename A>
574
+ const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
575
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
576
+ return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
577
+ }
578
+
579
+ template<typename A>
580
+ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
581
+ return A();
582
+ }
583
+
584
+ template<typename A>
585
+ bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
586
+ return is_empty_;
587
+ }
588
+
589
+ template<typename A>
590
+ bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
591
+ return is_ordered_;
592
+ }
593
+
594
+ template<typename A>
595
+ uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
596
+ return theta_;
597
+ }
598
+
599
+ template<typename A>
600
+ uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
601
+ return static_cast<uint32_t>(num_entries_);
602
+ }
603
+
604
+ template<typename A>
605
+ uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
606
+ return seed_hash_;
607
+ }
608
+
609
+ template<typename A>
610
+ auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
611
+ return entries_;
612
+ }
613
+
614
+ template<typename A>
615
+ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
616
+ return entries_ + num_entries_;
617
+ }
618
+
619
+ template<typename A>
620
+ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
621
+
622
+ template<typename A>
623
+ void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
624
+ os << "### Retained entries" << std::endl;
625
+ for (const auto& hash: *this) {
626
+ os << hash << std::endl;
627
+ }
628
+ os << "### End retained entries" << std::endl;
629
+ }
630
+
486
631
  } /* namespace datasketches */
487
632
 
488
633
  #endif
489
-
@@ -35,13 +35,13 @@ public:
35
35
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
36
  using resize_factor = theta_constants::resize_factor;
37
37
 
38
- struct pass_through_policy {
39
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ struct nop_policy {
39
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(internal_entry);
40
41
  unused(incoming_entry);
41
- return internal_entry;
42
42
  }
43
43
  };
44
- using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
44
+ using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
45
45
 
46
46
  // No constructor here. Use builder instead.
47
47
  class builder;
@@ -60,11 +60,16 @@ public:
60
60
  */
61
61
  CompactSketch get_result(bool ordered = true) const;
62
62
 
63
+ /**
64
+ * Reset the union to the initial empty state
65
+ */
66
+ void reset();
67
+
63
68
  private:
64
69
  State state_;
65
70
 
66
71
  // for builder
67
- theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
72
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
68
73
  };
69
74
 
70
75
  template<typename A>
@@ -38,7 +38,7 @@ public:
38
38
  using resize_factor = typename hash_table::resize_factor;
39
39
  using comparator = compare_by_key<ExtractKey>;
40
40
 
41
- theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
41
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
42
42
 
43
43
  template<typename FwdSketch>
44
44
  void update(FwdSketch&& sketch);
@@ -47,6 +47,8 @@ public:
47
47
 
48
48
  const Policy& get_policy() const;
49
49
 
50
+ void reset();
51
+
50
52
  private:
51
53
  Policy policy_;
52
54
  hash_table table_;
@@ -28,9 +28,9 @@ namespace datasketches {
28
28
 
29
29
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
30
30
  theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
31
- uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
31
+ float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
32
  policy_(policy),
33
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
33
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
34
34
  union_theta_(table_.theta_)
35
35
  {}
36
36
 
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
43
43
  if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
44
44
  for (auto& entry: sketch) {
45
45
  const uint64_t hash = EK()(entry);
46
- if (hash < union_theta_) {
46
+ if (hash < union_theta_ && hash < table_.theta_) {
47
47
  auto result = table_.find(hash);
48
48
  if (!result.second) {
49
49
  table_.insert(result.first, conditional_forward<SS>(entry));
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
84
84
  return policy_;
85
85
  }
86
86
 
87
+ template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
88
+ void theta_union_base<EN, EK, P, S, CS, A>::reset() {
89
+ table_.reset();
90
+ union_theta_ = table_.theta_;
91
+ }
92
+
87
93
  } /* namespace datasketches */
88
94
 
89
95
  #endif