datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -26,68 +26,77 @@
26
26
  #include "serde.hpp"
27
27
  #include "binomial_bounds.hpp"
28
28
  #include "theta_helpers.hpp"
29
+ #include "compact_theta_sketch_parser.hpp"
29
30
 
30
31
  namespace datasketches {
31
32
 
32
33
  template<typename A>
33
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
34
+ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
34
35
  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
35
36
  }
36
37
 
37
38
  template<typename A>
38
- double theta_sketch_alloc<A>::get_theta() const {
39
+ double base_theta_sketch_alloc<A>::get_theta() const {
39
40
  return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
40
41
  }
41
42
 
42
43
  template<typename A>
43
- double theta_sketch_alloc<A>::get_estimate() const {
44
+ double base_theta_sketch_alloc<A>::get_estimate() const {
44
45
  return get_num_retained() / get_theta();
45
46
  }
46
47
 
47
48
  template<typename A>
48
- double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
+ double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
50
  if (!is_estimation_mode()) return get_num_retained();
50
51
  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
51
52
  }
52
53
 
53
54
  template<typename A>
54
- double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
+ double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
56
  if (!is_estimation_mode()) return get_num_retained();
56
57
  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
57
58
  }
58
59
 
59
60
  template<typename A>
60
- string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
61
- ostrstream os;
61
+ string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
62
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
63
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
64
+ std::ostringstream os;
62
65
  os << "### Theta sketch summary:" << std::endl;
63
- os << " num retained entries : " << get_num_retained() << std::endl;
64
- os << " seed hash : " << get_seed_hash() << std::endl;
65
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
66
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
67
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
68
- os << " theta (fraction) : " << get_theta() << std::endl;
69
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
66
+ os << " num retained entries : " << this->get_num_retained() << std::endl;
67
+ os << " seed hash : " << this->get_seed_hash() << std::endl;
68
+ os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
69
+ os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
70
+ os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
71
+ os << " theta (fraction) : " << this->get_theta() << std::endl;
72
+ os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
70
73
  os << " estimate : " << this->get_estimate() << std::endl;
71
74
  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
72
75
  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
73
76
  print_specifics(os);
74
77
  os << "### End sketch summary" << std::endl;
75
- if (detail) {
78
+ if (print_details) {
79
+ print_items(os);
80
+ }
81
+ return string<A>(os.str().c_str(), this->get_allocator());
82
+ }
83
+
84
+ template<typename A>
85
+ void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
76
86
  os << "### Retained entries" << std::endl;
77
87
  for (const auto& hash: *this) {
78
88
  os << hash << std::endl;
79
89
  }
80
90
  os << "### End retained entries" << std::endl;
81
- }
82
- return os.str();
83
91
  }
84
92
 
93
+
85
94
  // update sketch
86
95
 
87
96
  template<typename A>
88
97
  update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
89
- uint64_t theta, uint64_t seed, const A& allocator):
90
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
98
+ float p, uint64_t theta, uint64_t seed, const A& allocator):
99
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
91
100
  {}
92
101
 
93
102
  template<typename A>
@@ -102,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
102
111
 
103
112
  template<typename A>
104
113
  bool update_theta_sketch_alloc<A>::is_ordered() const {
105
- return false;
114
+ return table_.num_entries_ > 1 ? false : true;
106
115
  }
107
116
 
108
117
  template<typename A>
109
118
  uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
110
- return table_.theta_;
119
+ return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
111
120
  }
112
121
 
113
122
  template<typename A>
@@ -201,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
201
210
  table_.trim();
202
211
  }
203
212
 
213
+ template<typename A>
214
+ void update_theta_sketch_alloc<A>::reset() {
215
+ table_.reset();
216
+ }
217
+
204
218
  template<typename A>
205
219
  auto update_theta_sketch_alloc<A>::begin() -> iterator {
206
220
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
@@ -227,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
227
241
  }
228
242
 
229
243
  template<typename A>
230
- void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
244
+ void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
231
245
  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
232
246
  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
233
247
  os << " resize factor : " << (1 << table_.rf_) << std::endl;
@@ -240,29 +254,32 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
240
254
 
241
255
  template<typename A>
242
256
  update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
243
- return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
257
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
244
258
  }
245
259
 
246
260
  // compact sketch
247
261
 
248
262
  template<typename A>
249
- compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
263
+ template<typename Other>
264
+ compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
250
265
  is_empty_(other.is_empty()),
251
266
  is_ordered_(other.is_ordered() || ordered),
252
267
  seed_hash_(other.get_seed_hash()),
253
268
  theta_(other.get_theta64()),
254
269
  entries_(other.get_allocator())
255
270
  {
256
- entries_.reserve(other.get_num_retained());
257
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
258
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
271
+ if (!other.is_empty()) {
272
+ entries_.reserve(other.get_num_retained());
273
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
274
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
275
+ }
259
276
  }
260
277
 
261
278
  template<typename A>
262
279
  compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
263
280
  std::vector<uint64_t, A>&& entries):
264
281
  is_empty_(is_empty),
265
- is_ordered_(is_ordered),
282
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
266
283
  seed_hash_(seed_hash),
267
284
  theta_(theta),
268
285
  entries_(std::move(entries))
@@ -290,7 +307,7 @@ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
290
307
 
291
308
  template<typename A>
292
309
  uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
293
- return entries_.size();
310
+ return static_cast<uint32_t>(entries_.size());
294
311
  }
295
312
 
296
313
  template<typename A>
@@ -300,58 +317,58 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
300
317
 
301
318
  template<typename A>
302
319
  auto compact_theta_sketch_alloc<A>::begin() -> iterator {
303
- return iterator(entries_.data(), entries_.size(), 0);
320
+ return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
304
321
  }
305
322
 
306
323
  template<typename A>
307
324
  auto compact_theta_sketch_alloc<A>::end() -> iterator {
308
- return iterator(nullptr, 0, entries_.size());
325
+ return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
309
326
  }
310
327
 
311
328
  template<typename A>
312
329
  auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
313
- return const_iterator(entries_.data(), entries_.size(), 0);
330
+ return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
314
331
  }
315
332
 
316
333
  template<typename A>
317
334
  auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
318
- return const_iterator(nullptr, 0, entries_.size());
335
+ return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
319
336
  }
320
337
 
321
338
  template<typename A>
322
- void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
339
+ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
323
340
 
324
341
  template<typename A>
325
342
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
326
343
  const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
327
344
  const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
328
- os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
345
+ write(os, preamble_longs);
329
346
  const uint8_t serial_version = SERIAL_VERSION;
330
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
347
+ write(os, serial_version);
331
348
  const uint8_t type = SKETCH_TYPE;
332
- os.write(reinterpret_cast<const char*>(&type), sizeof(type));
349
+ write(os, type);
333
350
  const uint16_t unused16 = 0;
334
- os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
351
+ write(os, unused16);
335
352
  const uint8_t flags_byte(
336
353
  (1 << flags::IS_COMPACT) |
337
354
  (1 << flags::IS_READ_ONLY) |
338
355
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
339
356
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
340
357
  );
341
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
358
+ write(os, flags_byte);
342
359
  const uint16_t seed_hash = get_seed_hash();
343
- os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
360
+ write(os, seed_hash);
344
361
  if (!this->is_empty()) {
345
362
  if (!is_single_item) {
346
- const uint32_t num_entries = entries_.size();
347
- os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
363
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
364
+ write(os, num_entries);
348
365
  const uint32_t unused32 = 0;
349
- os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
366
+ write(os, unused32);
350
367
  if (this->is_estimation_mode()) {
351
- os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
368
+ write(os, this->theta_);
352
369
  }
353
370
  }
354
- os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
371
+ write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
355
372
  }
356
373
  }
357
374
 
@@ -364,30 +381,28 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
364
381
  vector_bytes bytes(size, 0, entries_.get_allocator());
365
382
  uint8_t* ptr = bytes.data() + header_size_bytes;
366
383
 
367
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
384
+ ptr += copy_to_mem(preamble_longs, ptr);
368
385
  const uint8_t serial_version = SERIAL_VERSION;
369
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
386
+ ptr += copy_to_mem(serial_version, ptr);
370
387
  const uint8_t type = SKETCH_TYPE;
371
- ptr += copy_to_mem(&type, ptr, sizeof(type));
372
- const uint16_t unused16 = 0;
373
- ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
388
+ ptr += copy_to_mem(type, ptr);
389
+ ptr += sizeof(uint16_t); // unused
374
390
  const uint8_t flags_byte(
375
391
  (1 << flags::IS_COMPACT) |
376
392
  (1 << flags::IS_READ_ONLY) |
377
393
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
378
394
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
379
395
  );
380
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
396
+ ptr += copy_to_mem(flags_byte, ptr);
381
397
  const uint16_t seed_hash = get_seed_hash();
382
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
398
+ ptr += copy_to_mem(seed_hash, ptr);
383
399
  if (!this->is_empty()) {
384
400
  if (!is_single_item) {
385
- const uint32_t num_entries = entries_.size();
386
- ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
387
- const uint32_t unused32 = 0;
388
- ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
401
+ const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
402
+ ptr += copy_to_mem(num_entries, ptr);
403
+ ptr += sizeof(uint32_t);
389
404
  if (this->is_estimation_mode()) {
390
- ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
405
+ ptr += copy_to_mem(theta_, ptr);
391
406
  }
392
407
  }
393
408
  ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
@@ -397,43 +412,104 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
397
412
 
398
413
  template<typename A>
399
414
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
400
- uint8_t preamble_longs;
401
- is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
402
- uint8_t serial_version;
403
- is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
404
- uint8_t type;
405
- is.read(reinterpret_cast<char*>(&type), sizeof(type));
406
- uint16_t unused16;
407
- is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
408
- uint8_t flags_byte;
409
- is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
410
- uint16_t seed_hash;
411
- is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
412
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
413
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
414
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
415
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
415
+ const auto preamble_longs = read<uint8_t>(is);
416
+ const auto serial_version = read<uint8_t>(is);
417
+ const auto type = read<uint8_t>(is);
418
+ switch (serial_version) {
419
+ case SERIAL_VERSION: {
420
+ read<uint16_t>(is); // unused
421
+ const auto flags_byte = read<uint8_t>(is);
422
+ const auto seed_hash = read<uint16_t>(is);
423
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
424
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
425
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
426
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
427
+
428
+ uint64_t theta = theta_constants::MAX_THETA;
429
+ uint32_t num_entries = 0;
430
+ if (!is_empty) {
431
+ if (preamble_longs == 1) {
432
+ num_entries = 1;
433
+ } else {
434
+ num_entries = read<uint32_t>(is);
435
+ read<uint32_t>(is); // unused
436
+ if (preamble_longs > 2) {
437
+ theta = read<uint64_t>(is);
438
+ }
439
+ }
440
+ }
441
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
442
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
416
443
 
417
- uint64_t theta = theta_constants::MAX_THETA;
418
- uint32_t num_entries = 0;
419
- if (!is_empty) {
420
- if (preamble_longs == 1) {
421
- num_entries = 1;
422
- } else {
423
- is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
424
- uint32_t unused32;
425
- is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
426
- if (preamble_longs > 2) {
427
- is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
444
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
445
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
446
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
447
+ }
448
+ case 1: {
449
+ const auto seed_hash = compute_seed_hash(seed);
450
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
451
+ read<uint8_t>(is); // unused
452
+ read<uint32_t>(is); // unused
453
+ const auto num_entries = read<uint32_t>(is);
454
+ read<uint32_t>(is); //unused
455
+ const auto theta = read<uint64_t>(is);
456
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
+ if (!is_empty)
459
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
460
+ if (!is.good())
461
+ throw std::runtime_error("error reading from std::istream");
462
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
463
+ }
464
+ case 2: {
465
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
466
+ read<uint8_t>(is); // unused
467
+ read<uint16_t>(is); // unused
468
+ const uint16_t seed_hash = read<uint16_t>(is);
469
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
470
+ if (preamble_longs == 1) {
471
+ if (!is.good())
472
+ throw std::runtime_error("error reading from std::istream");
473
+ std::vector<uint64_t> entries(0, 0, allocator);
474
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
+ } else if (preamble_longs == 2) {
476
+ const uint32_t num_entries = read<uint32_t>(is);
477
+ read<uint32_t>(is); // unused
478
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ if (num_entries == 0) {
480
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
+ }
482
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
483
+ if (!is.good())
484
+ throw std::runtime_error("error reading from std::istream");
485
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
486
+ } else if (preamble_longs == 3) {
487
+ const uint32_t num_entries = read<uint32_t>(is);
488
+ read<uint32_t>(is); // unused
489
+ const auto theta = read<uint64_t>(is);
490
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ if (is_empty) {
493
+ if (!is.good())
494
+ throw std::runtime_error("error reading from std::istream");
495
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
496
+ } else {
497
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
498
+ if (!is.good())
499
+ throw std::runtime_error("error reading from std::istream");
500
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
501
+ }
502
+ } else {
503
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
428
504
  }
429
- }
430
505
  }
431
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
432
- if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
433
-
434
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
435
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
436
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
506
+ default:
507
+ // this should always fail since the valid cases are handled above
508
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
509
+ // this throw is never reached, because check_serial_version will throw an informative exception.
510
+ // This is only here to avoid a compiler warning about a path without a return value.
511
+ throw std::invalid_argument("unexpected sketch serialization version");
512
+ }
437
513
  }
438
514
 
439
515
  template<typename A>
@@ -442,17 +518,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
442
518
  const char* ptr = static_cast<const char*>(bytes);
443
519
  const char* base = ptr;
444
520
  uint8_t preamble_longs;
445
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
521
+ ptr += copy_from_mem(ptr, preamble_longs);
446
522
  uint8_t serial_version;
447
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
523
+ ptr += copy_from_mem(ptr, serial_version);
448
524
  uint8_t type;
449
- ptr += copy_from_mem(ptr, &type, sizeof(type));
450
- uint16_t unused16;
451
- ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
525
+ ptr += copy_from_mem(ptr, type);
526
+ ptr += sizeof(uint16_t); // unused
452
527
  uint8_t flags_byte;
453
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
528
+ ptr += copy_from_mem(ptr, flags_byte);
454
529
  uint16_t seed_hash;
455
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
530
+ ptr += copy_from_mem(ptr, seed_hash);
456
531
  checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
532
  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
458
533
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
@@ -465,12 +540,11 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
465
540
  num_entries = 1;
466
541
  } else {
467
542
  ensure_minimum_memory(size, 8); // read the first prelong before this method
468
- ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
469
- uint32_t unused32;
470
- ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
543
+ ptr += copy_from_mem(ptr, num_entries);
544
+ ptr += sizeof(uint32_t); // unused
471
545
  if (preamble_longs > 2) {
472
546
  ensure_minimum_memory(size, (preamble_longs - 1) << 3);
473
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
547
+ ptr += copy_from_mem(ptr, theta);
474
548
  }
475
549
  }
476
550
  }
@@ -483,7 +557,77 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
483
557
  return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
484
558
  }
485
559
 
560
+ // wrapped compact sketch
561
+
562
+ template<typename A>
563
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
564
+ uint64_t theta, const uint64_t* entries):
565
+ is_empty_(is_empty),
566
+ is_ordered_(is_ordered),
567
+ seed_hash_(seed_hash),
568
+ num_entries_(num_entries),
569
+ theta_(theta),
570
+ entries_(entries)
571
+ {}
572
+
573
+ template<typename A>
574
+ const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
575
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
576
+ return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
577
+ }
578
+
579
+ template<typename A>
580
+ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
581
+ return A();
582
+ }
583
+
584
+ template<typename A>
585
+ bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
586
+ return is_empty_;
587
+ }
588
+
589
+ template<typename A>
590
+ bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
591
+ return is_ordered_;
592
+ }
593
+
594
+ template<typename A>
595
+ uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
596
+ return theta_;
597
+ }
598
+
599
+ template<typename A>
600
+ uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
601
+ return static_cast<uint32_t>(num_entries_);
602
+ }
603
+
604
+ template<typename A>
605
+ uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
606
+ return seed_hash_;
607
+ }
608
+
609
+ template<typename A>
610
+ auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
611
+ return entries_;
612
+ }
613
+
614
+ template<typename A>
615
+ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
616
+ return entries_ + num_entries_;
617
+ }
618
+
619
+ template<typename A>
620
+ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
621
+
622
+ template<typename A>
623
+ void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
624
+ os << "### Retained entries" << std::endl;
625
+ for (const auto& hash: *this) {
626
+ os << hash << std::endl;
627
+ }
628
+ os << "### End retained entries" << std::endl;
629
+ }
630
+
486
631
  } /* namespace datasketches */
487
632
 
488
633
  #endif
489
-
@@ -35,13 +35,13 @@ public:
35
35
  using CompactSketch = compact_theta_sketch_alloc<Allocator>;
36
36
  using resize_factor = theta_constants::resize_factor;
37
37
 
38
- struct pass_through_policy {
39
- uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
38
+ struct nop_policy {
39
+ void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
40
+ unused(internal_entry);
40
41
  unused(incoming_entry);
41
- return internal_entry;
42
42
  }
43
43
  };
44
- using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
44
+ using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
45
45
 
46
46
  // No constructor here. Use builder instead.
47
47
  class builder;
@@ -60,11 +60,16 @@ public:
60
60
  */
61
61
  CompactSketch get_result(bool ordered = true) const;
62
62
 
63
+ /**
64
+ * Reset the union to the initial empty state
65
+ */
66
+ void reset();
67
+
63
68
  private:
64
69
  State state_;
65
70
 
66
71
  // for builder
67
- theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
72
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
68
73
  };
69
74
 
70
75
  template<typename A>
@@ -38,7 +38,7 @@ public:
38
38
  using resize_factor = typename hash_table::resize_factor;
39
39
  using comparator = compare_by_key<ExtractKey>;
40
40
 
41
- theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
41
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
42
42
 
43
43
  template<typename FwdSketch>
44
44
  void update(FwdSketch&& sketch);
@@ -47,6 +47,8 @@ public:
47
47
 
48
48
  const Policy& get_policy() const;
49
49
 
50
+ void reset();
51
+
50
52
  private:
51
53
  Policy policy_;
52
54
  hash_table table_;
@@ -28,9 +28,9 @@ namespace datasketches {
28
28
 
29
29
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
30
30
  theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
31
- uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
31
+ float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
32
  policy_(policy),
33
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
33
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
34
34
  union_theta_(table_.theta_)
35
35
  {}
36
36
 
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
43
43
  if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
44
44
  for (auto& entry: sketch) {
45
45
  const uint64_t hash = EK()(entry);
46
- if (hash < union_theta_) {
46
+ if (hash < union_theta_ && hash < table_.theta_) {
47
47
  auto result = table_.find(hash);
48
48
  if (!result.second) {
49
49
  table_.insert(result.first, conditional_forward<SS>(entry));
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
84
84
  return policy_;
85
85
  }
86
86
 
87
+ template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
88
+ void theta_union_base<EN, EK, P, S, CS, A>::reset() {
89
+ table_.reset();
90
+ union_theta_ = table_.theta_;
91
+ }
92
+
87
93
  } /* namespace datasketches */
88
94
 
89
95
  #endif