datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <sstream>
24
24
  #include <vector>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "serde.hpp"
27
28
  #include "binomial_bounds.hpp"
@@ -31,64 +32,72 @@
31
32
  namespace datasketches {
32
33
 
33
34
  template<typename A>
34
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
35
+ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
35
36
  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
36
37
  }
37
38
 
38
39
  template<typename A>
39
- double theta_sketch_alloc<A>::get_theta() const {
40
+ double base_theta_sketch_alloc<A>::get_theta() const {
40
41
  return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
41
42
  }
42
43
 
43
44
  template<typename A>
44
- double theta_sketch_alloc<A>::get_estimate() const {
45
+ double base_theta_sketch_alloc<A>::get_estimate() const {
45
46
  return get_num_retained() / get_theta();
46
47
  }
47
48
 
48
49
  template<typename A>
49
- double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
50
+ double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
50
51
  if (!is_estimation_mode()) return get_num_retained();
51
52
  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
52
53
  }
53
54
 
54
55
  template<typename A>
55
- double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
56
+ double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
56
57
  if (!is_estimation_mode()) return get_num_retained();
57
58
  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
58
59
  }
59
60
 
60
61
  template<typename A>
61
- string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
62
- ostrstream os;
62
+ string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
63
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
64
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
65
+ std::ostringstream os;
63
66
  os << "### Theta sketch summary:" << std::endl;
64
- os << " num retained entries : " << get_num_retained() << std::endl;
65
- os << " seed hash : " << get_seed_hash() << std::endl;
66
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
67
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
68
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
69
- os << " theta (fraction) : " << get_theta() << std::endl;
70
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
67
+ os << " num retained entries : " << this->get_num_retained() << std::endl;
68
+ os << " seed hash : " << this->get_seed_hash() << std::endl;
69
+ os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
70
+ os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
71
+ os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
72
+ os << " theta (fraction) : " << this->get_theta() << std::endl;
73
+ os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
71
74
  os << " estimate : " << this->get_estimate() << std::endl;
72
75
  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
73
76
  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
74
77
  print_specifics(os);
75
78
  os << "### End sketch summary" << std::endl;
76
- if (detail) {
79
+ if (print_details) {
80
+ print_items(os);
81
+ }
82
+ return string<A>(os.str().c_str(), this->get_allocator());
83
+ }
84
+
85
+ template<typename A>
86
+ void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
77
87
  os << "### Retained entries" << std::endl;
78
88
  for (const auto& hash: *this) {
79
89
  os << hash << std::endl;
80
90
  }
81
91
  os << "### End retained entries" << std::endl;
82
- }
83
- return os.str();
84
92
  }
85
93
 
94
+
86
95
  // update sketch
87
96
 
88
97
  template<typename A>
89
98
  update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
90
- uint64_t theta, uint64_t seed, const A& allocator):
91
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
99
+ float p, uint64_t theta, uint64_t seed, const A& allocator):
100
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
92
101
  {}
93
102
 
94
103
  template<typename A>
@@ -103,12 +112,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
103
112
 
104
113
  template<typename A>
105
114
  bool update_theta_sketch_alloc<A>::is_ordered() const {
106
- return false;
115
+ return table_.num_entries_ > 1 ? false : true;
107
116
  }
108
117
 
109
118
  template<typename A>
110
119
  uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
111
- return table_.theta_;
120
+ return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
112
121
  }
113
122
 
114
123
  template<typename A>
@@ -202,6 +211,11 @@ void update_theta_sketch_alloc<A>::trim() {
202
211
  table_.trim();
203
212
  }
204
213
 
214
+ template<typename A>
215
+ void update_theta_sketch_alloc<A>::reset() {
216
+ table_.reset();
217
+ }
218
+
205
219
  template<typename A>
206
220
  auto update_theta_sketch_alloc<A>::begin() -> iterator {
207
221
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
@@ -228,7 +242,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
228
242
  }
229
243
 
230
244
  template<typename A>
231
- void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
245
+ void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
232
246
  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
233
247
  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
234
248
  os << " resize factor : " << (1 << table_.rf_) << std::endl;
@@ -241,7 +255,7 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
241
255
 
242
256
  template<typename A>
243
257
  update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
244
- return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
258
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
245
259
  }
246
260
 
247
261
  // compact sketch
@@ -255,16 +269,18 @@ seed_hash_(other.get_seed_hash()),
255
269
  theta_(other.get_theta64()),
256
270
  entries_(other.get_allocator())
257
271
  {
258
- entries_.reserve(other.get_num_retained());
259
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
260
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
272
+ if (!other.is_empty()) {
273
+ entries_.reserve(other.get_num_retained());
274
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
275
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
276
+ }
261
277
  }
262
278
 
263
279
  template<typename A>
264
280
  compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
265
281
  std::vector<uint64_t, A>&& entries):
266
282
  is_empty_(is_empty),
267
- is_ordered_(is_ordered),
283
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
268
284
  seed_hash_(seed_hash),
269
285
  theta_(theta),
270
286
  entries_(std::move(entries))
@@ -321,7 +337,7 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
321
337
  }
322
338
 
323
339
  template<typename A>
324
- void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
340
+ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
325
341
 
326
342
  template<typename A>
327
343
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
@@ -400,78 +416,107 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
400
416
  const auto preamble_longs = read<uint8_t>(is);
401
417
  const auto serial_version = read<uint8_t>(is);
402
418
  const auto type = read<uint8_t>(is);
403
- read<uint16_t>(is); // unused
404
- const auto flags_byte = read<uint8_t>(is);
405
- const auto seed_hash = read<uint16_t>(is);
406
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
407
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
408
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
409
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
410
-
411
- uint64_t theta = theta_constants::MAX_THETA;
412
- uint32_t num_entries = 0;
413
- if (!is_empty) {
414
- if (preamble_longs == 1) {
415
- num_entries = 1;
416
- } else {
417
- num_entries = read<uint32_t>(is);
419
+ switch (serial_version) {
420
+ case SERIAL_VERSION: {
421
+ read<uint16_t>(is); // unused
422
+ const auto flags_byte = read<uint8_t>(is);
423
+ const auto seed_hash = read<uint16_t>(is);
424
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
425
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
426
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
427
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
428
+
429
+ uint64_t theta = theta_constants::MAX_THETA;
430
+ uint32_t num_entries = 0;
431
+ if (!is_empty) {
432
+ if (preamble_longs == 1) {
433
+ num_entries = 1;
434
+ } else {
435
+ num_entries = read<uint32_t>(is);
436
+ read<uint32_t>(is); // unused
437
+ if (preamble_longs > 2) {
438
+ theta = read<uint64_t>(is);
439
+ }
440
+ }
441
+ }
442
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
443
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
444
+
445
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
446
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
447
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
448
+ }
449
+ case 1: {
450
+ const auto seed_hash = compute_seed_hash(seed);
451
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
452
+ read<uint8_t>(is); // unused
418
453
  read<uint32_t>(is); // unused
419
- if (preamble_longs > 2) {
420
- theta = read<uint64_t>(is);
454
+ const auto num_entries = read<uint32_t>(is);
455
+ read<uint32_t>(is); //unused
456
+ const auto theta = read<uint64_t>(is);
457
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
458
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
459
+ if (!is_empty)
460
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
461
+ if (!is.good())
462
+ throw std::runtime_error("error reading from std::istream");
463
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
464
+ }
465
+ case 2: {
466
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
467
+ read<uint8_t>(is); // unused
468
+ read<uint16_t>(is); // unused
469
+ const uint16_t seed_hash = read<uint16_t>(is);
470
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
471
+ if (preamble_longs == 1) {
472
+ if (!is.good())
473
+ throw std::runtime_error("error reading from std::istream");
474
+ std::vector<uint64_t, A> entries(0, 0, allocator);
475
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
476
+ } else if (preamble_longs == 2) {
477
+ const uint32_t num_entries = read<uint32_t>(is);
478
+ read<uint32_t>(is); // unused
479
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
480
+ if (num_entries == 0) {
481
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
482
+ }
483
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
484
+ if (!is.good())
485
+ throw std::runtime_error("error reading from std::istream");
486
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
487
+ } else if (preamble_longs == 3) {
488
+ const uint32_t num_entries = read<uint32_t>(is);
489
+ read<uint32_t>(is); // unused
490
+ const auto theta = read<uint64_t>(is);
491
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
492
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
493
+ if (is_empty) {
494
+ if (!is.good())
495
+ throw std::runtime_error("error reading from std::istream");
496
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
497
+ } else {
498
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
499
+ if (!is.good())
500
+ throw std::runtime_error("error reading from std::istream");
501
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
502
+ }
503
+ } else {
504
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
421
505
  }
422
- }
423
506
  }
424
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
425
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
426
-
427
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
428
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
429
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
507
+ default:
508
+ // this should always fail since the valid cases are handled above
509
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
510
+ // this throw is never reached, because check_serial_version will throw an informative exception.
511
+ // This is only here to avoid a compiler warning about a path without a return value.
512
+ throw std::invalid_argument("unexpected sketch serialization version");
513
+ }
430
514
  }
431
515
 
432
516
  template<typename A>
433
517
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
434
- ensure_minimum_memory(size, 8);
435
- const char* ptr = static_cast<const char*>(bytes);
436
- const char* base = ptr;
437
- uint8_t preamble_longs;
438
- ptr += copy_from_mem(ptr, preamble_longs);
439
- uint8_t serial_version;
440
- ptr += copy_from_mem(ptr, serial_version);
441
- uint8_t type;
442
- ptr += copy_from_mem(ptr, type);
443
- ptr += sizeof(uint16_t); // unused
444
- uint8_t flags_byte;
445
- ptr += copy_from_mem(ptr, flags_byte);
446
- uint16_t seed_hash;
447
- ptr += copy_from_mem(ptr, seed_hash);
448
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
449
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
450
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
451
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
452
-
453
- uint64_t theta = theta_constants::MAX_THETA;
454
- uint32_t num_entries = 0;
455
- if (!is_empty) {
456
- if (preamble_longs == 1) {
457
- num_entries = 1;
458
- } else {
459
- ensure_minimum_memory(size, 8); // read the first prelong before this method
460
- ptr += copy_from_mem(ptr, num_entries);
461
- ptr += sizeof(uint32_t); // unused
462
- if (preamble_longs > 2) {
463
- ensure_minimum_memory(size, (preamble_longs - 1) << 3);
464
- ptr += copy_from_mem(ptr, theta);
465
- }
466
- }
467
- }
468
- const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
469
- check_memory_size(ptr - base + entries_size_bytes, size);
470
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
471
- if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
472
-
473
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
474
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
518
+ auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
519
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
475
520
  }
476
521
 
477
522
  // wrapped compact sketch
@@ -533,6 +578,18 @@ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
578
  return entries_ + num_entries_;
534
579
  }
535
580
 
581
+ template<typename A>
582
+ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
583
+
584
+ template<typename A>
585
+ void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
586
+ os << "### Retained entries" << std::endl;
587
+ for (const auto& hash: *this) {
588
+ os << hash << std::endl;
589
+ }
590
+ os << "### End retained entries" << std::endl;
591
+ }
592
+
536
593
  } /* namespace datasketches */
537
594
 
538
595
  #endif
@@ -60,11 +60,16 @@ public:
60
60
  */
61
61
  CompactSketch get_result(bool ordered = true) const;
62
62
 
63
+ /**
64
+ * Reset the union to the initial empty state
65
+ */
66
+ void reset();
67
+
63
68
  private:
64
69
  State state_;
65
70
 
66
71
  // for builder
67
- theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
72
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
68
73
  };
69
74
 
70
75
  template<typename A>
@@ -38,7 +38,7 @@ public:
38
38
  using resize_factor = typename hash_table::resize_factor;
39
39
  using comparator = compare_by_key<ExtractKey>;
40
40
 
41
- theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
41
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
42
42
 
43
43
  template<typename FwdSketch>
44
44
  void update(FwdSketch&& sketch);
@@ -47,6 +47,8 @@ public:
47
47
 
48
48
  const Policy& get_policy() const;
49
49
 
50
+ void reset();
51
+
50
52
  private:
51
53
  Policy policy_;
52
54
  hash_table table_;
@@ -21,6 +21,7 @@
21
21
  #define THETA_UNION_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_forward.hpp"
26
27
 
@@ -28,9 +29,9 @@ namespace datasketches {
28
29
 
29
30
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
30
31
  theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
31
- uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
+ float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
33
  policy_(policy),
33
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
34
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
34
35
  union_theta_(table_.theta_)
35
36
  {}
36
37
 
@@ -84,6 +85,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
84
85
  return policy_;
85
86
  }
86
87
 
88
+ template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
89
+ void theta_union_base<EN, EK, P, S, CS, A>::reset() {
90
+ table_.reset();
91
+ union_theta_ = table_.theta_;
92
+ }
93
+
87
94
  } /* namespace datasketches */
88
95
 
89
96
  #endif
@@ -23,8 +23,8 @@
23
23
  namespace datasketches {
24
24
 
25
25
  template<typename A>
26
- theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
38
  return state_.get_result(ordered);
39
39
  }
40
40
 
41
+ template<typename A>
42
+ void theta_union_alloc<A>::reset() {
43
+ state_.reset();
44
+ }
45
+
41
46
  template<typename A>
42
47
  theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
43
48
 
44
49
  template<typename A>
45
50
  auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
- return theta_union_alloc(
47
- this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
- this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
51
+ return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
49
52
  }
50
53
 
51
54
  } /* namespace datasketches */
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
41
41
  using comparator = compare_by_key<ExtractKey>;
42
42
 
43
- theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
44
- uint64_t seed, const Allocator& allocator, bool is_empty = true);
43
+ theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44
+ uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45
45
  theta_update_sketch_base(const theta_update_sketch_base& other);
46
46
  theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47
47
  ~theta_update_sketch_base();
@@ -75,6 +75,7 @@ struct theta_update_sketch_base {
75
75
  uint8_t lg_cur_size_;
76
76
  uint8_t lg_nom_size_;
77
77
  resize_factor rf_;
78
+ float p_;
78
79
  uint32_t num_entries_;
79
80
  uint64_t theta_;
80
81
  uint64_t seed_;
@@ -83,6 +84,7 @@ struct theta_update_sketch_base {
83
84
  void resize();
84
85
  void rebuild();
85
86
  void trim();
87
+ void reset();
86
88
 
87
89
  static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
88
90
  static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -94,7 +96,7 @@ struct theta_update_sketch_base {
94
96
  template<typename Derived, typename Allocator>
95
97
  class theta_base_builder {
96
98
  public:
97
- // TODO: Redundant and deprecated. Will be removed in next major verison release.
99
+ // TODO: Redundant and deprecated. Will be removed in next major version release.
98
100
  using resize_factor = theta_constants::resize_factor;
99
101
  static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
100
102
  static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
@@ -149,7 +151,6 @@ protected:
149
151
 
150
152
  uint64_t starting_theta() const;
151
153
  uint8_t starting_lg_size() const;
152
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
153
154
  };
154
155
 
155
156
  // key extractor
@@ -23,16 +23,20 @@
23
23
  #include <iostream>
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
+ #include <stdexcept>
27
+
28
+ #include "theta_helpers.hpp"
26
29
 
27
30
  namespace datasketches {
28
31
 
29
32
  template<typename EN, typename EK, typename A>
30
- theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
33
+ theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
31
34
  allocator_(allocator),
32
35
  is_empty_(is_empty),
33
36
  lg_cur_size_(lg_cur_size),
34
37
  lg_nom_size_(lg_nom_size),
35
38
  rf_(rf),
39
+ p_(p),
36
40
  num_entries_(0),
37
41
  theta_(theta),
38
42
  seed_(seed),
@@ -52,6 +56,7 @@ is_empty_(other.is_empty_),
52
56
  lg_cur_size_(other.lg_cur_size_),
53
57
  lg_nom_size_(other.lg_nom_size_),
54
58
  rf_(other.rf_),
59
+ p_(other.p_),
55
60
  num_entries_(other.num_entries_),
56
61
  theta_(other.theta_),
57
62
  seed_(other.seed_),
@@ -77,6 +82,7 @@ is_empty_(other.is_empty_),
77
82
  lg_cur_size_(other.lg_cur_size_),
78
83
  lg_nom_size_(other.lg_nom_size_),
79
84
  rf_(other.rf_),
85
+ p_(other.p_),
80
86
  num_entries_(other.num_entries_),
81
87
  theta_(other.theta_),
82
88
  seed_(other.seed_),
@@ -105,6 +111,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
105
111
  std::swap(lg_cur_size_, copy.lg_cur_size_);
106
112
  std::swap(lg_nom_size_, copy.lg_nom_size_);
107
113
  std::swap(rf_, copy.rf_);
114
+ std::swap(p_, copy.p_);
108
115
  std::swap(num_entries_, copy.num_entries_);
109
116
  std::swap(theta_, copy.theta_);
110
117
  std::swap(seed_, copy.seed_);
@@ -119,6 +126,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
119
126
  std::swap(lg_cur_size_, other.lg_cur_size_);
120
127
  std::swap(lg_nom_size_, other.lg_nom_size_);
121
128
  std::swap(rf_, other.rf_);
129
+ std::swap(p_, other.p_);
122
130
  std::swap(num_entries_, other.num_entries_);
123
131
  std::swap(theta_, other.theta_);
124
132
  std::swap(seed_, other.seed_);
@@ -247,6 +255,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
247
255
  if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
248
256
  }
249
257
 
258
+ template<typename EN, typename EK, typename A>
259
+ void theta_update_sketch_base<EN, EK, A>::reset() {
260
+ const size_t cur_size = 1ULL << lg_cur_size_;
261
+ for (size_t i = 0; i < cur_size; ++i) {
262
+ if (EK()(entries_[i]) != 0) {
263
+ entries_[i].~EN();
264
+ EK()(entries_[i]) = 0;
265
+ }
266
+ }
267
+ const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
268
+ lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
269
+ if (starting_lg_size != lg_cur_size_) {
270
+ allocator_.deallocate(entries_, cur_size);
271
+ lg_cur_size_ = starting_lg_size;
272
+ const size_t new_size = 1ULL << starting_lg_size;
273
+ entries_ = allocator_.allocate(new_size);
274
+ for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
275
+ }
276
+ num_entries_ = 0;
277
+ theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
278
+ is_empty_ = true;
279
+ }
280
+
250
281
  template<typename EN, typename EK, typename A>
251
282
  void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
252
283
  // find the first empty slot
@@ -310,18 +341,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
310
341
 
311
342
  template<typename Derived, typename Allocator>
312
343
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
313
- if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
314
- return theta_constants::MAX_THETA;
344
+ return theta_build_helper<true>::starting_theta_from_p(p_);
315
345
  }
316
346
 
317
347
  template<typename Derived, typename Allocator>
318
348
  uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
319
- return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
320
- }
321
-
322
- template<typename Derived, typename Allocator>
323
- uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
324
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
349
+ return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
325
350
  }
326
351
 
327
352
  // iterator
@@ -43,4 +43,5 @@ target_sources(theta_test
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
45
  theta_jaccard_similarity_test.cpp
46
+ theta_setop_test.cpp
46
47
  )
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_a_not_b.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta a-not-b: empty", "[theta_a_not_b]") {
@@ -21,6 +21,8 @@
21
21
 
22
22
  #include <theta_intersection.hpp>
23
23
 
24
+ #include <stdexcept>
25
+
24
26
  namespace datasketches {
25
27
 
26
28
  TEST_CASE("theta intersection: invalid", "[theta_intersection]") {