datasketches 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +7 -7
  4. data/ext/datasketches/theta_wrapper.cpp +20 -4
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
  7. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  8. data/vendor/datasketches-cpp/README.md +76 -9
  9. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  10. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  11. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  12. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  13. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
  14. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  15. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  16. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
  17. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  18. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  20. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
  21. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  22. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  24. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
  25. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
  26. data/vendor/datasketches-cpp/setup.py +1 -1
  27. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  28. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
  29. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  30. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  31. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  32. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  33. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
  34. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  35. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  36. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
  37. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  38. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  39. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
  40. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  41. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  42. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  43. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  44. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  46. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  47. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
  48. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
  49. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  50. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  51. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  52. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  53. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  54. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  55. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
  56. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  57. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  58. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  59. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  60. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
  61. metadata +11 -6
  62. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  63. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,64 +31,72 @@
31
31
  namespace datasketches {
32
32
 
33
33
  template<typename A>
34
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
34
+ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
35
35
  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
36
36
  }
37
37
 
38
38
  template<typename A>
39
- double theta_sketch_alloc<A>::get_theta() const {
39
+ double base_theta_sketch_alloc<A>::get_theta() const {
40
40
  return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
41
41
  }
42
42
 
43
43
  template<typename A>
44
- double theta_sketch_alloc<A>::get_estimate() const {
44
+ double base_theta_sketch_alloc<A>::get_estimate() const {
45
45
  return get_num_retained() / get_theta();
46
46
  }
47
47
 
48
48
  template<typename A>
49
- double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
+ double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
50
50
  if (!is_estimation_mode()) return get_num_retained();
51
51
  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
52
52
  }
53
53
 
54
54
  template<typename A>
55
- double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
+ double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
56
56
  if (!is_estimation_mode()) return get_num_retained();
57
57
  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
58
58
  }
59
59
 
60
60
  template<typename A>
61
- string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
62
- ostrstream os;
61
+ string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
62
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
63
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
64
+ std::ostringstream os;
63
65
  os << "### Theta sketch summary:" << std::endl;
64
- os << " num retained entries : " << get_num_retained() << std::endl;
65
- os << " seed hash : " << get_seed_hash() << std::endl;
66
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
67
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
68
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
69
- os << " theta (fraction) : " << get_theta() << std::endl;
70
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
66
+ os << " num retained entries : " << this->get_num_retained() << std::endl;
67
+ os << " seed hash : " << this->get_seed_hash() << std::endl;
68
+ os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
69
+ os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
70
+ os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
71
+ os << " theta (fraction) : " << this->get_theta() << std::endl;
72
+ os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
71
73
  os << " estimate : " << this->get_estimate() << std::endl;
72
74
  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
73
75
  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
74
76
  print_specifics(os);
75
77
  os << "### End sketch summary" << std::endl;
76
- if (detail) {
78
+ if (print_details) {
79
+ print_items(os);
80
+ }
81
+ return string<A>(os.str().c_str(), this->get_allocator());
82
+ }
83
+
84
+ template<typename A>
85
+ void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
77
86
  os << "### Retained entries" << std::endl;
78
87
  for (const auto& hash: *this) {
79
88
  os << hash << std::endl;
80
89
  }
81
90
  os << "### End retained entries" << std::endl;
82
- }
83
- return os.str();
84
91
  }
85
92
 
93
+
86
94
  // update sketch
87
95
 
88
96
  template<typename A>
89
97
  update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
90
- uint64_t theta, uint64_t seed, const A& allocator):
91
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
98
+ float p, uint64_t theta, uint64_t seed, const A& allocator):
99
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
92
100
  {}
93
101
 
94
102
  template<typename A>
@@ -103,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
103
111
 
104
112
  template<typename A>
105
113
  bool update_theta_sketch_alloc<A>::is_ordered() const {
106
- return false;
114
+ return table_.num_entries_ > 1 ? false : true;
107
115
  }
108
116
 
109
117
  template<typename A>
110
118
  uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
111
- return table_.theta_;
119
+ return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
112
120
  }
113
121
 
114
122
  template<typename A>
@@ -202,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
202
210
  table_.trim();
203
211
  }
204
212
 
213
+ template<typename A>
214
+ void update_theta_sketch_alloc<A>::reset() {
215
+ table_.reset();
216
+ }
217
+
205
218
  template<typename A>
206
219
  auto update_theta_sketch_alloc<A>::begin() -> iterator {
207
220
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
@@ -228,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
228
241
  }
229
242
 
230
243
  template<typename A>
231
- void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
244
+ void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
232
245
  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
233
246
  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
234
247
  os << " resize factor : " << (1 << table_.rf_) << std::endl;
@@ -241,7 +254,7 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
241
254
 
242
255
  template<typename A>
243
256
  update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
244
- return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
257
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
245
258
  }
246
259
 
247
260
  // compact sketch
@@ -255,16 +268,18 @@ seed_hash_(other.get_seed_hash()),
255
268
  theta_(other.get_theta64()),
256
269
  entries_(other.get_allocator())
257
270
  {
258
- entries_.reserve(other.get_num_retained());
259
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
260
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
271
+ if (!other.is_empty()) {
272
+ entries_.reserve(other.get_num_retained());
273
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
274
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
275
+ }
261
276
  }
262
277
 
263
278
  template<typename A>
264
279
  compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
265
280
  std::vector<uint64_t, A>&& entries):
266
281
  is_empty_(is_empty),
267
- is_ordered_(is_ordered),
282
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
268
283
  seed_hash_(seed_hash),
269
284
  theta_(theta),
270
285
  entries_(std::move(entries))
@@ -321,7 +336,7 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
321
336
  }
322
337
 
323
338
  template<typename A>
324
- void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
339
+ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
325
340
 
326
341
  template<typename A>
327
342
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
@@ -400,33 +415,101 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
400
415
  const auto preamble_longs = read<uint8_t>(is);
401
416
  const auto serial_version = read<uint8_t>(is);
402
417
  const auto type = read<uint8_t>(is);
403
- read<uint16_t>(is); // unused
404
- const auto flags_byte = read<uint8_t>(is);
405
- const auto seed_hash = read<uint16_t>(is);
406
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
407
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
408
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
409
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
418
+ switch (serial_version) {
419
+ case SERIAL_VERSION: {
420
+ read<uint16_t>(is); // unused
421
+ const auto flags_byte = read<uint8_t>(is);
422
+ const auto seed_hash = read<uint16_t>(is);
423
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
424
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
425
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
426
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
427
+
428
+ uint64_t theta = theta_constants::MAX_THETA;
429
+ uint32_t num_entries = 0;
430
+ if (!is_empty) {
431
+ if (preamble_longs == 1) {
432
+ num_entries = 1;
433
+ } else {
434
+ num_entries = read<uint32_t>(is);
435
+ read<uint32_t>(is); // unused
436
+ if (preamble_longs > 2) {
437
+ theta = read<uint64_t>(is);
438
+ }
439
+ }
440
+ }
441
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
442
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
410
443
 
411
- uint64_t theta = theta_constants::MAX_THETA;
412
- uint32_t num_entries = 0;
413
- if (!is_empty) {
414
- if (preamble_longs == 1) {
415
- num_entries = 1;
416
- } else {
417
- num_entries = read<uint32_t>(is);
444
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
445
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
446
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
447
+ }
448
+ case 1: {
449
+ const auto seed_hash = compute_seed_hash(seed);
450
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
451
+ read<uint8_t>(is); // unused
418
452
  read<uint32_t>(is); // unused
419
- if (preamble_longs > 2) {
420
- theta = read<uint64_t>(is);
453
+ const auto num_entries = read<uint32_t>(is);
454
+ read<uint32_t>(is); //unused
455
+ const auto theta = read<uint64_t>(is);
456
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
+ if (!is_empty)
459
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
460
+ if (!is.good())
461
+ throw std::runtime_error("error reading from std::istream");
462
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
463
+ }
464
+ case 2: {
465
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
466
+ read<uint8_t>(is); // unused
467
+ read<uint16_t>(is); // unused
468
+ const uint16_t seed_hash = read<uint16_t>(is);
469
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
470
+ if (preamble_longs == 1) {
471
+ if (!is.good())
472
+ throw std::runtime_error("error reading from std::istream");
473
+ std::vector<uint64_t> entries(0, 0, allocator);
474
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
+ } else if (preamble_longs == 2) {
476
+ const uint32_t num_entries = read<uint32_t>(is);
477
+ read<uint32_t>(is); // unused
478
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ if (num_entries == 0) {
480
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
+ }
482
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
483
+ if (!is.good())
484
+ throw std::runtime_error("error reading from std::istream");
485
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
486
+ } else if (preamble_longs == 3) {
487
+ const uint32_t num_entries = read<uint32_t>(is);
488
+ read<uint32_t>(is); // unused
489
+ const auto theta = read<uint64_t>(is);
490
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ if (is_empty) {
493
+ if (!is.good())
494
+ throw std::runtime_error("error reading from std::istream");
495
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
496
+ } else {
497
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
498
+ if (!is.good())
499
+ throw std::runtime_error("error reading from std::istream");
500
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
501
+ }
502
+ } else {
503
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
421
504
  }
422
- }
423
505
  }
424
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
425
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
426
-
427
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
428
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
429
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
506
+ default:
507
+ // this should always fail since the valid cases are handled above
508
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
509
+ // this throw is never reached, because check_serial_version will throw an informative exception.
510
+ // This is only here to avoid a compiler warning about a path without a return value.
511
+ throw std::invalid_argument("unexpected sketch serialization version");
512
+ }
430
513
  }
431
514
 
432
515
  template<typename A>
@@ -533,6 +616,18 @@ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
616
  return entries_ + num_entries_;
534
617
  }
535
618
 
619
+ template<typename A>
620
+ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
621
+
622
+ template<typename A>
623
+ void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
624
+ os << "### Retained entries" << std::endl;
625
+ for (const auto& hash: *this) {
626
+ os << hash << std::endl;
627
+ }
628
+ os << "### End retained entries" << std::endl;
629
+ }
630
+
536
631
  } /* namespace datasketches */
537
632
 
538
633
  #endif
@@ -60,11 +60,16 @@ public:
60
60
  */
61
61
  CompactSketch get_result(bool ordered = true) const;
62
62
 
63
+ /**
64
+ * Reset the union to the initial empty state
65
+ */
66
+ void reset();
67
+
63
68
  private:
64
69
  State state_;
65
70
 
66
71
  // for builder
67
- theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
72
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
68
73
  };
69
74
 
70
75
  template<typename A>
@@ -38,7 +38,7 @@ public:
38
38
  using resize_factor = typename hash_table::resize_factor;
39
39
  using comparator = compare_by_key<ExtractKey>;
40
40
 
41
- theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
41
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
42
42
 
43
43
  template<typename FwdSketch>
44
44
  void update(FwdSketch&& sketch);
@@ -47,6 +47,8 @@ public:
47
47
 
48
48
  const Policy& get_policy() const;
49
49
 
50
+ void reset();
51
+
50
52
  private:
51
53
  Policy policy_;
52
54
  hash_table table_;
@@ -28,9 +28,9 @@ namespace datasketches {
28
28
 
29
29
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
30
30
  theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
31
- uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
31
+ float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
32
  policy_(policy),
33
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
33
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
34
34
  union_theta_(table_.theta_)
35
35
  {}
36
36
 
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
84
84
  return policy_;
85
85
  }
86
86
 
87
+ template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
88
+ void theta_union_base<EN, EK, P, S, CS, A>::reset() {
89
+ table_.reset();
90
+ union_theta_ = table_.theta_;
91
+ }
92
+
87
93
  } /* namespace datasketches */
88
94
 
89
95
  #endif
@@ -23,8 +23,8 @@
23
23
  namespace datasketches {
24
24
 
25
25
  template<typename A>
26
- theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
38
  return state_.get_result(ordered);
39
39
  }
40
40
 
41
+ template<typename A>
42
+ void theta_union_alloc<A>::reset() {
43
+ state_.reset();
44
+ }
45
+
41
46
  template<typename A>
42
47
  theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
43
48
 
44
49
  template<typename A>
45
50
  auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
- return theta_union_alloc(
47
- this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
- this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
51
+ return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
49
52
  }
50
53
 
51
54
  } /* namespace datasketches */
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
41
41
  using comparator = compare_by_key<ExtractKey>;
42
42
 
43
- theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
44
- uint64_t seed, const Allocator& allocator, bool is_empty = true);
43
+ theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44
+ uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45
45
  theta_update_sketch_base(const theta_update_sketch_base& other);
46
46
  theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47
47
  ~theta_update_sketch_base();
@@ -75,6 +75,7 @@ struct theta_update_sketch_base {
75
75
  uint8_t lg_cur_size_;
76
76
  uint8_t lg_nom_size_;
77
77
  resize_factor rf_;
78
+ float p_;
78
79
  uint32_t num_entries_;
79
80
  uint64_t theta_;
80
81
  uint64_t seed_;
@@ -83,6 +84,7 @@ struct theta_update_sketch_base {
83
84
  void resize();
84
85
  void rebuild();
85
86
  void trim();
87
+ void reset();
86
88
 
87
89
  static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
88
90
  static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -94,7 +96,7 @@ struct theta_update_sketch_base {
94
96
  template<typename Derived, typename Allocator>
95
97
  class theta_base_builder {
96
98
  public:
97
- // TODO: Redundant and deprecated. Will be removed in next major verison release.
99
+ // TODO: Redundant and deprecated. Will be removed in next major version release.
98
100
  using resize_factor = theta_constants::resize_factor;
99
101
  static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
100
102
  static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
@@ -149,7 +151,6 @@ protected:
149
151
 
150
152
  uint64_t starting_theta() const;
151
153
  uint8_t starting_lg_size() const;
152
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
153
154
  };
154
155
 
155
156
  // key extractor
@@ -24,15 +24,18 @@
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
26
 
27
+ #include "theta_helpers.hpp"
28
+
27
29
  namespace datasketches {
28
30
 
29
31
  template<typename EN, typename EK, typename A>
30
- theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
32
+ theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
31
33
  allocator_(allocator),
32
34
  is_empty_(is_empty),
33
35
  lg_cur_size_(lg_cur_size),
34
36
  lg_nom_size_(lg_nom_size),
35
37
  rf_(rf),
38
+ p_(p),
36
39
  num_entries_(0),
37
40
  theta_(theta),
38
41
  seed_(seed),
@@ -52,6 +55,7 @@ is_empty_(other.is_empty_),
52
55
  lg_cur_size_(other.lg_cur_size_),
53
56
  lg_nom_size_(other.lg_nom_size_),
54
57
  rf_(other.rf_),
58
+ p_(other.p_),
55
59
  num_entries_(other.num_entries_),
56
60
  theta_(other.theta_),
57
61
  seed_(other.seed_),
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
77
81
  lg_cur_size_(other.lg_cur_size_),
78
82
  lg_nom_size_(other.lg_nom_size_),
79
83
  rf_(other.rf_),
84
+ p_(other.p_),
80
85
  num_entries_(other.num_entries_),
81
86
  theta_(other.theta_),
82
87
  seed_(other.seed_),
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
105
110
  std::swap(lg_cur_size_, copy.lg_cur_size_);
106
111
  std::swap(lg_nom_size_, copy.lg_nom_size_);
107
112
  std::swap(rf_, copy.rf_);
113
+ std::swap(p_, copy.p_);
108
114
  std::swap(num_entries_, copy.num_entries_);
109
115
  std::swap(theta_, copy.theta_);
110
116
  std::swap(seed_, copy.seed_);
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
119
125
  std::swap(lg_cur_size_, other.lg_cur_size_);
120
126
  std::swap(lg_nom_size_, other.lg_nom_size_);
121
127
  std::swap(rf_, other.rf_);
128
+ std::swap(p_, other.p_);
122
129
  std::swap(num_entries_, other.num_entries_);
123
130
  std::swap(theta_, other.theta_);
124
131
  std::swap(seed_, other.seed_);
@@ -247,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
247
254
  if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
248
255
  }
249
256
 
257
+ template<typename EN, typename EK, typename A>
258
+ void theta_update_sketch_base<EN, EK, A>::reset() {
259
+ const size_t cur_size = 1ULL << lg_cur_size_;
260
+ for (size_t i = 0; i < cur_size; ++i) {
261
+ if (EK()(entries_[i]) != 0) {
262
+ entries_[i].~EN();
263
+ EK()(entries_[i]) = 0;
264
+ }
265
+ }
266
+ const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
267
+ lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
268
+ if (starting_lg_size != lg_cur_size_) {
269
+ allocator_.deallocate(entries_, cur_size);
270
+ lg_cur_size_ = starting_lg_size;
271
+ const size_t new_size = 1ULL << starting_lg_size;
272
+ entries_ = allocator_.allocate(new_size);
273
+ for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
274
+ }
275
+ num_entries_ = 0;
276
+ theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
277
+ is_empty_ = true;
278
+ }
279
+
250
280
  template<typename EN, typename EK, typename A>
251
281
  void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
252
282
  // find the first empty slot
@@ -310,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
310
340
 
311
341
  template<typename Derived, typename Allocator>
312
342
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
313
- if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
314
- return theta_constants::MAX_THETA;
343
+ return theta_build_helper<true>::starting_theta_from_p(p_);
315
344
  }
316
345
 
317
346
  template<typename Derived, typename Allocator>
318
347
  uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
319
- return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
320
- }
321
-
322
- template<typename Derived, typename Allocator>
323
- uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
324
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
348
+ return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
325
349
  }
326
350
 
327
351
  // iterator
@@ -43,4 +43,5 @@ target_sources(theta_test
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
45
  theta_jaccard_similarity_test.cpp
46
+ theta_setop_test.cpp
46
47
  )