datasketches 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +7 -7
  4. data/ext/datasketches/theta_wrapper.cpp +20 -4
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
  7. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  8. data/vendor/datasketches-cpp/README.md +76 -9
  9. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  10. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  11. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  12. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  13. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
  14. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  15. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  16. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
  17. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  18. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  20. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
  21. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  22. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  24. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
  25. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
  26. data/vendor/datasketches-cpp/setup.py +1 -1
  27. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  28. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
  29. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  30. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  31. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  32. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  33. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
  34. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  35. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  36. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
  37. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  38. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  39. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
  40. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  41. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  42. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  43. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  44. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  46. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  47. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
  48. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
  49. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  50. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  51. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  52. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  53. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  54. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  55. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
  56. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  57. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  58. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  59. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  60. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
  61. metadata +11 -6
  62. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  63. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,64 +31,72 @@
31
31
  namespace datasketches {
32
32
 
33
33
  template<typename A>
34
- bool theta_sketch_alloc<A>::is_estimation_mode() const {
34
+ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
35
35
  return get_theta64() < theta_constants::MAX_THETA && !is_empty();
36
36
  }
37
37
 
38
38
  template<typename A>
39
- double theta_sketch_alloc<A>::get_theta() const {
39
+ double base_theta_sketch_alloc<A>::get_theta() const {
40
40
  return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
41
41
  }
42
42
 
43
43
  template<typename A>
44
- double theta_sketch_alloc<A>::get_estimate() const {
44
+ double base_theta_sketch_alloc<A>::get_estimate() const {
45
45
  return get_num_retained() / get_theta();
46
46
  }
47
47
 
48
48
  template<typename A>
49
- double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
49
+ double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
50
50
  if (!is_estimation_mode()) return get_num_retained();
51
51
  return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
52
52
  }
53
53
 
54
54
  template<typename A>
55
- double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
55
+ double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
56
56
  if (!is_estimation_mode()) return get_num_retained();
57
57
  return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
58
58
  }
59
59
 
60
60
  template<typename A>
61
- string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
62
- ostrstream os;
61
+ string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
62
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
63
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
64
+ std::ostringstream os;
63
65
  os << "### Theta sketch summary:" << std::endl;
64
- os << " num retained entries : " << get_num_retained() << std::endl;
65
- os << " seed hash : " << get_seed_hash() << std::endl;
66
- os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
67
- os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
68
- os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
69
- os << " theta (fraction) : " << get_theta() << std::endl;
70
- os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
66
+ os << " num retained entries : " << this->get_num_retained() << std::endl;
67
+ os << " seed hash : " << this->get_seed_hash() << std::endl;
68
+ os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
69
+ os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
70
+ os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
71
+ os << " theta (fraction) : " << this->get_theta() << std::endl;
72
+ os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
71
73
  os << " estimate : " << this->get_estimate() << std::endl;
72
74
  os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
73
75
  os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
74
76
  print_specifics(os);
75
77
  os << "### End sketch summary" << std::endl;
76
- if (detail) {
78
+ if (print_details) {
79
+ print_items(os);
80
+ }
81
+ return string<A>(os.str().c_str(), this->get_allocator());
82
+ }
83
+
84
+ template<typename A>
85
+ void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
77
86
  os << "### Retained entries" << std::endl;
78
87
  for (const auto& hash: *this) {
79
88
  os << hash << std::endl;
80
89
  }
81
90
  os << "### End retained entries" << std::endl;
82
- }
83
- return os.str();
84
91
  }
85
92
 
93
+
86
94
  // update sketch
87
95
 
88
96
  template<typename A>
89
97
  update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
90
- uint64_t theta, uint64_t seed, const A& allocator):
91
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
98
+ float p, uint64_t theta, uint64_t seed, const A& allocator):
99
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
92
100
  {}
93
101
 
94
102
  template<typename A>
@@ -103,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
103
111
 
104
112
  template<typename A>
105
113
  bool update_theta_sketch_alloc<A>::is_ordered() const {
106
- return false;
114
+ return table_.num_entries_ > 1 ? false : true;
107
115
  }
108
116
 
109
117
  template<typename A>
110
118
  uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
111
- return table_.theta_;
119
+ return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
112
120
  }
113
121
 
114
122
  template<typename A>
@@ -202,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
202
210
  table_.trim();
203
211
  }
204
212
 
213
+ template<typename A>
214
+ void update_theta_sketch_alloc<A>::reset() {
215
+ table_.reset();
216
+ }
217
+
205
218
  template<typename A>
206
219
  auto update_theta_sketch_alloc<A>::begin() -> iterator {
207
220
  return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
@@ -228,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
228
241
  }
229
242
 
230
243
  template<typename A>
231
- void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
244
+ void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
232
245
  os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
233
246
  os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
234
247
  os << " resize factor : " << (1 << table_.rf_) << std::endl;
@@ -241,7 +254,7 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
241
254
 
242
255
  template<typename A>
243
256
  update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
244
- return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
257
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
245
258
  }
246
259
 
247
260
  // compact sketch
@@ -255,16 +268,18 @@ seed_hash_(other.get_seed_hash()),
255
268
  theta_(other.get_theta64()),
256
269
  entries_(other.get_allocator())
257
270
  {
258
- entries_.reserve(other.get_num_retained());
259
- std::copy(other.begin(), other.end(), std::back_inserter(entries_));
260
- if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
271
+ if (!other.is_empty()) {
272
+ entries_.reserve(other.get_num_retained());
273
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
274
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
275
+ }
261
276
  }
262
277
 
263
278
  template<typename A>
264
279
  compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
265
280
  std::vector<uint64_t, A>&& entries):
266
281
  is_empty_(is_empty),
267
- is_ordered_(is_ordered),
282
+ is_ordered_(is_ordered || (entries.size() <= 1ULL)),
268
283
  seed_hash_(seed_hash),
269
284
  theta_(theta),
270
285
  entries_(std::move(entries))
@@ -321,7 +336,7 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
321
336
  }
322
337
 
323
338
  template<typename A>
324
- void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
339
+ void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
325
340
 
326
341
  template<typename A>
327
342
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
@@ -400,33 +415,101 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
400
415
  const auto preamble_longs = read<uint8_t>(is);
401
416
  const auto serial_version = read<uint8_t>(is);
402
417
  const auto type = read<uint8_t>(is);
403
- read<uint16_t>(is); // unused
404
- const auto flags_byte = read<uint8_t>(is);
405
- const auto seed_hash = read<uint16_t>(is);
406
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
407
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
408
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
409
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
418
+ switch (serial_version) {
419
+ case SERIAL_VERSION: {
420
+ read<uint16_t>(is); // unused
421
+ const auto flags_byte = read<uint8_t>(is);
422
+ const auto seed_hash = read<uint16_t>(is);
423
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
424
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
425
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
426
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
427
+
428
+ uint64_t theta = theta_constants::MAX_THETA;
429
+ uint32_t num_entries = 0;
430
+ if (!is_empty) {
431
+ if (preamble_longs == 1) {
432
+ num_entries = 1;
433
+ } else {
434
+ num_entries = read<uint32_t>(is);
435
+ read<uint32_t>(is); // unused
436
+ if (preamble_longs > 2) {
437
+ theta = read<uint64_t>(is);
438
+ }
439
+ }
440
+ }
441
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
442
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
410
443
 
411
- uint64_t theta = theta_constants::MAX_THETA;
412
- uint32_t num_entries = 0;
413
- if (!is_empty) {
414
- if (preamble_longs == 1) {
415
- num_entries = 1;
416
- } else {
417
- num_entries = read<uint32_t>(is);
444
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
445
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
446
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
447
+ }
448
+ case 1: {
449
+ const auto seed_hash = compute_seed_hash(seed);
450
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
451
+ read<uint8_t>(is); // unused
418
452
  read<uint32_t>(is); // unused
419
- if (preamble_longs > 2) {
420
- theta = read<uint64_t>(is);
453
+ const auto num_entries = read<uint32_t>(is);
454
+ read<uint32_t>(is); //unused
455
+ const auto theta = read<uint64_t>(is);
456
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
457
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
458
+ if (!is_empty)
459
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
460
+ if (!is.good())
461
+ throw std::runtime_error("error reading from std::istream");
462
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
463
+ }
464
+ case 2: {
465
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
466
+ read<uint8_t>(is); // unused
467
+ read<uint16_t>(is); // unused
468
+ const uint16_t seed_hash = read<uint16_t>(is);
469
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
470
+ if (preamble_longs == 1) {
471
+ if (!is.good())
472
+ throw std::runtime_error("error reading from std::istream");
473
+ std::vector<uint64_t> entries(0, 0, allocator);
474
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
475
+ } else if (preamble_longs == 2) {
476
+ const uint32_t num_entries = read<uint32_t>(is);
477
+ read<uint32_t>(is); // unused
478
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
479
+ if (num_entries == 0) {
480
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
481
+ }
482
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
483
+ if (!is.good())
484
+ throw std::runtime_error("error reading from std::istream");
485
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
486
+ } else if (preamble_longs == 3) {
487
+ const uint32_t num_entries = read<uint32_t>(is);
488
+ read<uint32_t>(is); // unused
489
+ const auto theta = read<uint64_t>(is);
490
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
491
+ std::vector<uint64_t> entries(num_entries, 0, allocator);
492
+ if (is_empty) {
493
+ if (!is.good())
494
+ throw std::runtime_error("error reading from std::istream");
495
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
496
+ } else {
497
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
498
+ if (!is.good())
499
+ throw std::runtime_error("error reading from std::istream");
500
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
501
+ }
502
+ } else {
503
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
421
504
  }
422
- }
423
505
  }
424
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
425
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
426
-
427
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
428
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
429
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
506
+ default:
507
+ // this should always fail since the valid cases are handled above
508
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
509
+ // this throw is never reached, because check_serial_version will throw an informative exception.
510
+ // This is only here to avoid a compiler warning about a path without a return value.
511
+ throw std::invalid_argument("unexpected sketch serialization version");
512
+ }
430
513
  }
431
514
 
432
515
  template<typename A>
@@ -533,6 +616,18 @@ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
533
616
  return entries_ + num_entries_;
534
617
  }
535
618
 
619
+ template<typename A>
620
+ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
621
+
622
+ template<typename A>
623
+ void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
624
+ os << "### Retained entries" << std::endl;
625
+ for (const auto& hash: *this) {
626
+ os << hash << std::endl;
627
+ }
628
+ os << "### End retained entries" << std::endl;
629
+ }
630
+
536
631
  } /* namespace datasketches */
537
632
 
538
633
  #endif
@@ -60,11 +60,16 @@ public:
60
60
  */
61
61
  CompactSketch get_result(bool ordered = true) const;
62
62
 
63
+ /**
64
+ * Reset the union to the initial empty state
65
+ */
66
+ void reset();
67
+
63
68
  private:
64
69
  State state_;
65
70
 
66
71
  // for builder
67
- theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
72
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
68
73
  };
69
74
 
70
75
  template<typename A>
@@ -38,7 +38,7 @@ public:
38
38
  using resize_factor = typename hash_table::resize_factor;
39
39
  using comparator = compare_by_key<ExtractKey>;
40
40
 
41
- theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
41
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
42
42
 
43
43
  template<typename FwdSketch>
44
44
  void update(FwdSketch&& sketch);
@@ -47,6 +47,8 @@ public:
47
47
 
48
48
  const Policy& get_policy() const;
49
49
 
50
+ void reset();
51
+
50
52
  private:
51
53
  Policy policy_;
52
54
  hash_table table_;
@@ -28,9 +28,9 @@ namespace datasketches {
28
28
 
29
29
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
30
30
  theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
31
- uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
31
+ float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
32
32
  policy_(policy),
33
- table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
33
+ table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
34
34
  union_theta_(table_.theta_)
35
35
  {}
36
36
 
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
84
84
  return policy_;
85
85
  }
86
86
 
87
+ template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
88
+ void theta_union_base<EN, EK, P, S, CS, A>::reset() {
89
+ table_.reset();
90
+ union_theta_ = table_.theta_;
91
+ }
92
+
87
93
  } /* namespace datasketches */
88
94
 
89
95
  #endif
@@ -23,8 +23,8 @@
23
23
  namespace datasketches {
24
24
 
25
25
  template<typename A>
26
- theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
27
- state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
26
+ theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
27
+ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
28
28
  {}
29
29
 
30
30
  template<typename A>
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
38
38
  return state_.get_result(ordered);
39
39
  }
40
40
 
41
+ template<typename A>
42
+ void theta_union_alloc<A>::reset() {
43
+ state_.reset();
44
+ }
45
+
41
46
  template<typename A>
42
47
  theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
43
48
 
44
49
  template<typename A>
45
50
  auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
46
- return theta_union_alloc(
47
- this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
48
- this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
51
+ return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
49
52
  }
50
53
 
51
54
  } /* namespace datasketches */
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
40
40
  using resize_factor = theta_constants::resize_factor;
41
41
  using comparator = compare_by_key<ExtractKey>;
42
42
 
43
- theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
44
- uint64_t seed, const Allocator& allocator, bool is_empty = true);
43
+ theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
44
+ uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
45
45
  theta_update_sketch_base(const theta_update_sketch_base& other);
46
46
  theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
47
47
  ~theta_update_sketch_base();
@@ -75,6 +75,7 @@ struct theta_update_sketch_base {
75
75
  uint8_t lg_cur_size_;
76
76
  uint8_t lg_nom_size_;
77
77
  resize_factor rf_;
78
+ float p_;
78
79
  uint32_t num_entries_;
79
80
  uint64_t theta_;
80
81
  uint64_t seed_;
@@ -83,6 +84,7 @@ struct theta_update_sketch_base {
83
84
  void resize();
84
85
  void rebuild();
85
86
  void trim();
87
+ void reset();
86
88
 
87
89
  static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
88
90
  static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
@@ -94,7 +96,7 @@ struct theta_update_sketch_base {
94
96
  template<typename Derived, typename Allocator>
95
97
  class theta_base_builder {
96
98
  public:
97
- // TODO: Redundant and deprecated. Will be removed in next major verison release.
99
+ // TODO: Redundant and deprecated. Will be removed in next major version release.
98
100
  using resize_factor = theta_constants::resize_factor;
99
101
  static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
100
102
  static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
@@ -149,7 +151,6 @@ protected:
149
151
 
150
152
  uint64_t starting_theta() const;
151
153
  uint8_t starting_lg_size() const;
152
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
153
154
  };
154
155
 
155
156
  // key extractor
@@ -24,15 +24,18 @@
24
24
  #include <sstream>
25
25
  #include <algorithm>
26
26
 
27
+ #include "theta_helpers.hpp"
28
+
27
29
  namespace datasketches {
28
30
 
29
31
  template<typename EN, typename EK, typename A>
30
- theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
32
+ theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
31
33
  allocator_(allocator),
32
34
  is_empty_(is_empty),
33
35
  lg_cur_size_(lg_cur_size),
34
36
  lg_nom_size_(lg_nom_size),
35
37
  rf_(rf),
38
+ p_(p),
36
39
  num_entries_(0),
37
40
  theta_(theta),
38
41
  seed_(seed),
@@ -52,6 +55,7 @@ is_empty_(other.is_empty_),
52
55
  lg_cur_size_(other.lg_cur_size_),
53
56
  lg_nom_size_(other.lg_nom_size_),
54
57
  rf_(other.rf_),
58
+ p_(other.p_),
55
59
  num_entries_(other.num_entries_),
56
60
  theta_(other.theta_),
57
61
  seed_(other.seed_),
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
77
81
  lg_cur_size_(other.lg_cur_size_),
78
82
  lg_nom_size_(other.lg_nom_size_),
79
83
  rf_(other.rf_),
84
+ p_(other.p_),
80
85
  num_entries_(other.num_entries_),
81
86
  theta_(other.theta_),
82
87
  seed_(other.seed_),
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
105
110
  std::swap(lg_cur_size_, copy.lg_cur_size_);
106
111
  std::swap(lg_nom_size_, copy.lg_nom_size_);
107
112
  std::swap(rf_, copy.rf_);
113
+ std::swap(p_, copy.p_);
108
114
  std::swap(num_entries_, copy.num_entries_);
109
115
  std::swap(theta_, copy.theta_);
110
116
  std::swap(seed_, copy.seed_);
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
119
125
  std::swap(lg_cur_size_, other.lg_cur_size_);
120
126
  std::swap(lg_nom_size_, other.lg_nom_size_);
121
127
  std::swap(rf_, other.rf_);
128
+ std::swap(p_, other.p_);
122
129
  std::swap(num_entries_, other.num_entries_);
123
130
  std::swap(theta_, other.theta_);
124
131
  std::swap(seed_, other.seed_);
@@ -247,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
247
254
  if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
248
255
  }
249
256
 
257
+ template<typename EN, typename EK, typename A>
258
+ void theta_update_sketch_base<EN, EK, A>::reset() {
259
+ const size_t cur_size = 1ULL << lg_cur_size_;
260
+ for (size_t i = 0; i < cur_size; ++i) {
261
+ if (EK()(entries_[i]) != 0) {
262
+ entries_[i].~EN();
263
+ EK()(entries_[i]) = 0;
264
+ }
265
+ }
266
+ const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
267
+ lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
268
+ if (starting_lg_size != lg_cur_size_) {
269
+ allocator_.deallocate(entries_, cur_size);
270
+ lg_cur_size_ = starting_lg_size;
271
+ const size_t new_size = 1ULL << starting_lg_size;
272
+ entries_ = allocator_.allocate(new_size);
273
+ for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
274
+ }
275
+ num_entries_ = 0;
276
+ theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
277
+ is_empty_ = true;
278
+ }
279
+
250
280
  template<typename EN, typename EK, typename A>
251
281
  void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
252
282
  // find the first empty slot
@@ -310,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
310
340
 
311
341
  template<typename Derived, typename Allocator>
312
342
  uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
313
- if (p_ < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p_);
314
- return theta_constants::MAX_THETA;
343
+ return theta_build_helper<true>::starting_theta_from_p(p_);
315
344
  }
316
345
 
317
346
  template<typename Derived, typename Allocator>
318
347
  uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
319
- return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
320
- }
321
-
322
- template<typename Derived, typename Allocator>
323
- uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
324
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
348
+ return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
325
349
  }
326
350
 
327
351
  // iterator
@@ -43,4 +43,5 @@ target_sources(theta_test
43
43
  theta_intersection_test.cpp
44
44
  theta_a_not_b_test.cpp
45
45
  theta_jaccard_similarity_test.cpp
46
+ theta_setop_test.cpp
46
47
  )