datasketches 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +7 -7
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
- metadata +11 -6
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,64 +31,72 @@
|
|
31
31
|
namespace datasketches {
|
32
32
|
|
33
33
|
template<typename A>
|
34
|
-
bool
|
34
|
+
bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
|
35
35
|
return get_theta64() < theta_constants::MAX_THETA && !is_empty();
|
36
36
|
}
|
37
37
|
|
38
38
|
template<typename A>
|
39
|
-
double
|
39
|
+
double base_theta_sketch_alloc<A>::get_theta() const {
|
40
40
|
return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
|
41
41
|
}
|
42
42
|
|
43
43
|
template<typename A>
|
44
|
-
double
|
44
|
+
double base_theta_sketch_alloc<A>::get_estimate() const {
|
45
45
|
return get_num_retained() / get_theta();
|
46
46
|
}
|
47
47
|
|
48
48
|
template<typename A>
|
49
|
-
double
|
49
|
+
double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
|
50
50
|
if (!is_estimation_mode()) return get_num_retained();
|
51
51
|
return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
|
52
52
|
}
|
53
53
|
|
54
54
|
template<typename A>
|
55
|
-
double
|
55
|
+
double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
|
56
56
|
if (!is_estimation_mode()) return get_num_retained();
|
57
57
|
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
58
58
|
}
|
59
59
|
|
60
60
|
template<typename A>
|
61
|
-
string<A>
|
62
|
-
|
61
|
+
string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
|
62
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
63
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
64
|
+
std::ostringstream os;
|
63
65
|
os << "### Theta sketch summary:" << std::endl;
|
64
|
-
os << " num retained entries : " << get_num_retained() << std::endl;
|
65
|
-
os << " seed hash : " << get_seed_hash() << std::endl;
|
66
|
-
os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
|
67
|
-
os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
|
68
|
-
os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
69
|
-
os << " theta (fraction) : " << get_theta() << std::endl;
|
70
|
-
os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
|
66
|
+
os << " num retained entries : " << this->get_num_retained() << std::endl;
|
67
|
+
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
68
|
+
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
69
|
+
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
70
|
+
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
71
|
+
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
72
|
+
os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
|
71
73
|
os << " estimate : " << this->get_estimate() << std::endl;
|
72
74
|
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
73
75
|
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
74
76
|
print_specifics(os);
|
75
77
|
os << "### End sketch summary" << std::endl;
|
76
|
-
if (
|
78
|
+
if (print_details) {
|
79
|
+
print_items(os);
|
80
|
+
}
|
81
|
+
return string<A>(os.str().c_str(), this->get_allocator());
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename A>
|
85
|
+
void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
77
86
|
os << "### Retained entries" << std::endl;
|
78
87
|
for (const auto& hash: *this) {
|
79
88
|
os << hash << std::endl;
|
80
89
|
}
|
81
90
|
os << "### End retained entries" << std::endl;
|
82
|
-
}
|
83
|
-
return os.str();
|
84
91
|
}
|
85
92
|
|
93
|
+
|
86
94
|
// update sketch
|
87
95
|
|
88
96
|
template<typename A>
|
89
97
|
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
90
|
-
uint64_t theta, uint64_t seed, const A& allocator):
|
91
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
|
98
|
+
float p, uint64_t theta, uint64_t seed, const A& allocator):
|
99
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
|
92
100
|
{}
|
93
101
|
|
94
102
|
template<typename A>
|
@@ -103,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
|
|
103
111
|
|
104
112
|
template<typename A>
|
105
113
|
bool update_theta_sketch_alloc<A>::is_ordered() const {
|
106
|
-
return false;
|
114
|
+
return table_.num_entries_ > 1 ? false : true;
|
107
115
|
}
|
108
116
|
|
109
117
|
template<typename A>
|
110
118
|
uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
|
111
|
-
return table_.theta_;
|
119
|
+
return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
|
112
120
|
}
|
113
121
|
|
114
122
|
template<typename A>
|
@@ -202,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
|
|
202
210
|
table_.trim();
|
203
211
|
}
|
204
212
|
|
213
|
+
template<typename A>
|
214
|
+
void update_theta_sketch_alloc<A>::reset() {
|
215
|
+
table_.reset();
|
216
|
+
}
|
217
|
+
|
205
218
|
template<typename A>
|
206
219
|
auto update_theta_sketch_alloc<A>::begin() -> iterator {
|
207
220
|
return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
@@ -228,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
|
|
228
241
|
}
|
229
242
|
|
230
243
|
template<typename A>
|
231
|
-
void update_theta_sketch_alloc<A>::print_specifics(
|
244
|
+
void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
|
232
245
|
os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
|
233
246
|
os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
|
234
247
|
os << " resize factor : " << (1 << table_.rf_) << std::endl;
|
@@ -241,7 +254,7 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
|
|
241
254
|
|
242
255
|
template<typename A>
|
243
256
|
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
244
|
-
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
257
|
+
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
245
258
|
}
|
246
259
|
|
247
260
|
// compact sketch
|
@@ -255,16 +268,18 @@ seed_hash_(other.get_seed_hash()),
|
|
255
268
|
theta_(other.get_theta64()),
|
256
269
|
entries_(other.get_allocator())
|
257
270
|
{
|
258
|
-
|
259
|
-
|
260
|
-
|
271
|
+
if (!other.is_empty()) {
|
272
|
+
entries_.reserve(other.get_num_retained());
|
273
|
+
std::copy(other.begin(), other.end(), std::back_inserter(entries_));
|
274
|
+
if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
|
275
|
+
}
|
261
276
|
}
|
262
277
|
|
263
278
|
template<typename A>
|
264
279
|
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
|
265
280
|
std::vector<uint64_t, A>&& entries):
|
266
281
|
is_empty_(is_empty),
|
267
|
-
is_ordered_(is_ordered),
|
282
|
+
is_ordered_(is_ordered || (entries.size() <= 1ULL)),
|
268
283
|
seed_hash_(seed_hash),
|
269
284
|
theta_(theta),
|
270
285
|
entries_(std::move(entries))
|
@@ -321,7 +336,7 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
321
336
|
}
|
322
337
|
|
323
338
|
template<typename A>
|
324
|
-
void compact_theta_sketch_alloc<A>::print_specifics(
|
339
|
+
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
325
340
|
|
326
341
|
template<typename A>
|
327
342
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
@@ -400,33 +415,101 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
400
415
|
const auto preamble_longs = read<uint8_t>(is);
|
401
416
|
const auto serial_version = read<uint8_t>(is);
|
402
417
|
const auto type = read<uint8_t>(is);
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
418
|
+
switch (serial_version) {
|
419
|
+
case SERIAL_VERSION: {
|
420
|
+
read<uint16_t>(is); // unused
|
421
|
+
const auto flags_byte = read<uint8_t>(is);
|
422
|
+
const auto seed_hash = read<uint16_t>(is);
|
423
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
424
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
425
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
426
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
427
|
+
|
428
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
429
|
+
uint32_t num_entries = 0;
|
430
|
+
if (!is_empty) {
|
431
|
+
if (preamble_longs == 1) {
|
432
|
+
num_entries = 1;
|
433
|
+
} else {
|
434
|
+
num_entries = read<uint32_t>(is);
|
435
|
+
read<uint32_t>(is); // unused
|
436
|
+
if (preamble_longs > 2) {
|
437
|
+
theta = read<uint64_t>(is);
|
438
|
+
}
|
439
|
+
}
|
440
|
+
}
|
441
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
442
|
+
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
410
443
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
444
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
445
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
446
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
447
|
+
}
|
448
|
+
case 1: {
|
449
|
+
const auto seed_hash = compute_seed_hash(seed);
|
450
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
451
|
+
read<uint8_t>(is); // unused
|
418
452
|
read<uint32_t>(is); // unused
|
419
|
-
|
420
|
-
|
453
|
+
const auto num_entries = read<uint32_t>(is);
|
454
|
+
read<uint32_t>(is); //unused
|
455
|
+
const auto theta = read<uint64_t>(is);
|
456
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
457
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
458
|
+
if (!is_empty)
|
459
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
460
|
+
if (!is.good())
|
461
|
+
throw std::runtime_error("error reading from std::istream");
|
462
|
+
return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
|
463
|
+
}
|
464
|
+
case 2: {
|
465
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
466
|
+
read<uint8_t>(is); // unused
|
467
|
+
read<uint16_t>(is); // unused
|
468
|
+
const uint16_t seed_hash = read<uint16_t>(is);
|
469
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
470
|
+
if (preamble_longs == 1) {
|
471
|
+
if (!is.good())
|
472
|
+
throw std::runtime_error("error reading from std::istream");
|
473
|
+
std::vector<uint64_t> entries(0, 0, allocator);
|
474
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
475
|
+
} else if (preamble_longs == 2) {
|
476
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
477
|
+
read<uint32_t>(is); // unused
|
478
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
479
|
+
if (num_entries == 0) {
|
480
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
481
|
+
}
|
482
|
+
read(is, entries.data(), entries.size() * sizeof(uint64_t));
|
483
|
+
if (!is.good())
|
484
|
+
throw std::runtime_error("error reading from std::istream");
|
485
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
486
|
+
} else if (preamble_longs == 3) {
|
487
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
488
|
+
read<uint32_t>(is); // unused
|
489
|
+
const auto theta = read<uint64_t>(is);
|
490
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
491
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
492
|
+
if (is_empty) {
|
493
|
+
if (!is.good())
|
494
|
+
throw std::runtime_error("error reading from std::istream");
|
495
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
|
496
|
+
} else {
|
497
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
498
|
+
if (!is.good())
|
499
|
+
throw std::runtime_error("error reading from std::istream");
|
500
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
|
501
|
+
}
|
502
|
+
} else {
|
503
|
+
throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
|
421
504
|
}
|
422
|
-
}
|
423
505
|
}
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
506
|
+
default:
|
507
|
+
// this should always fail since the valid cases are handled above
|
508
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
509
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
510
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
511
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
512
|
+
}
|
430
513
|
}
|
431
514
|
|
432
515
|
template<typename A>
|
@@ -533,6 +616,18 @@ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
533
616
|
return entries_ + num_entries_;
|
534
617
|
}
|
535
618
|
|
619
|
+
template<typename A>
|
620
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
621
|
+
|
622
|
+
template<typename A>
|
623
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
624
|
+
os << "### Retained entries" << std::endl;
|
625
|
+
for (const auto& hash: *this) {
|
626
|
+
os << hash << std::endl;
|
627
|
+
}
|
628
|
+
os << "### End retained entries" << std::endl;
|
629
|
+
}
|
630
|
+
|
536
631
|
} /* namespace datasketches */
|
537
632
|
|
538
633
|
#endif
|
@@ -60,11 +60,16 @@ public:
|
|
60
60
|
*/
|
61
61
|
CompactSketch get_result(bool ordered = true) const;
|
62
62
|
|
63
|
+
/**
|
64
|
+
* Reset the union to the initial empty state
|
65
|
+
*/
|
66
|
+
void reset();
|
67
|
+
|
63
68
|
private:
|
64
69
|
State state_;
|
65
70
|
|
66
71
|
// for builder
|
67
|
-
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
72
|
+
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
68
73
|
};
|
69
74
|
|
70
75
|
template<typename A>
|
@@ -38,7 +38,7 @@ public:
|
|
38
38
|
using resize_factor = typename hash_table::resize_factor;
|
39
39
|
using comparator = compare_by_key<ExtractKey>;
|
40
40
|
|
41
|
-
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
41
|
+
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
42
42
|
|
43
43
|
template<typename FwdSketch>
|
44
44
|
void update(FwdSketch&& sketch);
|
@@ -47,6 +47,8 @@ public:
|
|
47
47
|
|
48
48
|
const Policy& get_policy() const;
|
49
49
|
|
50
|
+
void reset();
|
51
|
+
|
50
52
|
private:
|
51
53
|
Policy policy_;
|
52
54
|
hash_table table_;
|
@@ -28,9 +28,9 @@ namespace datasketches {
|
|
28
28
|
|
29
29
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
30
30
|
theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
31
|
-
uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
31
|
+
float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
32
32
|
policy_(policy),
|
33
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
|
33
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
|
34
34
|
union_theta_(table_.theta_)
|
35
35
|
{}
|
36
36
|
|
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
|
|
84
84
|
return policy_;
|
85
85
|
}
|
86
86
|
|
87
|
+
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
88
|
+
void theta_union_base<EN, EK, P, S, CS, A>::reset() {
|
89
|
+
table_.reset();
|
90
|
+
union_theta_ = table_.theta_;
|
91
|
+
}
|
92
|
+
|
87
93
|
} /* namespace datasketches */
|
88
94
|
|
89
95
|
#endif
|
@@ -23,8 +23,8 @@
|
|
23
23
|
namespace datasketches {
|
24
24
|
|
25
25
|
template<typename A>
|
26
|
-
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
-
state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
|
26
|
+
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
|
28
28
|
{}
|
29
29
|
|
30
30
|
template<typename A>
|
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
38
38
|
return state_.get_result(ordered);
|
39
39
|
}
|
40
40
|
|
41
|
+
template<typename A>
|
42
|
+
void theta_union_alloc<A>::reset() {
|
43
|
+
state_.reset();
|
44
|
+
}
|
45
|
+
|
41
46
|
template<typename A>
|
42
47
|
theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
43
48
|
|
44
49
|
template<typename A>
|
45
50
|
auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
|
46
|
-
return theta_union_alloc(
|
47
|
-
this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
|
48
|
-
this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
51
|
+
return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
49
52
|
}
|
50
53
|
|
51
54
|
} /* namespace datasketches */
|
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
|
|
40
40
|
using resize_factor = theta_constants::resize_factor;
|
41
41
|
using comparator = compare_by_key<ExtractKey>;
|
42
42
|
|
43
|
-
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
44
|
-
uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
43
|
+
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
44
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
45
45
|
theta_update_sketch_base(const theta_update_sketch_base& other);
|
46
46
|
theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
|
47
47
|
~theta_update_sketch_base();
|
@@ -75,6 +75,7 @@ struct theta_update_sketch_base {
|
|
75
75
|
uint8_t lg_cur_size_;
|
76
76
|
uint8_t lg_nom_size_;
|
77
77
|
resize_factor rf_;
|
78
|
+
float p_;
|
78
79
|
uint32_t num_entries_;
|
79
80
|
uint64_t theta_;
|
80
81
|
uint64_t seed_;
|
@@ -83,6 +84,7 @@ struct theta_update_sketch_base {
|
|
83
84
|
void resize();
|
84
85
|
void rebuild();
|
85
86
|
void trim();
|
87
|
+
void reset();
|
86
88
|
|
87
89
|
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
88
90
|
static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
|
@@ -94,7 +96,7 @@ struct theta_update_sketch_base {
|
|
94
96
|
template<typename Derived, typename Allocator>
|
95
97
|
class theta_base_builder {
|
96
98
|
public:
|
97
|
-
// TODO: Redundant and deprecated. Will be removed in next major
|
99
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
98
100
|
using resize_factor = theta_constants::resize_factor;
|
99
101
|
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
100
102
|
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
@@ -149,7 +151,6 @@ protected:
|
|
149
151
|
|
150
152
|
uint64_t starting_theta() const;
|
151
153
|
uint8_t starting_lg_size() const;
|
152
|
-
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
153
154
|
};
|
154
155
|
|
155
156
|
// key extractor
|
@@ -24,15 +24,18 @@
|
|
24
24
|
#include <sstream>
|
25
25
|
#include <algorithm>
|
26
26
|
|
27
|
+
#include "theta_helpers.hpp"
|
28
|
+
|
27
29
|
namespace datasketches {
|
28
30
|
|
29
31
|
template<typename EN, typename EK, typename A>
|
30
|
-
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
32
|
+
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
31
33
|
allocator_(allocator),
|
32
34
|
is_empty_(is_empty),
|
33
35
|
lg_cur_size_(lg_cur_size),
|
34
36
|
lg_nom_size_(lg_nom_size),
|
35
37
|
rf_(rf),
|
38
|
+
p_(p),
|
36
39
|
num_entries_(0),
|
37
40
|
theta_(theta),
|
38
41
|
seed_(seed),
|
@@ -52,6 +55,7 @@ is_empty_(other.is_empty_),
|
|
52
55
|
lg_cur_size_(other.lg_cur_size_),
|
53
56
|
lg_nom_size_(other.lg_nom_size_),
|
54
57
|
rf_(other.rf_),
|
58
|
+
p_(other.p_),
|
55
59
|
num_entries_(other.num_entries_),
|
56
60
|
theta_(other.theta_),
|
57
61
|
seed_(other.seed_),
|
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
|
|
77
81
|
lg_cur_size_(other.lg_cur_size_),
|
78
82
|
lg_nom_size_(other.lg_nom_size_),
|
79
83
|
rf_(other.rf_),
|
84
|
+
p_(other.p_),
|
80
85
|
num_entries_(other.num_entries_),
|
81
86
|
theta_(other.theta_),
|
82
87
|
seed_(other.seed_),
|
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
105
110
|
std::swap(lg_cur_size_, copy.lg_cur_size_);
|
106
111
|
std::swap(lg_nom_size_, copy.lg_nom_size_);
|
107
112
|
std::swap(rf_, copy.rf_);
|
113
|
+
std::swap(p_, copy.p_);
|
108
114
|
std::swap(num_entries_, copy.num_entries_);
|
109
115
|
std::swap(theta_, copy.theta_);
|
110
116
|
std::swap(seed_, copy.seed_);
|
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
119
125
|
std::swap(lg_cur_size_, other.lg_cur_size_);
|
120
126
|
std::swap(lg_nom_size_, other.lg_nom_size_);
|
121
127
|
std::swap(rf_, other.rf_);
|
128
|
+
std::swap(p_, other.p_);
|
122
129
|
std::swap(num_entries_, other.num_entries_);
|
123
130
|
std::swap(theta_, other.theta_);
|
124
131
|
std::swap(seed_, other.seed_);
|
@@ -247,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
|
|
247
254
|
if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
|
248
255
|
}
|
249
256
|
|
257
|
+
template<typename EN, typename EK, typename A>
|
258
|
+
void theta_update_sketch_base<EN, EK, A>::reset() {
|
259
|
+
const size_t cur_size = 1ULL << lg_cur_size_;
|
260
|
+
for (size_t i = 0; i < cur_size; ++i) {
|
261
|
+
if (EK()(entries_[i]) != 0) {
|
262
|
+
entries_[i].~EN();
|
263
|
+
EK()(entries_[i]) = 0;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
|
267
|
+
lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
|
268
|
+
if (starting_lg_size != lg_cur_size_) {
|
269
|
+
allocator_.deallocate(entries_, cur_size);
|
270
|
+
lg_cur_size_ = starting_lg_size;
|
271
|
+
const size_t new_size = 1ULL << starting_lg_size;
|
272
|
+
entries_ = allocator_.allocate(new_size);
|
273
|
+
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
274
|
+
}
|
275
|
+
num_entries_ = 0;
|
276
|
+
theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
|
277
|
+
is_empty_ = true;
|
278
|
+
}
|
279
|
+
|
250
280
|
template<typename EN, typename EK, typename A>
|
251
281
|
void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
|
252
282
|
// find the first empty slot
|
@@ -310,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
|
|
310
340
|
|
311
341
|
template<typename Derived, typename Allocator>
|
312
342
|
uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
313
|
-
|
314
|
-
return theta_constants::MAX_THETA;
|
343
|
+
return theta_build_helper<true>::starting_theta_from_p(p_);
|
315
344
|
}
|
316
345
|
|
317
346
|
template<typename Derived, typename Allocator>
|
318
347
|
uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
|
319
|
-
return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
320
|
-
}
|
321
|
-
|
322
|
-
template<typename Derived, typename Allocator>
|
323
|
-
uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
324
|
-
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
348
|
+
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
325
349
|
}
|
326
350
|
|
327
351
|
// iterator
|
Binary file
|