datasketches 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +7 -7
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
- metadata +11 -6
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -31,64 +31,72 @@
|
|
31
31
|
namespace datasketches {
|
32
32
|
|
33
33
|
template<typename A>
|
34
|
-
bool
|
34
|
+
bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
|
35
35
|
return get_theta64() < theta_constants::MAX_THETA && !is_empty();
|
36
36
|
}
|
37
37
|
|
38
38
|
template<typename A>
|
39
|
-
double
|
39
|
+
double base_theta_sketch_alloc<A>::get_theta() const {
|
40
40
|
return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
|
41
41
|
}
|
42
42
|
|
43
43
|
template<typename A>
|
44
|
-
double
|
44
|
+
double base_theta_sketch_alloc<A>::get_estimate() const {
|
45
45
|
return get_num_retained() / get_theta();
|
46
46
|
}
|
47
47
|
|
48
48
|
template<typename A>
|
49
|
-
double
|
49
|
+
double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
|
50
50
|
if (!is_estimation_mode()) return get_num_retained();
|
51
51
|
return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
|
52
52
|
}
|
53
53
|
|
54
54
|
template<typename A>
|
55
|
-
double
|
55
|
+
double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
|
56
56
|
if (!is_estimation_mode()) return get_num_retained();
|
57
57
|
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
58
58
|
}
|
59
59
|
|
60
60
|
template<typename A>
|
61
|
-
string<A>
|
62
|
-
|
61
|
+
string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
|
62
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
63
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
64
|
+
std::ostringstream os;
|
63
65
|
os << "### Theta sketch summary:" << std::endl;
|
64
|
-
os << " num retained entries : " << get_num_retained() << std::endl;
|
65
|
-
os << " seed hash : " << get_seed_hash() << std::endl;
|
66
|
-
os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
|
67
|
-
os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
|
68
|
-
os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
69
|
-
os << " theta (fraction) : " << get_theta() << std::endl;
|
70
|
-
os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
|
66
|
+
os << " num retained entries : " << this->get_num_retained() << std::endl;
|
67
|
+
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
68
|
+
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
69
|
+
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
70
|
+
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
71
|
+
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
72
|
+
os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
|
71
73
|
os << " estimate : " << this->get_estimate() << std::endl;
|
72
74
|
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
73
75
|
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
74
76
|
print_specifics(os);
|
75
77
|
os << "### End sketch summary" << std::endl;
|
76
|
-
if (
|
78
|
+
if (print_details) {
|
79
|
+
print_items(os);
|
80
|
+
}
|
81
|
+
return string<A>(os.str().c_str(), this->get_allocator());
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename A>
|
85
|
+
void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
77
86
|
os << "### Retained entries" << std::endl;
|
78
87
|
for (const auto& hash: *this) {
|
79
88
|
os << hash << std::endl;
|
80
89
|
}
|
81
90
|
os << "### End retained entries" << std::endl;
|
82
|
-
}
|
83
|
-
return os.str();
|
84
91
|
}
|
85
92
|
|
93
|
+
|
86
94
|
// update sketch
|
87
95
|
|
88
96
|
template<typename A>
|
89
97
|
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
90
|
-
uint64_t theta, uint64_t seed, const A& allocator):
|
91
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
|
98
|
+
float p, uint64_t theta, uint64_t seed, const A& allocator):
|
99
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
|
92
100
|
{}
|
93
101
|
|
94
102
|
template<typename A>
|
@@ -103,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
|
|
103
111
|
|
104
112
|
template<typename A>
|
105
113
|
bool update_theta_sketch_alloc<A>::is_ordered() const {
|
106
|
-
return false;
|
114
|
+
return table_.num_entries_ > 1 ? false : true;
|
107
115
|
}
|
108
116
|
|
109
117
|
template<typename A>
|
110
118
|
uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
|
111
|
-
return table_.theta_;
|
119
|
+
return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
|
112
120
|
}
|
113
121
|
|
114
122
|
template<typename A>
|
@@ -202,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
|
|
202
210
|
table_.trim();
|
203
211
|
}
|
204
212
|
|
213
|
+
template<typename A>
|
214
|
+
void update_theta_sketch_alloc<A>::reset() {
|
215
|
+
table_.reset();
|
216
|
+
}
|
217
|
+
|
205
218
|
template<typename A>
|
206
219
|
auto update_theta_sketch_alloc<A>::begin() -> iterator {
|
207
220
|
return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
@@ -228,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
|
|
228
241
|
}
|
229
242
|
|
230
243
|
template<typename A>
|
231
|
-
void update_theta_sketch_alloc<A>::print_specifics(
|
244
|
+
void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
|
232
245
|
os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
|
233
246
|
os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
|
234
247
|
os << " resize factor : " << (1 << table_.rf_) << std::endl;
|
@@ -241,7 +254,7 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
|
|
241
254
|
|
242
255
|
template<typename A>
|
243
256
|
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
244
|
-
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
257
|
+
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
245
258
|
}
|
246
259
|
|
247
260
|
// compact sketch
|
@@ -255,16 +268,18 @@ seed_hash_(other.get_seed_hash()),
|
|
255
268
|
theta_(other.get_theta64()),
|
256
269
|
entries_(other.get_allocator())
|
257
270
|
{
|
258
|
-
|
259
|
-
|
260
|
-
|
271
|
+
if (!other.is_empty()) {
|
272
|
+
entries_.reserve(other.get_num_retained());
|
273
|
+
std::copy(other.begin(), other.end(), std::back_inserter(entries_));
|
274
|
+
if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
|
275
|
+
}
|
261
276
|
}
|
262
277
|
|
263
278
|
template<typename A>
|
264
279
|
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
|
265
280
|
std::vector<uint64_t, A>&& entries):
|
266
281
|
is_empty_(is_empty),
|
267
|
-
is_ordered_(is_ordered),
|
282
|
+
is_ordered_(is_ordered || (entries.size() <= 1ULL)),
|
268
283
|
seed_hash_(seed_hash),
|
269
284
|
theta_(theta),
|
270
285
|
entries_(std::move(entries))
|
@@ -321,7 +336,7 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
321
336
|
}
|
322
337
|
|
323
338
|
template<typename A>
|
324
|
-
void compact_theta_sketch_alloc<A>::print_specifics(
|
339
|
+
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
325
340
|
|
326
341
|
template<typename A>
|
327
342
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
@@ -400,33 +415,101 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
400
415
|
const auto preamble_longs = read<uint8_t>(is);
|
401
416
|
const auto serial_version = read<uint8_t>(is);
|
402
417
|
const auto type = read<uint8_t>(is);
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
418
|
+
switch (serial_version) {
|
419
|
+
case SERIAL_VERSION: {
|
420
|
+
read<uint16_t>(is); // unused
|
421
|
+
const auto flags_byte = read<uint8_t>(is);
|
422
|
+
const auto seed_hash = read<uint16_t>(is);
|
423
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
424
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
425
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
426
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
427
|
+
|
428
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
429
|
+
uint32_t num_entries = 0;
|
430
|
+
if (!is_empty) {
|
431
|
+
if (preamble_longs == 1) {
|
432
|
+
num_entries = 1;
|
433
|
+
} else {
|
434
|
+
num_entries = read<uint32_t>(is);
|
435
|
+
read<uint32_t>(is); // unused
|
436
|
+
if (preamble_longs > 2) {
|
437
|
+
theta = read<uint64_t>(is);
|
438
|
+
}
|
439
|
+
}
|
440
|
+
}
|
441
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
442
|
+
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
410
443
|
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
444
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
445
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
446
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
447
|
+
}
|
448
|
+
case 1: {
|
449
|
+
const auto seed_hash = compute_seed_hash(seed);
|
450
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
451
|
+
read<uint8_t>(is); // unused
|
418
452
|
read<uint32_t>(is); // unused
|
419
|
-
|
420
|
-
|
453
|
+
const auto num_entries = read<uint32_t>(is);
|
454
|
+
read<uint32_t>(is); //unused
|
455
|
+
const auto theta = read<uint64_t>(is);
|
456
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
457
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
458
|
+
if (!is_empty)
|
459
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
460
|
+
if (!is.good())
|
461
|
+
throw std::runtime_error("error reading from std::istream");
|
462
|
+
return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
|
463
|
+
}
|
464
|
+
case 2: {
|
465
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
466
|
+
read<uint8_t>(is); // unused
|
467
|
+
read<uint16_t>(is); // unused
|
468
|
+
const uint16_t seed_hash = read<uint16_t>(is);
|
469
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
470
|
+
if (preamble_longs == 1) {
|
471
|
+
if (!is.good())
|
472
|
+
throw std::runtime_error("error reading from std::istream");
|
473
|
+
std::vector<uint64_t> entries(0, 0, allocator);
|
474
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
475
|
+
} else if (preamble_longs == 2) {
|
476
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
477
|
+
read<uint32_t>(is); // unused
|
478
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
479
|
+
if (num_entries == 0) {
|
480
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
481
|
+
}
|
482
|
+
read(is, entries.data(), entries.size() * sizeof(uint64_t));
|
483
|
+
if (!is.good())
|
484
|
+
throw std::runtime_error("error reading from std::istream");
|
485
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
486
|
+
} else if (preamble_longs == 3) {
|
487
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
488
|
+
read<uint32_t>(is); // unused
|
489
|
+
const auto theta = read<uint64_t>(is);
|
490
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
491
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
492
|
+
if (is_empty) {
|
493
|
+
if (!is.good())
|
494
|
+
throw std::runtime_error("error reading from std::istream");
|
495
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
|
496
|
+
} else {
|
497
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
498
|
+
if (!is.good())
|
499
|
+
throw std::runtime_error("error reading from std::istream");
|
500
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
|
501
|
+
}
|
502
|
+
} else {
|
503
|
+
throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
|
421
504
|
}
|
422
|
-
}
|
423
505
|
}
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
506
|
+
default:
|
507
|
+
// this should always fail since the valid cases are handled above
|
508
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
509
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
510
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
511
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
512
|
+
}
|
430
513
|
}
|
431
514
|
|
432
515
|
template<typename A>
|
@@ -533,6 +616,18 @@ auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
533
616
|
return entries_ + num_entries_;
|
534
617
|
}
|
535
618
|
|
619
|
+
template<typename A>
|
620
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
621
|
+
|
622
|
+
template<typename A>
|
623
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
624
|
+
os << "### Retained entries" << std::endl;
|
625
|
+
for (const auto& hash: *this) {
|
626
|
+
os << hash << std::endl;
|
627
|
+
}
|
628
|
+
os << "### End retained entries" << std::endl;
|
629
|
+
}
|
630
|
+
|
536
631
|
} /* namespace datasketches */
|
537
632
|
|
538
633
|
#endif
|
@@ -60,11 +60,16 @@ public:
|
|
60
60
|
*/
|
61
61
|
CompactSketch get_result(bool ordered = true) const;
|
62
62
|
|
63
|
+
/**
|
64
|
+
* Reset the union to the initial empty state
|
65
|
+
*/
|
66
|
+
void reset();
|
67
|
+
|
63
68
|
private:
|
64
69
|
State state_;
|
65
70
|
|
66
71
|
// for builder
|
67
|
-
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
72
|
+
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
68
73
|
};
|
69
74
|
|
70
75
|
template<typename A>
|
@@ -38,7 +38,7 @@ public:
|
|
38
38
|
using resize_factor = typename hash_table::resize_factor;
|
39
39
|
using comparator = compare_by_key<ExtractKey>;
|
40
40
|
|
41
|
-
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
41
|
+
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
42
42
|
|
43
43
|
template<typename FwdSketch>
|
44
44
|
void update(FwdSketch&& sketch);
|
@@ -47,6 +47,8 @@ public:
|
|
47
47
|
|
48
48
|
const Policy& get_policy() const;
|
49
49
|
|
50
|
+
void reset();
|
51
|
+
|
50
52
|
private:
|
51
53
|
Policy policy_;
|
52
54
|
hash_table table_;
|
@@ -28,9 +28,9 @@ namespace datasketches {
|
|
28
28
|
|
29
29
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
30
30
|
theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
31
|
-
uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
31
|
+
float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
32
32
|
policy_(policy),
|
33
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
|
33
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
|
34
34
|
union_theta_(table_.theta_)
|
35
35
|
{}
|
36
36
|
|
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
|
|
84
84
|
return policy_;
|
85
85
|
}
|
86
86
|
|
87
|
+
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
88
|
+
void theta_union_base<EN, EK, P, S, CS, A>::reset() {
|
89
|
+
table_.reset();
|
90
|
+
union_theta_ = table_.theta_;
|
91
|
+
}
|
92
|
+
|
87
93
|
} /* namespace datasketches */
|
88
94
|
|
89
95
|
#endif
|
@@ -23,8 +23,8 @@
|
|
23
23
|
namespace datasketches {
|
24
24
|
|
25
25
|
template<typename A>
|
26
|
-
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
-
state_(lg_cur_size, lg_nom_size, rf, theta, seed, nop_policy(), allocator)
|
26
|
+
theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator):
|
27
|
+
state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator)
|
28
28
|
{}
|
29
29
|
|
30
30
|
template<typename A>
|
@@ -38,14 +38,17 @@ auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
|
|
38
38
|
return state_.get_result(ordered);
|
39
39
|
}
|
40
40
|
|
41
|
+
template<typename A>
|
42
|
+
void theta_union_alloc<A>::reset() {
|
43
|
+
state_.reset();
|
44
|
+
}
|
45
|
+
|
41
46
|
template<typename A>
|
42
47
|
theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
|
43
48
|
|
44
49
|
template<typename A>
|
45
50
|
auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
|
46
|
-
return theta_union_alloc(
|
47
|
-
this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
|
48
|
-
this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
51
|
+
return theta_union_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
49
52
|
}
|
50
53
|
|
51
54
|
} /* namespace datasketches */
|
@@ -40,8 +40,8 @@ struct theta_update_sketch_base {
|
|
40
40
|
using resize_factor = theta_constants::resize_factor;
|
41
41
|
using comparator = compare_by_key<ExtractKey>;
|
42
42
|
|
43
|
-
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
44
|
-
uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
43
|
+
theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
44
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true);
|
45
45
|
theta_update_sketch_base(const theta_update_sketch_base& other);
|
46
46
|
theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
|
47
47
|
~theta_update_sketch_base();
|
@@ -75,6 +75,7 @@ struct theta_update_sketch_base {
|
|
75
75
|
uint8_t lg_cur_size_;
|
76
76
|
uint8_t lg_nom_size_;
|
77
77
|
resize_factor rf_;
|
78
|
+
float p_;
|
78
79
|
uint32_t num_entries_;
|
79
80
|
uint64_t theta_;
|
80
81
|
uint64_t seed_;
|
@@ -83,6 +84,7 @@ struct theta_update_sketch_base {
|
|
83
84
|
void resize();
|
84
85
|
void rebuild();
|
85
86
|
void trim();
|
87
|
+
void reset();
|
86
88
|
|
87
89
|
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
|
88
90
|
static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
|
@@ -94,7 +96,7 @@ struct theta_update_sketch_base {
|
|
94
96
|
template<typename Derived, typename Allocator>
|
95
97
|
class theta_base_builder {
|
96
98
|
public:
|
97
|
-
// TODO: Redundant and deprecated. Will be removed in next major
|
99
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
98
100
|
using resize_factor = theta_constants::resize_factor;
|
99
101
|
static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
|
100
102
|
static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
|
@@ -149,7 +151,6 @@ protected:
|
|
149
151
|
|
150
152
|
uint64_t starting_theta() const;
|
151
153
|
uint8_t starting_lg_size() const;
|
152
|
-
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
|
153
154
|
};
|
154
155
|
|
155
156
|
// key extractor
|
@@ -24,15 +24,18 @@
|
|
24
24
|
#include <sstream>
|
25
25
|
#include <algorithm>
|
26
26
|
|
27
|
+
#include "theta_helpers.hpp"
|
28
|
+
|
27
29
|
namespace datasketches {
|
28
30
|
|
29
31
|
template<typename EN, typename EK, typename A>
|
30
|
-
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
32
|
+
theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
|
31
33
|
allocator_(allocator),
|
32
34
|
is_empty_(is_empty),
|
33
35
|
lg_cur_size_(lg_cur_size),
|
34
36
|
lg_nom_size_(lg_nom_size),
|
35
37
|
rf_(rf),
|
38
|
+
p_(p),
|
36
39
|
num_entries_(0),
|
37
40
|
theta_(theta),
|
38
41
|
seed_(seed),
|
@@ -52,6 +55,7 @@ is_empty_(other.is_empty_),
|
|
52
55
|
lg_cur_size_(other.lg_cur_size_),
|
53
56
|
lg_nom_size_(other.lg_nom_size_),
|
54
57
|
rf_(other.rf_),
|
58
|
+
p_(other.p_),
|
55
59
|
num_entries_(other.num_entries_),
|
56
60
|
theta_(other.theta_),
|
57
61
|
seed_(other.seed_),
|
@@ -77,6 +81,7 @@ is_empty_(other.is_empty_),
|
|
77
81
|
lg_cur_size_(other.lg_cur_size_),
|
78
82
|
lg_nom_size_(other.lg_nom_size_),
|
79
83
|
rf_(other.rf_),
|
84
|
+
p_(other.p_),
|
80
85
|
num_entries_(other.num_entries_),
|
81
86
|
theta_(other.theta_),
|
82
87
|
seed_(other.seed_),
|
@@ -105,6 +110,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
105
110
|
std::swap(lg_cur_size_, copy.lg_cur_size_);
|
106
111
|
std::swap(lg_nom_size_, copy.lg_nom_size_);
|
107
112
|
std::swap(rf_, copy.rf_);
|
113
|
+
std::swap(p_, copy.p_);
|
108
114
|
std::swap(num_entries_, copy.num_entries_);
|
109
115
|
std::swap(theta_, copy.theta_);
|
110
116
|
std::swap(seed_, copy.seed_);
|
@@ -119,6 +125,7 @@ theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operat
|
|
119
125
|
std::swap(lg_cur_size_, other.lg_cur_size_);
|
120
126
|
std::swap(lg_nom_size_, other.lg_nom_size_);
|
121
127
|
std::swap(rf_, other.rf_);
|
128
|
+
std::swap(p_, other.p_);
|
122
129
|
std::swap(num_entries_, other.num_entries_);
|
123
130
|
std::swap(theta_, other.theta_);
|
124
131
|
std::swap(seed_, other.seed_);
|
@@ -247,6 +254,29 @@ void theta_update_sketch_base<EN, EK, A>::trim() {
|
|
247
254
|
if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
|
248
255
|
}
|
249
256
|
|
257
|
+
template<typename EN, typename EK, typename A>
|
258
|
+
void theta_update_sketch_base<EN, EK, A>::reset() {
|
259
|
+
const size_t cur_size = 1ULL << lg_cur_size_;
|
260
|
+
for (size_t i = 0; i < cur_size; ++i) {
|
261
|
+
if (EK()(entries_[i]) != 0) {
|
262
|
+
entries_[i].~EN();
|
263
|
+
EK()(entries_[i]) = 0;
|
264
|
+
}
|
265
|
+
}
|
266
|
+
const uint8_t starting_lg_size = theta_build_helper<true>::starting_sub_multiple(
|
267
|
+
lg_nom_size_ + 1, theta_constants::MIN_LG_K, static_cast<uint8_t>(rf_));
|
268
|
+
if (starting_lg_size != lg_cur_size_) {
|
269
|
+
allocator_.deallocate(entries_, cur_size);
|
270
|
+
lg_cur_size_ = starting_lg_size;
|
271
|
+
const size_t new_size = 1ULL << starting_lg_size;
|
272
|
+
entries_ = allocator_.allocate(new_size);
|
273
|
+
for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
|
274
|
+
}
|
275
|
+
num_entries_ = 0;
|
276
|
+
theta_ = theta_build_helper<true>::starting_theta_from_p(p_);
|
277
|
+
is_empty_ = true;
|
278
|
+
}
|
279
|
+
|
250
280
|
template<typename EN, typename EK, typename A>
|
251
281
|
void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
|
252
282
|
// find the first empty slot
|
@@ -310,18 +340,12 @@ Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
|
|
310
340
|
|
311
341
|
template<typename Derived, typename Allocator>
|
312
342
|
uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
|
313
|
-
|
314
|
-
return theta_constants::MAX_THETA;
|
343
|
+
return theta_build_helper<true>::starting_theta_from_p(p_);
|
315
344
|
}
|
316
345
|
|
317
346
|
template<typename Derived, typename Allocator>
|
318
347
|
uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
|
319
|
-
return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
320
|
-
}
|
321
|
-
|
322
|
-
template<typename Derived, typename Allocator>
|
323
|
-
uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
324
|
-
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
348
|
+
return theta_build_helper<true>::starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
|
325
349
|
}
|
326
350
|
|
327
351
|
// iterator
|
Binary file
|