datasketches 0.2.0 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +7 -7
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
- data/vendor/datasketches-cpp/python/README.md +50 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
- metadata +18 -7
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -26,68 +26,77 @@
|
|
|
26
26
|
#include "serde.hpp"
|
|
27
27
|
#include "binomial_bounds.hpp"
|
|
28
28
|
#include "theta_helpers.hpp"
|
|
29
|
+
#include "compact_theta_sketch_parser.hpp"
|
|
29
30
|
|
|
30
31
|
namespace datasketches {
|
|
31
32
|
|
|
32
33
|
template<typename A>
|
|
33
|
-
bool
|
|
34
|
+
bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
|
|
34
35
|
return get_theta64() < theta_constants::MAX_THETA && !is_empty();
|
|
35
36
|
}
|
|
36
37
|
|
|
37
38
|
template<typename A>
|
|
38
|
-
double
|
|
39
|
+
double base_theta_sketch_alloc<A>::get_theta() const {
|
|
39
40
|
return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
|
|
40
41
|
}
|
|
41
42
|
|
|
42
43
|
template<typename A>
|
|
43
|
-
double
|
|
44
|
+
double base_theta_sketch_alloc<A>::get_estimate() const {
|
|
44
45
|
return get_num_retained() / get_theta();
|
|
45
46
|
}
|
|
46
47
|
|
|
47
48
|
template<typename A>
|
|
48
|
-
double
|
|
49
|
+
double base_theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
|
|
49
50
|
if (!is_estimation_mode()) return get_num_retained();
|
|
50
51
|
return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
51
52
|
}
|
|
52
53
|
|
|
53
54
|
template<typename A>
|
|
54
|
-
double
|
|
55
|
+
double base_theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
|
|
55
56
|
if (!is_estimation_mode()) return get_num_retained();
|
|
56
57
|
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
57
58
|
}
|
|
58
59
|
|
|
59
60
|
template<typename A>
|
|
60
|
-
string<A>
|
|
61
|
-
|
|
61
|
+
string<A> base_theta_sketch_alloc<A>::to_string(bool print_details) const {
|
|
62
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
63
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
64
|
+
std::ostringstream os;
|
|
62
65
|
os << "### Theta sketch summary:" << std::endl;
|
|
63
|
-
os << " num retained entries : " << get_num_retained() << std::endl;
|
|
64
|
-
os << " seed hash : " << get_seed_hash() << std::endl;
|
|
65
|
-
os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
|
|
66
|
-
os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
|
|
67
|
-
os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
|
68
|
-
os << " theta (fraction) : " << get_theta() << std::endl;
|
|
69
|
-
os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
|
|
66
|
+
os << " num retained entries : " << this->get_num_retained() << std::endl;
|
|
67
|
+
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
|
68
|
+
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
|
69
|
+
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
|
70
|
+
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
|
71
|
+
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
|
72
|
+
os << " theta (raw 64-bit) : " << this->get_theta64() << std::endl;
|
|
70
73
|
os << " estimate : " << this->get_estimate() << std::endl;
|
|
71
74
|
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
72
75
|
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
73
76
|
print_specifics(os);
|
|
74
77
|
os << "### End sketch summary" << std::endl;
|
|
75
|
-
if (
|
|
78
|
+
if (print_details) {
|
|
79
|
+
print_items(os);
|
|
80
|
+
}
|
|
81
|
+
return string<A>(os.str().c_str(), this->get_allocator());
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
template<typename A>
|
|
85
|
+
void theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
|
76
86
|
os << "### Retained entries" << std::endl;
|
|
77
87
|
for (const auto& hash: *this) {
|
|
78
88
|
os << hash << std::endl;
|
|
79
89
|
}
|
|
80
90
|
os << "### End retained entries" << std::endl;
|
|
81
|
-
}
|
|
82
|
-
return os.str();
|
|
83
91
|
}
|
|
84
92
|
|
|
93
|
+
|
|
85
94
|
// update sketch
|
|
86
95
|
|
|
87
96
|
template<typename A>
|
|
88
97
|
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
89
|
-
uint64_t theta, uint64_t seed, const A& allocator):
|
|
90
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
|
|
98
|
+
float p, uint64_t theta, uint64_t seed, const A& allocator):
|
|
99
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator)
|
|
91
100
|
{}
|
|
92
101
|
|
|
93
102
|
template<typename A>
|
|
@@ -102,12 +111,12 @@ bool update_theta_sketch_alloc<A>::is_empty() const {
|
|
|
102
111
|
|
|
103
112
|
template<typename A>
|
|
104
113
|
bool update_theta_sketch_alloc<A>::is_ordered() const {
|
|
105
|
-
return false;
|
|
114
|
+
return table_.num_entries_ > 1 ? false : true;
|
|
106
115
|
}
|
|
107
116
|
|
|
108
117
|
template<typename A>
|
|
109
118
|
uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
|
|
110
|
-
return table_.theta_;
|
|
119
|
+
return is_empty() ? theta_constants::MAX_THETA : table_.theta_;
|
|
111
120
|
}
|
|
112
121
|
|
|
113
122
|
template<typename A>
|
|
@@ -201,6 +210,11 @@ void update_theta_sketch_alloc<A>::trim() {
|
|
|
201
210
|
table_.trim();
|
|
202
211
|
}
|
|
203
212
|
|
|
213
|
+
template<typename A>
|
|
214
|
+
void update_theta_sketch_alloc<A>::reset() {
|
|
215
|
+
table_.reset();
|
|
216
|
+
}
|
|
217
|
+
|
|
204
218
|
template<typename A>
|
|
205
219
|
auto update_theta_sketch_alloc<A>::begin() -> iterator {
|
|
206
220
|
return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
|
|
@@ -227,7 +241,7 @@ compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered
|
|
|
227
241
|
}
|
|
228
242
|
|
|
229
243
|
template<typename A>
|
|
230
|
-
void update_theta_sketch_alloc<A>::print_specifics(
|
|
244
|
+
void update_theta_sketch_alloc<A>::print_specifics(std::ostringstream& os) const {
|
|
231
245
|
os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
|
|
232
246
|
os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
|
|
233
247
|
os << " resize factor : " << (1 << table_.rf_) << std::endl;
|
|
@@ -240,29 +254,32 @@ update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_b
|
|
|
240
254
|
|
|
241
255
|
template<typename A>
|
|
242
256
|
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
|
243
|
-
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
|
|
257
|
+
return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->p_, this->starting_theta(), this->seed_, this->allocator_);
|
|
244
258
|
}
|
|
245
259
|
|
|
246
260
|
// compact sketch
|
|
247
261
|
|
|
248
262
|
template<typename A>
|
|
249
|
-
|
|
263
|
+
template<typename Other>
|
|
264
|
+
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Other& other, bool ordered):
|
|
250
265
|
is_empty_(other.is_empty()),
|
|
251
266
|
is_ordered_(other.is_ordered() || ordered),
|
|
252
267
|
seed_hash_(other.get_seed_hash()),
|
|
253
268
|
theta_(other.get_theta64()),
|
|
254
269
|
entries_(other.get_allocator())
|
|
255
270
|
{
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
271
|
+
if (!other.is_empty()) {
|
|
272
|
+
entries_.reserve(other.get_num_retained());
|
|
273
|
+
std::copy(other.begin(), other.end(), std::back_inserter(entries_));
|
|
274
|
+
if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
|
|
275
|
+
}
|
|
259
276
|
}
|
|
260
277
|
|
|
261
278
|
template<typename A>
|
|
262
279
|
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
|
|
263
280
|
std::vector<uint64_t, A>&& entries):
|
|
264
281
|
is_empty_(is_empty),
|
|
265
|
-
is_ordered_(is_ordered),
|
|
282
|
+
is_ordered_(is_ordered || (entries.size() <= 1ULL)),
|
|
266
283
|
seed_hash_(seed_hash),
|
|
267
284
|
theta_(theta),
|
|
268
285
|
entries_(std::move(entries))
|
|
@@ -290,7 +307,7 @@ uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
|
|
|
290
307
|
|
|
291
308
|
template<typename A>
|
|
292
309
|
uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
|
|
293
|
-
return entries_.size();
|
|
310
|
+
return static_cast<uint32_t>(entries_.size());
|
|
294
311
|
}
|
|
295
312
|
|
|
296
313
|
template<typename A>
|
|
@@ -300,58 +317,58 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
|
300
317
|
|
|
301
318
|
template<typename A>
|
|
302
319
|
auto compact_theta_sketch_alloc<A>::begin() -> iterator {
|
|
303
|
-
return iterator(entries_.data(), entries_.size(), 0);
|
|
320
|
+
return iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
|
|
304
321
|
}
|
|
305
322
|
|
|
306
323
|
template<typename A>
|
|
307
324
|
auto compact_theta_sketch_alloc<A>::end() -> iterator {
|
|
308
|
-
return iterator(nullptr, 0, entries_.size());
|
|
325
|
+
return iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
|
|
309
326
|
}
|
|
310
327
|
|
|
311
328
|
template<typename A>
|
|
312
329
|
auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
|
|
313
|
-
return const_iterator(entries_.data(), entries_.size(), 0);
|
|
330
|
+
return const_iterator(entries_.data(), static_cast<uint32_t>(entries_.size()), 0);
|
|
314
331
|
}
|
|
315
332
|
|
|
316
333
|
template<typename A>
|
|
317
334
|
auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
318
|
-
return const_iterator(nullptr, 0, entries_.size());
|
|
335
|
+
return const_iterator(nullptr, 0, static_cast<uint32_t>(entries_.size()));
|
|
319
336
|
}
|
|
320
337
|
|
|
321
338
|
template<typename A>
|
|
322
|
-
void compact_theta_sketch_alloc<A>::print_specifics(
|
|
339
|
+
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
|
323
340
|
|
|
324
341
|
template<typename A>
|
|
325
342
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
326
343
|
const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
|
|
327
344
|
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
328
|
-
|
|
345
|
+
write(os, preamble_longs);
|
|
329
346
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
330
|
-
|
|
347
|
+
write(os, serial_version);
|
|
331
348
|
const uint8_t type = SKETCH_TYPE;
|
|
332
|
-
|
|
349
|
+
write(os, type);
|
|
333
350
|
const uint16_t unused16 = 0;
|
|
334
|
-
|
|
351
|
+
write(os, unused16);
|
|
335
352
|
const uint8_t flags_byte(
|
|
336
353
|
(1 << flags::IS_COMPACT) |
|
|
337
354
|
(1 << flags::IS_READ_ONLY) |
|
|
338
355
|
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
339
356
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
340
357
|
);
|
|
341
|
-
|
|
358
|
+
write(os, flags_byte);
|
|
342
359
|
const uint16_t seed_hash = get_seed_hash();
|
|
343
|
-
|
|
360
|
+
write(os, seed_hash);
|
|
344
361
|
if (!this->is_empty()) {
|
|
345
362
|
if (!is_single_item) {
|
|
346
|
-
const uint32_t num_entries = entries_.size();
|
|
347
|
-
|
|
363
|
+
const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
364
|
+
write(os, num_entries);
|
|
348
365
|
const uint32_t unused32 = 0;
|
|
349
|
-
|
|
366
|
+
write(os, unused32);
|
|
350
367
|
if (this->is_estimation_mode()) {
|
|
351
|
-
|
|
368
|
+
write(os, this->theta_);
|
|
352
369
|
}
|
|
353
370
|
}
|
|
354
|
-
|
|
371
|
+
write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
|
|
355
372
|
}
|
|
356
373
|
}
|
|
357
374
|
|
|
@@ -364,30 +381,28 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
|
364
381
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
|
365
382
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
366
383
|
|
|
367
|
-
ptr += copy_to_mem(
|
|
384
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
368
385
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
369
|
-
ptr += copy_to_mem(
|
|
386
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
370
387
|
const uint8_t type = SKETCH_TYPE;
|
|
371
|
-
ptr += copy_to_mem(
|
|
372
|
-
|
|
373
|
-
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
|
|
388
|
+
ptr += copy_to_mem(type, ptr);
|
|
389
|
+
ptr += sizeof(uint16_t); // unused
|
|
374
390
|
const uint8_t flags_byte(
|
|
375
391
|
(1 << flags::IS_COMPACT) |
|
|
376
392
|
(1 << flags::IS_READ_ONLY) |
|
|
377
393
|
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
|
378
394
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
|
379
395
|
);
|
|
380
|
-
ptr += copy_to_mem(
|
|
396
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
381
397
|
const uint16_t seed_hash = get_seed_hash();
|
|
382
|
-
ptr += copy_to_mem(
|
|
398
|
+
ptr += copy_to_mem(seed_hash, ptr);
|
|
383
399
|
if (!this->is_empty()) {
|
|
384
400
|
if (!is_single_item) {
|
|
385
|
-
const uint32_t num_entries = entries_.size();
|
|
386
|
-
ptr += copy_to_mem(
|
|
387
|
-
|
|
388
|
-
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
401
|
+
const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
|
|
402
|
+
ptr += copy_to_mem(num_entries, ptr);
|
|
403
|
+
ptr += sizeof(uint32_t);
|
|
389
404
|
if (this->is_estimation_mode()) {
|
|
390
|
-
ptr += copy_to_mem(
|
|
405
|
+
ptr += copy_to_mem(theta_, ptr);
|
|
391
406
|
}
|
|
392
407
|
}
|
|
393
408
|
ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
|
|
@@ -397,43 +412,104 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
|
397
412
|
|
|
398
413
|
template<typename A>
|
|
399
414
|
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
uint8_t
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
415
|
+
const auto preamble_longs = read<uint8_t>(is);
|
|
416
|
+
const auto serial_version = read<uint8_t>(is);
|
|
417
|
+
const auto type = read<uint8_t>(is);
|
|
418
|
+
switch (serial_version) {
|
|
419
|
+
case SERIAL_VERSION: {
|
|
420
|
+
read<uint16_t>(is); // unused
|
|
421
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
422
|
+
const auto seed_hash = read<uint16_t>(is);
|
|
423
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
424
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
425
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
426
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
427
|
+
|
|
428
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
429
|
+
uint32_t num_entries = 0;
|
|
430
|
+
if (!is_empty) {
|
|
431
|
+
if (preamble_longs == 1) {
|
|
432
|
+
num_entries = 1;
|
|
433
|
+
} else {
|
|
434
|
+
num_entries = read<uint32_t>(is);
|
|
435
|
+
read<uint32_t>(is); // unused
|
|
436
|
+
if (preamble_longs > 2) {
|
|
437
|
+
theta = read<uint64_t>(is);
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
442
|
+
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
|
416
443
|
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
444
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
445
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
446
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
447
|
+
}
|
|
448
|
+
case 1: {
|
|
449
|
+
const auto seed_hash = compute_seed_hash(seed);
|
|
450
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
451
|
+
read<uint8_t>(is); // unused
|
|
452
|
+
read<uint32_t>(is); // unused
|
|
453
|
+
const auto num_entries = read<uint32_t>(is);
|
|
454
|
+
read<uint32_t>(is); //unused
|
|
455
|
+
const auto theta = read<uint64_t>(is);
|
|
456
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
457
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
458
|
+
if (!is_empty)
|
|
459
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
|
460
|
+
if (!is.good())
|
|
461
|
+
throw std::runtime_error("error reading from std::istream");
|
|
462
|
+
return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
|
|
463
|
+
}
|
|
464
|
+
case 2: {
|
|
465
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
466
|
+
read<uint8_t>(is); // unused
|
|
467
|
+
read<uint16_t>(is); // unused
|
|
468
|
+
const uint16_t seed_hash = read<uint16_t>(is);
|
|
469
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
470
|
+
if (preamble_longs == 1) {
|
|
471
|
+
if (!is.good())
|
|
472
|
+
throw std::runtime_error("error reading from std::istream");
|
|
473
|
+
std::vector<uint64_t> entries(0, 0, allocator);
|
|
474
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
|
475
|
+
} else if (preamble_longs == 2) {
|
|
476
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
|
477
|
+
read<uint32_t>(is); // unused
|
|
478
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
479
|
+
if (num_entries == 0) {
|
|
480
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
|
481
|
+
}
|
|
482
|
+
read(is, entries.data(), entries.size() * sizeof(uint64_t));
|
|
483
|
+
if (!is.good())
|
|
484
|
+
throw std::runtime_error("error reading from std::istream");
|
|
485
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
|
486
|
+
} else if (preamble_longs == 3) {
|
|
487
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
|
488
|
+
read<uint32_t>(is); // unused
|
|
489
|
+
const auto theta = read<uint64_t>(is);
|
|
490
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
491
|
+
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
492
|
+
if (is_empty) {
|
|
493
|
+
if (!is.good())
|
|
494
|
+
throw std::runtime_error("error reading from std::istream");
|
|
495
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
|
|
496
|
+
} else {
|
|
497
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
|
498
|
+
if (!is.good())
|
|
499
|
+
throw std::runtime_error("error reading from std::istream");
|
|
500
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
|
|
501
|
+
}
|
|
502
|
+
} else {
|
|
503
|
+
throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
|
|
428
504
|
}
|
|
429
|
-
}
|
|
430
505
|
}
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
506
|
+
default:
|
|
507
|
+
// this should always fail since the valid cases are handled above
|
|
508
|
+
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
509
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
|
510
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
|
511
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
|
512
|
+
}
|
|
437
513
|
}
|
|
438
514
|
|
|
439
515
|
template<typename A>
|
|
@@ -442,17 +518,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
|
|
|
442
518
|
const char* ptr = static_cast<const char*>(bytes);
|
|
443
519
|
const char* base = ptr;
|
|
444
520
|
uint8_t preamble_longs;
|
|
445
|
-
ptr += copy_from_mem(ptr,
|
|
521
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
|
446
522
|
uint8_t serial_version;
|
|
447
|
-
ptr += copy_from_mem(ptr,
|
|
523
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
448
524
|
uint8_t type;
|
|
449
|
-
ptr += copy_from_mem(ptr,
|
|
450
|
-
uint16_t
|
|
451
|
-
ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
|
|
525
|
+
ptr += copy_from_mem(ptr, type);
|
|
526
|
+
ptr += sizeof(uint16_t); // unused
|
|
452
527
|
uint8_t flags_byte;
|
|
453
|
-
ptr += copy_from_mem(ptr,
|
|
528
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
454
529
|
uint16_t seed_hash;
|
|
455
|
-
ptr += copy_from_mem(ptr,
|
|
530
|
+
ptr += copy_from_mem(ptr, seed_hash);
|
|
456
531
|
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
457
532
|
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
458
533
|
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
@@ -465,12 +540,11 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
|
|
|
465
540
|
num_entries = 1;
|
|
466
541
|
} else {
|
|
467
542
|
ensure_minimum_memory(size, 8); // read the first prelong before this method
|
|
468
|
-
ptr += copy_from_mem(ptr,
|
|
469
|
-
uint32_t
|
|
470
|
-
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
543
|
+
ptr += copy_from_mem(ptr, num_entries);
|
|
544
|
+
ptr += sizeof(uint32_t); // unused
|
|
471
545
|
if (preamble_longs > 2) {
|
|
472
546
|
ensure_minimum_memory(size, (preamble_longs - 1) << 3);
|
|
473
|
-
ptr += copy_from_mem(ptr,
|
|
547
|
+
ptr += copy_from_mem(ptr, theta);
|
|
474
548
|
}
|
|
475
549
|
}
|
|
476
550
|
}
|
|
@@ -483,7 +557,77 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
|
|
|
483
557
|
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
484
558
|
}
|
|
485
559
|
|
|
560
|
+
// wrapped compact sketch
|
|
561
|
+
|
|
562
|
+
template<typename A>
|
|
563
|
+
wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
|
|
564
|
+
uint64_t theta, const uint64_t* entries):
|
|
565
|
+
is_empty_(is_empty),
|
|
566
|
+
is_ordered_(is_ordered),
|
|
567
|
+
seed_hash_(seed_hash),
|
|
568
|
+
num_entries_(num_entries),
|
|
569
|
+
theta_(theta),
|
|
570
|
+
entries_(entries)
|
|
571
|
+
{}
|
|
572
|
+
|
|
573
|
+
template<typename A>
|
|
574
|
+
const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
|
|
575
|
+
auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
|
|
576
|
+
return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
template<typename A>
|
|
580
|
+
A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
|
|
581
|
+
return A();
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
template<typename A>
|
|
585
|
+
bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
|
|
586
|
+
return is_empty_;
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
template<typename A>
|
|
590
|
+
bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
|
|
591
|
+
return is_ordered_;
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
template<typename A>
|
|
595
|
+
uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
|
|
596
|
+
return theta_;
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
template<typename A>
|
|
600
|
+
uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
|
|
601
|
+
return static_cast<uint32_t>(num_entries_);
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
template<typename A>
|
|
605
|
+
uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
606
|
+
return seed_hash_;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
template<typename A>
|
|
610
|
+
auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
|
|
611
|
+
return entries_;
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
template<typename A>
|
|
615
|
+
auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
616
|
+
return entries_ + num_entries_;
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
template<typename A>
|
|
620
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
|
621
|
+
|
|
622
|
+
template<typename A>
|
|
623
|
+
void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
|
624
|
+
os << "### Retained entries" << std::endl;
|
|
625
|
+
for (const auto& hash: *this) {
|
|
626
|
+
os << hash << std::endl;
|
|
627
|
+
}
|
|
628
|
+
os << "### End retained entries" << std::endl;
|
|
629
|
+
}
|
|
630
|
+
|
|
486
631
|
} /* namespace datasketches */
|
|
487
632
|
|
|
488
633
|
#endif
|
|
489
|
-
|
|
@@ -35,13 +35,13 @@ public:
|
|
|
35
35
|
using CompactSketch = compact_theta_sketch_alloc<Allocator>;
|
|
36
36
|
using resize_factor = theta_constants::resize_factor;
|
|
37
37
|
|
|
38
|
-
struct
|
|
39
|
-
|
|
38
|
+
struct nop_policy {
|
|
39
|
+
void operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
|
|
40
|
+
unused(internal_entry);
|
|
40
41
|
unused(incoming_entry);
|
|
41
|
-
return internal_entry;
|
|
42
42
|
}
|
|
43
43
|
};
|
|
44
|
-
using State = theta_union_base<Entry, ExtractKey,
|
|
44
|
+
using State = theta_union_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>;
|
|
45
45
|
|
|
46
46
|
// No constructor here. Use builder instead.
|
|
47
47
|
class builder;
|
|
@@ -60,11 +60,16 @@ public:
|
|
|
60
60
|
*/
|
|
61
61
|
CompactSketch get_result(bool ordered = true) const;
|
|
62
62
|
|
|
63
|
+
/**
|
|
64
|
+
* Reset the union to the initial empty state
|
|
65
|
+
*/
|
|
66
|
+
void reset();
|
|
67
|
+
|
|
63
68
|
private:
|
|
64
69
|
State state_;
|
|
65
70
|
|
|
66
71
|
// for builder
|
|
67
|
-
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
|
72
|
+
theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator);
|
|
68
73
|
};
|
|
69
74
|
|
|
70
75
|
template<typename A>
|
|
@@ -38,7 +38,7 @@ public:
|
|
|
38
38
|
using resize_factor = typename hash_table::resize_factor;
|
|
39
39
|
using comparator = compare_by_key<ExtractKey>;
|
|
40
40
|
|
|
41
|
-
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
|
41
|
+
theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
|
|
42
42
|
|
|
43
43
|
template<typename FwdSketch>
|
|
44
44
|
void update(FwdSketch&& sketch);
|
|
@@ -47,6 +47,8 @@ public:
|
|
|
47
47
|
|
|
48
48
|
const Policy& get_policy() const;
|
|
49
49
|
|
|
50
|
+
void reset();
|
|
51
|
+
|
|
50
52
|
private:
|
|
51
53
|
Policy policy_;
|
|
52
54
|
hash_table table_;
|
|
@@ -28,9 +28,9 @@ namespace datasketches {
|
|
|
28
28
|
|
|
29
29
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
|
30
30
|
theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
31
|
-
uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
|
31
|
+
float p, uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
|
|
32
32
|
policy_(policy),
|
|
33
|
-
table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
|
|
33
|
+
table_(lg_cur_size, lg_nom_size, rf, p, theta, seed, allocator),
|
|
34
34
|
union_theta_(table_.theta_)
|
|
35
35
|
{}
|
|
36
36
|
|
|
@@ -43,7 +43,7 @@ void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
43
43
|
if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
|
|
44
44
|
for (auto& entry: sketch) {
|
|
45
45
|
const uint64_t hash = EK()(entry);
|
|
46
|
-
if (hash < union_theta_) {
|
|
46
|
+
if (hash < union_theta_ && hash < table_.theta_) {
|
|
47
47
|
auto result = table_.find(hash);
|
|
48
48
|
if (!result.second) {
|
|
49
49
|
table_.insert(result.first, conditional_forward<SS>(entry));
|
|
@@ -84,6 +84,12 @@ const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
|
|
|
84
84
|
return policy_;
|
|
85
85
|
}
|
|
86
86
|
|
|
87
|
+
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
|
88
|
+
void theta_union_base<EN, EK, P, S, CS, A>::reset() {
|
|
89
|
+
table_.reset();
|
|
90
|
+
union_theta_ = table_.theta_;
|
|
91
|
+
}
|
|
92
|
+
|
|
87
93
|
} /* namespace datasketches */
|
|
88
94
|
|
|
89
95
|
#endif
|