datasketches 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/cpc_wrapper.cpp +12 -13
- data/ext/datasketches/ext.cpp +1 -1
- data/ext/datasketches/ext.h +4 -0
- data/ext/datasketches/extconf.rb +1 -1
- data/ext/datasketches/fi_wrapper.cpp +6 -8
- data/ext/datasketches/hll_wrapper.cpp +13 -14
- data/ext/datasketches/kll_wrapper.cpp +28 -76
- data/ext/datasketches/theta_wrapper.cpp +27 -41
- data/ext/datasketches/vo_wrapper.cpp +4 -6
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/README.md +4 -4
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +7 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +3 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +28 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +8 -5
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +19 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +6 -6
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +0 -6
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +9 -9
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +40 -28
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +140 -124
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +32 -57
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +9 -8
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +34 -48
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +10 -10
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +45 -77
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +11 -12
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +15 -14
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +10 -21
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -3
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +28 -55
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +8 -8
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +9 -11
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +2 -1
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +34 -31
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +3 -28
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/hll.hpp +6 -34
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +7 -7
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +46 -50
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +3 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +10 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +93 -75
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +45 -42
- data/vendor/datasketches-cpp/python/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/python/README.md +6 -3
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -2
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +3 -1
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +36 -26
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -1
- data/vendor/datasketches-cpp/python/tests/kll_test.py +3 -3
- data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
- data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +17 -8
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +501 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +82 -70
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -31
- data/vendor/datasketches-cpp/setup.py +5 -3
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +2 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +2 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +22 -29
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
- data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
- data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +6 -22
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +132 -266
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +200 -650
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +5 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +3 -19
- data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +6 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +2 -3
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -234
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +6 -6
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -4
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +2 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -4
- metadata +43 -34
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
|
@@ -0,0 +1,810 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef REQ_SKETCH_IMPL_HPP_
|
|
21
|
+
#define REQ_SKETCH_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <sstream>
|
|
24
|
+
#include <stdexcept>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
template<typename T, typename C, typename S, typename A>
|
|
29
|
+
req_sketch<T, C, S, A>::req_sketch(uint16_t k, bool hra, const A& allocator):
|
|
30
|
+
allocator_(allocator),
|
|
31
|
+
k_(std::max(static_cast<int>(k) & -2, static_cast<int>(req_constants::MIN_K))), //rounds down one if odd
|
|
32
|
+
hra_(hra),
|
|
33
|
+
max_nom_size_(0),
|
|
34
|
+
num_retained_(0),
|
|
35
|
+
n_(0),
|
|
36
|
+
compactors_(allocator),
|
|
37
|
+
min_value_(nullptr),
|
|
38
|
+
max_value_(nullptr)
|
|
39
|
+
{
|
|
40
|
+
grow();
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
template<typename T, typename C, typename S, typename A>
|
|
44
|
+
req_sketch<T, C, S, A>::~req_sketch() {
|
|
45
|
+
if (min_value_ != nullptr) {
|
|
46
|
+
min_value_->~T();
|
|
47
|
+
allocator_.deallocate(min_value_, 1);
|
|
48
|
+
}
|
|
49
|
+
if (max_value_ != nullptr) {
|
|
50
|
+
max_value_->~T();
|
|
51
|
+
allocator_.deallocate(max_value_, 1);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
template<typename T, typename C, typename S, typename A>
|
|
56
|
+
req_sketch<T, C, S, A>::req_sketch(const req_sketch& other):
|
|
57
|
+
allocator_(other.allocator_),
|
|
58
|
+
k_(other.k_),
|
|
59
|
+
hra_(other.hra_),
|
|
60
|
+
max_nom_size_(other.max_nom_size_),
|
|
61
|
+
num_retained_(other.num_retained_),
|
|
62
|
+
n_(other.n_),
|
|
63
|
+
compactors_(other.compactors_),
|
|
64
|
+
min_value_(nullptr),
|
|
65
|
+
max_value_(nullptr)
|
|
66
|
+
{
|
|
67
|
+
if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
|
|
68
|
+
if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
template<typename T, typename C, typename S, typename A>
|
|
72
|
+
req_sketch<T, C, S, A>::req_sketch(req_sketch&& other) noexcept :
|
|
73
|
+
allocator_(std::move(other.allocator_)),
|
|
74
|
+
k_(other.k_),
|
|
75
|
+
hra_(other.hra_),
|
|
76
|
+
max_nom_size_(other.max_nom_size_),
|
|
77
|
+
num_retained_(other.num_retained_),
|
|
78
|
+
n_(other.n_),
|
|
79
|
+
compactors_(std::move(other.compactors_)),
|
|
80
|
+
min_value_(other.min_value_),
|
|
81
|
+
max_value_(other.max_value_)
|
|
82
|
+
{
|
|
83
|
+
other.min_value_ = nullptr;
|
|
84
|
+
other.max_value_ = nullptr;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
template<typename T, typename C, typename S, typename A>
|
|
88
|
+
req_sketch<T, C, S, A>& req_sketch<T, C, S, A>::operator=(const req_sketch& other) {
|
|
89
|
+
req_sketch copy(other);
|
|
90
|
+
std::swap(allocator_, copy.allocator_);
|
|
91
|
+
std::swap(k_, copy.k_);
|
|
92
|
+
std::swap(hra_, copy.hra_);
|
|
93
|
+
std::swap(max_nom_size_, copy.max_nom_size_);
|
|
94
|
+
std::swap(num_retained_, copy.num_retained_);
|
|
95
|
+
std::swap(n_, copy.n_);
|
|
96
|
+
std::swap(compactors_, copy.compactors_);
|
|
97
|
+
std::swap(min_value_, copy.min_value_);
|
|
98
|
+
std::swap(max_value_, copy.max_value_);
|
|
99
|
+
return *this;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
template<typename T, typename C, typename S, typename A>
|
|
103
|
+
req_sketch<T, C, S, A>& req_sketch<T, C, S, A>::operator=(req_sketch&& other) {
|
|
104
|
+
std::swap(allocator_, other.allocator_);
|
|
105
|
+
std::swap(k_, other.k_);
|
|
106
|
+
std::swap(hra_, other.hra_);
|
|
107
|
+
std::swap(max_nom_size_, other.max_nom_size_);
|
|
108
|
+
std::swap(num_retained_, other.num_retained_);
|
|
109
|
+
std::swap(n_, other.n_);
|
|
110
|
+
std::swap(compactors_, other.compactors_);
|
|
111
|
+
std::swap(min_value_, other.min_value_);
|
|
112
|
+
std::swap(max_value_, other.max_value_);
|
|
113
|
+
return *this;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
template<typename T, typename C, typename S, typename A>
|
|
117
|
+
uint16_t req_sketch<T, C, S, A>::get_k() const {
|
|
118
|
+
return k_;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
template<typename T, typename C, typename S, typename A>
|
|
122
|
+
bool req_sketch<T, C, S, A>::is_HRA() const {
|
|
123
|
+
return hra_;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
template<typename T, typename C, typename S, typename A>
|
|
127
|
+
bool req_sketch<T, C, S, A>::is_empty() const {
|
|
128
|
+
return n_ == 0;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
template<typename T, typename C, typename S, typename A>
|
|
132
|
+
uint64_t req_sketch<T, C, S, A>::get_n() const {
|
|
133
|
+
return n_;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
template<typename T, typename C, typename S, typename A>
|
|
137
|
+
uint32_t req_sketch<T, C, S, A>::get_num_retained() const {
|
|
138
|
+
return num_retained_;
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
template<typename T, typename C, typename S, typename A>
|
|
142
|
+
bool req_sketch<T, C, S, A>::is_estimation_mode() const {
|
|
143
|
+
return compactors_.size() > 1;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
template<typename T, typename C, typename S, typename A>
|
|
147
|
+
template<typename FwdT>
|
|
148
|
+
void req_sketch<T, C, S, A>::update(FwdT&& item) {
|
|
149
|
+
if (!check_update_value(item)) { return; }
|
|
150
|
+
if (is_empty()) {
|
|
151
|
+
min_value_ = new (allocator_.allocate(1)) T(item);
|
|
152
|
+
max_value_ = new (allocator_.allocate(1)) T(item);
|
|
153
|
+
} else {
|
|
154
|
+
if (C()(item, *min_value_)) *min_value_ = item;
|
|
155
|
+
if (C()(*max_value_, item)) *max_value_ = item;
|
|
156
|
+
}
|
|
157
|
+
compactors_[0].append(std::forward<FwdT>(item));
|
|
158
|
+
++num_retained_;
|
|
159
|
+
++n_;
|
|
160
|
+
if (num_retained_ == max_nom_size_) compress();
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
template<typename T, typename C, typename S, typename A>
|
|
164
|
+
template<typename FwdSk>
|
|
165
|
+
void req_sketch<T, C, S, A>::merge(FwdSk&& other) {
|
|
166
|
+
if (is_HRA() != other.is_HRA()) throw std::invalid_argument("merging HRA and LRA is not valid");
|
|
167
|
+
if (other.is_empty()) return;
|
|
168
|
+
if (is_empty()) {
|
|
169
|
+
min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
|
|
170
|
+
max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
|
|
171
|
+
} else {
|
|
172
|
+
if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
|
|
173
|
+
if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
|
|
174
|
+
}
|
|
175
|
+
// grow until this has at least as many compactors as other
|
|
176
|
+
while (get_num_levels() < other.get_num_levels()) grow();
|
|
177
|
+
// merge the items in all height compactors
|
|
178
|
+
for (size_t i = 0; i < other.get_num_levels(); ++i) {
|
|
179
|
+
compactors_[i].merge(conditional_forward<FwdSk>(other.compactors_[i]));
|
|
180
|
+
}
|
|
181
|
+
n_ += other.n_;
|
|
182
|
+
update_max_nom_size();
|
|
183
|
+
update_num_retained();
|
|
184
|
+
if (num_retained_ >= max_nom_size_) compress();
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
template<typename T, typename C, typename S, typename A>
|
|
188
|
+
const T& req_sketch<T, C, S, A>::get_min_value() const {
|
|
189
|
+
if (is_empty()) return get_invalid_value();
|
|
190
|
+
return *min_value_;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
template<typename T, typename C, typename S, typename A>
|
|
194
|
+
const T& req_sketch<T, C, S, A>::get_max_value() const {
|
|
195
|
+
if (is_empty()) return get_invalid_value();
|
|
196
|
+
return *max_value_;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
template<typename T, typename C, typename S, typename A>
|
|
200
|
+
template<bool inclusive>
|
|
201
|
+
double req_sketch<T, C, S, A>::get_rank(const T& item) const {
|
|
202
|
+
uint64_t weight = 0;
|
|
203
|
+
for (const auto& compactor: compactors_) {
|
|
204
|
+
weight += compactor.template compute_weight<inclusive>(item);
|
|
205
|
+
}
|
|
206
|
+
return static_cast<double>(weight) / n_;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
template<typename T, typename C, typename S, typename A>
|
|
210
|
+
template<bool inclusive>
|
|
211
|
+
auto req_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
|
|
212
|
+
auto buckets = get_CDF<inclusive>(split_points, size);
|
|
213
|
+
for (uint32_t i = size; i > 0; --i) {
|
|
214
|
+
buckets[i] -= buckets[i - 1];
|
|
215
|
+
}
|
|
216
|
+
return buckets;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
template<typename T, typename C, typename S, typename A>
|
|
220
|
+
template<bool inclusive>
|
|
221
|
+
auto req_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
|
|
222
|
+
vector_double buckets(allocator_);
|
|
223
|
+
if (is_empty()) return buckets;
|
|
224
|
+
check_split_points(split_points, size);
|
|
225
|
+
buckets.reserve(size + 1);
|
|
226
|
+
for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank<inclusive>(split_points[i]));
|
|
227
|
+
buckets.push_back(1);
|
|
228
|
+
return buckets;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
template<typename T, typename C, typename S, typename A>
|
|
232
|
+
template<bool inclusive>
|
|
233
|
+
const T& req_sketch<T, C, S, A>::get_quantile(double rank) const {
|
|
234
|
+
if (is_empty()) return get_invalid_value();
|
|
235
|
+
if (rank == 0.0) return *min_value_;
|
|
236
|
+
if (rank == 1.0) return *max_value_;
|
|
237
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
238
|
+
throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
|
|
239
|
+
}
|
|
240
|
+
return *(get_quantile_calculator<inclusive>()->get_quantile(rank));
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
template<typename T, typename C, typename S, typename A>
|
|
244
|
+
template<bool inclusive>
|
|
245
|
+
std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
|
|
246
|
+
std::vector<T, A> quantiles(allocator_);
|
|
247
|
+
if (is_empty()) return quantiles;
|
|
248
|
+
QuantileCalculatorPtr quantile_calculator(nullptr, calculator_deleter(allocator_));
|
|
249
|
+
quantiles.reserve(size);
|
|
250
|
+
for (uint32_t i = 0; i < size; ++i) {
|
|
251
|
+
const double rank = ranks[i];
|
|
252
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
253
|
+
throw std::invalid_argument("rank cannot be less than zero or greater than 1.0");
|
|
254
|
+
}
|
|
255
|
+
if (rank == 0.0) quantiles.push_back(*min_value_);
|
|
256
|
+
else if (rank == 1.0) quantiles.push_back(*max_value_);
|
|
257
|
+
else {
|
|
258
|
+
if (!quantile_calculator) {
|
|
259
|
+
// has side effect of sorting level zero if needed
|
|
260
|
+
quantile_calculator = const_cast<req_sketch*>(this)->get_quantile_calculator<inclusive>();
|
|
261
|
+
}
|
|
262
|
+
quantiles.push_back(*(quantile_calculator->get_quantile(rank)));
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
return quantiles;
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
template<typename T, typename C, typename S, typename A>
|
|
269
|
+
class req_sketch<T, C, S, A>::calculator_deleter {
|
|
270
|
+
public:
|
|
271
|
+
calculator_deleter(const AllocCalc& allocator): allocator_(allocator) {}
|
|
272
|
+
void operator() (QuantileCalculator* ptr) {
|
|
273
|
+
if (ptr != nullptr) {
|
|
274
|
+
ptr->~QuantileCalculator();
|
|
275
|
+
allocator_.deallocate(ptr, 1);
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
private:
|
|
279
|
+
AllocCalc allocator_;
|
|
280
|
+
};
|
|
281
|
+
|
|
282
|
+
template<typename T, typename C, typename S, typename A>
|
|
283
|
+
template<bool inclusive>
|
|
284
|
+
auto req_sketch<T, C, S, A>::get_quantile_calculator() const -> QuantileCalculatorPtr {
|
|
285
|
+
if (!compactors_[0].is_sorted()) {
|
|
286
|
+
const_cast<Compactor&>(compactors_[0]).sort(); // allow this side effect
|
|
287
|
+
}
|
|
288
|
+
AllocCalc ac(allocator_);
|
|
289
|
+
QuantileCalculatorPtr quantile_calculator(
|
|
290
|
+
new (ac.allocate(1)) req_quantile_calculator<T, C, A>(n_, ac),
|
|
291
|
+
calculator_deleter(ac)
|
|
292
|
+
);
|
|
293
|
+
|
|
294
|
+
for (auto& compactor: compactors_) {
|
|
295
|
+
quantile_calculator->add(compactor.begin(), compactor.end(), compactor.get_lg_weight());
|
|
296
|
+
}
|
|
297
|
+
quantile_calculator->template convert_to_cummulative<inclusive>();
|
|
298
|
+
return quantile_calculator;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
template<typename T, typename C, typename S, typename A>
|
|
302
|
+
double req_sketch<T, C, S, A>::get_rank_lower_bound(double rank, uint8_t num_std_dev) const {
|
|
303
|
+
return get_rank_lb(get_k(), get_num_levels(), rank, num_std_dev, get_n(), hra_);
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
template<typename T, typename C, typename S, typename A>
|
|
307
|
+
double req_sketch<T, C, S, A>::get_rank_upper_bound(double rank, uint8_t num_std_dev) const {
|
|
308
|
+
return get_rank_ub(get_k(), get_num_levels(), rank, num_std_dev, get_n(), hra_);
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
template<typename T, typename C, typename S, typename A>
|
|
312
|
+
double req_sketch<T, C, S, A>::get_RSE(uint16_t k, double rank, bool hra, uint64_t n) {
|
|
313
|
+
return get_rank_lb(k, 2, rank, 1, n, hra);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
template<typename T, typename C, typename S, typename A>
|
|
317
|
+
double req_sketch<T, C, S, A>::get_rank_lb(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra) {
|
|
318
|
+
if (is_exact_rank(k, num_levels, rank, n, hra)) return rank;
|
|
319
|
+
const double relative = relative_rse_factor() / k * (hra ? 1.0 - rank : rank);
|
|
320
|
+
const double fixed = FIXED_RSE_FACTOR / k;
|
|
321
|
+
const double lb_rel = rank - num_std_dev * relative;
|
|
322
|
+
const double lb_fix = rank - num_std_dev * fixed;
|
|
323
|
+
return std::max(lb_rel, lb_fix);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
template<typename T, typename C, typename S, typename A>
|
|
327
|
+
double req_sketch<T, C, S, A>::get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra) {
|
|
328
|
+
if (is_exact_rank(k, num_levels, rank, n, hra)) return rank;
|
|
329
|
+
const double relative = relative_rse_factor() / k * (hra ? 1.0 - rank : rank);
|
|
330
|
+
const double fixed = FIXED_RSE_FACTOR / k;
|
|
331
|
+
const double ub_rel = rank + num_std_dev * relative;
|
|
332
|
+
const double ub_fix = rank + num_std_dev * fixed;
|
|
333
|
+
return std::min(ub_rel, ub_fix);
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
template<typename T, typename C, typename S, typename A>
|
|
337
|
+
bool req_sketch<T, C, S, A>::is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra) {
|
|
338
|
+
const unsigned base_cap = k * req_constants::INIT_NUM_SECTIONS;
|
|
339
|
+
if (num_levels == 1 || n <= base_cap) return true;
|
|
340
|
+
const double exact_rank_thresh = static_cast<double>(base_cap) / n;
|
|
341
|
+
return (hra && rank >= 1.0 - exact_rank_thresh) || (!hra && rank <= exact_rank_thresh);
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
template<typename T, typename C, typename S, typename A>
|
|
345
|
+
double req_sketch<T, C, S, A>::relative_rse_factor() {
|
|
346
|
+
return sqrt(0.0512 / req_constants::INIT_NUM_SECTIONS);
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
350
|
+
template<typename T, typename C, typename S, typename A>
|
|
351
|
+
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
352
|
+
size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
353
|
+
size_t size = PREAMBLE_SIZE_BYTES;
|
|
354
|
+
if (is_empty()) return size;
|
|
355
|
+
if (is_estimation_mode()) {
|
|
356
|
+
size += sizeof(n_) + sizeof(TT) * 2; // min and max
|
|
357
|
+
}
|
|
358
|
+
if (n_ == 1) {
|
|
359
|
+
size += sizeof(TT);
|
|
360
|
+
} else {
|
|
361
|
+
for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
|
|
362
|
+
}
|
|
363
|
+
return size;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
// implementation for all other types
|
|
367
|
+
template<typename T, typename C, typename S, typename A>
|
|
368
|
+
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
369
|
+
size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
370
|
+
size_t size = PREAMBLE_SIZE_BYTES;
|
|
371
|
+
if (is_empty()) return size;
|
|
372
|
+
if (is_estimation_mode()) {
|
|
373
|
+
size += sizeof(n_);
|
|
374
|
+
size += S().size_of_item(*min_value_);
|
|
375
|
+
size += S().size_of_item(*max_value_);
|
|
376
|
+
}
|
|
377
|
+
if (n_ == 1) {
|
|
378
|
+
size += S().size_of_item(*compactors_[0].begin());
|
|
379
|
+
} else {
|
|
380
|
+
for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
|
|
381
|
+
}
|
|
382
|
+
return size;
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
template<typename T, typename C, typename S, typename A>
|
|
386
|
+
void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
|
387
|
+
const uint8_t preamble_ints = is_estimation_mode() ? 4 : 2;
|
|
388
|
+
write(os, preamble_ints);
|
|
389
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
390
|
+
write(os, serial_version);
|
|
391
|
+
const uint8_t family = FAMILY;
|
|
392
|
+
write(os, family);
|
|
393
|
+
const bool raw_items = n_ <= req_constants::MIN_K;
|
|
394
|
+
const uint8_t flags_byte(
|
|
395
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
396
|
+
| (hra_ ? 1 << flags::IS_HIGH_RANK : 0)
|
|
397
|
+
| (raw_items ? 1 << flags::RAW_ITEMS : 0)
|
|
398
|
+
| (compactors_[0].is_sorted() ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
399
|
+
);
|
|
400
|
+
write(os, flags_byte);
|
|
401
|
+
write(os, k_);
|
|
402
|
+
const uint8_t num_levels = is_empty() ? 0 : get_num_levels();
|
|
403
|
+
write(os, num_levels);
|
|
404
|
+
const uint8_t num_raw_items = raw_items ? n_ : 0;
|
|
405
|
+
write(os, num_raw_items);
|
|
406
|
+
if (is_empty()) return;
|
|
407
|
+
if (is_estimation_mode()) {
|
|
408
|
+
write(os, n_);
|
|
409
|
+
S().serialize(os, min_value_, 1);
|
|
410
|
+
S().serialize(os, max_value_, 1);
|
|
411
|
+
}
|
|
412
|
+
if (raw_items) {
|
|
413
|
+
S().serialize(os, compactors_[0].begin(), num_raw_items);
|
|
414
|
+
} else {
|
|
415
|
+
for (const auto& compactor: compactors_) compactor.serialize(os, S());
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
template<typename T, typename C, typename S, typename A>
|
|
420
|
+
auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
421
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
|
422
|
+
vector_bytes bytes(size, 0, allocator_);
|
|
423
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
424
|
+
const uint8_t* end_ptr = ptr + size;
|
|
425
|
+
|
|
426
|
+
const uint8_t preamble_ints = is_estimation_mode() ? 4 : 2;
|
|
427
|
+
ptr += copy_to_mem(preamble_ints, ptr);
|
|
428
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
429
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
430
|
+
const uint8_t family = FAMILY;
|
|
431
|
+
ptr += copy_to_mem(family, ptr);
|
|
432
|
+
const bool raw_items = n_ <= req_constants::MIN_K;
|
|
433
|
+
const uint8_t flags_byte(
|
|
434
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
435
|
+
| (hra_ ? 1 << flags::IS_HIGH_RANK : 0)
|
|
436
|
+
| (raw_items ? 1 << flags::RAW_ITEMS : 0)
|
|
437
|
+
| (compactors_[0].is_sorted() ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
438
|
+
);
|
|
439
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
440
|
+
ptr += copy_to_mem(k_, ptr);
|
|
441
|
+
const uint8_t num_levels = is_empty() ? 0 : get_num_levels();
|
|
442
|
+
ptr += copy_to_mem(num_levels, ptr);
|
|
443
|
+
const uint8_t num_raw_items = raw_items ? n_ : 0;
|
|
444
|
+
ptr += copy_to_mem(num_raw_items, ptr);
|
|
445
|
+
if (!is_empty()) {
|
|
446
|
+
if (is_estimation_mode()) {
|
|
447
|
+
ptr += copy_to_mem(n_, ptr);
|
|
448
|
+
ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
|
|
449
|
+
ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
|
|
450
|
+
}
|
|
451
|
+
if (raw_items) {
|
|
452
|
+
ptr += S().serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
|
|
453
|
+
} else {
|
|
454
|
+
for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, S());
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
return bytes;
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
template<typename T, typename C, typename S, typename A>
|
|
461
|
+
req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
462
|
+
const auto preamble_ints = read<uint8_t>(is);
|
|
463
|
+
const auto serial_version = read<uint8_t>(is);
|
|
464
|
+
const auto family_id = read<uint8_t>(is);
|
|
465
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
466
|
+
const auto k = read<uint16_t>(is);
|
|
467
|
+
const auto num_levels = read<uint8_t>(is);
|
|
468
|
+
const auto num_raw_items = read<uint8_t>(is);
|
|
469
|
+
|
|
470
|
+
check_preamble_ints(preamble_ints, num_levels);
|
|
471
|
+
check_serial_version(serial_version);
|
|
472
|
+
check_family_id(family_id);
|
|
473
|
+
|
|
474
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
475
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
476
|
+
const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK);
|
|
477
|
+
if (is_empty) return req_sketch(k, hra, allocator);
|
|
478
|
+
|
|
479
|
+
A alloc(allocator);
|
|
480
|
+
auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
|
|
481
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
482
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
483
|
+
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
|
484
|
+
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
|
485
|
+
|
|
486
|
+
const bool raw_items = flags_byte & (1 << flags::RAW_ITEMS);
|
|
487
|
+
const bool is_level_0_sorted = flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED);
|
|
488
|
+
std::vector<Compactor, AllocCompactor> compactors(allocator);
|
|
489
|
+
|
|
490
|
+
uint64_t n = 1;
|
|
491
|
+
if (num_levels > 1) {
|
|
492
|
+
n = read<uint64_t>(is);
|
|
493
|
+
S().deserialize(is, min_value_buffer.get(), 1);
|
|
494
|
+
// serde call did not throw, repackage with destrtuctor
|
|
495
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
496
|
+
S().deserialize(is, max_value_buffer.get(), 1);
|
|
497
|
+
// serde call did not throw, repackage with destrtuctor
|
|
498
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
if (raw_items) {
|
|
502
|
+
compactors.push_back(Compactor::deserialize(is, S(), allocator, is_level_0_sorted, k, num_raw_items, hra));
|
|
503
|
+
} else {
|
|
504
|
+
for (size_t i = 0; i < num_levels; ++i) {
|
|
505
|
+
compactors.push_back(Compactor::deserialize(is, S(), allocator, i == 0 ? is_level_0_sorted : true, hra));
|
|
506
|
+
}
|
|
507
|
+
}
|
|
508
|
+
if (num_levels == 1) {
|
|
509
|
+
const auto begin = compactors[0].begin();
|
|
510
|
+
const auto end = compactors[0].end();
|
|
511
|
+
n = compactors[0].get_num_items();
|
|
512
|
+
auto min_it = begin;
|
|
513
|
+
auto max_it = begin;
|
|
514
|
+
for (auto it = begin; it != end; ++it) {
|
|
515
|
+
if (C()(*it, *min_it)) min_it = it;
|
|
516
|
+
if (C()(*max_it, *it)) max_it = it;
|
|
517
|
+
}
|
|
518
|
+
new (min_value_buffer.get()) T(*min_it);
|
|
519
|
+
// copy did not throw, repackage with destrtuctor
|
|
520
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
521
|
+
new (max_value_buffer.get()) T(*max_it);
|
|
522
|
+
// copy did not throw, repackage with destrtuctor
|
|
523
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
527
|
+
return req_sketch(k, hra, n, std::move(min_value), std::move(max_value), std::move(compactors));
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
template<typename T, typename C, typename S, typename A>
|
|
531
|
+
req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
532
|
+
ensure_minimum_memory(size, 8);
|
|
533
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
534
|
+
const char* end_ptr = static_cast<const char*>(bytes) + size;
|
|
535
|
+
|
|
536
|
+
uint8_t preamble_ints;
|
|
537
|
+
ptr += copy_from_mem(ptr, preamble_ints);
|
|
538
|
+
uint8_t serial_version;
|
|
539
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
540
|
+
uint8_t family_id;
|
|
541
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
542
|
+
uint8_t flags_byte;
|
|
543
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
544
|
+
uint16_t k;
|
|
545
|
+
ptr += copy_from_mem(ptr, k);
|
|
546
|
+
uint8_t num_levels;
|
|
547
|
+
ptr += copy_from_mem(ptr, num_levels);
|
|
548
|
+
uint8_t num_raw_items;
|
|
549
|
+
ptr += copy_from_mem(ptr, num_raw_items);
|
|
550
|
+
|
|
551
|
+
check_preamble_ints(preamble_ints, num_levels);
|
|
552
|
+
check_serial_version(serial_version);
|
|
553
|
+
check_family_id(family_id);
|
|
554
|
+
|
|
555
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
556
|
+
const bool hra = flags_byte & (1 << flags::IS_HIGH_RANK);
|
|
557
|
+
if (is_empty) return req_sketch(k, hra, allocator);
|
|
558
|
+
|
|
559
|
+
A alloc(allocator);
|
|
560
|
+
auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
|
|
561
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
562
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
|
|
563
|
+
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
|
564
|
+
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
|
565
|
+
|
|
566
|
+
const bool raw_items = flags_byte & (1 << flags::RAW_ITEMS);
|
|
567
|
+
const bool is_level_0_sorted = flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED);
|
|
568
|
+
std::vector<Compactor, AllocCompactor> compactors(allocator);
|
|
569
|
+
|
|
570
|
+
uint64_t n = 1;
|
|
571
|
+
if (num_levels > 1) {
|
|
572
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(n));
|
|
573
|
+
ptr += copy_from_mem(ptr, n);
|
|
574
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
|
|
575
|
+
// serde call did not throw, repackage with destrtuctor
|
|
576
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
577
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
|
|
578
|
+
// serde call did not throw, repackage with destrtuctor
|
|
579
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
if (raw_items) {
|
|
583
|
+
auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, is_level_0_sorted, k, num_raw_items, hra);
|
|
584
|
+
compactors.push_back(std::move(pair.first));
|
|
585
|
+
ptr += pair.second;
|
|
586
|
+
} else {
|
|
587
|
+
for (size_t i = 0; i < num_levels; ++i) {
|
|
588
|
+
auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, i == 0 ? is_level_0_sorted : true, hra);
|
|
589
|
+
compactors.push_back(std::move(pair.first));
|
|
590
|
+
ptr += pair.second;
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
if (num_levels == 1) {
|
|
594
|
+
const auto begin = compactors[0].begin();
|
|
595
|
+
const auto end = compactors[0].end();
|
|
596
|
+
n = compactors[0].get_num_items();
|
|
597
|
+
auto min_it = begin;
|
|
598
|
+
auto max_it = begin;
|
|
599
|
+
for (auto it = begin; it != end; ++it) {
|
|
600
|
+
if (C()(*it, *min_it)) min_it = it;
|
|
601
|
+
if (C()(*max_it, *it)) max_it = it;
|
|
602
|
+
}
|
|
603
|
+
new (min_value_buffer.get()) T(*min_it);
|
|
604
|
+
// copy did not throw, repackage with destrtuctor
|
|
605
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
606
|
+
new (max_value_buffer.get()) T(*max_it);
|
|
607
|
+
// copy did not throw, repackage with destrtuctor
|
|
608
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
return req_sketch(k, hra, n, std::move(min_value), std::move(max_value), std::move(compactors));
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
template<typename T, typename C, typename S, typename A>
|
|
615
|
+
void req_sketch<T, C, S, A>::grow() {
|
|
616
|
+
const uint8_t lg_weight = get_num_levels();
|
|
617
|
+
compactors_.push_back(Compactor(hra_, lg_weight, k_, allocator_));
|
|
618
|
+
update_max_nom_size();
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
template<typename T, typename C, typename S, typename A>
|
|
622
|
+
uint8_t req_sketch<T, C, S, A>::get_num_levels() const {
|
|
623
|
+
return compactors_.size();
|
|
624
|
+
}
|
|
625
|
+
|
|
626
|
+
template<typename T, typename C, typename S, typename A>
|
|
627
|
+
void req_sketch<T, C, S, A>::update_max_nom_size() {
|
|
628
|
+
max_nom_size_ = 0;
|
|
629
|
+
for (const auto& compactor: compactors_) max_nom_size_ += compactor.get_nom_capacity();
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
template<typename T, typename C, typename S, typename A>
|
|
633
|
+
void req_sketch<T, C, S, A>::update_num_retained() {
|
|
634
|
+
num_retained_ = 0;
|
|
635
|
+
for (const auto& compactor: compactors_) num_retained_ += compactor.get_num_items();
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
template<typename T, typename C, typename S, typename A>
|
|
639
|
+
void req_sketch<T, C, S, A>::compress() {
|
|
640
|
+
for (size_t h = 0; h < compactors_.size(); ++h) {
|
|
641
|
+
if (compactors_[h].get_num_items() >= compactors_[h].get_nom_capacity()) {
|
|
642
|
+
if (h == 0) compactors_[0].sort();
|
|
643
|
+
if (h + 1 >= get_num_levels()) { // at the top?
|
|
644
|
+
grow(); // add a level, increases max_nom_size
|
|
645
|
+
}
|
|
646
|
+
auto pair = compactors_[h].compact(compactors_[h + 1]);
|
|
647
|
+
num_retained_ -= pair.first;
|
|
648
|
+
max_nom_size_ += pair.second;
|
|
649
|
+
if (LAZY_COMPRESSION && num_retained_ < max_nom_size_) break;
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
template<typename T, typename C, typename S, typename A>
|
|
655
|
+
string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
|
|
656
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
657
|
+
os << "### REQ sketch summary:" << std::endl;
|
|
658
|
+
os << " K : " << k_ << std::endl;
|
|
659
|
+
os << " High Rank Acc : " << (hra_ ? "true" : "false") << std::endl;
|
|
660
|
+
os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
|
|
661
|
+
os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
|
662
|
+
os << " Sorted : " << (compactors_[0].is_sorted() ? "true" : "false") << std::endl;
|
|
663
|
+
os << " N : " << n_ << std::endl;
|
|
664
|
+
os << " Levels : " << compactors_.size() << std::endl;
|
|
665
|
+
os << " Retained items : " << num_retained_ << std::endl;
|
|
666
|
+
os << " Capacity items : " << max_nom_size_ << std::endl;
|
|
667
|
+
if (!is_empty()) {
|
|
668
|
+
os << " Min value : " << *min_value_ << std::endl;
|
|
669
|
+
os << " Max value : " << *max_value_ << std::endl;
|
|
670
|
+
}
|
|
671
|
+
os << "### End sketch summary" << std::endl;
|
|
672
|
+
|
|
673
|
+
if (print_levels) {
|
|
674
|
+
os << "### REQ sketch levels:" << std::endl;
|
|
675
|
+
os << " index: nominal capacity, actual size" << std::endl;
|
|
676
|
+
for (uint8_t i = 0; i < compactors_.size(); i++) {
|
|
677
|
+
os << " " << (unsigned int) i << ": "
|
|
678
|
+
<< compactors_[i].get_nom_capacity() << ", "
|
|
679
|
+
<< compactors_[i].get_num_items() << std::endl;
|
|
680
|
+
}
|
|
681
|
+
os << "### End sketch levels" << std::endl;
|
|
682
|
+
}
|
|
683
|
+
|
|
684
|
+
if (print_items) {
|
|
685
|
+
os << "### REQ sketch data:" << std::endl;
|
|
686
|
+
unsigned level = 0;
|
|
687
|
+
for (const auto& compactor: compactors_) {
|
|
688
|
+
os << " level " << level << ": " << std::endl;
|
|
689
|
+
for (auto it = compactor.begin(); it != compactor.end(); ++it) {
|
|
690
|
+
os << " " << *it << std::endl;
|
|
691
|
+
}
|
|
692
|
+
++level;
|
|
693
|
+
}
|
|
694
|
+
os << "### End sketch data" << std::endl;
|
|
695
|
+
}
|
|
696
|
+
return os.str();
|
|
697
|
+
}
|
|
698
|
+
|
|
699
|
+
template<typename T, typename C, typename S, typename A>
|
|
700
|
+
class req_sketch<T, C, S, A>::item_deleter {
|
|
701
|
+
public:
|
|
702
|
+
item_deleter(const A& allocator): allocator_(allocator) {}
|
|
703
|
+
void operator() (T* ptr) {
|
|
704
|
+
if (ptr != nullptr) {
|
|
705
|
+
ptr->~T();
|
|
706
|
+
allocator_.deallocate(ptr, 1);
|
|
707
|
+
}
|
|
708
|
+
}
|
|
709
|
+
private:
|
|
710
|
+
A allocator_;
|
|
711
|
+
};
|
|
712
|
+
|
|
713
|
+
template<typename T, typename C, typename S, typename A>
|
|
714
|
+
req_sketch<T, C, S, A>::req_sketch(uint32_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors):
|
|
715
|
+
allocator_(compactors.get_allocator()),
|
|
716
|
+
k_(k),
|
|
717
|
+
hra_(hra),
|
|
718
|
+
max_nom_size_(0),
|
|
719
|
+
num_retained_(0),
|
|
720
|
+
n_(n),
|
|
721
|
+
compactors_(std::move(compactors)),
|
|
722
|
+
min_value_(min_value.release()),
|
|
723
|
+
max_value_(max_value.release())
|
|
724
|
+
{
|
|
725
|
+
update_max_nom_size();
|
|
726
|
+
update_num_retained();
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
template<typename T, typename C, typename S, typename A>
|
|
730
|
+
void req_sketch<T, C, S, A>::check_preamble_ints(uint8_t preamble_ints, uint8_t num_levels) {
|
|
731
|
+
const uint8_t expected_preamble_ints = num_levels > 1 ? 4 : 2;
|
|
732
|
+
if (preamble_ints != expected_preamble_ints) {
|
|
733
|
+
throw std::invalid_argument("Possible corruption: preamble ints must be "
|
|
734
|
+
+ std::to_string(expected_preamble_ints) + ", got " + std::to_string(preamble_ints));
|
|
735
|
+
}
|
|
736
|
+
}
|
|
737
|
+
|
|
738
|
+
template<typename T, typename C, typename S, typename A>
|
|
739
|
+
void req_sketch<T, C, S, A>::check_serial_version(uint8_t serial_version) {
|
|
740
|
+
if (serial_version != SERIAL_VERSION) {
|
|
741
|
+
throw std::invalid_argument("Possible corruption: serial version mismatch: expected "
|
|
742
|
+
+ std::to_string(SERIAL_VERSION)
|
|
743
|
+
+ ", got " + std::to_string(serial_version));
|
|
744
|
+
}
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
template<typename T, typename C, typename S, typename A>
|
|
748
|
+
void req_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
|
|
749
|
+
if (family_id != FAMILY) {
|
|
750
|
+
throw std::invalid_argument("Possible corruption: family mismatch: expected "
|
|
751
|
+
+ std::to_string(FAMILY) + ", got " + std::to_string(family_id));
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
template<typename T, typename C, typename S, typename A>
|
|
756
|
+
auto req_sketch<T, C, S, A>::begin() const -> const_iterator {
|
|
757
|
+
return const_iterator(compactors_.begin(), compactors_.end());
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
template<typename T, typename C, typename S, typename A>
|
|
761
|
+
auto req_sketch<T, C, S, A>::end() const -> const_iterator {
|
|
762
|
+
return const_iterator(compactors_.end(), compactors_.end());
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
// iterator
|
|
766
|
+
|
|
767
|
+
template<typename T, typename C, typename S, typename A>
|
|
768
|
+
req_sketch<T, C, S, A>::const_iterator::const_iterator(LevelsIterator begin, LevelsIterator end):
|
|
769
|
+
levels_it_(begin),
|
|
770
|
+
levels_end_(end),
|
|
771
|
+
compactor_it_((*levels_it_).begin())
|
|
772
|
+
{}
|
|
773
|
+
|
|
774
|
+
template<typename T, typename C, typename S, typename A>
|
|
775
|
+
auto req_sketch<T, C, S, A>::const_iterator::operator++() -> const_iterator& {
|
|
776
|
+
++compactor_it_;
|
|
777
|
+
if (compactor_it_ == (*levels_it_).end()) {
|
|
778
|
+
++levels_it_;
|
|
779
|
+
if (levels_it_ != levels_end_) compactor_it_ = (*levels_it_).begin();
|
|
780
|
+
}
|
|
781
|
+
return *this;
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
// Postfix increment: copies the iterator into tmp, advances *this via the prefix
// operator, and returns tmp.
// NOTE(review): the return type is a reference but the returned object is the
// function-local tmp, so the result dangles once this function returns. Returning by
// value would fix it, but that also requires changing the in-class declaration,
// which is not visible here - TODO confirm and fix both together.
template<typename T, typename C, typename S, typename A>
|
|
785
|
+
auto req_sketch<T, C, S, A>::const_iterator::operator++(int) -> const_iterator& {
|
|
786
|
+
const_iterator tmp(*this);
|
|
787
|
+
operator++();
|
|
788
|
+
return tmp;
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
template<typename T, typename C, typename S, typename A>
|
|
792
|
+
bool req_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
|
|
793
|
+
if (levels_it_ != other.levels_it_) return false;
|
|
794
|
+
if (levels_it_ == levels_end_) return true;
|
|
795
|
+
return compactor_it_ == other.compactor_it_;
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
template<typename T, typename C, typename S, typename A>
|
|
799
|
+
bool req_sketch<T, C, S, A>::const_iterator::operator!=(const const_iterator& other) const {
|
|
800
|
+
return !operator==(other);
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
template<typename T, typename C, typename S, typename A>
|
|
804
|
+
std::pair<const T&, const uint64_t> req_sketch<T, C, S, A>::const_iterator::operator*() const {
|
|
805
|
+
return std::pair<const T&, const uint64_t>(*compactor_it_, 1 << (*levels_it_).get_lg_weight());
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
} /* namespace datasketches */
|
|
809
|
+
|
|
810
|
+
#endif
|