datasketches 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -0,0 +1,595 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _TDIGEST_IMPL_HPP_
|
21
|
+
#define _TDIGEST_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <cmath>
|
24
|
+
#include <sstream>
|
25
|
+
|
26
|
+
#include "common_defs.hpp"
|
27
|
+
#include "memory_operations.hpp"
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
template<typename T, typename A>
|
32
|
+
tdigest<T, A>::tdigest(uint16_t k, const A& allocator):
|
33
|
+
tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(), vector_centroid(allocator), 0, vector_t(allocator))
|
34
|
+
{}
|
35
|
+
|
36
|
+
template<typename T, typename A>
|
37
|
+
void tdigest<T, A>::update(T value) {
|
38
|
+
if (std::isnan(value)) return;
|
39
|
+
if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress();
|
40
|
+
buffer_.push_back(value);
|
41
|
+
min_ = std::min(min_, value);
|
42
|
+
max_ = std::max(max_, value);
|
43
|
+
}
|
44
|
+
|
45
|
+
template<typename T, typename A>
|
46
|
+
void tdigest<T, A>::merge(tdigest& other) {
|
47
|
+
if (other.is_empty()) return;
|
48
|
+
vector_centroid tmp(buffer_.get_allocator());
|
49
|
+
tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
|
50
|
+
for (const T value: buffer_) tmp.push_back(centroid(value, 1));
|
51
|
+
for (const T value: other.buffer_) tmp.push_back(centroid(value, 1));
|
52
|
+
std::copy(other.centroids_.begin(), other.centroids_.end(), std::back_inserter(tmp));
|
53
|
+
merge(tmp, buffer_.size() + other.get_total_weight());
|
54
|
+
}
|
55
|
+
|
56
|
+
template<typename T, typename A>
|
57
|
+
void tdigest<T, A>::compress() {
|
58
|
+
if (buffer_.size() == 0) return;
|
59
|
+
vector_centroid tmp(buffer_.get_allocator());
|
60
|
+
tmp.reserve(buffer_.size() + centroids_.size());
|
61
|
+
for (const T value: buffer_) tmp.push_back(centroid(value, 1));
|
62
|
+
merge(tmp, buffer_.size());
|
63
|
+
}
|
64
|
+
|
65
|
+
template<typename T, typename A>
|
66
|
+
bool tdigest<T, A>::is_empty() const {
|
67
|
+
return centroids_.empty() && buffer_.empty();
|
68
|
+
}
|
69
|
+
|
70
|
+
template<typename T, typename A>
|
71
|
+
T tdigest<T, A>::get_min_value() const {
|
72
|
+
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
73
|
+
return min_;
|
74
|
+
}
|
75
|
+
|
76
|
+
template<typename T, typename A>
|
77
|
+
T tdigest<T, A>::get_max_value() const {
|
78
|
+
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
79
|
+
return max_;
|
80
|
+
}
|
81
|
+
|
82
|
+
template<typename T, typename A>
|
83
|
+
uint64_t tdigest<T, A>::get_total_weight() const {
|
84
|
+
return centroids_weight_ + buffer_.size();
|
85
|
+
}
|
86
|
+
|
87
|
+
template<typename T, typename A>
|
88
|
+
double tdigest<T, A>::get_rank(T value) const {
|
89
|
+
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
90
|
+
if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN");
|
91
|
+
if (value < min_) return 0;
|
92
|
+
if (value > max_) return 1;
|
93
|
+
// one centroid and value == min_ == max_
|
94
|
+
if ((centroids_.size() + buffer_.size()) == 1) return 0.5;
|
95
|
+
|
96
|
+
const_cast<tdigest*>(this)->compress(); // side effect
|
97
|
+
|
98
|
+
// left tail
|
99
|
+
const T first_mean = centroids_.front().get_mean();
|
100
|
+
if (value < first_mean) {
|
101
|
+
if (first_mean - min_ > 0) {
|
102
|
+
if (value == min_) return 0.5 / centroids_weight_;
|
103
|
+
return (1.0 + (value - min_) / (first_mean - min_) * (centroids_.front().get_weight() / 2.0 - 1.0)); // ?
|
104
|
+
}
|
105
|
+
return 0; // should never happen
|
106
|
+
}
|
107
|
+
|
108
|
+
// right tail
|
109
|
+
const T last_mean = centroids_.back().get_mean();
|
110
|
+
if (value > last_mean) {
|
111
|
+
if (max_ - last_mean > 0) {
|
112
|
+
if (value == max_) return 1.0 - 0.5 / centroids_weight_;
|
113
|
+
return 1.0 - ((1.0 + (max_ - value) / (max_ - last_mean) * (centroids_.back().get_weight() / 2.0 - 1.0)) / centroids_weight_); // ?
|
114
|
+
}
|
115
|
+
return 1; // should never happen
|
116
|
+
}
|
117
|
+
|
118
|
+
auto lower = std::lower_bound(centroids_.begin(), centroids_.end(), centroid(value, 1), centroid_cmp());
|
119
|
+
if (lower == centroids_.end()) throw std::logic_error("lower == end in get_rank()");
|
120
|
+
auto upper = std::upper_bound(lower, centroids_.end(), centroid(value, 1), centroid_cmp());
|
121
|
+
if (upper == centroids_.begin()) throw std::logic_error("upper == begin in get_rank()");
|
122
|
+
if (value < lower->get_mean()) --lower;
|
123
|
+
if (upper == centroids_.end() || !((upper - 1)->get_mean() < value)) --upper;
|
124
|
+
double weight_below = 0;
|
125
|
+
auto it = centroids_.begin();
|
126
|
+
while (it != lower) {
|
127
|
+
weight_below += it->get_weight();
|
128
|
+
++it;
|
129
|
+
}
|
130
|
+
weight_below += lower->get_weight() / 2.0;
|
131
|
+
double weight_delta = 0;
|
132
|
+
while (it != upper) {
|
133
|
+
weight_delta += it->get_weight();
|
134
|
+
++it;
|
135
|
+
}
|
136
|
+
weight_delta -= lower->get_weight() / 2.0;
|
137
|
+
weight_delta += upper->get_weight() / 2.0;
|
138
|
+
if (upper->get_mean() - lower->get_mean() > 0) {
|
139
|
+
return (weight_below + weight_delta * (value - lower->get_mean()) / (upper->get_mean() - lower->get_mean())) / centroids_weight_;
|
140
|
+
}
|
141
|
+
return (weight_below + weight_delta / 2.0) / centroids_weight_;
|
142
|
+
}
|
143
|
+
|
144
|
+
template<typename T, typename A>
|
145
|
+
T tdigest<T, A>::get_quantile(double rank) const {
|
146
|
+
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
147
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
148
|
+
throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
|
149
|
+
}
|
150
|
+
const_cast<tdigest*>(this)->compress(); // side effect
|
151
|
+
if (centroids_.size() == 1) return centroids_.front().get_mean();
|
152
|
+
|
153
|
+
// at least 2 centroids
|
154
|
+
const double weight = rank * centroids_weight_;
|
155
|
+
if (weight < 1) return min_;
|
156
|
+
if (weight > centroids_weight_ - 1.0) return max_;
|
157
|
+
const double first_weight = centroids_.front().get_weight();
|
158
|
+
if (first_weight > 1 && weight < first_weight / 2.0) {
|
159
|
+
return min_ + (weight - 1.0) / (first_weight / 2.0 - 1.0) * (centroids_.front().get_mean() - min_);
|
160
|
+
}
|
161
|
+
const double last_weight = centroids_.back().get_weight();
|
162
|
+
if (last_weight > 1 && centroids_weight_ - weight <= last_weight / 2.0) {
|
163
|
+
return max_ + (centroids_weight_ - weight - 1.0) / (last_weight / 2.0 - 1.0) * (max_ - centroids_.back().get_mean());
|
164
|
+
}
|
165
|
+
|
166
|
+
// interpolate between extremes
|
167
|
+
double weight_so_far = first_weight / 2.0;
|
168
|
+
for (size_t i = 0; i < centroids_.size() - 1; ++i) {
|
169
|
+
const double dw = (centroids_[i].get_weight() + centroids_[i + 1].get_weight()) / 2.0;
|
170
|
+
if (weight_so_far + dw > weight) {
|
171
|
+
// the target weight is between centroids i and i+1
|
172
|
+
double left_weight = 0;
|
173
|
+
if (centroids_[i].get_weight() == 1) {
|
174
|
+
if (weight - weight_so_far < 0.5) return centroids_[i].get_mean();
|
175
|
+
left_weight = 0.5;
|
176
|
+
}
|
177
|
+
double right_weight = 0;
|
178
|
+
if (centroids_[i + 1].get_weight() == 1) {
|
179
|
+
if (weight_so_far + dw - weight <= 0.5) return centroids_[i + 1].get_mean();
|
180
|
+
right_weight = 0.5;
|
181
|
+
}
|
182
|
+
const double w1 = weight - weight_so_far - left_weight;
|
183
|
+
const double w2 = weight_so_far + dw - weight - right_weight;
|
184
|
+
return weighted_average(centroids_[i].get_mean(), w1, centroids_[i + 1].get_mean(), w2);
|
185
|
+
}
|
186
|
+
weight_so_far += dw;
|
187
|
+
}
|
188
|
+
const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0;
|
189
|
+
const double w2 = centroids_.back().get_weight() / 2.0 - w1;
|
190
|
+
return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
|
191
|
+
}
|
192
|
+
|
193
|
+
template<typename T, typename A>
|
194
|
+
uint16_t tdigest<T, A>::get_k() const {
|
195
|
+
return k_;
|
196
|
+
}
|
197
|
+
|
198
|
+
template<typename T, typename A>
|
199
|
+
string<A> tdigest<T, A>::to_string(bool print_centroids) const {
|
200
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
201
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
202
|
+
std::ostringstream os;
|
203
|
+
os << "### t-Digest summary:" << std::endl;
|
204
|
+
os << " Nominal k : " << k_ << std::endl;
|
205
|
+
os << " Centroids : " << centroids_.size() << std::endl;
|
206
|
+
os << " Buffered : " << buffer_.size() << std::endl;
|
207
|
+
os << " Centroids capacity : " << centroids_capacity_ << std::endl;
|
208
|
+
os << " Buffer capacity : " << centroids_capacity_ * BUFFER_MULTIPLIER << std::endl;
|
209
|
+
os << " Centroids Weight : " << centroids_weight_ << std::endl;
|
210
|
+
os << " Total Weight : " << get_total_weight() << std::endl;
|
211
|
+
os << " Reverse Merge : " << (reverse_merge_ ? "true" : "false") << std::endl;
|
212
|
+
if (!is_empty()) {
|
213
|
+
os << " Min : " << min_ << std::endl;
|
214
|
+
os << " Max : " << max_ << std::endl;
|
215
|
+
}
|
216
|
+
os << "### End t-Digest summary" << std::endl;
|
217
|
+
if (print_centroids) {
|
218
|
+
if (centroids_.size() > 0) {
|
219
|
+
os << "Centroids:" << std::endl;
|
220
|
+
int i = 0;
|
221
|
+
for (const auto& c: centroids_) {
|
222
|
+
os << i++ << ": " << c.get_mean() << ", " << c.get_weight() << std::endl;
|
223
|
+
}
|
224
|
+
}
|
225
|
+
if (buffer_.size() > 0) {
|
226
|
+
os << "Buffer:" << std::endl;
|
227
|
+
int i = 0;
|
228
|
+
for (const T value: buffer_) {
|
229
|
+
os << i++ << ": " << value << std::endl;
|
230
|
+
}
|
231
|
+
}
|
232
|
+
}
|
233
|
+
return string<A>(os.str().c_str(), buffer_.get_allocator());
|
234
|
+
}
|
235
|
+
|
236
|
+
// assumes that there is enough room in the input buffer to add centroids from this tdigest
|
237
|
+
template<typename T, typename A>
|
238
|
+
void tdigest<T, A>::merge(vector_centroid& buffer, W weight) {
|
239
|
+
std::copy(centroids_.begin(), centroids_.end(), std::back_inserter(buffer));
|
240
|
+
centroids_.clear();
|
241
|
+
std::stable_sort(buffer.begin(), buffer.end(), centroid_cmp());
|
242
|
+
if (reverse_merge_) std::reverse(buffer.begin(), buffer.end());
|
243
|
+
centroids_weight_ += weight;
|
244
|
+
auto it = buffer.begin();
|
245
|
+
centroids_.push_back(*it);
|
246
|
+
++it;
|
247
|
+
double weight_so_far = 0;
|
248
|
+
while (it != buffer.end()) {
|
249
|
+
const double proposed_weight = centroids_.back().get_weight() + it->get_weight();
|
250
|
+
bool add_this = false;
|
251
|
+
if (std::distance(buffer.begin(), it) != 1 && std::distance(buffer.end(), it) != 1) {
|
252
|
+
const double q0 = weight_so_far / centroids_weight_;
|
253
|
+
const double q2 = (weight_so_far + proposed_weight) / centroids_weight_;
|
254
|
+
const double normalizer = scale_function().normalizer(2 * k_, centroids_weight_);
|
255
|
+
add_this = proposed_weight <= centroids_weight_ * std::min(scale_function().max(q0, normalizer), scale_function().max(q2, normalizer));
|
256
|
+
}
|
257
|
+
if (add_this) {
|
258
|
+
centroids_.back().add(*it);
|
259
|
+
} else {
|
260
|
+
weight_so_far += centroids_.back().get_weight();
|
261
|
+
centroids_.push_back(*it);
|
262
|
+
}
|
263
|
+
++it;
|
264
|
+
}
|
265
|
+
if (reverse_merge_) std::reverse(centroids_.begin(), centroids_.end());
|
266
|
+
min_ = std::min(min_, centroids_.front().get_mean());
|
267
|
+
max_ = std::max(max_, centroids_.back().get_mean());
|
268
|
+
reverse_merge_ = !reverse_merge_;
|
269
|
+
buffer_.clear();
|
270
|
+
}
|
271
|
+
|
272
|
+
template<typename T, typename A>
|
273
|
+
double tdigest<T, A>::weighted_average(double x1, double w1, double x2, double w2) {
|
274
|
+
return (x1 * w1 + x2 * w2) / (w1 + w2);
|
275
|
+
}
|
276
|
+
|
277
|
+
template<typename T, typename A>
|
278
|
+
void tdigest<T, A>::serialize(std::ostream& os, bool with_buffer) const {
|
279
|
+
if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
|
280
|
+
write(os, get_preamble_longs());
|
281
|
+
write(os, SERIAL_VERSION);
|
282
|
+
write(os, SKETCH_TYPE);
|
283
|
+
write(os, k_);
|
284
|
+
const uint8_t flags_byte(
|
285
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
286
|
+
| (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
|
287
|
+
| (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
|
288
|
+
);
|
289
|
+
write(os, flags_byte);
|
290
|
+
write<uint16_t>(os, 0); // unused
|
291
|
+
if (is_empty()) return;
|
292
|
+
if (is_single_value()) {
|
293
|
+
write(os, min_);
|
294
|
+
return;
|
295
|
+
}
|
296
|
+
write(os, static_cast<uint32_t>(centroids_.size()));
|
297
|
+
write(os, static_cast<uint32_t>(buffer_.size()));
|
298
|
+
write(os, min_);
|
299
|
+
write(os, max_);
|
300
|
+
if (centroids_.size() > 0) write(os, centroids_.data(), centroids_.size() * sizeof(centroid));
|
301
|
+
if (buffer_.size() > 0) write(os, buffer_.data(), buffer_.size() * sizeof(T));
|
302
|
+
}
|
303
|
+
|
304
|
+
template<typename T, typename A>
|
305
|
+
uint8_t tdigest<T, A>::get_preamble_longs() const {
|
306
|
+
return is_empty() || is_single_value() ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
|
307
|
+
}
|
308
|
+
|
309
|
+
template<typename T, typename A>
|
310
|
+
size_t tdigest<T, A>::get_serialized_size_bytes(bool with_buffer) const {
|
311
|
+
if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
|
312
|
+
size_t size_bytes = get_preamble_longs() * sizeof(uint64_t);
|
313
|
+
if (is_empty()) return size_bytes;
|
314
|
+
if (is_single_value()) return size_bytes + sizeof(T);
|
315
|
+
size_bytes += sizeof(T) * 2 // min and max
|
316
|
+
+ sizeof(centroid) * centroids_.size();
|
317
|
+
if (with_buffer) size_bytes += sizeof(T) * buffer_.size(); // count is a part of preamble
|
318
|
+
return size_bytes;
|
319
|
+
}
|
320
|
+
|
321
|
+
template<typename T, typename A>
|
322
|
+
auto tdigest<T, A>::serialize(unsigned header_size_bytes, bool with_buffer) const -> vector_bytes {
|
323
|
+
if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
|
324
|
+
vector_bytes bytes(get_serialized_size_bytes(with_buffer), 0, buffer_.get_allocator());
|
325
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
326
|
+
*ptr++ = get_preamble_longs();
|
327
|
+
*ptr++ = SERIAL_VERSION;
|
328
|
+
*ptr++ = SKETCH_TYPE;
|
329
|
+
ptr += copy_to_mem(k_, ptr);
|
330
|
+
const uint8_t flags_byte(
|
331
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
332
|
+
| (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
|
333
|
+
| (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
|
334
|
+
);
|
335
|
+
*ptr++ = flags_byte;
|
336
|
+
ptr += 2; // unused
|
337
|
+
if (is_empty()) return bytes;
|
338
|
+
if (is_single_value()) {
|
339
|
+
copy_to_mem(min_, ptr);
|
340
|
+
return bytes;
|
341
|
+
}
|
342
|
+
ptr += copy_to_mem(static_cast<uint32_t>(centroids_.size()), ptr);
|
343
|
+
ptr += copy_to_mem(static_cast<uint32_t>(buffer_.size()), ptr);
|
344
|
+
ptr += copy_to_mem(min_, ptr);
|
345
|
+
ptr += copy_to_mem(max_, ptr);
|
346
|
+
if (centroids_.size() > 0) ptr += copy_to_mem(centroids_.data(), ptr, centroids_.size() * sizeof(centroid));
|
347
|
+
if (buffer_.size() > 0) copy_to_mem(buffer_.data(), ptr, buffer_.size() * sizeof(T));
|
348
|
+
return bytes;
|
349
|
+
}
|
350
|
+
|
351
|
+
template<typename T, typename A>
|
352
|
+
tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
|
353
|
+
const auto preamble_longs = read<uint8_t>(is);
|
354
|
+
const auto serial_version = read<uint8_t>(is);
|
355
|
+
const auto sketch_type = read<uint8_t>(is);
|
356
|
+
if (sketch_type != SKETCH_TYPE) {
|
357
|
+
if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(is, allocator);
|
358
|
+
throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
|
359
|
+
}
|
360
|
+
if (serial_version != SERIAL_VERSION) {
|
361
|
+
throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
|
362
|
+
}
|
363
|
+
const auto k = read<uint16_t>(is);
|
364
|
+
const auto flags_byte = read<uint8_t>(is);
|
365
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
366
|
+
const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
|
367
|
+
const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
|
368
|
+
if (preamble_longs != expected_preamble_longs) {
|
369
|
+
throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
|
370
|
+
}
|
371
|
+
read<uint16_t>(is); // unused
|
372
|
+
|
373
|
+
if (is_empty) return tdigest(k, allocator);
|
374
|
+
|
375
|
+
const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
|
376
|
+
if (is_single_value) {
|
377
|
+
const T value = read<T>(is);
|
378
|
+
return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
|
379
|
+
}
|
380
|
+
|
381
|
+
const auto num_centroids = read<uint32_t>(is);
|
382
|
+
const auto num_buffered = read<uint32_t>(is);
|
383
|
+
|
384
|
+
const T min = read<T>(is);
|
385
|
+
const T max = read<T>(is);
|
386
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
387
|
+
if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid));
|
388
|
+
vector_t buffer(num_buffered, 0, allocator);
|
389
|
+
if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T));
|
390
|
+
uint64_t weight = 0;
|
391
|
+
for (const auto& c: centroids) weight += c.get_weight();
|
392
|
+
return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
|
393
|
+
}
|
394
|
+
|
395
|
+
template<typename T, typename A>
|
396
|
+
tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
397
|
+
ensure_minimum_memory(size, 8);
|
398
|
+
const char* ptr = static_cast<const char*>(bytes);
|
399
|
+
const char* end_ptr = static_cast<const char*>(bytes) + size;
|
400
|
+
|
401
|
+
const uint8_t preamble_longs = *ptr++;
|
402
|
+
const uint8_t serial_version = *ptr++;
|
403
|
+
const uint8_t sketch_type = *ptr++;
|
404
|
+
if (sketch_type != SKETCH_TYPE) {
|
405
|
+
if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(ptr, end_ptr - ptr, allocator);
|
406
|
+
throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
|
407
|
+
}
|
408
|
+
if (serial_version != SERIAL_VERSION) {
|
409
|
+
throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
|
410
|
+
}
|
411
|
+
uint16_t k;
|
412
|
+
ptr += copy_from_mem(ptr, k);
|
413
|
+
const uint8_t flags_byte = *ptr++;
|
414
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
415
|
+
const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
|
416
|
+
const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
|
417
|
+
if (preamble_longs != expected_preamble_longs) {
|
418
|
+
throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
|
419
|
+
}
|
420
|
+
ptr += 2; // unused
|
421
|
+
|
422
|
+
if (is_empty) return tdigest(k, allocator);
|
423
|
+
|
424
|
+
const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
|
425
|
+
if (is_single_value) {
|
426
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(T));
|
427
|
+
T value;
|
428
|
+
ptr += copy_from_mem(ptr, value);
|
429
|
+
return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
|
430
|
+
}
|
431
|
+
|
432
|
+
ensure_minimum_memory(end_ptr - ptr, 8);
|
433
|
+
uint32_t num_centroids;
|
434
|
+
ptr += copy_from_mem(ptr, num_centroids);
|
435
|
+
uint32_t num_buffered;
|
436
|
+
ptr += copy_from_mem(ptr, num_buffered);
|
437
|
+
|
438
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(T) * 2 + sizeof(centroid) * num_centroids + sizeof(T) * num_buffered);
|
439
|
+
T min;
|
440
|
+
ptr += copy_from_mem(ptr, min);
|
441
|
+
T max;
|
442
|
+
ptr += copy_from_mem(ptr, max);
|
443
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
444
|
+
if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid));
|
445
|
+
vector_t buffer(num_buffered, 0, allocator);
|
446
|
+
if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T));
|
447
|
+
uint64_t weight = 0;
|
448
|
+
for (const auto& c: centroids) weight += c.get_weight();
|
449
|
+
return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
|
450
|
+
}
|
451
|
+
|
452
|
+
// compatibility with the format of the reference implementation
|
453
|
+
// default byte order of ByteBuffer is used there, which is big endian
|
454
|
+
template<typename T, typename A>
|
455
|
+
tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& allocator) {
|
456
|
+
// this method was called because the first three bytes were zeros
|
457
|
+
// so read one more byte to see if it looks like the reference implementation format
|
458
|
+
const auto type = read<uint8_t>(is);
|
459
|
+
if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
|
460
|
+
throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
|
461
|
+
}
|
462
|
+
if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
|
463
|
+
const auto min = read_big_endian<double>(is);
|
464
|
+
const auto max = read_big_endian<double>(is);
|
465
|
+
const auto k = static_cast<uint16_t>(read_big_endian<double>(is));
|
466
|
+
const auto num_centroids = read_big_endian<uint32_t>(is);
|
467
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
468
|
+
uint64_t total_weight = 0;
|
469
|
+
for (auto& c: centroids) {
|
470
|
+
const W weight = static_cast<W>(read_big_endian<double>(is));
|
471
|
+
const auto mean = read_big_endian<double>(is);
|
472
|
+
c = centroid(mean, weight);
|
473
|
+
total_weight += weight;
|
474
|
+
}
|
475
|
+
return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
|
476
|
+
}
|
477
|
+
// COMPAT_FLOAT: compatibility with asSmallBytes()
|
478
|
+
const auto min = read_big_endian<double>(is); // reference implementation uses doubles for min and max
|
479
|
+
const auto max = read_big_endian<double>(is);
|
480
|
+
const auto k = static_cast<uint16_t>(read_big_endian<float>(is));
|
481
|
+
// reference implementation stores capacities of the array of centroids and the buffer as shorts
|
482
|
+
// they can be derived from k in the constructor
|
483
|
+
read<uint32_t>(is); // unused
|
484
|
+
const auto num_centroids = read_big_endian<uint16_t>(is);
|
485
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
486
|
+
uint64_t total_weight = 0;
|
487
|
+
for (auto& c: centroids) {
|
488
|
+
const W weight = static_cast<W>(read_big_endian<float>(is));
|
489
|
+
const auto mean = read_big_endian<float>(is);
|
490
|
+
c = centroid(mean, weight);
|
491
|
+
total_weight += weight;
|
492
|
+
}
|
493
|
+
return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
|
494
|
+
}
|
495
|
+
|
496
|
+
// compatibility with the format of the reference implementation
|
497
|
+
// default byte order of ByteBuffer is used there, which is big endian
|
498
|
+
template<typename T, typename A>
|
499
|
+
tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size, const A& allocator) {
|
500
|
+
const char* ptr = static_cast<const char*>(bytes);
|
501
|
+
// this method was called because the first three bytes were zeros
|
502
|
+
// so read one more byte to see if it looks like the reference implementation format
|
503
|
+
const auto type = *ptr++;
|
504
|
+
if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
|
505
|
+
throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
|
506
|
+
}
|
507
|
+
const char* end_ptr = static_cast<const char*>(bytes) + size;
|
508
|
+
if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
|
509
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 3 + sizeof(uint32_t));
|
510
|
+
double min;
|
511
|
+
ptr += copy_from_mem(ptr, min);
|
512
|
+
min = byteswap(min);
|
513
|
+
double max;
|
514
|
+
ptr += copy_from_mem(ptr, max);
|
515
|
+
max = byteswap(max);
|
516
|
+
double k_double;
|
517
|
+
ptr += copy_from_mem(ptr, k_double);
|
518
|
+
const uint16_t k = static_cast<uint16_t>(byteswap(k_double));
|
519
|
+
uint32_t num_centroids;
|
520
|
+
ptr += copy_from_mem(ptr, num_centroids);
|
521
|
+
num_centroids = byteswap(num_centroids);
|
522
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(double) * num_centroids * 2);
|
523
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
524
|
+
uint64_t total_weight = 0;
|
525
|
+
for (auto& c: centroids) {
|
526
|
+
double weight;
|
527
|
+
ptr += copy_from_mem(ptr, weight);
|
528
|
+
weight = byteswap(weight);
|
529
|
+
double mean;
|
530
|
+
ptr += copy_from_mem(ptr, mean);
|
531
|
+
mean = byteswap(mean);
|
532
|
+
c = centroid(mean, static_cast<W>(weight));
|
533
|
+
total_weight += static_cast<uint64_t>(weight);
|
534
|
+
}
|
535
|
+
return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
|
536
|
+
}
|
537
|
+
// COMPAT_FLOAT: compatibility with asSmallBytes()
|
538
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 2 + sizeof(float) + sizeof(uint16_t) * 3);
|
539
|
+
double min; // reference implementation uses doubles for min and max
|
540
|
+
ptr += copy_from_mem(ptr, min);
|
541
|
+
min = byteswap(min);
|
542
|
+
double max;
|
543
|
+
ptr += copy_from_mem(ptr, max);
|
544
|
+
max = byteswap(max);
|
545
|
+
float k_float;
|
546
|
+
ptr += copy_from_mem(ptr, k_float);
|
547
|
+
const uint16_t k = static_cast<uint16_t>(byteswap(k_float));
|
548
|
+
// reference implementation stores capacities of the array of centroids and the buffer as shorts
|
549
|
+
// they can be derived from k in the constructor
|
550
|
+
ptr += sizeof(uint32_t); // unused
|
551
|
+
uint16_t num_centroids;
|
552
|
+
ptr += copy_from_mem(ptr, num_centroids);
|
553
|
+
num_centroids = byteswap(num_centroids);
|
554
|
+
ensure_minimum_memory(end_ptr - ptr, sizeof(float) * num_centroids * 2);
|
555
|
+
vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
|
556
|
+
uint64_t total_weight = 0;
|
557
|
+
for (auto& c: centroids) {
|
558
|
+
float weight;
|
559
|
+
ptr += copy_from_mem(ptr, weight);
|
560
|
+
weight = byteswap(weight);
|
561
|
+
float mean;
|
562
|
+
ptr += copy_from_mem(ptr, mean);
|
563
|
+
mean = byteswap(mean);
|
564
|
+
c = centroid(mean, static_cast<W>(weight));
|
565
|
+
total_weight += static_cast<uint64_t>(weight);
|
566
|
+
}
|
567
|
+
return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
|
568
|
+
}
|
569
|
+
|
570
|
+
template<typename T, typename A>
|
571
|
+
bool tdigest<T, A>::is_single_value() const {
|
572
|
+
return get_total_weight() == 1;
|
573
|
+
}
|
574
|
+
|
575
|
+
template<typename T, typename A>
|
576
|
+
tdigest<T, A>::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer):
|
577
|
+
reverse_merge_(reverse_merge),
|
578
|
+
k_(k),
|
579
|
+
min_(min),
|
580
|
+
max_(max),
|
581
|
+
centroids_capacity_(0),
|
582
|
+
centroids_(std::move(centroids)),
|
583
|
+
centroids_weight_(weight),
|
584
|
+
buffer_(std::move(buffer))
|
585
|
+
{
|
586
|
+
if (k < 10) throw std::invalid_argument("k must be at least 10");
|
587
|
+
const size_t fudge = k < 30 ? 30 : 10;
|
588
|
+
centroids_capacity_ = 2 * k_ + fudge;
|
589
|
+
centroids_.reserve(centroids_capacity_);
|
590
|
+
buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
|
591
|
+
}
|
592
|
+
|
593
|
+
} /* namespace datasketches */
|
594
|
+
|
595
|
+
#endif // _TDIGEST_IMPL_HPP_
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
# Test executable for the tdigest module; its sources are attached below.
add_executable(tdigest_test)

# Sources that are always part of the test binary.
target_sources(tdigest_test
  PRIVATE
    tdigest_test.cpp
    tdigest_custom_allocator_test.cpp
)

# Additional source compiled only when SERDE_COMPAT is set.
if (SERDE_COMPAT)
  target_sources(tdigest_test
    PRIVATE
      tdigest_deserialize_from_java_test.cpp
  )
endif()

# Additional source compiled only when GENERATE is set.
if (GENERATE)
  target_sources(tdigest_test
    PRIVATE
      tdigest_serialize_for_java.cpp
  )
endif()

target_link_libraries(tdigest_test tdigest common common_test_lib)

set_target_properties(tdigest_test PROPERTIES
  CXX_STANDARD_REQUIRED YES
)

# Hand the directory of this CMakeLists.txt to the tests as
# TEST_BINARY_INPUT_PATH, in CMake path form and with a trailing slash.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" tdigest_TEST_BINARY_PATH)
string(APPEND tdigest_TEST_BINARY_PATH "/")
target_compile_definitions(tdigest_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${tdigest_TEST_BINARY_PATH}"
)

# Register the executable with CTest.
add_test(
  NAME tdigest_test
  COMMAND tdigest_test
)
|
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
|
22
|
+
#include "tdigest.hpp"
|
23
|
+
#include "test_allocator.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
using alloc_d = test_allocator<double>;
|
28
|
+
using tdigest_d = tdigest<double, alloc_d>;
|
29
|
+
|
30
|
+
TEST_CASE("tdigest custom allocator", "[tdigest]") {
|
31
|
+
test_allocator_total_bytes = 0;
|
32
|
+
test_allocator_net_allocations = 0;
|
33
|
+
{
|
34
|
+
tdigest_d td(100, alloc_d(0));
|
35
|
+
for (int i = 0; i < 10000; ++i) td.update(static_cast<double>(i));
|
36
|
+
REQUIRE(test_allocator_total_bytes != 0);
|
37
|
+
REQUIRE(test_allocator_net_allocations != 0);
|
38
|
+
}
|
39
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
40
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
41
|
+
}
|
42
|
+
|
43
|
+
} /* namespace datasketches */
|