datasketches 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -0,0 +1,595 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_IMPL_HPP_
21
+ #define _TDIGEST_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <sstream>
25
+
26
+ #include "common_defs.hpp"
27
+ #include "memory_operations.hpp"
28
+
29
+ namespace datasketches {
30
+
31
+ template<typename T, typename A>
32
+ tdigest<T, A>::tdigest(uint16_t k, const A& allocator):
33
+ tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(), vector_centroid(allocator), 0, vector_t(allocator))
34
+ {}
35
+
36
+ template<typename T, typename A>
37
+ void tdigest<T, A>::update(T value) {
38
+ if (std::isnan(value)) return;
39
+ if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress();
40
+ buffer_.push_back(value);
41
+ min_ = std::min(min_, value);
42
+ max_ = std::max(max_, value);
43
+ }
44
+
45
+ template<typename T, typename A>
46
+ void tdigest<T, A>::merge(tdigest& other) {
47
+ if (other.is_empty()) return;
48
+ vector_centroid tmp(buffer_.get_allocator());
49
+ tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
50
+ for (const T value: buffer_) tmp.push_back(centroid(value, 1));
51
+ for (const T value: other.buffer_) tmp.push_back(centroid(value, 1));
52
+ std::copy(other.centroids_.begin(), other.centroids_.end(), std::back_inserter(tmp));
53
+ merge(tmp, buffer_.size() + other.get_total_weight());
54
+ }
55
+
56
+ template<typename T, typename A>
57
+ void tdigest<T, A>::compress() {
58
+ if (buffer_.size() == 0) return;
59
+ vector_centroid tmp(buffer_.get_allocator());
60
+ tmp.reserve(buffer_.size() + centroids_.size());
61
+ for (const T value: buffer_) tmp.push_back(centroid(value, 1));
62
+ merge(tmp, buffer_.size());
63
+ }
64
+
65
+ template<typename T, typename A>
66
+ bool tdigest<T, A>::is_empty() const {
67
+ return centroids_.empty() && buffer_.empty();
68
+ }
69
+
70
+ template<typename T, typename A>
71
+ T tdigest<T, A>::get_min_value() const {
72
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
73
+ return min_;
74
+ }
75
+
76
+ template<typename T, typename A>
77
+ T tdigest<T, A>::get_max_value() const {
78
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
79
+ return max_;
80
+ }
81
+
82
+ template<typename T, typename A>
83
+ uint64_t tdigest<T, A>::get_total_weight() const {
84
+ return centroids_weight_ + buffer_.size();
85
+ }
86
+
87
+ template<typename T, typename A>
88
+ double tdigest<T, A>::get_rank(T value) const {
89
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
90
+ if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN");
91
+ if (value < min_) return 0;
92
+ if (value > max_) return 1;
93
+ // one centroid and value == min_ == max_
94
+ if ((centroids_.size() + buffer_.size()) == 1) return 0.5;
95
+
96
+ const_cast<tdigest*>(this)->compress(); // side effect
97
+
98
+ // left tail
99
+ const T first_mean = centroids_.front().get_mean();
100
+ if (value < first_mean) {
101
+ if (first_mean - min_ > 0) {
102
+ if (value == min_) return 0.5 / centroids_weight_;
103
+ return (1.0 + (value - min_) / (first_mean - min_) * (centroids_.front().get_weight() / 2.0 - 1.0)); // ?
104
+ }
105
+ return 0; // should never happen
106
+ }
107
+
108
+ // right tail
109
+ const T last_mean = centroids_.back().get_mean();
110
+ if (value > last_mean) {
111
+ if (max_ - last_mean > 0) {
112
+ if (value == max_) return 1.0 - 0.5 / centroids_weight_;
113
+ return 1.0 - ((1.0 + (max_ - value) / (max_ - last_mean) * (centroids_.back().get_weight() / 2.0 - 1.0)) / centroids_weight_); // ?
114
+ }
115
+ return 1; // should never happen
116
+ }
117
+
118
+ auto lower = std::lower_bound(centroids_.begin(), centroids_.end(), centroid(value, 1), centroid_cmp());
119
+ if (lower == centroids_.end()) throw std::logic_error("lower == end in get_rank()");
120
+ auto upper = std::upper_bound(lower, centroids_.end(), centroid(value, 1), centroid_cmp());
121
+ if (upper == centroids_.begin()) throw std::logic_error("upper == begin in get_rank()");
122
+ if (value < lower->get_mean()) --lower;
123
+ if (upper == centroids_.end() || !((upper - 1)->get_mean() < value)) --upper;
124
+ double weight_below = 0;
125
+ auto it = centroids_.begin();
126
+ while (it != lower) {
127
+ weight_below += it->get_weight();
128
+ ++it;
129
+ }
130
+ weight_below += lower->get_weight() / 2.0;
131
+ double weight_delta = 0;
132
+ while (it != upper) {
133
+ weight_delta += it->get_weight();
134
+ ++it;
135
+ }
136
+ weight_delta -= lower->get_weight() / 2.0;
137
+ weight_delta += upper->get_weight() / 2.0;
138
+ if (upper->get_mean() - lower->get_mean() > 0) {
139
+ return (weight_below + weight_delta * (value - lower->get_mean()) / (upper->get_mean() - lower->get_mean())) / centroids_weight_;
140
+ }
141
+ return (weight_below + weight_delta / 2.0) / centroids_weight_;
142
+ }
143
+
144
+ template<typename T, typename A>
145
+ T tdigest<T, A>::get_quantile(double rank) const {
146
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
147
+ if ((rank < 0.0) || (rank > 1.0)) {
148
+ throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
149
+ }
150
+ const_cast<tdigest*>(this)->compress(); // side effect
151
+ if (centroids_.size() == 1) return centroids_.front().get_mean();
152
+
153
+ // at least 2 centroids
154
+ const double weight = rank * centroids_weight_;
155
+ if (weight < 1) return min_;
156
+ if (weight > centroids_weight_ - 1.0) return max_;
157
+ const double first_weight = centroids_.front().get_weight();
158
+ if (first_weight > 1 && weight < first_weight / 2.0) {
159
+ return min_ + (weight - 1.0) / (first_weight / 2.0 - 1.0) * (centroids_.front().get_mean() - min_);
160
+ }
161
+ const double last_weight = centroids_.back().get_weight();
162
+ if (last_weight > 1 && centroids_weight_ - weight <= last_weight / 2.0) {
163
+ return max_ + (centroids_weight_ - weight - 1.0) / (last_weight / 2.0 - 1.0) * (max_ - centroids_.back().get_mean());
164
+ }
165
+
166
+ // interpolate between extremes
167
+ double weight_so_far = first_weight / 2.0;
168
+ for (size_t i = 0; i < centroids_.size() - 1; ++i) {
169
+ const double dw = (centroids_[i].get_weight() + centroids_[i + 1].get_weight()) / 2.0;
170
+ if (weight_so_far + dw > weight) {
171
+ // the target weight is between centroids i and i+1
172
+ double left_weight = 0;
173
+ if (centroids_[i].get_weight() == 1) {
174
+ if (weight - weight_so_far < 0.5) return centroids_[i].get_mean();
175
+ left_weight = 0.5;
176
+ }
177
+ double right_weight = 0;
178
+ if (centroids_[i + 1].get_weight() == 1) {
179
+ if (weight_so_far + dw - weight <= 0.5) return centroids_[i + 1].get_mean();
180
+ right_weight = 0.5;
181
+ }
182
+ const double w1 = weight - weight_so_far - left_weight;
183
+ const double w2 = weight_so_far + dw - weight - right_weight;
184
+ return weighted_average(centroids_[i].get_mean(), w1, centroids_[i + 1].get_mean(), w2);
185
+ }
186
+ weight_so_far += dw;
187
+ }
188
+ const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0;
189
+ const double w2 = centroids_.back().get_weight() / 2.0 - w1;
190
+ return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
191
+ }
192
+
193
+ template<typename T, typename A>
194
+ uint16_t tdigest<T, A>::get_k() const {
195
+ return k_;
196
+ }
197
+
198
+ template<typename T, typename A>
199
+ string<A> tdigest<T, A>::to_string(bool print_centroids) const {
200
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
201
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
202
+ std::ostringstream os;
203
+ os << "### t-Digest summary:" << std::endl;
204
+ os << " Nominal k : " << k_ << std::endl;
205
+ os << " Centroids : " << centroids_.size() << std::endl;
206
+ os << " Buffered : " << buffer_.size() << std::endl;
207
+ os << " Centroids capacity : " << centroids_capacity_ << std::endl;
208
+ os << " Buffer capacity : " << centroids_capacity_ * BUFFER_MULTIPLIER << std::endl;
209
+ os << " Centroids Weight : " << centroids_weight_ << std::endl;
210
+ os << " Total Weight : " << get_total_weight() << std::endl;
211
+ os << " Reverse Merge : " << (reverse_merge_ ? "true" : "false") << std::endl;
212
+ if (!is_empty()) {
213
+ os << " Min : " << min_ << std::endl;
214
+ os << " Max : " << max_ << std::endl;
215
+ }
216
+ os << "### End t-Digest summary" << std::endl;
217
+ if (print_centroids) {
218
+ if (centroids_.size() > 0) {
219
+ os << "Centroids:" << std::endl;
220
+ int i = 0;
221
+ for (const auto& c: centroids_) {
222
+ os << i++ << ": " << c.get_mean() << ", " << c.get_weight() << std::endl;
223
+ }
224
+ }
225
+ if (buffer_.size() > 0) {
226
+ os << "Buffer:" << std::endl;
227
+ int i = 0;
228
+ for (const T value: buffer_) {
229
+ os << i++ << ": " << value << std::endl;
230
+ }
231
+ }
232
+ }
233
+ return string<A>(os.str().c_str(), buffer_.get_allocator());
234
+ }
235
+
236
+ // assumes that there is enough room in the input buffer to add centroids from this tdigest
237
+ template<typename T, typename A>
238
+ void tdigest<T, A>::merge(vector_centroid& buffer, W weight) {
239
+ std::copy(centroids_.begin(), centroids_.end(), std::back_inserter(buffer));
240
+ centroids_.clear();
241
+ std::stable_sort(buffer.begin(), buffer.end(), centroid_cmp());
242
+ if (reverse_merge_) std::reverse(buffer.begin(), buffer.end());
243
+ centroids_weight_ += weight;
244
+ auto it = buffer.begin();
245
+ centroids_.push_back(*it);
246
+ ++it;
247
+ double weight_so_far = 0;
248
+ while (it != buffer.end()) {
249
+ const double proposed_weight = centroids_.back().get_weight() + it->get_weight();
250
+ bool add_this = false;
251
+ if (std::distance(buffer.begin(), it) != 1 && std::distance(buffer.end(), it) != 1) {
252
+ const double q0 = weight_so_far / centroids_weight_;
253
+ const double q2 = (weight_so_far + proposed_weight) / centroids_weight_;
254
+ const double normalizer = scale_function().normalizer(2 * k_, centroids_weight_);
255
+ add_this = proposed_weight <= centroids_weight_ * std::min(scale_function().max(q0, normalizer), scale_function().max(q2, normalizer));
256
+ }
257
+ if (add_this) {
258
+ centroids_.back().add(*it);
259
+ } else {
260
+ weight_so_far += centroids_.back().get_weight();
261
+ centroids_.push_back(*it);
262
+ }
263
+ ++it;
264
+ }
265
+ if (reverse_merge_) std::reverse(centroids_.begin(), centroids_.end());
266
+ min_ = std::min(min_, centroids_.front().get_mean());
267
+ max_ = std::max(max_, centroids_.back().get_mean());
268
+ reverse_merge_ = !reverse_merge_;
269
+ buffer_.clear();
270
+ }
271
+
272
+ template<typename T, typename A>
273
+ double tdigest<T, A>::weighted_average(double x1, double w1, double x2, double w2) {
274
+ return (x1 * w1 + x2 * w2) / (w1 + w2);
275
+ }
276
+
277
+ template<typename T, typename A>
278
+ void tdigest<T, A>::serialize(std::ostream& os, bool with_buffer) const {
279
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
280
+ write(os, get_preamble_longs());
281
+ write(os, SERIAL_VERSION);
282
+ write(os, SKETCH_TYPE);
283
+ write(os, k_);
284
+ const uint8_t flags_byte(
285
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
286
+ | (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
287
+ | (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
288
+ );
289
+ write(os, flags_byte);
290
+ write<uint16_t>(os, 0); // unused
291
+ if (is_empty()) return;
292
+ if (is_single_value()) {
293
+ write(os, min_);
294
+ return;
295
+ }
296
+ write(os, static_cast<uint32_t>(centroids_.size()));
297
+ write(os, static_cast<uint32_t>(buffer_.size()));
298
+ write(os, min_);
299
+ write(os, max_);
300
+ if (centroids_.size() > 0) write(os, centroids_.data(), centroids_.size() * sizeof(centroid));
301
+ if (buffer_.size() > 0) write(os, buffer_.data(), buffer_.size() * sizeof(T));
302
+ }
303
+
304
+ template<typename T, typename A>
305
+ uint8_t tdigest<T, A>::get_preamble_longs() const {
306
+ return is_empty() || is_single_value() ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
307
+ }
308
+
309
+ template<typename T, typename A>
310
+ size_t tdigest<T, A>::get_serialized_size_bytes(bool with_buffer) const {
311
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
312
+ size_t size_bytes = get_preamble_longs() * sizeof(uint64_t);
313
+ if (is_empty()) return size_bytes;
314
+ if (is_single_value()) return size_bytes + sizeof(T);
315
+ size_bytes += sizeof(T) * 2 // min and max
316
+ + sizeof(centroid) * centroids_.size();
317
+ if (with_buffer) size_bytes += sizeof(T) * buffer_.size(); // count is a part of preamble
318
+ return size_bytes;
319
+ }
320
+
321
+ template<typename T, typename A>
322
+ auto tdigest<T, A>::serialize(unsigned header_size_bytes, bool with_buffer) const -> vector_bytes {
323
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
324
+ vector_bytes bytes(get_serialized_size_bytes(with_buffer), 0, buffer_.get_allocator());
325
+ uint8_t* ptr = bytes.data() + header_size_bytes;
326
+ *ptr++ = get_preamble_longs();
327
+ *ptr++ = SERIAL_VERSION;
328
+ *ptr++ = SKETCH_TYPE;
329
+ ptr += copy_to_mem(k_, ptr);
330
+ const uint8_t flags_byte(
331
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
332
+ | (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
333
+ | (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
334
+ );
335
+ *ptr++ = flags_byte;
336
+ ptr += 2; // unused
337
+ if (is_empty()) return bytes;
338
+ if (is_single_value()) {
339
+ copy_to_mem(min_, ptr);
340
+ return bytes;
341
+ }
342
+ ptr += copy_to_mem(static_cast<uint32_t>(centroids_.size()), ptr);
343
+ ptr += copy_to_mem(static_cast<uint32_t>(buffer_.size()), ptr);
344
+ ptr += copy_to_mem(min_, ptr);
345
+ ptr += copy_to_mem(max_, ptr);
346
+ if (centroids_.size() > 0) ptr += copy_to_mem(centroids_.data(), ptr, centroids_.size() * sizeof(centroid));
347
+ if (buffer_.size() > 0) copy_to_mem(buffer_.data(), ptr, buffer_.size() * sizeof(T));
348
+ return bytes;
349
+ }
350
+
351
+ template<typename T, typename A>
352
+ tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
353
+ const auto preamble_longs = read<uint8_t>(is);
354
+ const auto serial_version = read<uint8_t>(is);
355
+ const auto sketch_type = read<uint8_t>(is);
356
+ if (sketch_type != SKETCH_TYPE) {
357
+ if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(is, allocator);
358
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
359
+ }
360
+ if (serial_version != SERIAL_VERSION) {
361
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
362
+ }
363
+ const auto k = read<uint16_t>(is);
364
+ const auto flags_byte = read<uint8_t>(is);
365
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
366
+ const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
367
+ const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
368
+ if (preamble_longs != expected_preamble_longs) {
369
+ throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
370
+ }
371
+ read<uint16_t>(is); // unused
372
+
373
+ if (is_empty) return tdigest(k, allocator);
374
+
375
+ const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
376
+ if (is_single_value) {
377
+ const T value = read<T>(is);
378
+ return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
379
+ }
380
+
381
+ const auto num_centroids = read<uint32_t>(is);
382
+ const auto num_buffered = read<uint32_t>(is);
383
+
384
+ const T min = read<T>(is);
385
+ const T max = read<T>(is);
386
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
387
+ if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid));
388
+ vector_t buffer(num_buffered, 0, allocator);
389
+ if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T));
390
+ uint64_t weight = 0;
391
+ for (const auto& c: centroids) weight += c.get_weight();
392
+ return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
393
+ }
394
+
395
+ template<typename T, typename A>
396
+ tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
397
+ ensure_minimum_memory(size, 8);
398
+ const char* ptr = static_cast<const char*>(bytes);
399
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
400
+
401
+ const uint8_t preamble_longs = *ptr++;
402
+ const uint8_t serial_version = *ptr++;
403
+ const uint8_t sketch_type = *ptr++;
404
+ if (sketch_type != SKETCH_TYPE) {
405
+ if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(ptr, end_ptr - ptr, allocator);
406
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
407
+ }
408
+ if (serial_version != SERIAL_VERSION) {
409
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
410
+ }
411
+ uint16_t k;
412
+ ptr += copy_from_mem(ptr, k);
413
+ const uint8_t flags_byte = *ptr++;
414
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
415
+ const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
416
+ const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
417
+ if (preamble_longs != expected_preamble_longs) {
418
+ throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
419
+ }
420
+ ptr += 2; // unused
421
+
422
+ if (is_empty) return tdigest(k, allocator);
423
+
424
+ const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
425
+ if (is_single_value) {
426
+ ensure_minimum_memory(end_ptr - ptr, sizeof(T));
427
+ T value;
428
+ ptr += copy_from_mem(ptr, value);
429
+ return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
430
+ }
431
+
432
+ ensure_minimum_memory(end_ptr - ptr, 8);
433
+ uint32_t num_centroids;
434
+ ptr += copy_from_mem(ptr, num_centroids);
435
+ uint32_t num_buffered;
436
+ ptr += copy_from_mem(ptr, num_buffered);
437
+
438
+ ensure_minimum_memory(end_ptr - ptr, sizeof(T) * 2 + sizeof(centroid) * num_centroids + sizeof(T) * num_buffered);
439
+ T min;
440
+ ptr += copy_from_mem(ptr, min);
441
+ T max;
442
+ ptr += copy_from_mem(ptr, max);
443
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
444
+ if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid));
445
+ vector_t buffer(num_buffered, 0, allocator);
446
+ if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T));
447
+ uint64_t weight = 0;
448
+ for (const auto& c: centroids) weight += c.get_weight();
449
+ return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
450
+ }
451
+
452
+ // compatibility with the format of the reference implementation
453
+ // default byte order of ByteBuffer is used there, which is big endian
454
+ template<typename T, typename A>
455
+ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& allocator) {
456
+ // this method was called because the first three bytes were zeros
457
+ // so read one more byte to see if it looks like the reference implementation format
458
+ const auto type = read<uint8_t>(is);
459
+ if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
460
+ throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
461
+ }
462
+ if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
463
+ const auto min = read_big_endian<double>(is);
464
+ const auto max = read_big_endian<double>(is);
465
+ const auto k = static_cast<uint16_t>(read_big_endian<double>(is));
466
+ const auto num_centroids = read_big_endian<uint32_t>(is);
467
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
468
+ uint64_t total_weight = 0;
469
+ for (auto& c: centroids) {
470
+ const W weight = static_cast<W>(read_big_endian<double>(is));
471
+ const auto mean = read_big_endian<double>(is);
472
+ c = centroid(mean, weight);
473
+ total_weight += weight;
474
+ }
475
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
476
+ }
477
+ // COMPAT_FLOAT: compatibility with asSmallBytes()
478
+ const auto min = read_big_endian<double>(is); // reference implementation uses doubles for min and max
479
+ const auto max = read_big_endian<double>(is);
480
+ const auto k = static_cast<uint16_t>(read_big_endian<float>(is));
481
+ // reference implementation stores capacities of the array of centroids and the buffer as shorts
482
+ // they can be derived from k in the constructor
483
+ read<uint32_t>(is); // unused
484
+ const auto num_centroids = read_big_endian<uint16_t>(is);
485
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
486
+ uint64_t total_weight = 0;
487
+ for (auto& c: centroids) {
488
+ const W weight = static_cast<W>(read_big_endian<float>(is));
489
+ const auto mean = read_big_endian<float>(is);
490
+ c = centroid(mean, weight);
491
+ total_weight += weight;
492
+ }
493
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
494
+ }
495
+
496
+ // compatibility with the format of the reference implementation
497
+ // default byte order of ByteBuffer is used there, which is big endian
498
+ template<typename T, typename A>
499
+ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size, const A& allocator) {
500
+ const char* ptr = static_cast<const char*>(bytes);
501
+ // this method was called because the first three bytes were zeros
502
+ // so read one more byte to see if it looks like the reference implementation format
503
+ const auto type = *ptr++;
504
+ if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
505
+ throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
506
+ }
507
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
508
+ if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
509
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 3 + sizeof(uint32_t));
510
+ double min;
511
+ ptr += copy_from_mem(ptr, min);
512
+ min = byteswap(min);
513
+ double max;
514
+ ptr += copy_from_mem(ptr, max);
515
+ max = byteswap(max);
516
+ double k_double;
517
+ ptr += copy_from_mem(ptr, k_double);
518
+ const uint16_t k = static_cast<uint16_t>(byteswap(k_double));
519
+ uint32_t num_centroids;
520
+ ptr += copy_from_mem(ptr, num_centroids);
521
+ num_centroids = byteswap(num_centroids);
522
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * num_centroids * 2);
523
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
524
+ uint64_t total_weight = 0;
525
+ for (auto& c: centroids) {
526
+ double weight;
527
+ ptr += copy_from_mem(ptr, weight);
528
+ weight = byteswap(weight);
529
+ double mean;
530
+ ptr += copy_from_mem(ptr, mean);
531
+ mean = byteswap(mean);
532
+ c = centroid(mean, static_cast<W>(weight));
533
+ total_weight += static_cast<uint64_t>(weight);
534
+ }
535
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
536
+ }
537
+ // COMPAT_FLOAT: compatibility with asSmallBytes()
538
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 2 + sizeof(float) + sizeof(uint16_t) * 3);
539
+ double min; // reference implementation uses doubles for min and max
540
+ ptr += copy_from_mem(ptr, min);
541
+ min = byteswap(min);
542
+ double max;
543
+ ptr += copy_from_mem(ptr, max);
544
+ max = byteswap(max);
545
+ float k_float;
546
+ ptr += copy_from_mem(ptr, k_float);
547
+ const uint16_t k = static_cast<uint16_t>(byteswap(k_float));
548
+ // reference implementation stores capacities of the array of centroids and the buffer as shorts
549
+ // they can be derived from k in the constructor
550
+ ptr += sizeof(uint32_t); // unused
551
+ uint16_t num_centroids;
552
+ ptr += copy_from_mem(ptr, num_centroids);
553
+ num_centroids = byteswap(num_centroids);
554
+ ensure_minimum_memory(end_ptr - ptr, sizeof(float) * num_centroids * 2);
555
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
556
+ uint64_t total_weight = 0;
557
+ for (auto& c: centroids) {
558
+ float weight;
559
+ ptr += copy_from_mem(ptr, weight);
560
+ weight = byteswap(weight);
561
+ float mean;
562
+ ptr += copy_from_mem(ptr, mean);
563
+ mean = byteswap(mean);
564
+ c = centroid(mean, static_cast<W>(weight));
565
+ total_weight += static_cast<uint64_t>(weight);
566
+ }
567
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
568
+ }
569
+
570
+ template<typename T, typename A>
571
+ bool tdigest<T, A>::is_single_value() const {
572
+ return get_total_weight() == 1;
573
+ }
574
+
575
+ template<typename T, typename A>
576
+ tdigest<T, A>::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer):
577
+ reverse_merge_(reverse_merge),
578
+ k_(k),
579
+ min_(min),
580
+ max_(max),
581
+ centroids_capacity_(0),
582
+ centroids_(std::move(centroids)),
583
+ centroids_weight_(weight),
584
+ buffer_(std::move(buffer))
585
+ {
586
+ if (k < 10) throw std::invalid_argument("k must be at least 10");
587
+ const size_t fudge = k < 30 ? 30 : 10;
588
+ centroids_capacity_ = 2 * k_ + fudge;
589
+ centroids_.reserve(centroids_capacity_);
590
+ buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
591
+ }
592
+
593
+ } /* namespace datasketches */
594
+
595
+ #endif // _TDIGEST_IMPL_HPP_
@@ -0,0 +1,56 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Test executable for the tdigest module; its sources are attached below.
add_executable(tdigest_test)

# Sources that are always part of the test executable.
target_sources(tdigest_test
  PRIVATE
    tdigest_test.cpp
    tdigest_custom_allocator_test.cpp
)

# Adds the Java deserialization test source only when SERDE_COMPAT is set.
if (SERDE_COMPAT)
  target_sources(tdigest_test
    PRIVATE
      tdigest_deserialize_from_java_test.cpp
  )
endif()

# Adds the source that serializes sketches for Java only when GENERATE is set.
if (GENERATE)
  target_sources(tdigest_test
    PRIVATE
      tdigest_serialize_for_java.cpp
  )
endif()

target_link_libraries(tdigest_test tdigest common common_test_lib)

set_target_properties(tdigest_test PROPERTIES
  CXX_STANDARD_REQUIRED YES
)

# Pass this source directory (with a trailing slash) to the tests as TEST_BINARY_INPUT_PATH.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" tdigest_TEST_BINARY_PATH)
string(APPEND tdigest_TEST_BINARY_PATH "/")
target_compile_definitions(tdigest_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${tdigest_TEST_BINARY_PATH}"
)

add_test(
  NAME tdigest_test
  COMMAND tdigest_test
)
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "tdigest.hpp"
23
+ #include "test_allocator.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ using alloc_d = test_allocator<double>;
28
+ using tdigest_d = tdigest<double, alloc_d>;
29
+
30
+ TEST_CASE("tdigest custom allocator", "[tdigest]") {
31
+ test_allocator_total_bytes = 0;
32
+ test_allocator_net_allocations = 0;
33
+ {
34
+ tdigest_d td(100, alloc_d(0));
35
+ for (int i = 0; i < 10000; ++i) td.update(static_cast<double>(i));
36
+ REQUIRE(test_allocator_total_bytes != 0);
37
+ REQUIRE(test_allocator_net_allocations != 0);
38
+ }
39
+ REQUIRE(test_allocator_total_bytes == 0);
40
+ REQUIRE(test_allocator_net_allocations == 0);
41
+ }
42
+
43
+ } /* namespace datasketches */