datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,632 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_IMPL_HPP_
21
+ #define _TDIGEST_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <cmath>
25
+ #include <sstream>
26
+
27
+ #include "common_defs.hpp"
28
+ #include "memory_operations.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename T, typename A>
33
+ tdigest<T, A>::tdigest(uint16_t k, const A& allocator):
34
+ tdigest(false, k, std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(), vector_centroid(allocator), 0, vector_t(allocator))
35
+ {}
36
+
37
+ template<typename T, typename A>
38
+ void tdigest<T, A>::update(T value) {
39
+ if (std::isnan(value)) return;
40
+ if (buffer_.size() == centroids_capacity_ * BUFFER_MULTIPLIER) compress();
41
+ buffer_.push_back(value);
42
+ min_ = std::min(min_, value);
43
+ max_ = std::max(max_, value);
44
+ }
45
+
46
+ template<typename T, typename A>
47
+ void tdigest<T, A>::merge(const tdigest& other) {
48
+ if (other.is_empty()) return;
49
+ vector_centroid tmp(buffer_.get_allocator());
50
+ tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
51
+ for (const T value: buffer_) tmp.push_back(centroid(value, 1));
52
+ for (const T value: other.buffer_) tmp.push_back(centroid(value, 1));
53
+ std::copy(other.centroids_.begin(), other.centroids_.end(), std::back_inserter(tmp));
54
+ merge(tmp, buffer_.size() + other.get_total_weight());
55
+ }
56
+
57
+ template<typename T, typename A>
58
+ void tdigest<T, A>::compress() {
59
+ if (buffer_.size() == 0) return;
60
+ vector_centroid tmp(buffer_.get_allocator());
61
+ tmp.reserve(buffer_.size() + centroids_.size());
62
+ for (const T value: buffer_) tmp.push_back(centroid(value, 1));
63
+ merge(tmp, buffer_.size());
64
+ }
65
+
66
+ template<typename T, typename A>
67
+ bool tdigest<T, A>::is_empty() const {
68
+ return centroids_.empty() && buffer_.empty();
69
+ }
70
+
71
+ template<typename T, typename A>
72
+ T tdigest<T, A>::get_min_value() const {
73
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
74
+ return min_;
75
+ }
76
+
77
+ template<typename T, typename A>
78
+ T tdigest<T, A>::get_max_value() const {
79
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
80
+ return max_;
81
+ }
82
+
83
+ template<typename T, typename A>
84
+ uint64_t tdigest<T, A>::get_total_weight() const {
85
+ return centroids_weight_ + buffer_.size();
86
+ }
87
+
88
+ template<typename T, typename A>
89
+ A tdigest<T, A>::get_allocator() const {
90
+ return buffer_.get_allocator();
91
+ }
92
+
93
+ template<typename T, typename A>
94
+ double tdigest<T, A>::get_rank(T value) const {
95
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
96
+ if (std::isnan(value)) throw std::invalid_argument("operation is undefined for NaN");
97
+ if (value < min_) return 0;
98
+ if (value > max_) return 1;
99
+ // one centroid and value == min_ == max_
100
+ if ((centroids_.size() + buffer_.size()) == 1) return 0.5;
101
+
102
+ const_cast<tdigest*>(this)->compress(); // side effect
103
+
104
+ // left tail
105
+ const T first_mean = centroids_.front().get_mean();
106
+ if (value < first_mean) {
107
+ if (first_mean - min_ > 0) {
108
+ if (value == min_) return 0.5 / centroids_weight_;
109
+ return (1.0 + (value - min_) / (first_mean - min_) * (centroids_.front().get_weight() / 2.0 - 1.0)); // ?
110
+ }
111
+ return 0; // should never happen
112
+ }
113
+
114
+ // right tail
115
+ const T last_mean = centroids_.back().get_mean();
116
+ if (value > last_mean) {
117
+ if (max_ - last_mean > 0) {
118
+ if (value == max_) return 1.0 - 0.5 / centroids_weight_;
119
+ return 1.0 - ((1.0 + (max_ - value) / (max_ - last_mean) * (centroids_.back().get_weight() / 2.0 - 1.0)) / centroids_weight_); // ?
120
+ }
121
+ return 1; // should never happen
122
+ }
123
+
124
+ auto lower = std::lower_bound(centroids_.begin(), centroids_.end(), centroid(value, 1), centroid_cmp());
125
+ if (lower == centroids_.end()) throw std::logic_error("lower == end in get_rank()");
126
+ auto upper = std::upper_bound(lower, centroids_.end(), centroid(value, 1), centroid_cmp());
127
+ if (upper == centroids_.begin()) throw std::logic_error("upper == begin in get_rank()");
128
+ if (value < lower->get_mean()) --lower;
129
+ if (upper == centroids_.end() || !((upper - 1)->get_mean() < value)) --upper;
130
+ double weight_below = 0;
131
+ auto it = centroids_.begin();
132
+ while (it != lower) {
133
+ weight_below += it->get_weight();
134
+ ++it;
135
+ }
136
+ weight_below += lower->get_weight() / 2.0;
137
+ double weight_delta = 0;
138
+ while (it != upper) {
139
+ weight_delta += it->get_weight();
140
+ ++it;
141
+ }
142
+ weight_delta -= lower->get_weight() / 2.0;
143
+ weight_delta += upper->get_weight() / 2.0;
144
+ if (upper->get_mean() - lower->get_mean() > 0) {
145
+ return (weight_below + weight_delta * (value - lower->get_mean()) / (upper->get_mean() - lower->get_mean())) / centroids_weight_;
146
+ }
147
+ return (weight_below + weight_delta / 2.0) / centroids_weight_;
148
+ }
149
+
150
+ template<typename T, typename A>
151
+ T tdigest<T, A>::get_quantile(double rank) const {
152
+ if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
153
+ if ((rank < 0.0) || (rank > 1.0)) {
154
+ throw std::invalid_argument("Normalized rank cannot be less than 0 or greater than 1");
155
+ }
156
+ const_cast<tdigest*>(this)->compress(); // side effect
157
+ if (centroids_.size() == 1) return centroids_.front().get_mean();
158
+
159
+ // at least 2 centroids
160
+ const double weight = rank * centroids_weight_;
161
+ if (weight < 1) return min_;
162
+ if (weight > centroids_weight_ - 1.0) return max_;
163
+ const double first_weight = centroids_.front().get_weight();
164
+ if (first_weight > 1 && weight < first_weight / 2.0) {
165
+ return min_ + (weight - 1.0) / (first_weight / 2.0 - 1.0) * (centroids_.front().get_mean() - min_);
166
+ }
167
+ const double last_weight = centroids_.back().get_weight();
168
+ if (last_weight > 1 && centroids_weight_ - weight <= last_weight / 2.0) {
169
+ return max_ + (centroids_weight_ - weight - 1.0) / (last_weight / 2.0 - 1.0) * (max_ - centroids_.back().get_mean());
170
+ }
171
+
172
+ // interpolate between extremes
173
+ double weight_so_far = first_weight / 2.0;
174
+ for (size_t i = 0; i < centroids_.size() - 1; ++i) {
175
+ const double dw = (centroids_[i].get_weight() + centroids_[i + 1].get_weight()) / 2.0;
176
+ if (weight_so_far + dw > weight) {
177
+ // the target weight is between centroids i and i+1
178
+ double left_weight = 0;
179
+ if (centroids_[i].get_weight() == 1) {
180
+ if (weight - weight_so_far < 0.5) return centroids_[i].get_mean();
181
+ left_weight = 0.5;
182
+ }
183
+ double right_weight = 0;
184
+ if (centroids_[i + 1].get_weight() == 1) {
185
+ if (weight_so_far + dw - weight <= 0.5) return centroids_[i + 1].get_mean();
186
+ right_weight = 0.5;
187
+ }
188
+ const double w1 = weight - weight_so_far - left_weight;
189
+ const double w2 = weight_so_far + dw - weight - right_weight;
190
+ return weighted_average(centroids_[i].get_mean(), w1, centroids_[i + 1].get_mean(), w2);
191
+ }
192
+ weight_so_far += dw;
193
+ }
194
+ const double w1 = weight - centroids_weight_ - centroids_.back().get_weight() / 2.0;
195
+ const double w2 = centroids_.back().get_weight() / 2.0 - w1;
196
+ return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
197
+ }
198
+
199
+ template<typename T, typename A>
200
+ auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
201
+ auto buckets = get_CDF(split_points, size);
202
+ for (uint32_t i = size; i > 0; --i) {
203
+ buckets[i] -= buckets[i - 1];
204
+ }
205
+ return buckets;
206
+ }
207
+
208
+ template<typename T, typename A>
209
+ auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
210
+ check_split_points(split_points, size);
211
+ vector_double ranks(get_allocator());
212
+ ranks.reserve(size + 1);
213
+ for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
214
+ ranks.push_back(1);
215
+ return ranks;
216
+ }
217
+
218
+ template<typename T, typename A>
219
+ uint16_t tdigest<T, A>::get_k() const {
220
+ return k_;
221
+ }
222
+
223
+ template<typename T, typename A>
224
+ string<A> tdigest<T, A>::to_string(bool print_centroids) const {
225
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
226
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
227
+ std::ostringstream os;
228
+ os << "### t-Digest summary:" << std::endl;
229
+ os << " Nominal k : " << k_ << std::endl;
230
+ os << " Centroids : " << centroids_.size() << std::endl;
231
+ os << " Buffered : " << buffer_.size() << std::endl;
232
+ os << " Centroids capacity : " << centroids_capacity_ << std::endl;
233
+ os << " Buffer capacity : " << centroids_capacity_ * BUFFER_MULTIPLIER << std::endl;
234
+ os << " Centroids Weight : " << centroids_weight_ << std::endl;
235
+ os << " Total Weight : " << get_total_weight() << std::endl;
236
+ os << " Reverse Merge : " << (reverse_merge_ ? "true" : "false") << std::endl;
237
+ if (!is_empty()) {
238
+ os << " Min : " << min_ << std::endl;
239
+ os << " Max : " << max_ << std::endl;
240
+ }
241
+ os << "### End t-Digest summary" << std::endl;
242
+ if (print_centroids) {
243
+ if (centroids_.size() > 0) {
244
+ os << "Centroids:" << std::endl;
245
+ int i = 0;
246
+ for (const auto& c: centroids_) {
247
+ os << i++ << ": " << c.get_mean() << ", " << c.get_weight() << std::endl;
248
+ }
249
+ }
250
+ if (buffer_.size() > 0) {
251
+ os << "Buffer:" << std::endl;
252
+ int i = 0;
253
+ for (const T value: buffer_) {
254
+ os << i++ << ": " << value << std::endl;
255
+ }
256
+ }
257
+ }
258
+ return string<A>(os.str().c_str(), buffer_.get_allocator());
259
+ }
260
+
261
+ // assumes that there is enough room in the input buffer to add centroids from this tdigest
262
+ template<typename T, typename A>
263
+ void tdigest<T, A>::merge(vector_centroid& buffer, W weight) {
264
+ std::copy(centroids_.begin(), centroids_.end(), std::back_inserter(buffer));
265
+ centroids_.clear();
266
+ std::stable_sort(buffer.begin(), buffer.end(), centroid_cmp());
267
+ if (reverse_merge_) std::reverse(buffer.begin(), buffer.end());
268
+ centroids_weight_ += weight;
269
+ auto it = buffer.begin();
270
+ centroids_.push_back(*it);
271
+ ++it;
272
+ double weight_so_far = 0;
273
+ while (it != buffer.end()) {
274
+ const double proposed_weight = centroids_.back().get_weight() + it->get_weight();
275
+ bool add_this = false;
276
+ if (std::distance(buffer.begin(), it) != 1 && std::distance(buffer.end(), it) != 1) {
277
+ const double q0 = weight_so_far / centroids_weight_;
278
+ const double q2 = (weight_so_far + proposed_weight) / centroids_weight_;
279
+ const double normalizer = scale_function().normalizer(2 * k_, centroids_weight_);
280
+ add_this = proposed_weight <= centroids_weight_ * std::min(scale_function().max(q0, normalizer), scale_function().max(q2, normalizer));
281
+ }
282
+ if (add_this) {
283
+ centroids_.back().add(*it);
284
+ } else {
285
+ weight_so_far += centroids_.back().get_weight();
286
+ centroids_.push_back(*it);
287
+ }
288
+ ++it;
289
+ }
290
+ if (reverse_merge_) std::reverse(centroids_.begin(), centroids_.end());
291
+ min_ = std::min(min_, centroids_.front().get_mean());
292
+ max_ = std::max(max_, centroids_.back().get_mean());
293
+ reverse_merge_ = !reverse_merge_;
294
+ buffer_.clear();
295
+ }
296
+
297
+ template<typename T, typename A>
298
+ double tdigest<T, A>::weighted_average(double x1, double w1, double x2, double w2) {
299
+ return (x1 * w1 + x2 * w2) / (w1 + w2);
300
+ }
301
+
302
+ template<typename T, typename A>
303
+ void tdigest<T, A>::serialize(std::ostream& os, bool with_buffer) const {
304
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
305
+ write(os, get_preamble_longs());
306
+ write(os, SERIAL_VERSION);
307
+ write(os, SKETCH_TYPE);
308
+ write(os, k_);
309
+ const uint8_t flags_byte(
310
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
311
+ | (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
312
+ | (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
313
+ );
314
+ write(os, flags_byte);
315
+ write<uint16_t>(os, 0); // unused
316
+ if (is_empty()) return;
317
+ if (is_single_value()) {
318
+ write(os, min_);
319
+ return;
320
+ }
321
+ write(os, static_cast<uint32_t>(centroids_.size()));
322
+ write(os, static_cast<uint32_t>(buffer_.size()));
323
+ write(os, min_);
324
+ write(os, max_);
325
+ if (centroids_.size() > 0) write(os, centroids_.data(), centroids_.size() * sizeof(centroid));
326
+ if (buffer_.size() > 0) write(os, buffer_.data(), buffer_.size() * sizeof(T));
327
+ }
328
+
329
+ template<typename T, typename A>
330
+ uint8_t tdigest<T, A>::get_preamble_longs() const {
331
+ return is_empty() || is_single_value() ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
332
+ }
333
+
334
+ template<typename T, typename A>
335
+ size_t tdigest<T, A>::get_serialized_size_bytes(bool with_buffer) const {
336
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
337
+ size_t size_bytes = get_preamble_longs() * sizeof(uint64_t);
338
+ if (is_empty()) return size_bytes;
339
+ if (is_single_value()) return size_bytes + sizeof(T);
340
+ size_bytes += sizeof(T) * 2 // min and max
341
+ + sizeof(centroid) * centroids_.size();
342
+ if (with_buffer) size_bytes += sizeof(T) * buffer_.size(); // count is a part of preamble
343
+ return size_bytes;
344
+ }
345
+
346
+ template<typename T, typename A>
347
+ auto tdigest<T, A>::serialize(unsigned header_size_bytes, bool with_buffer) const -> vector_bytes {
348
+ if (!with_buffer) const_cast<tdigest*>(this)->compress(); // side effect
349
+ vector_bytes bytes(get_serialized_size_bytes(with_buffer), 0, buffer_.get_allocator());
350
+ uint8_t* ptr = bytes.data() + header_size_bytes;
351
+ *ptr++ = get_preamble_longs();
352
+ *ptr++ = SERIAL_VERSION;
353
+ *ptr++ = SKETCH_TYPE;
354
+ ptr += copy_to_mem(k_, ptr);
355
+ const uint8_t flags_byte(
356
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
357
+ | (is_single_value() ? 1 << flags::IS_SINGLE_VALUE : 0)
358
+ | (reverse_merge_ ? 1 << flags::REVERSE_MERGE : 0)
359
+ );
360
+ *ptr++ = flags_byte;
361
+ ptr += 2; // unused
362
+ if (is_empty()) return bytes;
363
+ if (is_single_value()) {
364
+ copy_to_mem(min_, ptr);
365
+ return bytes;
366
+ }
367
+ ptr += copy_to_mem(static_cast<uint32_t>(centroids_.size()), ptr);
368
+ ptr += copy_to_mem(static_cast<uint32_t>(buffer_.size()), ptr);
369
+ ptr += copy_to_mem(min_, ptr);
370
+ ptr += copy_to_mem(max_, ptr);
371
+ if (centroids_.size() > 0) ptr += copy_to_mem(centroids_.data(), ptr, centroids_.size() * sizeof(centroid));
372
+ if (buffer_.size() > 0) copy_to_mem(buffer_.data(), ptr, buffer_.size() * sizeof(T));
373
+ return bytes;
374
+ }
375
+
376
+ template<typename T, typename A>
377
+ tdigest<T, A> tdigest<T, A>::deserialize(std::istream& is, const A& allocator) {
378
+ const auto preamble_longs = read<uint8_t>(is);
379
+ const auto serial_version = read<uint8_t>(is);
380
+ const auto sketch_type = read<uint8_t>(is);
381
+ if (sketch_type != SKETCH_TYPE) {
382
+ if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(is, allocator);
383
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
384
+ }
385
+ if (serial_version != SERIAL_VERSION) {
386
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
387
+ }
388
+ const auto k = read<uint16_t>(is);
389
+ const auto flags_byte = read<uint8_t>(is);
390
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
391
+ const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
392
+ const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
393
+ if (preamble_longs != expected_preamble_longs) {
394
+ throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
395
+ }
396
+ read<uint16_t>(is); // unused
397
+
398
+ if (is_empty) return tdigest(k, allocator);
399
+
400
+ const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
401
+ if (is_single_value) {
402
+ const T value = read<T>(is);
403
+ return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
404
+ }
405
+
406
+ const auto num_centroids = read<uint32_t>(is);
407
+ const auto num_buffered = read<uint32_t>(is);
408
+
409
+ const T min = read<T>(is);
410
+ const T max = read<T>(is);
411
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
412
+ if (num_centroids > 0) read(is, centroids.data(), num_centroids * sizeof(centroid));
413
+ vector_t buffer(num_buffered, 0, allocator);
414
+ if (num_buffered > 0) read(is, buffer.data(), num_buffered * sizeof(T));
415
+ uint64_t weight = 0;
416
+ for (const auto& c: centroids) weight += c.get_weight();
417
+ return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
418
+ }
419
+
420
+ template<typename T, typename A>
421
+ tdigest<T, A> tdigest<T, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
422
+ ensure_minimum_memory(size, 8);
423
+ const char* ptr = static_cast<const char*>(bytes);
424
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
425
+
426
+ const uint8_t preamble_longs = *ptr++;
427
+ const uint8_t serial_version = *ptr++;
428
+ const uint8_t sketch_type = *ptr++;
429
+ if (sketch_type != SKETCH_TYPE) {
430
+ if (preamble_longs == 0 && serial_version == 0 && sketch_type == 0) return deserialize_compat(ptr, end_ptr - ptr, allocator);
431
+ throw std::invalid_argument("sketch type mismatch: expected " + std::to_string(SKETCH_TYPE) + ", actual " + std::to_string(sketch_type));
432
+ }
433
+ if (serial_version != SERIAL_VERSION) {
434
+ throw std::invalid_argument("serial version mismatch: expected " + std::to_string(SERIAL_VERSION) + ", actual " + std::to_string(serial_version));
435
+ }
436
+ uint16_t k;
437
+ ptr += copy_from_mem(ptr, k);
438
+ const uint8_t flags_byte = *ptr++;
439
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
440
+ const bool is_single_value = flags_byte & (1 << flags::IS_SINGLE_VALUE);
441
+ const uint8_t expected_preamble_longs = is_empty || is_single_value ? PREAMBLE_LONGS_EMPTY_OR_SINGLE : PREAMBLE_LONGS_MULTIPLE;
442
+ if (preamble_longs != expected_preamble_longs) {
443
+ throw std::invalid_argument("preamble longs mismatch: expected " + std::to_string(expected_preamble_longs) + ", actual " + std::to_string(preamble_longs));
444
+ }
445
+ ptr += 2; // unused
446
+
447
+ if (is_empty) return tdigest(k, allocator);
448
+
449
+ const bool reverse_merge = flags_byte & (1 << flags::REVERSE_MERGE);
450
+ if (is_single_value) {
451
+ ensure_minimum_memory(end_ptr - ptr, sizeof(T));
452
+ T value;
453
+ ptr += copy_from_mem(ptr, value);
454
+ return tdigest(reverse_merge, k, value, value, vector_centroid(1, centroid(value, 1), allocator), 1, vector_t(allocator));
455
+ }
456
+
457
+ ensure_minimum_memory(end_ptr - ptr, 8);
458
+ uint32_t num_centroids;
459
+ ptr += copy_from_mem(ptr, num_centroids);
460
+ uint32_t num_buffered;
461
+ ptr += copy_from_mem(ptr, num_buffered);
462
+
463
+ ensure_minimum_memory(end_ptr - ptr, sizeof(T) * 2 + sizeof(centroid) * num_centroids + sizeof(T) * num_buffered);
464
+ T min;
465
+ ptr += copy_from_mem(ptr, min);
466
+ T max;
467
+ ptr += copy_from_mem(ptr, max);
468
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
469
+ if (num_centroids > 0) ptr += copy_from_mem(ptr, centroids.data(), num_centroids * sizeof(centroid));
470
+ vector_t buffer(num_buffered, 0, allocator);
471
+ if (num_buffered > 0) copy_from_mem(ptr, buffer.data(), num_buffered * sizeof(T));
472
+ uint64_t weight = 0;
473
+ for (const auto& c: centroids) weight += c.get_weight();
474
+ return tdigest(reverse_merge, k, min, max, std::move(centroids), weight, std::move(buffer));
475
+ }
476
+
477
+ // compatibility with the format of the reference implementation
478
+ // default byte order of ByteBuffer is used there, which is big endian
479
+ template<typename T, typename A>
480
+ tdigest<T, A> tdigest<T, A>::deserialize_compat(std::istream& is, const A& allocator) {
481
+ // this method was called because the first three bytes were zeros
482
+ // so read one more byte to see if it looks like the reference implementation format
483
+ const auto type = read<uint8_t>(is);
484
+ if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
485
+ throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
486
+ }
487
+ if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
488
+ const auto min = read_big_endian<double>(is);
489
+ const auto max = read_big_endian<double>(is);
490
+ const auto k = static_cast<uint16_t>(read_big_endian<double>(is));
491
+ const auto num_centroids = read_big_endian<uint32_t>(is);
492
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
493
+ uint64_t total_weight = 0;
494
+ for (auto& c: centroids) {
495
+ const W weight = static_cast<W>(read_big_endian<double>(is));
496
+ const auto mean = read_big_endian<double>(is);
497
+ c = centroid(mean, weight);
498
+ total_weight += weight;
499
+ }
500
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
501
+ }
502
+ // COMPAT_FLOAT: compatibility with asSmallBytes()
503
+ const auto min = read_big_endian<double>(is); // reference implementation uses doubles for min and max
504
+ const auto max = read_big_endian<double>(is);
505
+ const auto k = static_cast<uint16_t>(read_big_endian<float>(is));
506
+ // reference implementation stores capacities of the array of centroids and the buffer as shorts
507
+ // they can be derived from k in the constructor
508
+ read<uint32_t>(is); // unused
509
+ const auto num_centroids = read_big_endian<uint16_t>(is);
510
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
511
+ uint64_t total_weight = 0;
512
+ for (auto& c: centroids) {
513
+ const W weight = static_cast<W>(read_big_endian<float>(is));
514
+ const auto mean = read_big_endian<float>(is);
515
+ c = centroid(mean, weight);
516
+ total_weight += weight;
517
+ }
518
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
519
+ }
520
+
521
+ // compatibility with the format of the reference implementation
522
+ // default byte order of ByteBuffer is used there, which is big endian
523
+ template<typename T, typename A>
524
+ tdigest<T, A> tdigest<T, A>::deserialize_compat(const void* bytes, size_t size, const A& allocator) {
525
+ const char* ptr = static_cast<const char*>(bytes);
526
+ // this method was called because the first three bytes were zeros
527
+ // so read one more byte to see if it looks like the reference implementation format
528
+ const auto type = *ptr++;
529
+ if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
530
+ throw std::invalid_argument("unexpected sketch preamble: 0 0 0 " + std::to_string(type));
531
+ }
532
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
533
+ if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
534
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 3 + sizeof(uint32_t));
535
+ double min;
536
+ ptr += copy_from_mem(ptr, min);
537
+ min = byteswap(min);
538
+ double max;
539
+ ptr += copy_from_mem(ptr, max);
540
+ max = byteswap(max);
541
+ double k_double;
542
+ ptr += copy_from_mem(ptr, k_double);
543
+ const uint16_t k = static_cast<uint16_t>(byteswap(k_double));
544
+ uint32_t num_centroids;
545
+ ptr += copy_from_mem(ptr, num_centroids);
546
+ num_centroids = byteswap(num_centroids);
547
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * num_centroids * 2);
548
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
549
+ uint64_t total_weight = 0;
550
+ for (auto& c: centroids) {
551
+ double weight;
552
+ ptr += copy_from_mem(ptr, weight);
553
+ weight = byteswap(weight);
554
+ double mean;
555
+ ptr += copy_from_mem(ptr, mean);
556
+ mean = byteswap(mean);
557
+ c = centroid(mean, static_cast<W>(weight));
558
+ total_weight += static_cast<uint64_t>(weight);
559
+ }
560
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
561
+ }
562
+ // COMPAT_FLOAT: compatibility with asSmallBytes()
563
+ ensure_minimum_memory(end_ptr - ptr, sizeof(double) * 2 + sizeof(float) + sizeof(uint16_t) * 3);
564
+ double min; // reference implementation uses doubles for min and max
565
+ ptr += copy_from_mem(ptr, min);
566
+ min = byteswap(min);
567
+ double max;
568
+ ptr += copy_from_mem(ptr, max);
569
+ max = byteswap(max);
570
+ float k_float;
571
+ ptr += copy_from_mem(ptr, k_float);
572
+ const uint16_t k = static_cast<uint16_t>(byteswap(k_float));
573
+ // reference implementation stores capacities of the array of centroids and the buffer as shorts
574
+ // they can be derived from k in the constructor
575
+ ptr += sizeof(uint32_t); // unused
576
+ uint16_t num_centroids;
577
+ ptr += copy_from_mem(ptr, num_centroids);
578
+ num_centroids = byteswap(num_centroids);
579
+ ensure_minimum_memory(end_ptr - ptr, sizeof(float) * num_centroids * 2);
580
+ vector_centroid centroids(num_centroids, centroid(0, 0), allocator);
581
+ uint64_t total_weight = 0;
582
+ for (auto& c: centroids) {
583
+ float weight;
584
+ ptr += copy_from_mem(ptr, weight);
585
+ weight = byteswap(weight);
586
+ float mean;
587
+ ptr += copy_from_mem(ptr, mean);
588
+ mean = byteswap(mean);
589
+ c = centroid(mean, static_cast<W>(weight));
590
+ total_weight += static_cast<uint64_t>(weight);
591
+ }
592
+ return tdigest(false, k, min, max, std::move(centroids), total_weight, vector_t(allocator));
593
+ }
594
+
595
+ template<typename T, typename A>
596
+ bool tdigest<T, A>::is_single_value() const {
597
+ return get_total_weight() == 1;
598
+ }
599
+
600
+ template<typename T, typename A>
601
+ tdigest<T, A>::tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t weight, vector_t&& buffer):
602
+ reverse_merge_(reverse_merge),
603
+ k_(k),
604
+ min_(min),
605
+ max_(max),
606
+ centroids_capacity_(0),
607
+ centroids_(std::move(centroids)),
608
+ centroids_weight_(weight),
609
+ buffer_(std::move(buffer))
610
+ {
611
+ if (k < 10) throw std::invalid_argument("k must be at least 10");
612
+ const size_t fudge = k < 30 ? 30 : 10;
613
+ centroids_capacity_ = 2 * k_ + fudge;
614
+ centroids_.reserve(centroids_capacity_);
615
+ buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
616
+ }
617
+
618
+ template<typename T, typename A>
619
+ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
620
+ for (uint32_t i = 0; i < size ; i++) {
621
+ if (std::isnan(values[i])) {
622
+ throw std::invalid_argument("Values must not be NaN");
623
+ }
624
+ if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
625
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
626
+ }
627
+ }
628
+ }
629
+
630
+ } /* namespace datasketches */
631
+
632
+ #endif // _TDIGEST_IMPL_HPP_
@@ -0,0 +1,56 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
# Unit-test executable for the t-digest sketch; its sources are attached further below.
add_executable(tdigest_test)

# Use the keyword signature: a test executable has no consumers, so its
# dependencies are PRIVATE.
target_link_libraries(tdigest_test
  PRIVATE
    tdigest
    common
    common_test_lib
)

set_target_properties(tdigest_test PROPERTIES
  CXX_STANDARD_REQUIRED YES
)

# Pass this source directory (CMake-style path with a trailing slash) to the tests
# as the TEST_BINARY_INPUT_PATH compile definition.
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" tdigest_TEST_BINARY_PATH)
string(APPEND tdigest_TEST_BINARY_PATH "/")
target_compile_definitions(tdigest_test
  PRIVATE
    TEST_BINARY_INPUT_PATH="${tdigest_TEST_BINARY_PATH}"
)

add_test(
  NAME tdigest_test
  COMMAND tdigest_test
)

# Sources that are always part of the test executable.
target_sources(tdigest_test
  PRIVATE
    tdigest_test.cpp
    tdigest_custom_allocator_test.cpp
)

# Added only when SERDE_COMPAT is enabled.
if (SERDE_COMPAT)
target_sources(tdigest_test
  PRIVATE
    tdigest_deserialize_from_java_test.cpp
)
endif()

# Added only when GENERATE is enabled.
if (GENERATE)
target_sources(tdigest_test
  PRIVATE
    tdigest_serialize_for_java.cpp
)
endif()