datasketches 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -0,0 +1,254 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_HPP_
21
+ #define _TDIGEST_HPP_
22
+
23
+ #include <cstddef>
24
+ #include <limits>
25
+ #include <type_traits>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ // this is the equivalent of K_2 (default) in the Java implementation mentioned below
33
+ // Generates cluster sizes proportional to q*(1-q).
34
+ // The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
35
struct scale_function {
  /**
   * Evaluates q * (1 - q) / normalizer.
   * @param q value the q * (1 - q) term is computed from
   * @param normalizer divisor, e.g. the result of normalizer(compression, n)
   * @return q * (1 - q) divided by the normalizer
   */
  double max(double q, double normalizer) const {
    const double spread = q * (1 - q);
    return spread / normalizer;
  }

  /**
   * Computes the normalizer as compression / z(compression, n).
   * @param compression compression parameter
   * @param n second argument forwarded to z()
   * @return compression divided by z(compression, n)
   */
  double normalizer(double compression, double n) const {
    const double divisor = z(compression, n);
    return compression / divisor;
  }

  /**
   * Computes 4 * ln(n / compression) + 24.
   * @param compression compression parameter
   * @param n value whose ratio to compression is taken under the logarithm
   * @return 4 * ln(n / compression) + 24
   */
  double z(double compression, double n) const {
    const double ratio = n / compression;
    return 4 * std::log(ratio) + 24;
  }
};
46
+
47
+ // forward declaration
48
+ template <typename T, typename Allocator = std::allocator<T>> class tdigest;
49
+
50
+ /// TDigest float sketch
51
+ using tdigest_float = tdigest<float>;
52
+ /// TDigest double sketch
53
+ using tdigest_double = tdigest<double>;
54
+
55
+ /**
56
+ * t-Digest for estimating quantiles and ranks.
57
+ * This implementation is based on the following paper:
58
+ * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
59
+ * and the following implementation in Java:
60
+ * https://github.com/tdunning/t-digest
61
+ * This implementation is similar to MergingDigest in the above Java implementation
62
+ */
63
+ template <typename T, typename Allocator>
64
+ class tdigest {
65
+ // exclude long double by not using std::is_floating_point
66
+ static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
67
+ static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
68
+ public:
69
+ using value_type = T;
70
+ using allocator_type = Allocator;
71
+
72
+ static const uint16_t DEFAULT_K = 200;
73
+
74
+ using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type;
75
+
76
+ class centroid {
77
+ public:
78
+ centroid(T value, W weight): mean_(value), weight_(weight) {}
79
+ void add(const centroid& other) {
80
+ weight_ += other.weight_;
81
+ mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
82
+ }
83
+ T get_mean() const { return mean_; }
84
+ W get_weight() const { return weight_; }
85
+ private:
86
+ T mean_;
87
+ W weight_;
88
+ };
89
+ using vector_t = std::vector<T, Allocator>;
90
+ using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
91
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+
93
+ struct centroid_cmp {
94
+ centroid_cmp() {}
95
+ bool operator()(const centroid& a, const centroid& b) const {
96
+ if (a.get_mean() < b.get_mean()) return true;
97
+ return false;
98
+ }
99
+ };
100
+
101
+ /**
102
+ * Constructor
103
+ * @param k affects the size of the sketch and its estimation error
104
+ * @param allocator used to allocate memory
105
+ */
106
+ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
107
+
108
+ /**
109
+ * Update this t-Digest with the given value
110
+ * @param value to update the t-Digest with
111
+ */
112
+ void update(T value);
113
+
114
+ /**
115
+ * Merge the given t-Digest into this one
116
+ * @param other t-Digest to merge
117
+ */
118
+ void merge(tdigest& other);
119
+
120
+ /**
121
+ * Process buffered values and merge centroids if needed
122
+ */
123
+ void compress();
124
+
125
+ /**
126
+ * @return true if t-Digest has not seen any data
127
+ */
128
+ bool is_empty() const;
129
+
130
+ /**
131
+ * @return minimum value seen by t-Digest
132
+ */
133
+ T get_min_value() const;
134
+
135
+ /**
136
+ * @return maximum value seen by t-Digest
137
+ */
138
+ T get_max_value() const;
139
+
140
+ /**
141
+ * @return total weight
142
+ */
143
+ uint64_t get_total_weight() const;
144
+
145
+ /**
146
+ * Compute approximate normalized rank of the given value.
147
+ * @param value to be ranked
148
+ * @return normalized rank (from 0 to 1 inclusive)
149
+ */
150
+ double get_rank(T value) const;
151
+
152
+ /**
153
+ * Compute approximate quantile value corresponding to the given normalized rank
154
+ * @param rank normalized rank (from 0 to 1 inclusive)
155
+ * @return quantile value corresponding to the given rank
156
+ */
157
+ T get_quantile(double rank) const;
158
+
159
+ /**
160
+ * @return parameter k (compression) that was used to configure this t-Digest
161
+ */
162
+ uint16_t get_k() const;
163
+
164
+ /**
165
+ * Human-readable summary of this t-Digest as a string
166
+ * @param print_centroids if true append the list of centroids with weights
167
+ * @return summary of this t-Digest
168
+ */
169
+ string<Allocator> to_string(bool print_centroids = false) const;
170
+
171
+ /**
172
+ * Computes size needed to serialize the current state.
173
+ * @param with_buffer optionally serialize buffered values avoiding compression
174
+ * @return size in bytes needed to serialize this tdigest
175
+ */
176
+ size_t get_serialized_size_bytes(bool with_buffer = false) const;
177
+
178
+ /**
179
+ * This method serializes t-Digest into a given stream in a binary form
180
+ * @param os output stream
181
+ * @param with_buffer optionally serialize buffered values avoiding compression
182
+ */
183
+ void serialize(std::ostream& os, bool with_buffer = false) const;
184
+
185
+ /**
186
+ * This method serializes t-Digest as a vector of bytes.
187
+ * An optional header can be reserved in front of the sketch.
188
+ * It is an uninitialized space of a given size.
189
+ * @param header_size_bytes space to reserve in front of the sketch
190
+ * @param with_buffer optionally serialize buffered values avoiding compression
191
+ * @return serialized sketch as a vector of bytes
192
+ */
193
+ vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
194
+
195
+ /**
196
+ * This method deserializes t-Digest from a given stream.
197
+ * @param is input stream
198
+ * @param allocator instance of an Allocator
199
+ * @return an instance of t-Digest
200
+ */
201
+ static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
202
+
203
+ /**
204
+ * This method deserializes t-Digest from a given array of bytes.
205
+ * @param bytes pointer to the array of bytes
206
+ * @param size the size of the array
207
+ * @param allocator instance of an Allocator
208
+ * @return an instance of t-Digest
209
+ */
210
+ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
211
+
212
+ private:
213
+ bool reverse_merge_;
214
+ uint16_t k_;
215
+ uint16_t internal_k_;
216
+ T min_;
217
+ T max_;
218
+ size_t centroids_capacity_;
219
+ vector_centroid centroids_;
220
+ uint64_t centroids_weight_;
221
+ size_t buffer_capacity_;
222
+ vector_t buffer_;
223
+
224
+ static const size_t BUFFER_MULTIPLIER = 4;
225
+
226
+ static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
227
+ static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
228
+ static const uint8_t SERIAL_VERSION = 1;
229
+ static const uint8_t SKETCH_TYPE = 20;
230
+
231
+ static const uint8_t COMPAT_DOUBLE = 1;
232
+ static const uint8_t COMPAT_FLOAT = 2;
233
+
234
+ enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE };
235
+
236
+ bool is_single_value() const;
237
+ uint8_t get_preamble_longs() const;
238
+ void merge(vector_centroid& buffer, W weight);
239
+
240
+ // for deserialize
241
+ tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
242
+
243
+ static double weighted_average(double x1, double w1, double x2, double w2);
244
+
245
+ // for compatibility with format of the reference implementation
246
+ static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
247
+ static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
248
+ };
249
+
250
+ } /* namespace datasketches */
251
+
252
+ #include "tdigest_impl.hpp"
253
+
254
+ #endif // _TDIGEST_HPP_