datasketches 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -0,0 +1,254 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_HPP_
21
+ #define _TDIGEST_HPP_
22
+
23
+ #include <cstddef>
24
+ #include <limits>
25
+ #include <type_traits>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
/**
 * Scale function controlling how large a cluster may grow at a given rank.
 *
 * Equivalent of K_2 (the default) in the Java reference implementation
 * (see the tdigest class documentation below).
 * Generates cluster sizes proportional to q*(1-q).
 * Because a normalizing function is used, the number of clusters stays
 * strictly bounded no matter how many samples are processed.
 */
struct scale_function {
  /**
   * @param compression compression parameter
   * @param n number of samples
   * @return 4 * ln(n / compression) + 24
   */
  double z(double compression, double n) const {
    const double ratio = n / compression;
    return 4.0 * std::log(ratio) + 24.0;
  }

  /**
   * @param compression compression parameter
   * @param n number of samples
   * @return compression divided by z(compression, n)
   */
  double normalizer(double compression, double n) const {
    const double divisor = z(compression, n);
    return compression / divisor;
  }

  /**
   * @param q normalized rank
   * @param normalizer value produced by normalizer()
   * @return q * (1 - q) / normalizer
   */
  double max(double q, double normalizer) const {
    const double spread = q * (1.0 - q);
    return spread / normalizer;
  }
};
46
+
47
// Forward declaration of the sketch class template so that the aliases
// below can name it; Allocator defaults to std::allocator<T>.
template <class T, class Allocator = std::allocator<T>>
class tdigest;

/// t-Digest sketch over float values
using tdigest_float = tdigest<float>;

/// t-Digest sketch over double values
using tdigest_double = tdigest<double>;
54
+
55
+ /**
56
+ * t-Digest for estimating quantiles and ranks.
57
+ * This implementation is based on the following paper:
58
+ * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
59
+ * and the following implementation in Java:
60
+ * https://github.com/tdunning/t-digest
61
+ * This implementation is similar to MergingDigest in the above Java implementation
62
+ */
63
+ template <typename T, typename Allocator>
64
+ class tdigest {
65
+ // exclude long double by not using std::is_floating_point
66
+ static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
67
+ static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
68
+ public:
69
+ using value_type = T;
70
+ using allocator_type = Allocator;
71
+
72
+ static const uint16_t DEFAULT_K = 200;
73
+
74
+ using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type;
75
+
76
+ class centroid {
77
+ public:
78
+ centroid(T value, W weight): mean_(value), weight_(weight) {}
79
+ void add(const centroid& other) {
80
+ weight_ += other.weight_;
81
+ mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
82
+ }
83
+ T get_mean() const { return mean_; }
84
+ W get_weight() const { return weight_; }
85
+ private:
86
+ T mean_;
87
+ W weight_;
88
+ };
89
+ using vector_t = std::vector<T, Allocator>;
90
+ using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
91
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+
93
+ struct centroid_cmp {
94
+ centroid_cmp() {}
95
+ bool operator()(const centroid& a, const centroid& b) const {
96
+ if (a.get_mean() < b.get_mean()) return true;
97
+ return false;
98
+ }
99
+ };
100
+
101
+ /**
102
+ * Constructor
103
+ * @param k affects the size of the sketch and its estimation error
104
+ * @param allocator used to allocate memory
105
+ */
106
+ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
107
+
108
+ /**
109
+ * Update this t-Digest with the given value
110
+ * @param value to update the t-Digest with
111
+ */
112
+ void update(T value);
113
+
114
+ /**
115
+ * Merge the given t-Digest into this one
116
+ * @param other t-Digest to merge
117
+ */
118
+ void merge(tdigest& other);
119
+
120
+ /**
121
+ * Process buffered values and merge centroids if needed
122
+ */
123
+ void compress();
124
+
125
+ /**
126
+ * @return true if t-Digest has not seen any data
127
+ */
128
+ bool is_empty() const;
129
+
130
+ /**
131
+ * @return minimum value seen by t-Digest
132
+ */
133
+ T get_min_value() const;
134
+
135
+ /**
136
+ * @return maximum value seen by t-Digest
137
+ */
138
+ T get_max_value() const;
139
+
140
+ /**
141
+ * @return total weight
142
+ */
143
+ uint64_t get_total_weight() const;
144
+
145
+ /**
146
+ * Compute approximate normalized rank of the given value.
147
+ * @param value to be ranked
148
+ * @return normalized rank (from 0 to 1 inclusive)
149
+ */
150
+ double get_rank(T value) const;
151
+
152
+ /**
153
+ * Compute approximate quantile value corresponding to the given normalized rank
154
+ * @param rank normalized rank (from 0 to 1 inclusive)
155
+ * @return quantile value corresponding to the given rank
156
+ */
157
+ T get_quantile(double rank) const;
158
+
159
+ /**
160
+ * @return parameter k (compression) that was used to configure this t-Digest
161
+ */
162
+ uint16_t get_k() const;
163
+
164
+ /**
165
+ * Human-readable summary of this t-Digest as a string
166
+ * @param print_centroids if true append the list of centroids with weights
167
+ * @return summary of this t-Digest
168
+ */
169
+ string<Allocator> to_string(bool print_centroids = false) const;
170
+
171
+ /**
172
+ * Computes size needed to serialize the current state.
173
+ * @param with_buffer optionally serialize buffered values avoiding compression
174
+ * @return size in bytes needed to serialize this tdigest
175
+ */
176
+ size_t get_serialized_size_bytes(bool with_buffer = false) const;
177
+
178
+ /**
179
+ * This method serializes t-Digest into a given stream in a binary form
180
+ * @param os output stream
181
+ * @param with_buffer optionally serialize buffered values avoiding compression
182
+ */
183
+ void serialize(std::ostream& os, bool with_buffer = false) const;
184
+
185
+ /**
186
+ * This method serializes t-Digest as a vector of bytes.
187
+ * An optional header can be reserved in front of the sketch.
188
+ * It is an uninitialized space of a given size.
189
+ * @param header_size_bytes space to reserve in front of the sketch
190
+ * @param with_buffer optionally serialize buffered values avoiding compression
191
+ * @return serialized sketch as a vector of bytes
192
+ */
193
+ vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
194
+
195
+ /**
196
+ * This method deserializes t-Digest from a given stream.
197
+ * @param is input stream
198
+ * @param allocator instance of an Allocator
199
+ * @return an instance of t-Digest
200
+ */
201
+ static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
202
+
203
+ /**
204
+ * This method deserializes t-Digest from a given array of bytes.
205
+ * @param bytes pointer to the array of bytes
206
+ * @param size the size of the array
207
+ * @param allocator instance of an Allocator
208
+ * @return an instance of t-Digest
209
+ */
210
+ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
211
+
212
+ private:
213
+ bool reverse_merge_;
214
+ uint16_t k_;
215
+ uint16_t internal_k_;
216
+ T min_;
217
+ T max_;
218
+ size_t centroids_capacity_;
219
+ vector_centroid centroids_;
220
+ uint64_t centroids_weight_;
221
+ size_t buffer_capacity_;
222
+ vector_t buffer_;
223
+
224
+ static const size_t BUFFER_MULTIPLIER = 4;
225
+
226
+ static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
227
+ static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
228
+ static const uint8_t SERIAL_VERSION = 1;
229
+ static const uint8_t SKETCH_TYPE = 20;
230
+
231
+ static const uint8_t COMPAT_DOUBLE = 1;
232
+ static const uint8_t COMPAT_FLOAT = 2;
233
+
234
+ enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE };
235
+
236
+ bool is_single_value() const;
237
+ uint8_t get_preamble_longs() const;
238
+ void merge(vector_centroid& buffer, W weight);
239
+
240
+ // for deserialize
241
+ tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
242
+
243
+ static double weighted_average(double x1, double w1, double x2, double w2);
244
+
245
+ // for compatibility with format of the reference implementation
246
+ static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
247
+ static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
248
+ };
249
+
250
+ } /* namespace datasketches */
251
+
252
+ #include "tdigest_impl.hpp"
253
+
254
+ #endif // _TDIGEST_HPP_