datasketches 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -0,0 +1,254 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _TDIGEST_HPP_
|
21
|
+
#define _TDIGEST_HPP_
|
22
|
+
|
23
|
+
#include <cstddef>
|
24
|
+
#include <limits>
|
25
|
+
#include <type_traits>
|
26
|
+
#include <vector>
|
27
|
+
|
28
|
+
#include "common_defs.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// This is the equivalent of K_2 (the default) in the Java implementation mentioned below.
|
33
|
+
// Generates cluster sizes proportional to q*(1-q).
|
34
|
+
// The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
|
35
|
+
struct scale_function {
  /// Cluster size bound at normalized rank q: q * (1 - q) scaled down by the given normalizer.
  /// @param q normalized rank
  /// @param normalizer value produced by normalizer()
  /// @return q * (1 - q) / normalizer
  double max(double q, double normalizer) const {
    const double spread = q * (1 - q);
    return spread / normalizer;
  }

  /// Normalizing factor for the given compression and total weight.
  /// @param compression compression parameter
  /// @param n total weight
  /// @return compression / z(compression, n)
  double normalizer(double compression, double n) const {
    const double divisor = z(compression, n);
    return compression / divisor;
  }

  /// Logarithmic term used by normalizer().
  /// @param compression compression parameter
  /// @param n total weight
  /// @return 4 * ln(n / compression) + 24
  double z(double compression, double n) const {
    const double ratio = n / compression;
    return 4 * std::log(ratio) + 24;
  }
};
|
46
|
+
|
47
|
+
// Forward declaration so the aliases below can name the template before its definition; Allocator defaults to std::allocator<T>.
|
48
|
+
template <typename T, typename Allocator = std::allocator<T>> class tdigest;
|
49
|
+
|
50
|
+
/// t-Digest sketch over float values (tdigest<float> with the default allocator)
|
51
|
+
using tdigest_float = tdigest<float>;
|
52
|
+
/// t-Digest sketch over double values (tdigest<double> with the default allocator)
|
53
|
+
using tdigest_double = tdigest<double>;
|
54
|
+
|
55
|
+
/**
|
56
|
+
* t-Digest for estimating quantiles and ranks.
|
57
|
+
* This implementation is based on the following paper:
|
58
|
+
* Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
|
59
|
+
* and the following implementation in Java:
|
60
|
+
* https://github.com/tdunning/t-digest
|
61
|
+
* This implementation is similar to MergingDigest in the above Java implementation
|
62
|
+
*/
|
63
|
+
template <typename T, typename Allocator>
|
64
|
+
class tdigest {
|
65
|
+
// exclude long double by not using std::is_floating_point
|
66
|
+
static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
|
67
|
+
static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
|
68
|
+
public:
|
69
|
+
using value_type = T;
|
70
|
+
using allocator_type = Allocator;
|
71
|
+
|
72
|
+
static const uint16_t DEFAULT_K = 200;
|
73
|
+
|
74
|
+
using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type;
|
75
|
+
|
76
|
+
class centroid {
|
77
|
+
public:
|
78
|
+
centroid(T value, W weight): mean_(value), weight_(weight) {}
|
79
|
+
void add(const centroid& other) {
|
80
|
+
weight_ += other.weight_;
|
81
|
+
mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
|
82
|
+
}
|
83
|
+
T get_mean() const { return mean_; }
|
84
|
+
W get_weight() const { return weight_; }
|
85
|
+
private:
|
86
|
+
T mean_;
|
87
|
+
W weight_;
|
88
|
+
};
|
89
|
+
using vector_t = std::vector<T, Allocator>;
|
90
|
+
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
|
91
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
92
|
+
|
93
|
+
struct centroid_cmp {
|
94
|
+
centroid_cmp() {}
|
95
|
+
bool operator()(const centroid& a, const centroid& b) const {
|
96
|
+
if (a.get_mean() < b.get_mean()) return true;
|
97
|
+
return false;
|
98
|
+
}
|
99
|
+
};
|
100
|
+
|
101
|
+
/**
|
102
|
+
* Constructor
|
103
|
+
* @param k affects the size of the sketch and its estimation error
|
104
|
+
* @param allocator used to allocate memory
|
105
|
+
*/
|
106
|
+
explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
|
107
|
+
|
108
|
+
/**
|
109
|
+
* Update this t-Digest with the given value
|
110
|
+
* @param value to update the t-Digest with
|
111
|
+
*/
|
112
|
+
void update(T value);
|
113
|
+
|
114
|
+
/**
|
115
|
+
* Merge the given t-Digest into this one
|
116
|
+
* @param other t-Digest to merge
|
117
|
+
*/
|
118
|
+
void merge(tdigest& other);
|
119
|
+
|
120
|
+
/**
|
121
|
+
* Process buffered values and merge centroids if needed
|
122
|
+
*/
|
123
|
+
void compress();
|
124
|
+
|
125
|
+
/**
|
126
|
+
* @return true if t-Digest has not seen any data
|
127
|
+
*/
|
128
|
+
bool is_empty() const;
|
129
|
+
|
130
|
+
/**
|
131
|
+
* @return minimum value seen by t-Digest
|
132
|
+
*/
|
133
|
+
T get_min_value() const;
|
134
|
+
|
135
|
+
/**
|
136
|
+
* @return maximum value seen by t-Digest
|
137
|
+
*/
|
138
|
+
T get_max_value() const;
|
139
|
+
|
140
|
+
/**
|
141
|
+
* @return total weight
|
142
|
+
*/
|
143
|
+
uint64_t get_total_weight() const;
|
144
|
+
|
145
|
+
/**
|
146
|
+
* Compute approximate normalized rank of the given value.
|
147
|
+
* @param value to be ranked
|
148
|
+
* @return normalized rank (from 0 to 1 inclusive)
|
149
|
+
*/
|
150
|
+
double get_rank(T value) const;
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Compute approximate quantile value corresponding to the given normalized rank
|
154
|
+
* @param rank normalized rank (from 0 to 1 inclusive)
|
155
|
+
* @return quantile value corresponding to the given rank
|
156
|
+
*/
|
157
|
+
T get_quantile(double rank) const;
|
158
|
+
|
159
|
+
/**
|
160
|
+
* @return parameter k (compression) that was used to configure this t-Digest
|
161
|
+
*/
|
162
|
+
uint16_t get_k() const;
|
163
|
+
|
164
|
+
/**
|
165
|
+
* Human-readable summary of this t-Digest as a string
|
166
|
+
* @param print_centroids if true append the list of centroids with weights
|
167
|
+
* @return summary of this t-Digest
|
168
|
+
*/
|
169
|
+
string<Allocator> to_string(bool print_centroids = false) const;
|
170
|
+
|
171
|
+
/**
|
172
|
+
* Computes size needed to serialize the current state.
|
173
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
174
|
+
* @return size in bytes needed to serialize this tdigest
|
175
|
+
*/
|
176
|
+
size_t get_serialized_size_bytes(bool with_buffer = false) const;
|
177
|
+
|
178
|
+
/**
|
179
|
+
* This method serializes t-Digest into a given stream in a binary form
|
180
|
+
* @param os output stream
|
181
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
182
|
+
*/
|
183
|
+
void serialize(std::ostream& os, bool with_buffer = false) const;
|
184
|
+
|
185
|
+
/**
|
186
|
+
* This method serializes t-Digest as a vector of bytes.
|
187
|
+
* An optional header can be reserved in front of the sketch.
|
188
|
+
* It is an uninitialized space of a given size.
|
189
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
190
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
191
|
+
* @return serialized sketch as a vector of bytes
|
192
|
+
*/
|
193
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
|
194
|
+
|
195
|
+
/**
|
196
|
+
* This method deserializes t-Digest from a given stream.
|
197
|
+
* @param is input stream
|
198
|
+
* @param allocator instance of an Allocator
|
199
|
+
* @return an instance of t-Digest
|
200
|
+
*/
|
201
|
+
static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
|
202
|
+
|
203
|
+
/**
|
204
|
+
* This method deserializes t-Digest from a given array of bytes.
|
205
|
+
* @param bytes pointer to the array of bytes
|
206
|
+
* @param size the size of the array
|
207
|
+
* @param allocator instance of an Allocator
|
208
|
+
* @return an instance of t-Digest
|
209
|
+
*/
|
210
|
+
static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
211
|
+
|
212
|
+
private:
|
213
|
+
bool reverse_merge_;
|
214
|
+
uint16_t k_;
|
215
|
+
uint16_t internal_k_;
|
216
|
+
T min_;
|
217
|
+
T max_;
|
218
|
+
size_t centroids_capacity_;
|
219
|
+
vector_centroid centroids_;
|
220
|
+
uint64_t centroids_weight_;
|
221
|
+
size_t buffer_capacity_;
|
222
|
+
vector_t buffer_;
|
223
|
+
|
224
|
+
static const size_t BUFFER_MULTIPLIER = 4;
|
225
|
+
|
226
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
|
227
|
+
static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
|
228
|
+
static const uint8_t SERIAL_VERSION = 1;
|
229
|
+
static const uint8_t SKETCH_TYPE = 20;
|
230
|
+
|
231
|
+
static const uint8_t COMPAT_DOUBLE = 1;
|
232
|
+
static const uint8_t COMPAT_FLOAT = 2;
|
233
|
+
|
234
|
+
enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE };
|
235
|
+
|
236
|
+
bool is_single_value() const;
|
237
|
+
uint8_t get_preamble_longs() const;
|
238
|
+
void merge(vector_centroid& buffer, W weight);
|
239
|
+
|
240
|
+
// for deserialize
|
241
|
+
tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
|
242
|
+
|
243
|
+
static double weighted_average(double x1, double w1, double x2, double w2);
|
244
|
+
|
245
|
+
// for compatibility with format of the reference implementation
|
246
|
+
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
|
247
|
+
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
248
|
+
};
|
249
|
+
|
250
|
+
} /* namespace datasketches */
|
251
|
+
|
252
|
+
#include "tdigest_impl.hpp"
|
253
|
+
|
254
|
+
#endif // _TDIGEST_HPP_
|