datasketches 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -0,0 +1,254 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _TDIGEST_HPP_
|
21
|
+
#define _TDIGEST_HPP_
|
22
|
+
|
23
|
+
#include <cstddef>
|
24
|
+
#include <limits>
|
25
|
+
#include <type_traits>
|
26
|
+
#include <vector>
|
27
|
+
|
28
|
+
#include "common_defs.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// Scale function: the equivalent of K_2 (the default) in the reference Java
// implementation (https://github.com/tdunning/t-digest).
// Generates cluster sizes proportional to q*(1-q).
// Because a normalizing function is used, the number of clusters stays strictly
// bounded no matter how many samples are processed.
struct scale_function {
  // z(compression, n) = 4 * ln(n / compression) + 24
  double z(double compression, double n) const {
    const double ratio = n / compression;
    return 4 * std::log(ratio) + 24;
  }

  // normalizer(compression, n) = compression / z(compression, n)
  double normalizer(double compression, double n) const {
    const double divisor = z(compression, n);
    return compression / divisor;
  }

  // max(q, normalizer) = q * (1 - q) / normalizer
  double max(double q, double normalizer) const {
    const double spread = q * (1 - q);
    return spread / normalizer;
  }
};
|
46
|
+
|
47
|
+
// Forward declaration of the t-Digest class template.
// The default template argument for Allocator (std::allocator<T>) is supplied here.
template <typename T, typename Allocator = std::allocator<T>> class tdigest;

/// t-Digest sketch of float values (default allocator)
using tdigest_float = tdigest<float>;
/// t-Digest sketch of double values (default allocator)
using tdigest_double = tdigest<double>;
|
54
|
+
|
55
|
+
/**
|
56
|
+
* t-Digest for estimating quantiles and ranks.
|
57
|
+
* This implementation is based on the following paper:
|
58
|
+
* Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
|
59
|
+
* and the following implementation in Java:
|
60
|
+
* https://github.com/tdunning/t-digest
|
61
|
+
* This implementation is similar to MergingDigest in the above Java implementation
|
62
|
+
*/
|
63
|
+
template <typename T, typename Allocator>
|
64
|
+
class tdigest {
|
65
|
+
// exclude long double by not using std::is_floating_point
|
66
|
+
static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
|
67
|
+
static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
|
68
|
+
public:
|
69
|
+
using value_type = T;
|
70
|
+
using allocator_type = Allocator;
|
71
|
+
|
72
|
+
static const uint16_t DEFAULT_K = 200;
|
73
|
+
|
74
|
+
using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type;
|
75
|
+
|
76
|
+
class centroid {
|
77
|
+
public:
|
78
|
+
centroid(T value, W weight): mean_(value), weight_(weight) {}
|
79
|
+
void add(const centroid& other) {
|
80
|
+
weight_ += other.weight_;
|
81
|
+
mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
|
82
|
+
}
|
83
|
+
T get_mean() const { return mean_; }
|
84
|
+
W get_weight() const { return weight_; }
|
85
|
+
private:
|
86
|
+
T mean_;
|
87
|
+
W weight_;
|
88
|
+
};
|
89
|
+
using vector_t = std::vector<T, Allocator>;
|
90
|
+
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
|
91
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
92
|
+
|
93
|
+
struct centroid_cmp {
|
94
|
+
centroid_cmp() {}
|
95
|
+
bool operator()(const centroid& a, const centroid& b) const {
|
96
|
+
if (a.get_mean() < b.get_mean()) return true;
|
97
|
+
return false;
|
98
|
+
}
|
99
|
+
};
|
100
|
+
|
101
|
+
/**
|
102
|
+
* Constructor
|
103
|
+
* @param k affects the size of the sketch and its estimation error
|
104
|
+
* @param allocator used to allocate memory
|
105
|
+
*/
|
106
|
+
explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
|
107
|
+
|
108
|
+
/**
|
109
|
+
* Update this t-Digest with the given value
|
110
|
+
* @param value to update the t-Digest with
|
111
|
+
*/
|
112
|
+
void update(T value);
|
113
|
+
|
114
|
+
/**
|
115
|
+
* Merge the given t-Digest into this one
|
116
|
+
* @param other t-Digest to merge
|
117
|
+
*/
|
118
|
+
void merge(tdigest& other);
|
119
|
+
|
120
|
+
/**
|
121
|
+
* Process buffered values and merge centroids if needed
|
122
|
+
*/
|
123
|
+
void compress();
|
124
|
+
|
125
|
+
/**
|
126
|
+
* @return true if t-Digest has not seen any data
|
127
|
+
*/
|
128
|
+
bool is_empty() const;
|
129
|
+
|
130
|
+
/**
|
131
|
+
* @return minimum value seen by t-Digest
|
132
|
+
*/
|
133
|
+
T get_min_value() const;
|
134
|
+
|
135
|
+
/**
|
136
|
+
* @return maximum value seen by t-Digest
|
137
|
+
*/
|
138
|
+
T get_max_value() const;
|
139
|
+
|
140
|
+
/**
|
141
|
+
* @return total weight
|
142
|
+
*/
|
143
|
+
uint64_t get_total_weight() const;
|
144
|
+
|
145
|
+
/**
|
146
|
+
* Compute approximate normalized rank of the given value.
|
147
|
+
* @param value to be ranked
|
148
|
+
* @return normalized rank (from 0 to 1 inclusive)
|
149
|
+
*/
|
150
|
+
double get_rank(T value) const;
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Compute approximate quantile value corresponding to the given normalized rank
|
154
|
+
* @param rank normalized rank (from 0 to 1 inclusive)
|
155
|
+
* @return quantile value corresponding to the given rank
|
156
|
+
*/
|
157
|
+
T get_quantile(double rank) const;
|
158
|
+
|
159
|
+
/**
|
160
|
+
* @return parameter k (compression) that was used to configure this t-Digest
|
161
|
+
*/
|
162
|
+
uint16_t get_k() const;
|
163
|
+
|
164
|
+
/**
|
165
|
+
* Human-readable summary of this t-Digest as a string
|
166
|
+
* @param print_centroids if true append the list of centroids with weights
|
167
|
+
* @return summary of this t-Digest
|
168
|
+
*/
|
169
|
+
string<Allocator> to_string(bool print_centroids = false) const;
|
170
|
+
|
171
|
+
/**
|
172
|
+
* Computes size needed to serialize the current state.
|
173
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
174
|
+
* @return size in bytes needed to serialize this tdigest
|
175
|
+
*/
|
176
|
+
size_t get_serialized_size_bytes(bool with_buffer = false) const;
|
177
|
+
|
178
|
+
/**
|
179
|
+
* This method serializes t-Digest into a given stream in a binary form
|
180
|
+
* @param os output stream
|
181
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
182
|
+
*/
|
183
|
+
void serialize(std::ostream& os, bool with_buffer = false) const;
|
184
|
+
|
185
|
+
/**
|
186
|
+
* This method serializes t-Digest as a vector of bytes.
|
187
|
+
* An optional header can be reserved in front of the sketch.
|
188
|
+
* It is an uninitialized space of a given size.
|
189
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
190
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
191
|
+
* @return serialized sketch as a vector of bytes
|
192
|
+
*/
|
193
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
|
194
|
+
|
195
|
+
/**
|
196
|
+
* This method deserializes t-Digest from a given stream.
|
197
|
+
* @param is input stream
|
198
|
+
* @param allocator instance of an Allocator
|
199
|
+
* @return an instance of t-Digest
|
200
|
+
*/
|
201
|
+
static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
|
202
|
+
|
203
|
+
/**
|
204
|
+
* This method deserializes t-Digest from a given array of bytes.
|
205
|
+
* @param bytes pointer to the array of bytes
|
206
|
+
* @param size the size of the array
|
207
|
+
* @param allocator instance of an Allocator
|
208
|
+
* @return an instance of t-Digest
|
209
|
+
*/
|
210
|
+
static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
211
|
+
|
212
|
+
private:
|
213
|
+
bool reverse_merge_;
|
214
|
+
uint16_t k_;
|
215
|
+
uint16_t internal_k_;
|
216
|
+
T min_;
|
217
|
+
T max_;
|
218
|
+
size_t centroids_capacity_;
|
219
|
+
vector_centroid centroids_;
|
220
|
+
uint64_t centroids_weight_;
|
221
|
+
size_t buffer_capacity_;
|
222
|
+
vector_t buffer_;
|
223
|
+
|
224
|
+
static const size_t BUFFER_MULTIPLIER = 4;
|
225
|
+
|
226
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
|
227
|
+
static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
|
228
|
+
static const uint8_t SERIAL_VERSION = 1;
|
229
|
+
static const uint8_t SKETCH_TYPE = 20;
|
230
|
+
|
231
|
+
static const uint8_t COMPAT_DOUBLE = 1;
|
232
|
+
static const uint8_t COMPAT_FLOAT = 2;
|
233
|
+
|
234
|
+
enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE };
|
235
|
+
|
236
|
+
bool is_single_value() const;
|
237
|
+
uint8_t get_preamble_longs() const;
|
238
|
+
void merge(vector_centroid& buffer, W weight);
|
239
|
+
|
240
|
+
// for deserialize
|
241
|
+
tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
|
242
|
+
|
243
|
+
static double weighted_average(double x1, double w1, double x2, double w2);
|
244
|
+
|
245
|
+
// for compatibility with format of the reference implementation
|
246
|
+
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
|
247
|
+
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
248
|
+
};
|
249
|
+
|
250
|
+
} /* namespace datasketches */
|
251
|
+
|
252
|
+
#include "tdigest_impl.hpp"
|
253
|
+
|
254
|
+
#endif // _TDIGEST_HPP_
|