datasketches 0.4.2 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
@@ -0,0 +1,41 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
12
|
+
# software distributed under the License is distributed on an
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
14
|
+
# KIND, either express or implied. See the License for the
|
15
|
+
# specific language governing permissions and limitations
|
16
|
+
# under the License.
|
17
|
+
|
18
|
+
# t-Digest module: declared as an INTERFACE library, so the target only
# carries usage requirements (include paths and link dependencies).
add_library(tdigest INTERFACE)

# Namespaced alias for consumers inside the same build tree.
add_library(${PROJECT_NAME}::TDIGEST ALIAS tdigest)

# Unit tests are only configured when the project-level switch is on.
if (BUILD_TESTS)
  add_subdirectory(test)
endif()

# Headers come from the source tree while building, and from the install
# prefix once the package has been installed.
target_include_directories(tdigest INTERFACE
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)

# The t-Digest headers include common_defs.hpp, which lives in 'common'.
target_link_libraries(tdigest INTERFACE common)

# Register the target in the project's export set.
install(TARGETS tdigest EXPORT ${PROJECT_NAME})

# Install the public headers.
install(
  FILES
    include/tdigest.hpp
    include/tdigest_impl.hpp
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches"
)
|
@@ -0,0 +1,304 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _TDIGEST_HPP_
|
21
|
+
#define _TDIGEST_HPP_
|
22
|
+
|
23
|
+
#include <cmath>
#include <cstddef>
#include <limits>
#include <type_traits>
#include <vector>

#include "common_defs.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
/**
 * Scale function used to limit centroid (cluster) sizes.
 *
 * This is the equivalent of K_2 (the default) in the reference Java
 * implementation, https://github.com/tdunning/t-digest.
 * It generates cluster sizes proportional to q * (1 - q).
 * The use of a normalizing function results in a strictly bounded number of
 * clusters no matter how many samples.
 *
 * None of the methods validate their arguments: the results are whatever the
 * floating-point division and std::log (from <cmath>) produce for the inputs.
 */
struct scale_function {
  /**
   * @param q normalized rank
   * @param normalizer value as produced by normalizer()
   * @return q * (1 - q) / normalizer
   */
  double max(double q, double normalizer) const {
    return q * (1.0 - q) / normalizer;
  }

  /**
   * @param compression compression parameter
   * @param n number of samples
   * @return compression divided by z(compression, n)
   */
  double normalizer(double compression, double n) const {
    return compression / z(compression, n);
  }

  /**
   * @param compression compression parameter
   * @param n number of samples
   * @return 4 * ln(n / compression) + 24
   */
  double z(double compression, double n) const {
    return 4.0 * std::log(n / compression) + 24.0;
  }
};
|
46
|
+
|
47
|
+
// Forward declaration: lets the convenience aliases below be named before the
// class template is defined. The allocator defaults to std::allocator<T>.
template <typename T, typename Allocator = std::allocator<T>>
class tdigest;

/// t-Digest sketch over float values
using tdigest_float = tdigest<float>;

/// t-Digest sketch over double values
using tdigest_double = tdigest<double>;
|
54
|
+
|
55
|
+
/**
|
56
|
+
* t-Digest for estimating quantiles and ranks.
|
57
|
+
* This implementation is based on the following paper:
|
58
|
+
* Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
|
59
|
+
* and the following implementation in Java:
|
60
|
+
* https://github.com/tdunning/t-digest
|
61
|
+
* This implementation is similar to MergingDigest in the above Java implementation
|
62
|
+
*/
|
63
|
+
template <typename T, typename Allocator>
|
64
|
+
class tdigest {
|
65
|
+
// exclude long double by not using std::is_floating_point
|
66
|
+
static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
|
67
|
+
static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
|
68
|
+
public:
|
69
|
+
using value_type = T;
|
70
|
+
using allocator_type = Allocator;
|
71
|
+
|
72
|
+
static const uint16_t DEFAULT_K = 200;
|
73
|
+
|
74
|
+
using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type;
|
75
|
+
|
76
|
+
class centroid {
|
77
|
+
public:
|
78
|
+
centroid(T value, W weight): mean_(value), weight_(weight) {}
|
79
|
+
void add(const centroid& other) {
|
80
|
+
weight_ += other.weight_;
|
81
|
+
mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
|
82
|
+
}
|
83
|
+
T get_mean() const { return mean_; }
|
84
|
+
W get_weight() const { return weight_; }
|
85
|
+
private:
|
86
|
+
T mean_;
|
87
|
+
W weight_;
|
88
|
+
};
|
89
|
+
using vector_t = std::vector<T, Allocator>;
|
90
|
+
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
|
91
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
92
|
+
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
93
|
+
|
94
|
+
struct centroid_cmp {
|
95
|
+
centroid_cmp() {}
|
96
|
+
bool operator()(const centroid& a, const centroid& b) const {
|
97
|
+
if (a.get_mean() < b.get_mean()) return true;
|
98
|
+
return false;
|
99
|
+
}
|
100
|
+
};
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Constructor
|
104
|
+
* @param k affects the size of the sketch and its estimation error
|
105
|
+
* @param allocator used to allocate memory
|
106
|
+
*/
|
107
|
+
explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
|
108
|
+
|
109
|
+
/**
|
110
|
+
* Update this t-Digest with the given value
|
111
|
+
* @param value to update the t-Digest with
|
112
|
+
*/
|
113
|
+
void update(T value);
|
114
|
+
|
115
|
+
/**
|
116
|
+
* Merge the given t-Digest into this one
|
117
|
+
* @param other t-Digest to merge
|
118
|
+
*/
|
119
|
+
void merge(const tdigest& other);
|
120
|
+
|
121
|
+
/**
|
122
|
+
* Process buffered values and merge centroids if needed
|
123
|
+
*/
|
124
|
+
void compress();
|
125
|
+
|
126
|
+
/**
|
127
|
+
* @return true if t-Digest has not seen any data
|
128
|
+
*/
|
129
|
+
bool is_empty() const;
|
130
|
+
|
131
|
+
/**
|
132
|
+
* @return minimum value seen by t-Digest
|
133
|
+
*/
|
134
|
+
T get_min_value() const;
|
135
|
+
|
136
|
+
/**
|
137
|
+
* @return maximum value seen by t-Digest
|
138
|
+
*/
|
139
|
+
T get_max_value() const;
|
140
|
+
|
141
|
+
/**
|
142
|
+
* @return total weight
|
143
|
+
*/
|
144
|
+
uint64_t get_total_weight() const;
|
145
|
+
|
146
|
+
/**
|
147
|
+
* Returns an instance of the allocator for this t-Digest.
|
148
|
+
* @return allocator
|
149
|
+
*/
|
150
|
+
Allocator get_allocator() const;
|
151
|
+
|
152
|
+
/**
|
153
|
+
* Compute approximate normalized rank of the given value.
|
154
|
+
*
|
155
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
156
|
+
*
|
157
|
+
* @param value to be ranked
|
158
|
+
* @return normalized rank (from 0 to 1 inclusive)
|
159
|
+
*/
|
160
|
+
double get_rank(T value) const;
|
161
|
+
|
162
|
+
/**
|
163
|
+
* Compute approximate quantile value corresponding to the given normalized rank
|
164
|
+
*
|
165
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
166
|
+
*
|
167
|
+
* @param rank normalized rank (from 0 to 1 inclusive)
|
168
|
+
* @return quantile value corresponding to the given rank
|
169
|
+
*/
|
170
|
+
T get_quantile(double rank) const;
|
171
|
+
|
172
|
+
/**
|
173
|
+
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
174
|
+
* given a set of split points.
|
175
|
+
*
|
176
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
177
|
+
*
|
178
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
179
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
180
|
+
*
|
181
|
+
* @param size the number of split points in the array
|
182
|
+
*
|
183
|
+
* @return an array of m+1 doubles each of which is an approximation
|
184
|
+
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
|
185
|
+
*/
|
186
|
+
vector_double get_PMF(const T* split_points, uint32_t size) const;
|
187
|
+
|
188
|
+
/**
|
189
|
+
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
190
|
+
* cumulative analog of the PMF, of the input stream given a set of split points.
|
191
|
+
*
|
192
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
193
|
+
*
|
194
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
195
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
196
|
+
*
|
197
|
+
* @param size the number of split points in the array
|
198
|
+
*
|
199
|
+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
|
200
|
+
* of the input stream given the split_points. The value at array position j of the returned
|
201
|
+
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
202
|
+
* array. This can be viewed as array of ranks of the given split points plus one more value
|
203
|
+
* that is always 1.
|
204
|
+
*/
|
205
|
+
vector_double get_CDF(const T* split_points, uint32_t size) const;
|
206
|
+
|
207
|
+
/**
|
208
|
+
* @return parameter k (compression) that was used to configure this t-Digest
|
209
|
+
*/
|
210
|
+
uint16_t get_k() const;
|
211
|
+
|
212
|
+
/**
|
213
|
+
* Human-readable summary of this t-Digest as a string
|
214
|
+
* @param print_centroids if true append the list of centroids with weights
|
215
|
+
* @return summary of this t-Digest
|
216
|
+
*/
|
217
|
+
string<Allocator> to_string(bool print_centroids = false) const;
|
218
|
+
|
219
|
+
/**
|
220
|
+
* Computes size needed to serialize the current state.
|
221
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
222
|
+
* @return size in bytes needed to serialize this tdigest
|
223
|
+
*/
|
224
|
+
size_t get_serialized_size_bytes(bool with_buffer = false) const;
|
225
|
+
|
226
|
+
/**
|
227
|
+
* This method serializes t-Digest into a given stream in a binary form
|
228
|
+
* @param os output stream
|
229
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
230
|
+
*/
|
231
|
+
void serialize(std::ostream& os, bool with_buffer = false) const;
|
232
|
+
|
233
|
+
/**
|
234
|
+
* This method serializes t-Digest as a vector of bytes.
|
235
|
+
* An optional header can be reserved in front of the sketch.
|
236
|
+
* It is an uninitialized space of a given size.
|
237
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
238
|
+
* @param with_buffer optionally serialize buffered values avoiding compression
|
239
|
+
* @return serialized sketch as a vector of bytes
|
240
|
+
*/
|
241
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
|
242
|
+
|
243
|
+
/**
|
244
|
+
* This method deserializes t-Digest from a given stream.
|
245
|
+
* @param is input stream
|
246
|
+
* @param allocator instance of an Allocator
|
247
|
+
* @return an instance of t-Digest
|
248
|
+
*/
|
249
|
+
static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
|
250
|
+
|
251
|
+
/**
|
252
|
+
* This method deserializes t-Digest from a given array of bytes.
|
253
|
+
* @param bytes pointer to the array of bytes
|
254
|
+
* @param size the size of the array
|
255
|
+
* @param allocator instance of an Allocator
|
256
|
+
* @return an instance of t-Digest
|
257
|
+
*/
|
258
|
+
static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
259
|
+
|
260
|
+
private:
|
261
|
+
bool reverse_merge_;
|
262
|
+
uint16_t k_;
|
263
|
+
uint16_t internal_k_;
|
264
|
+
T min_;
|
265
|
+
T max_;
|
266
|
+
size_t centroids_capacity_;
|
267
|
+
vector_centroid centroids_;
|
268
|
+
uint64_t centroids_weight_;
|
269
|
+
size_t buffer_capacity_;
|
270
|
+
vector_t buffer_;
|
271
|
+
|
272
|
+
static const size_t BUFFER_MULTIPLIER = 4;
|
273
|
+
|
274
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
|
275
|
+
static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
|
276
|
+
static const uint8_t SERIAL_VERSION = 1;
|
277
|
+
static const uint8_t SKETCH_TYPE = 20;
|
278
|
+
|
279
|
+
static const uint8_t COMPAT_DOUBLE = 1;
|
280
|
+
static const uint8_t COMPAT_FLOAT = 2;
|
281
|
+
|
282
|
+
enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE };
|
283
|
+
|
284
|
+
bool is_single_value() const;
|
285
|
+
uint8_t get_preamble_longs() const;
|
286
|
+
void merge(vector_centroid& buffer, W weight);
|
287
|
+
|
288
|
+
// for deserialize
|
289
|
+
tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
|
290
|
+
|
291
|
+
static double weighted_average(double x1, double w1, double x2, double w2);
|
292
|
+
|
293
|
+
// for compatibility with format of the reference implementation
|
294
|
+
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
|
295
|
+
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
296
|
+
|
297
|
+
static inline void check_split_points(const T* values, uint32_t size);
|
298
|
+
};
|
299
|
+
|
300
|
+
} /* namespace datasketches */
|
301
|
+
|
302
|
+
#include "tdigest_impl.hpp"
|
303
|
+
|
304
|
+
#endif // _TDIGEST_HPP_
|