datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,41 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(tdigest INTERFACE) # INTERFACE target: nothing is compiled here, the target only carries usage requirements
19
+
20
+ add_library(${PROJECT_NAME}::TDIGEST ALIAS tdigest) # namespaced alias of the same target
21
+
22
+ if (BUILD_TESTS) # the test subdirectory is only configured when BUILD_TESTS is set
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(tdigest
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> # include path when consumed from this build tree
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include> # include path when consumed from an install; NOTE(review): headers below are installed under .../DataSketches while this path stops at include - confirm this is intended
30
+ )
31
+
32
+ target_link_libraries(tdigest INTERFACE common) # consumers of tdigest inherit the usage requirements of the common target
33
+
34
+ install(TARGETS tdigest
35
+ EXPORT ${PROJECT_NAME} # adds the target to the export set named after the project
36
+ )
37
+
38
+ install(FILES # installs the two tdigest headers listed below
39
+ include/tdigest.hpp
40
+ include/tdigest_impl.hpp
41
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -0,0 +1,304 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_HPP_
21
+ #define _TDIGEST_HPP_
22
+
23
+ #include <cstddef>
24
+ #include <limits>
25
+ #include <type_traits>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ // This is the equivalent of the K_2 scale function (the default) in the Java implementation referenced below.
33
+ // Generates cluster sizes proportional to q*(1-q).
34
+ // The use of a normalizing function results in a strictly bounded number of clusters regardless of the number of samples.
35
+ struct scale_function {
36
+ double max(double q, double normalizer) const { // q * (1 - q) scaled down by the given normalizer
37
+ return q * (1 - q) / normalizer;
38
+ }
39
+ double normalizer(double compression, double n) const { // compression divided by z(compression, n)
40
+ return compression / z(compression, n);
41
+ }
42
+ double z(double compression, double n) const { // 4 * ln(n / compression) + 24
43
+ return 4 * std::log(n / compression) + 24; // NOTE(review): std::log needs <cmath>, which is not among this header's direct includes - confirm it arrives via common_defs.hpp
44
+ }
45
+ };
46
+
47
+ // forward declaration
48
+ template <typename T, typename Allocator = std::allocator<T>> class tdigest;
49
+
50
+ /// TDigest float sketch
51
+ using tdigest_float = tdigest<float>;
52
+ /// TDigest double sketch
53
+ using tdigest_double = tdigest<double>;
54
+
55
+ /**
56
+ * t-Digest for estimating quantiles and ranks.
57
+ * This implementation is based on the following paper:
58
+ * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
59
+ * and the following implementation in Java:
60
+ * https://github.com/tdunning/t-digest
61
+ * This implementation is similar to MergingDigest in the above Java implementation. T must be float or double (enforced by the static_assert below); Allocator is rebound for the internal containers.
62
+ */
63
+ template <typename T, typename Allocator>
64
+ class tdigest {
65
+ // exclude long double by not using std::is_floating_point
66
+ static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
67
+ static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
68
+ public:
69
+ using value_type = T;
70
+ using allocator_type = Allocator;
71
+
72
+ static const uint16_t DEFAULT_K = 200; // default for the constructor's k parameter
73
+
74
+ using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type; // weight type: uint64_t when T is double, uint32_t otherwise (float)
75
+
76
+ class centroid { // a mean value together with the weight it carries
77
+ public:
78
+ centroid(T value, W weight): mean_(value), weight_(weight) {}
79
+ void add(const centroid& other) { // absorbs other: weights are summed and the mean becomes the weighted mean of both
80
+ weight_ += other.weight_; // must run first: the next line divides by the combined weight
81
+ mean_ += (other.mean_ - mean_) * other.weight_ / weight_;
82
+ }
83
+ T get_mean() const { return mean_; }
84
+ W get_weight() const { return weight_; }
85
+ private:
86
+ T mean_;
87
+ W weight_;
88
+ };
89
+ using vector_t = std::vector<T, Allocator>;
90
+ using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>; // the aliases from here on rebind Allocator to their element type
91
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
93
+
94
+ struct centroid_cmp { // orders centroids by mean only; weights are not compared
95
+ centroid_cmp() {}
96
+ bool operator()(const centroid& a, const centroid& b) const {
97
+ if (a.get_mean() < b.get_mean()) return true;
98
+ return false;
99
+ }
100
+ };
101
+
102
+ /**
103
+ * Constructor
104
+ * @param k affects the size of the sketch and its estimation error
105
+ * @param allocator used to allocate memory
106
+ */
107
+ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
108
+
109
+ /**
110
+ * Update this t-Digest with the given value
111
+ * @param value to update the t-Digest with
112
+ */
113
+ void update(T value);
114
+
115
+ /**
116
+ * Merge the given t-Digest into this one
117
+ * @param other t-Digest to merge
118
+ */
119
+ void merge(const tdigest& other);
120
+
121
+ /**
122
+ * Process buffered values and merge centroids if needed
123
+ */
124
+ void compress();
125
+
126
+ /**
127
+ * @return true if t-Digest has not seen any data
128
+ */
129
+ bool is_empty() const;
130
+
131
+ /**
132
+ * @return minimum value seen by t-Digest
133
+ */
134
+ T get_min_value() const;
135
+
136
+ /**
137
+ * @return maximum value seen by t-Digest
138
+ */
139
+ T get_max_value() const;
140
+
141
+ /**
142
+ * @return total weight
143
+ */
144
+ uint64_t get_total_weight() const;
145
+
146
+ /**
147
+ * Returns an instance of the allocator for this t-Digest.
148
+ * @return allocator
149
+ */
150
+ Allocator get_allocator() const;
151
+
152
+ /**
153
+ * Compute approximate normalized rank of the given value.
154
+ *
155
+ * <p>If the sketch is empty this throws std::runtime_error.
156
+ *
157
+ * @param value to be ranked
158
+ * @return normalized rank (from 0 to 1 inclusive)
159
+ */
160
+ double get_rank(T value) const;
161
+
162
+ /**
163
+ * Compute approximate quantile value corresponding to the given normalized rank
164
+ *
165
+ * <p>If the sketch is empty this throws std::runtime_error.
166
+ *
167
+ * @param rank normalized rank (from 0 to 1 inclusive)
168
+ * @return quantile value corresponding to the given rank
169
+ */
170
+ T get_quantile(double rank) const;
171
+
172
+ /**
173
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
174
+ * given a set of split points.
175
+ *
176
+ * <p>If the sketch is empty this throws std::runtime_error.
177
+ *
178
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
179
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
180
+ *
181
+ * @param size the number of split points in the array
182
+ *
183
+ * @return an array of m+1 doubles each of which is an approximation
184
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
185
+ */
186
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
187
+
188
+ /**
189
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
190
+ * cumulative analog of the PMF, of the input stream given a set of split points.
191
+ *
192
+ * <p>If the sketch is empty this throws std::runtime_error.
193
+ *
194
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
195
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
196
+ *
197
+ * @param size the number of split points in the array
198
+ *
199
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
200
+ * of the input stream given the split_points. The value at array position j of the returned
201
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
202
+ * array. This can be viewed as array of ranks of the given split points plus one more value
203
+ * that is always 1.
204
+ */
205
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
206
+
207
+ /**
208
+ * @return parameter k (compression) that was used to configure this t-Digest
209
+ */
210
+ uint16_t get_k() const;
211
+
212
+ /**
213
+ * Human-readable summary of this t-Digest as a string
214
+ * @param print_centroids if true append the list of centroids with weights
215
+ * @return summary of this t-Digest
216
+ */
217
+ string<Allocator> to_string(bool print_centroids = false) const;
218
+
219
+ /**
220
+ * Computes size needed to serialize the current state.
221
+ * @param with_buffer optionally serialize buffered values avoiding compression
222
+ * @return size in bytes needed to serialize this tdigest
223
+ */
224
+ size_t get_serialized_size_bytes(bool with_buffer = false) const;
225
+
226
+ /**
227
+ * This method serializes t-Digest into a given stream in a binary form
228
+ * @param os output stream
229
+ * @param with_buffer optionally serialize buffered values avoiding compression
230
+ */
231
+ void serialize(std::ostream& os, bool with_buffer = false) const;
232
+
233
+ /**
234
+ * This method serializes t-Digest as a vector of bytes.
235
+ * An optional header can be reserved in front of the sketch.
236
+ * It is an uninitialized space of a given size.
237
+ * @param header_size_bytes space to reserve in front of the sketch
238
+ * @param with_buffer optionally serialize buffered values avoiding compression
239
+ * @return serialized sketch as a vector of bytes
240
+ */
241
+ vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
242
+
243
+ /**
244
+ * This method deserializes t-Digest from a given stream.
245
+ * @param is input stream
246
+ * @param allocator instance of an Allocator
247
+ * @return an instance of t-Digest
248
+ */
249
+ static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
250
+
251
+ /**
252
+ * This method deserializes t-Digest from a given array of bytes.
253
+ * @param bytes pointer to the array of bytes
254
+ * @param size the size of the array
255
+ * @param allocator instance of an Allocator
256
+ * @return an instance of t-Digest
257
+ */
258
+ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
259
+
260
+ private:
261
+ bool reverse_merge_;
262
+ uint16_t k_;
263
+ uint16_t internal_k_;
264
+ T min_;
265
+ T max_;
266
+ size_t centroids_capacity_;
267
+ vector_centroid centroids_;
268
+ uint64_t centroids_weight_;
269
+ size_t buffer_capacity_;
270
+ vector_t buffer_; // holds raw values of type T (vector_t), not centroids
271
+
272
+ static const size_t BUFFER_MULTIPLIER = 4;
273
+
274
+ static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
275
+ static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
276
+ static const uint8_t SERIAL_VERSION = 1;
277
+ static const uint8_t SKETCH_TYPE = 20;
278
+
279
+ static const uint8_t COMPAT_DOUBLE = 1;
280
+ static const uint8_t COMPAT_FLOAT = 2;
281
+
282
+ enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE }; // enumerator values are 0, 1, 2; presumably bit positions in a serialized flags byte - confirm in tdigest_impl.hpp
283
+
284
+ bool is_single_value() const;
285
+ uint8_t get_preamble_longs() const;
286
+ void merge(vector_centroid& buffer, W weight); // private overload of merge() that takes a mutable buffer of centroids and a weight
287
+
288
+ // for deserialize; NOTE(review): parameter total_weight_ uses the trailing underscore otherwise reserved for data members here
289
+ tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer);
290
+
291
+ static double weighted_average(double x1, double w1, double x2, double w2);
292
+
293
+ // for compatibility with format of the reference implementation
294
+ static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
295
+ static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
296
+
297
+ static inline void check_split_points(const T* values, uint32_t size); // presumably validates the split-point contract documented on get_PMF/get_CDF - confirm in tdigest_impl.hpp
298
+ };
299
+
300
+ } /* namespace datasketches */
301
+
302
+ #include "tdigest_impl.hpp"
303
+
304
+ #endif // _TDIGEST_HPP_