datasketches 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,41 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ add_library(tdigest INTERFACE) # INTERFACE library: no source files are listed, the target only carries usage requirements
19
+
20
+ add_library(${PROJECT_NAME}::TDIGEST ALIAS tdigest) # namespaced alias that refers to the same target
21
+
22
+ if (BUILD_TESTS) # the test subdirectory is added only when BUILD_TESTS is set
23
+ add_subdirectory(test)
24
+ endif()
25
+
26
+ target_include_directories(tdigest # headers resolve to ./include in the build tree and <install prefix>/include after install
27
+ INTERFACE
28
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
29
+ $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
30
+ )
31
+
32
+ target_link_libraries(tdigest INTERFACE common) # anything linking tdigest also gets the 'common' target
33
+
34
+ install(TARGETS tdigest # add the target to the ${PROJECT_NAME} export set
35
+ EXPORT ${PROJECT_NAME}
36
+ )
37
+
38
+ install(FILES # install the two public headers under ${CMAKE_INSTALL_INCLUDEDIR}/DataSketches
39
+ include/tdigest.hpp
40
+ include/tdigest_impl.hpp
41
+ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -0,0 +1,304 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _TDIGEST_HPP_
21
+ #define _TDIGEST_HPP_
22
+
23
+ #include <cstddef>
24
+ #include <limits>
25
+ #include <type_traits>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ // Scale function: this is the equivalent of K_2 (the default) in the Java implementation mentioned below.
33
+ // Generates cluster sizes proportional to q*(1-q).
34
+ // The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
35
+ struct scale_function { // holds no data members; every member function is const and uses only its arguments
36
+ double max(double q, double normalizer) const { // returns q * (1 - q) divided by the given normalizer; q is presumably a normalized rank in [0, 1] - confirm against callers
37
+ return q * (1 - q) / normalizer;
38
+ }
39
+ double normalizer(double compression, double n) const { // returns compression divided by z(compression, n)
40
+ return compression / z(compression, n);
41
+ }
42
+ double z(double compression, double n) const { // returns 4 * ln(n / compression) + 24; the arguments are not range-checked here
43
+ return 4 * std::log(n / compression) + 24; // NOTE(review): std::log is declared in <cmath>, which is not among this header's visible includes; presumably it arrives transitively - confirm
44
+ }
45
+ };
46
+
47
+ // forward declaration; this is where the default Allocator (std::allocator<T>) is specified
48
+ template <typename T, typename Allocator = std::allocator<T>> class tdigest;
49
+
50
+ /// TDigest float sketch (tdigest over float with the default allocator)
51
+ using tdigest_float = tdigest<float>;
52
+ /// TDigest double sketch (tdigest over double with the default allocator)
53
+ using tdigest_double = tdigest<double>;
54
+
55
+ /**
56
+ * t-Digest for estimating quantiles and ranks.
57
+ * This implementation is based on the following paper:
58
+ * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests
59
+ * and the following implementation in Java:
60
+ * https://github.com/tdunning/t-digest
61
+ * This implementation is similar to MergingDigest in the above Java implementation. T must be float or double (enforced by the static_asserts in the class body); the internal vectors use Allocator, rebound to the element type where needed.
62
+ */
63
+ template <typename T, typename Allocator>
64
+ class tdigest {
65
+ // exclude long double by not using std::is_floating_point
66
+ static_assert(std::is_same<T, double>::value || std::is_same<T, float>::value, "Either double or float type expected");
67
+ static_assert(std::numeric_limits<T>::is_iec559, "IEEE 754 compatibility required");
68
+ public:
69
+ using value_type = T;
70
+ using allocator_type = Allocator;
71
+
72
+ static const uint16_t DEFAULT_K = 200; // default value of the constructor's k parameter
73
+
74
+ using W = typename std::conditional<std::is_same<T, double>::value, uint64_t, uint32_t>::type; // centroid weight type: uint64_t when T is double, uint32_t otherwise
75
+
76
+ class centroid { // a (mean, weight) pair
77
+ public:
78
+ centroid(T value, W weight): mean_(value), weight_(weight) {}
79
+ void add(const centroid& other) { // folds other into this centroid
80
+ weight_ += other.weight_; // updated first on purpose: the next line divides by the combined weight
81
+ mean_ += (other.mean_ - mean_) * other.weight_ / weight_; // moves the mean toward other.mean_ by the fraction other.weight_ / combined weight
82
+ }
83
+ T get_mean() const { return mean_; }
84
+ W get_weight() const { return weight_; }
85
+ private:
86
+ T mean_;
87
+ W weight_;
88
+ };
89
+ using vector_t = std::vector<T, Allocator>;
90
+ using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
91
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
93
+
94
+ struct centroid_cmp { // comparator: orders centroids by mean only, weights are not compared
95
+ centroid_cmp() {}
96
+ bool operator()(const centroid& a, const centroid& b) const {
97
+ if (a.get_mean() < b.get_mean()) return true;
98
+ return false;
99
+ }
100
+ };
101
+
102
+ /**
103
+ * Constructor
104
+ * @param k affects the size of the sketch and its estimation error (defaults to DEFAULT_K)
105
+ * @param allocator used to allocate memory (defaults to a default-constructed Allocator)
106
+ */
107
+ explicit tdigest(uint16_t k = DEFAULT_K, const Allocator& allocator = Allocator());
108
+
109
+ /**
110
+ * Update this t-Digest with the given value
111
+ * @param value to update the t-Digest with
112
+ */
113
+ void update(T value);
114
+
115
+ /**
116
+ * Merge the given t-Digest into this one
117
+ * @param other t-Digest to merge
118
+ */
119
+ void merge(const tdigest& other);
120
+
121
+ /**
122
+ * Process buffered values and merge centroids if needed
123
+ */
124
+ void compress();
125
+
126
+ /**
127
+ * @return true if t-Digest has not seen any data
128
+ */
129
+ bool is_empty() const;
130
+
131
+ /**
132
+ * @return minimum value seen by t-Digest
133
+ */
134
+ T get_min_value() const;
135
+
136
+ /**
137
+ * @return maximum value seen by t-Digest
138
+ */
139
+ T get_max_value() const;
140
+
141
+ /**
142
+ * @return total weight
143
+ */
144
+ uint64_t get_total_weight() const;
145
+
146
+ /**
147
+ * Returns an instance of the allocator for this t-Digest.
148
+ * @return allocator
149
+ */
150
+ Allocator get_allocator() const;
151
+
152
+ /**
153
+ * Compute approximate normalized rank of the given value.
154
+ *
155
+ * <p>If the sketch is empty this throws std::runtime_error.
156
+ *
157
+ * @param value to be ranked
158
+ * @return normalized rank (from 0 to 1 inclusive)
159
+ */
160
+ double get_rank(T value) const;
161
+
162
+ /**
163
+ * Compute approximate quantile value corresponding to the given normalized rank
164
+ *
165
+ * <p>If the sketch is empty this throws std::runtime_error.
166
+ *
167
+ * @param rank normalized rank (from 0 to 1 inclusive)
168
+ * @return quantile value corresponding to the given rank
169
+ */
170
+ T get_quantile(double rank) const;
171
+
172
+ /**
173
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
174
+ * given a set of split points.
175
+ *
176
+ * <p>If the sketch is empty this throws std::runtime_error.
177
+ *
178
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
179
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
180
+ *
181
+ * @param size the number of split points in the array
182
+ *
183
+ * @return an array of m+1 doubles each of which is an approximation
184
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
185
+ */
186
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
187
+
188
+ /**
189
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
190
+ * cumulative analog of the PMF, of the input stream given a set of split points.
191
+ *
192
+ * <p>If the sketch is empty this throws std::runtime_error.
193
+ *
194
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
195
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
196
+ *
197
+ * @param size the number of split points in the array
198
+ *
199
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
200
+ * of the input stream given the split_points. The value at array position j of the returned
201
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
202
+ * array. This can be viewed as array of ranks of the given split points plus one more value
203
+ * that is always 1.
204
+ */
205
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
206
+
207
+ /**
208
+ * @return parameter k (compression) that was used to configure this t-Digest
209
+ */
210
+ uint16_t get_k() const;
211
+
212
+ /**
213
+ * Human-readable summary of this t-Digest as a string
214
+ * @param print_centroids if true append the list of centroids with weights (defaults to false)
215
+ * @return summary of this t-Digest
216
+ */
217
+ string<Allocator> to_string(bool print_centroids = false) const; // NOTE(review): string<> is not declared in this header; presumably an alias template from common_defs.hpp - confirm
218
+
219
+ /**
220
+ * Computes size needed to serialize the current state.
221
+ * @param with_buffer optionally serialize buffered values avoiding compression (defaults to false)
222
+ * @return size in bytes needed to serialize this tdigest
223
+ */
224
+ size_t get_serialized_size_bytes(bool with_buffer = false) const;
225
+
226
+ /**
227
+ * This method serializes t-Digest into a given stream in a binary form
228
+ * @param os output stream
229
+ * @param with_buffer optionally serialize buffered values avoiding compression (defaults to false)
230
+ */
231
+ void serialize(std::ostream& os, bool with_buffer = false) const;
232
+
233
+ /**
234
+ * This method serializes t-Digest as a vector of bytes.
235
+ * An optional header can be reserved in front of the sketch.
236
+ * It is an uninitialized space of a given size.
237
+ * @param header_size_bytes space to reserve in front of the sketch (defaults to 0)
238
+ * @param with_buffer optionally serialize buffered values avoiding compression (defaults to false)
239
+ * @return serialized sketch as a vector of bytes
240
+ */
241
+ vector_bytes serialize(unsigned header_size_bytes = 0, bool with_buffer = false) const;
242
+
243
+ /**
244
+ * This method deserializes t-Digest from a given stream.
245
+ * @param is input stream
246
+ * @param allocator instance of an Allocator
247
+ * @return an instance of t-Digest
248
+ */
249
+ static tdigest deserialize(std::istream& is, const Allocator& allocator = Allocator());
250
+
251
+ /**
252
+ * This method deserializes t-Digest from a given array of bytes.
253
+ * @param bytes pointer to the array of bytes
254
+ * @param size the size of the array
255
+ * @param allocator instance of an Allocator
256
+ * @return an instance of t-Digest
257
+ */
258
+ static tdigest deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
259
+
260
+ private:
261
+ bool reverse_merge_;
262
+ uint16_t k_;
263
+ uint16_t internal_k_; // NOTE(review): relation to k_ is not visible in this header - see tdigest_impl.hpp
264
+ T min_;
265
+ T max_;
266
+ size_t centroids_capacity_;
267
+ vector_centroid centroids_;
268
+ uint64_t centroids_weight_;
269
+ size_t buffer_capacity_;
270
+ vector_t buffer_; // holds raw values of type T (vector_t), not centroids
271
+
272
+ static const size_t BUFFER_MULTIPLIER = 4; // presumably scales buffer_capacity_ relative to centroids_capacity_ - confirm in tdigest_impl.hpp
273
+
274
+ static const uint8_t PREAMBLE_LONGS_EMPTY_OR_SINGLE = 1;
275
+ static const uint8_t PREAMBLE_LONGS_MULTIPLE = 2;
276
+ static const uint8_t SERIAL_VERSION = 1;
277
+ static const uint8_t SKETCH_TYPE = 20;
278
+
279
+ static const uint8_t COMPAT_DOUBLE = 1;
280
+ static const uint8_t COMPAT_FLOAT = 2;
281
+
282
+ enum flags { IS_EMPTY, IS_SINGLE_VALUE, REVERSE_MERGE }; // implicit values 0, 1, 2; presumably bit positions in a serialized flags byte - confirm in tdigest_impl.hpp
283
+
284
+ bool is_single_value() const;
285
+ uint8_t get_preamble_longs() const;
286
+ void merge(vector_centroid& buffer, W weight); // private overload: takes a mutable centroid buffer and a weight, unlike the public merge(const tdigest&)
287
+
288
+ // for deserialize
289
+ tdigest(bool reverse_merge, uint16_t k, T min, T max, vector_centroid&& centroids, uint64_t total_weight_, vector_t&& buffer); // NOTE(review): total_weight_ is a parameter but carries the trailing underscore used for data members
290
+
291
+ static double weighted_average(double x1, double w1, double x2, double w2);
292
+
293
+ // for compatibility with format of the reference implementation
294
+ static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
295
+ static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
296
+
297
+ static inline void check_split_points(const T* values, uint32_t size); // presumably validates the split-point precondition documented on get_PMF / get_CDF - confirm in tdigest_impl.hpp
298
+ };
299
+
300
+ } /* namespace datasketches */
301
+
302
+ #include "tdigest_impl.hpp"
303
+
304
+ #endif // _TDIGEST_HPP_