datasketches 0.4.2 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -0,0 +1,54 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
30
+
31
+ TEST_CASE("tdigest double", "[serde_compat]") {
32
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
33
+ for (const unsigned n: n_arr) {
34
+ std::ifstream is;
35
+ is.exceptions(std::ios::failbit | std::ios::badbit);
36
+ is.open(testBinaryInputPath + "tdigest_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
37
+ const auto td = tdigest<double>::deserialize(is);
38
+ REQUIRE(td.is_empty() == (n == 0));
39
+ REQUIRE(td.get_total_weight() == n);
40
+ if (n > 0) {
41
+ REQUIRE(td.get_min_value() == 1.0);
42
+ REQUIRE(td.get_max_value() == static_cast<double>(n));
43
+ REQUIRE(td.get_rank(0) == 0);
44
+ REQUIRE(td.get_rank(n + 1) == 1);
45
+ if (n == 1) {
46
+ REQUIRE(td.get_rank(n) == 0.5);
47
+ } else {
48
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.05));
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+ } /* namespace datasketches */
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("tdigest double generate", "[serialize_for_java]") {
28
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
29
+ for (const unsigned n: n_arr) {
30
+ tdigest_double td(100);
31
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
32
+ std::ofstream os("tdigest_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
33
+ td.serialize(os);
34
+ }
35
+ }
36
+
37
+ TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
38
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
39
+ for (const unsigned n: n_arr) {
40
+ tdigest_double td(100);
41
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
42
+ std::ofstream os("tdigest_double_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
43
+ td.serialize(os, true);
44
+ }
45
+ }
46
+
47
+ TEST_CASE("tdigest float generate", "[serialize_for_java]") {
48
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
49
+ for (const unsigned n: n_arr) {
50
+ tdigest_float td(100);
51
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
52
+ std::ofstream os("tdigest_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
53
+ td.serialize(os);
54
+ }
55
+ }
56
+
57
+ TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
58
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
59
+ for (const unsigned n: n_arr) {
60
+ tdigest_float td(100);
61
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
62
+ std::ofstream os("tdigest_float_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
63
+ td.serialize(os, true);
64
+ }
65
+ }
66
+
67
+ } /* namespace datasketches */
@@ -0,0 +1,447 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <iostream>
22
+ #include <fstream>
23
+
24
+ #include "tdigest.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("empty", "[tdigest]") {
29
+ tdigest_double td(10);
30
+ // std::cout << td.to_string();
31
+ REQUIRE(td.is_empty());
32
+ REQUIRE(td.get_k() == 10);
33
+ REQUIRE(td.get_total_weight() == 0);
34
+ REQUIRE_THROWS_AS(td.get_min_value(), std::runtime_error);
35
+ REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
+ REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
+ REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ }
39
+
40
+ TEST_CASE("one value", "[tdigest]") {
41
+ tdigest_double td(100);
42
+ td.update(1);
43
+ REQUIRE(td.get_k() == 100);
44
+ REQUIRE(td.get_total_weight() == 1);
45
+ REQUIRE(td.get_min_value() == 1);
46
+ REQUIRE(td.get_max_value() == 1);
47
+ REQUIRE(td.get_rank(0.99) == 0);
48
+ REQUIRE(td.get_rank(1) == 0.5);
49
+ REQUIRE(td.get_rank(1.01) == 1);
50
+ REQUIRE(td.get_quantile(0) == 1);
51
+ REQUIRE(td.get_quantile(0.5) == 1);
52
+ REQUIRE(td.get_quantile(1) == 1);
53
+ }
54
+
55
+ TEST_CASE("many values", "[tdigest]") {
56
+ const size_t n = 10000;
57
+ tdigest_double td;
58
+ for (size_t i = 0; i < n; ++i) td.update(i);
59
+ // std::cout << td.to_string(true);
60
+ // td.compress();
61
+ // std::cout << td.to_string(true);
62
+ REQUIRE_FALSE(td.is_empty());
63
+ REQUIRE(td.get_total_weight() == n);
64
+ REQUIRE(td.get_min_value() == 0);
65
+ REQUIRE(td.get_max_value() == n - 1);
66
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
67
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
68
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
69
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
70
+ REQUIRE(td.get_rank(n) == 1);
71
+ REQUIRE(td.get_quantile(0) == 0);
72
+ REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
73
+ REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
+ REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
+ REQUIRE(td.get_quantile(1) == n - 1);
76
+ }
77
+
78
+ TEST_CASE("rank - two values", "[tdigest]") {
79
+ tdigest_double td(100);
80
+ td.update(1);
81
+ td.update(2);
82
+ // td.compress();
83
+ // std::cout << td.to_string(true);
84
+ REQUIRE(td.get_rank(0.99) == 0);
85
+ REQUIRE(td.get_rank(1) == 0.25);
86
+ REQUIRE(td.get_rank(1.25) == 0.375);
87
+ REQUIRE(td.get_rank(1.5) == 0.5);
88
+ REQUIRE(td.get_rank(1.75) == 0.625);
89
+ REQUIRE(td.get_rank(2) == 0.75);
90
+ REQUIRE(td.get_rank(2.01) == 1);
91
+ }
92
+
93
+ TEST_CASE("rank - repeated value", "[tdigest]") {
94
+ tdigest_double td(100);
95
+ td.update(1);
96
+ td.update(1);
97
+ td.update(1);
98
+ td.update(1);
99
+ // td.compress();
100
+ // std::cout << td.to_string(true);
101
+ REQUIRE(td.get_rank(0.99) == 0);
102
+ REQUIRE(td.get_rank(1) == 0.5);
103
+ REQUIRE(td.get_rank(1.01) == 1);
104
+ }
105
+
106
+ TEST_CASE("rank - repeated block", "[tdigest]") {
107
+ tdigest_double td(100);
108
+ td.update(1);
109
+ td.update(2);
110
+ td.update(2);
111
+ td.update(3);
112
+ // td.compress();
113
+ // std::cout << td.to_string(true);
114
+ REQUIRE(td.get_rank(0.99) == 0);
115
+ REQUIRE(td.get_rank(1) == 0.125);
116
+ REQUIRE(td.get_rank(2) == 0.5);
117
+ REQUIRE(td.get_rank(3) == 0.875);
118
+ REQUIRE(td.get_rank(3.01) == 1);
119
+ }
120
+
121
+ TEST_CASE("merge small", "[tdigest]") {
122
+ tdigest_double td1(10);
123
+ td1.update(1);
124
+ td1.update(2);
125
+ tdigest_double td2(10);
126
+ td2.update(2);
127
+ td2.update(3);
128
+ td1.merge(td2);
129
+ REQUIRE(td1.get_min_value() == 1);
130
+ REQUIRE(td1.get_max_value() == 3);
131
+ REQUIRE(td1.get_total_weight() == 4);
132
+ REQUIRE(td1.get_rank(0.99) == 0);
133
+ REQUIRE(td1.get_rank(1) == 0.125);
134
+ REQUIRE(td1.get_rank(2) == 0.5);
135
+ REQUIRE(td1.get_rank(3) == 0.875);
136
+ REQUIRE(td1.get_rank(3.01) == 1);
137
+ }
138
+
139
+ TEST_CASE("merge large", "[tdigest]") {
140
+ const size_t n = 10000;
141
+ tdigest_double td1;
142
+ tdigest_double td2;
143
+ for (size_t i = 0; i < n / 2; ++i) {
144
+ td1.update(i);
145
+ td2.update(n / 2 + i);
146
+ }
147
+ // std::cout << td1.to_string();
148
+ // std::cout << td2.to_string();
149
+ td1.merge(td2);
150
+ // td1.compress();
151
+ // std::cout << td1.to_string(true);
152
+ REQUIRE(td1.get_total_weight() == n);
153
+ REQUIRE(td1.get_min_value() == 0);
154
+ REQUIRE(td1.get_max_value() == n - 1);
155
+ REQUIRE(td1.get_rank(0) == Approx(0).margin(0.0001));
156
+ REQUIRE(td1.get_rank(n / 4) == Approx(0.25).margin(0.0001));
157
+ REQUIRE(td1.get_rank(n / 2) == Approx(0.5).margin(0.0001));
158
+ REQUIRE(td1.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
159
+ REQUIRE(td1.get_rank(n) == 1);
160
+ }
161
+
162
+ TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
163
+ tdigest<double> td(100);
164
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
165
+ td.serialize(s);
166
+ auto deserialized_td = tdigest<double>::deserialize(s);
167
+ REQUIRE(td.get_k() == deserialized_td.get_k());
168
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
169
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
170
+ }
171
+
172
+ TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
173
+ tdigest<double> td;
174
+ td.update(123);
175
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
176
+ td.serialize(s);
177
+ auto deserialized_td = tdigest<double>::deserialize(s);
178
+ REQUIRE(deserialized_td.get_k() == 200);
179
+ REQUIRE(deserialized_td.get_total_weight() == 1);
180
+ REQUIRE_FALSE(deserialized_td.is_empty());
181
+ REQUIRE(deserialized_td.get_min_value() == 123);
182
+ REQUIRE(deserialized_td.get_max_value() == 123);
183
+ }
184
+
185
+ TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
186
+ tdigest<double> td;
187
+ td.update(123);
188
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
189
+ td.serialize(s, true);
190
+ auto deserialized_td = tdigest<double>::deserialize(s);
191
+ REQUIRE(deserialized_td.get_k() == 200);
192
+ REQUIRE(deserialized_td.get_total_weight() == 1);
193
+ REQUIRE_FALSE(deserialized_td.is_empty());
194
+ REQUIRE(deserialized_td.get_min_value() == 123);
195
+ REQUIRE(deserialized_td.get_max_value() == 123);
196
+ }
197
+
198
+ TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
199
+ tdigest<double> td(100);
200
+ for (int i = 0; i < 1000; ++i) td.update(i);
201
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
202
+ td.serialize(s);
203
+ auto deserialized_td = tdigest<double>::deserialize(s);
204
+ REQUIRE(td.get_k() == deserialized_td.get_k());
205
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
206
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
207
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
208
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
209
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
210
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
211
+ }
212
+
213
+ TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
214
+ tdigest<double> td(100);
215
+ for (int i = 0; i < 10000; ++i) td.update(i);
216
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
217
+ td.serialize(s, true);
218
+ auto deserialized_td = tdigest<double>::deserialize(s);
219
+ REQUIRE(td.get_k() == deserialized_td.get_k());
220
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
221
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
222
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
223
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
224
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
225
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
226
+ }
227
+
228
+ TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
229
+ tdigest<double> td(100);
230
+ auto bytes = td.serialize();
231
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
232
+ REQUIRE(td.get_k() == deserialized_td.get_k());
233
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
234
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
235
+ }
236
+
237
+ TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
238
+ tdigest<double> td(200);
239
+ td.update(123);
240
+ auto bytes = td.serialize();
241
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
242
+ REQUIRE(deserialized_td.get_k() == 200);
243
+ REQUIRE(deserialized_td.get_total_weight() == 1);
244
+ REQUIRE_FALSE(deserialized_td.is_empty());
245
+ REQUIRE(deserialized_td.get_min_value() == 123);
246
+ REQUIRE(deserialized_td.get_max_value() == 123);
247
+ }
248
+
249
+ TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
250
+ tdigest<double> td(200);
251
+ td.update(123);
252
+ auto bytes = td.serialize(0, true);
253
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
254
+ REQUIRE(deserialized_td.get_k() == 200);
255
+ REQUIRE(deserialized_td.get_total_weight() == 1);
256
+ REQUIRE_FALSE(deserialized_td.is_empty());
257
+ REQUIRE(deserialized_td.get_min_value() == 123);
258
+ REQUIRE(deserialized_td.get_max_value() == 123);
259
+ }
260
+
261
+ TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
262
+ tdigest<double> td(100);
263
+ for (int i = 0; i < 1000; ++i) td.update(i);
264
+ auto bytes = td.serialize();
265
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
266
+ REQUIRE(td.get_k() == deserialized_td.get_k());
267
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
268
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
269
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
270
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
271
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
272
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
273
+ }
274
+
275
+ TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") { // byte-array round trip of a sketch serialized together with its buffer
276
+ tdigest<double> td(100);
277
+ for (int i = 0; i < 10000; ++i) td.update(i);
278
+ auto bytes = td.serialize(0, true); // same (0, true) arguments as the other buffered byte-array tests; a bare serialize() only repeats the unbuffered case
279
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
280
+ REQUIRE(td.get_k() == deserialized_td.get_k());
281
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
282
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
283
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
284
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
285
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500)); // original and restored sketches must answer queries identically
286
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
287
+ }
288
+
289
+ TEST_CASE("serialize deserialize stream and bytes equivalence empty", "[tdigest]") { // empty sketch: stream and byte-array serialization must agree
290
+ tdigest<double> td(100);
291
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
292
+ td.serialize(s);
293
+ auto bytes = td.serialize();
294
+
295
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
296
+ for (size_t i = 0; i < bytes.size(); ++i) { // compare the two outputs byte for byte
297
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
298
+ }
299
+
300
+ s.seekg(0); // rewind
301
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
302
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
303
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must consume exactly the serialized size
304
+
305
+ REQUIRE(deserialized_td1.is_empty());
306
+ REQUIRE(deserialized_td2.is_empty());
307
+ REQUIRE(deserialized_td1.get_k() == 100);
308
+ REQUIRE(deserialized_td2.get_k() == 100);
309
+ REQUIRE(deserialized_td1.get_total_weight() == 0);
310
+ REQUIRE(deserialized_td2.get_total_weight() == 0);
311
+ }
312
+
313
+ TEST_CASE("serialize deserialize stream and bytes equivalence", "[tdigest]") { // populated sketch: stream and byte-array serialization must agree
314
+ tdigest<double> td(100);
315
+ const int n = 1000;
316
+ for (int i = 0; i < n; ++i) td.update(i);
317
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
318
+ td.serialize(s);
319
+ auto bytes = td.serialize();
320
+
321
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
322
+ for (size_t i = 0; i < bytes.size(); ++i) { // compare the two outputs byte for byte
323
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
324
+ }
325
+
326
+ s.seekg(0); // rewind
327
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
328
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
329
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must consume exactly the serialized size
330
+
331
+ REQUIRE_FALSE(deserialized_td1.is_empty());
332
+ REQUIRE(deserialized_td1.get_k() == 100);
333
+ REQUIRE(deserialized_td1.get_total_weight() == n);
334
+ REQUIRE(deserialized_td1.get_min_value() == 0);
335
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
336
+
337
+ REQUIRE_FALSE(deserialized_td2.is_empty());
338
+ REQUIRE(deserialized_td2.get_k() == 100);
339
+ REQUIRE(deserialized_td2.get_total_weight() == n);
340
+ REQUIRE(deserialized_td2.get_min_value() == 0);
341
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
342
+
343
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2)); // both restored sketches must answer queries identically
344
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
345
+ }
346
+
347
+ TEST_CASE("serialize deserialize stream and bytes equivalence with buffer", "[tdigest]") { // buffered form: stream and byte-array serialization must agree
348
+ tdigest<double> td(100);
349
+ const int n = 10000;
350
+ for (int i = 0; i < n; ++i) td.update(i);
351
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
352
+ td.serialize(s, true);
353
+ auto bytes = td.serialize(0, true);
354
+
355
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
356
+ for (size_t i = 0; i < bytes.size(); ++i) { // compare the two outputs byte for byte
357
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
358
+ }
359
+
360
+ s.seekg(0); // rewind
361
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
362
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
363
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream deserialization must consume exactly the serialized size
364
+
365
+ REQUIRE_FALSE(deserialized_td1.is_empty());
366
+ REQUIRE(deserialized_td1.get_k() == 100);
367
+ REQUIRE(deserialized_td1.get_total_weight() == n);
368
+ REQUIRE(deserialized_td1.get_min_value() == 0);
369
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
370
+
371
+ REQUIRE_FALSE(deserialized_td2.is_empty());
372
+ REQUIRE(deserialized_td2.get_k() == 100);
373
+ REQUIRE(deserialized_td2.get_total_weight() == n);
374
+ REQUIRE(deserialized_td2.get_min_value() == 0);
375
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
376
+
377
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2)); // both restored sketches must answer queries identically
378
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
379
+ }
380
+
381
+ TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
382
+ std::ifstream is;
383
+ is.exceptions(std::ios::failbit | std::ios::badbit);
384
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
385
+ const auto td = tdigest<double>::deserialize(is);
386
+ const size_t n = 10000;
387
+ REQUIRE(td.get_total_weight() == n);
388
+ REQUIRE(td.get_min_value() == 0);
389
+ REQUIRE(td.get_max_value() == n - 1);
390
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
391
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
392
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
393
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
394
+ REQUIRE(td.get_rank(n) == 1);
395
+ }
396
+
397
+ TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
398
+ std::ifstream is;
399
+ is.exceptions(std::ios::failbit | std::ios::badbit);
400
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
401
+ const auto td = tdigest<float>::deserialize(is);
402
+ const size_t n = 10000;
403
+ REQUIRE(td.get_total_weight() == n);
404
+ REQUIRE(td.get_min_value() == 0);
405
+ REQUIRE(td.get_max_value() == n - 1);
406
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
407
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
408
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
409
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
410
+ REQUIRE(td.get_rank(n) == 1);
411
+ }
412
+
413
+ TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
414
+ std::ifstream is;
415
+ is.exceptions(std::ios::failbit | std::ios::badbit);
416
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
417
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
418
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
419
+ const size_t n = 10000;
420
+ REQUIRE(td.get_total_weight() == n);
421
+ REQUIRE(td.get_min_value() == 0);
422
+ REQUIRE(td.get_max_value() == n - 1);
423
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
424
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
425
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
426
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
427
+ REQUIRE(td.get_rank(n) == 1);
428
+ }
429
+
430
+ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") { // byte-array counterpart of the float stream test
431
+ std::ifstream is;
432
+ is.exceptions(std::ios::failbit | std::ios::badbit); // a missing reference file fails loudly on open
433
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
434
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
435
+ const auto td = tdigest<float>::deserialize(bytes.data(), bytes.size()); // float instantiation, matching the float stream test; the double one is covered by the "bytes double" case
436
+ const size_t n = 10000;
437
+ REQUIRE(td.get_total_weight() == n);
438
+ REQUIRE(td.get_min_value() == 0);
439
+ REQUIRE(td.get_max_value() == n - 1);
440
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
441
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
442
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
443
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
444
+ REQUIRE(td.get_rank(n) == 1);
445
+ }
446
+
447
+ } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(theta
30
30
  )
31
31
 
32
32
  target_link_libraries(theta INTERFACE common)
33
- target_compile_features(theta INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS theta
36
35
  EXPORT ${PROJECT_NAME}
@@ -57,7 +57,7 @@ public:
57
57
  // consistent way of initializing theta from p
58
58
  // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
59
59
  static uint64_t starting_theta_from_p(float p) {
60
- if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
60
+ if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
61
61
  return theta_constants::MAX_THETA;
62
62
  }
63
63
 
@@ -417,6 +417,20 @@ public:
417
417
  virtual uint32_t get_num_retained() const;
418
418
  virtual uint16_t get_seed_hash() const;
419
419
 
420
+ /**
421
+ * Computes maximum serialized size in bytes
422
+ * @param lg_k base-2 logarithm of the nominal number of entries in the sketch
423
+ */
424
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
425
+
426
+ /**
427
+ * Computes size in bytes required to serialize the current state of the sketch.
428
+ * Computing the compressed size is expensive: it requires iterating over all retained hashes,
429
+ * and the actual serialization will have to look at them again.
430
+ * @param compressed if true compressed size is returned (if applicable)
431
+ */
432
+ size_t get_serialized_size_bytes(bool compressed = false) const;
433
+
420
434
  /**
421
435
  * This method serializes the sketch into a given stream in a binary form
422
436
  * @param os output stream
@@ -486,8 +500,11 @@ private:
486
500
  uint64_t theta_;
487
501
  std::vector<uint64_t, Allocator> entries_;
488
502
 
503
+ uint8_t get_preamble_longs(bool compressed) const;
489
504
  bool is_suitable_for_compression() const;
490
- uint8_t compute_min_leading_zeros() const;
505
+ uint8_t compute_entry_bits() const;
506
+ uint8_t get_num_entries_bytes() const;
507
+ size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
491
508
  void serialize_version_4(std::ostream& os) const;
492
509
  vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
493
510