datasketches 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/README.md +2 -3
  9. data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
  10. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  11. data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
  12. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  13. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  14. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  16. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  20. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  21. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  24. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  25. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  26. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  27. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  28. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  29. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  30. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
  31. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
  32. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  33. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  34. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  35. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  36. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  37. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  38. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
  39. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  40. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  41. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  42. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  43. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
  44. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  46. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  47. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  48. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  49. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  50. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  51. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  52. metadata +13 -3
@@ -0,0 +1,54 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
30
+
31
+ TEST_CASE("tdigest double", "[serde_compat]") {
32
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
33
+ for (const unsigned n: n_arr) {
34
+ std::ifstream is;
35
+ is.exceptions(std::ios::failbit | std::ios::badbit);
36
+ is.open(testBinaryInputPath + "tdigest_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
37
+ const auto td = tdigest<double>::deserialize(is);
38
+ REQUIRE(td.is_empty() == (n == 0));
39
+ REQUIRE(td.get_total_weight() == n);
40
+ if (n > 0) {
41
+ REQUIRE(td.get_min_value() == 1.0);
42
+ REQUIRE(td.get_max_value() == static_cast<double>(n));
43
+ REQUIRE(td.get_rank(0) == 0);
44
+ REQUIRE(td.get_rank(n + 1) == 1);
45
+ if (n == 1) {
46
+ REQUIRE(td.get_rank(n) == 0.5);
47
+ } else {
48
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.05));
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+ } /* namespace datasketches */
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("tdigest double generate", "[serialize_for_java]") {
28
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
29
+ for (const unsigned n: n_arr) {
30
+ tdigest_double td(100);
31
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
32
+ std::ofstream os("tdigest_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
33
+ td.serialize(os);
34
+ }
35
+ }
36
+
37
+ TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
38
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
39
+ for (const unsigned n: n_arr) {
40
+ tdigest_double td(100);
41
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
42
+ std::ofstream os("tdigest_double_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
43
+ td.serialize(os, true);
44
+ }
45
+ }
46
+
47
+ TEST_CASE("tdigest float generate", "[serialize_for_java]") {
48
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
49
+ for (const unsigned n: n_arr) {
50
+ tdigest_float td(100);
51
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
52
+ std::ofstream os("tdigest_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
53
+ td.serialize(os);
54
+ }
55
+ }
56
+
57
+ TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
58
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
59
+ for (const unsigned n: n_arr) {
60
+ tdigest_float td(100);
61
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
62
+ std::ofstream os("tdigest_float_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
63
+ td.serialize(os, true);
64
+ }
65
+ }
66
+
67
+ } /* namespace datasketches */
@@ -0,0 +1,447 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <iostream>
22
+ #include <fstream>
23
+
24
+ #include "tdigest.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("empty", "[tdigest]") {
29
+ tdigest_double td(10);
30
+ // std::cout << td.to_string();
31
+ REQUIRE(td.is_empty());
32
+ REQUIRE(td.get_k() == 10);
33
+ REQUIRE(td.get_total_weight() == 0);
34
+ REQUIRE_THROWS_AS(td.get_min_value(), std::runtime_error);
35
+ REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
+ REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
+ REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ }
39
+
40
+ TEST_CASE("one value", "[tdigest]") {
41
+ tdigest_double td(100);
42
+ td.update(1);
43
+ REQUIRE(td.get_k() == 100);
44
+ REQUIRE(td.get_total_weight() == 1);
45
+ REQUIRE(td.get_min_value() == 1);
46
+ REQUIRE(td.get_max_value() == 1);
47
+ REQUIRE(td.get_rank(0.99) == 0);
48
+ REQUIRE(td.get_rank(1) == 0.5);
49
+ REQUIRE(td.get_rank(1.01) == 1);
50
+ REQUIRE(td.get_quantile(0) == 1);
51
+ REQUIRE(td.get_quantile(0.5) == 1);
52
+ REQUIRE(td.get_quantile(1) == 1);
53
+ }
54
+
55
+ TEST_CASE("many values", "[tdigest]") {
56
+ const size_t n = 10000;
57
+ tdigest_double td;
58
+ for (size_t i = 0; i < n; ++i) td.update(i);
59
+ // std::cout << td.to_string(true);
60
+ // td.compress();
61
+ // std::cout << td.to_string(true);
62
+ REQUIRE_FALSE(td.is_empty());
63
+ REQUIRE(td.get_total_weight() == n);
64
+ REQUIRE(td.get_min_value() == 0);
65
+ REQUIRE(td.get_max_value() == n - 1);
66
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
67
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
68
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
69
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
70
+ REQUIRE(td.get_rank(n) == 1);
71
+ REQUIRE(td.get_quantile(0) == 0);
72
+ REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
73
+ REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
+ REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
+ REQUIRE(td.get_quantile(1) == n - 1);
76
+ }
77
+
78
+ TEST_CASE("rank - two values", "[tdigest]") {
79
+ tdigest_double td(100);
80
+ td.update(1);
81
+ td.update(2);
82
+ // td.compress();
83
+ // std::cout << td.to_string(true);
84
+ REQUIRE(td.get_rank(0.99) == 0);
85
+ REQUIRE(td.get_rank(1) == 0.25);
86
+ REQUIRE(td.get_rank(1.25) == 0.375);
87
+ REQUIRE(td.get_rank(1.5) == 0.5);
88
+ REQUIRE(td.get_rank(1.75) == 0.625);
89
+ REQUIRE(td.get_rank(2) == 0.75);
90
+ REQUIRE(td.get_rank(2.01) == 1);
91
+ }
92
+
93
+ TEST_CASE("rank - repeated value", "[tdigest]") {
94
+ tdigest_double td(100);
95
+ td.update(1);
96
+ td.update(1);
97
+ td.update(1);
98
+ td.update(1);
99
+ // td.compress();
100
+ // std::cout << td.to_string(true);
101
+ REQUIRE(td.get_rank(0.99) == 0);
102
+ REQUIRE(td.get_rank(1) == 0.5);
103
+ REQUIRE(td.get_rank(1.01) == 1);
104
+ }
105
+
106
+ TEST_CASE("rank - repeated block", "[tdigest]") {
107
+ tdigest_double td(100);
108
+ td.update(1);
109
+ td.update(2);
110
+ td.update(2);
111
+ td.update(3);
112
+ // td.compress();
113
+ // std::cout << td.to_string(true);
114
+ REQUIRE(td.get_rank(0.99) == 0);
115
+ REQUIRE(td.get_rank(1) == 0.125);
116
+ REQUIRE(td.get_rank(2) == 0.5);
117
+ REQUIRE(td.get_rank(3) == 0.875);
118
+ REQUIRE(td.get_rank(3.01) == 1);
119
+ }
120
+
121
+ TEST_CASE("merge small", "[tdigest]") {
122
+ tdigest_double td1(10);
123
+ td1.update(1);
124
+ td1.update(2);
125
+ tdigest_double td2(10);
126
+ td2.update(2);
127
+ td2.update(3);
128
+ td1.merge(td2);
129
+ REQUIRE(td1.get_min_value() == 1);
130
+ REQUIRE(td1.get_max_value() == 3);
131
+ REQUIRE(td1.get_total_weight() == 4);
132
+ REQUIRE(td1.get_rank(0.99) == 0);
133
+ REQUIRE(td1.get_rank(1) == 0.125);
134
+ REQUIRE(td1.get_rank(2) == 0.5);
135
+ REQUIRE(td1.get_rank(3) == 0.875);
136
+ REQUIRE(td1.get_rank(3.01) == 1);
137
+ }
138
+
139
+ TEST_CASE("merge large", "[tdigest]") {
140
+ const size_t n = 10000;
141
+ tdigest_double td1;
142
+ tdigest_double td2;
143
+ for (size_t i = 0; i < n / 2; ++i) {
144
+ td1.update(i);
145
+ td2.update(n / 2 + i);
146
+ }
147
+ // std::cout << td1.to_string();
148
+ // std::cout << td2.to_string();
149
+ td1.merge(td2);
150
+ // td1.compress();
151
+ // std::cout << td1.to_string(true);
152
+ REQUIRE(td1.get_total_weight() == n);
153
+ REQUIRE(td1.get_min_value() == 0);
154
+ REQUIRE(td1.get_max_value() == n - 1);
155
+ REQUIRE(td1.get_rank(0) == Approx(0).margin(0.0001));
156
+ REQUIRE(td1.get_rank(n / 4) == Approx(0.25).margin(0.0001));
157
+ REQUIRE(td1.get_rank(n / 2) == Approx(0.5).margin(0.0001));
158
+ REQUIRE(td1.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
159
+ REQUIRE(td1.get_rank(n) == 1);
160
+ }
161
+
162
+ TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
163
+ tdigest<double> td(100);
164
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
165
+ td.serialize(s);
166
+ auto deserialized_td = tdigest<double>::deserialize(s);
167
+ REQUIRE(td.get_k() == deserialized_td.get_k());
168
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
169
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
170
+ }
171
+
172
+ TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
173
+ tdigest<double> td;
174
+ td.update(123);
175
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
176
+ td.serialize(s);
177
+ auto deserialized_td = tdigest<double>::deserialize(s);
178
+ REQUIRE(deserialized_td.get_k() == 200);
179
+ REQUIRE(deserialized_td.get_total_weight() == 1);
180
+ REQUIRE_FALSE(deserialized_td.is_empty());
181
+ REQUIRE(deserialized_td.get_min_value() == 123);
182
+ REQUIRE(deserialized_td.get_max_value() == 123);
183
+ }
184
+
185
+ TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
186
+ tdigest<double> td;
187
+ td.update(123);
188
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
189
+ td.serialize(s, true);
190
+ auto deserialized_td = tdigest<double>::deserialize(s);
191
+ REQUIRE(deserialized_td.get_k() == 200);
192
+ REQUIRE(deserialized_td.get_total_weight() == 1);
193
+ REQUIRE_FALSE(deserialized_td.is_empty());
194
+ REQUIRE(deserialized_td.get_min_value() == 123);
195
+ REQUIRE(deserialized_td.get_max_value() == 123);
196
+ }
197
+
198
+ TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
199
+ tdigest<double> td(100);
200
+ for (int i = 0; i < 1000; ++i) td.update(i);
201
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
202
+ td.serialize(s);
203
+ auto deserialized_td = tdigest<double>::deserialize(s);
204
+ REQUIRE(td.get_k() == deserialized_td.get_k());
205
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
206
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
207
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
208
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
209
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
210
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
211
+ }
212
+
213
+ TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
214
+ tdigest<double> td(100);
215
+ for (int i = 0; i < 10000; ++i) td.update(i);
216
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
217
+ td.serialize(s, true);
218
+ auto deserialized_td = tdigest<double>::deserialize(s);
219
+ REQUIRE(td.get_k() == deserialized_td.get_k());
220
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
221
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
222
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
223
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
224
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
225
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
226
+ }
227
+
228
+ TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
229
+ tdigest<double> td(100);
230
+ auto bytes = td.serialize();
231
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
232
+ REQUIRE(td.get_k() == deserialized_td.get_k());
233
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
234
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
235
+ }
236
+
237
+ TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
238
+ tdigest<double> td(200);
239
+ td.update(123);
240
+ auto bytes = td.serialize();
241
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
242
+ REQUIRE(deserialized_td.get_k() == 200);
243
+ REQUIRE(deserialized_td.get_total_weight() == 1);
244
+ REQUIRE_FALSE(deserialized_td.is_empty());
245
+ REQUIRE(deserialized_td.get_min_value() == 123);
246
+ REQUIRE(deserialized_td.get_max_value() == 123);
247
+ }
248
+
249
+ TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
250
+ tdigest<double> td(200);
251
+ td.update(123);
252
+ auto bytes = td.serialize(0, true);
253
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
254
+ REQUIRE(deserialized_td.get_k() == 200);
255
+ REQUIRE(deserialized_td.get_total_weight() == 1);
256
+ REQUIRE_FALSE(deserialized_td.is_empty());
257
+ REQUIRE(deserialized_td.get_min_value() == 123);
258
+ REQUIRE(deserialized_td.get_max_value() == 123);
259
+ }
260
+
261
+ TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
262
+ tdigest<double> td(100);
263
+ for (int i = 0; i < 1000; ++i) td.update(i);
264
+ auto bytes = td.serialize();
265
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
266
+ REQUIRE(td.get_k() == deserialized_td.get_k());
267
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
268
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
269
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
270
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
271
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
272
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
273
+ }
274
+
275
+ TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") { // byte-vector round trip of a 10000-value sketch, serialized in the "with buffer" form
275
+ tdigest<double> td(100);
276
+ for (int i = 0; i < 10000; ++i) td.update(i);
277
+ auto bytes = td.serialize(0, true); // second argument true, matching serialize(s, true) in the stream variant; a bare serialize() only repeated the unbuffered test
278
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
279
+ REQUIRE(td.get_k() == deserialized_td.get_k());
280
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
281
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
282
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
283
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
284
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500)); // original and restored sketch must answer identically
285
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
286
+ }
288
+
289
+ TEST_CASE("serialize deserialize stream and bytes equivalence empty", "[tdigest]") { // stream and byte-vector serialization of an empty sketch must agree
290
+ tdigest<double> td(100);
291
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
292
+ td.serialize(s);
293
+ auto bytes = td.serialize();
294
+ // both images must have the same length and the same content, byte for byte
295
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
296
+ for (size_t i = 0; i < bytes.size(); ++i) {
297
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
298
+ }
299
+ // the loop above consumed the stream, so rewind before deserializing from it
300
+ s.seekg(0); // rewind
301
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
302
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
303
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream read position must equal the image size after deserialization
304
+ // both restored sketches must be empty with k = 100
305
+ REQUIRE(deserialized_td1.is_empty());
306
+ REQUIRE(deserialized_td2.is_empty());
307
+ REQUIRE(deserialized_td1.get_k() == 100);
308
+ REQUIRE(deserialized_td2.get_k() == 100);
309
+ REQUIRE(deserialized_td1.get_total_weight() == 0);
310
+ REQUIRE(deserialized_td2.get_total_weight() == 0);
311
+ }
312
+
313
+ TEST_CASE("serialize deserialize stream and bytes equivalence", "[tdigest]") { // stream and byte-vector serialization of a 1000-value sketch must agree
314
+ tdigest<double> td(100);
315
+ const int n = 1000;
316
+ for (int i = 0; i < n; ++i) td.update(i);
317
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
318
+ td.serialize(s);
319
+ auto bytes = td.serialize();
320
+ // both images must have the same length and the same content, byte for byte
321
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
322
+ for (size_t i = 0; i < bytes.size(); ++i) {
323
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
324
+ }
325
+ // the loop above consumed the stream, so rewind before deserializing from it
326
+ s.seekg(0); // rewind
327
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
328
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
329
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream read position must equal the image size after deserialization
330
+ // sketch restored from the stream
331
+ REQUIRE_FALSE(deserialized_td1.is_empty());
332
+ REQUIRE(deserialized_td1.get_k() == 100);
333
+ REQUIRE(deserialized_td1.get_total_weight() == n);
334
+ REQUIRE(deserialized_td1.get_min_value() == 0);
335
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
336
+ // sketch restored from the byte vector
337
+ REQUIRE_FALSE(deserialized_td2.is_empty());
338
+ REQUIRE(deserialized_td2.get_k() == 100);
339
+ REQUIRE(deserialized_td2.get_total_weight() == n);
340
+ REQUIRE(deserialized_td2.get_min_value() == 0);
341
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
342
+ // both restored sketches must give identical answers
343
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
344
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
345
+ }
346
+
347
+ TEST_CASE("serialize deserialize stream and bytes equivalence with buffer", "[tdigest]") { // same equivalence check, both paths serialized with the second argument set to true
348
+ tdigest<double> td(100);
349
+ const int n = 10000;
350
+ for (int i = 0; i < n; ++i) td.update(i);
351
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
352
+ td.serialize(s, true);
353
+ auto bytes = td.serialize(0, true);
354
+ // both images must have the same length and the same content, byte for byte
355
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
356
+ for (size_t i = 0; i < bytes.size(); ++i) {
357
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
358
+ }
359
+ // the loop above consumed the stream, so rewind before deserializing from it
360
+ s.seekg(0); // rewind
361
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
362
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
363
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // stream read position must equal the image size after deserialization
364
+ // sketch restored from the stream
365
+ REQUIRE_FALSE(deserialized_td1.is_empty());
366
+ REQUIRE(deserialized_td1.get_k() == 100);
367
+ REQUIRE(deserialized_td1.get_total_weight() == n);
368
+ REQUIRE(deserialized_td1.get_min_value() == 0);
369
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
370
+ // sketch restored from the byte vector
371
+ REQUIRE_FALSE(deserialized_td2.is_empty());
372
+ REQUIRE(deserialized_td2.get_k() == 100);
373
+ REQUIRE(deserialized_td2.get_total_weight() == n);
374
+ REQUIRE(deserialized_td2.get_min_value() == 0);
375
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
376
+ // both restored sketches must give identical answers
377
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
378
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
379
+ }
380
+
381
+ TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
382
+ std::ifstream is;
383
+ is.exceptions(std::ios::failbit | std::ios::badbit);
384
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
385
+ const auto td = tdigest<double>::deserialize(is);
386
+ const size_t n = 10000;
387
+ REQUIRE(td.get_total_weight() == n);
388
+ REQUIRE(td.get_min_value() == 0);
389
+ REQUIRE(td.get_max_value() == n - 1);
390
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
391
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
392
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
393
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
394
+ REQUIRE(td.get_rank(n) == 1);
395
+ }
396
+
397
+ TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
398
+ std::ifstream is;
399
+ is.exceptions(std::ios::failbit | std::ios::badbit);
400
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
401
+ const auto td = tdigest<float>::deserialize(is);
402
+ const size_t n = 10000;
403
+ REQUIRE(td.get_total_weight() == n);
404
+ REQUIRE(td.get_min_value() == 0);
405
+ REQUIRE(td.get_max_value() == n - 1);
406
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
407
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
408
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
409
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
410
+ REQUIRE(td.get_rank(n) == 1);
411
+ }
412
+
413
+ TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
414
+ std::ifstream is;
415
+ is.exceptions(std::ios::failbit | std::ios::badbit);
416
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
417
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
418
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
419
+ const size_t n = 10000;
420
+ REQUIRE(td.get_total_weight() == n);
421
+ REQUIRE(td.get_min_value() == 0);
422
+ REQUIRE(td.get_max_value() == n - 1);
423
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
424
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
425
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
426
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
427
+ REQUIRE(td.get_rank(n) == 1);
428
+ }
429
+
430
+ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") { // loads the float reference file into memory and deserializes it from the byte buffer
431
+ std::ifstream is;
432
+ is.exceptions(std::ios::failbit | std::ios::badbit); // a missing or unreadable file throws instead of yielding an empty buffer
433
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
434
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>())); // read the whole file into memory
435
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size()); // NOTE(review): the float reference file is read as tdigest<double> here, while the stream variant of this test uses tdigest<float> - confirm this is intentional
436
+ const size_t n = 10000;
437
+ REQUIRE(td.get_total_weight() == n);
438
+ REQUIRE(td.get_min_value() == 0);
439
+ REQUIRE(td.get_max_value() == n - 1);
440
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
441
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
442
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
443
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
444
+ REQUIRE(td.get_rank(n) == 1);
445
+ }
446
+
447
+ } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(theta
30
30
  )
31
31
 
32
32
  target_link_libraries(theta INTERFACE common)
33
- target_compile_features(theta INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS theta
36
35
  EXPORT ${PROJECT_NAME}
@@ -57,7 +57,7 @@ public:
57
57
  // consistent way of initializing theta from p
58
58
  // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
59
59
  static uint64_t starting_theta_from_p(float p) {
60
- if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
60
+ if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
61
61
  return theta_constants::MAX_THETA;
62
62
  }
63
63
 
@@ -417,6 +417,20 @@ public:
417
417
  virtual uint32_t get_num_retained() const;
418
418
  virtual uint16_t get_seed_hash() const;
419
419
 
420
+ /**
421
+ * Computes maximum serialized size in bytes
422
+ * @param lg_k nominal number of entries in the sketch
423
+ */
424
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
425
+
426
+ /**
427
+ * Computes size in bytes required to serialize the current state of the sketch.
428
+ * Computing compressed size is expensive. It takes iterating over all retained hashes,
429
+ * and the actual serialization will have to look at them again.
430
+ * @param compressed if true compressed size is returned (if applicable)
431
+ */
432
+ size_t get_serialized_size_bytes(bool compressed = false) const;
433
+
420
434
  /**
421
435
  * This method serializes the sketch into a given stream in a binary form
422
436
  * @param os output stream
@@ -486,8 +500,11 @@ private:
486
500
  uint64_t theta_;
487
501
  std::vector<uint64_t, Allocator> entries_;
488
502
 
503
+ uint8_t get_preamble_longs(bool compressed) const;
489
504
  bool is_suitable_for_compression() const;
490
- uint8_t compute_min_leading_zeros() const;
505
+ uint8_t compute_entry_bits() const;
506
+ uint8_t get_num_entries_bytes() const;
507
+ size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
491
508
  void serialize_version_4(std::ostream& os) const;
492
509
  vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
493
510