datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "tdigest.hpp"
23
+ #include "test_allocator.hpp"
24
+
25
+ namespace datasketches {
26
+ // Aliases: a tdigest of double that takes test_allocator (from test_allocator.hpp) as its allocator type.
27
+ using alloc_d = test_allocator<double>;
28
+ using tdigest_d = tdigest<double, alloc_d>;
29
+ // Zeroes both global counters, fills a sketch in a nested scope, and requires the counters to be non-zero while it lives and zero afterwards.
30
+ TEST_CASE("tdigest custom allocator", "[tdigest]") {
31
+ test_allocator_total_bytes = 0;
32
+ test_allocator_net_allocations = 0;
33
+ {
34
+ tdigest_d td(100, alloc_d(0));
35
+ for (int i = 0; i < 10000; ++i) td.update(static_cast<double>(i));
36
+ REQUIRE(test_allocator_total_bytes != 0);
37
+ REQUIRE(test_allocator_net_allocations != 0);
38
+ } // td goes out of scope here, before the final checks
39
+ REQUIRE(test_allocator_total_bytes == 0);
40
+ REQUIRE(test_allocator_net_allocations == 0);
41
+ }
42
+
43
+ } /* namespace datasketches */
@@ -0,0 +1,54 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
30
+ // For each n, loads tdigest_double_n<n>_java.sk and checks emptiness, total weight, min/max and a few ranks.
31
+ TEST_CASE("tdigest double", "[serde_compat]") {
32
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
33
+ for (const unsigned n: n_arr) {
34
+ std::ifstream is;
35
+ is.exceptions(std::ios::failbit | std::ios::badbit); // throw on open/read failure instead of continuing silently
36
+ is.open(testBinaryInputPath + "tdigest_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
37
+ const auto td = tdigest<double>::deserialize(is);
38
+ REQUIRE(td.is_empty() == (n == 0));
39
+ REQUIRE(td.get_total_weight() == n);
40
+ if (n > 0) {
41
+ REQUIRE(td.get_min_value() == 1.0); // presumably the Java generator fed values 1..n -- confirm
42
+ REQUIRE(td.get_max_value() == static_cast<double>(n));
43
+ REQUIRE(td.get_rank(0) == 0);
44
+ REQUIRE(td.get_rank(n + 1) == 1);
45
+ if (n == 1) {
46
+ REQUIRE(td.get_rank(n) == 0.5);
47
+ } else {
48
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.05));
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+ } /* namespace datasketches */
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+ // For each n, writes a tdigest_double built with 100 and fed 1..n to tdigest_double_n<n>_cpp.sk (relative path).
27
+ TEST_CASE("tdigest double generate", "[serialize_for_java]") {
28
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
29
+ for (const unsigned n: n_arr) {
30
+ tdigest_double td(100);
31
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
32
+ std::ofstream os("tdigest_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary); // NOTE(review): stream state is never checked
33
+ td.serialize(os);
34
+ }
35
+ }
36
+ // Same as the double case, but serialize receives true as its second argument (the "with buffer" variant per the test name).
37
+ TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
38
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
39
+ for (const unsigned n: n_arr) {
40
+ tdigest_double td(100);
41
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
42
+ std::ofstream os("tdigest_double_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
43
+ td.serialize(os, true);
44
+ }
45
+ }
46
+ // For each n, writes a tdigest_float built with 100 and fed 1..n to tdigest_float_n<n>_cpp.sk (relative path).
47
+ TEST_CASE("tdigest float generate", "[serialize_for_java]") {
48
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
49
+ for (const unsigned n: n_arr) {
50
+ tdigest_float td(100);
51
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
52
+ std::ofstream os("tdigest_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
53
+ td.serialize(os);
54
+ }
55
+ }
56
+ // Same as the float case, but serialize receives true as its second argument (the "with buffer" variant per the test name).
57
+ TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
58
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
59
+ for (const unsigned n: n_arr) {
60
+ tdigest_float td(100);
61
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
62
+ std::ofstream os("tdigest_float_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
63
+ td.serialize(os, true);
64
+ }
65
+ }
66
+
67
+ } /* namespace datasketches */
@@ -0,0 +1,456 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <iostream>
22
+ #include <fstream>
23
+
24
+ #include "tdigest.hpp"
25
+
26
+ namespace datasketches {
27
+ // A freshly constructed sketch reports empty with zero weight, and every query accessor is required to throw std::runtime_error.
28
+ TEST_CASE("empty", "[tdigest]") {
29
+ tdigest_double td(10);
30
+ // std::cout << td.to_string();
31
+ REQUIRE(td.is_empty());
32
+ REQUIRE(td.get_k() == 10);
33
+ REQUIRE(td.get_total_weight() == 0);
34
+ REQUIRE_THROWS_AS(td.get_min_value(), std::runtime_error);
35
+ REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
+ REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
+ REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ const double split_points[1] {0};
39
+ REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
40
+ REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
41
+ }
42
+ // After a single update(1): min, max and every queried quantile are 1; rank is 0 below, 0.5 at, and 1 above the value.
43
+ TEST_CASE("one value", "[tdigest]") {
44
+ tdigest_double td(100);
45
+ td.update(1);
46
+ REQUIRE(td.get_k() == 100);
47
+ REQUIRE(td.get_total_weight() == 1);
48
+ REQUIRE(td.get_min_value() == 1);
49
+ REQUIRE(td.get_max_value() == 1);
50
+ REQUIRE(td.get_rank(0.99) == 0);
51
+ REQUIRE(td.get_rank(1) == 0.5);
52
+ REQUIRE(td.get_rank(1.01) == 1);
53
+ REQUIRE(td.get_quantile(0) == 1);
54
+ REQUIRE(td.get_quantile(0.5) == 1);
55
+ REQUIRE(td.get_quantile(1) == 1);
56
+ }
57
+ // Feeds 0..n-1 into a default-constructed sketch and checks ranks, quantiles, PMF and CDF within the stated tolerances.
58
+ TEST_CASE("many values", "[tdigest]") {
59
+ const size_t n = 10000;
60
+ tdigest_double td;
61
+ for (size_t i = 0; i < n; ++i) td.update(i);
62
+ REQUIRE_FALSE(td.is_empty());
63
+ REQUIRE(td.get_total_weight() == n);
64
+ REQUIRE(td.get_min_value() == 0);
65
+ REQUIRE(td.get_max_value() == n - 1);
66
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
67
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
68
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
69
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
70
+ REQUIRE(td.get_rank(n) == 1);
71
+ REQUIRE(td.get_quantile(0) == 0);
72
+ REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
73
+ REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
+ REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
+ REQUIRE(td.get_quantile(1) == n - 1);
76
+ const double split_points[1] {n / 2}; // one split point at the midpoint, so PMF and CDF each return two entries
77
+ const auto pmf = td.get_PMF(split_points, 1);
78
+ REQUIRE(pmf.size() == 2);
79
+ REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
80
+ REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
81
+ const auto cdf = td.get_CDF(split_points, 1);
82
+ REQUIRE(cdf.size() == 2);
83
+ REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
84
+ REQUIRE(cdf[1] == 1);
85
+ }
86
+ // Two points (1 and 2): pins the exact ranks expected at, between, and outside the two values.
87
+ TEST_CASE("rank - two values", "[tdigest]") {
88
+ tdigest_double td(100);
89
+ td.update(1);
90
+ td.update(2);
91
+ // td.compress();
92
+ // std::cout << td.to_string(true);
93
+ REQUIRE(td.get_rank(0.99) == 0);
94
+ REQUIRE(td.get_rank(1) == 0.25);
95
+ REQUIRE(td.get_rank(1.25) == 0.375);
96
+ REQUIRE(td.get_rank(1.5) == 0.5);
97
+ REQUIRE(td.get_rank(1.75) == 0.625);
98
+ REQUIRE(td.get_rank(2) == 0.75);
99
+ REQUIRE(td.get_rank(2.01) == 1);
100
+ }
101
+ // Four updates of the same value 1: rank must be 0 below it, 0.5 at it, and 1 above it.
102
+ TEST_CASE("rank - repeated value", "[tdigest]") {
103
+ tdigest_double td(100);
104
+ td.update(1);
105
+ td.update(1);
106
+ td.update(1);
107
+ td.update(1);
108
+ // td.compress();
109
+ // std::cout << td.to_string(true);
110
+ REQUIRE(td.get_rank(0.99) == 0);
111
+ REQUIRE(td.get_rank(1) == 0.5);
112
+ REQUIRE(td.get_rank(1.01) == 1);
113
+ }
114
+ // Values 1, 2, 2, 3: pins the exact rank expected at each distinct value and just outside the range.
115
+ TEST_CASE("rank - repeated block", "[tdigest]") {
116
+ tdigest_double td(100);
117
+ td.update(1);
118
+ td.update(2);
119
+ td.update(2);
120
+ td.update(3);
121
+ // td.compress();
122
+ // std::cout << td.to_string(true);
123
+ REQUIRE(td.get_rank(0.99) == 0);
124
+ REQUIRE(td.get_rank(1) == 0.125);
125
+ REQUIRE(td.get_rank(2) == 0.5);
126
+ REQUIRE(td.get_rank(3) == 0.875);
127
+ REQUIRE(td.get_rank(3.01) == 1);
128
+ }
129
+ // Merges td2 {2, 3} into td1 {1, 2} and checks min 1, max 3, total weight 4 and the resulting exact ranks.
130
+ TEST_CASE("merge small", "[tdigest]") {
131
+ tdigest_double td1(10);
132
+ td1.update(1);
133
+ td1.update(2);
134
+ tdigest_double td2(10);
135
+ td2.update(2);
136
+ td2.update(3);
137
+ td1.merge(td2);
138
+ REQUIRE(td1.get_min_value() == 1);
139
+ REQUIRE(td1.get_max_value() == 3);
140
+ REQUIRE(td1.get_total_weight() == 4);
141
+ REQUIRE(td1.get_rank(0.99) == 0);
142
+ REQUIRE(td1.get_rank(1) == 0.125);
143
+ REQUIRE(td1.get_rank(2) == 0.5);
144
+ REQUIRE(td1.get_rank(3) == 0.875);
145
+ REQUIRE(td1.get_rank(3.01) == 1);
146
+ }
147
+ // Splits 0..n-1 across two sketches (lower half in td1, upper half in td2), merges td2 into td1, then checks weight, min/max and ranks.
148
+ TEST_CASE("merge large", "[tdigest]") {
149
+ const size_t n = 10000;
150
+ tdigest_double td1;
151
+ tdigest_double td2;
152
+ for (size_t i = 0; i < n / 2; ++i) {
153
+ td1.update(i);
154
+ td2.update(n / 2 + i);
155
+ }
156
+ // std::cout << td1.to_string();
157
+ // std::cout << td2.to_string();
158
+ td1.merge(td2);
159
+ // td1.compress();
160
+ // std::cout << td1.to_string(true);
161
+ REQUIRE(td1.get_total_weight() == n);
162
+ REQUIRE(td1.get_min_value() == 0);
163
+ REQUIRE(td1.get_max_value() == n - 1);
164
+ REQUIRE(td1.get_rank(0) == Approx(0).margin(0.0001));
165
+ REQUIRE(td1.get_rank(n / 4) == Approx(0.25).margin(0.0001));
166
+ REQUIRE(td1.get_rank(n / 2) == Approx(0.5).margin(0.0001));
167
+ REQUIRE(td1.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
168
+ REQUIRE(td1.get_rank(n) == 1);
169
+ }
170
+ // Round-trips an empty sketch through a binary stringstream; k, total weight and emptiness must match the original.
171
+ TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
172
+ tdigest<double> td(100);
173
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
174
+ td.serialize(s);
175
+ auto deserialized_td = tdigest<double>::deserialize(s);
176
+ REQUIRE(td.get_k() == deserialized_td.get_k());
177
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
178
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
179
+ }
180
+ // Round-trips a default-constructed sketch holding the single value 123 through a stream; the restored k is expected to be 200.
181
+ TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
182
+ tdigest<double> td;
183
+ td.update(123);
184
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
185
+ td.serialize(s);
186
+ auto deserialized_td = tdigest<double>::deserialize(s);
187
+ REQUIRE(deserialized_td.get_k() == 200);
188
+ REQUIRE(deserialized_td.get_total_weight() == 1);
189
+ REQUIRE_FALSE(deserialized_td.is_empty());
190
+ REQUIRE(deserialized_td.get_min_value() == 123);
191
+ REQUIRE(deserialized_td.get_max_value() == 123);
192
+ }
193
+ // Single-value stream round trip where serialize receives true as its second argument (the buffered variant per the test name).
194
+ TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
195
+ tdigest<double> td;
196
+ td.update(123);
197
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
198
+ td.serialize(s, true);
199
+ auto deserialized_td = tdigest<double>::deserialize(s);
200
+ REQUIRE(deserialized_td.get_k() == 200);
201
+ REQUIRE(deserialized_td.get_total_weight() == 1);
202
+ REQUIRE_FALSE(deserialized_td.is_empty());
203
+ REQUIRE(deserialized_td.get_min_value() == 123);
204
+ REQUIRE(deserialized_td.get_max_value() == 123);
205
+ }
206
+ // Stream round trip of a sketch fed 0..999; the restored sketch must agree with the original on k, weight, min/max, rank(500) and the median.
207
+ TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
208
+ tdigest<double> td(100);
209
+ for (int i = 0; i < 1000; ++i) td.update(i);
210
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
211
+ td.serialize(s);
212
+ auto deserialized_td = tdigest<double>::deserialize(s);
213
+ REQUIRE(td.get_k() == deserialized_td.get_k());
214
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
215
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
216
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
217
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
218
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
219
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
220
+ }
221
+ // Stream round trip of a sketch fed 0..9999, serialized with true as the second argument; original and restored sketch must agree.
222
+ TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
223
+ tdigest<double> td(100);
224
+ for (int i = 0; i < 10000; ++i) td.update(i);
225
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
226
+ td.serialize(s, true);
227
+ auto deserialized_td = tdigest<double>::deserialize(s);
228
+ REQUIRE(td.get_k() == deserialized_td.get_k());
229
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
230
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
231
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
232
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
233
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
234
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
235
+ }
236
+ // Round-trips an empty sketch through the byte-buffer serialize/deserialize overloads; k, weight and emptiness must match.
237
+ TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
238
+ tdigest<double> td(100);
239
+ auto bytes = td.serialize();
240
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
241
+ REQUIRE(td.get_k() == deserialized_td.get_k());
242
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
243
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
244
+ }
245
+ // Byte-buffer round trip of a sketch (k = 200) holding the single value 123.
246
+ TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
247
+ tdigest<double> td(200);
248
+ td.update(123);
249
+ auto bytes = td.serialize();
250
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
251
+ REQUIRE(deserialized_td.get_k() == 200);
252
+ REQUIRE(deserialized_td.get_total_weight() == 1);
253
+ REQUIRE_FALSE(deserialized_td.is_empty());
254
+ REQUIRE(deserialized_td.get_min_value() == 123);
255
+ REQUIRE(deserialized_td.get_max_value() == 123);
256
+ }
257
+ // Byte-buffer round trip of a single value using serialize(0, true), the buffered variant per the test name.
258
+ TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
259
+ tdigest<double> td(200);
260
+ td.update(123);
261
+ auto bytes = td.serialize(0, true); // NOTE(review): first argument is presumably a header size in bytes -- confirm in tdigest.hpp
262
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
263
+ REQUIRE(deserialized_td.get_k() == 200);
264
+ REQUIRE(deserialized_td.get_total_weight() == 1);
265
+ REQUIRE_FALSE(deserialized_td.is_empty());
266
+ REQUIRE(deserialized_td.get_min_value() == 123);
267
+ REQUIRE(deserialized_td.get_max_value() == 123);
268
+ }
269
+ // Byte-buffer round trip of a sketch fed 0..999; the restored sketch must agree on k, weight, min/max, rank(500) and the median.
270
+ TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
271
+ tdigest<double> td(100);
272
+ for (int i = 0; i < 1000; ++i) td.update(i);
273
+ auto bytes = td.serialize();
274
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
275
+ REQUIRE(td.get_k() == deserialized_td.get_k());
276
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
277
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
278
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
279
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
280
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
281
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
282
+ }
283
+ // Byte-buffer round trip of a sketch fed 0..9999 using the buffered serialize variant; original and restored sketch must agree.
284
+ TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") {
285
+ tdigest<double> td(100);
286
+ for (int i = 0; i < 10000; ++i) td.update(i);
287
+ auto bytes = td.serialize(0, true); // pass the buffer flag as the other buffered byte tests do; plain serialize() only repeated the unbuffered test
288
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
289
+ REQUIRE(td.get_k() == deserialized_td.get_k());
290
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
291
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
292
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
293
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
294
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
295
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
296
+ }
297
+ // Serializes an empty sketch to both a stream and a byte vector, requires identical bytes, then checks both deserialized copies.
298
+ TEST_CASE("serialize deserialize stream and bytes equivalence empty", "[tdigest]") {
299
+ tdigest<double> td(100);
300
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
301
+ td.serialize(s);
302
+ auto bytes = td.serialize();
303
+
304
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
305
+ for (size_t i = 0; i < bytes.size(); ++i) {
306
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
307
+ }
308
+
309
+ s.seekg(0); // rewind
310
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
311
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
312
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // deserialize must have consumed exactly the serialized length
313
+
314
+ REQUIRE(deserialized_td1.is_empty());
315
+ REQUIRE(deserialized_td2.is_empty());
316
+ REQUIRE(deserialized_td1.get_k() == 100);
317
+ REQUIRE(deserialized_td2.get_k() == 100);
318
+ REQUIRE(deserialized_td1.get_total_weight() == 0);
319
+ REQUIRE(deserialized_td2.get_total_weight() == 0);
320
+ }
321
+ // Serializes a sketch fed 0..n-1 to both a stream and a byte vector, requires identical bytes, then compares both deserialized copies.
322
+ TEST_CASE("serialize deserialize stream and bytes equivalence", "[tdigest]") {
323
+ tdigest<double> td(100);
324
+ const int n = 1000;
325
+ for (int i = 0; i < n; ++i) td.update(i);
326
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
327
+ td.serialize(s);
328
+ auto bytes = td.serialize();
329
+
330
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
331
+ for (size_t i = 0; i < bytes.size(); ++i) {
332
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
333
+ }
334
+
335
+ s.seekg(0); // rewind
336
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
337
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
338
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // deserialize must have consumed exactly the serialized length
339
+
340
+ REQUIRE_FALSE(deserialized_td1.is_empty());
341
+ REQUIRE(deserialized_td1.get_k() == 100);
342
+ REQUIRE(deserialized_td1.get_total_weight() == n);
343
+ REQUIRE(deserialized_td1.get_min_value() == 0);
344
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
345
+
346
+ REQUIRE_FALSE(deserialized_td2.is_empty());
347
+ REQUIRE(deserialized_td2.get_k() == 100);
348
+ REQUIRE(deserialized_td2.get_total_weight() == n);
349
+ REQUIRE(deserialized_td2.get_min_value() == 0);
350
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
351
+
352
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
353
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
354
+ }
355
+ // Buffered variant: serializes a sketch fed 0..n-1 with the buffer flag to a stream and a byte vector, requires identical bytes, then compares both copies.
356
+ TEST_CASE("serialize deserialize stream and bytes equivalence with buffer", "[tdigest]") {
357
+ tdigest<double> td(100);
358
+ const int n = 10000;
359
+ for (int i = 0; i < n; ++i) td.update(i);
360
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
361
+ td.serialize(s, true);
362
+ auto bytes = td.serialize(0, true);
363
+
364
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
365
+ for (size_t i = 0; i < bytes.size(); ++i) {
366
+ REQUIRE(reinterpret_cast<const char*>(bytes.data())[i] == static_cast<char>(s.get()));
367
+ }
368
+
369
+ s.seekg(0); // rewind
370
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
371
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
372
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg())); // deserialize must have consumed exactly the serialized length
373
+
374
+ REQUIRE_FALSE(deserialized_td1.is_empty());
375
+ REQUIRE(deserialized_td1.get_k() == 100);
376
+ REQUIRE(deserialized_td1.get_total_weight() == n);
377
+ REQUIRE(deserialized_td1.get_min_value() == 0);
378
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
379
+
380
+ REQUIRE_FALSE(deserialized_td2.is_empty());
381
+ REQUIRE(deserialized_td2.get_k() == 100);
382
+ REQUIRE(deserialized_td2.get_total_weight() == n);
383
+ REQUIRE(deserialized_td2.get_min_value() == 0);
384
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
385
+
386
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
387
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
388
+ }
389
+ // Reads tdigest_ref_k100_n10000_double.sk from TEST_BINARY_INPUT_PATH via a stream and checks weight, min/max and ranks for n = 10000.
390
+ TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
391
+ std::ifstream is;
392
+ is.exceptions(std::ios::failbit | std::ios::badbit); // throw on open/read failure instead of continuing silently
393
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
394
+ const auto td = tdigest<double>::deserialize(is);
395
+ const size_t n = 10000;
396
+ REQUIRE(td.get_total_weight() == n);
397
+ REQUIRE(td.get_min_value() == 0);
398
+ REQUIRE(td.get_max_value() == n - 1);
399
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
400
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
401
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
402
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
403
+ REQUIRE(td.get_rank(n) == 1);
404
+ }
405
+ // Reads tdigest_ref_k100_n10000_float.sk via a stream into tdigest<float> and checks weight, min/max and ranks for n = 10000.
406
+ TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
407
+ std::ifstream is;
408
+ is.exceptions(std::ios::failbit | std::ios::badbit); // throw on open/read failure instead of continuing silently
409
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
410
+ const auto td = tdigest<float>::deserialize(is);
411
+ const size_t n = 10000;
412
+ REQUIRE(td.get_total_weight() == n);
413
+ REQUIRE(td.get_min_value() == 0);
414
+ REQUIRE(td.get_max_value() == n - 1);
415
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
416
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
417
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
418
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
419
+ REQUIRE(td.get_rank(n) == 1);
420
+ }
421
+ // Loads tdigest_ref_k100_n10000_double.sk fully into memory and deserializes it through the byte-buffer overload.
422
+ TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
423
+ std::ifstream is;
424
+ is.exceptions(std::ios::failbit | std::ios::badbit);
425
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
426
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>())); // read the whole file into the vector
427
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
428
+ const size_t n = 10000;
429
+ REQUIRE(td.get_total_weight() == n);
430
+ REQUIRE(td.get_min_value() == 0);
431
+ REQUIRE(td.get_max_value() == n - 1);
432
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
433
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
434
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
435
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
436
+ REQUIRE(td.get_rank(n) == 1);
437
+ }
438
+ // Loads tdigest_ref_k100_n10000_float.sk fully into memory and deserializes it through the byte-buffer overload.
439
+ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") {
440
+ std::ifstream is;
441
+ is.exceptions(std::ios::failbit | std::ios::badbit);
442
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
443
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>())); // read the whole file into the vector
444
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size()); // NOTE(review): tdigest<double> on the float file, while the stream float test uses tdigest<float> -- confirm intentional
445
+ const size_t n = 10000;
446
+ REQUIRE(td.get_total_weight() == n);
447
+ REQUIRE(td.get_min_value() == 0);
448
+ REQUIRE(td.get_max_value() == n - 1);
449
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
450
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
451
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
452
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
453
+ REQUIRE(td.get_rank(n) == 1);
454
+ }
455
+
456
+ } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(theta
30
30
  )
31
31
 
32
32
  target_link_libraries(theta INTERFACE common)
33
- target_compile_features(theta INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS theta
36
35
  EXPORT ${PROJECT_NAME}
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
329
329
 
330
330
  *ptr++ = static_cast<uint8_t>(values[3] >> 4);
331
331
 
332
- *ptr = static_cast<uint8_t>(values[3] >> 4);
332
+ *ptr = static_cast<uint8_t>(values[3] << 4);
333
333
  *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
334
334
 
335
335
  *ptr++ = static_cast<uint8_t>(values[4] >> 1);
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
4227
4227
  values[6] |= *ptr >> 1;
4228
4228
 
4229
4229
  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
4230
- values[7] |= *ptr++ << 24;
4230
+ values[7] |= static_cast<uint64_t>(*ptr++) << 24;
4231
4231
  values[7] |= *ptr++ << 16;
4232
4232
  values[7] |= *ptr++ << 8;
4233
4233
  values[7] |= *ptr;
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
4296
4296
  values[1] |= *ptr++ << 6;
4297
4297
  values[1] |= *ptr >> 2;
4298
4298
 
4299
- values[2] = static_cast<uint64_t>(*ptr++ & 2) << 33;
4299
+ values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
4300
4300
  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
4301
4301
  values[2] |= *ptr++ << 17;
4302
4302
  values[2] |= *ptr++ << 9;
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
6201
6201
  case 61: pack_bits_61(values, ptr); break;
6202
6202
  case 62: pack_bits_62(values, ptr); break;
6203
6203
  case 63: pack_bits_63(values, ptr); break;
6204
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6204
+ default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
6205
6205
  }
6206
6206
  }
6207
6207
 
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
6270
6270
  case 61: unpack_bits_61(values, ptr); break;
6271
6271
  case 62: unpack_bits_62(values, ptr); break;
6272
6272
  case 63: unpack_bits_63(values, ptr); break;
6273
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6273
+ default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
6274
6274
  }
6275
6275
  }
6276
6276
 
@@ -57,7 +57,7 @@ public:
57
57
  // consistent way of initializing theta from p
58
58
  // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
59
59
  static uint64_t starting_theta_from_p(float p) {
60
- if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
60
+ if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
61
61
  return theta_constants::MAX_THETA;
62
62
  }
63
63