datasketches 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "tdigest.hpp"
23
+ #include "test_allocator.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ using alloc_d = test_allocator<double>;
28
+ using tdigest_d = tdigest<double, alloc_d>;
29
+
30
+ TEST_CASE("tdigest custom allocator", "[tdigest]") {
31
+ test_allocator_total_bytes = 0;
32
+ test_allocator_net_allocations = 0;
33
+ {
34
+ tdigest_d td(100, alloc_d(0));
35
+ for (int i = 0; i < 10000; ++i) td.update(static_cast<double>(i));
36
+ REQUIRE(test_allocator_total_bytes != 0);
37
+ REQUIRE(test_allocator_net_allocations != 0);
38
+ }
39
+ REQUIRE(test_allocator_total_bytes == 0);
40
+ REQUIRE(test_allocator_net_allocations == 0);
41
+ }
42
+
43
+ } /* namespace datasketches */
@@ -0,0 +1,54 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ // assume the binary sketches for this test have been generated by datasketches-java code
28
+ // in the subdirectory called "java" in the root directory of this project
29
+ static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
30
+
31
+ TEST_CASE("tdigest double", "[serde_compat]") {
32
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
33
+ for (const unsigned n: n_arr) {
34
+ std::ifstream is;
35
+ is.exceptions(std::ios::failbit | std::ios::badbit);
36
+ is.open(testBinaryInputPath + "tdigest_double_n" + std::to_string(n) + "_java.sk", std::ios::binary);
37
+ const auto td = tdigest<double>::deserialize(is);
38
+ REQUIRE(td.is_empty() == (n == 0));
39
+ REQUIRE(td.get_total_weight() == n);
40
+ if (n > 0) {
41
+ REQUIRE(td.get_min_value() == 1.0);
42
+ REQUIRE(td.get_max_value() == static_cast<double>(n));
43
+ REQUIRE(td.get_rank(0) == 0);
44
+ REQUIRE(td.get_rank(n + 1) == 1);
45
+ if (n == 1) {
46
+ REQUIRE(td.get_rank(n) == 0.5);
47
+ } else {
48
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.05));
49
+ }
50
+ }
51
+ }
52
+ }
53
+
54
+ } /* namespace datasketches */
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "tdigest.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("tdigest double generate", "[serialize_for_java]") {
28
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
29
+ for (const unsigned n: n_arr) {
30
+ tdigest_double td(100);
31
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
32
+ std::ofstream os("tdigest_double_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
33
+ td.serialize(os);
34
+ }
35
+ }
36
+
37
+ TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
38
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
39
+ for (const unsigned n: n_arr) {
40
+ tdigest_double td(100);
41
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
42
+ std::ofstream os("tdigest_double_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
43
+ td.serialize(os, true);
44
+ }
45
+ }
46
+
47
+ TEST_CASE("tdigest float generate", "[serialize_for_java]") {
48
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
49
+ for (const unsigned n: n_arr) {
50
+ tdigest_float td(100);
51
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
52
+ std::ofstream os("tdigest_float_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
53
+ td.serialize(os);
54
+ }
55
+ }
56
+
57
+ TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
58
+ const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
59
+ for (const unsigned n: n_arr) {
60
+ tdigest_float td(100);
61
+ for (unsigned i = 1; i <= n; ++i) td.update(i);
62
+ std::ofstream os("tdigest_float_buf_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
63
+ td.serialize(os, true);
64
+ }
65
+ }
66
+
67
+ } /* namespace datasketches */
@@ -0,0 +1,456 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <iostream>
22
+ #include <fstream>
23
+
24
+ #include "tdigest.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ TEST_CASE("empty", "[tdigest]") {
29
+ tdigest_double td(10);
30
+ // std::cout << td.to_string();
31
+ REQUIRE(td.is_empty());
32
+ REQUIRE(td.get_k() == 10);
33
+ REQUIRE(td.get_total_weight() == 0);
34
+ REQUIRE_THROWS_AS(td.get_min_value(), std::runtime_error);
35
+ REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
+ REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
+ REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ const double split_points[1] {0};
39
+ REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
40
+ REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
41
+ }
42
+
43
+ TEST_CASE("one value", "[tdigest]") {
44
+ tdigest_double td(100);
45
+ td.update(1);
46
+ REQUIRE(td.get_k() == 100);
47
+ REQUIRE(td.get_total_weight() == 1);
48
+ REQUIRE(td.get_min_value() == 1);
49
+ REQUIRE(td.get_max_value() == 1);
50
+ REQUIRE(td.get_rank(0.99) == 0);
51
+ REQUIRE(td.get_rank(1) == 0.5);
52
+ REQUIRE(td.get_rank(1.01) == 1);
53
+ REQUIRE(td.get_quantile(0) == 1);
54
+ REQUIRE(td.get_quantile(0.5) == 1);
55
+ REQUIRE(td.get_quantile(1) == 1);
56
+ }
57
+
58
+ TEST_CASE("many values", "[tdigest]") {
59
+ const size_t n = 10000;
60
+ tdigest_double td;
61
+ for (size_t i = 0; i < n; ++i) td.update(i);
62
+ REQUIRE_FALSE(td.is_empty());
63
+ REQUIRE(td.get_total_weight() == n);
64
+ REQUIRE(td.get_min_value() == 0);
65
+ REQUIRE(td.get_max_value() == n - 1);
66
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
67
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
68
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
69
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
70
+ REQUIRE(td.get_rank(n) == 1);
71
+ REQUIRE(td.get_quantile(0) == 0);
72
+ REQUIRE(td.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
73
+ REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
+ REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
+ REQUIRE(td.get_quantile(1) == n - 1);
76
+ const double split_points[1] {n / 2};
77
+ const auto pmf = td.get_PMF(split_points, 1);
78
+ REQUIRE(pmf.size() == 2);
79
+ REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
80
+ REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
81
+ const auto cdf = td.get_CDF(split_points, 1);
82
+ REQUIRE(cdf.size() == 2);
83
+ REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
84
+ REQUIRE(cdf[1] == 1);
85
+ }
86
+
87
+ TEST_CASE("rank - two values", "[tdigest]") {
88
+ tdigest_double td(100);
89
+ td.update(1);
90
+ td.update(2);
91
+ // td.compress();
92
+ // std::cout << td.to_string(true);
93
+ REQUIRE(td.get_rank(0.99) == 0);
94
+ REQUIRE(td.get_rank(1) == 0.25);
95
+ REQUIRE(td.get_rank(1.25) == 0.375);
96
+ REQUIRE(td.get_rank(1.5) == 0.5);
97
+ REQUIRE(td.get_rank(1.75) == 0.625);
98
+ REQUIRE(td.get_rank(2) == 0.75);
99
+ REQUIRE(td.get_rank(2.01) == 1);
100
+ }
101
+
102
+ TEST_CASE("rank - repeated value", "[tdigest]") {
103
+ tdigest_double td(100);
104
+ td.update(1);
105
+ td.update(1);
106
+ td.update(1);
107
+ td.update(1);
108
+ // td.compress();
109
+ // std::cout << td.to_string(true);
110
+ REQUIRE(td.get_rank(0.99) == 0);
111
+ REQUIRE(td.get_rank(1) == 0.5);
112
+ REQUIRE(td.get_rank(1.01) == 1);
113
+ }
114
+
115
+ TEST_CASE("rank - repeated block", "[tdigest]") {
116
+ tdigest_double td(100);
117
+ td.update(1);
118
+ td.update(2);
119
+ td.update(2);
120
+ td.update(3);
121
+ // td.compress();
122
+ // std::cout << td.to_string(true);
123
+ REQUIRE(td.get_rank(0.99) == 0);
124
+ REQUIRE(td.get_rank(1) == 0.125);
125
+ REQUIRE(td.get_rank(2) == 0.5);
126
+ REQUIRE(td.get_rank(3) == 0.875);
127
+ REQUIRE(td.get_rank(3.01) == 1);
128
+ }
129
+
130
+ TEST_CASE("merge small", "[tdigest]") {
131
+ tdigest_double td1(10);
132
+ td1.update(1);
133
+ td1.update(2);
134
+ tdigest_double td2(10);
135
+ td2.update(2);
136
+ td2.update(3);
137
+ td1.merge(td2);
138
+ REQUIRE(td1.get_min_value() == 1);
139
+ REQUIRE(td1.get_max_value() == 3);
140
+ REQUIRE(td1.get_total_weight() == 4);
141
+ REQUIRE(td1.get_rank(0.99) == 0);
142
+ REQUIRE(td1.get_rank(1) == 0.125);
143
+ REQUIRE(td1.get_rank(2) == 0.5);
144
+ REQUIRE(td1.get_rank(3) == 0.875);
145
+ REQUIRE(td1.get_rank(3.01) == 1);
146
+ }
147
+
148
+ TEST_CASE("merge large", "[tdigest]") {
149
+ const size_t n = 10000;
150
+ tdigest_double td1;
151
+ tdigest_double td2;
152
+ for (size_t i = 0; i < n / 2; ++i) {
153
+ td1.update(i);
154
+ td2.update(n / 2 + i);
155
+ }
156
+ // std::cout << td1.to_string();
157
+ // std::cout << td2.to_string();
158
+ td1.merge(td2);
159
+ // td1.compress();
160
+ // std::cout << td1.to_string(true);
161
+ REQUIRE(td1.get_total_weight() == n);
162
+ REQUIRE(td1.get_min_value() == 0);
163
+ REQUIRE(td1.get_max_value() == n - 1);
164
+ REQUIRE(td1.get_rank(0) == Approx(0).margin(0.0001));
165
+ REQUIRE(td1.get_rank(n / 4) == Approx(0.25).margin(0.0001));
166
+ REQUIRE(td1.get_rank(n / 2) == Approx(0.5).margin(0.0001));
167
+ REQUIRE(td1.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
168
+ REQUIRE(td1.get_rank(n) == 1);
169
+ }
170
+
171
+ TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
172
+ tdigest<double> td(100);
173
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
174
+ td.serialize(s);
175
+ auto deserialized_td = tdigest<double>::deserialize(s);
176
+ REQUIRE(td.get_k() == deserialized_td.get_k());
177
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
178
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
179
+ }
180
+
181
+ TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
182
+ tdigest<double> td;
183
+ td.update(123);
184
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
185
+ td.serialize(s);
186
+ auto deserialized_td = tdigest<double>::deserialize(s);
187
+ REQUIRE(deserialized_td.get_k() == 200);
188
+ REQUIRE(deserialized_td.get_total_weight() == 1);
189
+ REQUIRE_FALSE(deserialized_td.is_empty());
190
+ REQUIRE(deserialized_td.get_min_value() == 123);
191
+ REQUIRE(deserialized_td.get_max_value() == 123);
192
+ }
193
+
194
+ TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
195
+ tdigest<double> td;
196
+ td.update(123);
197
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
198
+ td.serialize(s, true);
199
+ auto deserialized_td = tdigest<double>::deserialize(s);
200
+ REQUIRE(deserialized_td.get_k() == 200);
201
+ REQUIRE(deserialized_td.get_total_weight() == 1);
202
+ REQUIRE_FALSE(deserialized_td.is_empty());
203
+ REQUIRE(deserialized_td.get_min_value() == 123);
204
+ REQUIRE(deserialized_td.get_max_value() == 123);
205
+ }
206
+
207
+ TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
208
+ tdigest<double> td(100);
209
+ for (int i = 0; i < 1000; ++i) td.update(i);
210
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
211
+ td.serialize(s);
212
+ auto deserialized_td = tdigest<double>::deserialize(s);
213
+ REQUIRE(td.get_k() == deserialized_td.get_k());
214
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
215
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
216
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
217
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
218
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
219
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
220
+ }
221
+
222
+ TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
223
+ tdigest<double> td(100);
224
+ for (int i = 0; i < 10000; ++i) td.update(i);
225
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
226
+ td.serialize(s, true);
227
+ auto deserialized_td = tdigest<double>::deserialize(s);
228
+ REQUIRE(td.get_k() == deserialized_td.get_k());
229
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
230
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
231
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
232
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
233
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
234
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
235
+ }
236
+
237
+ TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
238
+ tdigest<double> td(100);
239
+ auto bytes = td.serialize();
240
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
241
+ REQUIRE(td.get_k() == deserialized_td.get_k());
242
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
243
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
244
+ }
245
+
246
+ TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
247
+ tdigest<double> td(200);
248
+ td.update(123);
249
+ auto bytes = td.serialize();
250
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
251
+ REQUIRE(deserialized_td.get_k() == 200);
252
+ REQUIRE(deserialized_td.get_total_weight() == 1);
253
+ REQUIRE_FALSE(deserialized_td.is_empty());
254
+ REQUIRE(deserialized_td.get_min_value() == 123);
255
+ REQUIRE(deserialized_td.get_max_value() == 123);
256
+ }
257
+
258
+ TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
259
+ tdigest<double> td(200);
260
+ td.update(123);
261
+ auto bytes = td.serialize(0, true);
262
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
263
+ REQUIRE(deserialized_td.get_k() == 200);
264
+ REQUIRE(deserialized_td.get_total_weight() == 1);
265
+ REQUIRE_FALSE(deserialized_td.is_empty());
266
+ REQUIRE(deserialized_td.get_min_value() == 123);
267
+ REQUIRE(deserialized_td.get_max_value() == 123);
268
+ }
269
+
270
+ TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
271
+ tdigest<double> td(100);
272
+ for (int i = 0; i < 1000; ++i) td.update(i);
273
+ auto bytes = td.serialize();
274
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
275
+ REQUIRE(td.get_k() == deserialized_td.get_k());
276
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
277
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
278
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
279
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
280
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
281
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
282
+ }
283
+
284
+ TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") {
285
+ tdigest<double> td(100);
286
+ for (int i = 0; i < 10000; ++i) td.update(i);
287
+ auto bytes = td.serialize();
288
+ auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());
289
+ REQUIRE(td.get_k() == deserialized_td.get_k());
290
+ REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
291
+ REQUIRE(td.is_empty() == deserialized_td.is_empty());
292
+ REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
293
+ REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
294
+ REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
295
+ REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
296
+ }
297
+
298
+ TEST_CASE("serialize deserialize steam and bytes equivalence empty", "[tdigest]") {
299
+ tdigest<double> td(100);
300
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
301
+ td.serialize(s);
302
+ auto bytes = td.serialize();
303
+
304
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
305
+ for (size_t i = 0; i < bytes.size(); ++i) {
306
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
307
+ }
308
+
309
+ s.seekg(0); // rewind
310
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
311
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
312
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
313
+
314
+ REQUIRE(deserialized_td1.is_empty());
315
+ REQUIRE(deserialized_td2.is_empty());
316
+ REQUIRE(deserialized_td1.get_k() == 100);
317
+ REQUIRE(deserialized_td2.get_k() == 100);
318
+ REQUIRE(deserialized_td1.get_total_weight() == 0);
319
+ REQUIRE(deserialized_td2.get_total_weight() == 0);
320
+ }
321
+
322
+ TEST_CASE("serialize deserialize steam and bytes equivalence", "[tdigest]") {
323
+ tdigest<double> td(100);
324
+ const int n = 1000;
325
+ for (int i = 0; i < n; ++i) td.update(i);
326
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
327
+ td.serialize(s);
328
+ auto bytes = td.serialize();
329
+
330
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
331
+ for (size_t i = 0; i < bytes.size(); ++i) {
332
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
333
+ }
334
+
335
+ s.seekg(0); // rewind
336
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
337
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
338
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
339
+
340
+ REQUIRE_FALSE(deserialized_td1.is_empty());
341
+ REQUIRE(deserialized_td1.get_k() == 100);
342
+ REQUIRE(deserialized_td1.get_total_weight() == n);
343
+ REQUIRE(deserialized_td1.get_min_value() == 0);
344
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
345
+
346
+ REQUIRE_FALSE(deserialized_td2.is_empty());
347
+ REQUIRE(deserialized_td2.get_k() == 100);
348
+ REQUIRE(deserialized_td2.get_total_weight() == n);
349
+ REQUIRE(deserialized_td2.get_min_value() == 0);
350
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
351
+
352
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
353
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
354
+ }
355
+
356
+ TEST_CASE("serialize deserialize steam and bytes equivalence with buffer", "[tdigest]") {
357
+ tdigest<double> td(100);
358
+ const int n = 10000;
359
+ for (int i = 0; i < n; ++i) td.update(i);
360
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
361
+ td.serialize(s, true);
362
+ auto bytes = td.serialize(0, true);
363
+
364
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
365
+ for (size_t i = 0; i < bytes.size(); ++i) {
366
+ REQUIRE(((char*)bytes.data())[i] == (char)s.get());
367
+ }
368
+
369
+ s.seekg(0); // rewind
370
+ auto deserialized_td1 = tdigest<double>::deserialize(s);
371
+ auto deserialized_td2 = tdigest<double>::deserialize(bytes.data(), bytes.size());
372
+ REQUIRE(bytes.size() == static_cast<size_t>(s.tellg()));
373
+
374
+ REQUIRE_FALSE(deserialized_td1.is_empty());
375
+ REQUIRE(deserialized_td1.get_k() == 100);
376
+ REQUIRE(deserialized_td1.get_total_weight() == n);
377
+ REQUIRE(deserialized_td1.get_min_value() == 0);
378
+ REQUIRE(deserialized_td1.get_max_value() == n - 1);
379
+
380
+ REQUIRE_FALSE(deserialized_td2.is_empty());
381
+ REQUIRE(deserialized_td2.get_k() == 100);
382
+ REQUIRE(deserialized_td2.get_total_weight() == n);
383
+ REQUIRE(deserialized_td2.get_min_value() == 0);
384
+ REQUIRE(deserialized_td2.get_max_value() == n - 1);
385
+
386
+ REQUIRE(deserialized_td1.get_rank(n / 2) == deserialized_td2.get_rank(n / 2));
387
+ REQUIRE(deserialized_td1.get_quantile(0.5) == deserialized_td2.get_quantile(0.5));
388
+ }
389
+
390
+ TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
391
+ std::ifstream is;
392
+ is.exceptions(std::ios::failbit | std::ios::badbit);
393
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
394
+ const auto td = tdigest<double>::deserialize(is);
395
+ const size_t n = 10000;
396
+ REQUIRE(td.get_total_weight() == n);
397
+ REQUIRE(td.get_min_value() == 0);
398
+ REQUIRE(td.get_max_value() == n - 1);
399
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
400
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
401
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
402
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
403
+ REQUIRE(td.get_rank(n) == 1);
404
+ }
405
+
406
+ TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
407
+ std::ifstream is;
408
+ is.exceptions(std::ios::failbit | std::ios::badbit);
409
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
410
+ const auto td = tdigest<float>::deserialize(is);
411
+ const size_t n = 10000;
412
+ REQUIRE(td.get_total_weight() == n);
413
+ REQUIRE(td.get_min_value() == 0);
414
+ REQUIRE(td.get_max_value() == n - 1);
415
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
416
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
417
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
418
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
419
+ REQUIRE(td.get_rank(n) == 1);
420
+ }
421
+
422
+ TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
423
+ std::ifstream is;
424
+ is.exceptions(std::ios::failbit | std::ios::badbit);
425
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk", std::ios::binary);
426
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
427
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
428
+ const size_t n = 10000;
429
+ REQUIRE(td.get_total_weight() == n);
430
+ REQUIRE(td.get_min_value() == 0);
431
+ REQUIRE(td.get_max_value() == n - 1);
432
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
433
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
434
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
435
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
436
+ REQUIRE(td.get_rank(n) == 1);
437
+ }
438
+
439
+ TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") {
440
+ std::ifstream is;
441
+ is.exceptions(std::ios::failbit | std::ios::badbit);
442
+ is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
443
+ std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
444
+ const auto td = tdigest<double>::deserialize(bytes.data(), bytes.size());
445
+ const size_t n = 10000;
446
+ REQUIRE(td.get_total_weight() == n);
447
+ REQUIRE(td.get_min_value() == 0);
448
+ REQUIRE(td.get_max_value() == n - 1);
449
+ REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
450
+ REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
451
+ REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
452
+ REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
453
+ REQUIRE(td.get_rank(n) == 1);
454
+ }
455
+
456
+ } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(theta
30
30
  )
31
31
 
32
32
  target_link_libraries(theta INTERFACE common)
33
- target_compile_features(theta INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS theta
36
35
  EXPORT ${PROJECT_NAME}
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
329
329
 
330
330
  *ptr++ = static_cast<uint8_t>(values[3] >> 4);
331
331
 
332
- *ptr = static_cast<uint8_t>(values[3] >> 4);
332
+ *ptr = static_cast<uint8_t>(values[3] << 4);
333
333
  *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
334
334
 
335
335
  *ptr++ = static_cast<uint8_t>(values[4] >> 1);
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
4227
4227
  values[6] |= *ptr >> 1;
4228
4228
 
4229
4229
  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
4230
- values[7] |= *ptr++ << 24;
4230
+ values[7] |= static_cast<uint64_t>(*ptr++) << 24;
4231
4231
  values[7] |= *ptr++ << 16;
4232
4232
  values[7] |= *ptr++ << 8;
4233
4233
  values[7] |= *ptr;
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
4296
4296
  values[1] |= *ptr++ << 6;
4297
4297
  values[1] |= *ptr >> 2;
4298
4298
 
4299
- values[2] = static_cast<uint64_t>(*ptr++ & 2) << 33;
4299
+ values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
4300
4300
  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
4301
4301
  values[2] |= *ptr++ << 17;
4302
4302
  values[2] |= *ptr++ << 9;
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
6201
6201
  case 61: pack_bits_61(values, ptr); break;
6202
6202
  case 62: pack_bits_62(values, ptr); break;
6203
6203
  case 63: pack_bits_63(values, ptr); break;
6204
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6204
+ default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
6205
6205
  }
6206
6206
  }
6207
6207
 
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
6270
6270
  case 61: unpack_bits_61(values, ptr); break;
6271
6271
  case 62: unpack_bits_62(values, ptr); break;
6272
6272
  case 63: unpack_bits_63(values, ptr); break;
6273
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6273
+ default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
6274
6274
  }
6275
6275
  }
6276
6276
 
@@ -57,7 +57,7 @@ public:
57
57
  // consistent way of initializing theta from p
58
58
  // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
59
59
  static uint64_t starting_theta_from_p(float p) {
60
- if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
60
+ if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
61
61
  return theta_constants::MAX_THETA;
62
62
  }
63
63