datasketches 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -0,0 +1,54 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
|
23
|
+
#include "tdigest.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
28
|
+
// in the subdirectory called "java" in the root directory of this project
|
29
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
30
|
+
|
31
|
+
// Cross-language compatibility check: for every stream length n, deserializes the file
// tdigest_double_n<n>_java.sk and verifies emptiness, total weight and, for non-empty
// sketches, min/max values and a few ranks.
TEST_CASE("tdigest double", "[serde_compat]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: stream_lengths) {
    const std::string file_name = testBinaryInputPath + "tdigest_double_n" + std::to_string(n) + "_java.sk";
    std::ifstream is;
    // exceptions are enabled before open() so that a missing or unreadable file throws
    is.exceptions(std::ios::failbit | std::ios::badbit);
    is.open(file_name, std::ios::binary);
    const auto sketch = tdigest<double>::deserialize(is);

    REQUIRE(sketch.is_empty() == (n == 0));
    REQUIRE(sketch.get_total_weight() == n);
    if (n == 0) continue; // the remaining checks apply to non-empty sketches only

    REQUIRE(sketch.get_min_value() == 1.0);
    REQUIRE(sketch.get_max_value() == static_cast<double>(n));
    // ranks strictly outside of [min, max] are expected to be exactly 0 and 1
    REQUIRE(sketch.get_rank(0) == 0);
    REQUIRE(sketch.get_rank(n + 1) == 1);
    if (n == 1) {
      // a single value is expected to sit exactly in the middle
      REQUIRE(sketch.get_rank(n) == 0.5);
    } else {
      REQUIRE(sketch.get_rank(n / 2) == Approx(0.5).margin(0.05));
    }
  }
}
|
53
|
+
|
54
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,67 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
|
23
|
+
#include "tdigest.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// For each stream length n, feeds the values 1..n into a tdigest_double built with k = 100
// and writes its serialized form to the file tdigest_double_n<n>_cpp.sk (relative path).
// The "[serialize_for_java]" tag suggests the output is intended for the Java test suite.
TEST_CASE("tdigest double generate", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: stream_lengths) {
    tdigest_double sketch(100);
    for (unsigned value = 1; value <= n; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_double_n" + std::to_string(n) + "_cpp.sk";
    std::ofstream os(file_name, std::ios::binary);
    sketch.serialize(os);
  }
}
|
36
|
+
|
37
|
+
// Same generator as the plain tdigest_double variant, except that serialize() receives
// true as its second argument (per the test name: serialization "with buffer") and the
// output goes to tdigest_double_buf_n<n>_cpp.sk.
TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: stream_lengths) {
    tdigest_double sketch(100);
    for (unsigned value = 1; value <= n; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_double_buf_n" + std::to_string(n) + "_cpp.sk";
    std::ofstream os(file_name, std::ios::binary);
    sketch.serialize(os, true);
  }
}
|
46
|
+
|
47
|
+
// For each stream length n, feeds the values 1..n into a tdigest_float built with k = 100
// and writes its serialized form to the file tdigest_float_n<n>_cpp.sk (relative path).
TEST_CASE("tdigest float generate", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: stream_lengths) {
    tdigest_float sketch(100);
    for (unsigned value = 1; value <= n; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_float_n" + std::to_string(n) + "_cpp.sk";
    std::ofstream os(file_name, std::ios::binary);
    sketch.serialize(os);
  }
}
|
56
|
+
|
57
|
+
// Same generator as the plain tdigest_float variant, except that serialize() receives
// true as its second argument (per the test name: serialization "with buffer") and the
// output goes to tdigest_float_buf_n<n>_cpp.sk.
TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned n: stream_lengths) {
    tdigest_float sketch(100);
    for (unsigned value = 1; value <= n; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_float_buf_n" + std::to_string(n) + "_cpp.sk";
    std::ofstream os(file_name, std::ios::binary);
    sketch.serialize(os, true);
  }
}
|
66
|
+
|
67
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,447 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <iostream>
|
22
|
+
#include <fstream>
|
23
|
+
|
24
|
+
#include "tdigest.hpp"
|
25
|
+
|
26
|
+
namespace datasketches {
|
27
|
+
|
28
|
+
// A sketch that has seen no input: it must report itself as empty, keep the k it was
// constructed with, have zero total weight, and throw std::runtime_error from every query.
TEST_CASE("empty", "[tdigest]") {
  tdigest_double sketch(10);
  // debugging aid: std::cout << sketch.to_string();
  REQUIRE(sketch.is_empty());
  REQUIRE(sketch.get_k() == 10);
  REQUIRE(sketch.get_total_weight() == 0);

  // queries on an empty sketch are expected to throw
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
}
|
39
|
+
|
40
|
+
// A sketch holding the single value 1: min and max equal that value, ranks step
// 0 -> 0.5 -> 1 around it, and every quantile collapses to it.
TEST_CASE("one value", "[tdigest]") {
  tdigest_double sketch(100);
  sketch.update(1);

  REQUIRE(sketch.get_k() == 100);
  REQUIRE(sketch.get_total_weight() == 1);
  REQUIRE(sketch.get_min_value() == 1);
  REQUIRE(sketch.get_max_value() == 1);

  // rank just below, exactly at, and just above the only value
  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.5);
  REQUIRE(sketch.get_rank(1.01) == 1);

  REQUIRE(sketch.get_quantile(0) == 1);
  REQUIRE(sketch.get_quantile(0.5) == 1);
  REQUIRE(sketch.get_quantile(1) == 1);
}
|
54
|
+
|
55
|
+
// Feeds the values 0..n-1 into a default-constructed sketch and checks weight, extremes,
// ranks at the quartiles (absolute margin) and selected quantiles (relative epsilon).
TEST_CASE("many values", "[tdigest]") {
  const size_t n = 10000;
  tdigest_double sketch;
  for (size_t value = 0; value < n; ++value) {
    sketch.update(value);
  }
  // debugging aid: sketch.compress(); std::cout << sketch.to_string(true);

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_total_weight() == n);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == n - 1);

  // ranks
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(n) == 1);

  // quantiles: the extremes are expected to be exact, the interior ones approximate
  REQUIRE(sketch.get_quantile(0) == 0);
  REQUIRE(sketch.get_quantile(0.5) == Approx(n / 2).epsilon(0.03));
  REQUIRE(sketch.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
  REQUIRE(sketch.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
  REQUIRE(sketch.get_quantile(1) == n - 1);
}
|
77
|
+
|
78
|
+
// With the two values 1 and 2 in the sketch, the expected ranks are 0.25 at the first
// value, 0.75 at the second, and evenly spaced in between; outside of [1, 2] the rank
// is exactly 0 or 1.
TEST_CASE("rank - two values", "[tdigest]") {
  tdigest_double sketch(100);
  sketch.update(1);
  sketch.update(2);
  // debugging aid: sketch.compress(); std::cout << sketch.to_string(true);

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.25);
  REQUIRE(sketch.get_rank(1.25) == 0.375);
  REQUIRE(sketch.get_rank(1.5) == 0.5);
  REQUIRE(sketch.get_rank(1.75) == 0.625);
  REQUIRE(sketch.get_rank(2) == 0.75);
  REQUIRE(sketch.get_rank(2.01) == 1);
}
|
92
|
+
|
93
|
+
// Four updates with the same value 1: the rank at that value is expected to be 0.5,
// with 0 just below it and 1 just above it.
TEST_CASE("rank - repeated value", "[tdigest]") {
  tdigest_double sketch(100);
  for (int repeat = 0; repeat < 4; ++repeat) {
    sketch.update(1);
  }
  // debugging aid: sketch.compress(); std::cout << sketch.to_string(true);

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.5);
  REQUIRE(sketch.get_rank(1.01) == 1);
}
|
105
|
+
|
106
|
+
// Input 1, 2, 2, 3: a repeated value surrounded by distinct neighbours.
// Expected ranks are 0.125 at 1, 0.5 at 2 and 0.875 at 3.
TEST_CASE("rank - repeated block", "[tdigest]") {
  tdigest_double sketch(100);
  sketch.update(1);
  sketch.update(2);
  sketch.update(2);
  sketch.update(3);
  // debugging aid: sketch.compress(); std::cout << sketch.to_string(true);

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.125);
  REQUIRE(sketch.get_rank(2) == 0.5);
  REQUIRE(sketch.get_rank(3) == 0.875);
  REQUIRE(sketch.get_rank(3.01) == 1);
}
|
120
|
+
|
121
|
+
// Merges a sketch holding {2, 3} into one holding {1, 2}. The merged sketch is expected
// to answer exactly as a sketch updated with 1, 2, 2, 3 does in "rank - repeated block".
TEST_CASE("merge small", "[tdigest]") {
  tdigest_double target(10);
  target.update(1);
  target.update(2);

  tdigest_double other(10);
  other.update(2);
  other.update(3);

  target.merge(other);

  REQUIRE(target.get_min_value() == 1);
  REQUIRE(target.get_max_value() == 3);
  REQUIRE(target.get_total_weight() == 4);
  REQUIRE(target.get_rank(0.99) == 0);
  REQUIRE(target.get_rank(1) == 0.125);
  REQUIRE(target.get_rank(2) == 0.5);
  REQUIRE(target.get_rank(3) == 0.875);
  REQUIRE(target.get_rank(3.01) == 1);
}
|
138
|
+
|
139
|
+
// Splits the values 0..n-1 into a lower and an upper half, feeds each half into its own
// default-constructed sketch, merges the upper one into the lower one and checks that
// the result covers the full range.
TEST_CASE("merge large", "[tdigest]") {
  const size_t n = 10000;
  const size_t half = n / 2;
  tdigest_double lower;
  tdigest_double upper;
  for (size_t i = 0; i < half; ++i) {
    lower.update(i);
    upper.update(half + i);
  }
  // debugging aid: std::cout << lower.to_string(); std::cout << upper.to_string();
  lower.merge(upper);
  // debugging aid: lower.compress(); std::cout << lower.to_string(true);

  REQUIRE(lower.get_total_weight() == n);
  REQUIRE(lower.get_min_value() == 0);
  REQUIRE(lower.get_max_value() == n - 1);
  REQUIRE(lower.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(lower.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(lower.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(lower.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(lower.get_rank(n) == 1);
}
|
161
|
+
|
162
|
+
// Stream round trip of an empty sketch: k, total weight and emptiness must survive.
TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
  tdigest<double> original(100);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
}
|
171
|
+
|
172
|
+
// Stream round trip of a default-constructed sketch holding the single value 123.
// The restored sketch is expected to report k == 200.
TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
  tdigest<double> original;
  original.update(123);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
184
|
+
|
185
|
+
// Same as the plain single-value stream round trip, but serialize() is called with true
// as its second argument (per the test name: the buffered form).
TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
  tdigest<double> original;
  original.update(123);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
197
|
+
|
198
|
+
// Stream round trip of a sketch (k = 100) fed with 0..999: the restored sketch must agree
// with the original on k, weight, emptiness, extremes, one rank and the median.
TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 1000; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
212
|
+
|
213
|
+
// Stream round trip of a sketch (k = 100) fed with 0..9999, serialized with true as the
// second serialize() argument (per the test name: including the buffer).
TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 10000; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
227
|
+
|
228
|
+
// Byte-array round trip of an empty sketch: k, total weight and emptiness must survive.
TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
  tdigest<double> original(100);
  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
}
|
236
|
+
|
237
|
+
// Byte-array round trip of a sketch (k = 200) holding the single value 123.
TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
  tdigest<double> original(200);
  original.update(123);
  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
248
|
+
|
249
|
+
// Byte-array round trip of a sketch (k = 200) holding the single value 123, serialized
// via serialize(0, true). The second argument selects the buffered form per the test
// name; the leading 0 is presumably a header size in bytes (to be confirmed against
// the tdigest header).
TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
  tdigest<double> original(200);
  original.update(123);
  auto image = original.serialize(0, true);
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
260
|
+
|
261
|
+
// Byte-array round trip of a sketch (k = 100) fed with 0..999: the restored sketch must
// agree with the original on k, weight, emptiness, extremes, one rank and the median.
TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 1000; ++value) {
    original.update(value);
  }
  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
274
|
+
|
275
|
+
// Byte-array round trip of a sketch (k = 100) fed with 0..9999, serialized in the
// buffered form.
//
// The buffered form is requested with serialize(0, true), the same call the other
// buffered byte-array tests in this file use. A plain serialize() here would make this
// test a duplicate of "serialize deserialize bytes many values" and leave the buffered
// byte path untested for a large input.
TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") {
  tdigest<double> td(100);
  for (int i = 0; i < 10000; ++i) td.update(i);
  auto bytes = td.serialize(0, true);
  auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());

  REQUIRE(td.get_k() == deserialized_td.get_k());
  REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
  REQUIRE(td.is_empty() == deserialized_td.is_empty());
  REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
  REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
  REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
  REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
|
288
|
+
|
289
|
+
// For an empty sketch, the stream and the byte-array serializations must be identical
// byte for byte, and both must deserialize into an empty sketch with the original k.
// NOTE(review): "steam" in the test name looks like a typo for "stream"; it is left
// unchanged because the name is what test runs select on.
TEST_CASE("serialize deserialize steam and bytes equivalence empty", "[tdigest]") {
  tdigest<double> original(100);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto bytes = original.serialize();

  // same length, same content
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellp()));
  const char* raw = reinterpret_cast<const char*>(bytes.data());
  for (size_t pos = 0; pos < bytes.size(); ++pos) {
    REQUIRE(raw[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the beginning before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(bytes.data(), bytes.size());
  // the stream deserializer must have consumed exactly the serialized size
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE(from_stream.is_empty());
  REQUIRE(from_bytes.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == 0);
  REQUIRE(from_bytes.get_total_weight() == 0);
}
|
312
|
+
|
313
|
+
// For a sketch (k = 100) fed with 0..999, the stream and the byte-array serializations
// must be identical byte for byte, and the two deserialized sketches must agree with the
// input and with each other.
// NOTE(review): "steam" in the test name looks like a typo for "stream"; it is left
// unchanged because the name is what test runs select on.
TEST_CASE("serialize deserialize steam and bytes equivalence", "[tdigest]") {
  tdigest<double> original(100);
  const int n = 1000;
  for (int value = 0; value < n; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto bytes = original.serialize();

  // same length, same content
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellp()));
  const char* raw = reinterpret_cast<const char*>(bytes.data());
  for (size_t pos = 0; pos < bytes.size(); ++pos) {
    REQUIRE(raw[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the beginning before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(bytes.data(), bytes.size());
  // the stream deserializer must have consumed exactly the serialized size
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE_FALSE(from_stream.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == n);
  REQUIRE(from_stream.get_min_value() == 0);
  REQUIRE(from_stream.get_max_value() == n - 1);

  REQUIRE_FALSE(from_bytes.is_empty());
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_bytes.get_total_weight() == n);
  REQUIRE(from_bytes.get_min_value() == 0);
  REQUIRE(from_bytes.get_max_value() == n - 1);

  // both copies must answer queries identically
  REQUIRE(from_stream.get_rank(n / 2) == from_bytes.get_rank(n / 2));
  REQUIRE(from_stream.get_quantile(0.5) == from_bytes.get_quantile(0.5));
}
|
346
|
+
|
347
|
+
// Buffered variant of the stream/bytes equivalence check: a sketch (k = 100) fed with
// 0..9999 is serialized with the "with buffer" flag set through both APIs. The two
// images must be identical byte for byte and deserialize into equivalent sketches.
// NOTE(review): "steam" in the test name looks like a typo for "stream"; it is left
// unchanged because the name is what test runs select on.
TEST_CASE("serialize deserialize steam and bytes equivalence with buffer", "[tdigest]") {
  tdigest<double> original(100);
  const int n = 10000;
  for (int value = 0; value < n; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto bytes = original.serialize(0, true);

  // same length, same content
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellp()));
  const char* raw = reinterpret_cast<const char*>(bytes.data());
  for (size_t pos = 0; pos < bytes.size(); ++pos) {
    REQUIRE(raw[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the beginning before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(bytes.data(), bytes.size());
  // the stream deserializer must have consumed exactly the serialized size
  REQUIRE(bytes.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE_FALSE(from_stream.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == n);
  REQUIRE(from_stream.get_min_value() == 0);
  REQUIRE(from_stream.get_max_value() == n - 1);

  REQUIRE_FALSE(from_bytes.is_empty());
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_bytes.get_total_weight() == n);
  REQUIRE(from_bytes.get_min_value() == 0);
  REQUIRE(from_bytes.get_max_value() == n - 1);

  // both copies must answer queries identically
  REQUIRE(from_stream.get_rank(n / 2) == from_bytes.get_rank(n / 2));
  REQUIRE(from_stream.get_quantile(0.5) == from_bytes.get_quantile(0.5));
}
|
380
|
+
|
381
|
+
// Reads tdigest_ref_k100_n10000_double.sk from TEST_BINARY_INPUT_PATH through the stream
// API into a tdigest<double> and checks weight, extremes and ranks at the quartiles.
TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk";
  std::ifstream is;
  // exceptions are enabled before open() so that a missing or unreadable file throws
  is.exceptions(std::ios::failbit | std::ios::badbit);
  is.open(file_name, std::ios::binary);
  const auto sketch = tdigest<double>::deserialize(is);

  const size_t n = 10000;
  REQUIRE(sketch.get_total_weight() == n);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == n - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(n) == 1);
}
|
396
|
+
|
397
|
+
// Reads tdigest_ref_k100_n10000_float.sk from TEST_BINARY_INPUT_PATH through the stream
// API into a tdigest<float> and checks weight, extremes and ranks at the quartiles.
TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk";
  std::ifstream is;
  // exceptions are enabled before open() so that a missing or unreadable file throws
  is.exceptions(std::ios::failbit | std::ios::badbit);
  is.open(file_name, std::ios::binary);
  const auto sketch = tdigest<float>::deserialize(is);

  const size_t n = 10000;
  REQUIRE(sketch.get_total_weight() == n);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == n - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(n) == 1);
}
|
412
|
+
|
413
|
+
// Loads tdigest_ref_k100_n10000_double.sk from TEST_BINARY_INPUT_PATH fully into memory,
// deserializes it through the byte-array API into a tdigest<double> and checks weight,
// extremes and ranks at the quartiles.
TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk";
  std::ifstream is;
  // exceptions are enabled before open() so that a missing or unreadable file throws
  is.exceptions(std::ios::failbit | std::ios::badbit);
  is.open(file_name, std::ios::binary);

  // slurp the whole file
  std::istreambuf_iterator<char> first(is);
  std::istreambuf_iterator<char> last;
  std::vector<char> bytes(first, last);
  const auto sketch = tdigest<double>::deserialize(bytes.data(), bytes.size());

  const size_t n = 10000;
  REQUIRE(sketch.get_total_weight() == n);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == n - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(n) == 1);
}
|
429
|
+
|
430
|
+
// Loads tdigest_ref_k100_n10000_float.sk from TEST_BINARY_INPUT_PATH fully into memory,
// deserializes it through the byte-array API and checks weight, extremes and ranks at
// the quartiles.
//
// The sketch is deserialized as tdigest<float>, matching the stream-based float test.
// With tdigest<double> here, the byte-array API would never be exercised for the float
// sketch type, leaving a hole in the {stream, bytes} x {double, float} matrix.
TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") {
  std::ifstream is;
  // exceptions are enabled before open() so that a missing or unreadable file throws
  is.exceptions(std::ios::failbit | std::ios::badbit);
  is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
  // slurp the whole file
  std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
  const auto td = tdigest<float>::deserialize(bytes.data(), bytes.size());

  const size_t n = 10000;
  REQUIRE(td.get_total_weight() == n);
  REQUIRE(td.get_min_value() == 0);
  REQUIRE(td.get_max_value() == n - 1);
  REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(td.get_rank(n) == 1);
}
|
446
|
+
|
447
|
+
} /* namespace datasketches */
|
@@ -57,7 +57,7 @@ public:
|
|
57
57
|
// consistent way of initializing theta from p
|
58
58
|
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
59
59
|
static uint64_t starting_theta_from_p(float p) {
|
60
|
-
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
60
|
+
if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
|
61
61
|
return theta_constants::MAX_THETA;
|
62
62
|
}
|
63
63
|
|
@@ -417,6 +417,20 @@ public:
|
|
417
417
|
virtual uint32_t get_num_retained() const;
|
418
418
|
virtual uint16_t get_seed_hash() const;
|
419
419
|
|
420
|
+
/**
|
421
|
+
* Computes maximum serialized size in bytes
|
422
|
+
* @param lg_k nominal number of entries in the sketch
|
423
|
+
*/
|
424
|
+
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
425
|
+
|
426
|
+
/**
|
427
|
+
* Computes size in bytes required to serialize the current state of the sketch.
|
428
|
+
* Computing compressed size is expensive. It takes iterating over all retained hashes,
|
429
|
+
* and the actual serialization will have to look at them again.
|
430
|
+
* @param compressed if true compressed size is returned (if applicable)
|
431
|
+
*/
|
432
|
+
size_t get_serialized_size_bytes(bool compressed = false) const;
|
433
|
+
|
420
434
|
/**
|
421
435
|
* This method serializes the sketch into a given stream in a binary form
|
422
436
|
* @param os output stream
|
@@ -486,8 +500,11 @@ private:
|
|
486
500
|
uint64_t theta_;
|
487
501
|
std::vector<uint64_t, Allocator> entries_;
|
488
502
|
|
503
|
+
uint8_t get_preamble_longs(bool compressed) const;
|
489
504
|
bool is_suitable_for_compression() const;
|
490
|
-
uint8_t
|
505
|
+
uint8_t compute_entry_bits() const;
|
506
|
+
uint8_t get_num_entries_bytes() const;
|
507
|
+
size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
|
491
508
|
void serialize_version_4(std::ostream& os) const;
|
492
509
|
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
493
510
|
|