datasketches 0.4.2 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -0,0 +1,54 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
|
23
|
+
#include "tdigest.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// assume the binary sketches for this test have been generated by datasketches-java code
|
28
|
+
// in the subdirectory called "java" in the root directory of this project
|
29
|
+
static std::string testBinaryInputPath = std::string(TEST_BINARY_INPUT_PATH) + "../../java/";
|
30
|
+
|
31
|
+
// Reads sketch files of several sizes named tdigest_double_n<N>_java.sk and checks
// weight, min/max and a few ranks of each deserialized tdigest<double>.
TEST_CASE("tdigest double", "[serde_compat]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned num_values: stream_lengths) {
    const std::string file_name =
        testBinaryInputPath + "tdigest_double_n" + std::to_string(num_values) + "_java.sk";
    std::ifstream input;
    // a missing or unreadable file should throw rather than yield a silently failed stream
    input.exceptions(std::ios::failbit | std::ios::badbit);
    input.open(file_name, std::ios::binary);
    const auto sketch = tdigest<double>::deserialize(input);

    REQUIRE(sketch.is_empty() == (num_values == 0));
    REQUIRE(sketch.get_total_weight() == num_values);
    if (num_values == 0) continue; // the remaining checks apply to non-empty sketches only

    REQUIRE(sketch.get_min_value() == 1.0);
    REQUIRE(sketch.get_max_value() == static_cast<double>(num_values));
    REQUIRE(sketch.get_rank(0) == 0);
    REQUIRE(sketch.get_rank(num_values + 1) == 1);
    if (num_values == 1) {
      REQUIRE(sketch.get_rank(num_values) == 0.5);
    } else {
      REQUIRE(sketch.get_rank(num_values / 2) == Approx(0.5).margin(0.05));
    }
  }
}
|
53
|
+
|
54
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,67 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <fstream>
|
22
|
+
|
23
|
+
#include "tdigest.hpp"
|
24
|
+
|
25
|
+
namespace datasketches {
|
26
|
+
|
27
|
+
// Writes tdigest_double sketches (k = 100) fed with the values 1..N to files named
// tdigest_double_n<N>_cpp.sk, using a relative path.
TEST_CASE("tdigest double generate", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned num_values: stream_lengths) {
    tdigest_double sketch(100);
    for (unsigned value = 1; value <= num_values; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_double_n" + std::to_string(num_values) + "_cpp.sk";
    std::ofstream output(file_name, std::ios::binary);
    sketch.serialize(output);
  }
}
|
36
|
+
|
37
|
+
// Same as the plain double generator, but writes tdigest_double_buf_n<N>_cpp.sk and
// passes true as the second serialize argument (presumably "include the buffer" -- see tdigest.hpp).
TEST_CASE("tdigest double generate with buffer", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned num_values: stream_lengths) {
    tdigest_double sketch(100);
    for (unsigned value = 1; value <= num_values; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_double_buf_n" + std::to_string(num_values) + "_cpp.sk";
    std::ofstream output(file_name, std::ios::binary);
    sketch.serialize(output, true);
  }
}
|
46
|
+
|
47
|
+
// Writes tdigest_float sketches (k = 100) fed with the values 1..N to files named
// tdigest_float_n<N>_cpp.sk, using a relative path.
TEST_CASE("tdigest float generate", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned num_values: stream_lengths) {
    tdigest_float sketch(100);
    for (unsigned value = 1; value <= num_values; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_float_n" + std::to_string(num_values) + "_cpp.sk";
    std::ofstream output(file_name, std::ios::binary);
    sketch.serialize(output);
  }
}
|
56
|
+
|
57
|
+
// Same as the plain float generator, but writes tdigest_float_buf_n<N>_cpp.sk and
// passes true as the second serialize argument (presumably "include the buffer" -- see tdigest.hpp).
TEST_CASE("tdigest float generate with buffer", "[serialize_for_java]") {
  const unsigned stream_lengths[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
  for (const unsigned num_values: stream_lengths) {
    tdigest_float sketch(100);
    for (unsigned value = 1; value <= num_values; ++value) {
      sketch.update(value);
    }
    const std::string file_name = "tdigest_float_buf_n" + std::to_string(num_values) + "_cpp.sk";
    std::ofstream output(file_name, std::ios::binary);
    sketch.serialize(output, true);
  }
}
|
66
|
+
|
67
|
+
} /* namespace datasketches */
|
@@ -0,0 +1,447 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
#include <iostream>
|
22
|
+
#include <fstream>
|
23
|
+
|
24
|
+
#include "tdigest.hpp"
|
25
|
+
|
26
|
+
namespace datasketches {
|
27
|
+
|
28
|
+
// A sketch that has received no updates: it reports empty, keeps its k,
// and every query that needs data throws std::runtime_error.
TEST_CASE("empty", "[tdigest]") {
  tdigest_double sketch(10);

  REQUIRE(sketch.is_empty());
  REQUIRE(sketch.get_k() == 10);
  REQUIRE(sketch.get_total_weight() == 0);

  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_rank(0), std::runtime_error);
  REQUIRE_THROWS_AS(sketch.get_quantile(0.5), std::runtime_error);
}
|
39
|
+
|
40
|
+
// A sketch holding the single value 1: min, max and all quantiles equal that value,
// and its rank is 0 just below, 0.5 at, and 1 just above the value.
TEST_CASE("one value", "[tdigest]") {
  tdigest_double sketch(100);
  sketch.update(1);

  REQUIRE(sketch.get_k() == 100);
  REQUIRE(sketch.get_total_weight() == 1);
  REQUIRE(sketch.get_min_value() == 1);
  REQUIRE(sketch.get_max_value() == 1);

  // ranks around the only value
  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.5);
  REQUIRE(sketch.get_rank(1.01) == 1);

  // quantiles collapse to the only value
  REQUIRE(sketch.get_quantile(0) == 1);
  REQUIRE(sketch.get_quantile(0.5) == 1);
  REQUIRE(sketch.get_quantile(1) == 1);
}
|
54
|
+
|
55
|
+
// Feeds 0..n-1 into a default-constructed sketch and checks weight, extremes,
// and approximate ranks and quantiles.
TEST_CASE("many values", "[tdigest]") {
  const size_t num_values = 10000;
  tdigest_double sketch;
  for (size_t value = 0; value < num_values; ++value) {
    sketch.update(value);
  }

  REQUIRE_FALSE(sketch.is_empty());
  REQUIRE(sketch.get_total_weight() == num_values);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == num_values - 1);

  // ranks at the quartiles, within an absolute margin
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values) == 1);

  // quantiles, exact at the extremes and within a relative tolerance elsewhere
  REQUIRE(sketch.get_quantile(0) == 0);
  REQUIRE(sketch.get_quantile(0.5) == Approx(num_values / 2).epsilon(0.03));
  REQUIRE(sketch.get_quantile(0.9) == Approx(num_values * 0.9).epsilon(0.01));
  REQUIRE(sketch.get_quantile(0.95) == Approx(num_values * 0.95).epsilon(0.01));
  REQUIRE(sketch.get_quantile(1) == num_values - 1);
}
|
77
|
+
|
78
|
+
// With the two values 1 and 2, the expected rank rises linearly
// from 0.25 at 1 to 0.75 at 2, and is 0 / 1 outside that range.
TEST_CASE("rank - two values", "[tdigest]") {
  tdigest_double sketch(100);
  const int inputs[] = {1, 2};
  for (const int input: inputs) {
    sketch.update(input);
  }

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.25);
  REQUIRE(sketch.get_rank(1.25) == 0.375);
  REQUIRE(sketch.get_rank(1.5) == 0.5);
  REQUIRE(sketch.get_rank(1.75) == 0.625);
  REQUIRE(sketch.get_rank(2) == 0.75);
  REQUIRE(sketch.get_rank(2.01) == 1);
}
|
92
|
+
|
93
|
+
// Four updates with the same value 1: the rank at that value is expected to be 0.5.
TEST_CASE("rank - repeated value", "[tdigest]") {
  tdigest_double sketch(100);
  for (int repeat = 0; repeat < 4; ++repeat) {
    sketch.update(1);
  }

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.5);
  REQUIRE(sketch.get_rank(1.01) == 1);
}
|
105
|
+
|
106
|
+
// Values 1, 2, 2, 3: a repeated value in the middle of the input.
TEST_CASE("rank - repeated block", "[tdigest]") {
  tdigest_double sketch(100);
  const int inputs[] = {1, 2, 2, 3};
  for (const int input: inputs) {
    sketch.update(input);
  }

  REQUIRE(sketch.get_rank(0.99) == 0);
  REQUIRE(sketch.get_rank(1) == 0.125);
  REQUIRE(sketch.get_rank(2) == 0.5);
  REQUIRE(sketch.get_rank(3) == 0.875);
  REQUIRE(sketch.get_rank(3.01) == 1);
}
|
120
|
+
|
121
|
+
// Merges a sketch holding {2, 3} into one holding {1, 2}; the expected ranks are
// the same as for a single sketch fed 1, 2, 2, 3.
TEST_CASE("merge small", "[tdigest]") {
  tdigest_double target(10);
  target.update(1);
  target.update(2);

  tdigest_double other(10);
  other.update(2);
  other.update(3);

  target.merge(other);

  REQUIRE(target.get_min_value() == 1);
  REQUIRE(target.get_max_value() == 3);
  REQUIRE(target.get_total_weight() == 4);
  REQUIRE(target.get_rank(0.99) == 0);
  REQUIRE(target.get_rank(1) == 0.125);
  REQUIRE(target.get_rank(2) == 0.5);
  REQUIRE(target.get_rank(3) == 0.875);
  REQUIRE(target.get_rank(3.01) == 1);
}
|
138
|
+
|
139
|
+
// Splits 0..n-1 into a lower and an upper half held by two sketches,
// merges the upper into the lower, and checks the combined result.
TEST_CASE("merge large", "[tdigest]") {
  const size_t num_values = 10000;
  const size_t half = num_values / 2;
  tdigest_double lower;
  tdigest_double upper;
  for (size_t offset = 0; offset < half; ++offset) {
    lower.update(offset);
    upper.update(half + offset);
  }

  lower.merge(upper);

  REQUIRE(lower.get_total_weight() == num_values);
  REQUIRE(lower.get_min_value() == 0);
  REQUIRE(lower.get_max_value() == num_values - 1);
  REQUIRE(lower.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(lower.get_rank(num_values / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(lower.get_rank(num_values / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(lower.get_rank(num_values * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(lower.get_rank(num_values) == 1);
}
|
161
|
+
|
162
|
+
// Stream round trip of an empty sketch: k, total weight and emptiness must survive.
TEST_CASE("serialize deserialize stream empty", "[tdigest]") {
  tdigest<double> original(100);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
}
|
171
|
+
|
172
|
+
// Stream round trip of a default-constructed sketch holding the single value 123.
// The test expects the default k to be 200.
TEST_CASE("serialize deserialize stream single value", "[tdigest]") {
  tdigest<double> original;
  original.update(123);

  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
184
|
+
|
185
|
+
// Same as the single-value stream round trip, but serialize is called with
// true as its second argument (the buffered variant, per the test name).
TEST_CASE("serialize deserialize stream single value buffered", "[tdigest]") {
  tdigest<double> original;
  original.update(123);

  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
197
|
+
|
198
|
+
// Stream round trip of a sketch fed 0..999: the restored sketch must answer
// every query exactly like the original.
TEST_CASE("serialize deserialize stream many values", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 1000; ++value) {
    original.update(value);
  }

  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
212
|
+
|
213
|
+
// Stream round trip of a sketch fed 0..9999, serialized with true as the second
// argument (the buffered variant, per the test name).
TEST_CASE("serialize deserialize stream many values with buffer", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 10000; ++value) {
    original.update(value);
  }

  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto restored = tdigest<double>::deserialize(stream);

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
227
|
+
|
228
|
+
// Byte-array round trip of an empty sketch: k, total weight and emptiness must survive.
TEST_CASE("serialize deserialize bytes empty", "[tdigest]") {
  tdigest<double> original(100);
  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
}
|
236
|
+
|
237
|
+
// Byte-array round trip of a k = 200 sketch holding the single value 123.
TEST_CASE("serialize deserialize bytes single value", "[tdigest]") {
  tdigest<double> original(200);
  original.update(123);

  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
248
|
+
|
249
|
+
// Same as the single-value byte-array round trip, but serialize is called as
// serialize(0, true) (the buffered variant, per the test name).
TEST_CASE("serialize deserialize bytes single value buffered", "[tdigest]") {
  tdigest<double> original(200);
  original.update(123);

  auto image = original.serialize(0, true);
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(restored.get_k() == 200);
  REQUIRE(restored.get_total_weight() == 1);
  REQUIRE_FALSE(restored.is_empty());
  REQUIRE(restored.get_min_value() == 123);
  REQUIRE(restored.get_max_value() == 123);
}
|
260
|
+
|
261
|
+
// Byte-array round trip of a sketch fed 0..999: the restored sketch must answer
// every query exactly like the original.
TEST_CASE("serialize deserialize bytes many values", "[tdigest]") {
  tdigest<double> original(100);
  for (int value = 0; value < 1000; ++value) {
    original.update(value);
  }

  auto image = original.serialize();
  auto restored = tdigest<double>::deserialize(image.data(), image.size());

  REQUIRE(original.get_k() == restored.get_k());
  REQUIRE(original.get_total_weight() == restored.get_total_weight());
  REQUIRE(original.is_empty() == restored.is_empty());
  REQUIRE(original.get_min_value() == restored.get_min_value());
  REQUIRE(original.get_max_value() == restored.get_max_value());
  REQUIRE(original.get_rank(500) == restored.get_rank(500));
  REQUIRE(original.get_quantile(0.5) == restored.get_quantile(0.5));
}
|
274
|
+
|
275
|
+
// Byte-array round trip of a sketch fed 0..9999, serialized together with its buffer.
// The sketch is serialized with serialize(0, true), the same call the other buffered
// byte-array tests in this file use. The previous plain serialize() call made this
// test a duplicate of the non-buffered one despite its name.
TEST_CASE("serialize deserialize bytes many values with buffer", "[tdigest]") {
  tdigest<double> td(100);
  for (int i = 0; i < 10000; ++i) td.update(i);

  // header size 0, second argument true: the buffered variant this test is named for
  auto bytes = td.serialize(0, true);
  auto deserialized_td = tdigest<double>::deserialize(bytes.data(), bytes.size());

  // the restored sketch must answer every query exactly like the original
  REQUIRE(td.get_k() == deserialized_td.get_k());
  REQUIRE(td.get_total_weight() == deserialized_td.get_total_weight());
  REQUIRE(td.is_empty() == deserialized_td.is_empty());
  REQUIRE(td.get_min_value() == deserialized_td.get_min_value());
  REQUIRE(td.get_max_value() == deserialized_td.get_max_value());
  REQUIRE(td.get_rank(500) == deserialized_td.get_rank(500));
  REQUIRE(td.get_quantile(0.5) == deserialized_td.get_quantile(0.5));
}
|
288
|
+
|
289
|
+
// For an empty sketch, the stream and byte-array serializations must be byte-for-byte
// identical, and both must deserialize to an empty sketch with the same k.
TEST_CASE("serialize deserialize steam and bytes equivalence empty", "[tdigest]") {
  tdigest<double> original(100);
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto image = original.serialize();

  // same length, then same content (reading advances the stream's get position)
  REQUIRE(image.size() == static_cast<size_t>(stream.tellp()));
  const char* image_chars = reinterpret_cast<const char*>(image.data());
  for (size_t pos = 0; pos < image.size(); ++pos) {
    REQUIRE(image_chars[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the start before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(image.data(), image.size());
  // deserialization must have consumed the whole stream
  REQUIRE(image.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE(from_stream.is_empty());
  REQUIRE(from_bytes.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == 0);
  REQUIRE(from_bytes.get_total_weight() == 0);
}
|
312
|
+
|
313
|
+
// For a sketch fed 0..999, the stream and byte-array serializations must be
// byte-for-byte identical and must deserialize to sketches that agree with each other.
TEST_CASE("serialize deserialize steam and bytes equivalence", "[tdigest]") {
  tdigest<double> original(100);
  const int num_values = 1000;
  for (int value = 0; value < num_values; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream);
  auto image = original.serialize();

  // same length, then same content (reading advances the stream's get position)
  REQUIRE(image.size() == static_cast<size_t>(stream.tellp()));
  const char* image_chars = reinterpret_cast<const char*>(image.data());
  for (size_t pos = 0; pos < image.size(); ++pos) {
    REQUIRE(image_chars[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the start before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(image.data(), image.size());
  // deserialization must have consumed the whole stream
  REQUIRE(image.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE_FALSE(from_stream.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == num_values);
  REQUIRE(from_stream.get_min_value() == 0);
  REQUIRE(from_stream.get_max_value() == num_values - 1);

  REQUIRE_FALSE(from_bytes.is_empty());
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_bytes.get_total_weight() == num_values);
  REQUIRE(from_bytes.get_min_value() == 0);
  REQUIRE(from_bytes.get_max_value() == num_values - 1);

  REQUIRE(from_stream.get_rank(num_values / 2) == from_bytes.get_rank(num_values / 2));
  REQUIRE(from_stream.get_quantile(0.5) == from_bytes.get_quantile(0.5));
}
|
346
|
+
|
347
|
+
// Same equivalence check as above for a sketch fed 0..9999, with both serializations
// requested in their buffered form (second argument true).
TEST_CASE("serialize deserialize steam and bytes equivalence with buffer", "[tdigest]") {
  tdigest<double> original(100);
  const int num_values = 10000;
  for (int value = 0; value < num_values; ++value) {
    original.update(value);
  }
  std::stringstream stream(std::ios::in | std::ios::out | std::ios::binary);
  original.serialize(stream, true);
  auto image = original.serialize(0, true);

  // same length, then same content (reading advances the stream's get position)
  REQUIRE(image.size() == static_cast<size_t>(stream.tellp()));
  const char* image_chars = reinterpret_cast<const char*>(image.data());
  for (size_t pos = 0; pos < image.size(); ++pos) {
    REQUIRE(image_chars[pos] == static_cast<char>(stream.get()));
  }

  stream.seekg(0); // back to the start before deserializing
  auto from_stream = tdigest<double>::deserialize(stream);
  auto from_bytes = tdigest<double>::deserialize(image.data(), image.size());
  // deserialization must have consumed the whole stream
  REQUIRE(image.size() == static_cast<size_t>(stream.tellg()));

  REQUIRE_FALSE(from_stream.is_empty());
  REQUIRE(from_stream.get_k() == 100);
  REQUIRE(from_stream.get_total_weight() == num_values);
  REQUIRE(from_stream.get_min_value() == 0);
  REQUIRE(from_stream.get_max_value() == num_values - 1);

  REQUIRE_FALSE(from_bytes.is_empty());
  REQUIRE(from_bytes.get_k() == 100);
  REQUIRE(from_bytes.get_total_weight() == num_values);
  REQUIRE(from_bytes.get_min_value() == 0);
  REQUIRE(from_bytes.get_max_value() == num_values - 1);

  REQUIRE(from_stream.get_rank(num_values / 2) == from_bytes.get_rank(num_values / 2));
  REQUIRE(from_stream.get_quantile(0.5) == from_bytes.get_quantile(0.5));
}
|
380
|
+
|
381
|
+
// Deserializes the double reference file from a stream into tdigest<double> and
// checks weight, extremes and quartile ranks against the expected 0..9999 input.
TEST_CASE("deserialize from reference implementation stream double", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk";
  std::ifstream input;
  // a missing or unreadable file should throw rather than yield a silently failed stream
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(file_name, std::ios::binary);
  const auto sketch = tdigest<double>::deserialize(input);

  const size_t num_values = 10000;
  REQUIRE(sketch.get_total_weight() == num_values);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == num_values - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values) == 1);
}
|
396
|
+
|
397
|
+
// Deserializes the float reference file from a stream into tdigest<float> and
// checks weight, extremes and quartile ranks against the expected 0..9999 input.
TEST_CASE("deserialize from reference implementation stream float", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk";
  std::ifstream input;
  // a missing or unreadable file should throw rather than yield a silently failed stream
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(file_name, std::ios::binary);
  const auto sketch = tdigest<float>::deserialize(input);

  const size_t num_values = 10000;
  REQUIRE(sketch.get_total_weight() == num_values);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == num_values - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values) == 1);
}
|
412
|
+
|
413
|
+
// Loads the double reference file fully into memory, deserializes it from the byte
// array into tdigest<double>, and checks it against the expected 0..9999 input.
TEST_CASE("deserialize from reference implementation bytes double", "[tdigest]") {
  const std::string file_name = std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_double.sk";
  std::ifstream input;
  // a missing or unreadable file should throw rather than yield a silently failed stream
  input.exceptions(std::ios::failbit | std::ios::badbit);
  input.open(file_name, std::ios::binary);

  // slurp the rest of the stream into a contiguous buffer
  std::vector<char> image;
  image.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
  const auto sketch = tdigest<double>::deserialize(image.data(), image.size());

  const size_t num_values = 10000;
  REQUIRE(sketch.get_total_weight() == num_values);
  REQUIRE(sketch.get_min_value() == 0);
  REQUIRE(sketch.get_max_value() == num_values - 1);
  REQUIRE(sketch.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(sketch.get_rank(num_values) == 1);
}
|
429
|
+
|
430
|
+
// Loads the float reference file fully into memory, deserializes it from the byte
// array, and checks it against the expected 0..9999 input.
// The sketch type is tdigest<float>, matching the test name and the stream-based
// float test. The previous tdigest<double> left the float byte-array path untested.
TEST_CASE("deserialize from reference implementation bytes float", "[tdigest]") {
  std::ifstream is;
  // a missing or unreadable file should throw rather than yield a silently failed stream
  is.exceptions(std::ios::failbit | std::ios::badbit);
  is.open(std::string(TEST_BINARY_INPUT_PATH) + "tdigest_ref_k100_n10000_float.sk", std::ios::binary);
  // slurp the whole file into a contiguous buffer
  std::vector<char> bytes((std::istreambuf_iterator<char>(is)), (std::istreambuf_iterator<char>()));
  const auto td = tdigest<float>::deserialize(bytes.data(), bytes.size());
  const size_t n = 10000;
  REQUIRE(td.get_total_weight() == n);
  REQUIRE(td.get_min_value() == 0);
  REQUIRE(td.get_max_value() == n - 1);
  REQUIRE(td.get_rank(0) == Approx(0).margin(0.0001));
  REQUIRE(td.get_rank(n / 4) == Approx(0.25).margin(0.0001));
  REQUIRE(td.get_rank(n / 2) == Approx(0.5).margin(0.0001));
  REQUIRE(td.get_rank(n * 3 / 4) == Approx(0.75).margin(0.0001));
  REQUIRE(td.get_rank(n) == 1);
}
|
446
|
+
|
447
|
+
} /* namespace datasketches */
|
@@ -57,7 +57,7 @@ public:
|
|
57
57
|
// consistent way of initializing theta from p
|
58
58
|
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
59
59
|
static uint64_t starting_theta_from_p(float p) {
|
60
|
-
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
60
|
+
if (p < 1) return static_cast<uint64_t>(static_cast<double>(theta_constants::MAX_THETA) * p);
|
61
61
|
return theta_constants::MAX_THETA;
|
62
62
|
}
|
63
63
|
|
@@ -417,6 +417,20 @@ public:
|
|
417
417
|
virtual uint32_t get_num_retained() const;
|
418
418
|
virtual uint16_t get_seed_hash() const;
|
419
419
|
|
420
|
+
/**
|
421
|
+
* Computes maximum serialized size in bytes
|
422
|
+
* @param lg_k base-2 logarithm of the nominal number of entries in the sketch
|
423
|
+
*/
|
424
|
+
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
425
|
+
|
426
|
+
/**
|
427
|
+
* Computes size in bytes required to serialize the current state of the sketch.
|
428
|
+
* Computing compressed size is expensive. It takes iterating over all retained hashes,
|
429
|
+
* and the actual serialization will have to look at them again.
|
430
|
+
* @param compressed if true compressed size is returned (if applicable)
|
431
|
+
*/
|
432
|
+
size_t get_serialized_size_bytes(bool compressed = false) const;
|
433
|
+
|
420
434
|
/**
|
421
435
|
* This method serializes the sketch into a given stream in a binary form
|
422
436
|
* @param os output stream
|
@@ -486,8 +500,11 @@ private:
|
|
486
500
|
uint64_t theta_;
|
487
501
|
std::vector<uint64_t, Allocator> entries_;
|
488
502
|
|
503
|
+
uint8_t get_preamble_longs(bool compressed) const;
|
489
504
|
bool is_suitable_for_compression() const;
|
490
|
-
uint8_t
|
505
|
+
uint8_t compute_entry_bits() const;
|
506
|
+
uint8_t get_num_entries_bytes() const;
|
507
|
+
size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
|
491
508
|
void serialize_version_4(std::ostream& os) const;
|
492
509
|
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
493
510
|
|