datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -0,0 +1,975 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <cmath>
22
+ #include <sstream>
23
+ #include <fstream>
24
+
25
+ #include <quantiles_sketch.hpp>
26
+ #include <test_allocator.hpp>
27
+ #include <common_defs.hpp>
28
+
29
+ namespace datasketches {
30
+
31
+ static const double RANK_EPS_FOR_K_128 = 0.01725;
32
+ static const double NUMERIC_NOISE_TOLERANCE = 1E-6;
33
+
34
+ #ifdef TEST_BINARY_INPUT_PATH
35
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
36
+ #else
37
+ static std::string testBinaryInputPath = "test/";
38
+ #endif
39
+
40
+ // typical usage would be just quantiles_sketch<float> or quantiles_sketch<std::string>, but here we use test_allocator
41
+ using quantiles_float_sketch = quantiles_sketch<float, std::less<float>, test_allocator<float>>;
42
+ using quantiles_string_sketch = quantiles_sketch<std::string, std::less<std::string>, test_allocator<std::string>>;
43
+
44
+ TEST_CASE("quantiles sketch", "[quantiles_sketch]") {
45
+
46
+ // setup
47
+ test_allocator_total_bytes = 0;
48
+
49
+ SECTION("k limits") {
50
+ quantiles_float_sketch sketch1(quantiles_constants::MIN_K, 0); // this should work
51
+ quantiles_float_sketch sketch2(quantiles_constants::MAX_K, 0); // this should work
52
+ REQUIRE_THROWS_AS(new quantiles_float_sketch(quantiles_constants::MIN_K - 1, 0), std::invalid_argument);
53
+ REQUIRE_THROWS_AS(new quantiles_float_sketch(40, 0), std::invalid_argument); // not power of 2
54
+ // MAX_K + 1 makes no sense because k is uint16_t
55
+ }
56
+
57
+ SECTION("empty") {
58
+ quantiles_float_sketch sketch(128, 0);
59
+ REQUIRE(sketch.is_empty());
60
+ REQUIRE_FALSE(sketch.is_estimation_mode());
61
+ REQUIRE(sketch.get_n() == 0);
62
+ REQUIRE(sketch.get_num_retained() == 0);
63
+ REQUIRE(std::isnan(sketch.get_rank(0)));
64
+ REQUIRE(std::isnan(sketch.get_min_value()));
65
+ REQUIRE(std::isnan(sketch.get_max_value()));
66
+ REQUIRE(std::isnan(sketch.get_quantile(0.5)));
67
+ const double fractions[3] {0, 0.5, 1};
68
+ REQUIRE(sketch.get_quantiles(fractions, 3).empty());
69
+ const float split_points[1] {0};
70
+ REQUIRE(sketch.get_PMF(split_points, 1).empty());
71
+ REQUIRE(sketch.get_CDF(split_points, 1).empty());
72
+
73
+ for (auto it: sketch) {
74
+ unused(it);
75
+ FAIL("should be no iterations over an empty sketch");
76
+ }
77
+ }
78
+
79
+ SECTION("get bad quantile") {
80
+ quantiles_float_sketch sketch(64, 0);
81
+ sketch.update(0.0f); // has to be non-empty to reach the check
82
+ REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
83
+ }
84
+
85
+ SECTION("one item") {
86
+ quantiles_float_sketch sketch(128, 0);
87
+ sketch.update(1.0f);
88
+ REQUIRE_FALSE(sketch.is_empty());
89
+ REQUIRE_FALSE(sketch.is_estimation_mode());
90
+ REQUIRE(sketch.get_n() == 1);
91
+ REQUIRE(sketch.get_num_retained() == 1);
92
+ REQUIRE(sketch.get_rank(1.0f) == 0.0);
93
+ REQUIRE(sketch.get_rank(2.0f) == 1.0);
94
+ REQUIRE(sketch.get_min_value() == 1.0);
95
+ REQUIRE(sketch.get_max_value() == 1.0);
96
+ REQUIRE(sketch.get_quantile(0.5) == 1.0);
97
+ const double fractions[3] {0, 0.5, 1};
98
+ auto quantiles = sketch.get_quantiles(fractions, 3);
99
+ REQUIRE(quantiles.size() == 3);
100
+ REQUIRE(quantiles[0] == 1.0);
101
+ REQUIRE(quantiles[1] == 1.0);
102
+ REQUIRE(quantiles[2] == 1.0);
103
+
104
+ int count = 0;
105
+ for (auto it: sketch) {
106
+ REQUIRE(it.second == 1);
107
+ ++count;
108
+ }
109
+ REQUIRE(count == 1);
110
+ }
111
+
112
+ SECTION("NaN") {
113
+ quantiles_float_sketch sketch(256, 0);
114
+ sketch.update(std::numeric_limits<float>::quiet_NaN());
115
+ REQUIRE(sketch.is_empty());
116
+
117
+ sketch.update(0.0f);
118
+ sketch.update(std::numeric_limits<float>::quiet_NaN());
119
+ REQUIRE(sketch.get_n() == 1);
120
+ }
121
+
122
+
123
+ SECTION("sampling mode") {
124
+ const uint16_t k = 8;
125
+ const uint32_t n = 16 * (2 * k) + 1;
126
+ quantiles_float_sketch sk(k, 0);
127
+ for (uint32_t i = 0; i < n; ++i) {
128
+ sk.update(static_cast<float>(i));
129
+ }
130
+ }
131
+
132
+ SECTION("many items, exact mode") {
133
+ const uint32_t n = 127;
134
+ quantiles_float_sketch sketch(n + 1, 0);
135
+ for (uint32_t i = 0; i < n; i++) {
136
+ sketch.update(static_cast<float>(i));
137
+ REQUIRE(sketch.get_n() == i + 1);
138
+ }
139
+ REQUIRE_FALSE(sketch.is_empty());
140
+ REQUIRE_FALSE(sketch.is_estimation_mode());
141
+ REQUIRE(sketch.get_num_retained() == n);
142
+ REQUIRE(sketch.get_min_value() == 0.0);
143
+ REQUIRE(sketch.get_quantile(0) == 0.0);
144
+ REQUIRE(sketch.get_max_value() == n - 1);
145
+ REQUIRE(sketch.get_quantile(1) == n - 1);
146
+
147
+ int count = 0;
148
+ for (auto it: sketch) {
149
+ REQUIRE(it.second == 1);
150
+ ++count;
151
+ }
152
+ REQUIRE(count == n);
153
+
154
+ const double fractions[3] {0, 0.5, 1};
155
+ auto quantiles = sketch.get_quantiles(fractions, 3);
156
+ REQUIRE(quantiles.size() == 3);
157
+ REQUIRE(quantiles[0] == 0.0);
158
+ REQUIRE(quantiles[1] == static_cast<float>(n / 2));
159
+ REQUIRE(quantiles[2] == n - 1 );
160
+
161
+ for (uint32_t i = 0; i < n; i++) {
162
+ const double trueRank = (double) i / n;
163
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
164
+ }
165
+
166
+ // the alternative method must produce the same result
167
+ auto quantiles2 = sketch.get_quantiles(3);
168
+ REQUIRE(quantiles2.size() == 3);
169
+ REQUIRE(quantiles[0] == quantiles2[0]);
170
+ REQUIRE(quantiles[1] == quantiles2[1]);
171
+ REQUIRE(quantiles[2] == quantiles2[2]);
172
+ }
173
+
174
+ SECTION("10 items") {
175
+ quantiles_float_sketch sketch(128, 0);
176
+ sketch.update(1.0f);
177
+ sketch.update(2.0f);
178
+ sketch.update(3.0f);
179
+ sketch.update(4.0f);
180
+ sketch.update(5.0f);
181
+ sketch.update(6.0f);
182
+ sketch.update(7.0f);
183
+ sketch.update(8.0f);
184
+ sketch.update(9.0f);
185
+ sketch.update(10.0f);
186
+ REQUIRE(sketch.get_quantile(0) == 1.0);
187
+ REQUIRE(sketch.get_quantile(0.5) == 6.0);
188
+ REQUIRE(sketch.get_quantile(0.99) == 10.0);
189
+ REQUIRE(sketch.get_quantile(1) == 10.0);
190
+ }
191
+
192
+ SECTION("100 items") {
193
+ quantiles_float_sketch sketch(128, 0);
194
+ for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
195
+ REQUIRE(sketch.get_quantile(0) == 0);
196
+ REQUIRE(sketch.get_quantile(0.01) == 1);
197
+ REQUIRE(sketch.get_quantile(0.5) == 50);
198
+ REQUIRE(sketch.get_quantile(0.99) == 99.0);
199
+ REQUIRE(sketch.get_quantile(1) == 99.0);
200
+ }
201
+
202
+ SECTION("many items, estimation mode") {
203
+ quantiles_float_sketch sketch(128, 0);
204
+ const int n = 1000000;
205
+ for (int i = 0; i < n; i++) {
206
+ sketch.update(static_cast<float>(i));
207
+ REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
208
+ }
209
+ REQUIRE_FALSE(sketch.is_empty());
210
+ REQUIRE(sketch.is_estimation_mode());
211
+ REQUIRE(sketch.get_min_value() == 0.0); // min value is exact
212
+ REQUIRE(sketch.get_quantile(0) == 0.0); // min value is exact
213
+ REQUIRE(sketch.get_max_value() == n - 1); // max value is exact
214
+ REQUIRE(sketch.get_quantile(1) == n - 1); // max value is exact
215
+
216
+ // test rank
217
+ for (int i = 0; i < n; i++) {
218
+ const double trueRank = static_cast<float>(i) / n;
219
+ const double sketchRank = sketch.get_rank(static_cast<float>(i));
220
+ REQUIRE(sketchRank == Approx(trueRank).margin(RANK_EPS_FOR_K_128));
221
+ }
222
+
223
+ // test quantiles at every 0.1 percentage point
224
+ double fractions[1001];
225
+ double reverse_fractions[1001]; // check that ordering does not matter
226
+ for (int i = 0; i < 1001; i++) {
227
+ fractions[i] = (double) i / 1000;
228
+ reverse_fractions[1000 - i] = fractions[i];
229
+ }
230
+ auto quantiles = sketch.get_quantiles(fractions, 1001);
231
+ auto reverse_quantiles = sketch.get_quantiles(reverse_fractions, 1001);
232
+ float previous_quantile(0);
233
+ for (int i = 0; i < 1001; i++) {
234
+ // expensive in a loop, just to check the equivalence here, not advised for real code
235
+ const float quantile = sketch.get_quantile(fractions[i]);
236
+ REQUIRE(quantiles[i] == quantile);
237
+ REQUIRE(reverse_quantiles[1000 - i] == quantile);
238
+ REQUIRE(previous_quantile <= quantile);
239
+ previous_quantile = quantile;
240
+ }
241
+
242
+ //std::cout << sketch.to_string();
243
+
244
+ uint32_t count = 0;
245
+ uint64_t total_weight = 0;
246
+ for (auto it: sketch) {
247
+ ++count;
248
+ total_weight += it.second;
249
+ }
250
+ REQUIRE(count == sketch.get_num_retained());
251
+ REQUIRE(total_weight == sketch.get_n());
252
+ }
253
+
254
+ SECTION("consistency between get_rank and get_PMF/CDF") {
255
+ quantiles_float_sketch sketch(64, 0);
256
+ const int n = 1000;
257
+ float values[n];
258
+ for (int i = 0; i < n; i++) {
259
+ sketch.update(static_cast<float>(i));
260
+ values[i] = static_cast<float>(i);
261
+ }
262
+
263
+ const auto ranks(sketch.get_CDF(values, n));
264
+ const auto pmf(sketch.get_PMF(values, n));
265
+
266
+ double subtotal_pmf(0);
267
+ for (int i = 0; i < n; i++) {
268
+ if (sketch.get_rank(values[i]) != ranks[i]) {
269
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
270
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
271
+ }
272
+ subtotal_pmf += pmf[i];
273
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
274
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
275
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
276
+ }
277
+ }
278
+ }
279
+
280
+ SECTION("inclusive true vs false") {
281
+ quantiles_sketch<int> sketch(32);
282
+ const int n = 100;
283
+ for (int i = 1; i <= n; i++) {
284
+ sketch.update(i);
285
+ }
286
+
287
+ // get_rank()
288
+ // using knowledge of internal structure
289
+ // value still in the base buffer to avoid randomness
290
+ REQUIRE(sketch.get_rank<false>(80) == 0.79);
291
+ REQUIRE(sketch.get_rank<true>(80) == 0.80);
292
+
293
+ // value pushed into higher level
294
+ REQUIRE(sketch.get_rank<false>(50) == Approx(0.49).margin(0.01));
295
+ REQUIRE(sketch.get_rank<true>(50) == 0.50);
296
+
297
+ // get_quantile()
298
+ // value still in base buffer
299
+ REQUIRE(sketch.get_quantile<false>(0.70) == 71);
300
+ REQUIRE(sketch.get_quantile<true>(0.70) == 70);
301
+
302
+ // value pushed into higher level
303
+ int quantile = sketch.get_quantile<false>(0.30);
304
+ if (quantile != 31 && quantile != 32) { FAIL(); }
305
+
306
+ quantile = sketch.get_quantile<true>(0.30);
307
+ if (quantile != 29 && quantile != 30) { FAIL(); }
308
+ }
309
+
310
+ SECTION("stream serialize deserialize empty") {
311
+ quantiles_float_sketch sketch(128, 0);
312
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
313
+ sketch.serialize(s);
314
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
315
+ auto sketch2 = quantiles_float_sketch::deserialize(s, serde<float>(), test_allocator<float>(0));
316
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
317
+ REQUIRE(s.tellg() == s.tellp());
318
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
319
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
320
+ REQUIRE(sketch2.get_n() == sketch.get_n());
321
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
322
+ REQUIRE(std::isnan(sketch2.get_min_value()));
323
+ REQUIRE(std::isnan(sketch2.get_max_value()));
324
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
325
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
326
+ }
327
+
328
+ SECTION("bytes serialize deserialize empty") {
329
+ quantiles_float_sketch sketch(256, 0);
330
+ auto bytes = sketch.serialize();
331
+ auto sketch2 = quantiles_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
332
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
333
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
334
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
335
+ REQUIRE(sketch2.get_n() == sketch.get_n());
336
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
337
+ REQUIRE(std::isnan(sketch2.get_min_value()));
338
+ REQUIRE(std::isnan(sketch2.get_max_value()));
339
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
340
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
341
+ }
342
+
343
+ SECTION("stream serialize deserialize one item") {
344
+ quantiles_float_sketch sketch(32, 0);
345
+ sketch.update(1.0f);
346
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
347
+ sketch.serialize(s);
348
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
349
+ auto sketch2 = quantiles_float_sketch::deserialize(s, serde<float>(), test_allocator<float>(0));
350
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
351
+ REQUIRE(s.tellg() == s.tellp());
352
+ REQUIRE_FALSE(sketch2.is_empty());
353
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
354
+ REQUIRE(sketch2.get_n() == 1);
355
+ REQUIRE(sketch2.get_num_retained() == 1);
356
+ REQUIRE(sketch2.get_min_value() == 1.0);
357
+ REQUIRE(sketch2.get_max_value() == 1.0);
358
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
359
+ REQUIRE(sketch2.get_rank(1) == 0.0);
360
+ REQUIRE(sketch2.get_rank(2) == 1.0);
361
+ }
362
+
363
+ SECTION("bytes serialize deserialize one item") {
364
+ quantiles_float_sketch sketch(64, 0);
365
+ sketch.update(1.0f);
366
+ auto bytes = sketch.serialize();
367
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
368
+ auto sketch2 = quantiles_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
369
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
370
+ REQUIRE_FALSE(sketch2.is_empty());
371
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
372
+ REQUIRE(sketch2.get_n() == 1);
373
+ REQUIRE(sketch2.get_num_retained() == 1);
374
+ REQUIRE(sketch2.get_min_value() == 1.0);
375
+ REQUIRE(sketch2.get_max_value() == 1.0);
376
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
377
+ REQUIRE(sketch2.get_rank(1) == 0.0);
378
+ REQUIRE(sketch2.get_rank(2) == 1.0);
379
+ }
380
+
381
+ SECTION("stream serialize deserialize three items") {
382
+ quantiles_float_sketch sketch(128, 0);
383
+ sketch.update(1.0f);
384
+ sketch.update(2.0f);
385
+ sketch.update(3.0f);
386
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
387
+ sketch.serialize(s);
388
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
389
+ auto sketch2 = quantiles_float_sketch::deserialize(s, serde<float>(), test_allocator<float>(0));
390
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
391
+ REQUIRE(s.tellg() == s.tellp());
392
+ REQUIRE_FALSE(sketch2.is_empty());
393
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
394
+ REQUIRE(sketch2.get_n() == 3);
395
+ REQUIRE(sketch2.get_num_retained() == 3);
396
+ REQUIRE(sketch2.get_min_value() == 1.0);
397
+ REQUIRE(sketch2.get_max_value() == 3.0);
398
+ }
399
+
400
+ SECTION("bytes serialize deserialize three items") {
401
+ quantiles_float_sketch sketch(128, 0);
402
+ sketch.update(1.0f);
403
+ sketch.update(2.0f);
404
+ sketch.update(3.0f);
405
+ auto bytes = sketch.serialize();
406
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
407
+ auto sketch2 = quantiles_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
408
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
409
+ REQUIRE_FALSE(sketch2.is_empty());
410
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
411
+ REQUIRE(sketch2.get_n() == 3);
412
+ REQUIRE(sketch2.get_num_retained() == 3);
413
+ REQUIRE(sketch2.get_min_value() == 1.0);
414
+ REQUIRE(sketch2.get_max_value() == 3.0);
415
+ }
416
+
417
+ SECTION("stream serialize deserialize many floats") {
418
+ quantiles_float_sketch sketch(128, 0);
419
+ const int n = 1000;
420
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
421
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
422
+ sketch.serialize(s);
423
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
424
+ auto sketch2 = quantiles_float_sketch::deserialize(s, serde<float>(), test_allocator<float>(0));
425
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
426
+ REQUIRE(s.tellg() == s.tellp());
427
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
428
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
429
+ REQUIRE(sketch2.get_n() == sketch.get_n());
430
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
431
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
432
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
433
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
434
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
435
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
436
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
437
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
438
+ }
439
+ SECTION("bytes serialize deserialize many floats") {
440
+ quantiles_float_sketch sketch(128, 0);
441
+ const int n = 1000;
442
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
443
+ auto bytes = sketch.serialize();
444
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
445
+ auto sketch2 = quantiles_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
446
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
447
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
448
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
449
+ REQUIRE(sketch2.get_n() == sketch.get_n());
450
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
451
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
452
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
453
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
454
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
455
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
456
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
457
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
458
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
459
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
460
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
461
+ }
462
+
463
+ SECTION("bytes serialize deserialize many ints") { // round-trips a 1000-item int sketch through a byte vector
464
+ quantiles_sketch<int> sketch;
465
+ const int n = 1000;
466
+ for (int i = 0; i < n; i++) sketch.update(i);
467
+ auto bytes = sketch.serialize();
468
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
469
+ auto sketch2 = quantiles_sketch<int>::deserialize(bytes.data(), bytes.size());
470
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
471
+ REQUIRE(sketch2.is_empty() == sketch.is_empty()); // from here on: the restored sketch must report the same state and answers as the original
472
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
473
+ REQUIRE(sketch2.get_n() == sketch.get_n());
474
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
475
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
476
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
477
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
478
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
479
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
480
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
481
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
482
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range); // truncated inputs (7, 15 and size - 1 bytes) must be rejected
483
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
484
+ REQUIRE_THROWS_AS(quantiles_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
485
+ }
486
+
487
+ SECTION("out of order split points, float") { // get_CDF must throw std::invalid_argument for descending split points
488
+ quantiles_float_sketch sketch(256, 0);
489
+ sketch.update(0.0f); // has to be non-empty to reach the check
490
+ float split_points[2] = {1, 0}; // deliberately out of order: 1 before 0
491
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
492
+ }
493
+
494
+ SECTION("out of order split points, int") { // same check as the float variant, on a default-constructed int sketch
495
+ quantiles_sketch<int> sketch;
496
+ sketch.update(0); // has to be non-empty to reach the check
497
+ int split_points[2] = {1, 0}; // deliberately out of order: 1 before 0
498
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
499
+ }
500
+
501
+ SECTION("NaN split point") { // get_CDF must throw std::invalid_argument when a split point is NaN
502
+ quantiles_float_sketch sketch(512, 0);
503
+ sketch.update(0.0f); // has to be non-empty to reach the check
504
+ float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
505
+ REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
506
+ }
507
+
508
+ SECTION("merge") { // merges two k=128 float sketches that hold disjoint halves of [0, 2n)
509
+ quantiles_float_sketch sketch1(128, 0);
510
+ quantiles_float_sketch sketch2(128, 0);
511
+ const int n = 10000;
512
+ for (int i = 0; i < n; i++) {
513
+ sketch1.update(static_cast<float>(i)); // ascending 0 .. n - 1
514
+ sketch2.update(static_cast<float>((2 * n) - i - 1)); // descending 2n - 1 .. n
515
+ }
516
+ // before the merge each sketch covers only its own half of the range
517
+ REQUIRE(sketch1.get_min_value() == 0.0f);
518
+ REQUIRE(sketch1.get_max_value() == n - 1);
519
+ REQUIRE(sketch2.get_min_value() == n);
520
+ REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
521
+
522
+ sketch1.merge(sketch2);
523
+ // after the merge: counts add up, extremes span both inputs, median is n within the k=128 rank tolerance
524
+ REQUIRE_FALSE(sketch1.is_empty());
525
+ REQUIRE(sketch1.get_n() == 2 * n);
526
+ REQUIRE(sketch1.get_min_value() == 0.0f);
527
+ REQUIRE(sketch1.get_max_value() == 2.0f * n - 1);
528
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
529
+ }
530
+
531
+ SECTION("merge from const") { // same scenario as "merge", but the source is passed as a const reference
532
+ quantiles_float_sketch sketch1(128, 0);
533
+ quantiles_float_sketch sketch2(128, 0);
534
+ const int n = 10000;
535
+ for (int i = 0; i < n; i++) {
536
+ sketch1.update(static_cast<float>(i)); // ascending 0 .. n - 1
537
+ sketch2.update(static_cast<float>((2 * n) - i - 1)); // descending 2n - 1 .. n
538
+ }
539
+ // before the merge each sketch covers only its own half of the range
540
+ REQUIRE(sketch1.get_min_value() == 0.0f);
541
+ REQUIRE(sketch1.get_max_value() == n - 1);
542
+ REQUIRE(sketch2.get_min_value() == n);
543
+ REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
544
+
545
+ sketch1.merge(const_cast<const quantiles_float_sketch&>(sketch2)); // the cast forces the argument to be a const lvalue
546
+ // after the merge: counts add up, extremes span both inputs, median is n within the k=128 rank tolerance
547
+ REQUIRE_FALSE(sketch1.is_empty());
548
+ REQUIRE(sketch1.get_n() == 2 * n);
549
+ REQUIRE(sketch1.get_min_value() == 0.0f);
550
+ REQUIRE(sketch1.get_max_value() == 2.0f * n - 1);
551
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
552
+ }
553
+
554
+
555
+ SECTION("merge lower k") { // merging a k=128 sketch into a k=256 sketch must degrade the target's rank error to the k=128 level
556
+ quantiles_float_sketch sketch1(256, 0);
557
+ quantiles_float_sketch sketch2(128, 0);
558
+ const int n = 10000;
559
+ for (int i = 0; i < n; i++) {
560
+ sketch1.update(static_cast<float>(i)); // ascending 0 .. n - 1
561
+ sketch2.update(static_cast<float>((2 * n) - i - 1)); // descending 2n - 1 .. n
562
+ }
563
+
564
+ REQUIRE(sketch1.get_min_value() == 0.0f);
565
+ REQUIRE(sketch1.get_max_value() == n - 1);
566
+ REQUIRE(sketch2.get_min_value() == n);
567
+ REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
568
+
569
+ REQUIRE(sketch1.get_k() == 256);
570
+ REQUIRE(sketch2.get_k() == 128);
571
+ // before the merge the larger-k sketch reports the strictly smaller rank error
572
+ REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
573
+ REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
574
+
575
+ sketch1.merge(sketch2);
576
+
577
+ // sketch1 must get "contaminated" by the lower K in sketch2
578
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
579
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
580
+
581
+ REQUIRE_FALSE(sketch1.is_empty());
582
+ REQUIRE(sketch1.get_n() == 2 * n);
583
+ REQUIRE(sketch1.get_min_value() == 0.0f);
584
+ REQUIRE(sketch1.get_max_value() == 2.0f * n - 1);
585
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
586
+ }
587
+
588
+ SECTION("merge exact mode, lower k") { // a lower-k source that is empty or holds a single item must not change the target's rank error
589
+ quantiles_float_sketch sketch1(256, 0);
590
+ quantiles_float_sketch sketch2(128, 0);
591
+ const int n = 10000;
592
+ for (int i = 0; i < n; i++) {
593
+ sketch1.update(static_cast<float>(i)); // only sketch1 is fed here; sketch2 stays empty
594
+ }
595
+
596
+ // rank error should not be affected by a merge with an empty sketch with lower k
597
+ const double rank_error_before_merge = sketch1.get_normalized_rank_error(true);
598
+ sketch1.merge(sketch2);
599
+ REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
600
+ // the empty merge must also leave the target's contents untouched
601
+ REQUIRE_FALSE(sketch1.is_empty());
602
+ REQUIRE(sketch1.get_n() == n);
603
+ REQUIRE(sketch1.get_min_value() == 0.0f);
604
+ REQUIRE(sketch1.get_max_value() == n - 1);
605
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n / 2).margin(n / 2 * RANK_EPS_FOR_K_128));
606
+
607
+ sketch2.update(static_cast<float>(0)); // sketch2 now holds exactly one item
608
+ sketch1.merge(sketch2);
609
+ // rank error should not be affected by a merge with a sketch in exact mode with lower k
610
+ REQUIRE(sketch1.get_normalized_rank_error(true) == rank_error_before_merge);
611
+ }
612
+
613
+ SECTION("merge min value from other") { // the merged-in sketch supplies the new minimum while the target keeps its own maximum
614
+ quantiles_float_sketch sketch1(128, 0);
615
+ quantiles_float_sketch sketch2(128, 0);
616
+ sketch1.update(1.0f);
617
+ sketch2.update(2.0f);
618
+ sketch2.merge(sketch1); // note the direction: sketch1 is merged into sketch2
619
+ REQUIRE(sketch2.get_min_value() == 1.0f);
620
+ REQUIRE(sketch2.get_max_value() == 2.0f);
621
+ }
622
+
623
+ SECTION("merge min and max values from other") { // an empty target must take both extremes from the merged-in sketch
624
+ quantiles_float_sketch sketch1(128, 0);
625
+ for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i)); // values 0 .. 999999
626
+ quantiles_float_sketch sketch2(128, 0);
627
+ sketch2.merge(sketch1); // sketch2 has received no updates of its own
628
+ REQUIRE(sketch2.get_min_value() == 0.0f);
629
+ REQUIRE(sketch2.get_max_value() == 999999.0f);
630
+ }
631
+
632
+ SECTION("merge: two empty") { // merging empty sketches in either direction leaves n at 0 and each target's k unchanged
633
+ quantiles_float_sketch sk1(128, 0);
634
+ quantiles_float_sketch sk2(64, 0);
635
+ sk1.merge(sk2);
636
+ REQUIRE(sk1.get_n() == 0);
637
+ REQUIRE(sk1.get_k() == 128);
638
+ // reverse direction, this time with the source passed as a const reference
639
+ sk2.merge(const_cast<const quantiles_float_sketch&>(sk1));
640
+ REQUIRE(sk2.get_n() == 0);
641
+ REQUIRE(sk2.get_k() == 64);
642
+ }
643
+
644
+ SECTION("merge: exact as input") { // a large target (k = 2 * 128) absorbs a small source (k = 128) holding only k / 2 items
645
+ const uint16_t k = 128;
646
+ quantiles_float_sketch sketch1(2 * k, 0);
647
+ quantiles_float_sketch sketch2(k, 0);
648
+
649
+ for (int i = 0; i < k / 2; i++) { // both sketches receive the same k / 2 values
650
+ sketch1.update(static_cast<float>(i));
651
+ sketch2.update(static_cast<float>(i));
652
+ }
653
+
654
+ for (int i = 0; i < 100 * k; i++) { // only the target receives the additional 100 * k values
655
+ sketch1.update(static_cast<float>(i));
656
+ }
657
+
658
+ sketch1.merge(sketch2);
659
+ REQUIRE(sketch1.get_n() == 101 * k); // (k / 2 + 100 * k) in the target plus k / 2 from the source
660
+ REQUIRE(sketch1.get_k() == 2 * k); // no reason to have shrunk
661
+ REQUIRE(sketch1.get_min_value() == 0.0f);
662
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
663
+ }
664
+
665
+ SECTION("merge: src estimation, tgt exact, tgt.k > src.k") { // a small target (k = 2 * 128) absorbs a much larger source built with k = 128
666
+ const uint16_t k = 128;
667
+ quantiles_float_sketch sketch1(2 * k, 0);
668
+ quantiles_float_sketch sketch2(k, 0);
669
+
670
+ for (int i = 0; i < k / 2; i++) { // both sketches receive the same k / 2 values
671
+ sketch1.update(static_cast<float>(i));
672
+ sketch2.update(static_cast<float>(i));
673
+ }
674
+
675
+ for (int i = 0; i < 100 * k; i++) { // only the source receives the additional 100 * k values
676
+ sketch2.update(static_cast<float>(i));
677
+ }
678
+
679
+ sketch1.merge(sketch2);
680
+ REQUIRE(sketch1.get_n() == 101 * k); // k / 2 in the target plus (k / 2 + 100 * k) from the source
681
+ REQUIRE(sketch1.get_k() == k); // target was built with 2 * k and is expected to drop to the source's smaller k
682
+ REQUIRE(sketch1.get_min_value() == 0.0f);
683
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
684
+ }
685
+
686
+ SECTION("merge: both estimation, tgt.k < src.k") { // target built with k = 128 absorbs a source built with k = 256, each holding 100 * k items
687
+ const uint16_t k = 128;
688
+ quantiles_float_sketch sketch1(k, 0);
689
+ quantiles_float_sketch sketch2(2 * k, 0);
690
+
691
+ for (int i = 0; i < 100 * k; i++) {
692
+ sketch1.update(static_cast<float>(i)); // 0 .. 100k - 1
693
+ sketch2.update(static_cast<float>(-i)); // 0 .. -(100k - 1), mirrored around zero
694
+ }
695
+
696
+ sketch1.merge(sketch2);
697
+ REQUIRE(sketch1.get_n() == 200 * k);
698
+ REQUIRE(sketch1.get_k() == k); // no reason to have shrunk
699
+ REQUIRE(sketch1.get_min_value() == static_cast<float>(-100 * k + 1));
700
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
701
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(0.0).margin(100 * k * RANK_EPS_FOR_K_128)); // the merged data is symmetric around 0
702
+ }
703
+
704
+ SECTION("merge: src estimation, tgt exact, equal k") { // a k / 2-item target absorbs a much larger source with the same k
705
+ const uint16_t k = 128;
706
+ quantiles_float_sketch sketch1(k, 0);
707
+ quantiles_float_sketch sketch2(k, 0);
708
+
709
+ for (int i = 0; i < k / 2; i++) {
710
+ sketch1.update(static_cast<float>(i)); // 0 .. k / 2 - 1
711
+ sketch2.update(static_cast<float>(k - i - 1)); // k - 1 .. k / 2
712
+ }
713
+
714
+ for (int i = k; i < 100 * k; i++) { // only the source receives k .. 100k - 1
715
+ sketch2.update(static_cast<float>(i));
716
+ }
717
+
718
+ sketch1.merge(sketch2);
719
+ REQUIRE(sketch1.get_n() == 100 * k); // the two sketches together hold each of 0 .. 100k - 1 exactly once
720
+ REQUIRE(sketch1.get_k() == k);
721
+ REQUIRE(sketch1.get_min_value() == 0.0f);
722
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(100 * k - 1));
723
+ float n = 100 * k - 1; // NOTE: here n is the largest value inserted, not the item count
724
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n / 2).margin(n / 2 * RANK_EPS_FOR_K_128));
725
+ }
726
+
727
+ SECTION("merge: both estimation, no base buffer, same k") { // two k = 128 sketches, each fed exactly 2 * k items, are merged
728
+ const uint16_t k = 128;
729
+ quantiles_float_sketch sketch1(k, 0);
730
+ quantiles_float_sketch sketch2(k, 0);
731
+
732
+ uint64_t n = 2 * k; // presumably chosen so that no items remain in the base buffer, per the section name - TODO confirm
733
+ for (uint64_t i = 0; i < n; i++) {
734
+ sketch1.update(static_cast<float>(i)); // ascending 0 .. n - 1
735
+ sketch2.update(static_cast<float>(2 * n - i - 1)); // descending 2n - 1 .. n
736
+ }
737
+
738
+ sketch1.merge(sketch2);
739
+ REQUIRE(sketch1.get_n() == 2 * n);
740
+ REQUIRE(sketch1.get_k() == k);
741
+ REQUIRE(sketch1.get_min_value() == 0.0f);
742
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(2 * n - 1));
743
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
744
+ }
745
+
746
+ SECTION("merge: both estimation, no base buffer, tgt.k < src.k") { // target built with k = 128 absorbs a source built with k = 256, each fed 4 * k items
747
+ const uint16_t k = 128;
748
+ quantiles_float_sketch sketch1(k, 0);
749
+ quantiles_float_sketch sketch2(2 * k, 0);
750
+
751
+ uint64_t n = 4 * k; // presumably chosen so that neither sketch keeps items in its base buffer, per the section name - TODO confirm
752
+ for (uint64_t i = 0; i < n; i++) {
753
+ sketch1.update(static_cast<float>(i)); // ascending 0 .. n - 1
754
+ sketch2.update(static_cast<float>(2 * n - i - 1)); // descending 2n - 1 .. n
755
+ }
756
+
757
+ sketch1.merge(sketch2);
758
+ REQUIRE(sketch1.get_n() == 2 * n);
759
+ REQUIRE(sketch1.get_k() == k); // target keeps its own k
760
+ REQUIRE(sketch1.get_min_value() == 0.0f);
761
+ REQUIRE(sketch1.get_max_value() == static_cast<float>(2 * n - 1));
762
+ REQUIRE(sketch1.get_quantile(0.5) == Approx(n).margin(n * RANK_EPS_FOR_K_128));
763
+ }
764
+
765
+ SECTION("sketch of ints") { // empty-sketch queries must throw; then a 10000-item int sketch is round-tripped through a binary stream
766
+ quantiles_sketch<int> sketch;
767
+ REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error); // the sketch is still empty for these three queries
768
+ REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
769
+ REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
770
+
771
+ const int n = 10000;
772
+ for (int i = 0; i < n; i++) sketch.update(i);
773
+
774
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
775
+ sketch.serialize(s);
776
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes()); // bytes written must match the reported size
777
+ auto sketch2 = quantiles_sketch<int>::deserialize(s);
778
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
779
+ REQUIRE(s.tellg() == s.tellp()); // deserialize must have read exactly as many bytes as were written
780
+ REQUIRE(sketch2.is_empty() == sketch.is_empty());
781
+ REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
782
+ REQUIRE(sketch2.get_n() == sketch.get_n());
783
+ REQUIRE(sketch2.get_num_retained() == sketch.get_num_retained());
784
+ REQUIRE(sketch2.get_min_value() == sketch.get_min_value());
785
+ REQUIRE(sketch2.get_max_value() == sketch.get_max_value());
786
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch.get_normalized_rank_error(false));
787
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
788
+ REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
789
+ REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
790
+ REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
791
+ }
792
+
793
+ SECTION("sketch of strings stream") { // string sketch: empty-state checks, then a stream round trip of 1000 items
794
+ quantiles_string_sketch sketch1(128, 0);
795
+ REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error); // the sketch is still empty for these three queries
796
+ REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
797
+ REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
798
+ REQUIRE(sketch1.get_serialized_size_bytes() == 8); // an empty sketch is expected to serialize to 8 bytes
799
+
800
+ const int n = 1000;
801
+ for (int i = 0; i < n; i++) sketch1.update(std::to_string(i)); // items are the decimal strings "0" .. "999"
802
+
803
+ REQUIRE(sketch1.get_min_value() == std::string("0"));
804
+ REQUIRE(sketch1.get_max_value() == std::string("999"));
805
+
806
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
807
+ sketch1.serialize(s);
808
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes()); // bytes written must match the reported size
809
+ auto sketch2 = quantiles_string_sketch::deserialize(s, serde<std::string>(), test_allocator<std::string>(0)); // serde and allocator instance are passed explicitly
810
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
811
+ REQUIRE(s.tellg() == s.tellp()); // deserialize must have read exactly as many bytes as were written
812
+ REQUIRE(sketch2.is_empty() == sketch1.is_empty());
813
+ REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
814
+ REQUIRE(sketch2.get_n() == sketch1.get_n());
815
+ REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
816
+ REQUIRE(sketch2.get_min_value() == sketch1.get_min_value());
817
+ REQUIRE(sketch2.get_max_value() == sketch1.get_max_value());
818
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
819
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
820
+ REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
821
+ REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
822
+ REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
823
+
824
+ // to take a look using hexdump (debugging aid, intentionally left disabled)
825
+ //std::ofstream os("quantiles-string.sk");
826
+ //sketch1.serialize(os);
827
+ }
828
+
829
+ SECTION("sketch of strings bytes") { // string sketch: empty-state checks, then a byte-vector round trip of 10000 items
830
+ quantiles_string_sketch sketch1(128, 0);
831
+ REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error); // the sketch is still empty for these three queries
832
+ REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
833
+ REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
834
+ REQUIRE(sketch1.get_serialized_size_bytes() == 8); // an empty sketch is expected to serialize to 8 bytes
835
+
836
+ const int n = 10000;
837
+ for (int i = 0; i < n; i++) sketch1.update(std::to_string(i)); // items are the decimal strings "0" .. "9999"
838
+
839
+ REQUIRE(sketch1.get_min_value() == std::string("0"));
840
+ REQUIRE(sketch1.get_max_value() == std::string("9999"));
841
+
842
+ auto bytes = sketch1.serialize();
843
+ REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
844
+ auto sketch2 = quantiles_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0); // NOTE(review): the trailing 0 is presumably the allocator argument, as in the stream variant - confirm
845
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
846
+ REQUIRE(sketch2.is_empty() == sketch1.is_empty());
847
+ REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
848
+ REQUIRE(sketch2.get_n() == sketch1.get_n());
849
+ REQUIRE(sketch2.get_num_retained() == sketch1.get_num_retained());
850
+ REQUIRE(sketch2.get_min_value() == sketch1.get_min_value());
851
+ REQUIRE(sketch2.get_max_value() == sketch1.get_max_value());
852
+ REQUIRE(sketch2.get_normalized_rank_error(false) == sketch1.get_normalized_rank_error(false));
853
+ REQUIRE(sketch2.get_normalized_rank_error(true) == sketch1.get_normalized_rank_error(true));
854
+ REQUIRE(sketch2.get_quantile(0.5) == sketch1.get_quantile(0.5));
855
+ REQUIRE(sketch2.get_rank(std::to_string(0)) == sketch1.get_rank(std::to_string(0)));
856
+ REQUIRE(sketch2.get_rank(std::to_string(n)) == sketch1.get_rank(std::to_string(n)));
857
+ }
858
+
859
+ SECTION("sketch of strings, single item, bytes") { // byte round trip of a string sketch holding exactly one item
860
+ quantiles_string_sketch sketch1(64, 0);
861
+ sketch1.update("a");
862
+ auto bytes = sketch1.serialize();
863
+ REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
864
+ auto sketch2 = quantiles_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
865
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes()); // only the serialized size is compared in this section
866
+ }
867
+
868
+ SECTION("copy") { // copies of a 1000-item int sketch must answer every rank query like the original
869
+ quantiles_sketch<int> sketch1;
870
+ const int n(1000);
871
+ for (int i = 0; i < n; i++) sketch1.update(i);
872
+
873
+ // copy constructor
874
+ quantiles_sketch<int> sketch2(sketch1);
875
+ for (int i = 0; i < n; i++) {
876
+ REQUIRE(sketch2.get_rank(i) == sketch1.get_rank(i));
877
+ }
878
+
879
+ // copy assignment (onto a default-constructed sketch)
880
+ quantiles_sketch<int> sketch3;
881
+ sketch3 = sketch1;
882
+ for (int i = 0; i < n; i++) {
883
+ REQUIRE(sketch3.get_rank(i) == sketch1.get_rank(i));
884
+ }
885
+ }
886
+
887
+ SECTION("move") { // moved-to sketches must hold the 100 items originally inserted into sketch1
888
+ quantiles_sketch<int> sketch1;
889
+ const int n(100);
890
+ for (int i = 0; i < n; i++) sketch1.update(i);
891
+
892
+ // move constructor
893
+ quantiles_sketch<int> sketch2(std::move(sketch1));
894
+ for (int i = 0; i < n; i++) {
895
+ REQUIRE(sketch2.get_rank(i) == (double) i / n); // compared against the computed fraction because sketch1 has been moved from
896
+ }
897
+
898
+ // move assignment (sketch2 is moved from in turn)
899
+ quantiles_sketch<int> sketch3;
900
+ sketch3 = std::move(sketch2);
901
+ for (int i = 0; i < n; i++) {
902
+ REQUIRE(sketch3.get_rank(i) == (double) i / n);
903
+ }
904
+ }
905
+
906
+ SECTION("Type converting copy constructor") { // builds a quantiles_sketch<int> from a quantiles_sketch<double> and compares their contents
907
+ const uint16_t k = 8;
908
+ const int n = 403;
909
+ quantiles_sketch<double> sk_double(k);
910
+
911
+ quantiles_sketch<float> sk_float(k, sk_double.get_allocator()); // constructed from k plus the double sketch's allocator; must start empty
912
+ REQUIRE(sk_float.is_empty());
913
+
914
+ for (int i = 0; i < n; ++i) sk_double.update(i + .01); // non-integral values, so conversion to int is lossy
915
+
916
+ quantiles_sketch<int> sk_int(sk_double);
917
+ REQUIRE(sk_double.get_n() == sk_int.get_n());
918
+ REQUIRE(sk_double.get_k() == sk_int.get_k());
919
+ REQUIRE(sk_double.get_num_retained() == sk_int.get_num_retained());
920
+ // materialize both sorted views as (value, weight) pairs for an element-wise comparison
921
+ auto sv_double = sk_double.get_sorted_view(false);
922
+ std::vector<std::pair<double, uint64_t>> vec_double(sv_double.begin(), sv_double.end());
923
+
924
+ auto sv_int = sk_int.get_sorted_view(false);
925
+ std::vector<std::pair<int, uint64_t>> vec_int(sv_int.begin(), sv_int.end());
926
+
927
+ REQUIRE(vec_double.size() == vec_int.size());
928
+
929
+ for (size_t i = 0; i < vec_int.size(); ++i) {
930
+ // known truncation with conversion so approximate result
931
+ REQUIRE(vec_double[i].first == Approx(vec_int[i].first).margin(0.1));
932
+ // exact equality for weights
933
+ REQUIRE(vec_double[i].second == vec_int[i].second);
934
+ }
935
+ }
936
+
937
+ class A { // minimal user-defined item type wrapping an int; used by the custom-type conversion section
938
+ int val;
939
+ public:
940
+ A(int val): val(val) {} // intentionally not explicit: the test passes plain ints to sa.update(...)
941
+ int get_val() const { return val; }
942
+ };
943
+
944
+ struct less_A { // comparator for A: orders by the wrapped int value
945
+ bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
946
+ };
947
+
948
+ class B { // second user-defined item type; can only be built from an A
949
+ int val;
950
+ public:
951
+ explicit B(const A& a): val(a.get_val()) {} // explicit conversion from A, copying its int value
952
+ int get_val() const { return val; }
953
+ };
954
+
955
+ struct less_B { // comparator for B: orders by the wrapped int value
956
+ bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
957
+ };
958
+
959
+ SECTION("type conversion: custom types") { // a sketch of B (with less_B) is constructed from a sketch of A (with less_A)
960
+ quantiles_sketch<A, less_A> sa;
961
+ sa.update(1); // plain ints are passed; A has a non-explicit constructor from int
962
+ sa.update(2);
963
+ sa.update(3);
964
+
965
+ quantiles_sketch<B, less_B> sb(sa);
966
+ REQUIRE(sb.get_n() == 3); // only the item count of the converted sketch is checked
967
+ }
968
+
969
+ // cleanup: fail the test case if test_allocator_total_bytes is not back to zero (presumably bytes still held through test_allocator - confirm)
970
+ if (test_allocator_total_bytes != 0) { // the REQUIRE below is only reached when it is going to fail
971
+ REQUIRE(test_allocator_total_bytes == 0);
972
+ }
973
+ }
974
+
975
+ } /* namespace datasketches */