datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KOLMOGOROV_SMIRNOV_HPP_
|
|
21
|
+
#define KOLMOGOROV_SMIRNOV_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
class kolmogorov_smirnov {
public:
  /**
   * Raw Kolmogorov-Smirnov statistic for a pair of KLL quantile sketches.
   * The two sketches are walked side by side and the largest absolute gap
   * between their normalized cumulative weights is reported.
   * @param sketch1 first KLL sketch
   * @param sketch2 second KLL sketch
   * @return the largest gap found; meant to be compared against threshold()
   */
  template<typename Sketch>
  static double delta(const Sketch& sketch1, const Sketch& sketch2);

  /**
   * Critical value for the Kolmogorov-Smirnov Test, widened by the rank error
   * (epsilon) of each of the two sketches.
   * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
   * @param sketch1 first KLL sketch
   * @param sketch2 second KLL sketch
   * @param p target p-value, typically between .001 and .1 (for example .05)
   * @return the adjusted threshold that the raw delta has to exceed
   */
  template<typename Sketch>
  static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);

  /**
   * Runs the Kolmogorov-Smirnov Test on two KLL quantile sketches by checking
   * whether delta() exceeds threshold().
   * Note: the result is false when the sketches hold insufficient data or
   * when the sketch sizes are too small.
   * @param sketch1 first KLL sketch
   * @param sketch2 second KLL sketch
   * @param p target p-value, typically between .001 and .1 (for example .05)
   * @return true if the null hypothesis (both sketches reflect the same
   * underlying distribution) can be rejected at the given p-value
   */
  template<typename Sketch>
  static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);

};
|
|
62
|
+
|
|
63
|
+
} /* namespace datasketches */
|
|
64
|
+
|
|
65
|
+
#include "kolmogorov_smirnov_impl.hpp"
|
|
66
|
+
|
|
67
|
+
#endif
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
21
|
+
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
// type resolver
|
|
26
|
+
template<typename T, typename C, typename S, typename A>
|
|
27
|
+
kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
|
|
28
|
+
return kll_quantile_calculator<T, C, A>(sketch);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
template<typename Sketch>
|
|
32
|
+
double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
|
|
33
|
+
using Comparator = typename Sketch::comparator;
|
|
34
|
+
auto calc1 = make_quantile_calculator(sketch1);
|
|
35
|
+
auto calc2 = make_quantile_calculator(sketch2);
|
|
36
|
+
auto it1 = calc1.begin();
|
|
37
|
+
auto it2 = calc2.begin();
|
|
38
|
+
const auto n1 = sketch1.get_n();
|
|
39
|
+
const auto n2 = sketch2.get_n();
|
|
40
|
+
double delta = 0;
|
|
41
|
+
while (it1 != calc1.end() && it2 != calc2.end()) {
|
|
42
|
+
const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
|
|
43
|
+
const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
|
|
44
|
+
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
45
|
+
if (Comparator()((*it1).first, (*it2).first)) {
|
|
46
|
+
++it1;
|
|
47
|
+
} else if (Comparator()((*it2).first, (*it1).first)) {
|
|
48
|
+
++it2;
|
|
49
|
+
} else {
|
|
50
|
+
++it1;
|
|
51
|
+
++it2;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
|
|
55
|
+
const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
|
|
56
|
+
delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
|
|
57
|
+
return delta;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
template<typename Sketch>
|
|
61
|
+
double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
|
|
62
|
+
const double r1 = sketch1.get_num_retained();
|
|
63
|
+
const double r2 = sketch2.get_num_retained();
|
|
64
|
+
const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
|
|
65
|
+
const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
|
|
66
|
+
const double eps1 = sketch1.get_normalized_rank_error(false);
|
|
67
|
+
const double eps2 = sketch2.get_normalized_rank_error(false);
|
|
68
|
+
return delta_area_threshold + eps1 + eps2;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
template<typename Sketch>
|
|
72
|
+
bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
|
|
73
|
+
return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
} /* namespace datasketches */
|
|
77
|
+
|
|
78
|
+
#endif
|
|
@@ -70,12 +70,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
70
70
|
REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
|
|
71
71
|
REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
|
|
72
72
|
|
|
73
|
-
|
|
74
|
-
for (auto& it: sketch) {
|
|
73
|
+
for (auto it: sketch) {
|
|
75
74
|
(void) it; // to suppress "unused" warning
|
|
76
|
-
|
|
75
|
+
FAIL("should be no iterations over an empty sketch");
|
|
77
76
|
}
|
|
78
|
-
REQUIRE(count == 0);
|
|
79
77
|
}
|
|
80
78
|
|
|
81
79
|
SECTION("get bad quantile") {
|
|
@@ -86,13 +84,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
86
84
|
|
|
87
85
|
SECTION("one item") {
|
|
88
86
|
kll_float_sketch sketch(200, 0);
|
|
89
|
-
sketch.update(1);
|
|
87
|
+
sketch.update(1.0f);
|
|
90
88
|
REQUIRE_FALSE(sketch.is_empty());
|
|
91
89
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
92
90
|
REQUIRE(sketch.get_n() == 1);
|
|
93
91
|
REQUIRE(sketch.get_num_retained() == 1);
|
|
94
|
-
REQUIRE(sketch.get_rank(1) == 0.0);
|
|
95
|
-
REQUIRE(sketch.get_rank(2) == 1.0);
|
|
92
|
+
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
|
93
|
+
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
|
96
94
|
REQUIRE(sketch.get_min_value() == 1.0);
|
|
97
95
|
REQUIRE(sketch.get_max_value() == 1.0);
|
|
98
96
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
|
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
104
102
|
REQUIRE(quantiles[2] == 1.0);
|
|
105
103
|
|
|
106
104
|
int count = 0;
|
|
107
|
-
for (auto
|
|
105
|
+
for (auto it: sketch) {
|
|
108
106
|
REQUIRE(it.second == 1);
|
|
109
107
|
++count;
|
|
110
108
|
}
|
|
@@ -116,16 +114,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
116
114
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
|
117
115
|
REQUIRE(sketch.is_empty());
|
|
118
116
|
|
|
119
|
-
sketch.update(0
|
|
117
|
+
sketch.update(0);
|
|
120
118
|
sketch.update(std::numeric_limits<float>::quiet_NaN());
|
|
121
119
|
REQUIRE(sketch.get_n() == 1);
|
|
122
120
|
}
|
|
123
121
|
|
|
124
122
|
SECTION("many items, exact mode") {
|
|
125
123
|
kll_float_sketch sketch(200, 0);
|
|
126
|
-
const uint32_t n
|
|
124
|
+
const uint32_t n = 200;
|
|
127
125
|
for (uint32_t i = 0; i < n; i++) {
|
|
128
|
-
sketch.update(i);
|
|
126
|
+
sketch.update(static_cast<float>(i));
|
|
129
127
|
REQUIRE(sketch.get_n() == i + 1);
|
|
130
128
|
}
|
|
131
129
|
REQUIRE_FALSE(sketch.is_empty());
|
|
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
145
143
|
|
|
146
144
|
for (uint32_t i = 0; i < n; i++) {
|
|
147
145
|
const double trueRank = (double) i / n;
|
|
148
|
-
REQUIRE(sketch.get_rank(i) == trueRank);
|
|
146
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
|
|
149
147
|
}
|
|
150
148
|
|
|
151
149
|
// the alternative method must produce the same result
|
|
@@ -158,16 +156,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
158
156
|
|
|
159
157
|
SECTION("10 items") {
|
|
160
158
|
kll_float_sketch sketch(200, 0);
|
|
161
|
-
sketch.update(1);
|
|
162
|
-
sketch.update(2);
|
|
163
|
-
sketch.update(3);
|
|
164
|
-
sketch.update(4);
|
|
165
|
-
sketch.update(5);
|
|
166
|
-
sketch.update(6);
|
|
167
|
-
sketch.update(7);
|
|
168
|
-
sketch.update(8);
|
|
169
|
-
sketch.update(9);
|
|
170
|
-
sketch.update(10);
|
|
159
|
+
sketch.update(1.0f);
|
|
160
|
+
sketch.update(2.0f);
|
|
161
|
+
sketch.update(3.0f);
|
|
162
|
+
sketch.update(4.0f);
|
|
163
|
+
sketch.update(5.0f);
|
|
164
|
+
sketch.update(6.0f);
|
|
165
|
+
sketch.update(7.0f);
|
|
166
|
+
sketch.update(8.0f);
|
|
167
|
+
sketch.update(9.0f);
|
|
168
|
+
sketch.update(10.0f);
|
|
171
169
|
REQUIRE(sketch.get_quantile(0) == 1.0);
|
|
172
170
|
REQUIRE(sketch.get_quantile(0.5) == 6.0);
|
|
173
171
|
REQUIRE(sketch.get_quantile(0.99) == 10.0);
|
|
@@ -176,7 +174,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
176
174
|
|
|
177
175
|
SECTION("100 items") {
|
|
178
176
|
kll_float_sketch sketch(200, 0);
|
|
179
|
-
for (int i = 0; i < 100; ++i) sketch.update(i);
|
|
177
|
+
for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
|
|
180
178
|
REQUIRE(sketch.get_quantile(0) == 0);
|
|
181
179
|
REQUIRE(sketch.get_quantile(0.01) == 1);
|
|
182
180
|
REQUIRE(sketch.get_quantile(0.5) == 50);
|
|
@@ -186,9 +184,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
186
184
|
|
|
187
185
|
SECTION("many items, estimation mode") {
|
|
188
186
|
kll_float_sketch sketch(200, 0);
|
|
189
|
-
const int n
|
|
187
|
+
const int n = 1000000;
|
|
190
188
|
for (int i = 0; i < n; i++) {
|
|
191
|
-
sketch.update(i);
|
|
189
|
+
sketch.update(static_cast<float>(i));
|
|
192
190
|
REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
|
|
193
191
|
}
|
|
194
192
|
REQUIRE_FALSE(sketch.is_empty());
|
|
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
201
199
|
// test rank
|
|
202
200
|
for (int i = 0; i < n; i++) {
|
|
203
201
|
const double trueRank = (double) i / n;
|
|
204
|
-
REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
|
|
202
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
|
|
205
203
|
}
|
|
206
204
|
|
|
207
205
|
// test quantiles at every 0.1 percentage point
|
|
@@ -224,6 +222,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
224
222
|
}
|
|
225
223
|
|
|
226
224
|
//std::cout << sketch.to_string();
|
|
225
|
+
|
|
226
|
+
uint32_t count = 0;
|
|
227
|
+
uint64_t total_weight = 0;
|
|
228
|
+
for (auto it: sketch) {
|
|
229
|
+
++count;
|
|
230
|
+
total_weight += it.second;
|
|
231
|
+
}
|
|
232
|
+
REQUIRE(count == sketch.get_num_retained());
|
|
233
|
+
REQUIRE(total_weight == sketch.get_n());
|
|
227
234
|
}
|
|
228
235
|
|
|
229
236
|
SECTION("consistency between get_rank adn get_PMF/CDF") {
|
|
@@ -231,8 +238,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
231
238
|
const int n = 1000;
|
|
232
239
|
float values[n];
|
|
233
240
|
for (int i = 0; i < n; i++) {
|
|
234
|
-
sketch.update(i);
|
|
235
|
-
values[i] = i;
|
|
241
|
+
sketch.update(static_cast<float>(i));
|
|
242
|
+
values[i] = static_cast<float>(i);
|
|
236
243
|
}
|
|
237
244
|
|
|
238
245
|
const auto ranks(sketch.get_CDF(values, n));
|
|
@@ -299,7 +306,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
299
306
|
|
|
300
307
|
SECTION("serialize deserialize one item") {
|
|
301
308
|
kll_float_sketch sketch(200, 0);
|
|
302
|
-
sketch.update(1);
|
|
309
|
+
sketch.update(1.0f);
|
|
303
310
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
304
311
|
sketch.serialize(s);
|
|
305
312
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
@@ -332,8 +339,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
332
339
|
|
|
333
340
|
SECTION("stream serialize deserialize many floats") {
|
|
334
341
|
kll_float_sketch sketch(200, 0);
|
|
335
|
-
const int n
|
|
336
|
-
for (int i = 0; i < n; i++) sketch.update(i);
|
|
342
|
+
const int n = 1000;
|
|
343
|
+
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
|
337
344
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
338
345
|
sketch.serialize(s);
|
|
339
346
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
@@ -350,13 +357,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
350
357
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
|
351
358
|
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
|
|
352
359
|
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
|
|
353
|
-
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
|
|
360
|
+
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
|
|
354
361
|
}
|
|
355
362
|
|
|
356
363
|
SECTION("bytes serialize deserialize many floats") {
|
|
357
364
|
kll_float_sketch sketch(200, 0);
|
|
358
|
-
const int n
|
|
359
|
-
for (int i = 0; i < n; i++) sketch.update(i);
|
|
365
|
+
const int n = 1000;
|
|
366
|
+
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
|
360
367
|
auto bytes = sketch.serialize();
|
|
361
368
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
362
369
|
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
@@ -371,7 +378,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
371
378
|
REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
|
|
372
379
|
REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
|
|
373
380
|
REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
|
|
374
|
-
REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
|
|
381
|
+
REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
|
|
375
382
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
|
|
376
383
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
|
|
377
384
|
REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
|
|
@@ -379,7 +386,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
379
386
|
|
|
380
387
|
SECTION("bytes serialize deserialize many ints") {
|
|
381
388
|
kll_sketch<int> sketch;
|
|
382
|
-
const int n
|
|
389
|
+
const int n = 1000;
|
|
383
390
|
for (int i = 0; i < n; i++) sketch.update(i);
|
|
384
391
|
auto bytes = sketch.serialize();
|
|
385
392
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
@@ -439,8 +446,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
439
446
|
kll_float_sketch sketch2(200, 0);
|
|
440
447
|
const int n = 10000;
|
|
441
448
|
for (int i = 0; i < n; i++) {
|
|
442
|
-
sketch1.update(i);
|
|
443
|
-
sketch2.update((2 * n) - i - 1);
|
|
449
|
+
sketch1.update(static_cast<float>(i));
|
|
450
|
+
sketch2.update(static_cast<float>((2 * n) - i - 1));
|
|
444
451
|
}
|
|
445
452
|
|
|
446
453
|
REQUIRE(sketch1.get_min_value() == 0.0f);
|
|
@@ -462,8 +469,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
462
469
|
kll_float_sketch sketch2(128, 0);
|
|
463
470
|
const int n = 10000;
|
|
464
471
|
for (int i = 0; i < n; i++) {
|
|
465
|
-
sketch1.update(i);
|
|
466
|
-
sketch2.update((2 * n) - i - 1);
|
|
472
|
+
sketch1.update(static_cast<float>(i));
|
|
473
|
+
sketch2.update(static_cast<float>((2 * n) - i - 1));
|
|
467
474
|
}
|
|
468
475
|
|
|
469
476
|
REQUIRE(sketch1.get_min_value() == 0.0f);
|
|
@@ -495,7 +502,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
495
502
|
kll_float_sketch sketch2(128, 0);
|
|
496
503
|
const int n = 10000;
|
|
497
504
|
for (int i = 0; i < n; i++) {
|
|
498
|
-
sketch1.update(i);
|
|
505
|
+
sketch1.update(static_cast<float>(i));
|
|
499
506
|
}
|
|
500
507
|
|
|
501
508
|
// rank error should not be affected by a merge with an empty sketch with lower k
|
|
@@ -518,8 +525,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
518
525
|
SECTION("merge min value from other") {
|
|
519
526
|
kll_float_sketch sketch1(200, 0);
|
|
520
527
|
kll_float_sketch sketch2(200, 0);
|
|
521
|
-
sketch1.update(1);
|
|
522
|
-
sketch2.update(2);
|
|
528
|
+
sketch1.update(1.0f);
|
|
529
|
+
sketch2.update(2.0f);
|
|
523
530
|
sketch2.merge(sketch1);
|
|
524
531
|
REQUIRE(sketch2.get_min_value() == 1.0f);
|
|
525
532
|
REQUIRE(sketch2.get_max_value() == 2.0f);
|
|
@@ -527,7 +534,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
527
534
|
|
|
528
535
|
SECTION("merge min and max values from other") {
|
|
529
536
|
kll_float_sketch sketch1(200, 0);
|
|
530
|
-
for (int i = 0; i < 1000000; i++) sketch1.update(i);
|
|
537
|
+
for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
|
|
531
538
|
kll_float_sketch sketch2(200, 0);
|
|
532
539
|
sketch2.merge(sketch1);
|
|
533
540
|
REQUIRE(sketch2.get_min_value() == 0.0f);
|
|
@@ -540,7 +547,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
540
547
|
REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
|
|
541
548
|
REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
|
|
542
549
|
|
|
543
|
-
const int n
|
|
550
|
+
const int n = 1000;
|
|
544
551
|
for (int i = 0; i < n; i++) sketch.update(i);
|
|
545
552
|
|
|
546
553
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
@@ -679,6 +686,22 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
679
686
|
}
|
|
680
687
|
}
|
|
681
688
|
|
|
689
|
+
SECTION("max serialized size arithmetic type") {
|
|
690
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
|
|
691
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
|
|
692
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
|
|
693
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
|
|
694
|
+
REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
SECTION("max serialized size non-arithmetic type") {
|
|
698
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
|
|
699
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
|
|
700
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
|
|
701
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
|
|
702
|
+
REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
|
|
703
|
+
}
|
|
704
|
+
|
|
682
705
|
// cleanup
|
|
683
706
|
if (test_allocator_total_bytes != 0) {
|
|
684
707
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
|
|
22
|
+
#include <random>
|
|
23
|
+
|
|
24
|
+
#include <kll_sketch.hpp>
|
|
25
|
+
#include <kolmogorov_smirnov.hpp>
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
  // two sketches that were never updated
  const uint16_t k = 200;
  kll_sketch<double> first(k);
  kll_sketch<double> second(k);

  // no data on either side: zero delta, and the test must not reject
  const double raw_delta = kolmogorov_smirnov::delta(first, second);
  REQUIRE(raw_delta == 0);
  const bool rejected = kolmogorov_smirnov::test(first, second, 0.01);
  REQUIRE_FALSE(rejected);
}
|
|
36
|
+
|
|
37
|
+
TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
  const uint16_t k = 200;
  kll_sketch<double> first(k);
  kll_sketch<double> second(k);
  std::default_random_engine engine;
  std::normal_distribution<double> gaussian;

  // feed the very same samples to both sketches
  const int num_items = k * 3 - 1;
  int fed = 0;
  while (fed < num_items) {
    const double sample = gaussian(engine);
    first.update(sample);
    second.update(sample);
    ++fed;
  }

  // identical input: delta stays near zero and the test must not reject
  REQUIRE(kolmogorov_smirnov::delta(first, second) == Approx(0).margin(0.01));
  const bool rejected = kolmogorov_smirnov::test(first, second, 0.01);
  REQUIRE_FALSE(rejected);
}
|
|
52
|
+
|
|
53
|
+
TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
  const uint16_t k = 200;
  kll_sketch<double> shifted(k);
  kll_sketch<double> baseline(k);
  std::default_random_engine engine;
  std::normal_distribution<double> gaussian;

  // the first sketch sees every sample moved up by 100
  const int num_items = k * 3 - 1;
  int fed = 0;
  while (fed < num_items) {
    const double sample = gaussian(engine);
    shifted.update(sample + 100.0);
    baseline.update(sample);
    ++fed;
  }

  // delta is expected to be (almost exactly) 1 and never above it
  const double delta = kolmogorov_smirnov::delta(shifted, baseline);
  REQUIRE(delta == Approx(1.0).margin(1e-6));
  REQUIRE(delta <= 1);
  const bool rejected = kolmogorov_smirnov::test(shifted, baseline, 0.05);
  REQUIRE(rejected);
}
|
|
70
|
+
|
|
71
|
+
TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
  const uint16_t k = 2000;
  kll_sketch<double> shifted(k);
  kll_sketch<double> baseline(k);
  std::default_random_engine engine;
  std::normal_distribution<double> gaussian;

  // the first sketch sees every sample moved up by 0.05
  const int num_items = k * 3 - 1;
  int fed = 0;
  while (fed < num_items) {
    const double sample = gaussian(engine);
    shifted.update(sample + 0.05);
    baseline.update(sample);
    ++fed;
  }

  const double delta = kolmogorov_smirnov::delta(shifted, baseline);
  REQUIRE(delta == Approx(0.02).margin(0.01));

  // with k = 2000 the shift is expected to stay below the threshold at p = 0.05
  const double threshold = kolmogorov_smirnov::threshold(shifted, baseline, 0.05);
  //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
  REQUIRE_FALSE(delta > threshold);
  const bool rejected = kolmogorov_smirnov::test(shifted, baseline, 0.05);
  REQUIRE_FALSE(rejected);
}
|
|
90
|
+
|
|
91
|
+
TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
  const uint16_t k = 8000;
  kll_sketch<double> shifted(k);
  kll_sketch<double> baseline(k);
  std::default_random_engine engine;
  std::normal_distribution<double> gaussian;

  // the first sketch sees every sample moved up by 0.05
  const int num_items = k * 3 - 1;
  int fed = 0;
  while (fed < num_items) {
    const double sample = gaussian(engine);
    shifted.update(sample + 0.05);
    baseline.update(sample);
    ++fed;
  }

  const double delta = kolmogorov_smirnov::delta(shifted, baseline);
  REQUIRE(delta == Approx(0.02).margin(0.01));

  // with k = 8000 the same shift is expected to exceed the threshold at p = 0.05
  const double threshold = kolmogorov_smirnov::threshold(shifted, baseline, 0.05);
  //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
  REQUIRE(delta > threshold);
  const bool rejected = kolmogorov_smirnov::test(shifted, baseline, 0.05);
  REQUIRE(rejected);
}
|
|
110
|
+
|
|
111
|
+
} /* namespace datasketches */
|