datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_HPP_
22
+
23
+ namespace datasketches {
24
+
/**
 * Two-sample Kolmogorov-Smirnov test on top of quantile sketches.
 * All members are static; the class only serves as a named scope.
 */
class kolmogorov_smirnov {
public:
  /**
   * Computes the raw Kolmogorov-Smirnov statistic for two sketches: the largest
   * absolute difference between their normalized cumulative weights.
   * Both arguments must be of the same sketch type.
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @return the raw delta between the two sketches, in the range [0, 1]
   */
  template<typename Sketch>
  static double delta(const Sketch& sketch1, const Sketch& sketch2);

  /**
   * Computes the adjusted threshold for the Kolmogorov-Smirnov Test.
   * The classic two-sample threshold is widened by the normalized rank error
   * (epsilon) of each of the two given sketches.
   * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov Test</a>
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @param p Target p-value. Typically .001 to .1, e.g., .05.
   * @return the adjusted threshold to be compared with the raw delta
   */
  template<typename Sketch>
  static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);

  /**
   * Runs the Kolmogorov-Smirnov Test on two KLL quantiles sketches by comparing
   * the raw delta against the adjusted threshold.
   * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
   * this will return false.
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @param p Target p-value. Typically .001 to .1, e.g., .05.
   * @return true if the null hypothesis (both sketches reflect the same underlying
   * distribution) can be rejected at the provided p-value, false otherwise
   */
  template<typename Sketch>
  static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);

};
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "kolmogorov_smirnov_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,78 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
#ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_

#include <algorithm>
#include <cmath>
23
+ namespace datasketches {
24
+
25
+ // type resolver
26
+ template<typename T, typename C, typename S, typename A>
27
+ kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
+ return kll_quantile_calculator<T, C, A>(sketch);
29
+ }
30
+
31
+ template<typename Sketch>
32
+ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
+ using Comparator = typename Sketch::comparator;
34
+ auto calc1 = make_quantile_calculator(sketch1);
35
+ auto calc2 = make_quantile_calculator(sketch2);
36
+ auto it1 = calc1.begin();
37
+ auto it2 = calc2.begin();
38
+ const auto n1 = sketch1.get_n();
39
+ const auto n2 = sketch2.get_n();
40
+ double delta = 0;
41
+ while (it1 != calc1.end() && it2 != calc2.end()) {
42
+ const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
+ const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
+ if (Comparator()((*it1).first, (*it2).first)) {
46
+ ++it1;
47
+ } else if (Comparator()((*it2).first, (*it1).first)) {
48
+ ++it2;
49
+ } else {
50
+ ++it1;
51
+ ++it2;
52
+ }
53
+ }
54
+ const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
+ const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
+ return delta;
58
+ }
59
+
60
+ template<typename Sketch>
61
+ double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
62
+ const double r1 = sketch1.get_num_retained();
63
+ const double r2 = sketch2.get_num_retained();
64
+ const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
65
+ const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
66
+ const double eps1 = sketch1.get_normalized_rank_error(false);
67
+ const double eps2 = sketch2.get_normalized_rank_error(false);
68
+ return delta_area_threshold + eps1 + eps2;
69
+ }
70
+
71
+ template<typename Sketch>
72
+ bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
73
+ return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
74
+ }
75
+
76
+ } /* namespace datasketches */
77
+
78
+ #endif
@@ -41,4 +41,5 @@ target_sources(kll_test
41
41
  kll_sketch_test.cpp
42
42
  kll_sketch_custom_type_test.cpp
43
43
  kll_sketch_validation.cpp
44
+ kolmogorov_smirnov_test.cpp
44
45
  )
@@ -70,12 +70,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
70
70
  REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
71
71
  REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
72
72
 
73
- int count = 0;
74
- for (auto& it: sketch) {
73
+ for (auto it: sketch) {
75
74
  (void) it; // to suppress "unused" warning
76
- ++count;
75
+ FAIL("should be no iterations over an empty sketch");
77
76
  }
78
- REQUIRE(count == 0);
79
77
  }
80
78
 
81
79
  SECTION("get bad quantile") {
@@ -86,13 +84,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
86
84
 
87
85
  SECTION("one item") {
88
86
  kll_float_sketch sketch(200, 0);
89
- sketch.update(1);
87
+ sketch.update(1.0f);
90
88
  REQUIRE_FALSE(sketch.is_empty());
91
89
  REQUIRE_FALSE(sketch.is_estimation_mode());
92
90
  REQUIRE(sketch.get_n() == 1);
93
91
  REQUIRE(sketch.get_num_retained() == 1);
94
- REQUIRE(sketch.get_rank(1) == 0.0);
95
- REQUIRE(sketch.get_rank(2) == 1.0);
92
+ REQUIRE(sketch.get_rank(1.0f) == 0.0);
93
+ REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
94
  REQUIRE(sketch.get_min_value() == 1.0);
97
95
  REQUIRE(sketch.get_max_value() == 1.0);
98
96
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
104
102
  REQUIRE(quantiles[2] == 1.0);
105
103
 
106
104
  int count = 0;
107
- for (auto& it: sketch) {
105
+ for (auto it: sketch) {
108
106
  REQUIRE(it.second == 1);
109
107
  ++count;
110
108
  }
@@ -116,16 +114,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
116
114
  sketch.update(std::numeric_limits<float>::quiet_NaN());
117
115
  REQUIRE(sketch.is_empty());
118
116
 
119
- sketch.update(0.0);
117
+ sketch.update(0);
120
118
  sketch.update(std::numeric_limits<float>::quiet_NaN());
121
119
  REQUIRE(sketch.get_n() == 1);
122
120
  }
123
121
 
124
122
  SECTION("many items, exact mode") {
125
123
  kll_float_sketch sketch(200, 0);
126
- const uint32_t n(200);
124
+ const uint32_t n = 200;
127
125
  for (uint32_t i = 0; i < n; i++) {
128
- sketch.update(i);
126
+ sketch.update(static_cast<float>(i));
129
127
  REQUIRE(sketch.get_n() == i + 1);
130
128
  }
131
129
  REQUIRE_FALSE(sketch.is_empty());
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
145
143
 
146
144
  for (uint32_t i = 0; i < n; i++) {
147
145
  const double trueRank = (double) i / n;
148
- REQUIRE(sketch.get_rank(i) == trueRank);
146
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
149
147
  }
150
148
 
151
149
  // the alternative method must produce the same result
@@ -158,16 +156,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
158
156
 
159
157
  SECTION("10 items") {
160
158
  kll_float_sketch sketch(200, 0);
161
- sketch.update(1);
162
- sketch.update(2);
163
- sketch.update(3);
164
- sketch.update(4);
165
- sketch.update(5);
166
- sketch.update(6);
167
- sketch.update(7);
168
- sketch.update(8);
169
- sketch.update(9);
170
- sketch.update(10);
159
+ sketch.update(1.0f);
160
+ sketch.update(2.0f);
161
+ sketch.update(3.0f);
162
+ sketch.update(4.0f);
163
+ sketch.update(5.0f);
164
+ sketch.update(6.0f);
165
+ sketch.update(7.0f);
166
+ sketch.update(8.0f);
167
+ sketch.update(9.0f);
168
+ sketch.update(10.0f);
171
169
  REQUIRE(sketch.get_quantile(0) == 1.0);
172
170
  REQUIRE(sketch.get_quantile(0.5) == 6.0);
173
171
  REQUIRE(sketch.get_quantile(0.99) == 10.0);
@@ -176,7 +174,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
176
174
 
177
175
  SECTION("100 items") {
178
176
  kll_float_sketch sketch(200, 0);
179
- for (int i = 0; i < 100; ++i) sketch.update(i);
177
+ for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
180
178
  REQUIRE(sketch.get_quantile(0) == 0);
181
179
  REQUIRE(sketch.get_quantile(0.01) == 1);
182
180
  REQUIRE(sketch.get_quantile(0.5) == 50);
@@ -186,9 +184,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
186
184
 
187
185
  SECTION("many items, estimation mode") {
188
186
  kll_float_sketch sketch(200, 0);
189
- const int n(1000000);
187
+ const int n = 1000000;
190
188
  for (int i = 0; i < n; i++) {
191
- sketch.update(i);
189
+ sketch.update(static_cast<float>(i));
192
190
  REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
193
191
  }
194
192
  REQUIRE_FALSE(sketch.is_empty());
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
201
199
  // test rank
202
200
  for (int i = 0; i < n; i++) {
203
201
  const double trueRank = (double) i / n;
204
- REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
202
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
205
203
  }
206
204
 
207
205
  // test quantiles at every 0.1 percentage point
@@ -224,6 +222,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
224
222
  }
225
223
 
226
224
  //std::cout << sketch.to_string();
225
+
226
+ uint32_t count = 0;
227
+ uint64_t total_weight = 0;
228
+ for (auto it: sketch) {
229
+ ++count;
230
+ total_weight += it.second;
231
+ }
232
+ REQUIRE(count == sketch.get_num_retained());
233
+ REQUIRE(total_weight == sketch.get_n());
227
234
  }
228
235
 
229
236
  SECTION("consistency between get_rank adn get_PMF/CDF") {
@@ -231,8 +238,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
231
238
  const int n = 1000;
232
239
  float values[n];
233
240
  for (int i = 0; i < n; i++) {
234
- sketch.update(i);
235
- values[i] = i;
241
+ sketch.update(static_cast<float>(i));
242
+ values[i] = static_cast<float>(i);
236
243
  }
237
244
 
238
245
  const auto ranks(sketch.get_CDF(values, n));
@@ -299,7 +306,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
299
306
 
300
307
  SECTION("serialize deserialize one item") {
301
308
  kll_float_sketch sketch(200, 0);
302
- sketch.update(1);
309
+ sketch.update(1.0f);
303
310
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
304
311
  sketch.serialize(s);
305
312
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -332,8 +339,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
332
339
 
333
340
  SECTION("stream serialize deserialize many floats") {
334
341
  kll_float_sketch sketch(200, 0);
335
- const int n(1000);
336
- for (int i = 0; i < n; i++) sketch.update(i);
342
+ const int n = 1000;
343
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
337
344
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
338
345
  sketch.serialize(s);
339
346
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -350,13 +357,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
350
357
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
351
358
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
352
359
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
353
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
360
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
354
361
  }
355
362
 
356
363
  SECTION("bytes serialize deserialize many floats") {
357
364
  kll_float_sketch sketch(200, 0);
358
- const int n(1000);
359
- for (int i = 0; i < n; i++) sketch.update(i);
365
+ const int n = 1000;
366
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
360
367
  auto bytes = sketch.serialize();
361
368
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
362
369
  auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
@@ -371,7 +378,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
371
378
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
372
379
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
373
380
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
374
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
381
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
375
382
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
376
383
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
377
384
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
@@ -379,7 +386,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
379
386
 
380
387
  SECTION("bytes serialize deserialize many ints") {
381
388
  kll_sketch<int> sketch;
382
- const int n(1000);
389
+ const int n = 1000;
383
390
  for (int i = 0; i < n; i++) sketch.update(i);
384
391
  auto bytes = sketch.serialize();
385
392
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
@@ -439,8 +446,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
439
446
  kll_float_sketch sketch2(200, 0);
440
447
  const int n = 10000;
441
448
  for (int i = 0; i < n; i++) {
442
- sketch1.update(i);
443
- sketch2.update((2 * n) - i - 1);
449
+ sketch1.update(static_cast<float>(i));
450
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
444
451
  }
445
452
 
446
453
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -462,8 +469,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
462
469
  kll_float_sketch sketch2(128, 0);
463
470
  const int n = 10000;
464
471
  for (int i = 0; i < n; i++) {
465
- sketch1.update(i);
466
- sketch2.update((2 * n) - i - 1);
472
+ sketch1.update(static_cast<float>(i));
473
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
467
474
  }
468
475
 
469
476
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -495,7 +502,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
495
502
  kll_float_sketch sketch2(128, 0);
496
503
  const int n = 10000;
497
504
  for (int i = 0; i < n; i++) {
498
- sketch1.update(i);
505
+ sketch1.update(static_cast<float>(i));
499
506
  }
500
507
 
501
508
  // rank error should not be affected by a merge with an empty sketch with lower k
@@ -518,8 +525,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
518
525
  SECTION("merge min value from other") {
519
526
  kll_float_sketch sketch1(200, 0);
520
527
  kll_float_sketch sketch2(200, 0);
521
- sketch1.update(1);
522
- sketch2.update(2);
528
+ sketch1.update(1.0f);
529
+ sketch2.update(2.0f);
523
530
  sketch2.merge(sketch1);
524
531
  REQUIRE(sketch2.get_min_value() == 1.0f);
525
532
  REQUIRE(sketch2.get_max_value() == 2.0f);
@@ -527,7 +534,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
527
534
 
528
535
  SECTION("merge min and max values from other") {
529
536
  kll_float_sketch sketch1(200, 0);
530
- for (int i = 0; i < 1000000; i++) sketch1.update(i);
537
+ for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
531
538
  kll_float_sketch sketch2(200, 0);
532
539
  sketch2.merge(sketch1);
533
540
  REQUIRE(sketch2.get_min_value() == 0.0f);
@@ -540,7 +547,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
540
547
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
541
548
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
542
549
 
543
- const int n(1000);
550
+ const int n = 1000;
544
551
  for (int i = 0; i < n; i++) sketch.update(i);
545
552
 
546
553
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
@@ -679,6 +686,22 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
679
686
  }
680
687
  }
681
688
 
689
+ SECTION("max serialized size arithmetic type") {
690
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
691
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
692
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
693
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
694
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
695
+ }
696
+
697
+ SECTION("max serialized size non-arithmetic type") {
698
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
699
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
700
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
701
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
702
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
703
+ }
704
+
682
705
  // cleanup
683
706
  if (test_allocator_total_bytes != 0) {
684
707
  REQUIRE(test_allocator_total_bytes == 0);
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <random>
23
+
24
+ #include <kll_sketch.hpp>
25
+ #include <kolmogorov_smirnov.hpp>
26
+
27
+ namespace datasketches {
28
+
29
+ TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
30
+ const uint16_t k = 200;
31
+ kll_sketch<double> sketch1(k);
32
+ kll_sketch<double> sketch2(k);
33
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == 0);
34
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
35
+ }
36
+
37
+ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
38
+ const uint16_t k = 200;
39
+ kll_sketch<double> sketch1(k);
40
+ kll_sketch<double> sketch2(k);
41
+ std::default_random_engine rand;
42
+ std::normal_distribution<double> distr;
43
+ const int n = k * 3 - 1;
44
+ for (int i = 0; i < n; ++i) {
45
+ const double x = distr(rand);
46
+ sketch1.update(x);
47
+ sketch2.update(x);
48
+ }
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
50
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
+ }
52
+
53
+ TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
54
+ const uint16_t k = 200;
55
+ kll_sketch<double> sketch1(k);
56
+ kll_sketch<double> sketch2(k);
57
+ std::default_random_engine rand;
58
+ std::normal_distribution<double> distr;
59
+ const int n = k * 3 - 1;
60
+ for (int i = 0; i < n; ++i) {
61
+ const double x = distr(rand);
62
+ sketch1.update(x + 100.0);
63
+ sketch2.update(x);
64
+ }
65
+ const auto delta = kolmogorov_smirnov::delta(sketch1, sketch2);
66
+ REQUIRE(delta == Approx(1.0).margin(1e-6));
67
+ REQUIRE(delta <= 1);
68
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
69
+ }
70
+
71
+ TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
72
+ const uint16_t k = 2000;
73
+ kll_sketch<double> sketch1(k);
74
+ kll_sketch<double> sketch2(k);
75
+ std::default_random_engine rand;
76
+ std::normal_distribution<double> distr;
77
+ const int n = k * 3 - 1;
78
+ for (int i = 0; i < n; ++i) {
79
+ const double x = distr(rand);
80
+ sketch1.update(x + 0.05);
81
+ sketch2.update(x);
82
+ }
83
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
84
+ REQUIRE(delta == Approx(0.02).margin(0.01));
85
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
86
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
87
+ REQUIRE_FALSE(delta > threshold);
88
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
89
+ }
90
+
91
+ TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
92
+ const uint16_t k = 8000;
93
+ kll_sketch<double> sketch1(k);
94
+ kll_sketch<double> sketch2(k);
95
+ std::default_random_engine rand;
96
+ std::normal_distribution<double> distr;
97
+ const int n = k * 3 - 1;
98
+ for (int i = 0; i < n; ++i) {
99
+ const double x = distr(rand);
100
+ sketch1.update(x + 0.05);
101
+ sketch2.update(x);
102
+ }
103
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
104
+ REQUIRE(delta == Approx(0.02).margin(0.01));
105
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
106
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
107
+ REQUIRE(delta > threshold);
108
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
109
+ }
110
+
111
+ } /* namespace datasketches */