datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_HPP_
22
+
23
+ namespace datasketches {
24
+
25
class kolmogorov_smirnov {
public:
  /**
   * Raw delta area between two KLL quantile sketches, i.e. the statistic used by
   * the Kolmogorov-Smirnov Test.
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @return the raw delta between the two KLL quantile sketches
   */
  template<typename Sketch>
  static double delta(const Sketch& sketch1, const Sketch& sketch2);

  /**
   * Adjusted delta area threshold for the Kolmogorov-Smirnov Test.
   * The computed threshold is widened by the error epsilons of both sketches.
   * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov-Smirnov Test</a>
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @param p Target p-value. Typically .001 to .1, e.g., .05.
   * @return the adjusted threshold that the raw delta is compared against
   */
  template<typename Sketch>
  static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);

  /**
   * Runs the Kolmogorov-Smirnov Test on two KLL quantile sketches.
   * Note: false is returned when the given sketches hold insufficient data
   * or when the sketch sizes are too small.
   * @param sketch1 KLL sketch 1
   * @param sketch2 KLL sketch 2
   * @param p Target p-value. Typically .001 to .1, e.g., .05.
   * @return true if the null hypothesis (both sketches reflect the same underlying
   * distribution) can be rejected at the provided p-value, false otherwise.
   */
  template<typename Sketch>
  static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);

};
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "kolmogorov_smirnov_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,78 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21
#define KOLMOGOROV_SMIRNOV_IMPL_HPP_

#include <algorithm>
#include <cmath>

23
+ namespace datasketches {
24
+
25
+ // type resolver
26
+ template<typename T, typename C, typename S, typename A>
27
+ kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
+ return kll_quantile_calculator<T, C, A>(sketch);
29
+ }
30
+
31
+ template<typename Sketch>
32
+ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
+ using Comparator = typename Sketch::comparator;
34
+ auto calc1 = make_quantile_calculator(sketch1);
35
+ auto calc2 = make_quantile_calculator(sketch2);
36
+ auto it1 = calc1.begin();
37
+ auto it2 = calc2.begin();
38
+ const auto n1 = sketch1.get_n();
39
+ const auto n2 = sketch2.get_n();
40
+ double delta = 0;
41
+ while (it1 != calc1.end() && it2 != calc2.end()) {
42
+ const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
+ const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
+ if (Comparator()((*it1).first, (*it2).first)) {
46
+ ++it1;
47
+ } else if (Comparator()((*it2).first, (*it1).first)) {
48
+ ++it2;
49
+ } else {
50
+ ++it1;
51
+ ++it2;
52
+ }
53
+ }
54
+ const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
+ const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
+ return delta;
58
+ }
59
+
60
+ template<typename Sketch>
61
+ double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
62
+ const double r1 = sketch1.get_num_retained();
63
+ const double r2 = sketch2.get_num_retained();
64
+ const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
65
+ const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
66
+ const double eps1 = sketch1.get_normalized_rank_error(false);
67
+ const double eps2 = sketch2.get_normalized_rank_error(false);
68
+ return delta_area_threshold + eps1 + eps2;
69
+ }
70
+
71
+ template<typename Sketch>
72
+ bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
73
+ return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
74
+ }
75
+
76
+ } /* namespace datasketches */
77
+
78
+ #endif
@@ -41,4 +41,5 @@ target_sources(kll_test
41
41
  kll_sketch_test.cpp
42
42
  kll_sketch_custom_type_test.cpp
43
43
  kll_sketch_validation.cpp
44
+ kolmogorov_smirnov_test.cpp
44
45
  )
@@ -70,12 +70,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
70
70
  REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
71
71
  REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
72
72
 
73
- int count = 0;
74
- for (auto& it: sketch) {
73
+ for (auto it: sketch) {
75
74
  (void) it; // to suppress "unused" warning
76
- ++count;
75
+ FAIL("should be no iterations over an empty sketch");
77
76
  }
78
- REQUIRE(count == 0);
79
77
  }
80
78
 
81
79
  SECTION("get bad quantile") {
@@ -86,13 +84,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
86
84
 
87
85
  SECTION("one item") {
88
86
  kll_float_sketch sketch(200, 0);
89
- sketch.update(1);
87
+ sketch.update(1.0f);
90
88
  REQUIRE_FALSE(sketch.is_empty());
91
89
  REQUIRE_FALSE(sketch.is_estimation_mode());
92
90
  REQUIRE(sketch.get_n() == 1);
93
91
  REQUIRE(sketch.get_num_retained() == 1);
94
- REQUIRE(sketch.get_rank(1) == 0.0);
95
- REQUIRE(sketch.get_rank(2) == 1.0);
92
+ REQUIRE(sketch.get_rank(1.0f) == 0.0);
93
+ REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
94
  REQUIRE(sketch.get_min_value() == 1.0);
97
95
  REQUIRE(sketch.get_max_value() == 1.0);
98
96
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
104
102
  REQUIRE(quantiles[2] == 1.0);
105
103
 
106
104
  int count = 0;
107
- for (auto& it: sketch) {
105
+ for (auto it: sketch) {
108
106
  REQUIRE(it.second == 1);
109
107
  ++count;
110
108
  }
@@ -116,16 +114,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
116
114
  sketch.update(std::numeric_limits<float>::quiet_NaN());
117
115
  REQUIRE(sketch.is_empty());
118
116
 
119
- sketch.update(0.0);
117
+ sketch.update(0);
120
118
  sketch.update(std::numeric_limits<float>::quiet_NaN());
121
119
  REQUIRE(sketch.get_n() == 1);
122
120
  }
123
121
 
124
122
  SECTION("many items, exact mode") {
125
123
  kll_float_sketch sketch(200, 0);
126
- const uint32_t n(200);
124
+ const uint32_t n = 200;
127
125
  for (uint32_t i = 0; i < n; i++) {
128
- sketch.update(i);
126
+ sketch.update(static_cast<float>(i));
129
127
  REQUIRE(sketch.get_n() == i + 1);
130
128
  }
131
129
  REQUIRE_FALSE(sketch.is_empty());
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
145
143
 
146
144
  for (uint32_t i = 0; i < n; i++) {
147
145
  const double trueRank = (double) i / n;
148
- REQUIRE(sketch.get_rank(i) == trueRank);
146
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
149
147
  }
150
148
 
151
149
  // the alternative method must produce the same result
@@ -158,16 +156,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
158
156
 
159
157
  SECTION("10 items") {
160
158
  kll_float_sketch sketch(200, 0);
161
- sketch.update(1);
162
- sketch.update(2);
163
- sketch.update(3);
164
- sketch.update(4);
165
- sketch.update(5);
166
- sketch.update(6);
167
- sketch.update(7);
168
- sketch.update(8);
169
- sketch.update(9);
170
- sketch.update(10);
159
+ sketch.update(1.0f);
160
+ sketch.update(2.0f);
161
+ sketch.update(3.0f);
162
+ sketch.update(4.0f);
163
+ sketch.update(5.0f);
164
+ sketch.update(6.0f);
165
+ sketch.update(7.0f);
166
+ sketch.update(8.0f);
167
+ sketch.update(9.0f);
168
+ sketch.update(10.0f);
171
169
  REQUIRE(sketch.get_quantile(0) == 1.0);
172
170
  REQUIRE(sketch.get_quantile(0.5) == 6.0);
173
171
  REQUIRE(sketch.get_quantile(0.99) == 10.0);
@@ -176,7 +174,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
176
174
 
177
175
  SECTION("100 items") {
178
176
  kll_float_sketch sketch(200, 0);
179
- for (int i = 0; i < 100; ++i) sketch.update(i);
177
+ for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
180
178
  REQUIRE(sketch.get_quantile(0) == 0);
181
179
  REQUIRE(sketch.get_quantile(0.01) == 1);
182
180
  REQUIRE(sketch.get_quantile(0.5) == 50);
@@ -186,9 +184,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
186
184
 
187
185
  SECTION("many items, estimation mode") {
188
186
  kll_float_sketch sketch(200, 0);
189
- const int n(1000000);
187
+ const int n = 1000000;
190
188
  for (int i = 0; i < n; i++) {
191
- sketch.update(i);
189
+ sketch.update(static_cast<float>(i));
192
190
  REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
193
191
  }
194
192
  REQUIRE_FALSE(sketch.is_empty());
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
201
199
  // test rank
202
200
  for (int i = 0; i < n; i++) {
203
201
  const double trueRank = (double) i / n;
204
- REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
202
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
205
203
  }
206
204
 
207
205
  // test quantiles at every 0.1 percentage point
@@ -224,6 +222,15 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
224
222
  }
225
223
 
226
224
  //std::cout << sketch.to_string();
225
+
226
+ uint32_t count = 0;
227
+ uint64_t total_weight = 0;
228
+ for (auto it: sketch) {
229
+ ++count;
230
+ total_weight += it.second;
231
+ }
232
+ REQUIRE(count == sketch.get_num_retained());
233
+ REQUIRE(total_weight == sketch.get_n());
227
234
  }
228
235
 
229
236
  SECTION("consistency between get_rank adn get_PMF/CDF") {
@@ -231,8 +238,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
231
238
  const int n = 1000;
232
239
  float values[n];
233
240
  for (int i = 0; i < n; i++) {
234
- sketch.update(i);
235
- values[i] = i;
241
+ sketch.update(static_cast<float>(i));
242
+ values[i] = static_cast<float>(i);
236
243
  }
237
244
 
238
245
  const auto ranks(sketch.get_CDF(values, n));
@@ -299,7 +306,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
299
306
 
300
307
  SECTION("serialize deserialize one item") {
301
308
  kll_float_sketch sketch(200, 0);
302
- sketch.update(1);
309
+ sketch.update(1.0f);
303
310
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
304
311
  sketch.serialize(s);
305
312
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -332,8 +339,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
332
339
 
333
340
  SECTION("stream serialize deserialize many floats") {
334
341
  kll_float_sketch sketch(200, 0);
335
- const int n(1000);
336
- for (int i = 0; i < n; i++) sketch.update(i);
342
+ const int n = 1000;
343
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
337
344
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
338
345
  sketch.serialize(s);
339
346
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
@@ -350,13 +357,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
350
357
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
351
358
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
352
359
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
353
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
360
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
354
361
  }
355
362
 
356
363
  SECTION("bytes serialize deserialize many floats") {
357
364
  kll_float_sketch sketch(200, 0);
358
- const int n(1000);
359
- for (int i = 0; i < n; i++) sketch.update(i);
365
+ const int n = 1000;
366
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
360
367
  auto bytes = sketch.serialize();
361
368
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
362
369
  auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
@@ -371,7 +378,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
371
378
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
372
379
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
373
380
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
374
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
381
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
375
382
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
376
383
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
377
384
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
@@ -379,7 +386,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
379
386
 
380
387
  SECTION("bytes serialize deserialize many ints") {
381
388
  kll_sketch<int> sketch;
382
- const int n(1000);
389
+ const int n = 1000;
383
390
  for (int i = 0; i < n; i++) sketch.update(i);
384
391
  auto bytes = sketch.serialize();
385
392
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
@@ -439,8 +446,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
439
446
  kll_float_sketch sketch2(200, 0);
440
447
  const int n = 10000;
441
448
  for (int i = 0; i < n; i++) {
442
- sketch1.update(i);
443
- sketch2.update((2 * n) - i - 1);
449
+ sketch1.update(static_cast<float>(i));
450
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
444
451
  }
445
452
 
446
453
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -462,8 +469,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
462
469
  kll_float_sketch sketch2(128, 0);
463
470
  const int n = 10000;
464
471
  for (int i = 0; i < n; i++) {
465
- sketch1.update(i);
466
- sketch2.update((2 * n) - i - 1);
472
+ sketch1.update(static_cast<float>(i));
473
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
467
474
  }
468
475
 
469
476
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -495,7 +502,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
495
502
  kll_float_sketch sketch2(128, 0);
496
503
  const int n = 10000;
497
504
  for (int i = 0; i < n; i++) {
498
- sketch1.update(i);
505
+ sketch1.update(static_cast<float>(i));
499
506
  }
500
507
 
501
508
  // rank error should not be affected by a merge with an empty sketch with lower k
@@ -518,8 +525,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
518
525
  SECTION("merge min value from other") {
519
526
  kll_float_sketch sketch1(200, 0);
520
527
  kll_float_sketch sketch2(200, 0);
521
- sketch1.update(1);
522
- sketch2.update(2);
528
+ sketch1.update(1.0f);
529
+ sketch2.update(2.0f);
523
530
  sketch2.merge(sketch1);
524
531
  REQUIRE(sketch2.get_min_value() == 1.0f);
525
532
  REQUIRE(sketch2.get_max_value() == 2.0f);
@@ -527,7 +534,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
527
534
 
528
535
  SECTION("merge min and max values from other") {
529
536
  kll_float_sketch sketch1(200, 0);
530
- for (int i = 0; i < 1000000; i++) sketch1.update(i);
537
+ for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
531
538
  kll_float_sketch sketch2(200, 0);
532
539
  sketch2.merge(sketch1);
533
540
  REQUIRE(sketch2.get_min_value() == 0.0f);
@@ -540,7 +547,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
540
547
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
541
548
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
542
549
 
543
- const int n(1000);
550
+ const int n = 1000;
544
551
  for (int i = 0; i < n; i++) sketch.update(i);
545
552
 
546
553
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
@@ -679,6 +686,22 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
679
686
  }
680
687
  }
681
688
 
689
+ SECTION("max serialized size arithmetic type") {
690
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
691
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
692
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
693
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
694
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
695
+ }
696
+
697
+ SECTION("max serialized size non-arithmetic type") {
698
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
699
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
700
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
701
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
702
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
703
+ }
704
+
682
705
  // cleanup
683
706
  if (test_allocator_total_bytes != 0) {
684
707
  REQUIRE(test_allocator_total_bytes == 0);
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <random>
23
+
24
+ #include <kll_sketch.hpp>
25
+ #include <kolmogorov_smirnov.hpp>
26
+
27
+ namespace datasketches {
28
+
29
+ TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
30
+ const uint16_t k = 200;
31
+ kll_sketch<double> sketch1(k);
32
+ kll_sketch<double> sketch2(k);
33
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == 0);
34
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
35
+ }
36
+
37
+ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
38
+ const uint16_t k = 200;
39
+ kll_sketch<double> sketch1(k);
40
+ kll_sketch<double> sketch2(k);
41
+ std::default_random_engine rand;
42
+ std::normal_distribution<double> distr;
43
+ const int n = k * 3 - 1;
44
+ for (int i = 0; i < n; ++i) {
45
+ const double x = distr(rand);
46
+ sketch1.update(x);
47
+ sketch2.update(x);
48
+ }
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
50
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
+ }
52
+
53
+ TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
54
+ const uint16_t k = 200;
55
+ kll_sketch<double> sketch1(k);
56
+ kll_sketch<double> sketch2(k);
57
+ std::default_random_engine rand;
58
+ std::normal_distribution<double> distr;
59
+ const int n = k * 3 - 1;
60
+ for (int i = 0; i < n; ++i) {
61
+ const double x = distr(rand);
62
+ sketch1.update(x + 100.0);
63
+ sketch2.update(x);
64
+ }
65
+ const auto delta = kolmogorov_smirnov::delta(sketch1, sketch2);
66
+ REQUIRE(delta == Approx(1.0).margin(1e-6));
67
+ REQUIRE(delta <= 1);
68
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
69
+ }
70
+
71
+ TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
72
+ const uint16_t k = 2000;
73
+ kll_sketch<double> sketch1(k);
74
+ kll_sketch<double> sketch2(k);
75
+ std::default_random_engine rand;
76
+ std::normal_distribution<double> distr;
77
+ const int n = k * 3 - 1;
78
+ for (int i = 0; i < n; ++i) {
79
+ const double x = distr(rand);
80
+ sketch1.update(x + 0.05);
81
+ sketch2.update(x);
82
+ }
83
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
84
+ REQUIRE(delta == Approx(0.02).margin(0.01));
85
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
86
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
87
+ REQUIRE_FALSE(delta > threshold);
88
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
89
+ }
90
+
91
+ TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
92
+ const uint16_t k = 8000;
93
+ kll_sketch<double> sketch1(k);
94
+ kll_sketch<double> sketch2(k);
95
+ std::default_random_engine rand;
96
+ std::normal_distribution<double> distr;
97
+ const int n = k * 3 - 1;
98
+ for (int i = 0; i < n; ++i) {
99
+ const double x = distr(rand);
100
+ sketch1.update(x + 0.05);
101
+ sketch2.update(x);
102
+ }
103
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
104
+ REQUIRE(delta == Approx(0.02).margin(0.01));
105
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
106
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
107
+ REQUIRE(delta > threshold);
108
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
109
+ }
110
+
111
+ } /* namespace datasketches */