datasketches 0.1.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ class kolmogorov_smirnov {
26
+ public:
27
+ /**
28
+ * Computes the raw delta area between two KLL quantile sketches for the Kolmogorov-Smirnov Test.
29
+ * @param sketch1 KLL sketch 1
30
+ * @param sketch2 KLL sketch 2
31
+ * @return the raw delta between two KLL quantile sketches
32
+ */
33
+ template<typename Sketch>
34
+ static double delta(const Sketch& sketch1, const Sketch& sketch2);
35
+
36
+ /**
37
+ * Computes the adjusted delta area threshold for the Kolmogorov-Smirnov Test.
38
+ * Adjusts the computed threshold by the error epsilons of the two given sketches.
39
+ * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a>
40
+ * @param sketch1 KLL sketch 1
41
+ * @param sketch2 KLL sketch 2
42
+ * @param p Target p-value. Typically .001 to .1, e.g., .05.
43
+ * @return the adjusted threshold to be compared with the raw delta
44
+ */
45
+ template<typename Sketch>
46
+ static double threshold(const Sketch& sketch1, const Sketch& sketch2, double p);
47
+
48
+ /**
49
+ * Performs the Kolmogorov-Smirnov Test between two KLL quantiles sketches.
50
+ * Note: if the given sketches have insufficient data or if the sketch sizes are too small,
51
+ * this will return false.
52
+ * @param sketch1 KLL sketch 1
53
+ * @param sketch2 KLL sketch 2
54
+ * @param p Target p-value. Typically .001 to .1, e.g., .05.
55
+ * @return Boolean indicating whether we can reject the null hypothesis (that the sketches
56
+ * reflect the same underlying distribution) using the provided p-value.
57
+ */
58
+ template<typename Sketch>
59
+ static bool test(const Sketch& sketch1, const Sketch& sketch2, double p);
60
+
61
+ };
62
+
63
+ } /* namespace datasketches */
64
+
65
+ #include "kolmogorov_smirnov_impl.hpp"
66
+
67
+ #endif
@@ -0,0 +1,78 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KOLMOGOROV_SMIRNOV_IMPL_HPP_
21
+ #define KOLMOGOROV_SMIRNOV_IMPL_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ // type resolver
26
+ template<typename T, typename C, typename S, typename A>
27
+ kll_quantile_calculator<T, C, A> make_quantile_calculator(const kll_sketch<T, C, S, A>& sketch) {
28
+ return kll_quantile_calculator<T, C, A>(sketch);
29
+ }
30
+
31
+ template<typename Sketch>
32
+ double kolmogorov_smirnov::delta(const Sketch& sketch1, const Sketch& sketch2) {
33
+ using Comparator = typename Sketch::comparator;
34
+ auto calc1 = make_quantile_calculator(sketch1);
35
+ auto calc2 = make_quantile_calculator(sketch2);
36
+ auto it1 = calc1.begin();
37
+ auto it2 = calc2.begin();
38
+ const auto n1 = sketch1.get_n();
39
+ const auto n2 = sketch2.get_n();
40
+ double delta = 0;
41
+ while (it1 != calc1.end() && it2 != calc2.end()) {
42
+ const double norm_cum_wt1 = static_cast<double>((*it1).second) / n1;
43
+ const double norm_cum_wt2 = static_cast<double>((*it2).second) / n2;
44
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
45
+ if (Comparator()((*it1).first, (*it2).first)) {
46
+ ++it1;
47
+ } else if (Comparator()((*it2).first, (*it1).first)) {
48
+ ++it2;
49
+ } else {
50
+ ++it1;
51
+ ++it2;
52
+ }
53
+ }
54
+ const double norm_cum_wt1 = it1 == calc1.end() ? 1 : static_cast<double>((*it1).second) / n1;
55
+ const double norm_cum_wt2 = it2 == calc2.end() ? 1 : static_cast<double>((*it2).second) / n2;
56
+ delta = std::max(delta, std::abs(norm_cum_wt1 - norm_cum_wt2));
57
+ return delta;
58
+ }
59
+
60
+ template<typename Sketch>
61
+ double kolmogorov_smirnov::threshold(const Sketch& sketch1, const Sketch& sketch2, double p) {
62
+ const double r1 = sketch1.get_num_retained();
63
+ const double r2 = sketch2.get_num_retained();
64
+ const double alpha_factor = sqrt(-0.5 * log(0.5 * p));
65
+ const double delta_area_threshold = alpha_factor * sqrt((r1 + r2) / (r1 * r2));
66
+ const double eps1 = sketch1.get_normalized_rank_error(false);
67
+ const double eps2 = sketch2.get_normalized_rank_error(false);
68
+ return delta_area_threshold + eps1 + eps2;
69
+ }
70
+
71
+ template<typename Sketch>
72
+ bool kolmogorov_smirnov::test(const Sketch& sketch1, const Sketch& sketch2, double p) {
73
+ return delta(sketch1, sketch2) > threshold(sketch1, sketch2, p);
74
+ }
75
+
76
+ } /* namespace datasketches */
77
+
78
+ #endif
@@ -41,4 +41,5 @@ target_sources(kll_test
41
41
  kll_sketch_test.cpp
42
42
  kll_sketch_custom_type_test.cpp
43
43
  kll_sketch_validation.cpp
44
+ kolmogorov_smirnov_test.cpp
44
45
  )
@@ -26,7 +26,8 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- typedef kll_sketch<test_type, test_type_less, test_type_serde, test_allocator<test_type>> kll_test_type_sketch;
29
+ using kll_test_type_sketch = kll_sketch<test_type, test_type_less, test_type_serde, test_allocator<test_type>>;
30
+ using alloc = test_allocator<test_type>;
30
31
 
31
32
  TEST_CASE("kll sketch custom type", "[kll_sketch]") {
32
33
 
@@ -34,7 +35,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
34
35
  test_allocator_total_bytes = 0;
35
36
 
36
37
  SECTION("compact level zero") {
37
- kll_test_type_sketch sketch(8);
38
+ kll_test_type_sketch sketch(8, 0);
38
39
  REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
39
40
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
40
41
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
@@ -59,10 +60,10 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
59
60
  }
60
61
 
61
62
  SECTION("merge small") {
62
- kll_test_type_sketch sketch1(8);
63
+ kll_test_type_sketch sketch1(8, 0);
63
64
  sketch1.update(1);
64
65
 
65
- kll_test_type_sketch sketch2(8);
66
+ kll_test_type_sketch sketch2(8, 0);
66
67
  sketch2.update(2);
67
68
 
68
69
  sketch2.merge(sketch1);
@@ -76,7 +77,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
76
77
  }
77
78
 
78
79
  SECTION("merge higher levels") {
79
- kll_test_type_sketch sketch1(8);
80
+ kll_test_type_sketch sketch1(8, 0);
80
81
  sketch1.update(1);
81
82
  sketch1.update(2);
82
83
  sketch1.update(3);
@@ -87,7 +88,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
87
88
  sketch1.update(8);
88
89
  sketch1.update(9);
89
90
 
90
- kll_test_type_sketch sketch2(8);
91
+ kll_test_type_sketch sketch2(8, 0);
91
92
  sketch2.update(10);
92
93
  sketch2.update(11);
93
94
  sketch2.update(12);
@@ -109,7 +110,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
109
110
  }
110
111
 
111
112
  SECTION("serialize deserialize") {
112
- kll_test_type_sketch sketch1;
113
+ kll_test_type_sketch sketch1(200, 0);
113
114
 
114
115
  const int n = 1000;
115
116
  for (int i = 0; i < n; i++) sketch1.update(i);
@@ -117,7 +118,7 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
117
118
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
118
119
  sketch1.serialize(s);
119
120
  REQUIRE((size_t) s.tellp() == sketch1.get_serialized_size_bytes());
120
- auto sketch2 = kll_test_type_sketch::deserialize(s);
121
+ auto sketch2 = kll_test_type_sketch::deserialize(s, alloc(0));
121
122
  REQUIRE((size_t) s.tellg() == sketch2.get_serialized_size_bytes());
122
123
  REQUIRE(s.tellg() == s.tellp());
123
124
  REQUIRE(sketch2.is_empty() == sketch1.is_empty());
@@ -135,9 +136,9 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
135
136
  }
136
137
 
137
138
  SECTION("moving merge") {
138
- kll_test_type_sketch sketch1(8);
139
+ kll_test_type_sketch sketch1(8, 0);
139
140
  for (int i = 0; i < 10; i++) sketch1.update(i);
140
- kll_test_type_sketch sketch2(8);
141
+ kll_test_type_sketch sketch2(8, 0);
141
142
  sketch2.update(10);
142
143
  sketch2.merge(std::move(sketch1));
143
144
  REQUIRE(sketch2.get_min_value().get_value() == 0);