datasketches 0.1.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (205) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -48,14 +48,14 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
48
48
  test_allocator_total_bytes = 0;
49
49
 
50
50
  SECTION("k limits") {
51
- kll_float_sketch sketch1(kll_float_sketch::MIN_K); // this should work
52
- kll_float_sketch sketch2(kll_float_sketch::MAX_K); // this should work
53
- REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1), std::invalid_argument);
51
+ kll_float_sketch sketch1(kll_float_sketch::MIN_K, 0); // this should work
52
+ kll_float_sketch sketch2(kll_float_sketch::MAX_K, 0); // this should work
53
+ REQUIRE_THROWS_AS(new kll_float_sketch(kll_float_sketch::MIN_K - 1, 0), std::invalid_argument);
54
54
  // MAX_K + 1 makes no sense because k is uint16_t
55
55
  }
56
56
 
57
57
  SECTION("empty") {
58
- kll_float_sketch sketch;
58
+ kll_float_sketch sketch(200, 0);
59
59
  REQUIRE(sketch.is_empty());
60
60
  REQUIRE_FALSE(sketch.is_estimation_mode());
61
61
  REQUIRE(sketch.get_n() == 0);
@@ -70,29 +70,27 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
70
70
  REQUIRE(sketch.get_PMF(split_points, 1).size() == 0);
71
71
  REQUIRE(sketch.get_CDF(split_points, 1).size() == 0);
72
72
 
73
- int count = 0;
74
- for (auto& it: sketch) {
73
+ for (auto it: sketch) {
75
74
  (void) it; // to suppress "unused" warning
76
- ++count;
75
+ FAIL("should be no iterations over an empty sketch");
77
76
  }
78
- REQUIRE(count == 0);
79
77
  }
80
78
 
81
79
  SECTION("get bad quantile") {
82
- kll_float_sketch sketch;
80
+ kll_float_sketch sketch(200, 0);
83
81
  sketch.update(0); // has to be non-empty to reach the check
84
82
  REQUIRE_THROWS_AS(sketch.get_quantile(-1), std::invalid_argument);
85
83
  }
86
84
 
87
85
  SECTION("one item") {
88
- kll_float_sketch sketch;
89
- sketch.update(1);
86
+ kll_float_sketch sketch(200, 0);
87
+ sketch.update(1.0f);
90
88
  REQUIRE_FALSE(sketch.is_empty());
91
89
  REQUIRE_FALSE(sketch.is_estimation_mode());
92
90
  REQUIRE(sketch.get_n() == 1);
93
91
  REQUIRE(sketch.get_num_retained() == 1);
94
- REQUIRE(sketch.get_rank(1) == 0.0);
95
- REQUIRE(sketch.get_rank(2) == 1.0);
92
+ REQUIRE(sketch.get_rank(1.0f) == 0.0);
93
+ REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
94
  REQUIRE(sketch.get_min_value() == 1.0);
97
95
  REQUIRE(sketch.get_max_value() == 1.0);
98
96
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -104,7 +102,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
104
102
  REQUIRE(quantiles[2] == 1.0);
105
103
 
106
104
  int count = 0;
107
- for (auto& it: sketch) {
105
+ for (auto it: sketch) {
108
106
  REQUIRE(it.second == 1);
109
107
  ++count;
110
108
  }
@@ -112,20 +110,20 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
112
110
  }
113
111
 
114
112
  SECTION("NaN") {
115
- kll_float_sketch sketch;
113
+ kll_float_sketch sketch(200, 0);
116
114
  sketch.update(std::numeric_limits<float>::quiet_NaN());
117
115
  REQUIRE(sketch.is_empty());
118
116
 
119
- sketch.update(0.0);
117
+ sketch.update(0);
120
118
  sketch.update(std::numeric_limits<float>::quiet_NaN());
121
119
  REQUIRE(sketch.get_n() == 1);
122
120
  }
123
121
 
124
122
  SECTION("many items, exact mode") {
125
- kll_float_sketch sketch;
126
- const uint32_t n(200);
123
+ kll_float_sketch sketch(200, 0);
124
+ const uint32_t n = 200;
127
125
  for (uint32_t i = 0; i < n; i++) {
128
- sketch.update(i);
126
+ sketch.update(static_cast<float>(i));
129
127
  REQUIRE(sketch.get_n() == i + 1);
130
128
  }
131
129
  REQUIRE_FALSE(sketch.is_empty());
@@ -145,7 +143,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
145
143
 
146
144
  for (uint32_t i = 0; i < n; i++) {
147
145
  const double trueRank = (double) i / n;
148
- REQUIRE(sketch.get_rank(i) == trueRank);
146
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
149
147
  }
150
148
 
151
149
  // the alternative method must produce the same result
@@ -157,17 +155,17 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
157
155
  }
158
156
 
159
157
  SECTION("10 items") {
160
- kll_float_sketch sketch;
161
- sketch.update(1);
162
- sketch.update(2);
163
- sketch.update(3);
164
- sketch.update(4);
165
- sketch.update(5);
166
- sketch.update(6);
167
- sketch.update(7);
168
- sketch.update(8);
169
- sketch.update(9);
170
- sketch.update(10);
158
+ kll_float_sketch sketch(200, 0);
159
+ sketch.update(1.0f);
160
+ sketch.update(2.0f);
161
+ sketch.update(3.0f);
162
+ sketch.update(4.0f);
163
+ sketch.update(5.0f);
164
+ sketch.update(6.0f);
165
+ sketch.update(7.0f);
166
+ sketch.update(8.0f);
167
+ sketch.update(9.0f);
168
+ sketch.update(10.0f);
171
169
  REQUIRE(sketch.get_quantile(0) == 1.0);
172
170
  REQUIRE(sketch.get_quantile(0.5) == 6.0);
173
171
  REQUIRE(sketch.get_quantile(0.99) == 10.0);
@@ -175,8 +173,8 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
175
173
  }
176
174
 
177
175
  SECTION("100 items") {
178
- kll_float_sketch sketch;
179
- for (int i = 0; i < 100; ++i) sketch.update(i);
176
+ kll_float_sketch sketch(200, 0);
177
+ for (int i = 0; i < 100; ++i) sketch.update(static_cast<float>(i));
180
178
  REQUIRE(sketch.get_quantile(0) == 0);
181
179
  REQUIRE(sketch.get_quantile(0.01) == 1);
182
180
  REQUIRE(sketch.get_quantile(0.5) == 50);
@@ -185,10 +183,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
185
183
  }
186
184
 
187
185
  SECTION("many items, estimation mode") {
188
- kll_float_sketch sketch;
189
- const int n(1000000);
186
+ kll_float_sketch sketch(200, 0);
187
+ const int n = 1000000;
190
188
  for (int i = 0; i < n; i++) {
191
- sketch.update(i);
189
+ sketch.update(static_cast<float>(i));
192
190
  REQUIRE(sketch.get_n() == static_cast<uint64_t>(i + 1));
193
191
  }
194
192
  REQUIRE_FALSE(sketch.is_empty());
@@ -201,7 +199,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
201
199
  // test rank
202
200
  for (int i = 0; i < n; i++) {
203
201
  const double trueRank = (double) i / n;
204
- REQUIRE(sketch.get_rank(i) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
202
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == Approx(trueRank).margin(RANK_EPS_FOR_K_200));
205
203
  }
206
204
 
207
205
  // test quantiles at every 0.1 percentage point
@@ -224,15 +222,24 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
224
222
  }
225
223
 
226
224
  //std::cout << sketch.to_string();
225
+
226
+ uint32_t count = 0;
227
+ uint64_t total_weight = 0;
228
+ for (auto it: sketch) {
229
+ ++count;
230
+ total_weight += it.second;
231
+ }
232
+ REQUIRE(count == sketch.get_num_retained());
233
+ REQUIRE(total_weight == sketch.get_n());
227
234
  }
228
235
 
229
236
  SECTION("consistency between get_rank adn get_PMF/CDF") {
230
- kll_float_sketch sketch;
237
+ kll_float_sketch sketch(200, 0);
231
238
  const int n = 1000;
232
239
  float values[n];
233
240
  for (int i = 0; i < n; i++) {
234
- sketch.update(i);
235
- values[i] = i;
241
+ sketch.update(static_cast<float>(i));
242
+ values[i] = static_cast<float>(i);
236
243
  }
237
244
 
238
245
  const auto ranks(sketch.get_CDF(values, n));
@@ -256,7 +263,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
256
263
  std::ifstream is;
257
264
  is.exceptions(std::ios::failbit | std::ios::badbit);
258
265
  is.open(testBinaryInputPath + "kll_sketch_from_java.sk", std::ios::binary);
259
- auto sketch = kll_float_sketch::deserialize(is);
266
+ auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
260
267
  REQUIRE_FALSE(sketch.is_empty());
261
268
  REQUIRE(sketch.is_estimation_mode());
262
269
  REQUIRE(sketch.get_n() == 1000000);
@@ -266,12 +273,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
266
273
  }
267
274
 
268
275
  SECTION("stream serialize deserialize empty") {
269
- kll_float_sketch sketch;
276
+ kll_float_sketch sketch(200, 0);
270
277
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
271
278
  sketch.serialize(s);
272
279
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
273
- auto sketch2 = kll_float_sketch::deserialize(s);
280
+ auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
274
281
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
282
+ REQUIRE(s.tellg() == s.tellp());
275
283
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
276
284
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
277
285
  REQUIRE(sketch2.get_n() == sketch.get_n());
@@ -283,9 +291,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
283
291
  }
284
292
 
285
293
  SECTION("bytes serialize deserialize empty") {
286
- kll_float_sketch sketch;
294
+ kll_float_sketch sketch(200, 0);
287
295
  auto bytes = sketch.serialize();
288
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
296
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
289
297
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
290
298
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
291
299
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -297,13 +305,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
297
305
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
298
306
  }
299
307
 
300
- SECTION("serialize deserialize one item") {
301
- kll_float_sketch sketch;
302
- sketch.update(1);
308
+ SECTION("stream serialize deserialize one item") {
309
+ kll_float_sketch sketch(200, 0);
310
+ sketch.update(1.0f);
303
311
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
304
312
  sketch.serialize(s);
305
313
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
306
- auto sketch2 = kll_float_sketch::deserialize(s);
314
+ auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
307
315
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
308
316
  REQUIRE(s.tellg() == s.tellp());
309
317
  REQUIRE_FALSE(sketch2.is_empty());
@@ -317,11 +325,29 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
317
325
  REQUIRE(sketch2.get_rank(2) == 1.0);
318
326
  }
319
327
 
328
+ SECTION("bytes serialize deserialize one item") {
329
+ kll_float_sketch sketch(200, 0);
330
+ sketch.update(1.0f);
331
+ auto bytes = sketch.serialize();
332
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
333
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
334
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
335
+ REQUIRE_FALSE(sketch2.is_empty());
336
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
337
+ REQUIRE(sketch2.get_n() == 1);
338
+ REQUIRE(sketch2.get_num_retained() == 1);
339
+ REQUIRE(sketch2.get_min_value() == 1.0);
340
+ REQUIRE(sketch2.get_max_value() == 1.0);
341
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
342
+ REQUIRE(sketch2.get_rank(1) == 0.0);
343
+ REQUIRE(sketch2.get_rank(2) == 1.0);
344
+ }
345
+
320
346
  SECTION("deserialize one item v1") {
321
347
  std::ifstream is;
322
348
  is.exceptions(std::ios::failbit | std::ios::badbit);
323
349
  is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
324
- auto sketch = kll_float_sketch::deserialize(is);
350
+ auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
325
351
  REQUIRE_FALSE(sketch.is_empty());
326
352
  REQUIRE_FALSE(sketch.is_estimation_mode());
327
353
  REQUIRE(sketch.get_n() == 1);
@@ -330,14 +356,50 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
330
356
  REQUIRE(sketch.get_max_value() == 1.0);
331
357
  }
332
358
 
359
+ SECTION("stream serialize deserialize three items") {
360
+ kll_float_sketch sketch(200, 0);
361
+ sketch.update(1.0f);
362
+ sketch.update(2.0f);
363
+ sketch.update(3.0f);
364
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
365
+ sketch.serialize(s);
366
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
367
+ auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
368
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
369
+ REQUIRE(s.tellg() == s.tellp());
370
+ REQUIRE_FALSE(sketch2.is_empty());
371
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
372
+ REQUIRE(sketch2.get_n() == 3);
373
+ REQUIRE(sketch2.get_num_retained() == 3);
374
+ REQUIRE(sketch2.get_min_value() == 1.0);
375
+ REQUIRE(sketch2.get_max_value() == 3.0);
376
+ }
377
+
378
+ SECTION("bytes serialize deserialize three items") {
379
+ kll_float_sketch sketch(200, 0);
380
+ sketch.update(1.0f);
381
+ sketch.update(2.0f);
382
+ sketch.update(3.0f);
383
+ auto bytes = sketch.serialize();
384
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
385
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
386
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
387
+ REQUIRE_FALSE(sketch2.is_empty());
388
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
389
+ REQUIRE(sketch2.get_n() == 3);
390
+ REQUIRE(sketch2.get_num_retained() == 3);
391
+ REQUIRE(sketch2.get_min_value() == 1.0);
392
+ REQUIRE(sketch2.get_max_value() == 3.0);
393
+ }
394
+
333
395
  SECTION("stream serialize deserialize many floats") {
334
- kll_float_sketch sketch;
335
- const int n(1000);
336
- for (int i = 0; i < n; i++) sketch.update(i);
396
+ kll_float_sketch sketch(200, 0);
397
+ const int n = 1000;
398
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
337
399
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
338
400
  sketch.serialize(s);
339
401
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
340
- auto sketch2 = kll_float_sketch::deserialize(s);
402
+ auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
341
403
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
342
404
  REQUIRE(s.tellg() == s.tellp());
343
405
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
@@ -350,16 +412,16 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
350
412
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
351
413
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
352
414
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
353
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
415
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
354
416
  }
355
417
 
356
418
  SECTION("bytes serialize deserialize many floats") {
357
- kll_float_sketch sketch;
358
- const int n(1000);
359
- for (int i = 0; i < n; i++) sketch.update(i);
419
+ kll_float_sketch sketch(200, 0);
420
+ const int n = 1000;
421
+ for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
360
422
  auto bytes = sketch.serialize();
361
423
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
362
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size());
424
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
363
425
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
364
426
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
365
427
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -371,7 +433,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
371
433
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
372
434
  REQUIRE(sketch2.get_quantile(0.5) == sketch.get_quantile(0.5));
373
435
  REQUIRE(sketch2.get_rank(0) == sketch.get_rank(0));
374
- REQUIRE(sketch2.get_rank(n) == sketch.get_rank(n));
436
+ REQUIRE(sketch2.get_rank(static_cast<float>(n)) == sketch.get_rank(static_cast<float>(n)));
375
437
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 7), std::out_of_range);
376
438
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), 15), std::out_of_range);
377
439
  REQUIRE_THROWS_AS(kll_sketch<int>::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
@@ -379,7 +441,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
379
441
 
380
442
  SECTION("bytes serialize deserialize many ints") {
381
443
  kll_sketch<int> sketch;
382
- const int n(1000);
444
+ const int n = 1000;
383
445
  for (int i = 0; i < n; i++) sketch.update(i);
384
446
  auto bytes = sketch.serialize();
385
447
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
@@ -414,7 +476,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
414
476
  }
415
477
 
416
478
  SECTION("out of order split points, float") {
417
- kll_float_sketch sketch;
479
+ kll_float_sketch sketch(200, 0);
418
480
  sketch.update(0); // has to be non-empty to reach the check
419
481
  float split_points[2] = {1, 0};
420
482
  REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 2), std::invalid_argument);
@@ -428,19 +490,19 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
428
490
  }
429
491
 
430
492
  SECTION("NaN split point") {
431
- kll_float_sketch sketch;
493
+ kll_float_sketch sketch(200, 0);
432
494
  sketch.update(0); // has to be non-empty to reach the check
433
495
  float split_points[1] = {std::numeric_limits<float>::quiet_NaN()};
434
496
  REQUIRE_THROWS_AS(sketch.get_CDF(split_points, 1), std::invalid_argument);
435
497
  }
436
498
 
437
499
  SECTION("merge") {
438
- kll_float_sketch sketch1;
439
- kll_float_sketch sketch2;
500
+ kll_float_sketch sketch1(200, 0);
501
+ kll_float_sketch sketch2(200, 0);
440
502
  const int n = 10000;
441
503
  for (int i = 0; i < n; i++) {
442
- sketch1.update(i);
443
- sketch2.update((2 * n) - i - 1);
504
+ sketch1.update(static_cast<float>(i));
505
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
444
506
  }
445
507
 
446
508
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -458,12 +520,12 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
458
520
  }
459
521
 
460
522
  SECTION("merge lower k") {
461
- kll_float_sketch sketch1(256);
462
- kll_float_sketch sketch2(128);
523
+ kll_float_sketch sketch1(256, 0);
524
+ kll_float_sketch sketch2(128, 0);
463
525
  const int n = 10000;
464
526
  for (int i = 0; i < n; i++) {
465
- sketch1.update(i);
466
- sketch2.update((2 * n) - i - 1);
527
+ sketch1.update(static_cast<float>(i));
528
+ sketch2.update(static_cast<float>((2 * n) - i - 1));
467
529
  }
468
530
 
469
531
  REQUIRE(sketch1.get_min_value() == 0.0f);
@@ -471,6 +533,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
471
533
  REQUIRE(sketch2.get_min_value() == n);
472
534
  REQUIRE(sketch2.get_max_value() == 2.0f * n - 1);
473
535
 
536
+ REQUIRE(sketch1.get_k() == 256);
537
+ REQUIRE(sketch2.get_k() == 128);
538
+
474
539
  REQUIRE(sketch1.get_normalized_rank_error(false) < sketch2.get_normalized_rank_error(false));
475
540
  REQUIRE(sketch1.get_normalized_rank_error(true) < sketch2.get_normalized_rank_error(true));
476
541
 
@@ -488,11 +553,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
488
553
  }
489
554
 
490
555
  SECTION("merge exact mode, lower k") {
491
- kll_float_sketch sketch1(256);
492
- kll_float_sketch sketch2(128);
556
+ kll_float_sketch sketch1(256, 0);
557
+ kll_float_sketch sketch2(128, 0);
493
558
  const int n = 10000;
494
559
  for (int i = 0; i < n; i++) {
495
- sketch1.update(i);
560
+ sketch1.update(static_cast<float>(i));
496
561
  }
497
562
 
498
563
  // rank error should not be affected by a merge with an empty sketch with lower k
@@ -513,19 +578,19 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
513
578
  }
514
579
 
515
580
  SECTION("merge min value from other") {
516
- kll_float_sketch sketch1;
517
- kll_float_sketch sketch2;
518
- sketch1.update(1);
519
- sketch2.update(2);
581
+ kll_float_sketch sketch1(200, 0);
582
+ kll_float_sketch sketch2(200, 0);
583
+ sketch1.update(1.0f);
584
+ sketch2.update(2.0f);
520
585
  sketch2.merge(sketch1);
521
586
  REQUIRE(sketch2.get_min_value() == 1.0f);
522
587
  REQUIRE(sketch2.get_max_value() == 2.0f);
523
588
  }
524
589
 
525
590
  SECTION("merge min and max values from other") {
526
- kll_float_sketch sketch1;
527
- for (int i = 0; i < 1000000; i++) sketch1.update(i);
528
- kll_float_sketch sketch2;
591
+ kll_float_sketch sketch1(200, 0);
592
+ for (int i = 0; i < 1000000; i++) sketch1.update(static_cast<float>(i));
593
+ kll_float_sketch sketch2(200, 0);
529
594
  sketch2.merge(sketch1);
530
595
  REQUIRE(sketch2.get_min_value() == 0.0f);
531
596
  REQUIRE(sketch2.get_max_value() == 999999.0f);
@@ -537,7 +602,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
537
602
  REQUIRE_THROWS_AS(sketch.get_min_value(), std::runtime_error);
538
603
  REQUIRE_THROWS_AS(sketch.get_max_value(), std::runtime_error);
539
604
 
540
- const int n(1000);
605
+ const int n = 1000;
541
606
  for (int i = 0; i < n; i++) sketch.update(i);
542
607
 
543
608
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
@@ -560,7 +625,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
560
625
  }
561
626
 
562
627
  SECTION("sketch of strings stream") {
563
- kll_string_sketch sketch1;
628
+ kll_string_sketch sketch1(200, 0);
564
629
  REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
565
630
  REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
566
631
  REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
@@ -575,7 +640,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
575
640
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
576
641
  sketch1.serialize(s);
577
642
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch1.get_serialized_size_bytes());
578
- auto sketch2 = kll_string_sketch::deserialize(s);
643
+ auto sketch2 = kll_string_sketch::deserialize(s, test_allocator<std::string>(0));
579
644
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
580
645
  REQUIRE(s.tellg() == s.tellp());
581
646
  REQUIRE(sketch2.is_empty() == sketch1.is_empty());
@@ -599,7 +664,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
599
664
  }
600
665
 
601
666
  SECTION("sketch of strings bytes") {
602
- kll_string_sketch sketch1;
667
+ kll_string_sketch sketch1(200, 0);
603
668
  REQUIRE_THROWS_AS(sketch1.get_quantile(0), std::runtime_error);
604
669
  REQUIRE_THROWS_AS(sketch1.get_min_value(), std::runtime_error);
605
670
  REQUIRE_THROWS_AS(sketch1.get_max_value(), std::runtime_error);
@@ -613,7 +678,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
613
678
 
614
679
  auto bytes = sketch1.serialize();
615
680
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
616
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
681
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
617
682
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
618
683
  REQUIRE(sketch2.is_empty() == sketch1.is_empty());
619
684
  REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
@@ -630,11 +695,11 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
630
695
 
631
696
 
632
697
  SECTION("sketch of strings, single item, bytes") {
633
- kll_string_sketch sketch1;
698
+ kll_string_sketch sketch1(200, 0);
634
699
  sketch1.update("a");
635
700
  auto bytes = sketch1.serialize();
636
701
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
637
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size());
702
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
638
703
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
639
704
  }
640
705
 
@@ -676,6 +741,31 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
676
741
  }
677
742
  }
678
743
 
744
+ SECTION("max serialized size arithmetic type") {
745
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 10) == 1968);
746
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 100) == 2316);
747
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000) == 2440);
748
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000) == 2800);
749
+ REQUIRE(kll_sketch<float>::get_max_serialized_size_bytes(200, 1000000000) == 3160);
750
+ }
751
+
752
+ SECTION("max serialized size non-arithmetic type") {
753
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 10, 4) == 1968);
754
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 100, 4) == 2316);
755
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000, 4) == 2440);
756
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000, 4) == 2800);
757
+ REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
758
+ }
759
+
760
+ SECTION("issue #236") {
761
+ kll_sketch<int8_t> kll;
762
+ kll.update(1);
763
+ kll.update(2);
764
+ kll.update(3);
765
+ auto blob = kll.serialize();
766
+ auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
767
+ }
768
+
679
769
  // cleanup
680
770
  if (test_allocator_total_bytes != 0) {
681
771
  REQUIRE(test_allocator_total_bytes == 0);
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+
22
+ #include <random>
23
+
24
+ #include <kll_sketch.hpp>
25
+ #include <kolmogorov_smirnov.hpp>
26
+
27
+ namespace datasketches {
28
+
29
+ TEST_CASE("kolmogorov-smirnov empty", "[kll_sketch]") {
30
+ const uint16_t k = 200;
31
+ kll_sketch<double> sketch1(k);
32
+ kll_sketch<double> sketch2(k);
33
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == 0);
34
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
35
+ }
36
+
37
+ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
38
+ const uint16_t k = 200;
39
+ kll_sketch<double> sketch1(k);
40
+ kll_sketch<double> sketch2(k);
41
+ std::default_random_engine rand;
42
+ std::normal_distribution<double> distr;
43
+ const int n = k * 3 - 1;
44
+ for (int i = 0; i < n; ++i) {
45
+ const double x = distr(rand);
46
+ sketch1.update(x);
47
+ sketch2.update(x);
48
+ }
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
50
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
+ }
52
+
53
+ TEST_CASE("kolmogorov-smirnov very different distributions", "[kll_sketch]") {
54
+ const uint16_t k = 200;
55
+ kll_sketch<double> sketch1(k);
56
+ kll_sketch<double> sketch2(k);
57
+ std::default_random_engine rand;
58
+ std::normal_distribution<double> distr;
59
+ const int n = k * 3 - 1;
60
+ for (int i = 0; i < n; ++i) {
61
+ const double x = distr(rand);
62
+ sketch1.update(x + 100.0);
63
+ sketch2.update(x);
64
+ }
65
+ const auto delta = kolmogorov_smirnov::delta(sketch1, sketch2);
66
+ REQUIRE(delta == Approx(1.0).margin(1e-6));
67
+ REQUIRE(delta <= 1);
68
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
69
+ }
70
+
71
+ TEST_CASE("kolmogorov-smirnov slightly different distributions", "[kll_sketch]") {
72
+ const uint16_t k = 2000;
73
+ kll_sketch<double> sketch1(k);
74
+ kll_sketch<double> sketch2(k);
75
+ std::default_random_engine rand;
76
+ std::normal_distribution<double> distr;
77
+ const int n = k * 3 - 1;
78
+ for (int i = 0; i < n; ++i) {
79
+ const double x = distr(rand);
80
+ sketch1.update(x + 0.05);
81
+ sketch2.update(x);
82
+ }
83
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
84
+ REQUIRE(delta == Approx(0.02).margin(0.01));
85
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
86
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
87
+ REQUIRE_FALSE(delta > threshold);
88
+ REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
89
+ }
90
+
91
+ TEST_CASE("kolmogorov-smirnov slightly different distributions high resolution", "[kll_sketch]") {
92
+ const uint16_t k = 8000;
93
+ kll_sketch<double> sketch1(k);
94
+ kll_sketch<double> sketch2(k);
95
+ std::default_random_engine rand;
96
+ std::normal_distribution<double> distr;
97
+ const int n = k * 3 - 1;
98
+ for (int i = 0; i < n; ++i) {
99
+ const double x = distr(rand);
100
+ sketch1.update(x + 0.05);
101
+ sketch2.update(x);
102
+ }
103
+ const double delta = kolmogorov_smirnov::delta(sketch1, sketch2);
104
+ REQUIRE(delta == Approx(0.02).margin(0.01));
105
+ const double threshold = kolmogorov_smirnov::threshold(sketch1, sketch2, 0.05);
106
+ //std::cout << "delta=" << delta << ", threshold=" << threshold << "\n";
107
+ REQUIRE(delta > threshold);
108
+ REQUIRE(kolmogorov_smirnov::test(sketch1, sketch2, 0.05));
109
+ }
110
+
111
+ } /* namespace datasketches */
@@ -1,8 +1,10 @@
1
1
  [build-system]
2
2
  requires = ["wheel",
3
3
  "setuptools >= 30.3.0",
4
- "setuptools_scm",
5
- "cmake >= 3.12"]
4
+ "cmake >= 3.12",
5
+ "pip >= 10.0",
6
+ "pybind11[global] >= 2.6.0"]
7
+ build-backend = "setuptools.build_meta"
6
8
 
7
9
  [tool.tox]
8
10
  legacy_tox_ini = """