datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -27,7 +27,7 @@ namespace datasketches {
27
27
  using hll_sketch_test_alloc = hll_sketch_alloc<test_allocator<uint8_t>>;
28
28
  using alloc = test_allocator<uint8_t>;
29
29
 
30
- static void runCheckCopy(int lgConfigK, target_hll_type tgtHllType) {
30
+ static void runCheckCopy(uint8_t lgConfigK, target_hll_type tgtHllType) {
31
31
  hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
32
32
 
33
33
  for (int i = 0; i < 7; ++i) {
@@ -66,7 +66,7 @@ TEST_CASE("hll sketch: check copies", "[hll_sketch]") {
66
66
  }
67
67
 
68
68
  static void copyAs(target_hll_type srcType, target_hll_type dstType) {
69
- int lgK = 8;
69
+ uint8_t lgK = 8;
70
70
  int n1 = 7;
71
71
  int n2 = 24;
72
72
  int n3 = 1000;
@@ -109,7 +109,7 @@ TEST_CASE("hll sketch: check copy as", "[hll_sketch]") {
109
109
  TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
110
110
  test_allocator_total_bytes = 0;
111
111
  {
112
- int lgConfigK = 8;
112
+ uint8_t lgConfigK = 8;
113
113
  target_hll_type srcType = target_hll_type::HLL_8;
114
114
  hll_sketch_test_alloc sk(lgConfigK, srcType, false, 0);
115
115
 
@@ -124,7 +124,7 @@ TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
124
124
  sk.update(24); // HLL
125
125
  REQUIRE(sk.get_updatable_serialization_bytes() == 40 + 256);
126
126
 
127
- const int hllBytes = HllUtil<>::HLL_BYTE_ARR_START + (1 << lgConfigK);
127
+ const auto hllBytes = hll_constants::HLL_BYTE_ARR_START + (1 << lgConfigK);
128
128
  REQUIRE(sk.get_compact_serialization_bytes() == hllBytes);
129
129
  REQUIRE(hll_sketch::get_max_updatable_serialization_bytes(lgConfigK, HLL_8) == hllBytes);
130
130
  }
@@ -135,22 +135,22 @@ TEST_CASE("hll sketch: check num std dev", "[hll_sketch]") {
135
135
  REQUIRE_THROWS_AS(HllUtil<>::checkNumStdDev(0), std::invalid_argument);
136
136
  }
137
137
 
138
- void checkSerializationSizes(const int lgConfigK, target_hll_type tgtHllType) {
138
+ void checkSerializationSizes(uint8_t lgConfigK, target_hll_type tgtHllType) {
139
139
  hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
140
140
  int i;
141
141
 
142
142
  // LIST
143
143
  for (i = 0; i < 7; ++i) { sk.update(i); }
144
- int expected = HllUtil<>::LIST_INT_ARR_START + (i << 2);
144
+ auto expected = hll_constants::LIST_INT_ARR_START + (i << 2);
145
145
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
146
- expected = HllUtil<>::LIST_INT_ARR_START + (4 << HllUtil<>::LG_INIT_LIST_SIZE);
146
+ expected = hll_constants::LIST_INT_ARR_START + (4 << hll_constants::LG_INIT_LIST_SIZE);
147
147
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);
148
148
 
149
149
  // SET
150
150
  for (i = 7; i < 24; ++i) { sk.update(i); }
151
- expected = HllUtil<>::HASH_SET_INT_ARR_START + (i << 2);
151
+ expected = hll_constants::HASH_SET_INT_ARR_START + (i << 2);
152
152
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
153
- expected = HllUtil<>::HASH_SET_INT_ARR_START + (4 << HllUtil<>::LG_INIT_SET_SIZE);
153
+ expected = hll_constants::HASH_SET_INT_ARR_START + (4 << hll_constants::LG_INIT_SET_SIZE);
154
154
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);
155
155
  }
156
156
 
@@ -178,7 +178,7 @@ TEST_CASE("hll sketch: exercise to string", "[hll_sketch]") {
178
178
 
179
179
  // Creates and serializes then deserializes sketch.
180
180
  // Returns true if deserialized sketch is compact.
181
- static bool checkCompact(const int lgK, const int n, const target_hll_type type, bool compact) {
181
+ static bool checkCompact(uint8_t lgK, const int n, const target_hll_type type, bool compact) {
182
182
  hll_sketch_test_alloc sk(lgK, type, false, 0);
183
183
  for (int i = 0; i < n; ++i) { sk.update(i); }
184
184
 
@@ -201,7 +201,7 @@ static bool checkCompact(const int lgK, const int n, const target_hll_type type,
201
201
  TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
202
202
  test_allocator_total_bytes = 0;
203
203
  {
204
- int lgK = 8;
204
+ uint8_t lgK = 8;
205
205
  // unless/until we create non-updatable "direct" versions,
206
206
  // deserialized image should never be compact
207
207
  // LIST: follows serialization request
@@ -230,10 +230,10 @@ TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
230
230
  TEST_CASE("hll sketch: check k limits", "[hll_sketch]") {
231
231
  test_allocator_total_bytes = 0;
232
232
  {
233
- hll_sketch_test_alloc sketch1(HllUtil<>::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
234
- hll_sketch_test_alloc sketch2(HllUtil<>::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
235
- REQUIRE_THROWS_AS(hll_sketch_test_alloc(HllUtil<>::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
236
- REQUIRE_THROWS_AS(hll_sketch_test_alloc(HllUtil<>::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
233
+ hll_sketch_test_alloc sketch1(hll_constants::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
234
+ hll_sketch_test_alloc sketch2(hll_constants::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
235
+ REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
236
+ REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
237
237
  }
238
238
  REQUIRE(test_allocator_total_bytes == 0);
239
239
  }
@@ -24,23 +24,19 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
- static int min(int a, int b) {
28
- return (a < b) ? a : b;
29
- }
30
-
31
27
  static void println(std::string& str) {
32
28
  //std::cout << str << "\n";
33
29
  }
34
30
 
35
31
  static void basicUnion(uint64_t n1, uint64_t n2,
36
- uint64_t lgk1, uint64_t lgk2, uint64_t lgMaxK,
32
+ uint8_t lgk1, uint8_t lgk2, uint8_t lgMaxK,
37
33
  target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
38
34
  uint64_t v = 0;
39
35
  //int tot = n1 + n2;
40
36
 
41
37
  hll_sketch h1(lgk1, type1);
42
38
  hll_sketch h2(lgk2, type2);
43
- int lgControlK = min(min(lgk1, lgk2), lgMaxK);
39
+ uint8_t lgControlK = std::min(std::min(lgk1, lgk2), lgMaxK);
44
40
  hll_sketch control(lgControlK, resultType);
45
41
 
46
42
  for (uint64_t i = 0; i < n1; ++i) {
@@ -89,9 +85,9 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
89
85
  target_hll_type type2 = HLL_8;
90
86
  target_hll_type resultType = HLL_8;
91
87
 
92
- uint64_t lgK1 = 7;
93
- uint64_t lgK2 = 7;
94
- uint64_t lgMaxK = 7;
88
+ uint8_t lgK1 = 7;
89
+ uint8_t lgK2 = 7;
90
+ uint8_t lgMaxK = 7;
95
91
  uint64_t n1 = 7;
96
92
  uint64_t n2 = 7;
97
93
  basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
@@ -108,7 +104,7 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
108
104
  n2 = 14;
109
105
  basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
110
106
 
111
- int i = 0;
107
+ uint8_t i = 0;
112
108
  for (i = 7; i <= 13; ++i) {
113
109
  lgK1 = i;
114
110
  lgK2 = i;
@@ -184,9 +180,9 @@ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
184
180
  }
185
181
 
186
182
  TEST_CASE("hll union: check config k limits", "[hll_union]") {
187
- REQUIRE_THROWS_AS(hll_union(HllUtil<>::MIN_LOG_K - 1), std::invalid_argument);
183
+ REQUIRE_THROWS_AS(hll_union(hll_constants::MIN_LOG_K - 1), std::invalid_argument);
188
184
 
189
- REQUIRE_THROWS_AS(hll_union(HllUtil<>::MAX_LOG_K + 1), std::invalid_argument);
185
+ REQUIRE_THROWS_AS(hll_union(hll_constants::MAX_LOG_K + 1), std::invalid_argument);
190
186
  }
191
187
 
192
188
  static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
@@ -195,7 +191,7 @@ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est
195
191
  }
196
192
 
197
193
  TEST_CASE("hll union: check ub lb", "[hll_union]") {
198
- int lgK = 4;
194
+ uint8_t lgK = 4;
199
195
  int n = 1 << 20;
200
196
  bool oooFlag = false;
201
197
 
@@ -223,7 +219,7 @@ TEST_CASE("hll union: check ub lb", "[hll_union]") {
223
219
  }
224
220
 
225
221
  TEST_CASE("hll union: check conversions", "[hll_union]") {
226
- int lgK = 4;
222
+ uint8_t lgK = 4;
227
223
  hll_sketch sk1(lgK, HLL_8);
228
224
  hll_sketch sk2(lgK, HLL_8);
229
225
  int n = 1 << 20;
@@ -57,7 +57,7 @@ static int get_n(int lg_k, hll_mode mode) {
57
57
 
58
58
  static long v = 0;
59
59
 
60
- static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode) {
60
+ static hll_sketch build_sketch(uint8_t lg_k, target_hll_type hll_type, hll_mode mode) {
61
61
  hll_sketch sk(lg_k, hll_type);
62
62
  int n = get_n(lg_k, mode);
63
63
  for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
@@ -67,7 +67,7 @@ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode
67
67
 
68
68
  // merges a sketch to an empty union and gets result of the same type, checks binary equivalence
69
69
  static void union_one_update(bool compact) {
70
- for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
70
+ for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
71
71
  for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
72
72
  if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
73
73
  for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
@@ -102,7 +102,7 @@ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic
102
102
 
103
103
  // converts a sketch to a different type and converts back to the original type to check binary equivalence
104
104
  static void convert_back_and_forth(bool compact) {
105
- for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
105
+ for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
106
106
  for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
107
107
  if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
108
108
  for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
@@ -44,11 +44,11 @@ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
44
44
  auto ser2 = sk.serialize_updatable();
45
45
 
46
46
  REQUIRE(ser1.size() == ser2.size());
47
- int len = ser1.size();
47
+ size_t len = ser1.size();
48
48
  uint8_t* b1 = ser1.data();
49
49
  uint8_t* b2 = ser2.data();
50
50
 
51
- for (int i = 0; i < len; ++i) {
51
+ for (size_t i = 0; i < len; ++i) {
52
52
  REQUIRE(b2[i] == b1[i]);
53
53
  }
54
54
  }
@@ -129,7 +129,7 @@ static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
129
129
  REQUIRE(sk1.get_target_type() == sk2.get_target_type());
130
130
  }
131
131
 
132
- static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const int n) {
132
+ static void toFrom(const uint8_t lgConfigK, const target_hll_type tgtHllType, const int n) {
133
133
  hll_sketch src(lgConfigK, tgtHllType);
134
134
  for (int i = 0; i < n; ++i) {
135
135
  src.update(i);
@@ -157,7 +157,7 @@ static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const
157
157
  TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
158
158
  for (int i = 0; i < 10; ++i) {
159
159
  int n = nArr[i];
160
- for (int lgK = 4; lgK <= 13; ++lgK) {
160
+ for (uint8_t lgK = 4; lgK <= 13; ++lgK) {
161
161
  toFrom(lgK, HLL_4, n);
162
162
  toFrom(lgK, HLL_6, n);
163
163
  toFrom(lgK, HLL_8, n);
@@ -32,27 +32,17 @@ target_include_directories(kll
32
32
  target_link_libraries(kll INTERFACE common)
33
33
  target_compile_features(kll INTERFACE cxx_std_11)
34
34
 
35
- set(kll_HEADERS "")
36
- list(APPEND kll_HEADERS "include/kll_sketch.hpp")
37
- list(APPEND kll_HEADERS "include/kll_sketch_impl.hpp")
38
- list(APPEND kll_HEADERS "include/kll_helper.hpp")
39
- list(APPEND kll_HEADERS "include/kll_helper_impl.hpp")
40
- list(APPEND kll_HEADERS "include/kll_quantile_calculator.hpp")
41
- list(APPEND kll_HEADERS "include/kll_quantile_calculator_impl.hpp")
42
-
43
35
  install(TARGETS kll
44
36
  EXPORT ${PROJECT_NAME}
45
37
  )
46
38
 
47
- install(FILES ${kll_HEADERS}
39
+ install(FILES
40
+ include/kll_sketch.hpp
41
+ include/kll_sketch_impl.hpp
42
+ include/kll_helper.hpp
43
+ include/kll_helper_impl.hpp
44
+ include/kll_quantile_calculator.hpp
45
+ include/kll_quantile_calculator_impl.hpp
46
+ include/kolmogorov_smirnov.hpp
47
+ include/kolmogorov_smirnov_impl.hpp
48
48
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
49
-
50
- target_sources(kll
51
- INTERFACE
52
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
54
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
55
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
56
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
57
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
58
- )
@@ -26,7 +26,8 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- static std::independent_bits_engine<std::mt19937, 1, uint32_t> random_bit(std::chrono::system_clock::now().time_since_epoch().count());
29
+ static std::independent_bits_engine<std::mt19937, 1, uint32_t>
30
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
30
31
 
31
32
  #ifdef KLL_VALIDATION
32
33
  extern uint32_t kll_next_offset;
@@ -46,9 +47,9 @@ class kll_helper {
46
47
  static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);
47
48
  static inline uint8_t ub_on_num_levels(uint64_t n);
48
49
  static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);
49
- static inline uint32_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
50
- static inline uint32_t int_cap_aux(uint16_t k, uint8_t depth);
51
- static inline uint32_t int_cap_aux_aux(uint16_t k, uint8_t depth);
50
+ static inline uint16_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
51
+ static inline uint16_t int_cap_aux(uint16_t k, uint8_t depth);
52
+ static inline uint16_t int_cap_aux_aux(uint16_t k, uint8_t depth);
52
53
  static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);
53
54
 
54
55
  /*
@@ -55,28 +55,28 @@ uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_l
55
55
  return total;
56
56
  }
57
57
 
58
- uint32_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
58
+ uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
59
59
  if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
60
60
  const uint8_t depth = numLevels - height - 1;
61
- return std::max((uint32_t) min_wid, int_cap_aux(k, depth));
61
+ return std::max<uint16_t>(min_wid, int_cap_aux(k, depth));
62
62
  }
63
63
 
64
- uint32_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
64
+ uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
65
65
  if (depth > 60) throw std::invalid_argument("depth > 60");
66
66
  if (depth <= 30) return int_cap_aux_aux(k, depth);
67
67
  const uint8_t half = depth / 2;
68
68
  const uint8_t rest = depth - half;
69
- const uint32_t tmp = int_cap_aux_aux(k, half);
69
+ const uint16_t tmp = int_cap_aux_aux(k, half);
70
70
  return int_cap_aux_aux(tmp, rest);
71
71
  }
72
72
 
73
- uint32_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
73
+ uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
74
74
  if (depth > 30) throw std::invalid_argument("depth > 30");
75
75
  const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
76
76
  const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
77
77
  const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
78
78
  if (result > k) throw std::logic_error("result > k");
79
- return result;
79
+ return static_cast<uint16_t>(result);
80
80
  }
81
81
 
82
82
  uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
@@ -24,19 +24,27 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
+ // forward declaration
28
+ template<typename T, typename C, typename S, typename A> class kll_sketch;
29
+
27
30
  template <typename T, typename C, typename A>
28
31
  class kll_quantile_calculator {
29
32
  public:
30
- // assumes that all levels are sorted including level 0
31
- kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator);
33
+ using Entry = std::pair<T, uint64_t>;
34
+ using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
35
+ using Container = std::vector<Entry, AllocEntry>;
36
+ using const_iterator = typename Container::const_iterator;
37
+
38
+ template<typename S>
39
+ kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
40
+
32
41
  T get_quantile(double fraction) const;
42
+ const_iterator begin() const;
43
+ const_iterator end() const;
33
44
 
34
45
  private:
35
46
  using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
36
47
  using vector_u32 = std::vector<uint32_t, AllocU32>;
37
- using Entry = std::pair<T, uint64_t>;
38
- using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
39
- using Container = std::vector<Entry, AllocEntry>;
40
48
  uint64_t n_;
41
49
  vector_u32 levels_;
42
50
  Container entries_;
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
45
53
  T approximately_answer_positional_query(uint64_t pos) const;
46
54
  void convert_to_preceding_cummulative();
47
55
  uint32_t chunk_containing_pos(uint64_t pos) const;
48
- uint32_t search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const;
56
+ uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
49
57
  static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
50
58
  static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
51
59
  static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
@@ -28,24 +28,38 @@
28
28
 
29
29
  namespace datasketches {
30
30
 
31
- template <typename T, typename C, typename A>
32
- kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator):
33
- n_(n), levels_(num_levels + 1, 0, allocator), entries_(allocator)
31
+ template<typename T, typename C, typename A>
32
+ template<typename S>
33
+ kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
34
+ n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
34
35
  {
35
- const uint32_t num_items = levels[num_levels] - levels[0];
36
- entries_.reserve(num_items);
37
- populate_from_sketch(items, levels, num_levels);
38
- merge_sorted_blocks(entries_, levels_.data(), levels_.size() - 1, num_items);
39
- if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
40
- convert_to_preceding_cummulative();
36
+ const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
37
+ if (num_items > 0) {
38
+ entries_.reserve(num_items);
39
+ populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
40
+ if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
41
+ merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
42
+ if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
43
+ convert_to_preceding_cummulative();
44
+ }
41
45
  }
42
46
 
43
- template <typename T, typename C, typename A>
47
+ template<typename T, typename C, typename A>
44
48
  T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
45
49
  return approximately_answer_positional_query(pos_of_phi(fraction, n_));
46
50
  }
47
51
 
48
- template <typename T, typename C, typename A>
52
+ template<typename T, typename C, typename A>
53
+ auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
54
+ return entries_.begin();
55
+ }
56
+
57
+ template<typename T, typename C, typename A>
58
+ auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
59
+ return entries_.end();
60
+ }
61
+
62
+ template<typename T, typename C, typename A>
49
63
  void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
50
64
  size_t src_level = 0;
51
65
  size_t dst_level = 0;
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
68
82
  if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
69
83
  }
70
84
 
71
- template <typename T, typename C, typename A>
85
+ template<typename T, typename C, typename A>
72
86
  T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
73
87
  if (pos >= n_) throw std::logic_error("position out of range");
74
88
  const uint32_t num_items = levels_[levels_.size() - 1];
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
77
91
  return entries_[index].first;
78
92
  }
79
93
 
80
- template <typename T, typename C, typename A>
94
+ template<typename T, typename C, typename A>
81
95
  void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
82
96
  uint64_t subtotal = 0;
83
97
  for (auto& entry: entries_) {
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
87
101
  }
88
102
  }
89
103
 
90
- template <typename T, typename C, typename A>
104
+ template<typename T, typename C, typename A>
91
105
  uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
92
- const uint64_t pos = std::floor(phi * n);
106
+ const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
93
107
  return (pos == n) ? n - 1 : pos;
94
108
  }
95
109
 
96
- template <typename T, typename C, typename A>
110
+ template<typename T, typename C, typename A>
97
111
  uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
98
112
  if (entries_.size() < 1) throw std::logic_error("array too short");
99
113
  if (pos < entries_[0].second) throw std::logic_error("position too small");
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
101
115
  return search_for_chunk_containing_pos(pos, 0, entries_.size());
102
116
  }
103
117
 
104
- template <typename T, typename C, typename A>
105
- uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const {
118
+ template<typename T, typename C, typename A>
119
+ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
106
120
  if (l + 1 == r) {
107
- return l;
121
+ return static_cast<uint32_t>(l);
108
122
  }
109
- const uint32_t m(l + (r - l) / 2);
123
+ const uint64_t m = l + (r - l) / 2;
110
124
  if (entries_[m].second <= pos) {
111
125
  return search_for_chunk_containing_pos(pos, m, r);
112
126
  }
113
127
  return search_for_chunk_containing_pos(pos, l, m);
114
128
  }
115
129
 
116
- template <typename T, typename C, typename A>
130
+ template<typename T, typename C, typename A>
117
131
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
118
132
  if (num_levels == 1) return;
119
133
  Container temporary(entries.get_allocator());
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
121
135
  merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
122
136
  }
123
137
 
124
- template <typename T, typename C, typename A>
138
+ template<typename T, typename C, typename A>
125
139
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
126
140
  uint8_t starting_level, uint8_t num_levels) {
127
141
  if (num_levels == 1) return;
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
129
143
  const uint8_t num_levels_2 = num_levels - num_levels_1;
130
144
  const uint8_t starting_level_1 = starting_level;
131
145
  const uint8_t starting_level_2 = starting_level + num_levels_1;
132
- const auto chunk_begin = temp.begin() + temp.size();
146
+ const auto initial_size = temp.size();
133
147
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
134
148
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
135
149
  const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
150
+ const auto chunk_begin = temp.begin() + initial_size;
136
151
  std::merge(
137
152
  std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
138
153
  std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
141
156
  temp.erase(chunk_begin, temp.end());
142
157
  }
143
158
 
144
- template <typename T, typename C, typename A>
159
+ template<typename T, typename C, typename A>
145
160
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
146
161
  uint8_t starting_level, uint8_t num_levels) {
147
162
  if (num_levels == 1) {
@@ -153,15 +153,23 @@ template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
153
153
  template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
154
154
  template<typename A> using vector_d = std::vector<double, AllocD<A>>;
155
155
 
156
+ namespace kll_constants {
157
+ const uint16_t DEFAULT_K = 200;
158
+ }
159
+
156
160
  template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
157
161
  class kll_sketch {
158
162
  public:
163
+ using value_type = T;
164
+ using comparator = C;
165
+
159
166
  static const uint8_t DEFAULT_M = 8;
160
- static const uint16_t DEFAULT_K = 200;
167
+ // TODO: Redundant and deprecated. Will be removed in the next major version.
168
+ static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
161
169
  static const uint16_t MIN_K = DEFAULT_M;
162
170
  static const uint16_t MAX_K = (1 << 16) - 1;
163
171
 
164
- explicit kll_sketch(uint16_t k = DEFAULT_K, const A& allocator = A());
172
+ explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
165
173
  kll_sketch(const kll_sketch& other);
166
174
  kll_sketch(kll_sketch&& other) noexcept;
167
175
  ~kll_sketch();
@@ -296,7 +304,7 @@ class kll_sketch {
296
304
  *
297
305
  * @return array of approximations to the given number of evenly-spaced fractional ranks.
298
306
  */
299
- std::vector<T, A> get_quantiles(size_t num) const;
307
+ std::vector<T, A> get_quantiles(uint32_t num) const;
300
308
 
301
309
  /**
302
310
  * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
@@ -383,6 +391,33 @@ class kll_sketch {
383
391
  template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
384
392
  size_t get_serialized_size_bytes() const;
385
393
 
394
+ /**
395
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
396
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
397
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
398
+ * optimal.
399
+ * This method is for arithmetic types (integral and floating point)
400
+ * @param k parameter that controls size of the sketch and accuracy of estimates
401
+ * @param n stream length
402
+ * @return upper bound on the serialized size
403
+ */
404
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
405
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
406
+
407
+ /**
408
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
409
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
410
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
411
+ * optimal.
412
+ * This method is for all other non-arithmetic types, and it takes a max size of an item as input.
413
+ * @param k parameter that controls size of the sketch and accuracy of estimates
414
+ * @param n stream length
415
+ * @param max_item_size_bytes maximum size of an item in bytes
416
+ * @return upper bound on the serialized size
417
+ */
418
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
419
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
420
+
386
421
  /**
387
422
  * This method serializes the sketch into a given stream in a binary form
388
423
  * @param os output stream
@@ -391,7 +426,7 @@ class kll_sketch {
391
426
 
392
427
  // This is a convenience alias for users
393
428
  // The type returned by the following serialize method
394
- typedef vector_u8<A> vector_bytes;
429
+ using vector_bytes = vector_u8<A>;
395
430
 
396
431
  /**
397
432
  * This method serializes the sketch as a vector of bytes.
@@ -480,6 +515,8 @@ class kll_sketch {
480
515
  T* max_value_;
481
516
  bool is_level_zero_sorted_;
482
517
 
518
+ friend class kll_quantile_calculator<T, C, A>;
519
+
483
520
  // for deserialization
484
521
  class item_deleter;
485
522
  class items_deleter;