datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -27,7 +27,7 @@ namespace datasketches {
27
27
  using hll_sketch_test_alloc = hll_sketch_alloc<test_allocator<uint8_t>>;
28
28
  using alloc = test_allocator<uint8_t>;
29
29
 
30
- static void runCheckCopy(int lgConfigK, target_hll_type tgtHllType) {
30
+ static void runCheckCopy(uint8_t lgConfigK, target_hll_type tgtHllType) {
31
31
  hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
32
32
 
33
33
  for (int i = 0; i < 7; ++i) {
@@ -66,7 +66,7 @@ TEST_CASE("hll sketch: check copies", "[hll_sketch]") {
66
66
  }
67
67
 
68
68
  static void copyAs(target_hll_type srcType, target_hll_type dstType) {
69
- int lgK = 8;
69
+ uint8_t lgK = 8;
70
70
  int n1 = 7;
71
71
  int n2 = 24;
72
72
  int n3 = 1000;
@@ -109,7 +109,7 @@ TEST_CASE("hll sketch: check copy as", "[hll_sketch]") {
109
109
  TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
110
110
  test_allocator_total_bytes = 0;
111
111
  {
112
- int lgConfigK = 8;
112
+ uint8_t lgConfigK = 8;
113
113
  target_hll_type srcType = target_hll_type::HLL_8;
114
114
  hll_sketch_test_alloc sk(lgConfigK, srcType, false, 0);
115
115
 
@@ -124,7 +124,7 @@ TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
124
124
  sk.update(24); // HLL
125
125
  REQUIRE(sk.get_updatable_serialization_bytes() == 40 + 256);
126
126
 
127
- const int hllBytes = HllUtil<>::HLL_BYTE_ARR_START + (1 << lgConfigK);
127
+ const auto hllBytes = hll_constants::HLL_BYTE_ARR_START + (1 << lgConfigK);
128
128
  REQUIRE(sk.get_compact_serialization_bytes() == hllBytes);
129
129
  REQUIRE(hll_sketch::get_max_updatable_serialization_bytes(lgConfigK, HLL_8) == hllBytes);
130
130
  }
@@ -135,22 +135,22 @@ TEST_CASE("hll sketch: check num std dev", "[hll_sketch]") {
135
135
  REQUIRE_THROWS_AS(HllUtil<>::checkNumStdDev(0), std::invalid_argument);
136
136
  }
137
137
 
138
- void checkSerializationSizes(const int lgConfigK, target_hll_type tgtHllType) {
138
+ void checkSerializationSizes(uint8_t lgConfigK, target_hll_type tgtHllType) {
139
139
  hll_sketch_test_alloc sk(lgConfigK, tgtHllType, false, 0);
140
140
  int i;
141
141
 
142
142
  // LIST
143
143
  for (i = 0; i < 7; ++i) { sk.update(i); }
144
- int expected = HllUtil<>::LIST_INT_ARR_START + (i << 2);
144
+ auto expected = hll_constants::LIST_INT_ARR_START + (i << 2);
145
145
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
146
- expected = HllUtil<>::LIST_INT_ARR_START + (4 << HllUtil<>::LG_INIT_LIST_SIZE);
146
+ expected = hll_constants::LIST_INT_ARR_START + (4 << hll_constants::LG_INIT_LIST_SIZE);
147
147
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);
148
148
 
149
149
  // SET
150
150
  for (i = 7; i < 24; ++i) { sk.update(i); }
151
- expected = HllUtil<>::HASH_SET_INT_ARR_START + (i << 2);
151
+ expected = hll_constants::HASH_SET_INT_ARR_START + (i << 2);
152
152
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
153
- expected = HllUtil<>::HASH_SET_INT_ARR_START + (4 << HllUtil<>::LG_INIT_SET_SIZE);
153
+ expected = hll_constants::HASH_SET_INT_ARR_START + (4 << hll_constants::LG_INIT_SET_SIZE);
154
154
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);
155
155
  }
156
156
 
@@ -178,7 +178,7 @@ TEST_CASE("hll sketch: exercise to string", "[hll_sketch]") {
178
178
 
179
179
  // Creates and serializes then deserializes sketch.
180
180
  // Returns true if deserialized sketch is compact.
181
- static bool checkCompact(const int lgK, const int n, const target_hll_type type, bool compact) {
181
+ static bool checkCompact(uint8_t lgK, const int n, const target_hll_type type, bool compact) {
182
182
  hll_sketch_test_alloc sk(lgK, type, false, 0);
183
183
  for (int i = 0; i < n; ++i) { sk.update(i); }
184
184
 
@@ -201,7 +201,7 @@ static bool checkCompact(const int lgK, const int n, const target_hll_type type,
201
201
  TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
202
202
  test_allocator_total_bytes = 0;
203
203
  {
204
- int lgK = 8;
204
+ uint8_t lgK = 8;
205
205
  // unless/until we create non-updatable "direct" versions,
206
206
  // deserialized image should never be compact
207
207
  // LIST: follows serialization request
@@ -230,10 +230,10 @@ TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
230
230
  TEST_CASE("hll sketch: check k limits", "[hll_sketch]") {
231
231
  test_allocator_total_bytes = 0;
232
232
  {
233
- hll_sketch_test_alloc sketch1(HllUtil<>::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
234
- hll_sketch_test_alloc sketch2(HllUtil<>::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
235
- REQUIRE_THROWS_AS(hll_sketch_test_alloc(HllUtil<>::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
236
- REQUIRE_THROWS_AS(hll_sketch_test_alloc(HllUtil<>::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
233
+ hll_sketch_test_alloc sketch1(hll_constants::MIN_LOG_K, target_hll_type::HLL_8, false, 0);
234
+ hll_sketch_test_alloc sketch2(hll_constants::MAX_LOG_K, target_hll_type::HLL_4, false, 0);
235
+ REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MIN_LOG_K - 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
236
+ REQUIRE_THROWS_AS(hll_sketch_test_alloc(hll_constants::MAX_LOG_K + 1, target_hll_type::HLL_4, false, 0), std::invalid_argument);
237
237
  }
238
238
  REQUIRE(test_allocator_total_bytes == 0);
239
239
  }
@@ -24,23 +24,19 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
- static int min(int a, int b) {
28
- return (a < b) ? a : b;
29
- }
30
-
31
27
  static void println(std::string& str) {
32
28
  //std::cout << str << "\n";
33
29
  }
34
30
 
35
31
  static void basicUnion(uint64_t n1, uint64_t n2,
36
- uint64_t lgk1, uint64_t lgk2, uint64_t lgMaxK,
32
+ uint8_t lgk1, uint8_t lgk2, uint8_t lgMaxK,
37
33
  target_hll_type type1, target_hll_type type2, target_hll_type resultType) {
38
34
  uint64_t v = 0;
39
35
  //int tot = n1 + n2;
40
36
 
41
37
  hll_sketch h1(lgk1, type1);
42
38
  hll_sketch h2(lgk2, type2);
43
- int lgControlK = min(min(lgk1, lgk2), lgMaxK);
39
+ uint8_t lgControlK = std::min(std::min(lgk1, lgk2), lgMaxK);
44
40
  hll_sketch control(lgControlK, resultType);
45
41
 
46
42
  for (uint64_t i = 0; i < n1; ++i) {
@@ -89,9 +85,9 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
89
85
  target_hll_type type2 = HLL_8;
90
86
  target_hll_type resultType = HLL_8;
91
87
 
92
- uint64_t lgK1 = 7;
93
- uint64_t lgK2 = 7;
94
- uint64_t lgMaxK = 7;
88
+ uint8_t lgK1 = 7;
89
+ uint8_t lgK2 = 7;
90
+ uint8_t lgMaxK = 7;
95
91
  uint64_t n1 = 7;
96
92
  uint64_t n2 = 7;
97
93
  basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
@@ -108,7 +104,7 @@ TEST_CASE("hll union: check unions", "[hll_union]") {
108
104
  n2 = 14;
109
105
  basicUnion(n1, n2, lgK1, lgK2, lgMaxK, type1, type2, resultType);
110
106
 
111
- int i = 0;
107
+ uint8_t i = 0;
112
108
  for (i = 7; i <= 13; ++i) {
113
109
  lgK1 = i;
114
110
  lgK2 = i;
@@ -184,9 +180,9 @@ TEST_CASE("hll union: check composite estimate", "[hll_union]") {
184
180
  }
185
181
 
186
182
  TEST_CASE("hll union: check config k limits", "[hll_union]") {
187
- REQUIRE_THROWS_AS(hll_union(HllUtil<>::MIN_LOG_K - 1), std::invalid_argument);
183
+ REQUIRE_THROWS_AS(hll_union(hll_constants::MIN_LOG_K - 1), std::invalid_argument);
188
184
 
189
- REQUIRE_THROWS_AS(hll_union(HllUtil<>::MAX_LOG_K + 1), std::invalid_argument);
185
+ REQUIRE_THROWS_AS(hll_union(hll_constants::MAX_LOG_K + 1), std::invalid_argument);
190
186
  }
191
187
 
192
188
  static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est) {
@@ -195,7 +191,7 @@ static double getBound(int lgK, bool ub, bool oooFlag, int numStdDev, double est
195
191
  }
196
192
 
197
193
  TEST_CASE("hll union: check ub lb", "[hll_union]") {
198
- int lgK = 4;
194
+ uint8_t lgK = 4;
199
195
  int n = 1 << 20;
200
196
  bool oooFlag = false;
201
197
 
@@ -223,7 +219,7 @@ TEST_CASE("hll union: check ub lb", "[hll_union]") {
223
219
  }
224
220
 
225
221
  TEST_CASE("hll union: check conversions", "[hll_union]") {
226
- int lgK = 4;
222
+ uint8_t lgK = 4;
227
223
  hll_sketch sk1(lgK, HLL_8);
228
224
  hll_sketch sk2(lgK, HLL_8);
229
225
  int n = 1 << 20;
@@ -57,7 +57,7 @@ static int get_n(int lg_k, hll_mode mode) {
57
57
 
58
58
  static long v = 0;
59
59
 
60
- static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode) {
60
+ static hll_sketch build_sketch(uint8_t lg_k, target_hll_type hll_type, hll_mode mode) {
61
61
  hll_sketch sk(lg_k, hll_type);
62
62
  int n = get_n(lg_k, mode);
63
63
  for (int i = 0; i < n; i++) sk.update(static_cast<uint64_t>(i + v));
@@ -67,7 +67,7 @@ static hll_sketch build_sketch(int lg_k, target_hll_type hll_type, hll_mode mode
67
67
 
68
68
  // merges a sketch to an empty union and gets result of the same type, checks binary equivalence
69
69
  static void union_one_update(bool compact) {
70
- for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
70
+ for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
71
71
  for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
72
72
  if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
73
73
  for (int t = 0; t <= 2; t++) { // HLL_4, HLL_6, HLL_8
@@ -102,7 +102,7 @@ TEST_CASE("hll isomorphic: union one update serialize compact", "[hll_isomorphic
102
102
 
103
103
  // converts a sketch to a different type and converts back to the original type to check binary equivalence
104
104
  static void convert_back_and_forth(bool compact) {
105
- for (int lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
105
+ for (uint8_t lg_k = 4; lg_k <= 21; lg_k++) { // all lg_k
106
106
  for (int mode = 0; mode <= 2; mode++) { // List, Set, Hll
107
107
  if ((lg_k < 8) && (mode == 1)) continue; // lg_k < 8 list transitions directly to HLL
108
108
  for (int t1 = 0; t1 <= 2; t1++) { // HLL_4, HLL_6, HLL_8
@@ -44,11 +44,11 @@ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
44
44
  auto ser2 = sk.serialize_updatable();
45
45
 
46
46
  REQUIRE(ser1.size() == ser2.size());
47
- int len = ser1.size();
47
+ size_t len = ser1.size();
48
48
  uint8_t* b1 = ser1.data();
49
49
  uint8_t* b2 = ser2.data();
50
50
 
51
- for (int i = 0; i < len; ++i) {
51
+ for (size_t i = 0; i < len; ++i) {
52
52
  REQUIRE(b2[i] == b1[i]);
53
53
  }
54
54
  }
@@ -129,7 +129,7 @@ static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
129
129
  REQUIRE(sk1.get_target_type() == sk2.get_target_type());
130
130
  }
131
131
 
132
- static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const int n) {
132
+ static void toFrom(const uint8_t lgConfigK, const target_hll_type tgtHllType, const int n) {
133
133
  hll_sketch src(lgConfigK, tgtHllType);
134
134
  for (int i = 0; i < n; ++i) {
135
135
  src.update(i);
@@ -157,7 +157,7 @@ static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const
157
157
  TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
158
158
  for (int i = 0; i < 10; ++i) {
159
159
  int n = nArr[i];
160
- for (int lgK = 4; lgK <= 13; ++lgK) {
160
+ for (uint8_t lgK = 4; lgK <= 13; ++lgK) {
161
161
  toFrom(lgK, HLL_4, n);
162
162
  toFrom(lgK, HLL_6, n);
163
163
  toFrom(lgK, HLL_8, n);
@@ -32,27 +32,17 @@ target_include_directories(kll
32
32
  target_link_libraries(kll INTERFACE common)
33
33
  target_compile_features(kll INTERFACE cxx_std_11)
34
34
 
35
- set(kll_HEADERS "")
36
- list(APPEND kll_HEADERS "include/kll_sketch.hpp")
37
- list(APPEND kll_HEADERS "include/kll_sketch_impl.hpp")
38
- list(APPEND kll_HEADERS "include/kll_helper.hpp")
39
- list(APPEND kll_HEADERS "include/kll_helper_impl.hpp")
40
- list(APPEND kll_HEADERS "include/kll_quantile_calculator.hpp")
41
- list(APPEND kll_HEADERS "include/kll_quantile_calculator_impl.hpp")
42
-
43
35
  install(TARGETS kll
44
36
  EXPORT ${PROJECT_NAME}
45
37
  )
46
38
 
47
- install(FILES ${kll_HEADERS}
39
+ install(FILES
40
+ include/kll_sketch.hpp
41
+ include/kll_sketch_impl.hpp
42
+ include/kll_helper.hpp
43
+ include/kll_helper_impl.hpp
44
+ include/kll_quantile_calculator.hpp
45
+ include/kll_quantile_calculator_impl.hpp
46
+ include/kolmogorov_smirnov.hpp
47
+ include/kolmogorov_smirnov_impl.hpp
48
48
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
49
-
50
- target_sources(kll
51
- INTERFACE
52
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
54
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
55
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
56
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
57
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
58
- )
@@ -26,7 +26,8 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
- static std::independent_bits_engine<std::mt19937, 1, uint32_t> random_bit(std::chrono::system_clock::now().time_since_epoch().count());
29
+ static std::independent_bits_engine<std::mt19937, 1, uint32_t>
30
+ random_bit(static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));
30
31
 
31
32
  #ifdef KLL_VALIDATION
32
33
  extern uint32_t kll_next_offset;
@@ -46,9 +47,9 @@ class kll_helper {
46
47
  static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);
47
48
  static inline uint8_t ub_on_num_levels(uint64_t n);
48
49
  static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);
49
- static inline uint32_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
50
- static inline uint32_t int_cap_aux(uint16_t k, uint8_t depth);
51
- static inline uint32_t int_cap_aux_aux(uint16_t k, uint8_t depth);
50
+ static inline uint16_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);
51
+ static inline uint16_t int_cap_aux(uint16_t k, uint8_t depth);
52
+ static inline uint16_t int_cap_aux_aux(uint16_t k, uint8_t depth);
52
53
  static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);
53
54
 
54
55
  /*
@@ -55,28 +55,28 @@ uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_l
55
55
  return total;
56
56
  }
57
57
 
58
- uint32_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
58
+ uint16_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
59
59
  if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
60
60
  const uint8_t depth = numLevels - height - 1;
61
- return std::max((uint32_t) min_wid, int_cap_aux(k, depth));
61
+ return std::max<uint16_t>(min_wid, int_cap_aux(k, depth));
62
62
  }
63
63
 
64
- uint32_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
64
+ uint16_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
65
65
  if (depth > 60) throw std::invalid_argument("depth > 60");
66
66
  if (depth <= 30) return int_cap_aux_aux(k, depth);
67
67
  const uint8_t half = depth / 2;
68
68
  const uint8_t rest = depth - half;
69
- const uint32_t tmp = int_cap_aux_aux(k, half);
69
+ const uint16_t tmp = int_cap_aux_aux(k, half);
70
70
  return int_cap_aux_aux(tmp, rest);
71
71
  }
72
72
 
73
- uint32_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
73
+ uint16_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
74
74
  if (depth > 30) throw std::invalid_argument("depth > 30");
75
75
  const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
76
76
  const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
77
77
  const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
78
78
  if (result > k) throw std::logic_error("result > k");
79
- return result;
79
+ return static_cast<uint16_t>(result);
80
80
  }
81
81
 
82
82
  uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
@@ -24,19 +24,27 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
+ // forward declaration
28
+ template<typename T, typename C, typename S, typename A> class kll_sketch;
29
+
27
30
  template <typename T, typename C, typename A>
28
31
  class kll_quantile_calculator {
29
32
  public:
30
- // assumes that all levels are sorted including level 0
31
- kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator);
33
+ using Entry = std::pair<T, uint64_t>;
34
+ using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
35
+ using Container = std::vector<Entry, AllocEntry>;
36
+ using const_iterator = typename Container::const_iterator;
37
+
38
+ template<typename S>
39
+ kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
40
+
32
41
  T get_quantile(double fraction) const;
42
+ const_iterator begin() const;
43
+ const_iterator end() const;
33
44
 
34
45
  private:
35
46
  using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
36
47
  using vector_u32 = std::vector<uint32_t, AllocU32>;
37
- using Entry = std::pair<T, uint64_t>;
38
- using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
39
- using Container = std::vector<Entry, AllocEntry>;
40
48
  uint64_t n_;
41
49
  vector_u32 levels_;
42
50
  Container entries_;
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
45
53
  T approximately_answer_positional_query(uint64_t pos) const;
46
54
  void convert_to_preceding_cummulative();
47
55
  uint32_t chunk_containing_pos(uint64_t pos) const;
48
- uint32_t search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const;
56
+ uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
49
57
  static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
50
58
  static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
51
59
  static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
@@ -28,24 +28,38 @@
28
28
 
29
29
  namespace datasketches {
30
30
 
31
- template <typename T, typename C, typename A>
32
- kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator):
33
- n_(n), levels_(num_levels + 1, 0, allocator), entries_(allocator)
31
+ template<typename T, typename C, typename A>
32
+ template<typename S>
33
+ kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
34
+ n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
34
35
  {
35
- const uint32_t num_items = levels[num_levels] - levels[0];
36
- entries_.reserve(num_items);
37
- populate_from_sketch(items, levels, num_levels);
38
- merge_sorted_blocks(entries_, levels_.data(), levels_.size() - 1, num_items);
39
- if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
40
- convert_to_preceding_cummulative();
36
+ const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
37
+ if (num_items > 0) {
38
+ entries_.reserve(num_items);
39
+ populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
40
+ if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
41
+ merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
42
+ if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
43
+ convert_to_preceding_cummulative();
44
+ }
41
45
  }
42
46
 
43
- template <typename T, typename C, typename A>
47
+ template<typename T, typename C, typename A>
44
48
  T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
45
49
  return approximately_answer_positional_query(pos_of_phi(fraction, n_));
46
50
  }
47
51
 
48
- template <typename T, typename C, typename A>
52
+ template<typename T, typename C, typename A>
53
+ auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
54
+ return entries_.begin();
55
+ }
56
+
57
+ template<typename T, typename C, typename A>
58
+ auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
59
+ return entries_.end();
60
+ }
61
+
62
+ template<typename T, typename C, typename A>
49
63
  void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
50
64
  size_t src_level = 0;
51
65
  size_t dst_level = 0;
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
68
82
  if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
69
83
  }
70
84
 
71
- template <typename T, typename C, typename A>
85
+ template<typename T, typename C, typename A>
72
86
  T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
73
87
  if (pos >= n_) throw std::logic_error("position out of range");
74
88
  const uint32_t num_items = levels_[levels_.size() - 1];
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
77
91
  return entries_[index].first;
78
92
  }
79
93
 
80
- template <typename T, typename C, typename A>
94
+ template<typename T, typename C, typename A>
81
95
  void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
82
96
  uint64_t subtotal = 0;
83
97
  for (auto& entry: entries_) {
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
87
101
  }
88
102
  }
89
103
 
90
- template <typename T, typename C, typename A>
104
+ template<typename T, typename C, typename A>
91
105
  uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
92
- const uint64_t pos = std::floor(phi * n);
106
+ const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
93
107
  return (pos == n) ? n - 1 : pos;
94
108
  }
95
109
 
96
- template <typename T, typename C, typename A>
110
+ template<typename T, typename C, typename A>
97
111
  uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
98
112
  if (entries_.size() < 1) throw std::logic_error("array too short");
99
113
  if (pos < entries_[0].second) throw std::logic_error("position too small");
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
101
115
  return search_for_chunk_containing_pos(pos, 0, entries_.size());
102
116
  }
103
117
 
104
- template <typename T, typename C, typename A>
105
- uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const {
118
+ template<typename T, typename C, typename A>
119
+ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
106
120
  if (l + 1 == r) {
107
- return l;
121
+ return static_cast<uint32_t>(l);
108
122
  }
109
- const uint32_t m(l + (r - l) / 2);
123
+ const uint64_t m = l + (r - l) / 2;
110
124
  if (entries_[m].second <= pos) {
111
125
  return search_for_chunk_containing_pos(pos, m, r);
112
126
  }
113
127
  return search_for_chunk_containing_pos(pos, l, m);
114
128
  }
115
129
 
116
- template <typename T, typename C, typename A>
130
+ template<typename T, typename C, typename A>
117
131
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
118
132
  if (num_levels == 1) return;
119
133
  Container temporary(entries.get_allocator());
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
121
135
  merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
122
136
  }
123
137
 
124
- template <typename T, typename C, typename A>
138
+ template<typename T, typename C, typename A>
125
139
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
126
140
  uint8_t starting_level, uint8_t num_levels) {
127
141
  if (num_levels == 1) return;
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
129
143
  const uint8_t num_levels_2 = num_levels - num_levels_1;
130
144
  const uint8_t starting_level_1 = starting_level;
131
145
  const uint8_t starting_level_2 = starting_level + num_levels_1;
132
- const auto chunk_begin = temp.begin() + temp.size();
146
+ const auto initial_size = temp.size();
133
147
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
134
148
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
135
149
  const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
150
+ const auto chunk_begin = temp.begin() + initial_size;
136
151
  std::merge(
137
152
  std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
138
153
  std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
141
156
  temp.erase(chunk_begin, temp.end());
142
157
  }
143
158
 
144
- template <typename T, typename C, typename A>
159
+ template<typename T, typename C, typename A>
145
160
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
146
161
  uint8_t starting_level, uint8_t num_levels) {
147
162
  if (num_levels == 1) {
@@ -153,15 +153,23 @@ template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
153
153
  template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
154
154
  template<typename A> using vector_d = std::vector<double, AllocD<A>>;
155
155
 
156
+ namespace kll_constants {
157
+ const uint16_t DEFAULT_K = 200;
158
+ }
159
+
156
160
  template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
157
161
  class kll_sketch {
158
162
  public:
163
+ using value_type = T;
164
+ using comparator = C;
165
+
159
166
  static const uint8_t DEFAULT_M = 8;
160
- static const uint16_t DEFAULT_K = 200;
167
+ // TODO: Redundant and deprecated. Will be removed in next major version.
168
+ static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
161
169
  static const uint16_t MIN_K = DEFAULT_M;
162
170
  static const uint16_t MAX_K = (1 << 16) - 1;
163
171
 
164
- explicit kll_sketch(uint16_t k = DEFAULT_K, const A& allocator = A());
172
+ explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
165
173
  kll_sketch(const kll_sketch& other);
166
174
  kll_sketch(kll_sketch&& other) noexcept;
167
175
  ~kll_sketch();
@@ -296,7 +304,7 @@ class kll_sketch {
296
304
  *
297
305
  * @return array of approximations to the given number of evenly-spaced fractional ranks.
298
306
  */
299
- std::vector<T, A> get_quantiles(size_t num) const;
307
+ std::vector<T, A> get_quantiles(uint32_t num) const;
300
308
 
301
309
  /**
302
310
  * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
@@ -383,6 +391,33 @@ class kll_sketch {
383
391
  template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
384
392
  size_t get_serialized_size_bytes() const;
385
393
 
394
+ /**
395
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
396
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
397
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
398
+ * optimal.
399
+ * This method is for arithmetic types (integral and floating point)
400
+ * @param k parameter that controls size of the sketch and accuracy of estimates
401
+ * @param n stream length
402
+ * @return upper bound on the serialized size
403
+ */
404
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
405
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
406
+
407
+ /**
408
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
409
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
410
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
411
+ * optimal.
412
+ * This method is for all other non-arithmetic types, and it takes a max size of an item as input.
413
+ * @param k parameter that controls size of the sketch and accuracy of estimates
414
+ * @param n stream length
415
+ * @param max_item_size_bytes maximum size of an item in bytes
416
+ * @return upper bound on the serialized size
417
+ */
418
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
419
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
420
+
386
421
  /**
387
422
  * This method serializes the sketch into a given stream in a binary form
388
423
  * @param os output stream
@@ -391,7 +426,7 @@ class kll_sketch {
391
426
 
392
427
  // This is a convenience alias for users
393
428
  // The type returned by the following serialize method
394
- typedef vector_u8<A> vector_bytes;
429
+ using vector_bytes = vector_u8<A>;
395
430
 
396
431
  /**
397
432
  * This method serializes the sketch as a vector of bytes.
@@ -480,6 +515,8 @@ class kll_sketch {
480
515
  T* max_value_;
481
516
  bool is_level_zero_sorted_;
482
517
 
518
+ friend class kll_quantile_calculator<T, C, A>;
519
+
483
520
  // for deserialization
484
521
  class item_deleter;
485
522
  class items_deleter;