datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,1131 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_SKETCH_IMPL_HPP_
21
+ #define KLL_SKETCH_IMPL_HPP_
22
+
23
+ #include <iostream>
24
+ #include <iomanip>
25
+ #include <sstream>
26
+
27
+ #include "memory_operations.hpp"
28
+ #include "kll_helper.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename T, typename C, typename S, typename A>
33
+ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k):
34
+ k_(k),
35
+ m_(DEFAULT_M),
36
+ min_k_(k),
37
+ n_(0),
38
+ num_levels_(1),
39
+ levels_(2),
40
+ items_(nullptr),
41
+ items_size_(k_),
42
+ min_value_(nullptr),
43
+ max_value_(nullptr),
44
+ is_level_zero_sorted_(false)
45
+ {
46
+ if (k < MIN_K || k > MAX_K) {
47
+ throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= " + std::to_string(MAX_K) + ": " + std::to_string(k));
48
+ }
49
+ levels_[0] = levels_[1] = k;
50
+ items_ = A().allocate(items_size_);
51
+ }
52
+
53
+ template<typename T, typename C, typename S, typename A>
54
+ kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch& other):
55
+ k_(other.k_),
56
+ m_(other.m_),
57
+ min_k_(other.min_k_),
58
+ n_(other.n_),
59
+ num_levels_(other.num_levels_),
60
+ levels_(other.levels_),
61
+ items_(nullptr),
62
+ items_size_(other.items_size_),
63
+ min_value_(nullptr),
64
+ max_value_(nullptr),
65
+ is_level_zero_sorted_(other.is_level_zero_sorted_)
66
+ {
67
+ items_ = A().allocate(items_size_);
68
+ std::copy(&other.items_[levels_[0]], &other.items_[levels_[num_levels_]], &items_[levels_[0]]);
69
+ if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
70
+ if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
71
+ }
72
+
73
+ template<typename T, typename C, typename S, typename A>
74
+ kll_sketch<T, C, S, A>::kll_sketch(kll_sketch&& other) noexcept:
75
+ k_(other.k_),
76
+ m_(other.m_),
77
+ min_k_(other.min_k_),
78
+ n_(other.n_),
79
+ num_levels_(other.num_levels_),
80
+ levels_(std::move(other.levels_)),
81
+ items_(other.items_),
82
+ items_size_(other.items_size_),
83
+ min_value_(other.min_value_),
84
+ max_value_(other.max_value_),
85
+ is_level_zero_sorted_(other.is_level_zero_sorted_)
86
+ {
87
+ other.items_ = nullptr;
88
+ other.min_value_ = nullptr;
89
+ other.max_value_ = nullptr;
90
+ }
91
+
92
+ template<typename T, typename C, typename S, typename A>
93
+ kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
94
+ kll_sketch copy(other);
95
+ std::swap(k_, copy.k_);
96
+ std::swap(m_, copy.m_);
97
+ std::swap(min_k_, copy.min_k_);
98
+ std::swap(n_, copy.n_);
99
+ std::swap(num_levels_, copy.num_levels_);
100
+ std::swap(levels_, copy.levels_);
101
+ std::swap(items_, copy.items_);
102
+ std::swap(items_size_, copy.items_size_);
103
+ std::swap(min_value_, copy.min_value_);
104
+ std::swap(max_value_, copy.max_value_);
105
+ std::swap(is_level_zero_sorted_, copy.is_level_zero_sorted_);
106
+ return *this;
107
+ }
108
+
109
+ template<typename T, typename C, typename S, typename A>
110
+ kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(kll_sketch&& other) {
111
+ std::swap(k_, other.k_);
112
+ std::swap(m_, other.m_);
113
+ std::swap(min_k_, other.min_k_);
114
+ std::swap(n_, other.n_);
115
+ std::swap(num_levels_, other.num_levels_);
116
+ std::swap(levels_, other.levels_);
117
+ std::swap(items_, other.items_);
118
+ std::swap(items_size_, other.items_size_);
119
+ std::swap(min_value_, other.min_value_);
120
+ std::swap(max_value_, other.max_value_);
121
+ std::swap(is_level_zero_sorted_, other.is_level_zero_sorted_);
122
+ return *this;
123
+ }
124
+
125
+ template<typename T, typename C, typename S, typename A>
126
+ kll_sketch<T, C, S, A>::~kll_sketch() {
127
+ if (items_ != nullptr) {
128
+ const uint32_t begin = levels_[0];
129
+ const uint32_t end = levels_[num_levels_];
130
+ for (uint32_t i = begin; i < end; i++) items_[i].~T();
131
+ A().deallocate(items_, items_size_);
132
+ }
133
+ if (min_value_ != nullptr) {
134
+ min_value_->~T();
135
+ A().deallocate(min_value_, 1);
136
+ }
137
+ if (max_value_ != nullptr) {
138
+ max_value_->~T();
139
+ A().deallocate(max_value_, 1);
140
+ }
141
+ }
142
+
143
+ template<typename T, typename C, typename S, typename A>
144
+ void kll_sketch<T, C, S, A>::update(const T& value) {
145
+ if (!check_update_value(value)) { return; }
146
+ update_min_max(value);
147
+ const uint32_t index = internal_update();
148
+ new (&items_[index]) T(value);
149
+ }
150
+
151
+ template<typename T, typename C, typename S, typename A>
152
+ void kll_sketch<T, C, S, A>::update(T&& value) {
153
+ if (!check_update_value(value)) { return; }
154
+ update_min_max(value);
155
+ const uint32_t index = internal_update();
156
+ new (&items_[index]) T(std::move(value));
157
+ }
158
+
159
+ template<typename T, typename C, typename S, typename A>
160
+ void kll_sketch<T, C, S, A>::update_min_max(const T& value) {
161
+ if (is_empty()) {
162
+ min_value_ = new (A().allocate(1)) T(value);
163
+ max_value_ = new (A().allocate(1)) T(value);
164
+ } else {
165
+ if (C()(value, *min_value_)) *min_value_ = value;
166
+ if (C()(*max_value_, value)) *max_value_ = value;
167
+ }
168
+ }
169
+
170
+ template<typename T, typename C, typename S, typename A>
171
+ uint32_t kll_sketch<T, C, S, A>::internal_update() {
172
+ if (levels_[0] == 0) compress_while_updating();
173
+ n_++;
174
+ is_level_zero_sorted_ = false;
175
+ return --levels_[0];
176
+ }
177
+
178
+ template<typename T, typename C, typename S, typename A>
179
+ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
180
+ if (other.is_empty()) return;
181
+ if (m_ != other.m_) {
182
+ throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
183
+ }
184
+ if (is_empty()) {
185
+ min_value_ = new (A().allocate(1)) T(*other.min_value_);
186
+ max_value_ = new (A().allocate(1)) T(*other.max_value_);
187
+ } else {
188
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
189
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
190
+ }
191
+ const uint64_t final_n = n_ + other.n_;
192
+ for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
193
+ const uint32_t index = internal_update();
194
+ new (&items_[index]) T(other.items_[i]);
195
+ }
196
+ if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
197
+ n_ = final_n;
198
+ if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
199
+ assert_correct_total_weight();
200
+ }
201
+
202
+ template<typename T, typename C, typename S, typename A>
203
+ void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
204
+ if (other.is_empty()) return;
205
+ if (m_ != other.m_) {
206
+ throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
207
+ }
208
+ if (is_empty()) {
209
+ min_value_ = new (A().allocate(1)) T(std::move(*other.min_value_));
210
+ max_value_ = new (A().allocate(1)) T(std::move(*other.max_value_));
211
+ } else {
212
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
213
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
214
+ }
215
+ const uint64_t final_n = n_ + other.n_;
216
+ for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
217
+ const uint32_t index = internal_update();
218
+ new (&items_[index]) T(std::move(other.items_[i]));
219
+ }
220
+ if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
221
+ n_ = final_n;
222
+ if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
223
+ assert_correct_total_weight();
224
+ }
225
+
226
+ template<typename T, typename C, typename S, typename A>
227
+ bool kll_sketch<T, C, S, A>::is_empty() const {
228
+ return n_ == 0;
229
+ }
230
+
231
+ template<typename T, typename C, typename S, typename A>
232
+ uint64_t kll_sketch<T, C, S, A>::get_n() const {
233
+ return n_;
234
+ }
235
+
236
+ template<typename T, typename C, typename S, typename A>
237
+ uint32_t kll_sketch<T, C, S, A>::get_num_retained() const {
238
+ return levels_[num_levels_] - levels_[0];
239
+ }
240
+
241
+ template<typename T, typename C, typename S, typename A>
242
+ bool kll_sketch<T, C, S, A>::is_estimation_mode() const {
243
+ return num_levels_ > 1;
244
+ }
245
+
246
+ template<typename T, typename C, typename S, typename A>
247
+ T kll_sketch<T, C, S, A>::get_min_value() const {
248
+ if (is_empty()) return get_invalid_value();
249
+ return *min_value_;
250
+ }
251
+
252
+ template<typename T, typename C, typename S, typename A>
253
+ T kll_sketch<T, C, S, A>::get_max_value() const {
254
+ if (is_empty()) return get_invalid_value();
255
+ return *max_value_;
256
+ }
257
+
258
+ template<typename T, typename C, typename S, typename A>
259
+ T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
260
+ if (is_empty()) return get_invalid_value();
261
+ if (fraction == 0.0) return *min_value_;
262
+ if (fraction == 1.0) return *max_value_;
263
+ if ((fraction < 0.0) || (fraction > 1.0)) {
264
+ throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
265
+ }
266
+ // has side effect of sorting level zero if needed
267
+ auto quantile_calculator(const_cast<kll_sketch*>(this)->get_quantile_calculator());
268
+ return quantile_calculator->get_quantile(fraction);
269
+ }
270
+
271
+ template<typename T, typename C, typename S, typename A>
272
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
273
+ std::vector<T, A> quantiles;
274
+ quantiles.reserve(size);
275
+ if (is_empty()) return quantiles;
276
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
277
+ quantiles.reserve(size);
278
+ for (uint32_t i = 0; i < size; i++) {
279
+ const double fraction = fractions[i];
280
+ if ((fraction < 0.0) || (fraction > 1.0)) {
281
+ throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
282
+ }
283
+ if (fraction == 0.0) quantiles.push_back(*min_value_);
284
+ else if (fraction == 1.0) quantiles.push_back(*max_value_);
285
+ else {
286
+ if (!quantile_calculator) {
287
+ // has side effect of sorting level zero if needed
288
+ quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
289
+ }
290
+ quantiles.push_back(quantile_calculator->get_quantile(fraction));
291
+ }
292
+ }
293
+ return quantiles;
294
+ }
295
+
296
+ template<typename T, typename C, typename S, typename A>
297
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
298
+ if (is_empty()) return std::vector<T, A>();
299
+ if (num == 0) {
300
+ throw std::invalid_argument("num must be > 0");
301
+ }
302
+ std::vector<double> fractions(num);
303
+ fractions[0] = 0.0;
304
+ for (size_t i = 1; i < num; i++) {
305
+ fractions[i] = static_cast<double>(i) / (num - 1);
306
+ }
307
+ if (num > 1) {
308
+ fractions[num - 1] = 1.0;
309
+ }
310
+ return get_quantiles(fractions.data(), num);
311
+ }
312
+
313
+ template<typename T, typename C, typename S, typename A>
314
+ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
315
+ if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
316
+ uint8_t level = 0;
317
+ uint64_t weight = 1;
318
+ uint64_t total = 0;
319
+ while (level < num_levels_) {
320
+ const auto from_index(levels_[level]);
321
+ const auto to_index(levels_[level + 1]); // exclusive
322
+ for (uint32_t i = from_index; i < to_index; i++) {
323
+ if (C()(items_[i], value)) {
324
+ total += weight;
325
+ } else if ((level > 0) || is_level_zero_sorted_) {
326
+ break; // levels above 0 are sorted, no point comparing further
327
+ }
328
+ }
329
+ level++;
330
+ weight *= 2;
331
+ }
332
+ return (double) total / n_;
333
+ }
334
+
335
+ template<typename T, typename C, typename S, typename A>
336
+ vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
337
+ return get_PMF_or_CDF(split_points, size, false);
338
+ }
339
+
340
+ template<typename T, typename C, typename S, typename A>
341
+ vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
342
+ return get_PMF_or_CDF(split_points, size, true);
343
+ }
344
+
345
+ template<typename T, typename C, typename S, typename A>
346
+ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
347
+ return get_normalized_rank_error(min_k_, pmf);
348
+ }
349
+
350
+ // implementation for fixed-size arithmetic types (integral and floating point)
351
+ template<typename T, typename C, typename S, typename A>
352
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
353
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
354
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
355
+ if (num_levels_ == 1 && get_num_retained() == 1) {
356
+ return DATA_START_SINGLE_ITEM + sizeof(TT);
357
+ }
358
+ // the last integer in the levels_ array is not serialized because it can be derived
359
+ return DATA_START + num_levels_ * sizeof(uint32_t) + (get_num_retained() + 2) * sizeof(TT);
360
+ }
361
+
362
+ // implementation for all other types
363
+ template<typename T, typename C, typename S, typename A>
364
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
365
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
366
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
367
+ if (num_levels_ == 1 && get_num_retained() == 1) {
368
+ return DATA_START_SINGLE_ITEM + S().size_of_item(items_[levels_[0]]);
369
+ }
370
+ // the last integer in the levels_ array is not serialized because it can be derived
371
+ size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
372
+ size += S().size_of_item(*min_value_);
373
+ size += S().size_of_item(*max_value_);
374
+ for (auto& it: *this) size += S().size_of_item(it.first);
375
+ return size;
376
+ }
377
+
378
+ template<typename T, typename C, typename S, typename A>
379
+ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
380
+ const bool is_single_item = n_ == 1;
381
+ const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
382
+ os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
383
+ const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
384
+ os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
385
+ const uint8_t family(FAMILY);
386
+ os.write(reinterpret_cast<const char*>(&family), sizeof(family));
387
+ const uint8_t flags_byte(
388
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
389
+ | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
390
+ | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
391
+ );
392
+ os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
393
+ os.write((char*)&k_, sizeof(k_));
394
+ os.write((char*)&m_, sizeof(m_));
395
+ const uint8_t unused = 0;
396
+ os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
397
+ if (is_empty()) return;
398
+ if (!is_single_item) {
399
+ os.write((char*)&n_, sizeof(n_));
400
+ os.write((char*)&min_k_, sizeof(min_k_));
401
+ os.write((char*)&num_levels_, sizeof(num_levels_));
402
+ os.write((char*)&unused, sizeof(unused));
403
+ os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
404
+ S().serialize(os, min_value_, 1);
405
+ S().serialize(os, max_value_, 1);
406
+ }
407
+ S().serialize(os, &items_[levels_[0]], get_num_retained());
408
+ }
409
+
410
+ template<typename T, typename C, typename S, typename A>
411
+ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
412
+ const bool is_single_item = n_ == 1;
413
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
414
+ vector_u8<A> bytes(size);
415
+ uint8_t* ptr = bytes.data() + header_size_bytes;
416
+ const uint8_t* end_ptr = ptr + size;
417
+ const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
418
+ ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
419
+ const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
420
+ ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
421
+ const uint8_t family(FAMILY);
422
+ ptr += copy_to_mem(&family, ptr, sizeof(family));
423
+ const uint8_t flags_byte(
424
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
425
+ | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
426
+ | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
427
+ );
428
+ ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
429
+ ptr += copy_to_mem(&k_, ptr, sizeof(k_));
430
+ ptr += copy_to_mem(&m_, ptr, sizeof(m_));
431
+ const uint8_t unused = 0;
432
+ ptr += copy_to_mem(&unused, ptr, sizeof(unused));
433
+ if (!is_empty()) {
434
+ if (!is_single_item) {
435
+ ptr += copy_to_mem(&n_, ptr, sizeof(n_));
436
+ ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
437
+ ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
438
+ ptr += copy_to_mem(&unused, ptr, sizeof(unused));
439
+ ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
440
+ ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
441
+ ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
442
+ }
443
+ const size_t bytes_remaining = end_ptr - ptr;
444
+ ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
445
+ }
446
+ const size_t delta = ptr - bytes.data();
447
+ if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
448
+ return bytes;
449
+ }
450
+
451
+ template<typename T, typename C, typename S, typename A>
452
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
453
+ uint8_t preamble_ints;
454
+ is.read((char*)&preamble_ints, sizeof(preamble_ints));
455
+ uint8_t serial_version;
456
+ is.read((char*)&serial_version, sizeof(serial_version));
457
+ uint8_t family_id;
458
+ is.read((char*)&family_id, sizeof(family_id));
459
+ uint8_t flags_byte;
460
+ is.read((char*)&flags_byte, sizeof(flags_byte));
461
+ uint16_t k;
462
+ is.read((char*)&k, sizeof(k));
463
+ uint8_t m;
464
+ is.read((char*)&m, sizeof(m));
465
+ uint8_t unused;
466
+ is.read((char*)&unused, sizeof(unused));
467
+
468
+ check_m(m);
469
+ check_preamble_ints(preamble_ints, flags_byte);
470
+ check_serial_version(serial_version);
471
+ check_family_id(family_id);
472
+
473
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
474
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
475
+ if (is_empty) return kll_sketch(k);
476
+
477
+ uint64_t n;
478
+ uint16_t min_k;
479
+ uint8_t num_levels;
480
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
481
+ if (is_single_item) {
482
+ n = 1;
483
+ min_k = k;
484
+ num_levels = 1;
485
+ } else {
486
+ is.read((char*)&n, sizeof(n_));
487
+ is.read((char*)&min_k, sizeof(min_k_));
488
+ is.read((char*)&num_levels, sizeof(num_levels));
489
+ is.read((char*)&unused, sizeof(unused));
490
+ }
491
+ vector_u32<A> levels(num_levels + 1);
492
+ const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
493
+ if (is_single_item) {
494
+ levels[0] = capacity - 1;
495
+ } else {
496
+ // the last integer in levels_ is not serialized because it can be derived
497
+ is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
498
+ }
499
+ levels[num_levels] = capacity;
500
+ auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
501
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
502
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
503
+ std::unique_ptr<T, item_deleter> min_value;
504
+ std::unique_ptr<T, item_deleter> max_value;
505
+ if (!is_single_item) {
506
+ S().deserialize(is, min_value_buffer.get(), 1);
507
+ // serde call did not throw, repackage with destrtuctor
508
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
509
+ S().deserialize(is, max_value_buffer.get(), 1);
510
+ // serde call did not throw, repackage with destrtuctor
511
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
512
+ }
513
+ auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
514
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
515
+ const auto num_items = levels[num_levels] - levels[0];
516
+ S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
517
+ // serde call did not throw, repackage with destrtuctors
518
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
519
+ const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
520
+ if (is_single_item) {
521
+ new (min_value_buffer.get()) T(items.get()[levels[0]]);
522
+ // copy did not throw, repackage with destrtuctor
523
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
524
+ new (max_value_buffer.get()) T(items.get()[levels[0]]);
525
+ // copy did not throw, repackage with destrtuctor
526
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
527
+ }
528
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
529
+ return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
530
+ std::move(min_value), std::move(max_value), is_level_zero_sorted);
531
+ }
532
+
533
+ template<typename T, typename C, typename S, typename A>
534
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size) {
535
+ ensure_minimum_memory(size, 8);
536
+ const char* ptr = static_cast<const char*>(bytes);
537
+ uint8_t preamble_ints;
538
+ ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
539
+ uint8_t serial_version;
540
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
541
+ uint8_t family_id;
542
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
543
+ uint8_t flags_byte;
544
+ ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
545
+ uint16_t k;
546
+ ptr += copy_from_mem(ptr, &k, sizeof(k));
547
+ uint8_t m;
548
+ ptr += copy_from_mem(ptr, &m, sizeof(m));
549
+ ptr++; // skip unused byte
550
+
551
+ check_m(m);
552
+ check_preamble_ints(preamble_ints, flags_byte);
553
+ check_serial_version(serial_version);
554
+ check_family_id(family_id);
555
+ ensure_minimum_memory(size, 1 << preamble_ints);
556
+
557
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
558
+ if (is_empty) return kll_sketch<T, C, S, A>(k);
559
+
560
+ uint64_t n;
561
+ uint16_t min_k;
562
+ uint8_t num_levels;
563
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
564
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
565
+ if (is_single_item) {
566
+ n = 1;
567
+ min_k = k;
568
+ num_levels = 1;
569
+ } else {
570
+ ptr += copy_from_mem(ptr, &n, sizeof(n));
571
+ ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
572
+ ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
573
+ ptr++; // skip unused byte
574
+ }
575
+ vector_u32<A> levels(num_levels + 1);
576
+ const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
577
+ if (is_single_item) {
578
+ levels[0] = capacity - 1;
579
+ } else {
580
+ // the last integer in levels_ is not serialized because it can be derived
581
+ ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
582
+ }
583
+ levels[num_levels] = capacity;
584
+ auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
585
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
586
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
587
+ std::unique_ptr<T, item_deleter> min_value;
588
+ std::unique_ptr<T, item_deleter> max_value;
589
+ if (!is_single_item) {
590
+ ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
591
+ // serde call did not throw, repackage with destrtuctor
592
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
593
+ ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
594
+ // serde call did not throw, repackage with destrtuctor
595
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
596
+ }
597
+ auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
598
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
599
+ const auto num_items = levels[num_levels] - levels[0];
600
+ ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
601
+ // serde call did not throw, repackage with destrtuctors
602
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
603
+ const size_t delta = ptr - static_cast<const char*>(bytes);
604
+ if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
605
+ const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
606
+ if (is_single_item) {
607
+ new (min_value_buffer.get()) T(items.get()[levels[0]]);
608
+ // copy did not throw, repackage with destrtuctor
609
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
610
+ new (max_value_buffer.get()) T(items.get()[levels[0]]);
611
+ // copy did not throw, repackage with destrtuctor
612
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
613
+ }
614
+ return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
615
+ std::move(min_value), std::move(max_value), is_level_zero_sorted);
616
+ }
617
+
618
+ /*
619
+ * Gets the normalized rank error given k and pmf.
620
+ * k - the configuration parameter
621
+ * pmf - if true, returns the "double-sided" normalized rank error for the get_PMF() function.
622
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
623
+ * Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials
624
+ */
625
+ template<typename T, typename C, typename S, typename A>
626
+ double kll_sketch<T, C, S, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
627
+ return pmf
628
+ ? 2.446 / pow(k, 0.9433)
629
+ : 2.296 / pow(k, 0.9723);
630
+ }
631
+
632
+ // for deserialization
633
+ template<typename T, typename C, typename S, typename A>
634
+ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
635
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
636
+ std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
637
+ k_(k),
638
+ m_(DEFAULT_M),
639
+ min_k_(min_k),
640
+ n_(n),
641
+ num_levels_(num_levels),
642
+ levels_(std::move(levels)),
643
+ items_(items.release()),
644
+ items_size_(items_size),
645
+ min_value_(min_value.release()),
646
+ max_value_(max_value.release()),
647
+ is_level_zero_sorted_(is_level_zero_sorted)
648
+ {}
649
+
650
+ // The following code is only valid in the special case of exactly reaching capacity while updating.
651
+ // It cannot be used while merging, while reducing k, or anything else.
652
+ template<typename T, typename C, typename S, typename A>
653
+ void kll_sketch<T, C, S, A>::compress_while_updating(void) {
654
+ const uint8_t level = find_level_to_compact();
655
+
656
+ // It is important to add the new top level right here. Be aware that this operation
657
+ // grows the buffer and shifts the data and also the boundaries of the data and grows the
658
+ // levels array and increments num_levels_
659
+ if (level == (num_levels_ - 1)) {
660
+ add_empty_top_level_to_completely_full_sketch();
661
+ }
662
+
663
+ const uint32_t raw_beg = levels_[level];
664
+ const uint32_t raw_lim = levels_[level + 1];
665
+ // +2 is OK because we already added a new top level if necessary
666
+ const uint32_t pop_above = levels_[level + 2] - raw_lim;
667
+ const uint32_t raw_pop = raw_lim - raw_beg;
668
+ const bool odd_pop = kll_helper::is_odd(raw_pop);
669
+ const uint32_t adj_beg = odd_pop ? raw_beg + 1 : raw_beg;
670
+ const uint32_t adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
671
+ const uint32_t half_adj_pop = adj_pop / 2;
672
+ const uint32_t destroy_beg = levels_[0];
673
+
674
+ // level zero might not be sorted, so we must sort it if we wish to compact it
675
+ // sort_level_zero() is not used here because of the adjustment for odd number of items
676
+ if ((level == 0) && !is_level_zero_sorted_) {
677
+ std::sort(&items_[adj_beg], &items_[adj_beg + adj_pop], C());
678
+ }
679
+ if (pop_above == 0) {
680
+ kll_helper::randomly_halve_up(items_, adj_beg, adj_pop);
681
+ } else {
682
+ kll_helper::randomly_halve_down(items_, adj_beg, adj_pop);
683
+ kll_helper::merge_sorted_arrays<T, C>(items_, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
684
+ }
685
+ levels_[level + 1] -= half_adj_pop; // adjust boundaries of the level above
686
+ if (odd_pop) {
687
+ levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
688
+ if (levels_[level] != raw_beg) items_[levels_[level]] = std::move(items_[raw_beg]); // namely this leftover guy
689
+ } else {
690
+ levels_[level] = levels_[level + 1]; // the current level is now empty
691
+ }
692
+
693
+ // verify that we freed up half_adj_pop array slots just below the current level
694
+ if (levels_[level] != (raw_beg + half_adj_pop)) throw std::logic_error("compaction error");
695
+
696
+ // finally, we need to shift up the data in the levels below
697
+ // so that the freed-up space can be used by level zero
698
+ if (level > 0) {
699
+ const uint32_t amount = raw_beg - levels_[0];
700
+ std::move_backward(&items_[levels_[0]], &items_[levels_[0] + amount], &items_[levels_[0] + half_adj_pop + amount]);
701
+ for (uint8_t lvl = 0; lvl < level; lvl++) levels_[lvl] += half_adj_pop;
702
+ }
703
+ for (uint32_t i = 0; i < half_adj_pop; i++) items_[i + destroy_beg].~T();
704
+ }
705
+
706
+ template<typename T, typename C, typename S, typename A>
707
+ uint8_t kll_sketch<T, C, S, A>::find_level_to_compact() const {
708
+ uint8_t level = 0;
709
+ while (true) {
710
+ if (level >= num_levels_) throw std::logic_error("capacity calculation error");
711
+ const uint32_t pop = levels_[level + 1] - levels_[level];
712
+ const uint32_t cap = kll_helper::level_capacity(k_, num_levels_, level, m_);
713
+ if (pop >= cap) {
714
+ return level;
715
+ }
716
+ level++;
717
+ }
718
+ }
719
+
720
+ template<typename T, typename C, typename S, typename A>
721
+ void kll_sketch<T, C, S, A>::add_empty_top_level_to_completely_full_sketch() {
722
+ const uint32_t cur_total_cap = levels_[num_levels_];
723
+
724
+ // make sure that we are following a certain growth scheme
725
+ if (levels_[0] != 0) throw std::logic_error("full sketch expected");
726
+ if (items_size_ != cur_total_cap) throw std::logic_error("current capacity mismatch");
727
+
728
+ // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here
729
+ const uint8_t new_levels_size = num_levels_ + 2;
730
+ if (levels_.size() < new_levels_size) {
731
+ levels_.resize(new_levels_size);
732
+ }
733
+
734
+ const uint32_t delta_cap = kll_helper::level_capacity(k_, num_levels_ + 1, 0, m_);
735
+ const uint32_t new_total_cap = cur_total_cap + delta_cap;
736
+
737
+ // move (and shift) the current data into the new buffer
738
+ T* new_buf = A().allocate(new_total_cap);
739
+ kll_helper::move_construct<T>(items_, 0, cur_total_cap, new_buf, delta_cap, true);
740
+ A().deallocate(items_, items_size_);
741
+ items_ = new_buf;
742
+ items_size_ = new_total_cap;
743
+
744
+ // this loop includes the old "extra" index at the top
745
+ for (uint8_t i = 0; i <= num_levels_; i++) {
746
+ levels_[i] += delta_cap;
747
+ }
748
+
749
+ if (levels_[num_levels_] != new_total_cap) throw std::logic_error("new capacity mismatch");
750
+
751
+ num_levels_++;
752
+ levels_[num_levels_] = new_total_cap; // initialize the new "extra" index at the top
753
+ }
754
+
755
+ template<typename T, typename C, typename S, typename A>
756
+ void kll_sketch<T, C, S, A>::sort_level_zero() {
757
+ if (!is_level_zero_sorted_) {
758
+ std::sort(&items_[levels_[0]], &items_[levels_[1]], C());
759
+ is_level_zero_sorted_ = true;
760
+ }
761
+ }
762
+
763
+ template<typename T, typename C, typename S, typename A>
764
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
765
+ sort_level_zero();
766
+ typedef typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>> AllocCalc;
767
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
768
+ new (AllocCalc().allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_),
769
+ [](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); AllocCalc().deallocate(ptr, 1); }
770
+ );
771
+ return quantile_calculator;
772
+ }
773
+
774
+ template<typename T, typename C, typename S, typename A>
775
+ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
776
+ if (is_empty()) return vector_d<A>();
777
+ kll_helper::validate_values<T, C>(split_points, size);
778
+ vector_d<A> buckets(size + 1, 0);
779
+ uint8_t level = 0;
780
+ uint64_t weight = 1;
781
+ while (level < num_levels_) {
782
+ const auto from_index = levels_[level];
783
+ const auto to_index = levels_[level + 1]; // exclusive
784
+ if ((level == 0) && !is_level_zero_sorted_) {
785
+ increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
786
+ } else {
787
+ increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
788
+ }
789
+ level++;
790
+ weight *= 2;
791
+ }
792
+ // normalize and, if CDF, convert to cumulative
793
+ if (is_CDF) {
794
+ double subtotal = 0;
795
+ for (uint32_t i = 0; i <= size; i++) {
796
+ subtotal += buckets[i];
797
+ buckets[i] = subtotal / n_;
798
+ }
799
+ } else {
800
+ for (uint32_t i = 0; i <= size; i++) {
801
+ buckets[i] /= n_;
802
+ }
803
+ }
804
+ return buckets;
805
+ }
806
+
807
+ template<typename T, typename C, typename S, typename A>
808
+ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
809
+ const T* split_points, uint32_t size, double* buckets) const
810
+ {
811
+ for (uint32_t i = from_index; i < to_index; i++) {
812
+ uint32_t j;
813
+ for (j = 0; j < size; j++) {
814
+ if (C()(items_[i], split_points[j])) {
815
+ break;
816
+ }
817
+ }
818
+ buckets[j] += weight;
819
+ }
820
+ }
821
+
822
+ template<typename T, typename C, typename S, typename A>
823
+ void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
824
+ const T* split_points, uint32_t size, double* buckets) const
825
+ {
826
+ uint32_t i = from_index;
827
+ uint32_t j = 0;
828
+ while ((i < to_index) && (j < size)) {
829
+ if (C()(items_[i], split_points[j])) {
830
+ buckets[j] += weight; // this sample goes into this bucket
831
+ i++; // move on to next sample and see whether it also goes into this bucket
832
+ } else {
833
+ j++; // no more samples for this bucket
834
+ }
835
+ }
836
+ // now either i == to_index (we are out of samples), or
837
+ // j == size (we are out of buckets, but there are more samples remaining)
838
+ // we only need to do something in the latter case
839
+ if (j == size) {
840
+ buckets[j] += weight * (to_index - i);
841
+ }
842
+ }
843
+
844
+ template<typename T, typename C, typename S, typename A>
845
+ template<typename O>
846
+ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
847
+ const uint32_t tmp_num_items = get_num_retained() + other.get_num_retained_above_level_zero();
848
+ auto tmp_items_deleter = [tmp_num_items](T* ptr) { A().deallocate(ptr, tmp_num_items); }; // no destructor needed
849
+ const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(A().allocate(tmp_num_items), tmp_items_deleter);
850
+ const uint8_t ub = kll_helper::ub_on_num_levels(final_n);
851
+ const size_t work_levels_size = ub + 2; // ub+1 does not work
852
+ vector_u32<A> worklevels(work_levels_size);
853
+ vector_u32<A> outlevels(work_levels_size);
854
+
855
+ const uint8_t provisional_num_levels = std::max(num_levels_, other.num_levels_);
856
+
857
+ populate_work_arrays(std::forward<O>(other), workbuf.get(), worklevels.data(), provisional_num_levels);
858
+
859
+ const kll_helper::compress_result result = kll_helper::general_compress<T, C>(k_, m_, provisional_num_levels, workbuf.get(),
860
+ worklevels.data(), outlevels.data(), is_level_zero_sorted_);
861
+
862
+ // ub can sometimes be much bigger
863
+ if (result.final_num_levels > ub) throw std::logic_error("merge error");
864
+
865
+ // now we need to transfer the results back into "this" sketch
866
+ if (result.final_capacity != items_size_) {
867
+ A().deallocate(items_, items_size_);
868
+ items_size_ = result.final_capacity;
869
+ items_ = A().allocate(items_size_);
870
+ }
871
+ const uint32_t free_space_at_bottom = result.final_capacity - result.final_num_items;
872
+ kll_helper::move_construct<T>(workbuf.get(), outlevels[0], outlevels[0] + result.final_num_items, items_, free_space_at_bottom, true);
873
+
874
+ const size_t new_levels_size = result.final_num_levels + 1;
875
+ if (levels_.size() < new_levels_size) {
876
+ levels_.resize(new_levels_size);
877
+ }
878
+ const uint32_t offset = free_space_at_bottom - outlevels[0];
879
+ for (uint8_t lvl = 0; lvl < levels_.size(); lvl++) { // includes the "extra" index
880
+ levels_[lvl] = outlevels[lvl] + offset;
881
+ }
882
+ num_levels_ = result.final_num_levels;
883
+ }
884
+
885
+ // this leaves items_ uninitialized (all objects moved out and destroyed)
886
+ // this version copies objects from the incoming sketch
887
+ template<typename T, typename C, typename S, typename A>
888
+ void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
889
+ worklevels[0] = 0;
890
+
891
+ // the level zero data from "other" was already inserted into "this"
892
+ kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
893
+ worklevels[1] = safe_level_size(0);
894
+
895
+ for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
896
+ const uint32_t self_pop = safe_level_size(lvl);
897
+ const uint32_t other_pop = other.safe_level_size(lvl);
898
+ worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
899
+
900
+ if ((self_pop > 0) && (other_pop == 0)) {
901
+ kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
902
+ } else if ((self_pop == 0) && (other_pop > 0)) {
903
+ kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
904
+ } else if ((self_pop > 0) && (other_pop > 0)) {
905
+ kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
906
+ }
907
+ }
908
+ }
909
+
910
+ // this leaves items_ uninitialized (all objects moved out and destroyed)
911
+ // this version moves objects from the incoming sketch
912
+ template<typename T, typename C, typename S, typename A>
913
+ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
914
+ worklevels[0] = 0;
915
+
916
+ // the level zero data from "other" was already inserted into "this"
917
+ kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
918
+ worklevels[1] = safe_level_size(0);
919
+
920
+ for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
921
+ const uint32_t self_pop = safe_level_size(lvl);
922
+ const uint32_t other_pop = other.safe_level_size(lvl);
923
+ worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
924
+
925
+ if ((self_pop > 0) && (other_pop == 0)) {
926
+ kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
927
+ } else if ((self_pop == 0) && (other_pop > 0)) {
928
+ kll_helper::move_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl], false);
929
+ } else if ((self_pop > 0) && (other_pop > 0)) {
930
+ kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
931
+ }
932
+ }
933
+ }
934
+
935
+ template<typename T, typename C, typename S, typename A>
936
+ void kll_sketch<T, C, S, A>::assert_correct_total_weight() const {
937
+ const uint64_t total(kll_helper::sum_the_sample_weights(num_levels_, levels_.data()));
938
+ if (total != n_) {
939
+ throw std::logic_error("Total weight does not match N");
940
+ }
941
+ }
942
+
943
+ template<typename T, typename C, typename S, typename A>
944
+ uint32_t kll_sketch<T, C, S, A>::safe_level_size(uint8_t level) const {
945
+ if (level >= num_levels_) return 0;
946
+ return levels_[level + 1] - levels_[level];
947
+ }
948
+
949
+ template<typename T, typename C, typename S, typename A>
950
+ uint32_t kll_sketch<T, C, S, A>::get_num_retained_above_level_zero() const {
951
+ if (num_levels_ == 1) return 0;
952
+ return levels_[num_levels_] - levels_[1];
953
+ }
954
+
955
+ template<typename T, typename C, typename S, typename A>
956
+ void kll_sketch<T, C, S, A>::check_m(uint8_t m) {
957
+ if (m != DEFAULT_M) {
958
+ throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
959
+ + ": " + std::to_string(m));
960
+ }
961
+ }
962
+
963
+ template<typename T, typename C, typename S, typename A>
964
+ void kll_sketch<T, C, S, A>::check_preamble_ints(uint8_t preamble_ints, uint8_t flags_byte) {
965
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
966
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM));
967
+ if (is_empty || is_single_item) {
968
+ if (preamble_ints != PREAMBLE_INTS_SHORT) {
969
+ throw std::invalid_argument("Possible corruption: preamble ints must be "
970
+ + std::to_string(PREAMBLE_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints));
971
+ }
972
+ } else {
973
+ if (preamble_ints != PREAMBLE_INTS_FULL) {
974
+ throw std::invalid_argument("Possible corruption: preamble ints must be "
975
+ + std::to_string(PREAMBLE_INTS_FULL) + " for a sketch with more than one item: " + std::to_string(preamble_ints));
976
+ }
977
+ }
978
+ }
979
+
980
+ template<typename T, typename C, typename S, typename A>
981
+ void kll_sketch<T, C, S, A>::check_serial_version(uint8_t serial_version) {
982
+ if (serial_version != SERIAL_VERSION_1 && serial_version != SERIAL_VERSION_2) {
983
+ throw std::invalid_argument("Possible corruption: serial version mismatch: expected "
984
+ + std::to_string(SERIAL_VERSION_1) + " or " + std::to_string(SERIAL_VERSION_2)
985
+ + ", got " + std::to_string(serial_version));
986
+ }
987
+ }
988
+
989
+ template<typename T, typename C, typename S, typename A>
990
+ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
991
+ if (family_id != FAMILY) {
992
+ throw std::invalid_argument("Possible corruption: family mismatch: expected "
993
+ + std::to_string(FAMILY) + ", got " + std::to_string(family_id));
994
+ }
995
+ }
996
+
997
+ template <typename T, typename C, typename S, typename A>
998
+ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
999
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
1000
+ os << "### KLL sketch summary:" << std::endl;
1001
+ os << " K : " << k_ << std::endl;
1002
+ os << " min K : " << min_k_ << std::endl;
1003
+ os << " M : " << (unsigned int) m_ << std::endl;
1004
+ os << " N : " << n_ << std::endl;
1005
+ os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
1006
+ os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
1007
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
1008
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
1009
+ os << " Levels : " << (unsigned int) num_levels_ << std::endl;
1010
+ os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
1011
+ os << " Capacity items : " << items_size_ << std::endl;
1012
+ os << " Retained items : " << get_num_retained() << std::endl;
1013
+ os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
1014
+ if (!is_empty()) {
1015
+ os << " Min value : " << *min_value_ << std::endl;
1016
+ os << " Max value : " << *max_value_ << std::endl;
1017
+ }
1018
+ os << "### End sketch summary" << std::endl;
1019
+
1020
+ if (print_levels) {
1021
+ os << "### KLL sketch levels:" << std::endl;
1022
+ os << " index: nominal capacity, actual size" << std::endl;
1023
+ for (uint8_t i = 0; i < num_levels_; i++) {
1024
+ os << " " << (unsigned int) i << ": " << kll_helper::level_capacity(k_, num_levels_, i, m_) << ", " << safe_level_size(i) << std::endl;
1025
+ }
1026
+ os << "### End sketch levels" << std::endl;
1027
+ }
1028
+
1029
+ if (print_items) {
1030
+ os << "### KLL sketch data:" << std::endl;
1031
+ uint8_t level = 0;
1032
+ while (level < num_levels_) {
1033
+ const uint32_t from_index = levels_[level];
1034
+ const uint32_t to_index = levels_[level + 1]; // exclusive
1035
+ if (from_index < to_index) {
1036
+ os << " level " << (unsigned int) level << ":" << std::endl;
1037
+ }
1038
+ for (uint32_t i = from_index; i < to_index; i++) {
1039
+ os << " " << items_[i] << std::endl;
1040
+ }
1041
+ level++;
1042
+ }
1043
+ os << "### End sketch data" << std::endl;
1044
+ }
1045
+ return os.str();
1046
+ }
1047
+
1048
+ template <typename T, typename C, typename S, typename A>
1049
+ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin() const {
1050
+ return kll_sketch<T, C, S, A>::const_iterator(items_, levels_.data(), num_levels_);
1051
+ }
1052
+
1053
+ template <typename T, typename C, typename S, typename A>
1054
+ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
1055
+ return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
1056
+ }
1057
+
1058
+ // kll_sketch::const_iterator implementation
1059
+
1060
+ template<typename T, typename C, typename S, typename A>
1061
+ kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
1062
+ items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
1063
+ {}
1064
+
1065
+ template<typename T, typename C, typename S, typename A>
1066
+ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++() {
1067
+ ++index;
1068
+ if (index == levels[level + 1]) { // go to the next non-empty level
1069
+ do {
1070
+ ++level;
1071
+ weight *= 2;
1072
+ } while (level < num_levels && levels[level] == levels[level + 1]);
1073
+ }
1074
+ return *this;
1075
+ }
1076
+
1077
+ template<typename T, typename C, typename S, typename A>
1078
+ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++(int) {
1079
+ const_iterator tmp(*this);
1080
+ operator++();
1081
+ return tmp;
1082
+ }
1083
+
1084
+ template<typename T, typename C, typename S, typename A>
1085
+ bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
1086
+ if (level != other.level) return false;
1087
+ if (level == num_levels) return true; // end
1088
+ return index == other.index;
1089
+ }
1090
+
1091
+ template<typename T, typename C, typename S, typename A>
1092
+ bool kll_sketch<T, C, S, A>::const_iterator::operator!=(const const_iterator& other) const {
1093
+ return !operator==(other);
1094
+ }
1095
+
1096
+ template<typename T, typename C, typename S, typename A>
1097
+ const std::pair<const T&, const uint64_t> kll_sketch<T, C, S, A>::const_iterator::operator*() const {
1098
+ return std::pair<const T&, const uint64_t>(items[index], weight);
1099
+ }
1100
+
1101
+ template<typename T, typename C, typename S, typename A>
1102
+ class kll_sketch<T, C, S, A>::item_deleter {
1103
+ public:
1104
+ void operator() (T* ptr) const {
1105
+ if (ptr != nullptr) {
1106
+ ptr->~T();
1107
+ A().deallocate(ptr, 1);
1108
+ }
1109
+ }
1110
+ };
1111
+
1112
+ template<typename T, typename C, typename S, typename A>
1113
+ class kll_sketch<T, C, S, A>::items_deleter {
1114
+ public:
1115
+ items_deleter(uint32_t start, uint32_t num): start(start), num(num) {}
1116
+ void operator() (T* ptr) const {
1117
+ if (ptr != nullptr) {
1118
+ for (uint32_t i = start; i < num; ++i) {
1119
+ ptr[i].~T();
1120
+ }
1121
+ A().deallocate(ptr, num);
1122
+ }
1123
+ }
1124
+ private:
1125
+ uint32_t start;
1126
+ uint32_t num;
1127
+ };
1128
+
1129
+ } /* namespace datasketches */
1130
+
1131
+ #endif