datasketches 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,1131 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_SKETCH_IMPL_HPP_
21
+ #define KLL_SKETCH_IMPL_HPP_
22
+
23
+ #include <iostream>
24
+ #include <iomanip>
25
+ #include <sstream>
26
+
27
+ #include "memory_operations.hpp"
28
+ #include "kll_helper.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename T, typename C, typename S, typename A>
33
+ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k):
34
+ k_(k),
35
+ m_(DEFAULT_M),
36
+ min_k_(k),
37
+ n_(0),
38
+ num_levels_(1),
39
+ levels_(2),
40
+ items_(nullptr),
41
+ items_size_(k_),
42
+ min_value_(nullptr),
43
+ max_value_(nullptr),
44
+ is_level_zero_sorted_(false)
45
+ {
46
+ if (k < MIN_K || k > MAX_K) {
47
+ throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= " + std::to_string(MAX_K) + ": " + std::to_string(k));
48
+ }
49
+ levels_[0] = levels_[1] = k;
50
+ items_ = A().allocate(items_size_);
51
+ }
52
+
53
+ template<typename T, typename C, typename S, typename A>
54
+ kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch& other):
55
+ k_(other.k_),
56
+ m_(other.m_),
57
+ min_k_(other.min_k_),
58
+ n_(other.n_),
59
+ num_levels_(other.num_levels_),
60
+ levels_(other.levels_),
61
+ items_(nullptr),
62
+ items_size_(other.items_size_),
63
+ min_value_(nullptr),
64
+ max_value_(nullptr),
65
+ is_level_zero_sorted_(other.is_level_zero_sorted_)
66
+ {
67
+ items_ = A().allocate(items_size_);
68
+ std::copy(&other.items_[levels_[0]], &other.items_[levels_[num_levels_]], &items_[levels_[0]]);
69
+ if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
70
+ if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
71
+ }
72
+
73
+ template<typename T, typename C, typename S, typename A>
74
+ kll_sketch<T, C, S, A>::kll_sketch(kll_sketch&& other) noexcept:
75
+ k_(other.k_),
76
+ m_(other.m_),
77
+ min_k_(other.min_k_),
78
+ n_(other.n_),
79
+ num_levels_(other.num_levels_),
80
+ levels_(std::move(other.levels_)),
81
+ items_(other.items_),
82
+ items_size_(other.items_size_),
83
+ min_value_(other.min_value_),
84
+ max_value_(other.max_value_),
85
+ is_level_zero_sorted_(other.is_level_zero_sorted_)
86
+ {
87
+ other.items_ = nullptr;
88
+ other.min_value_ = nullptr;
89
+ other.max_value_ = nullptr;
90
+ }
91
+
92
+ template<typename T, typename C, typename S, typename A>
93
+ kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
94
+ kll_sketch copy(other);
95
+ std::swap(k_, copy.k_);
96
+ std::swap(m_, copy.m_);
97
+ std::swap(min_k_, copy.min_k_);
98
+ std::swap(n_, copy.n_);
99
+ std::swap(num_levels_, copy.num_levels_);
100
+ std::swap(levels_, copy.levels_);
101
+ std::swap(items_, copy.items_);
102
+ std::swap(items_size_, copy.items_size_);
103
+ std::swap(min_value_, copy.min_value_);
104
+ std::swap(max_value_, copy.max_value_);
105
+ std::swap(is_level_zero_sorted_, copy.is_level_zero_sorted_);
106
+ return *this;
107
+ }
108
+
109
+ template<typename T, typename C, typename S, typename A>
110
+ kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(kll_sketch&& other) {
111
+ std::swap(k_, other.k_);
112
+ std::swap(m_, other.m_);
113
+ std::swap(min_k_, other.min_k_);
114
+ std::swap(n_, other.n_);
115
+ std::swap(num_levels_, other.num_levels_);
116
+ std::swap(levels_, other.levels_);
117
+ std::swap(items_, other.items_);
118
+ std::swap(items_size_, other.items_size_);
119
+ std::swap(min_value_, other.min_value_);
120
+ std::swap(max_value_, other.max_value_);
121
+ std::swap(is_level_zero_sorted_, other.is_level_zero_sorted_);
122
+ return *this;
123
+ }
124
+
125
+ template<typename T, typename C, typename S, typename A>
126
+ kll_sketch<T, C, S, A>::~kll_sketch() {
127
+ if (items_ != nullptr) {
128
+ const uint32_t begin = levels_[0];
129
+ const uint32_t end = levels_[num_levels_];
130
+ for (uint32_t i = begin; i < end; i++) items_[i].~T();
131
+ A().deallocate(items_, items_size_);
132
+ }
133
+ if (min_value_ != nullptr) {
134
+ min_value_->~T();
135
+ A().deallocate(min_value_, 1);
136
+ }
137
+ if (max_value_ != nullptr) {
138
+ max_value_->~T();
139
+ A().deallocate(max_value_, 1);
140
+ }
141
+ }
142
+
143
+ template<typename T, typename C, typename S, typename A>
144
+ void kll_sketch<T, C, S, A>::update(const T& value) {
145
+ if (!check_update_value(value)) { return; }
146
+ update_min_max(value);
147
+ const uint32_t index = internal_update();
148
+ new (&items_[index]) T(value);
149
+ }
150
+
151
+ template<typename T, typename C, typename S, typename A>
152
+ void kll_sketch<T, C, S, A>::update(T&& value) {
153
+ if (!check_update_value(value)) { return; }
154
+ update_min_max(value);
155
+ const uint32_t index = internal_update();
156
+ new (&items_[index]) T(std::move(value));
157
+ }
158
+
159
+ template<typename T, typename C, typename S, typename A>
160
+ void kll_sketch<T, C, S, A>::update_min_max(const T& value) {
161
+ if (is_empty()) {
162
+ min_value_ = new (A().allocate(1)) T(value);
163
+ max_value_ = new (A().allocate(1)) T(value);
164
+ } else {
165
+ if (C()(value, *min_value_)) *min_value_ = value;
166
+ if (C()(*max_value_, value)) *max_value_ = value;
167
+ }
168
+ }
169
+
170
+ template<typename T, typename C, typename S, typename A>
171
+ uint32_t kll_sketch<T, C, S, A>::internal_update() {
172
+ if (levels_[0] == 0) compress_while_updating();
173
+ n_++;
174
+ is_level_zero_sorted_ = false;
175
+ return --levels_[0];
176
+ }
177
+
178
+ template<typename T, typename C, typename S, typename A>
179
+ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
180
+ if (other.is_empty()) return;
181
+ if (m_ != other.m_) {
182
+ throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
183
+ }
184
+ if (is_empty()) {
185
+ min_value_ = new (A().allocate(1)) T(*other.min_value_);
186
+ max_value_ = new (A().allocate(1)) T(*other.max_value_);
187
+ } else {
188
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
189
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
190
+ }
191
+ const uint64_t final_n = n_ + other.n_;
192
+ for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
193
+ const uint32_t index = internal_update();
194
+ new (&items_[index]) T(other.items_[i]);
195
+ }
196
+ if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
197
+ n_ = final_n;
198
+ if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
199
+ assert_correct_total_weight();
200
+ }
201
+
202
+ template<typename T, typename C, typename S, typename A>
203
+ void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
204
+ if (other.is_empty()) return;
205
+ if (m_ != other.m_) {
206
+ throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
207
+ }
208
+ if (is_empty()) {
209
+ min_value_ = new (A().allocate(1)) T(std::move(*other.min_value_));
210
+ max_value_ = new (A().allocate(1)) T(std::move(*other.max_value_));
211
+ } else {
212
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
213
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
214
+ }
215
+ const uint64_t final_n = n_ + other.n_;
216
+ for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
217
+ const uint32_t index = internal_update();
218
+ new (&items_[index]) T(std::move(other.items_[i]));
219
+ }
220
+ if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
221
+ n_ = final_n;
222
+ if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
223
+ assert_correct_total_weight();
224
+ }
225
+
226
+ template<typename T, typename C, typename S, typename A>
227
+ bool kll_sketch<T, C, S, A>::is_empty() const {
228
+ return n_ == 0;
229
+ }
230
+
231
+ template<typename T, typename C, typename S, typename A>
232
+ uint64_t kll_sketch<T, C, S, A>::get_n() const {
233
+ return n_;
234
+ }
235
+
236
+ template<typename T, typename C, typename S, typename A>
237
+ uint32_t kll_sketch<T, C, S, A>::get_num_retained() const {
238
+ return levels_[num_levels_] - levels_[0];
239
+ }
240
+
241
+ template<typename T, typename C, typename S, typename A>
242
+ bool kll_sketch<T, C, S, A>::is_estimation_mode() const {
243
+ return num_levels_ > 1;
244
+ }
245
+
246
+ template<typename T, typename C, typename S, typename A>
247
+ T kll_sketch<T, C, S, A>::get_min_value() const {
248
+ if (is_empty()) return get_invalid_value();
249
+ return *min_value_;
250
+ }
251
+
252
+ template<typename T, typename C, typename S, typename A>
253
+ T kll_sketch<T, C, S, A>::get_max_value() const {
254
+ if (is_empty()) return get_invalid_value();
255
+ return *max_value_;
256
+ }
257
+
258
+ template<typename T, typename C, typename S, typename A>
259
+ T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
260
+ if (is_empty()) return get_invalid_value();
261
+ if (fraction == 0.0) return *min_value_;
262
+ if (fraction == 1.0) return *max_value_;
263
+ if ((fraction < 0.0) || (fraction > 1.0)) {
264
+ throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
265
+ }
266
+ // has side effect of sorting level zero if needed
267
+ auto quantile_calculator(const_cast<kll_sketch*>(this)->get_quantile_calculator());
268
+ return quantile_calculator->get_quantile(fraction);
269
+ }
270
+
271
+ template<typename T, typename C, typename S, typename A>
272
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
273
+ std::vector<T, A> quantiles;
274
+ quantiles.reserve(size);
275
+ if (is_empty()) return quantiles;
276
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
277
+ quantiles.reserve(size);
278
+ for (uint32_t i = 0; i < size; i++) {
279
+ const double fraction = fractions[i];
280
+ if ((fraction < 0.0) || (fraction > 1.0)) {
281
+ throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
282
+ }
283
+ if (fraction == 0.0) quantiles.push_back(*min_value_);
284
+ else if (fraction == 1.0) quantiles.push_back(*max_value_);
285
+ else {
286
+ if (!quantile_calculator) {
287
+ // has side effect of sorting level zero if needed
288
+ quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
289
+ }
290
+ quantiles.push_back(quantile_calculator->get_quantile(fraction));
291
+ }
292
+ }
293
+ return quantiles;
294
+ }
295
+
296
+ template<typename T, typename C, typename S, typename A>
297
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
298
+ if (is_empty()) return std::vector<T, A>();
299
+ if (num == 0) {
300
+ throw std::invalid_argument("num must be > 0");
301
+ }
302
+ std::vector<double> fractions(num);
303
+ fractions[0] = 0.0;
304
+ for (size_t i = 1; i < num; i++) {
305
+ fractions[i] = static_cast<double>(i) / (num - 1);
306
+ }
307
+ if (num > 1) {
308
+ fractions[num - 1] = 1.0;
309
+ }
310
+ return get_quantiles(fractions.data(), num);
311
+ }
312
+
313
+ template<typename T, typename C, typename S, typename A>
314
+ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
315
+ if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
316
+ uint8_t level = 0;
317
+ uint64_t weight = 1;
318
+ uint64_t total = 0;
319
+ while (level < num_levels_) {
320
+ const auto from_index(levels_[level]);
321
+ const auto to_index(levels_[level + 1]); // exclusive
322
+ for (uint32_t i = from_index; i < to_index; i++) {
323
+ if (C()(items_[i], value)) {
324
+ total += weight;
325
+ } else if ((level > 0) || is_level_zero_sorted_) {
326
+ break; // levels above 0 are sorted, no point comparing further
327
+ }
328
+ }
329
+ level++;
330
+ weight *= 2;
331
+ }
332
+ return (double) total / n_;
333
+ }
334
+
335
+ template<typename T, typename C, typename S, typename A>
336
+ vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
337
+ return get_PMF_or_CDF(split_points, size, false);
338
+ }
339
+
340
+ template<typename T, typename C, typename S, typename A>
341
+ vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
342
+ return get_PMF_or_CDF(split_points, size, true);
343
+ }
344
+
345
+ template<typename T, typename C, typename S, typename A>
346
+ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
347
+ return get_normalized_rank_error(min_k_, pmf);
348
+ }
349
+
350
+ // implementation for fixed-size arithmetic types (integral and floating point)
351
+ template<typename T, typename C, typename S, typename A>
352
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
353
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
354
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
355
+ if (num_levels_ == 1 && get_num_retained() == 1) {
356
+ return DATA_START_SINGLE_ITEM + sizeof(TT);
357
+ }
358
+ // the last integer in the levels_ array is not serialized because it can be derived
359
+ return DATA_START + num_levels_ * sizeof(uint32_t) + (get_num_retained() + 2) * sizeof(TT);
360
+ }
361
+
362
+ // implementation for all other types
363
+ template<typename T, typename C, typename S, typename A>
364
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
365
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
366
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
367
+ if (num_levels_ == 1 && get_num_retained() == 1) {
368
+ return DATA_START_SINGLE_ITEM + S().size_of_item(items_[levels_[0]]);
369
+ }
370
+ // the last integer in the levels_ array is not serialized because it can be derived
371
+ size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
372
+ size += S().size_of_item(*min_value_);
373
+ size += S().size_of_item(*max_value_);
374
+ for (auto& it: *this) size += S().size_of_item(it.first);
375
+ return size;
376
+ }
377
+
378
+ template<typename T, typename C, typename S, typename A>
379
+ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
380
+ const bool is_single_item = n_ == 1;
381
+ const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
382
+ os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
383
+ const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
384
+ os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
385
+ const uint8_t family(FAMILY);
386
+ os.write(reinterpret_cast<const char*>(&family), sizeof(family));
387
+ const uint8_t flags_byte(
388
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
389
+ | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
390
+ | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
391
+ );
392
+ os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
393
+ os.write((char*)&k_, sizeof(k_));
394
+ os.write((char*)&m_, sizeof(m_));
395
+ const uint8_t unused = 0;
396
+ os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
397
+ if (is_empty()) return;
398
+ if (!is_single_item) {
399
+ os.write((char*)&n_, sizeof(n_));
400
+ os.write((char*)&min_k_, sizeof(min_k_));
401
+ os.write((char*)&num_levels_, sizeof(num_levels_));
402
+ os.write((char*)&unused, sizeof(unused));
403
+ os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
404
+ S().serialize(os, min_value_, 1);
405
+ S().serialize(os, max_value_, 1);
406
+ }
407
+ S().serialize(os, &items_[levels_[0]], get_num_retained());
408
+ }
409
+
410
+ template<typename T, typename C, typename S, typename A>
411
+ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
412
+ const bool is_single_item = n_ == 1;
413
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
414
+ vector_u8<A> bytes(size);
415
+ uint8_t* ptr = bytes.data() + header_size_bytes;
416
+ const uint8_t* end_ptr = ptr + size;
417
+ const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
418
+ ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
419
+ const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
420
+ ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
421
+ const uint8_t family(FAMILY);
422
+ ptr += copy_to_mem(&family, ptr, sizeof(family));
423
+ const uint8_t flags_byte(
424
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
425
+ | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
426
+ | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
427
+ );
428
+ ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
429
+ ptr += copy_to_mem(&k_, ptr, sizeof(k_));
430
+ ptr += copy_to_mem(&m_, ptr, sizeof(m_));
431
+ const uint8_t unused = 0;
432
+ ptr += copy_to_mem(&unused, ptr, sizeof(unused));
433
+ if (!is_empty()) {
434
+ if (!is_single_item) {
435
+ ptr += copy_to_mem(&n_, ptr, sizeof(n_));
436
+ ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
437
+ ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
438
+ ptr += copy_to_mem(&unused, ptr, sizeof(unused));
439
+ ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
440
+ ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
441
+ ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
442
+ }
443
+ const size_t bytes_remaining = end_ptr - ptr;
444
+ ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
445
+ }
446
+ const size_t delta = ptr - bytes.data();
447
+ if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
448
+ return bytes;
449
+ }
450
+
451
+ template<typename T, typename C, typename S, typename A>
452
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
453
+ uint8_t preamble_ints;
454
+ is.read((char*)&preamble_ints, sizeof(preamble_ints));
455
+ uint8_t serial_version;
456
+ is.read((char*)&serial_version, sizeof(serial_version));
457
+ uint8_t family_id;
458
+ is.read((char*)&family_id, sizeof(family_id));
459
+ uint8_t flags_byte;
460
+ is.read((char*)&flags_byte, sizeof(flags_byte));
461
+ uint16_t k;
462
+ is.read((char*)&k, sizeof(k));
463
+ uint8_t m;
464
+ is.read((char*)&m, sizeof(m));
465
+ uint8_t unused;
466
+ is.read((char*)&unused, sizeof(unused));
467
+
468
+ check_m(m);
469
+ check_preamble_ints(preamble_ints, flags_byte);
470
+ check_serial_version(serial_version);
471
+ check_family_id(family_id);
472
+
473
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
474
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
475
+ if (is_empty) return kll_sketch(k);
476
+
477
+ uint64_t n;
478
+ uint16_t min_k;
479
+ uint8_t num_levels;
480
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
481
+ if (is_single_item) {
482
+ n = 1;
483
+ min_k = k;
484
+ num_levels = 1;
485
+ } else {
486
+ is.read((char*)&n, sizeof(n_));
487
+ is.read((char*)&min_k, sizeof(min_k_));
488
+ is.read((char*)&num_levels, sizeof(num_levels));
489
+ is.read((char*)&unused, sizeof(unused));
490
+ }
491
+ vector_u32<A> levels(num_levels + 1);
492
+ const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
493
+ if (is_single_item) {
494
+ levels[0] = capacity - 1;
495
+ } else {
496
+ // the last integer in levels_ is not serialized because it can be derived
497
+ is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
498
+ }
499
+ levels[num_levels] = capacity;
500
+ auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
501
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
502
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
503
+ std::unique_ptr<T, item_deleter> min_value;
504
+ std::unique_ptr<T, item_deleter> max_value;
505
+ if (!is_single_item) {
506
+ S().deserialize(is, min_value_buffer.get(), 1);
507
+ // serde call did not throw, repackage with destrtuctor
508
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
509
+ S().deserialize(is, max_value_buffer.get(), 1);
510
+ // serde call did not throw, repackage with destrtuctor
511
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
512
+ }
513
+ auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
514
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
515
+ const auto num_items = levels[num_levels] - levels[0];
516
+ S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
517
+ // serde call did not throw, repackage with destrtuctors
518
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
519
+ const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
520
+ if (is_single_item) {
521
+ new (min_value_buffer.get()) T(items.get()[levels[0]]);
522
+ // copy did not throw, repackage with destrtuctor
523
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
524
+ new (max_value_buffer.get()) T(items.get()[levels[0]]);
525
+ // copy did not throw, repackage with destrtuctor
526
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
527
+ }
528
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
529
+ return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
530
+ std::move(min_value), std::move(max_value), is_level_zero_sorted);
531
+ }
532
+
533
+ template<typename T, typename C, typename S, typename A>
534
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size) {
535
+ ensure_minimum_memory(size, 8);
536
+ const char* ptr = static_cast<const char*>(bytes);
537
+ uint8_t preamble_ints;
538
+ ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
539
+ uint8_t serial_version;
540
+ ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
541
+ uint8_t family_id;
542
+ ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
543
+ uint8_t flags_byte;
544
+ ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
545
+ uint16_t k;
546
+ ptr += copy_from_mem(ptr, &k, sizeof(k));
547
+ uint8_t m;
548
+ ptr += copy_from_mem(ptr, &m, sizeof(m));
549
+ ptr++; // skip unused byte
550
+
551
+ check_m(m);
552
+ check_preamble_ints(preamble_ints, flags_byte);
553
+ check_serial_version(serial_version);
554
+ check_family_id(family_id);
555
+ ensure_minimum_memory(size, 1 << preamble_ints);
556
+
557
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
558
+ if (is_empty) return kll_sketch<T, C, S, A>(k);
559
+
560
+ uint64_t n;
561
+ uint16_t min_k;
562
+ uint8_t num_levels;
563
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
564
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
565
+ if (is_single_item) {
566
+ n = 1;
567
+ min_k = k;
568
+ num_levels = 1;
569
+ } else {
570
+ ptr += copy_from_mem(ptr, &n, sizeof(n));
571
+ ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
572
+ ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
573
+ ptr++; // skip unused byte
574
+ }
575
+ vector_u32<A> levels(num_levels + 1);
576
+ const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
577
+ if (is_single_item) {
578
+ levels[0] = capacity - 1;
579
+ } else {
580
+ // the last integer in levels_ is not serialized because it can be derived
581
+ ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
582
+ }
583
+ levels[num_levels] = capacity;
584
+ auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
585
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
586
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
587
+ std::unique_ptr<T, item_deleter> min_value;
588
+ std::unique_ptr<T, item_deleter> max_value;
589
+ if (!is_single_item) {
590
+ ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
591
+ // serde call did not throw, repackage with destrtuctor
592
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
593
+ ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
594
+ // serde call did not throw, repackage with destrtuctor
595
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
596
+ }
597
+ auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
598
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
599
+ const auto num_items = levels[num_levels] - levels[0];
600
+ ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
601
+ // serde call did not throw, repackage with destrtuctors
602
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
603
+ const size_t delta = ptr - static_cast<const char*>(bytes);
604
+ if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
605
+ const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
606
+ if (is_single_item) {
607
+ new (min_value_buffer.get()) T(items.get()[levels[0]]);
608
+ // copy did not throw, repackage with destrtuctor
609
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
610
+ new (max_value_buffer.get()) T(items.get()[levels[0]]);
611
+ // copy did not throw, repackage with destrtuctor
612
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
613
+ }
614
+ return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
615
+ std::move(min_value), std::move(max_value), is_level_zero_sorted);
616
+ }
617
+
618
+ /*
619
+ * Gets the normalized rank error given k and pmf.
620
+ * k - the configuration parameter
621
+ * pmf - if true, returns the "double-sided" normalized rank error for the get_PMF() function.
622
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
623
+ * Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials
624
+ */
625
+ template<typename T, typename C, typename S, typename A>
626
+ double kll_sketch<T, C, S, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
627
+ return pmf
628
+ ? 2.446 / pow(k, 0.9433)
629
+ : 2.296 / pow(k, 0.9723);
630
+ }
631
+
632
+ // for deserialization
633
+ template<typename T, typename C, typename S, typename A>
634
+ kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
635
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
636
+ std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
637
+ k_(k),
638
+ m_(DEFAULT_M),
639
+ min_k_(min_k),
640
+ n_(n),
641
+ num_levels_(num_levels),
642
+ levels_(std::move(levels)),
643
+ items_(items.release()),
644
+ items_size_(items_size),
645
+ min_value_(min_value.release()),
646
+ max_value_(max_value.release()),
647
+ is_level_zero_sorted_(is_level_zero_sorted)
648
+ {}
649
+
650
+ // The following code is only valid in the special case of exactly reaching capacity while updating.
651
+ // It cannot be used while merging, while reducing k, or anything else.
652
+ template<typename T, typename C, typename S, typename A>
653
+ void kll_sketch<T, C, S, A>::compress_while_updating(void) {
654
+ const uint8_t level = find_level_to_compact();
655
+
656
+ // It is important to add the new top level right here. Be aware that this operation
657
+ // grows the buffer and shifts the data and also the boundaries of the data and grows the
658
+ // levels array and increments num_levels_
659
+ if (level == (num_levels_ - 1)) {
660
+ add_empty_top_level_to_completely_full_sketch();
661
+ }
662
+
663
+ const uint32_t raw_beg = levels_[level];
664
+ const uint32_t raw_lim = levels_[level + 1];
665
+ // +2 is OK because we already added a new top level if necessary
666
+ const uint32_t pop_above = levels_[level + 2] - raw_lim;
667
+ const uint32_t raw_pop = raw_lim - raw_beg;
668
+ const bool odd_pop = kll_helper::is_odd(raw_pop);
669
+ const uint32_t adj_beg = odd_pop ? raw_beg + 1 : raw_beg;
670
+ const uint32_t adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
671
+ const uint32_t half_adj_pop = adj_pop / 2;
672
+ const uint32_t destroy_beg = levels_[0];
673
+
674
+ // level zero might not be sorted, so we must sort it if we wish to compact it
675
+ // sort_level_zero() is not used here because of the adjustment for odd number of items
676
+ if ((level == 0) && !is_level_zero_sorted_) {
677
+ std::sort(&items_[adj_beg], &items_[adj_beg + adj_pop], C());
678
+ }
679
+ if (pop_above == 0) {
680
+ kll_helper::randomly_halve_up(items_, adj_beg, adj_pop);
681
+ } else {
682
+ kll_helper::randomly_halve_down(items_, adj_beg, adj_pop);
683
+ kll_helper::merge_sorted_arrays<T, C>(items_, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
684
+ }
685
+ levels_[level + 1] -= half_adj_pop; // adjust boundaries of the level above
686
+ if (odd_pop) {
687
+ levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
688
+ if (levels_[level] != raw_beg) items_[levels_[level]] = std::move(items_[raw_beg]); // namely this leftover guy
689
+ } else {
690
+ levels_[level] = levels_[level + 1]; // the current level is now empty
691
+ }
692
+
693
+ // verify that we freed up half_adj_pop array slots just below the current level
694
+ if (levels_[level] != (raw_beg + half_adj_pop)) throw std::logic_error("compaction error");
695
+
696
+ // finally, we need to shift up the data in the levels below
697
+ // so that the freed-up space can be used by level zero
698
+ if (level > 0) {
699
+ const uint32_t amount = raw_beg - levels_[0];
700
+ std::move_backward(&items_[levels_[0]], &items_[levels_[0] + amount], &items_[levels_[0] + half_adj_pop + amount]);
701
+ for (uint8_t lvl = 0; lvl < level; lvl++) levels_[lvl] += half_adj_pop;
702
+ }
703
+ for (uint32_t i = 0; i < half_adj_pop; i++) items_[i + destroy_beg].~T();
704
+ }
705
+
706
+ template<typename T, typename C, typename S, typename A>
707
+ uint8_t kll_sketch<T, C, S, A>::find_level_to_compact() const {
708
+ uint8_t level = 0;
709
+ while (true) {
710
+ if (level >= num_levels_) throw std::logic_error("capacity calculation error");
711
+ const uint32_t pop = levels_[level + 1] - levels_[level];
712
+ const uint32_t cap = kll_helper::level_capacity(k_, num_levels_, level, m_);
713
+ if (pop >= cap) {
714
+ return level;
715
+ }
716
+ level++;
717
+ }
718
+ }
719
+
720
+ template<typename T, typename C, typename S, typename A>
721
+ void kll_sketch<T, C, S, A>::add_empty_top_level_to_completely_full_sketch() {
722
+ const uint32_t cur_total_cap = levels_[num_levels_];
723
+
724
+ // make sure that we are following a certain growth scheme
725
+ if (levels_[0] != 0) throw std::logic_error("full sketch expected");
726
+ if (items_size_ != cur_total_cap) throw std::logic_error("current capacity mismatch");
727
+
728
+ // note that merging MIGHT over-grow levels_, in which case we might not have to grow it here
729
+ const uint8_t new_levels_size = num_levels_ + 2;
730
+ if (levels_.size() < new_levels_size) {
731
+ levels_.resize(new_levels_size);
732
+ }
733
+
734
+ const uint32_t delta_cap = kll_helper::level_capacity(k_, num_levels_ + 1, 0, m_);
735
+ const uint32_t new_total_cap = cur_total_cap + delta_cap;
736
+
737
+ // move (and shift) the current data into the new buffer
738
+ T* new_buf = A().allocate(new_total_cap);
739
+ kll_helper::move_construct<T>(items_, 0, cur_total_cap, new_buf, delta_cap, true);
740
+ A().deallocate(items_, items_size_);
741
+ items_ = new_buf;
742
+ items_size_ = new_total_cap;
743
+
744
+ // this loop includes the old "extra" index at the top
745
+ for (uint8_t i = 0; i <= num_levels_; i++) {
746
+ levels_[i] += delta_cap;
747
+ }
748
+
749
+ if (levels_[num_levels_] != new_total_cap) throw std::logic_error("new capacity mismatch");
750
+
751
+ num_levels_++;
752
+ levels_[num_levels_] = new_total_cap; // initialize the new "extra" index at the top
753
+ }
754
+
755
+ template<typename T, typename C, typename S, typename A>
756
+ void kll_sketch<T, C, S, A>::sort_level_zero() {
757
+ if (!is_level_zero_sorted_) {
758
+ std::sort(&items_[levels_[0]], &items_[levels_[1]], C());
759
+ is_level_zero_sorted_ = true;
760
+ }
761
+ }
762
+
763
+ template<typename T, typename C, typename S, typename A>
764
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
765
+ sort_level_zero();
766
+ typedef typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>> AllocCalc;
767
+ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
768
+ new (AllocCalc().allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_),
769
+ [](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); AllocCalc().deallocate(ptr, 1); }
770
+ );
771
+ return quantile_calculator;
772
+ }
773
+
774
+ template<typename T, typename C, typename S, typename A>
775
+ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
776
+ if (is_empty()) return vector_d<A>();
777
+ kll_helper::validate_values<T, C>(split_points, size);
778
+ vector_d<A> buckets(size + 1, 0);
779
+ uint8_t level = 0;
780
+ uint64_t weight = 1;
781
+ while (level < num_levels_) {
782
+ const auto from_index = levels_[level];
783
+ const auto to_index = levels_[level + 1]; // exclusive
784
+ if ((level == 0) && !is_level_zero_sorted_) {
785
+ increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
786
+ } else {
787
+ increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
788
+ }
789
+ level++;
790
+ weight *= 2;
791
+ }
792
+ // normalize and, if CDF, convert to cumulative
793
+ if (is_CDF) {
794
+ double subtotal = 0;
795
+ for (uint32_t i = 0; i <= size; i++) {
796
+ subtotal += buckets[i];
797
+ buckets[i] = subtotal / n_;
798
+ }
799
+ } else {
800
+ for (uint32_t i = 0; i <= size; i++) {
801
+ buckets[i] /= n_;
802
+ }
803
+ }
804
+ return buckets;
805
+ }
806
+
807
+ template<typename T, typename C, typename S, typename A>
808
+ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
809
+ const T* split_points, uint32_t size, double* buckets) const
810
+ {
811
+ for (uint32_t i = from_index; i < to_index; i++) {
812
+ uint32_t j;
813
+ for (j = 0; j < size; j++) {
814
+ if (C()(items_[i], split_points[j])) {
815
+ break;
816
+ }
817
+ }
818
+ buckets[j] += weight;
819
+ }
820
+ }
821
+
822
+ template<typename T, typename C, typename S, typename A>
823
+ void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
824
+ const T* split_points, uint32_t size, double* buckets) const
825
+ {
826
+ uint32_t i = from_index;
827
+ uint32_t j = 0;
828
+ while ((i < to_index) && (j < size)) {
829
+ if (C()(items_[i], split_points[j])) {
830
+ buckets[j] += weight; // this sample goes into this bucket
831
+ i++; // move on to next sample and see whether it also goes into this bucket
832
+ } else {
833
+ j++; // no more samples for this bucket
834
+ }
835
+ }
836
+ // now either i == to_index (we are out of samples), or
837
+ // j == size (we are out of buckets, but there are more samples remaining)
838
+ // we only need to do something in the latter case
839
+ if (j == size) {
840
+ buckets[j] += weight * (to_index - i);
841
+ }
842
+ }
843
+
844
+ template<typename T, typename C, typename S, typename A>
845
+ template<typename O>
846
+ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
847
+ const uint32_t tmp_num_items = get_num_retained() + other.get_num_retained_above_level_zero();
848
+ auto tmp_items_deleter = [tmp_num_items](T* ptr) { A().deallocate(ptr, tmp_num_items); }; // no destructor needed
849
+ const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(A().allocate(tmp_num_items), tmp_items_deleter);
850
+ const uint8_t ub = kll_helper::ub_on_num_levels(final_n);
851
+ const size_t work_levels_size = ub + 2; // ub+1 does not work
852
+ vector_u32<A> worklevels(work_levels_size);
853
+ vector_u32<A> outlevels(work_levels_size);
854
+
855
+ const uint8_t provisional_num_levels = std::max(num_levels_, other.num_levels_);
856
+
857
+ populate_work_arrays(std::forward<O>(other), workbuf.get(), worklevels.data(), provisional_num_levels);
858
+
859
+ const kll_helper::compress_result result = kll_helper::general_compress<T, C>(k_, m_, provisional_num_levels, workbuf.get(),
860
+ worklevels.data(), outlevels.data(), is_level_zero_sorted_);
861
+
862
+ // ub can sometimes be much bigger
863
+ if (result.final_num_levels > ub) throw std::logic_error("merge error");
864
+
865
+ // now we need to transfer the results back into "this" sketch
866
+ if (result.final_capacity != items_size_) {
867
+ A().deallocate(items_, items_size_);
868
+ items_size_ = result.final_capacity;
869
+ items_ = A().allocate(items_size_);
870
+ }
871
+ const uint32_t free_space_at_bottom = result.final_capacity - result.final_num_items;
872
+ kll_helper::move_construct<T>(workbuf.get(), outlevels[0], outlevels[0] + result.final_num_items, items_, free_space_at_bottom, true);
873
+
874
+ const size_t new_levels_size = result.final_num_levels + 1;
875
+ if (levels_.size() < new_levels_size) {
876
+ levels_.resize(new_levels_size);
877
+ }
878
+ const uint32_t offset = free_space_at_bottom - outlevels[0];
879
+ for (uint8_t lvl = 0; lvl < levels_.size(); lvl++) { // includes the "extra" index
880
+ levels_[lvl] = outlevels[lvl] + offset;
881
+ }
882
+ num_levels_ = result.final_num_levels;
883
+ }
884
+
885
+ // this leaves items_ uninitialized (all objects moved out and destroyed)
886
+ // this version copies objects from the incoming sketch
887
+ template<typename T, typename C, typename S, typename A>
888
+ void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
889
+ worklevels[0] = 0;
890
+
891
+ // the level zero data from "other" was already inserted into "this"
892
+ kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
893
+ worklevels[1] = safe_level_size(0);
894
+
895
+ for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
896
+ const uint32_t self_pop = safe_level_size(lvl);
897
+ const uint32_t other_pop = other.safe_level_size(lvl);
898
+ worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
899
+
900
+ if ((self_pop > 0) && (other_pop == 0)) {
901
+ kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
902
+ } else if ((self_pop == 0) && (other_pop > 0)) {
903
+ kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
904
+ } else if ((self_pop > 0) && (other_pop > 0)) {
905
+ kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
906
+ }
907
+ }
908
+ }
909
+
910
+ // this leaves items_ uninitialized (all objects moved out and destroyed)
911
+ // this version moves objects from the incoming sketch
912
+ template<typename T, typename C, typename S, typename A>
913
+ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
914
+ worklevels[0] = 0;
915
+
916
+ // the level zero data from "other" was already inserted into "this"
917
+ kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
918
+ worklevels[1] = safe_level_size(0);
919
+
920
+ for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
921
+ const uint32_t self_pop = safe_level_size(lvl);
922
+ const uint32_t other_pop = other.safe_level_size(lvl);
923
+ worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
924
+
925
+ if ((self_pop > 0) && (other_pop == 0)) {
926
+ kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
927
+ } else if ((self_pop == 0) && (other_pop > 0)) {
928
+ kll_helper::move_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl], false);
929
+ } else if ((self_pop > 0) && (other_pop > 0)) {
930
+ kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
931
+ }
932
+ }
933
+ }
934
+
935
+ template<typename T, typename C, typename S, typename A>
936
+ void kll_sketch<T, C, S, A>::assert_correct_total_weight() const {
937
+ const uint64_t total(kll_helper::sum_the_sample_weights(num_levels_, levels_.data()));
938
+ if (total != n_) {
939
+ throw std::logic_error("Total weight does not match N");
940
+ }
941
+ }
942
+
943
+ template<typename T, typename C, typename S, typename A>
944
+ uint32_t kll_sketch<T, C, S, A>::safe_level_size(uint8_t level) const {
945
+ if (level >= num_levels_) return 0;
946
+ return levels_[level + 1] - levels_[level];
947
+ }
948
+
949
+ template<typename T, typename C, typename S, typename A>
950
+ uint32_t kll_sketch<T, C, S, A>::get_num_retained_above_level_zero() const {
951
+ if (num_levels_ == 1) return 0;
952
+ return levels_[num_levels_] - levels_[1];
953
+ }
954
+
955
+ template<typename T, typename C, typename S, typename A>
956
+ void kll_sketch<T, C, S, A>::check_m(uint8_t m) {
957
+ if (m != DEFAULT_M) {
958
+ throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
959
+ + ": " + std::to_string(m));
960
+ }
961
+ }
962
+
963
+ template<typename T, typename C, typename S, typename A>
964
+ void kll_sketch<T, C, S, A>::check_preamble_ints(uint8_t preamble_ints, uint8_t flags_byte) {
965
+ const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
966
+ const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM));
967
+ if (is_empty || is_single_item) {
968
+ if (preamble_ints != PREAMBLE_INTS_SHORT) {
969
+ throw std::invalid_argument("Possible corruption: preamble ints must be "
970
+ + std::to_string(PREAMBLE_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints));
971
+ }
972
+ } else {
973
+ if (preamble_ints != PREAMBLE_INTS_FULL) {
974
+ throw std::invalid_argument("Possible corruption: preamble ints must be "
975
+ + std::to_string(PREAMBLE_INTS_FULL) + " for a sketch with more than one item: " + std::to_string(preamble_ints));
976
+ }
977
+ }
978
+ }
979
+
980
+ template<typename T, typename C, typename S, typename A>
981
+ void kll_sketch<T, C, S, A>::check_serial_version(uint8_t serial_version) {
982
+ if (serial_version != SERIAL_VERSION_1 && serial_version != SERIAL_VERSION_2) {
983
+ throw std::invalid_argument("Possible corruption: serial version mismatch: expected "
984
+ + std::to_string(SERIAL_VERSION_1) + " or " + std::to_string(SERIAL_VERSION_2)
985
+ + ", got " + std::to_string(serial_version));
986
+ }
987
+ }
988
+
989
+ template<typename T, typename C, typename S, typename A>
990
+ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
991
+ if (family_id != FAMILY) {
992
+ throw std::invalid_argument("Possible corruption: family mismatch: expected "
993
+ + std::to_string(FAMILY) + ", got " + std::to_string(family_id));
994
+ }
995
+ }
996
+
997
+ template <typename T, typename C, typename S, typename A>
998
+ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
999
+ std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
1000
+ os << "### KLL sketch summary:" << std::endl;
1001
+ os << " K : " << k_ << std::endl;
1002
+ os << " min K : " << min_k_ << std::endl;
1003
+ os << " M : " << (unsigned int) m_ << std::endl;
1004
+ os << " N : " << n_ << std::endl;
1005
+ os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
1006
+ os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
1007
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
1008
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
1009
+ os << " Levels : " << (unsigned int) num_levels_ << std::endl;
1010
+ os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
1011
+ os << " Capacity items : " << items_size_ << std::endl;
1012
+ os << " Retained items : " << get_num_retained() << std::endl;
1013
+ os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
1014
+ if (!is_empty()) {
1015
+ os << " Min value : " << *min_value_ << std::endl;
1016
+ os << " Max value : " << *max_value_ << std::endl;
1017
+ }
1018
+ os << "### End sketch summary" << std::endl;
1019
+
1020
+ if (print_levels) {
1021
+ os << "### KLL sketch levels:" << std::endl;
1022
+ os << " index: nominal capacity, actual size" << std::endl;
1023
+ for (uint8_t i = 0; i < num_levels_; i++) {
1024
+ os << " " << (unsigned int) i << ": " << kll_helper::level_capacity(k_, num_levels_, i, m_) << ", " << safe_level_size(i) << std::endl;
1025
+ }
1026
+ os << "### End sketch levels" << std::endl;
1027
+ }
1028
+
1029
+ if (print_items) {
1030
+ os << "### KLL sketch data:" << std::endl;
1031
+ uint8_t level = 0;
1032
+ while (level < num_levels_) {
1033
+ const uint32_t from_index = levels_[level];
1034
+ const uint32_t to_index = levels_[level + 1]; // exclusive
1035
+ if (from_index < to_index) {
1036
+ os << " level " << (unsigned int) level << ":" << std::endl;
1037
+ }
1038
+ for (uint32_t i = from_index; i < to_index; i++) {
1039
+ os << " " << items_[i] << std::endl;
1040
+ }
1041
+ level++;
1042
+ }
1043
+ os << "### End sketch data" << std::endl;
1044
+ }
1045
+ return os.str();
1046
+ }
1047
+
1048
+ template <typename T, typename C, typename S, typename A>
1049
+ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin() const {
1050
+ return kll_sketch<T, C, S, A>::const_iterator(items_, levels_.data(), num_levels_);
1051
+ }
1052
+
1053
+ template <typename T, typename C, typename S, typename A>
1054
+ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
1055
+ return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
1056
+ }
1057
+
1058
+ // kll_sketch::const_iterator implementation
1059
+
1060
+ template<typename T, typename C, typename S, typename A>
1061
+ kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
1062
+ items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
1063
+ {}
1064
+
1065
+ template<typename T, typename C, typename S, typename A>
1066
+ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++() {
1067
+ ++index;
1068
+ if (index == levels[level + 1]) { // go to the next non-empty level
1069
+ do {
1070
+ ++level;
1071
+ weight *= 2;
1072
+ } while (level < num_levels && levels[level] == levels[level + 1]);
1073
+ }
1074
+ return *this;
1075
+ }
1076
+
1077
+ template<typename T, typename C, typename S, typename A>
1078
+ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++(int) {
1079
+ const_iterator tmp(*this);
1080
+ operator++();
1081
+ return tmp;
1082
+ }
1083
+
1084
+ template<typename T, typename C, typename S, typename A>
1085
+ bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
1086
+ if (level != other.level) return false;
1087
+ if (level == num_levels) return true; // end
1088
+ return index == other.index;
1089
+ }
1090
+
1091
+ template<typename T, typename C, typename S, typename A>
1092
+ bool kll_sketch<T, C, S, A>::const_iterator::operator!=(const const_iterator& other) const {
1093
+ return !operator==(other);
1094
+ }
1095
+
1096
+ template<typename T, typename C, typename S, typename A>
1097
+ const std::pair<const T&, const uint64_t> kll_sketch<T, C, S, A>::const_iterator::operator*() const {
1098
+ return std::pair<const T&, const uint64_t>(items[index], weight);
1099
+ }
1100
+
1101
+ template<typename T, typename C, typename S, typename A>
1102
+ class kll_sketch<T, C, S, A>::item_deleter {
1103
+ public:
1104
+ void operator() (T* ptr) const {
1105
+ if (ptr != nullptr) {
1106
+ ptr->~T();
1107
+ A().deallocate(ptr, 1);
1108
+ }
1109
+ }
1110
+ };
1111
+
1112
+ template<typename T, typename C, typename S, typename A>
1113
+ class kll_sketch<T, C, S, A>::items_deleter {
1114
+ public:
1115
+ items_deleter(uint32_t start, uint32_t num): start(start), num(num) {}
1116
+ void operator() (T* ptr) const {
1117
+ if (ptr != nullptr) {
1118
+ for (uint32_t i = start; i < num; ++i) {
1119
+ ptr[i].~T();
1120
+ }
1121
+ A().deallocate(ptr, num);
1122
+ }
1123
+ }
1124
+ private:
1125
+ uint32_t start;
1126
+ uint32_t num;
1127
+ };
1128
+
1129
+ } /* namespace datasketches */
1130
+
1131
+ #endif