datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,43 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _INTARRAYPAIRITERATOR_HPP_
21
+ #define _INTARRAYPAIRITERATOR_HPP_
22
+
23
+ namespace datasketches {
24
+
25
/**
 * Input iterator declared over a raw array of int, yielding uint32_t on dereference.
 *
 * Only the declaration lives here; the member functions are defined in
 * coupon_iterator-internal.hpp, which this header includes after the namespace closes.
 * The template parameter A is not referenced anywhere in this declaration.
 *
 * NOTE(review): std::iterator is deprecated as of C++17. The base is kept so the
 * class's base type and member typedefs stay exactly as callers see them today.
 */
template<typename A>
class coupon_iterator: public std::iterator<std::input_iterator_tag, uint32_t> {
public:
  /**
   * @param array pointer to the int array to iterate over; stored as-is
   *        (presumably not owned by the iterator -- confirm against the definition)
   * @param array_size value stored as the array size (presumably the element count -- confirm)
   * @param index starting position
   * @param all flag stored with the iterator; presumably selects whether every
   *        slot is visited, including empty ones -- confirm against the definition
   */
  coupon_iterator(const int* array, size_t array_size, size_t index, bool all);

  /// Pre-increment; returns a reference to a coupon_iterator.
  coupon_iterator& operator++();

  /// Inequality comparison against another iterator of the same type.
  bool operator!=(const coupon_iterator& other) const;

  /// Dereference; returns a uint32_t by value.
  uint32_t operator*() const;

private:
  const int* array;
  size_t array_size;
  size_t index;
  bool all;
};
38
+
39
+ }
40
+
41
+ #include "coupon_iterator-internal.hpp"
42
+
43
+ #endif /* _INTARRAYPAIRITERATOR_HPP_ */
@@ -0,0 +1,669 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _HLL_HPP_
21
+ #define _HLL_HPP_
22
+
23
+ #include "common_defs.hpp"
24
+ #include "HllUtil.hpp"
25
+
26
+ #include <memory>
27
+ #include <iostream>
28
+ #include <vector>
29
+
30
+ namespace datasketches {
31
+
32
+ /**
33
+ * This is a high performance implementation of Philippe Flajolet's HLL sketch but with
34
+ * significantly improved error behavior. If the ONLY use case for sketching is counting
35
+ * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
36
+ * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
37
+ * 16 times smaller than the Theta sketch family for the same accuracy.
38
+ *
39
+ * <p>This implementation offers three different types of HLL sketch, each with different
40
+ * trade-offs with accuracy, space and performance. These types are specified with the
41
+ * #target_hll_type parameter.
42
+ *
43
+ * <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
44
+ * distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
45
+ * The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
46
+ * where <i>K</i> is the number of buckets or slots for the sketch.
47
+ *
48
+ * <p>During warmup, when the sketch has only received a small number of unique items
49
+ * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
50
+ * algorithms with significantly better accuracy.
51
+ *
52
+ * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
53
+ * created by the user, the sketch will perform all of its updates and internal phase transitions
54
+ * in that object, which can actually reside either on-heap or off-heap based on how it is
55
+ * configured. In large systems that must update and merge many millions of sketches, having the
56
+ * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
57
+ * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
58
+ * delays.
59
+ *
60
+ * author Jon Malkin
61
+ * author Lee Rhodes
62
+ * author Kevin Lang
63
+ */
64
+
65
+
66
+ /**
67
+ * Specifies the target type of HLL sketch to be created. It is a target in that the actual
68
+ * allocation of the HLL array is deferred until sufficient number of items have been received by
69
+ * the warm-up phases.
70
+ *
71
+ * <p>These three target types are isomorphic representations of the same underlying HLL algorithm.
72
+ * Thus, given the same value of <i>lg_config_k</i> and the same input, all three HLL target types
73
+ * will produce identical estimates and have identical error distributions.</p>
74
+ *
75
+ * <p>The memory (and also the serialization) of the sketch during this early warmup phase starts
76
+ * out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required
77
+ * until the full HLL array is allocated. This transition point occurs at about 10% of K for
78
+ * sketches where lg_config_k is &gt; 8.</p>
79
+ *
80
+ * <ul>
81
+ * <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
82
+ * fastest in terms of update time, but has the largest storage footprint of about
83
+ * <i>K</i> bytes.</li>
84
+ *
85
+ * <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest
86
+ * in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
87
+ *
88
+ * <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
89
+ * the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
90
+ * For the values of <i>lg_config_k &gt; 13</i> (<i>K</i> = 8192),
91
+ * this additional array adds about 3% to the overall storage. It is generally the slowest in
92
+ * terms of update time, but has the smallest storage footprint of about
93
+ * <i>K/2 * 1.03</i> bytes.</li>
94
+ * </ul>
95
+ */
96
enum target_hll_type {
  // Enumerator values are spelled out; they match the implicit 0, 1, 2 numbering.

  /// 4 bits per entry (most compact, size may vary)
  HLL_4 = 0,

  /// 6 bits per entry (fixed size)
  HLL_6 = 1,

  /// 8 bits per entry (fastest, fixed size)
  HLL_8 = 2
};
101
+
102
// Forward declarations of class templates that are only named in this header.
template<class A> class HllSketchImpl;
template<class A> class hll_union_alloc;

// The allocator type A rebound so that it allocates uint8_t.
template<class A>
using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;

// A std::vector of uint8_t that uses the rebound allocator above.
template<class A>
using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
110
+
111
+ template<typename A = std::allocator<char> >
112
+ class hll_sketch_alloc final {
113
+ public:
114
+ /**
115
+ * Constructs a new HLL sketch.
116
+ * @param lg_config_k Sketch can hold 2^lg_config_k rows
117
+ * @param tgt_type The HLL mode to use, if/when the sketch reaches that state
118
+ * @param start_full_size Indicates whether to start in HLL mode,
119
+ * keeping memory use constant (if HLL_6 or HLL_8) at the cost of
120
+ * starting out using much more memory
121
+ */
122
+ explicit hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false);
123
+
124
+ /**
125
+ * Copy constructor
126
+ */
127
+ hll_sketch_alloc(const hll_sketch_alloc<A>& that);
128
+
129
+ /**
130
+ * Copy constructor to a new target type
131
+ */
132
+ hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
133
+
134
+ /**
135
+ * Move constructor
136
+ */
137
+ hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
138
+
139
+ /**
140
+ * Reconstructs a sketch from a serialized image on a stream.
141
+ * @param is An input stream with a binary image of a sketch
142
+ */
143
+ static hll_sketch_alloc deserialize(std::istream& is);
144
+
145
+ /**
146
+ * Reconstructs a sketch from a serialized image in a byte array.
147
+ * @param bytes An input array with a binary image of a sketch
148
+ * @param len Length of the input array, in bytes
149
+ */
150
+ static hll_sketch_alloc deserialize(const void* bytes, size_t len);
151
+
152
+ //! Class destructor
153
+ virtual ~hll_sketch_alloc();
154
+
155
+ //! Copy assignment operator
156
+ hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other);
157
+
158
+ //! Move assignment operator
159
+ hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other);
160
+
161
+ /**
162
+ * Resets the sketch to an empty state in coupon collection mode.
163
+ * Does not re-use existing internal objects.
164
+ */
165
+ void reset();
166
+
167
+ typedef vector_u8<A> vector_bytes; // alias for users
168
+
169
+ /**
170
+ * Serializes the sketch to a byte array, compacting data structures
171
+ * where feasible to eliminate unused storage in the serialized image.
172
+ * @param header_size_bytes Allows for PostgreSQL integration
173
+ */
174
+ vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
175
+
176
+ /**
177
+ * Serializes the sketch to a byte array, retaining all internal
178
+ * data structures in their current form.
179
+ */
180
+ vector_bytes serialize_updatable() const;
181
+
182
+ /**
183
+ * Serializes the sketch to an ostream, compacting data structures
184
+ * where feasible to eliminate unused storage in the serialized image.
185
+ * @param os std::ostream to use for output.
186
+ */
187
+ void serialize_compact(std::ostream& os) const;
188
+
189
+ /**
190
+ * Serializes the sketch to an ostream, retaining all internal data
191
+ * structures in their current form.
192
+ * @param os std::ostream to use for output.
193
+ */
194
+ void serialize_updatable(std::ostream& os) const;
195
+
196
+ /**
197
+ * Human readable summary with optional detail
198
+ * @param summary if true, output the sketch summary
199
+ * @param detail if true, output the internal data array
200
+ * @param aux_detail if true, output the internal Aux array, if it exists.
201
+ * @param all if true, outputs all entries including empty ones
202
+ * @return human readable string with optional detail.
203
+ */
204
+ string<A> to_string(bool summary = true,
205
+ bool detail = false,
206
+ bool aux_detail = false,
207
+ bool all = false) const;
208
+
209
+ /**
210
+ * Present the given std::string as a potential unique item.
211
+ * The string is converted to a byte array using UTF8 encoding.
212
+ * If the string is null or empty no update attempt is made and the method returns.
213
+ * @param datum The given string.
214
+ */
215
+ void update(const std::string& datum);
216
+
217
+ /**
218
+ * Present the given unsigned 64-bit integer as a potential unique item.
219
+ * @param datum The given integer.
220
+ */
221
+ void update(uint64_t datum);
222
+
223
+ /**
224
+ * Present the given unsigned 32-bit integer as a potential unique item.
225
+ * @param datum The given integer.
226
+ */
227
+ void update(uint32_t datum);
228
+
229
+ /**
230
+ * Present the given unsigned 16-bit integer as a potential unique item.
231
+ * @param datum The given integer.
232
+ */
233
+ void update(uint16_t datum);
234
+
235
+ /**
236
+ * Present the given unsigned 8-bit integer as a potential unique item.
237
+ * @param datum The given integer.
238
+ */
239
+ void update(uint8_t datum);
240
+
241
+ /**
242
+ * Present the given signed 64-bit integer as a potential unique item.
243
+ * @param datum The given integer.
244
+ */
245
+ void update(int64_t datum);
246
+
247
+ /**
248
+ * Present the given signed 32-bit integer as a potential unique item.
249
+ * @param datum The given integer.
250
+ */
251
+ void update(int32_t datum);
252
+
253
+ /**
254
+ * Present the given signed 16-bit integer as a potential unique item.
255
+ * @param datum The given integer.
256
+ */
257
+ void update(int16_t datum);
258
+
259
+ /**
260
+ * Present the given signed 8-bit integer as a potential unique item.
261
+ * @param datum The given integer.
262
+ */
263
+ void update(int8_t datum);
264
+
265
+ /**
266
+ * Present the given 64-bit floating point value as a potential unique item.
267
+ * @param datum The given double.
268
+ */
269
+ void update(double datum);
270
+
271
+ /**
272
+ * Present the given 32-bit floating point value as a potential unique item.
273
+ * @param datum The given float.
274
+ */
275
+ void update(float datum);
276
+
277
+ /**
278
+ * Present the given data array as a potential unique item.
279
+ * @param data The given array.
280
+ * @param length_bytes The array length in bytes.
281
+ */
282
+ void update(const void* data, size_t length_bytes);
283
+
284
+ /**
285
+ * Returns the current cardinality estimate
286
+ * @return the cardinality estimate
287
+ */
288
+ double get_estimate() const;
289
+
290
+ /**
291
+ * This is less accurate than the get_estimate() method
292
+ * and is automatically used when the sketch has gone through
293
+ * union operations where the more accurate HIP estimator cannot
294
+ * be used.
295
+ *
296
+ * This is made public only for error characterization software
297
+ * that exists in separate packages and is not intended for normal
298
+ * use.
299
+ * @return the composite cardinality estimate
300
+ */
301
+ double get_composite_estimate() const;
302
+
303
+ /**
304
+ * Returns the approximate lower error bound given the specified
305
+ * number of standard deviations.
306
+ * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
307
+ * @return The approximate lower bound.
308
+ */
309
+ double get_lower_bound(int num_std_dev) const;
310
+
311
+ /**
312
+ * Returns the approximate upper error bound given the specified
313
+ * number of standard deviations.
314
+ * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
315
+ * @return The approximate upper bound.
316
+ */
317
+ double get_upper_bound(int num_std_dev) const;
318
+
319
+ /**
320
+ * Returns sketch's configured lg_k value.
321
+ * @return Configured lg_k value.
322
+ */
323
+ int get_lg_config_k() const;
324
+
325
+ /**
326
+ * Returns the sketch's target HLL mode (from #target_hll_type).
327
+ * @return The sketch's target HLL mode.
328
+ */
329
+ target_hll_type get_target_type() const;
330
+
331
+ /**
332
+ * Indicates if the sketch is currently stored compacted.
333
+ * @return True if the sketch is stored in compact form.
334
+ */
335
+ bool is_compact() const;
336
+
337
+ /**
338
+ * Indicates if the sketch is currently empty.
339
+ * @return True if the sketch is empty.
340
+ */
341
+ bool is_empty() const;
342
+
343
+ /**
344
+ * Returns the size of the sketch serialized in compact form.
345
+ * @return Size of the sketch serialized in compact form, in bytes.
346
+ */
347
+ int get_compact_serialization_bytes() const;
348
+
349
+ /**
350
+ * Returns the size of the sketch serialized without compaction.
351
+ * @return Size of the sketch serialized without compaction, in bytes.
352
+ */
353
+ int get_updatable_serialization_bytes() const;
354
+
355
+ /**
356
+ * Returns the maximum size in bytes that this sketch can grow to
357
+ * given lg_config_k. However, for the HLL_4 sketch type, this
358
+ * value can be exceeded in extremely rare cases. If exceeded, it
359
+ * will be larger by only a few percent.
360
+ *
361
+ * @param lg_k The Log2 of K for the target HLL sketch. This value must be
362
+ * between 4 and 21 inclusively.
363
+ * @param tgt_type the desired Hll type
364
+ * @return the maximum size in bytes that this sketch can grow to.
365
+ */
366
+ static int get_max_updatable_serialization_bytes(int lg_k, target_hll_type tgt_type);
367
+
368
+ /**
369
+ * Gets the current (approximate) Relative Error (RE) asymptotic values given several
370
+ * parameters. This is used primarily for testing.
371
+ * @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
372
+ * @param unioned set true if the sketch is the result of a union operation.
373
+ * @param lg_config_k the configured value for the sketch.
374
+ * @param num_std_dev the given number of Standard Deviations. This must be an integer between
375
+ * 1 and 3, inclusive.
376
+ * @return the current (approximate) RelativeError
377
+ */
378
+ static double get_rel_err(bool upper_bound, bool unioned,
379
+ int lg_config_k, int num_std_dev);
380
+
381
+ private:
382
+ explicit hll_sketch_alloc(HllSketchImpl<A>* that);
383
+
384
+ void coupon_update(int coupon);
385
+
386
+ std::string type_as_string() const;
387
+ std::string mode_as_string() const;
388
+
389
+ hll_mode get_current_mode() const;
390
+ int get_serialization_version() const;
391
+ bool is_out_of_order_flag() const;
392
+ bool is_estimation_mode() const;
393
+
394
+ typedef typename std::allocator_traits<A>::template rebind_alloc<hll_sketch_alloc> AllocHllSketch;
395
+
396
+ HllSketchImpl<A>* sketch_impl;
397
+ friend hll_union_alloc<A>;
398
+ };
399
+
400
+ /**
401
+ * This performs union operations for HLL sketches. This union operator is configured with a
402
+ * <i>lg_max_k</i> instead of the normal <i>lg_config_k</i>.
403
+ *
404
+ * <p>This union operator does permit the unioning of sketches with different values of
405
+ * <i>lg_config_k</i>. The user should be aware that the resulting accuracy of a sketch returned
406
+ * at the end of the unioning process will be a function of the smallest of <i>lg_max_k</i> and
407
+ * <i>lg_config_k</i> that the union operator has seen.
408
+ *
409
+ * <p>This union operator also permits unioning of any of the three different target hll_sketch
410
+ * types.
411
+ *
412
+ * <p>Although the API for this union operator parallels many of the methods of the
413
+ * <i>hll_sketch</i>, the behavior of the union operator has some fundamental differences.
414
+ *
415
+ * <p>First, the user cannot specify the #target_hll_type as an input parameter.
416
+ * Instead, it is specified for the sketch returned with #get_result(target_hll_type).
417
+ *
418
+ * <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
419
+ * change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
420
+ *
421
+ * author Jon Malkin
422
+ * author Lee Rhodes
423
+ * author Kevin Lang
424
+ */
425
+
426
+ template<typename A = std::allocator<char> >
427
+ class hll_union_alloc {
428
+ public:
429
+ /**
430
+ * Construct an hll_union operator with the given maximum log2 of k.
431
+ * @param lg_max_k The maximum size, in log2, of k. The value must
432
+ * be between 7 and 21, inclusive.
433
+ */
434
+ explicit hll_union_alloc(int lg_max_k);
435
+
436
+ /**
437
+ * Returns the current cardinality estimate.
438
+ * @return the cardinality estimate
439
+ */
440
+ double get_estimate() const;
441
+
442
+ /**
443
+ * Returns the composite cardinality estimate. This is less accurate than the get_estimate() method
444
+ * and is automatically used when the union has gone through
445
+ * union operations where the more accurate HIP estimator cannot
446
+ * be used.
447
+ *
448
+ * This is made public only for error characterization software
449
+ * that exists in separate packages and is not intended for normal
450
+ * use.
451
+ * @return the composite cardinality estimate
452
+ */
453
+ double get_composite_estimate() const;
454
+
455
+ /**
456
+ * Returns the approximate lower error bound given the specified
457
+ * number of standard deviations.
458
+ * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
459
+ * @return The approximate lower bound.
460
+ */
461
+ double get_lower_bound(int num_std_dev) const;
462
+
463
+ /**
464
+ * Returns the approximate upper error bound given the specified
465
+ * number of standard deviations.
466
+ * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
467
+ * @return The approximate upper bound.
468
+ */
469
+ double get_upper_bound(int num_std_dev) const;
470
+
471
+ /**
472
+ * Returns the size of the union serialized in compact form.
473
+ * @return Size of the union serialized in compact form, in bytes.
474
+ */
475
+ int get_compact_serialization_bytes() const;
476
+
477
+ /**
478
+ * Returns the size of the union serialized without compaction.
479
+ * @return Size of the union serialized without compaction, in bytes.
480
+ */
481
+ int get_updatable_serialization_bytes() const;
482
+
483
+ /**
484
+ * Returns the union's configured lg_k value.
485
+ * @return Configured lg_k value.
486
+ */
487
+ int get_lg_config_k() const;
488
+
489
+ /**
490
+ * Returns the union's target HLL mode (from #target_hll_type).
491
+ * @return The union's target HLL mode.
492
+ */
493
+ target_hll_type get_target_type() const;
494
+
495
+ /**
496
+ * Indicates if the union is currently stored compacted.
497
+ * @return True if the union is stored in compact form.
498
+ */
499
+ bool is_compact() const;
500
+
501
+ /**
502
+ * Indicates if the union is currently empty.
503
+ * @return True if the union is empty.
504
+ */
505
+ bool is_empty() const;
506
+
507
+ /**
508
+ * Resets the union to an empty state in coupon collection mode.
509
+ * Does not re-use existing internal objects.
510
+ */
511
+ void reset();
512
+
513
+ /**
514
+ * Returns the result of this union operator as a sketch of the specified
515
+ * #target_hll_type.
516
+ * @param tgt_type The target_hll_type enum value of the desired result (Default: HLL_4)
517
+ * @return The result of this union with the specified target_hll_type
518
+ */
519
+ hll_sketch_alloc<A> get_result(target_hll_type tgt_type = HLL_4) const;
520
+
521
+ /**
522
+ * Update this union operator with the given sketch.
523
+ * @param sketch The given sketch.
524
+ */
525
+ void update(const hll_sketch_alloc<A>& sketch);
526
+
527
+ /**
528
+ * Update this union operator with the given temporary sketch.
529
+ * @param sketch The given sketch, passed as an rvalue reference.
530
+ */
531
+ void update(hll_sketch_alloc<A>&& sketch);
532
+
533
+ /**
534
+ * Present the given std::string as a potential unique item.
535
+ * The string is converted to a byte array using UTF8 encoding.
536
+ * If the string is empty no update attempt is made and the method returns.
537
+ * @param datum The given string.
538
+ */
539
+ void update(const std::string& datum);
540
+
541
+ /**
542
+ * Present the given unsigned 64-bit integer as a potential unique item.
543
+ * @param datum The given integer.
544
+ */
545
+ void update(uint64_t datum);
546
+
547
+ /**
548
+ * Present the given unsigned 32-bit integer as a potential unique item.
549
+ * @param datum The given integer.
550
+ */
551
+ void update(uint32_t datum);
552
+
553
+ /**
554
+ * Present the given unsigned 16-bit integer as a potential unique item.
555
+ * @param datum The given integer.
556
+ */
557
+ void update(uint16_t datum);
558
+
559
+ /**
560
+ * Present the given unsigned 8-bit integer as a potential unique item.
561
+ * @param datum The given integer.
562
+ */
563
+ void update(uint8_t datum);
564
+
565
+ /**
566
+ * Present the given signed 64-bit integer as a potential unique item.
567
+ * @param datum The given integer.
568
+ */
569
+ void update(int64_t datum);
570
+
571
+ /**
572
+ * Present the given signed 32-bit integer as a potential unique item.
573
+ * @param datum The given integer.
574
+ */
575
+ void update(int32_t datum);
576
+
577
+ /**
578
+ * Present the given signed 16-bit integer as a potential unique item.
579
+ * @param datum The given integer.
580
+ */
581
+ void update(int16_t datum);
582
+
583
+ /**
584
+ * Present the given signed 8-bit integer as a potential unique item.
585
+ * @param datum The given integer.
586
+ */
587
+ void update(int8_t datum);
588
+
589
+ /**
590
+ * Present the given 64-bit floating point value as a potential unique item.
591
+ * @param datum The given double.
592
+ */
593
+ void update(double datum);
594
+
595
+ /**
596
+ * Present the given 32-bit floating point value as a potential unique item.
597
+ * @param datum The given float.
598
+ */
599
+ void update(float datum);
600
+
601
+ /**
602
+ * Present the given data array as a potential unique item.
603
+ * @param data The given array.
604
+ * @param length_bytes The array length in bytes.
605
+ */
606
+ void update(const void* data, size_t length_bytes);
607
+
608
+ /**
609
+ * Returns the maximum size in bytes that this union operator can grow to given a lg_k.
610
+ *
611
+ * @param lg_k The maximum Log2 of k for this union operator. This value must be
612
+ * between 4 and 21 inclusively. NOTE(review): the constructor documents 7 to 21 for lg_max_k; confirm which range applies.
613
+ * @return the maximum size in bytes that this union operator can grow to.
614
+ */
615
+ static int get_max_serialization_bytes(int lg_k);
616
+
617
+ /**
618
+ * Gets the current (approximate) Relative Error (RE) asymptotic values given several
619
+ * parameters. This is used primarily for testing.
620
+ * @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
621
+ * @param unioned set true if the sketch is the result of a union operation.
622
+ * @param lg_config_k the configured value for the sketch.
623
+ * @param num_std_dev the given number of Standard Deviations. This must be an integer between
624
+ * 1 and 3, inclusive.
625
+ * @return the current (approximate) RelativeError
626
+ */
627
+ static double get_rel_err(bool upper_bound, bool unioned,
628
+ int lg_config_k, int num_std_dev);
629
+
630
+ private:
631
+
632
+ /**
633
+ * Union the given incoming sketch into this union. This method examines the state of
634
+ * the current internal gadget and the incoming sketch and determines the optimal way to
635
+ * perform the union. This may involve swapping, down-sampling, transforming, and / or
636
+ * copying one of the arguments and may completely replace the internals of the union.
637
+ *
638
+ * @param sketch the given incoming sketch, which may not be modified.
639
+ * @param lg_max_k the maximum value of log2 K for this union.
640
+ */
641
+ inline void union_impl(const hll_sketch_alloc<A>& sketch, int lg_max_k);
642
+
643
+ static HllSketchImpl<A>* copy_or_downsample(const HllSketchImpl<A>* src_impl, int tgt_lg_k);
644
+
645
+ void coupon_update(int coupon);
646
+
647
+ hll_mode get_current_mode() const;
648
+ int get_serialization_version() const;
649
+ bool is_out_of_order_flag() const;
650
+ bool is_estimation_mode() const;
651
+
652
+ // calls couponUpdate on sketch, freeing the old sketch upon changes in hll_mode
653
+ static HllSketchImpl<A>* leak_free_coupon_update(HllSketchImpl<A>* impl, int coupon);
654
+
655
+ int lg_max_k; // presumably the lg_max_k given to the constructor; confirm in hll.private.hpp
656
+ hll_sketch_alloc<A> gadget; // the "internal gadget" sketch that union_impl() examines and may replace
657
+ };
658
+
659
+ /// hll_sketch: shorthand for hll_sketch_alloc instantiated with its default allocator
660
+ using hll_sketch = hll_sketch_alloc<>;
661
+
662
+ /// hll_union: shorthand for hll_union_alloc instantiated with its default allocator
663
+ using hll_union = hll_union_alloc<>;
664
+
665
+ } // namespace datasketches
666
+
667
+ #include "hll.private.hpp"
668
+
669
+ #endif // _HLL_HPP_