datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,44 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+
21
+ #include <catch.hpp>
22
+
23
+ #include "CubicInterpolation.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ TEST_CASE("hll tables: interpolation exception", "[hll_tables]") {
28
+ REQUIRE_THROWS_AS(CubicInterpolation<>::usingXAndYTables(-1.0), std::invalid_argument);
29
+
30
+ REQUIRE_THROWS_AS(CubicInterpolation<>::usingXAndYTables(1e12), std::invalid_argument);
31
+ }
32
+
33
+ TEST_CASE("hll tables: check corner case", "[hll_tables]") {
34
+ int len = 10;
35
+ double xArr[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
36
+ double yArr[] = {2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0};
37
+ double x = xArr[len - 1];
38
+ double y = CubicInterpolation<>::usingXAndYTables(xArr, yArr, len, x);
39
+ double yExp = yArr[len - 1];
40
+ REQUIRE(y == yExp);
41
+ }
42
+
43
+ } /* namespace datasketches */
44
+
@@ -0,0 +1,168 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch.hpp>
21
+ #include <fstream>
22
+
23
+ #include "hll.hpp"
24
+
25
+ namespace datasketches {
26
+
27
// Numbers of distinct values fed to each sketch by the round-trip test below;
// ten entries, roughly half-decade steps from 1 to 30000.
static const int nArr[10] = {
  1, 3, 10, 30, 100,
  300, 1000, 3000, 10000, 30000
};
28
+
29
+ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
30
+ hll_sketch sk(9, HLL_8);
31
+ for (int i = 0; i < 1024; ++i) {
32
+ sk.update(i);
33
+ }
34
+
35
+ std::stringstream ss1;
36
+ sk.serialize_updatable(ss1);
37
+ auto ser1 = sk.serialize_updatable();
38
+
39
+ std::stringstream ss;
40
+ sk.serialize_updatable(ss);
41
+ std::string str = ss.str();
42
+
43
+ hll_sketch sk2 = hll_sketch::deserialize(ser1.data(), ser1.size());
44
+ auto ser2 = sk.serialize_updatable();
45
+
46
+ REQUIRE(ser1.size() == ser2.size());
47
+ int len = ser1.size();
48
+ uint8_t* b1 = ser1.data();
49
+ uint8_t* b2 = ser2.data();
50
+
51
+ for (int i = 0; i < len; ++i) {
52
+ REQUIRE(b2[i] == b1[i]);
53
+ }
54
+ }
55
+
56
+ TEST_CASE("hll to/from byte array: deserialize from java", "[hll_byte_array]") {
57
+ std::string inputPath;
58
+ #ifdef TEST_BINARY_INPUT_PATH
59
+ inputPath = TEST_BINARY_INPUT_PATH;
60
+ #else
61
+ inputPath = "test/";
62
+ #endif
63
+
64
+ std::ifstream ifs;
65
+ ifs.open(inputPath + "list_from_java.sk", std::ios::binary);
66
+ hll_sketch sk = hll_sketch::deserialize(ifs);
67
+ REQUIRE(sk.is_empty() == false);
68
+ REQUIRE(sk.get_lg_config_k() == 8);
69
+ REQUIRE(sk.get_lower_bound(1) == 7.0);
70
+ REQUIRE(sk.get_estimate() == Approx(7.0).margin(1e-6));
71
+ REQUIRE(sk.get_upper_bound(1) == Approx(7.000350).margin(1e-5));
72
+ ifs.close();
73
+
74
+ ifs.open(inputPath + "compact_set_from_java.sk", std::ios::binary);
75
+ sk = hll_sketch::deserialize(ifs);
76
+ REQUIRE(sk.is_empty() == false);
77
+ REQUIRE(sk.get_lg_config_k() == 8);
78
+ REQUIRE(sk.get_lower_bound(1) == 24.0);
79
+ REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
80
+ REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));
81
+ ifs.close();
82
+
83
+ ifs.open(inputPath + "updatable_set_from_java.sk", std::ios::binary);
84
+ sk = hll_sketch::deserialize(ifs);
85
+ REQUIRE(sk.is_empty() == false);
86
+ REQUIRE(sk.get_lg_config_k() == 8);
87
+ REQUIRE(sk.get_lower_bound(1) == 24.0);
88
+ REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
89
+ REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));
90
+ ifs.close();
91
+
92
+
93
+ ifs.open(inputPath + "array6_from_java.sk", std::ios::binary);
94
+ sk = hll_sketch::deserialize(ifs);
95
+ REQUIRE(sk.is_empty() == false);
96
+ REQUIRE(sk.get_lg_config_k() == 8);
97
+ REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
98
+ REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
99
+ REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
100
+ ifs.close();
101
+
102
+
103
+ ifs.open(inputPath + "compact_array4_from_java.sk", std::ios::binary);
104
+ sk = hll_sketch::deserialize(ifs);
105
+ REQUIRE(sk.is_empty() == false);
106
+ REQUIRE(sk.get_lg_config_k() == 8);
107
+ REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
108
+ REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
109
+ REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
110
+
111
+ ifs.close();
112
+
113
+
114
+ ifs.open(inputPath + "updatable_array4_from_java.sk", std::ios::binary);
115
+ sk = hll_sketch::deserialize(ifs);
116
+ REQUIRE(sk.is_empty() == false);
117
+ REQUIRE(sk.get_lg_config_k() == 8);
118
+ REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
119
+ REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
120
+ REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
121
+ ifs.close();
122
+ }
123
+
124
+ static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
125
+ REQUIRE(sk1.get_lg_config_k() == sk2.get_lg_config_k());
126
+ REQUIRE(sk1.get_lower_bound(1) == sk2.get_lower_bound(1));
127
+ REQUIRE(sk1.get_estimate() == sk2.get_estimate());
128
+ REQUIRE(sk1.get_upper_bound(1) == sk2.get_upper_bound(1));
129
+ REQUIRE(sk1.get_target_type() == sk2.get_target_type());
130
+ }
131
+
132
+ static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const int n) {
133
+ hll_sketch src(lgConfigK, tgtHllType);
134
+ for (int i = 0; i < n; ++i) {
135
+ src.update(i);
136
+ }
137
+
138
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
139
+ src.serialize_compact(ss);
140
+ hll_sketch dst = hll_sketch::deserialize(ss);
141
+ checkSketchEquality(src, dst);
142
+
143
+ auto bytes1 = src.serialize_compact();
144
+ dst = hll_sketch::deserialize(bytes1.data(), bytes1.size());
145
+ checkSketchEquality(src, dst);
146
+
147
+ ss.clear();
148
+ src.serialize_updatable(ss);
149
+ dst = hll_sketch::deserialize(ss);
150
+ checkSketchEquality(src, dst);
151
+
152
+ auto bytes2 = src.serialize_updatable();
153
+ dst = hll_sketch::deserialize(bytes2.data(), bytes2.size());
154
+ checkSketchEquality(src, dst);
155
+ }
156
+
157
+ TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
158
+ for (int i = 0; i < 10; ++i) {
159
+ int n = nArr[i];
160
+ for (int lgK = 4; lgK <= 13; ++lgK) {
161
+ toFrom(lgK, HLL_4, n);
162
+ toFrom(lgK, HLL_6, n);
163
+ toFrom(lgK, HLL_8, n);
164
+ }
165
+ }
166
+ }
167
+
168
+ } /* namespace datasketches */
@@ -0,0 +1,58 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
# Header-only KLL sketch library, modelled as an INTERFACE target.
add_library(kll INTERFACE)

# Project-namespaced alias for the same target.
add_library(${PROJECT_NAME}::KLL ALIAS kll)

# Unit tests are built only on request.
if (BUILD_TESTS)
  add_subdirectory(test)
endif()

# Consumers see the source-tree include dir at build time and the installed
# include dir afterwards.
target_include_directories(kll
  INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)

target_link_libraries(kll INTERFACE common)
target_compile_features(kll INTERFACE cxx_std_11)

# Headers to install, relative to this directory.
set(kll_HEADERS
  "include/kll_sketch.hpp"
  "include/kll_sketch_impl.hpp"
  "include/kll_helper.hpp"
  "include/kll_helper_impl.hpp"
  "include/kll_quantile_calculator.hpp"
  "include/kll_quantile_calculator_impl.hpp"
)

install(TARGETS kll
  EXPORT ${PROJECT_NAME}
)

install(FILES ${kll_HEADERS}
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")

# The same headers, as absolute paths, attached to the target as interface
# sources.
target_sources(kll
  INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
)
@@ -0,0 +1,150 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_HELPER_HPP_
21
+ #define KLL_HELPER_HPP_
22
+
23
+ #include <random>
24
+ #include <stdexcept>
25
+ #include <chrono>
26
+
27
+ namespace datasketches {
28
+
29
// Source of single random bits: every call to random_bit() yields 0 or 1.
// The engine is seeded once, during static initialization, from the current
// system_clock tick count. Because it is `static` at namespace scope in a header,
// each translation unit that includes this file gets its own engine.
static std::independent_bits_engine<std::mt19937, 1, uint32_t> random_bit(
    static_cast<uint32_t>(std::chrono::system_clock::now().time_since_epoch().count()));

#ifdef KLL_VALIDATION
// Defined elsewhere; consumed and toggled by kll_helper::deterministic_offset().
extern uint32_t kll_next_offset;
#endif

// powers_of_three[p] == 3^p for 0 <= p <= 30
static const uint64_t powers_of_three[] = {
  1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683,
  59049, 177147, 531441, 1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467,
  3486784401, 10460353203, 31381059609, 94143178827, 282429536481,
  847288609443, 2541865828329, 7625597484987, 22876792454961, 68630377364883,
  205891132094649
};
41
+
42
// Collection of static helper routines shared by the KLL sketch implementation.
class kll_helper {
  public:
    // Parity tests on an unsigned value.
    static inline bool is_even(uint32_t value);
    static inline bool is_odd(uint32_t value);

    // Floor of log2(numer / denom); yields 0 whenever denom > numer.
    static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);

    // Upper bound on the number of levels for n items:
    // 1 when n == 0, otherwise 1 + floor_of_log2_of_fraction(n, 1).
    static inline uint8_t ub_on_num_levels(uint64_t n);

    // Sum of level_capacity() over heights 0 .. num_levels - 1.
    static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);

    // Capacity of the level at the given height; never smaller than min_wid.
    // Throws std::invalid_argument when height >= numLevels.
    static inline uint32_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);

    // Capacity helpers: k scaled by (2/3)^depth with rounding.
    // int_cap_aux accepts depth <= 60 (applying the scaling in two steps above 30),
    // int_cap_aux_aux accepts depth <= 30; both throw std::invalid_argument otherwise.
    static inline uint32_t int_cap_aux(uint16_t k, uint8_t depth);
    static inline uint32_t int_cap_aux_aux(uint16_t k, uint8_t depth);

    // Total weight represented by the levels, where level i spans
    // levels[i] .. levels[i + 1] and each of its items counts 2^i.
    static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);

    /*
     * Floating point flavour.
     * Walks the array in order and throws std::invalid_argument as soon as it meets
     * a NaN, or a value that does not compare (via C) strictly before its successor.
     */
    template <typename T, typename C>
    static typename std::enable_if<std::is_floating_point<T>::value, void>::type
    validate_values(const T* values, uint32_t size) {
      for (uint32_t i = 0; i < size; ++i) {
        if (std::isnan(values[i])) {
          throw std::invalid_argument("Values must not be NaN");
        }
        if (i + 1 == size) break; // the last value has no successor to compare with
        if (!(C()(values[i], values[i + 1]))) {
          throw std::invalid_argument("Values must be unique and monotonically increasing");
        }
      }
    }

    /*
     * Flavour for every other type.
     * Throws std::invalid_argument unless each value compares (via C) strictly
     * before its successor.
     */
    template <typename T, typename C>
    static typename std::enable_if<!std::is_floating_point<T>::value, void>::type
    validate_values(const T* values, uint32_t size) {
      for (uint32_t next = 1; next < size; ++next) {
        if (!(C()(values[next - 1], values[next]))) {
          throw std::invalid_argument("Values must be unique and monotonically increasing");
        }
      }
    }

    // Keeps every other item of buf[start, start + length), picking the even or odd
    // positions by a random bit; the survivors end up in the lower half of the range.
    // Throws std::invalid_argument if length is odd.
    template <typename T>
    static void randomly_halve_down(T* buf, uint32_t start, uint32_t length);

    // Same selection as randomly_halve_down, but the survivors end up in the
    // upper half of the range.
    template <typename T>
    static void randomly_halve_up(T* buf, uint32_t start, uint32_t length);

    // Merges two sorted runs that live in the same buffer into a run starting at start_c.
    // The destination slots must already hold initialized objects (move assignment is used),
    // and the moved-from originals are not destroyed.
    template <typename T, typename C>
    static void merge_sorted_arrays(T* buf, uint32_t start_a, uint32_t len_a, uint32_t start_b, uint32_t len_b, uint32_t start_c);

    // Merges sorted runs from two separate buffers into a third one.
    // Destination objects are constructed in place; each comes either from a buf_a element
    // (whose original is then destroyed) or from a copy of a buf_b element.
    template <typename T, typename C>
    static void merge_sorted_arrays(const T* buf_a, uint32_t start_a, uint32_t len_a, const T* buf_b, uint32_t start_b, uint32_t len_b, T* buf_c, uint32_t start_c);

    // Outcome of general_compress().
    struct compress_result {
      uint8_t final_num_levels;
      uint32_t final_capacity;
      uint32_t final_num_items;
    };

    /*
     * Processes the levels from the bottom up:
     *   - a level that does not need compaction is simply moved over;
     *   - a level that does need compaction first hands over zero or one item, then
     *       - if the level above is empty, it is halved up;
     *       - otherwise it is halved down and merged into the level above;
     *     and the boundaries of the level above are adjusted.
     *
     * It can be proved that general_compress returns a sketch that satisfies the space
     * constraints no matter how much data is passed in.
     * All levels except level zero must be sorted on entry and remain sorted on exit.
     * Level zero need not be sorted on entry and may be unsorted on exit.
     */
    template <typename T, typename C>
    static compress_result general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
        uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted);

    // Copy-constructs src[src_first, src_last) into dst, starting at dst[dst_first].
    template<typename T>
    static void copy_construct(const T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first);

    // Move-constructs src[src_first, src_last) into dst, starting at dst[dst_first];
    // when destroy is true each source object is destroyed after being moved from.
    template<typename T>
    static void move_construct(T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first, bool destroy);

#ifdef KLL_VALIDATION
  private:

    // Replaces the random bit with a reproducible one in validation builds.
    static inline uint32_t deterministic_offset();
#endif

};
145
+
146
+ } /* namespace datasketches */
147
+
148
+ #include "kll_helper_impl.hpp"
149
+
150
+ #endif // KLL_HELPER_HPP_
@@ -0,0 +1,319 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KLL_HELPER_IMPL_HPP_
21
+ #define KLL_HELPER_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+
25
+ namespace datasketches {
26
+
27
+ bool kll_helper::is_even(uint32_t value) {
28
+ return (value & 1) == 0;
29
+ }
30
+
31
+ bool kll_helper::is_odd(uint32_t value) {
32
+ return (value & 1) > 0;
33
+ }
34
+
35
+ uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) {
36
+ if (denom > numer) return 0;
37
+ uint8_t count = 0;
38
+ while (true) {
39
+ denom <<= 1;
40
+ if (denom > numer) return count;
41
+ count++;
42
+ }
43
+ }
44
+
45
+ uint8_t kll_helper::ub_on_num_levels(uint64_t n) {
46
+ if (n == 0) return 1;
47
+ return 1 + floor_of_log2_of_fraction(n, 1);
48
+ }
49
+
50
+ uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels) {
51
+ uint32_t total = 0;
52
+ for (uint8_t h = 0; h < num_levels; h++) {
53
+ total += level_capacity(k, num_levels, h, m);
54
+ }
55
+ return total;
56
+ }
57
+
58
+ uint32_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
59
+ if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
60
+ const uint8_t depth = numLevels - height - 1;
61
+ return std::max((uint32_t) min_wid, int_cap_aux(k, depth));
62
+ }
63
+
64
+ uint32_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
65
+ if (depth > 60) throw std::invalid_argument("depth > 60");
66
+ if (depth <= 30) return int_cap_aux_aux(k, depth);
67
+ const uint8_t half = depth / 2;
68
+ const uint8_t rest = depth - half;
69
+ const uint32_t tmp = int_cap_aux_aux(k, half);
70
+ return int_cap_aux_aux(tmp, rest);
71
+ }
72
+
73
+ uint32_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
74
+ if (depth > 30) throw std::invalid_argument("depth > 30");
75
+ const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
76
+ const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
77
+ const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
78
+ if (result > k) throw std::logic_error("result > k");
79
+ return result;
80
+ }
81
+
82
+ uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
83
+ uint64_t total = 0;
84
+ uint64_t weight = 1;
85
+ for (uint8_t lvl = 0; lvl < num_levels; lvl++) {
86
+ total += weight * (levels[lvl + 1] - levels[lvl]);
87
+ weight *= 2;
88
+ }
89
+ return total;
90
+ }
91
+
92
+ template <typename T>
93
+ void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
94
+ if (!is_even(length)) throw std::invalid_argument("length must be even");
95
+ const uint32_t half_length = length / 2;
96
+ #ifdef KLL_VALIDATION
97
+ const uint32_t offset = deterministic_offset();
98
+ #else
99
+ const uint32_t offset = random_bit();
100
+ #endif
101
+ uint32_t j = start + offset;
102
+ for (uint32_t i = start; i < (start + half_length); i++) {
103
+ if (i != j) buf[i] = std::move(buf[j]);
104
+ j += 2;
105
+ }
106
+ }
107
+
108
+ template <typename T>
109
+ void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
110
+ if (!is_even(length)) throw std::invalid_argument("length must be even");
111
+ const uint32_t half_length = length / 2;
112
+ #ifdef KLL_VALIDATION
113
+ const uint32_t offset = deterministic_offset();
114
+ #else
115
+ const uint32_t offset = random_bit();
116
+ #endif
117
+ uint32_t j = (start + length) - 1 - offset;
118
+ for (uint32_t i = (start + length) - 1; i >= (start + half_length); i--) {
119
+ if (i != j) buf[i] = std::move(buf[j]);
120
+ j -= 2;
121
+ }
122
+ }
123
+
124
+ // this version moves objects within the same buffer
125
+ // assumes that destination has initialized objects
126
+ // does not destroy the originals after the move
127
+ template <typename T, typename C>
128
+ void kll_helper::merge_sorted_arrays(T* buf, uint32_t start_a, uint32_t len_a, uint32_t start_b, uint32_t len_b, uint32_t start_c) {
129
+ const uint32_t len_c = len_a + len_b;
130
+ const uint32_t lim_a = start_a + len_a;
131
+ const uint32_t lim_b = start_b + len_b;
132
+ const uint32_t lim_c = start_c + len_c;
133
+
134
+ uint32_t a = start_a;
135
+ uint32_t b = start_b;
136
+
137
+ for (uint32_t c = start_c; c < lim_c; c++) {
138
+ if (a == lim_a) {
139
+ if (b != c) buf[c] = std::move(buf[b]);
140
+ b++;
141
+ } else if (b == lim_b) {
142
+ if (a != c) buf[c] = std::move(buf[a]);
143
+ a++;
144
+ } else if (C()(buf[a], buf[b])) {
145
+ if (a != c) buf[c] = std::move(buf[a]);
146
+ a++;
147
+ } else {
148
+ if (b != c) buf[c] = std::move(buf[b]);
149
+ b++;
150
+ }
151
+ }
152
+ if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
153
+ }
154
+
155
+ // this version is to merge from two different buffers into a third buffer
156
+ // initializes objects is the destination buffer
157
+ // moves objects from buf_a and destroys the originals
158
+ // copies objects from buf_b
159
+ template <typename T, typename C>
160
+ void kll_helper::merge_sorted_arrays(const T* buf_a, uint32_t start_a, uint32_t len_a, const T* buf_b, uint32_t start_b, uint32_t len_b, T* buf_c, uint32_t start_c) {
161
+ const uint32_t len_c = len_a + len_b;
162
+ const uint32_t lim_a = start_a + len_a;
163
+ const uint32_t lim_b = start_b + len_b;
164
+ const uint32_t lim_c = start_c + len_c;
165
+
166
+ uint32_t a = start_a;
167
+ uint32_t b = start_b;
168
+
169
+ for (uint32_t c = start_c; c < lim_c; c++) {
170
+ if (a == lim_a) {
171
+ new (&buf_c[c]) T(buf_b[b++]);
172
+ } else if (b == lim_b) {
173
+ new (&buf_c[c]) T(std::move(buf_a[a]));
174
+ buf_a[a++].~T();
175
+ } else if (C()(buf_a[a], buf_b[b])) {
176
+ new (&buf_c[c]) T(std::move(buf_a[a]));
177
+ buf_a[a++].~T();
178
+ } else {
179
+ new (&buf_c[c]) T(buf_b[b++]);
180
+ }
181
+ }
182
+ if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
183
+ }
184
+
185
+ /*
186
+ * Here is what we do for each level:
187
+ * If it does not need to be compacted, then simply copy it over.
188
+ *
189
+ * Otherwise, it does need to be compacted, so...
190
+ * Copy zero or one guy over.
191
+ * If the level above is empty, halve up.
192
+ * Else the level above is nonempty, so...
193
+ * halve down, then merge up.
194
+ * Adjust the boundaries of the level above.
195
+ *
196
+ * It can be proved that general_compress returns a sketch that satisfies the space constraints
197
+ * no matter how much data is passed in.
198
+ * All levels except for level zero must be sorted before calling this, and will still be
199
+ * sorted afterwards.
200
+ * Level zero is not required to be sorted before, and may not be sorted afterwards.
201
+ */
202
+ template <typename T, typename C>
203
+ kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
204
+ uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted)
205
+ {
206
+ if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed
207
+ const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0];
208
+ uint8_t current_num_levels = num_levels_in;
209
+ uint32_t current_item_count = starting_item_count; // decreases with each compaction
210
+ uint32_t target_item_count = compute_total_capacity(k, m, current_num_levels); // increases if we add levels
211
+ bool done_yet = false;
212
+ out_levels[0] = 0;
213
+ uint8_t current_level = 0;
214
+ while (!done_yet) {
215
+
216
+ // If we are at the current top level, add an empty level above it for convenience,
217
+ // but do not increment num_levels until later
218
+ if (current_level == (current_num_levels - 1)) {
219
+ in_levels[current_level + 2] = in_levels[current_level + 1];
220
+ }
221
+
222
+ const auto raw_beg = in_levels[current_level];
223
+ const auto raw_lim = in_levels[current_level + 1];
224
+ const auto raw_pop = raw_lim - raw_beg;
225
+
226
+ if ((current_item_count < target_item_count) || (raw_pop < level_capacity(k, current_num_levels, current_level, m))) {
227
+ // move level over as is
228
+ // make sure we are not moving data upwards
229
+ if (raw_beg < out_levels[current_level]) throw std::logic_error("wrong move");
230
+ std::move(&items[raw_beg], &items[raw_lim], &items[out_levels[current_level]]);
231
+ out_levels[current_level + 1] = out_levels[current_level] + raw_pop;
232
+ } else {
233
+ // The sketch is too full AND this level is too full, so we compact it
234
+ // Note: this can add a level and thus change the sketches capacities
235
+
236
+ const auto pop_above = in_levels[current_level + 2] - raw_lim;
237
+ const bool odd_pop = is_odd(raw_pop);
238
+ const auto adj_beg = odd_pop ? 1 + raw_beg : raw_beg;
239
+ const auto adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
240
+ const auto half_adj_pop = adj_pop / 2;
241
+
242
+ if (odd_pop) { // move one guy over
243
+ items[out_levels[current_level]] = std::move(items[raw_beg]);
244
+ out_levels[current_level + 1] = out_levels[current_level] + 1;
245
+ } else { // even number of items
246
+ out_levels[current_level + 1] = out_levels[current_level];
247
+ }
248
+
249
+ // level zero might not be sorted, so we must sort it if we wish to compact it
250
+ if ((current_level == 0) && !is_level_zero_sorted) {
251
+ std::sort(&items[adj_beg], &items[adj_beg + adj_pop], C());
252
+ }
253
+
254
+ if (pop_above == 0) { // Level above is empty, so halve up
255
+ randomly_halve_up(items, adj_beg, adj_pop);
256
+ } else { // Level above is nonempty, so halve down, then merge up
257
+ randomly_halve_down(items, adj_beg, adj_pop);
258
+ merge_sorted_arrays<T, C>(items, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
259
+ }
260
+
261
+ // track the fact that we just eliminated some data
262
+ current_item_count -= half_adj_pop;
263
+
264
+ // adjust the boundaries of the level above
265
+ in_levels[current_level + 1] = in_levels[current_level + 1] - half_adj_pop;
266
+
267
+ // increment num_levels if we just compacted the old top level
268
+ // this creates some more capacity (the size of the new bottom level)
269
+ if (current_level == (current_num_levels - 1)) {
270
+ current_num_levels++;
271
+ target_item_count += level_capacity(k, current_num_levels, 0, m);
272
+ }
273
+
274
+ } // end of code for compacting a level
275
+
276
+ // determine whether we have processed all levels yet (including any new levels that we created)
277
+
278
+ if (current_level == (current_num_levels - 1)) done_yet = true;
279
+ current_level++;
280
+ } // end of loop over levels
281
+
282
+ if ((out_levels[current_num_levels] - out_levels[0]) != current_item_count) throw std::logic_error("inconsistent state");
283
+
284
+ for (uint32_t i = current_item_count; i < starting_item_count; i++) items[i].~T();
285
+
286
+ compress_result result;
287
+ result.final_num_levels = current_num_levels;
288
+ result.final_capacity = target_item_count;
289
+ result.final_num_items = current_item_count;
290
+ return result;
291
+ }
292
+
293
+ template<typename T>
294
+ void kll_helper::copy_construct(const T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first) {
295
+ while (src_first != src_last) {
296
+ new (&dst[dst_first++]) T(src[src_first++]);
297
+ }
298
+ }
299
+
300
+ template<typename T>
301
+ void kll_helper::move_construct(T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first, bool destroy) {
302
+ while (src_first != src_last) {
303
+ new (&dst[dst_first++]) T(std::move(src[src_first]));
304
+ if (destroy) src[src_first].~T();
305
+ src_first++;
306
+ }
307
+ }
308
+
309
#ifdef KLL_VALIDATION
// Validation-only stand-in for the random bit: hands out the current value of
// kll_next_offset and replaces it with 1 minus that value for the next call.
uint32_t kll_helper::deterministic_offset() {
  const uint32_t current = kll_next_offset;
  kll_next_offset = 1 - current;
  return current;
}
#endif
316
+
317
+ } /* namespace datasketches */
318
+
319
+ #endif // KLL_HELPER_IMPL_HPP_