datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,274 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ // author Kevin Lang, Oath Research
21
+
22
+ #ifndef ICON_ESTIMATOR_HPP_
23
+ #define ICON_ESTIMATOR_HPP_
24
+
25
+ #include <cmath>
26
+ #include <cstdint>
27
+ #include <stdexcept>
28
+
29
+ namespace datasketches {
30
+
31
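+ // The ICON estimator for FM85 sketches is defined in Kevin Lang's arXiv paper "Back to the Future: an Even More Nearly Optimal Cardinality Estimation Algorithm" (arXiv:1708.06839).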
32
+
33
+ // The current file provides exact and approximate implementations of this estimator.
34
+
35
+ // The exact version works for any value of K, but is quite slow.
36
+
37
+ // The much faster approximate version works for K values that are powers of two
38
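+ // ranging from 2^4 to 2^32; by default the table below stops at 2^26 (ICON_MAX_LOG_K), and rows for larger K are compiled only when LARGER_K_VALUES is defined.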
39
+
40
+ // At a high level, this approximation can be described as using an
41
+ // exponential approximation when C > K * (5.6 or 5.7), while smaller
42
+ // values of C are handled by a degree-19 polynomial approximation of
43
+ // a pre-conditioned version of the true ICON mapping from C to N_hat.
44
+
45
+ // This file also provides a validation procedure that compares its approximate
46
+ // and exact implementations of the FM85 ICON estimator.
47
+
48
+ static const int ICON_MIN_LOG_K = 4; // smallest log K that has a row in the coefficient table below
49
+ static const int ICON_MAX_LOG_K = 26; // largest log K in the table when LARGER_K_VALUES is not defined
50
+ static const int ICON_POLYNOMIAL_DEGREE = 19; // degree of the polynomial stored for each log K
51
+ static const int ICON_POLYNOMIAL_NUM_COEFFICIENTS = 1 + ICON_POLYNOMIAL_DEGREE; // a degree-d polynomial has d + 1 coefficients (20 here)
52
+ static const int ICON_TABLE_SIZE = ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K)); // one row of coefficients per log K in [MIN, MAX]: 20 * 23 = 460 doubles
53
+
54
+ static const double ICON_POLYNOMIAL_COEFFICIENTS[ICON_TABLE_SIZE] = {
55
+
56
+ // log K = 4
57
+ 0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817,
58
+ 1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904,
59
+ -30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998,
60
+ 0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05,
61
+
62
+ // log K = 5
63
+ 0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854,
64
+ 1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069,
65
+ -28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962,
66
+ 0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05,
67
+
68
+ // log K = 6
69
+ 0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116,
70
+ 1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077,
71
+ -28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851,
72
+ 0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05,
73
+
74
+ // log K = 7
75
+ 0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863,
76
+ 1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936,
77
+ -28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037,
78
+ 0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05,
79
+
80
+ // log K = 8
81
+ 0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681,
82
+ 0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002,
83
+ -15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996,
84
+ 0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05,
85
+
86
+ // log K = 9
87
+ 0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214,
88
+ 0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059,
89
+ -15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002,
90
+ 0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05,
91
+
92
+ // log K = 10
93
+ 0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428,
94
+ 0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034,
95
+ -15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093,
96
+ 0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05,
97
+
98
+ // log K = 11
99
+ 0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211,
100
+ 0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958,
101
+ -14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051,
102
+ 0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05,
103
+
104
+ // log K = 12
105
+ 0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343,
106
+ 0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078,
107
+ -17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076,
108
+ 0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05,
109
+
110
+ // log K = 13
111
+ 0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548,
112
+ 0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962,
113
+ -17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975,
114
+ 0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05,
115
+
116
+ // log K = 14
117
+ 0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777,
118
+ 0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022,
119
+ -17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993,
120
+ 0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05,
121
+
122
+ // log K = 15
123
+ 0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128,
124
+ 0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048,
125
+ -18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953,
126
+ 0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05,
127
+
128
+ // log K = 16
129
+ 0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592,
130
+ 0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041,
131
+ -18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102,
132
+ 0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05,
133
+
134
+ // log K = 17
135
+ 0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043,
136
+ 0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058,
137
+ -18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063,
138
+ 0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05,
139
+
140
+ // log K = 18
141
+ 0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003,
142
+ 0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132,
143
+ -18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104,
144
+ 0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05,
145
+
146
+ // log K = 19
147
+ 0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954,
148
+ 0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001,
149
+ -16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915,
150
+ 0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05,
151
+
152
+ // log K = 20
153
+ 0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951,
154
+ 0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011,
155
+ -15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906,
156
+ 0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05,
157
+
158
+ // log K = 21
159
+ 0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961,
160
+ 0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888,
161
+ -19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987,
162
+ 0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05,
163
+
164
+ // log K = 22
165
+ 0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969,
166
+ 0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099,
167
+ -18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093,
168
+ 0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05,
169
+
170
+ // log K = 23
171
+ 0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059,
172
+ 0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843,
173
+ -18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987,
174
+ 0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05,
175
+
176
+ // log K = 24
177
+ 0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152,
178
+ 0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951,
179
+ -17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056,
180
+ 0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05,
181
+
182
+ // log K = 25
183
+ 0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451,
184
+ 0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923,
185
+ -17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892,
186
+ 0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05,
187
+
188
+ // log K = 26
189
+ 0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949,
190
+ 0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154,
191
+ -18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955,
192
+ 0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05,
193
+
194
+ #ifdef LARGER_K_VALUES
195
+ // log K = 27
196
+ 1.000000000639100106, 0.3333378987508219815, 0.126670943746902992, -0.06418811974745139426, -0.0972951198506895043,
197
+ 0.4687977077401049852, -1.945290489888900076, 5.499494964974400268, -11.05078190574979935, 16.3446428009706004,
198
+ -18.10936908931320133, 15.19089294103859977, -9.691829972777059155, 4.694320543263319934, -1.710719212277360013,
199
+ 0.4606257962161550146, -0.08875858006645380438, 0.01156634964444109952, -0.0009125838337464230437, 3.290907977404550287e-05,
200
+
201
+ // log K = 28
202
+ 0.9999999993590269476, 0.3333385660745579737, 0.1266394134278630013, -0.0636305053404186971, -0.1022354305220320031,
203
+ 0.4945787360853979853, -2.032468917547570086, 5.702461924065530319, -11.38943406618639997, 16.76052144140630062,
204
+ -18.49169753114890113, 15.4564578116809006, -9.831507534599410292, 4.749667961030789698, -1.72701519749717991,
205
+ 0.4640997252013580043, -0.08927103511252110213, 0.01161455495023329919, -0.000915030036039231982, 3.295110296010450275e-05,
206
+
207
+ // log K = 29
208
+ 0.9999999998441060356, 0.3333383341194189886, 0.1266687338487519909, -0.06416245828383730643, -0.09764561286937140094,
209
+ 0.4715274747139350242, -1.958172229464169911, 5.539587632966780362, -11.13784217611559946, 16.48149277721759987,
210
+ -18.26888916646990069, 15.33085193018819936, -9.78493991484172021, 4.741302923579859829, -1.728568959451310061,
211
+ 0.4656457646521020011, -0.08977142058582450457, 0.01170492245846839995, -0.0009240931538567209464, 3.334703207098030245e-05,
212
+
213
+ // log K = 30
214
+ 0.9999999992599339915, 0.3333384538468979752, 0.1266452025739940035, -0.06374775920488300052, -0.1009917742909720029,
215
+ 0.4867931642504759737, -2.000981224888669807, 5.614968747087539569, -11.21527907219130071, 16.50500949673639894,
216
+ -18.21007853829650003, 15.22056128176249956, -9.680565515478869898, 4.675983737170599674, -1.69980511941418011,
217
+ 0.4566332138743600111, -0.08779650251621799739, 0.01141656381272189956, -0.0008988545845624889468, 3.234448025291899689e-05,
218
+
219
+ // log K = 31
220
+ 0.9999999973204000137, 0.333337762450663988, 0.1266965469104399944, -0.06475154253624139378, -0.09133098208494490333,
221
+ 0.4320356889637699815, -1.799236887220760028, 5.100971076171499696, -10.27175516606700079, 15.22198757843720074,
222
+ -16.89368636262300072, 14.19016571851859965, -9.062390133299189188, 4.39220025249522994, -1.600994848692480099,
223
+ 0.4310075283759189912, -0.08300339267288289746, 0.01080584419810979961, -0.0008514267355136160122, 3.065110087496039805e-05,
224
+
225
+ // log K = 32
226
+ 0.9999999987706390536, 0.3333387038350890119, 0.1266354589419070031, -0.06355195838981600454, -0.102952771506954005,
227
+ 0.4983589546197609854, -2.045281215270029929, 5.732181222451769642, -11.43849817800069957, 16.81961198331340057,
228
+ -18.54433120118400069, 15.49126422718470053, -9.84846998787154071, 4.755615082534379923, -1.728430514092559989,
229
+ 0.4642927653670489985, -0.08927380119154580684, 0.01161055316485629964, -0.0009143724787632470305, 3.291492066818770055e-05,
230
+
231
+ #endif
232
+ };
233
+
234
// Evaluates a polynomial at x with Horner's rule.
// The polynomial is described by `num` consecutive entries of `coefficients`
// beginning at index `start`, stored from the constant term upwards
// (coefficients[start] multiplies x^0, coefficients[start + num - 1] multiplies x^(num - 1)).
// The caller must pass num >= 1 and a range that lies inside the array.
static double evaluate_polynomial(const double* coefficients, int start, int num, double x) {
  int idx = start + num - 1; // index of the highest-order coefficient
  double acc = coefficients[idx];
  // fold in the remaining coefficients from high order down to the constant term
  while (idx > start) {
    --idx;
    acc *= x;
    acc += coefficients[idx];
  }
  return acc;
}
243
+
244
// Exponential approximation of the ICON estimate: 0.7940236163830469 * k * 2^(c / k).
// compute_icon_estimate() switches to this form once c exceeds its threshold multiple of k.
static double icon_exponential_approximation(double k, double c) {
  const double scaled_k = 0.7940236163830469 * k;
  const double growth = pow(2.0, c / k);
  return scaled_k * growth;
}
247
+
248
+ static double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
249
+ if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
250
+ if (c < 2) return ((c == 0) ? 0.0 : 1.0);
251
+ const size_t k = 1 << lg_k;
252
+ const double double_k = k;
253
+ const double double_c = c;
254
+ // Differing thresholds ensure that the approximated estimator is monotonically increasing.
255
+ const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
256
+ if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
257
+ const double factor = evaluate_polynomial(
258
+ ICON_POLYNOMIAL_COEFFICIENTS,
259
+ ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
260
+ ICON_POLYNOMIAL_NUM_COEFFICIENTS,
261
+ // The somewhat arbitrary constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
262
+ double_c / (2.0 * double_k)
263
+ );
264
+ const double ratio = double_c / double_k;
265
+ // The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
266
+ const double term = 1.0 + (ratio * ratio * ratio / 66.774757);
267
+ const double result = double_c * factor * term;
268
+ if (result >= double_c) return result;
269
+ else return double_c;
270
+ }
271
+
272
+ } /* namespace datasketches */
273
+
274
+ #endif
@@ -0,0 +1,81 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef KXP_BYTE_LOOKUP_HPP_
21
+ #define KXP_BYTE_LOOKUP_HPP_
22
+
23
+ namespace datasketches {
24
+
25
+ // the table was created by the following procedure:
26
+
27
+ //void fill_kxp_byte_table() {
28
+ // for (int byte = 0; byte < 256; byte++) {
29
+ // double sum = 0.0;
30
+ // for (int col = 0; col < 8; col++) {
31
+ // const uint8_t bit = (byte >> col) & 1;
32
+ // if (bit == 0) { // note the inverted logic
33
+ // sum += INVERSE_POWERS_OF_2[col + 1]; // note the "+1"
34
+ // }
35
+ // }
36
+ // kxp_byte_table[byte] = sum;
37
+ //
38
+ // printf("%.17g", kxp_byte_table[byte]);
39
+ // if (byte != 255) printf(", ");
40
+ // if ((byte + 1) % 8 == 0) printf("\n");
41
+ // }
42
+ //}
43
+
44
// Lookup table indexed by a byte value.
// Entry b is the sum of 2^-(col + 1) over every bit position col (0 = least significant,
// 7 = most significant) at which b has a ZERO bit. Hence entry 0 is 255/256 and entry 255 is 0.
// Each row below holds 8 consecutive entries; the trailing comment gives the row's byte range.
static const double KXP_BYTE_TABLE[256] = {
  0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375, //   0 -   7
  0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375, //   8 -  15
  0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375, //  16 -  23
  0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375, //  24 -  31
  0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875, //  32 -  39
  0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875, //  40 -  47
  0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875, //  48 -  55
  0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875, //  56 -  63
  0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125, //  64 -  71
  0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125, //  72 -  79
  0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125, //  80 -  87
  0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125, //  88 -  95
  0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625, //  96 - 103
  0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625, // 104 - 111
  0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625, // 112 - 119
  0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625, // 120 - 127
  0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875,         // 128 - 135
  0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875,         // 136 - 143
  0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375,         // 144 - 151
  0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375,         // 152 - 159
  0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625,         // 160 - 167
  0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625,         // 168 - 175
  0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125,         // 176 - 183
  0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125,         // 184 - 191
  0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375,                 // 192 - 199
  0.921875, 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875,                 // 200 - 207
  0.953125, 0.453125, 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125,                 // 208 - 215
  0.890625, 0.390625, 0.640625, 0.140625, 0.765625, 0.265625, 0.515625, 0.015625,                 // 216 - 223
  0.96875, 0.46875, 0.71875, 0.21875, 0.84375, 0.34375, 0.59375, 0.09375,                         // 224 - 231
  0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125, 0.03125,                         // 232 - 239
  0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625,                                 // 240 - 247
  0.875, 0.375, 0.625, 0.125, 0.75, 0.25, 0.5, 0.0                                                // 248 - 255
};
78
+
79
+ } /* namespace datasketches */
80
+
81
+ #endif
@@ -0,0 +1,84 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ // author Kevin Lang, Oath Research
21
+
22
+ #ifndef U32_TABLE_HPP_
23
+ #define U32_TABLE_HPP_
24
+
25
+ // This is a highly specialized hash table that was designed
26
+ // to be a part of the library's CPC sketch implementation
27
+
28
+ #include "cpc_common.hpp"
29
+
30
+ namespace datasketches {
31
+
32
// Growth threshold as a fraction NUMER / DENOM = 3/4:
// maybe_insert() doubles the table once num_items exceeds this share of the slots.
static const uint64_t U32_TABLE_UPSIZE_NUMER = 3;
static const uint64_t U32_TABLE_UPSIZE_DENOM = 4;

// Shrink threshold as a fraction NUMER / DENOM = 1/4:
// maybe_delete() halves the table once num_items drops below this share of the slots.
static const uint64_t U32_TABLE_DOWNSIZE_NUMER = 1;
static const uint64_t U32_TABLE_DOWNSIZE_DENOM = 4;
37
+
38
+ template<typename A>
39
+ class u32_table {
40
+ public:
41
+
42
+ u32_table();
43
+ u32_table(uint8_t lg_size, uint8_t num_valid_bits);
44
+
45
+ inline size_t get_num_items() const;
46
+ inline const uint32_t* get_slots() const;
47
+ inline uint8_t get_lg_size() const;
48
+ inline void clear();
49
+
50
+ // returns true iff the item was new and was therefore added to the table
51
+ inline bool maybe_insert(uint32_t item);
52
+ // returns true iff the item was present and was therefore removed from the table
53
+ inline bool maybe_delete(uint32_t item);
54
+
55
+ static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k);
56
+
57
+ vector_u32<A> unwrapping_get_items() const;
58
+
59
+ static void merge(
60
+ const uint32_t* arr_a, size_t start_a, size_t length_a, // input
61
+ const uint32_t* arr_b, size_t start_b, size_t length_b, // input
62
+ uint32_t* arr_c, size_t start_c // output
63
+ );
64
+
65
+ static void introspective_insertion_sort(uint32_t* a, size_t l, size_t r);
66
+ static void knuth_shell_sort3(uint32_t* a, size_t l, size_t r);
67
+
68
+ private:
69
+
70
+ uint8_t lg_size; // log2 of number of slots
71
+ uint8_t num_valid_bits;
72
+ size_t num_items;
73
+ vector_u32<A> slots;
74
+
75
+ inline size_t lookup(uint32_t item) const;
76
+ inline void must_insert(uint32_t item);
77
+ inline void rebuild(uint8_t new_lg_size);
78
+ };
79
+
80
+ } /* namespace datasketches */
81
+
82
+ #include "u32_table_impl.hpp"
83
+
84
+ #endif
@@ -0,0 +1,266 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ // author Kevin Lang, Oath Research
21
+
22
+ #ifndef U32_TABLE_IMPL_HPP_
23
+ #define U32_TABLE_IMPL_HPP_
24
+
25
+ #include <stdexcept>
26
+ #include <algorithm>
27
+ #include <climits>
28
+
29
+ namespace datasketches {
30
+
31
+ template<typename A>
32
+ u32_table<A>::u32_table():
33
+ lg_size(0),
34
+ num_valid_bits(0),
35
+ num_items(0),
36
+ slots()
37
+ {}
38
+
39
+ template<typename A>
40
+ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits):
41
+ lg_size(lg_size),
42
+ num_valid_bits(num_valid_bits),
43
+ num_items(0),
44
+ slots(1 << lg_size, UINT32_MAX)
45
+ {
46
+ if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
47
+ if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
48
+ }
49
+
50
+ template<typename A>
51
+ size_t u32_table<A>::get_num_items() const {
52
+ return num_items;
53
+ }
54
+
55
+ template<typename A>
56
+ const uint32_t* u32_table<A>::get_slots() const {
57
+ return slots.data();
58
+ }
59
+
60
+ template<typename A>
61
+ uint8_t u32_table<A>::get_lg_size() const {
62
+ return lg_size;
63
+ }
64
+
65
+ template<typename A>
66
+ void u32_table<A>::clear() {
67
+ std::fill(slots.begin(), slots.end(), UINT32_MAX);
68
+ num_items = 0;
69
+ }
70
+
71
+ template<typename A>
72
+ bool u32_table<A>::maybe_insert(uint32_t item) {
73
+ const size_t index = lookup(item);
74
+ if (slots[index] == item) return false;
75
+ if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
76
+ slots[index] = item;
77
+ num_items++;
78
+ if (U32_TABLE_UPSIZE_DENOM * num_items > U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) {
79
+ rebuild(lg_size + 1);
80
+ }
81
+ return true;
82
+ }
83
+
84
+ template<typename A>
85
+ bool u32_table<A>::maybe_delete(uint32_t item) {
86
+ const size_t index = lookup(item);
87
+ if (slots[index] == UINT32_MAX) return false;
88
+ if (slots[index] != item) throw std::logic_error("item does not exist");
89
+ if (num_items == 0) throw std::logic_error("delete error");
90
+ // delete the item
91
+ slots[index] = UINT32_MAX;
92
+ num_items--;
93
+
94
+ // re-insert all items between the freed slot and the next empty slot
95
+ const size_t mask = (1 << lg_size) - 1;
96
+ size_t probe = (index + 1) & mask;
97
+ uint32_t fetched = slots[probe];
98
+ while (fetched != UINT32_MAX) {
99
+ slots[probe] = UINT32_MAX;
100
+ must_insert(fetched);
101
+ probe = (probe + 1) & mask;
102
+ fetched = slots[probe];
103
+ }
104
+ // shrink if necessary
105
+ if (U32_TABLE_DOWNSIZE_DENOM * num_items < U32_TABLE_DOWNSIZE_NUMER * (1 << lg_size) && lg_size > 2) {
106
+ rebuild(lg_size - 1);
107
+ }
108
+ return true;
109
+ }
110
+
111
+ // this one is specifically tailored to be a part of fm85 decompression scheme
112
+ template<typename A>
113
+ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k) {
114
+ uint8_t lg_num_slots = 2;
115
+ while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
116
+ u32_table<A> table(lg_num_slots, 6 + lg_k);
117
+ // Note: there is a possible "snowplow effect" here because the caller is passing in a sorted pairs array
118
+ // However, we are starting out with the correct final table size, so the problem might not occur
119
+ for (size_t i = 0; i < num_pairs; i++) {
120
+ table.must_insert(pairs[i]);
121
+ }
122
+ table.num_items = num_pairs;
123
+ return table;
124
+ }
125
+
126
+ template<typename A>
127
+ size_t u32_table<A>::lookup(uint32_t item) const {
128
+ const size_t size = 1 << lg_size;
129
+ const size_t mask = size - 1;
130
+ const uint8_t shift = num_valid_bits - lg_size;
131
+ size_t probe = item >> shift;
132
+ if (probe > mask) throw std::logic_error("probe out of range");
133
+ while (slots[probe] != item && slots[probe] != UINT32_MAX) {
134
+ probe = (probe + 1) & mask;
135
+ }
136
+ return probe;
137
+ }
138
+
139
+ // counts and resizing must be handled by the caller
140
+ template<typename A>
141
+ void u32_table<A>::must_insert(uint32_t item) {
142
+ const size_t index = lookup(item);
143
+ if (slots[index] == item) throw std::logic_error("item exists");
144
+ if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
145
+ slots[index] = item;
146
+ }
147
+
148
+ template<typename A>
149
+ void u32_table<A>::rebuild(uint8_t new_lg_size) {
150
+ if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
151
+ const size_t old_size = 1 << lg_size;
152
+ const size_t new_size = 1 << new_lg_size;
153
+ if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
154
+ vector_u32<A> old_slots = std::move(slots);
155
+ slots = vector_u32<A>(new_size, UINT32_MAX);
156
+ lg_size = new_lg_size;
157
+ for (size_t i = 0; i < old_size; i++) {
158
+ if (old_slots[i] != UINT32_MAX) {
159
+ must_insert(old_slots[i]);
160
+ }
161
+ }
162
+ }
163
+
164
+ // While extracting the items from a linear probing hashtable,
165
+ // this will usually undo the wrap-around provided that the table
166
+ // isn't too full. Experiments suggest that for sufficiently large tables
167
+ // the load factor would have to be over 90 percent before this would fail frequently,
168
+ // and even then the subsequent sort would fix things up.
169
+ // The result is nearly sorted, so make sure to use an efficient sort for that case
170
+ template<typename A>
171
+ vector_u32<A> u32_table<A>::unwrapping_get_items() const {
172
+ if (num_items == 0) return vector_u32<A>();
173
+ const size_t table_size = 1 << lg_size;
174
+ vector_u32<A> result(num_items);
175
+ size_t i = 0;
176
+ size_t l = 0;
177
+ size_t r = num_items - 1;
178
+
179
+ // special rules for the region before the first empty slot
180
+ uint32_t hi_bit = 1 << (num_valid_bits - 1);
181
+ while (i < table_size && slots[i] != UINT32_MAX) {
182
+ const uint32_t item = slots[i++];
183
+ if (item & hi_bit) { result[r--] = item; } // this item was probably wrapped, so move to end
184
+ else { result[l++] = item; }
185
+ }
186
+
187
+ // the rest of the table is processed normally
188
+ while (i < table_size) {
189
+ const uint32_t item = slots[i++];
190
+ if (item != UINT32_MAX) result[l++] = item;
191
+ }
192
+ if (l != r + 1) throw std::logic_error("unwrapping error");
193
+ return result;
194
+ }
195
+
196
+ // This merge is safe to use in carefully designed overlapping scenarios.
197
+ template<typename A>
198
+ void u32_table<A>::merge(
199
+ const uint32_t* arr_a, size_t start_a, size_t length_a, // input
200
+ const uint32_t* arr_b, size_t start_b, size_t length_b, // input
201
+ uint32_t* arr_c, size_t start_c // output
202
+ ) {
203
+ const size_t length_c = length_a + length_b;
204
+ const size_t lim_a = start_a + length_a;
205
+ const size_t lim_b = start_b + length_b;
206
+ const size_t lim_c = start_c + length_c;
207
+ size_t a = start_a;
208
+ size_t b = start_b;
209
+ size_t c = start_c;
210
+ for ( ; c < lim_c ; c++) {
211
+ if (b >= lim_b) { arr_c[c] = arr_a[a++]; }
212
+ else if (a >= lim_a) { arr_c[c] = arr_b[b++]; }
213
+ else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; }
214
+ else { arr_c[c] = arr_b[b++]; }
215
+ }
216
+ if (a != lim_a || b != lim_b) throw std::logic_error("merging error");
217
+ }
218
+
219
+ // In applications where the input array is already nearly sorted,
220
+ // insertion sort runs in linear time with a very small constant.
221
+ // This introspective version of insertion sort protects against
222
+ // the quadratic cost of sorting bad input arrays.
223
+ // It keeps track of how much work has been done, and if that exceeds a
224
+ // constant times the array length, it switches to a different sorting algorithm.
225
+
226
+ template<typename A>
227
+ void u32_table<A>::introspective_insertion_sort(uint32_t* a, size_t l, size_t r) { // r points past the rightmost element
228
+ const size_t length = r - l;
229
+ const size_t cost_limit = 8 * length;
230
+ size_t cost = 0;
231
+ for (size_t i = l + 1; i < r; i++) {
232
+ size_t j = i;
233
+ uint32_t v = a[i];
234
+ while (j >= l + 1 && v < a[j - 1]) {
235
+ a[j] = a[j - 1];
236
+ j--;
237
+ }
238
+ a[j] = v;
239
+ cost += i - j; // distance moved is a measure of work
240
+ if (cost > cost_limit) {
241
+ knuth_shell_sort3(a, l, r);
242
+ return;
243
+ }
244
+ }
245
+ }
246
+
247
+ template<typename A>
248
+ void u32_table<A>::knuth_shell_sort3(uint32_t* a, size_t l, size_t r) {
249
+ size_t h;
250
+ for (h = 1; h < (r - l) / 9; h = 3 * h + 1);
251
+ for ( ; h > 0; h /= 3) {
252
+ for (size_t i = l + h; i < r; i++) {
253
+ size_t j = i;
254
+ const uint32_t v = a[i];
255
+ while (j >= l + h && v < a[j - h]) {
256
+ a[j] = a[j - h];
257
+ j -= h;
258
+ }
259
+ a[j] = v;
260
+ }
261
+ }
262
+ }
263
+
264
+ } /* namespace datasketches */
265
+
266
+ #endif