datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,102 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CPC_UNION_HPP_
21
+ #define CPC_UNION_HPP_
22
+
23
+ #include <string>
24
+
25
+ #include "cpc_sketch.hpp"
26
+ #include "common_defs.hpp"
27
+
28
+ namespace datasketches {
29
+
30
+ /*
31
+ * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union
32
+ *
33
+ * author Kevin Lang
34
+ * author Alexander Saydakov
35
+ */
36
+
37
+ // alias with default allocator for convenience
38
+ typedef cpc_union_alloc<std::allocator<void>> cpc_union;
39
+
40
+ template<typename A>
41
+ class cpc_union_alloc {
42
+ public:
43
+ /**
44
+ * Creates an instance of the union given the lg_k parameter and hash seed.
45
+ * @param lg_k base 2 logarithm of the number of bins in the sketch
46
+ * @param seed for hash function
47
+ */
48
+ explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
49
+
50
+ cpc_union_alloc(const cpc_union_alloc<A>& other);
51
+ cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
52
+ ~cpc_union_alloc();
53
+
54
+ cpc_union_alloc<A>& operator=(const cpc_union_alloc<A>& other);
55
+ cpc_union_alloc<A>& operator=(cpc_union_alloc<A>&& other) noexcept;
56
+
57
+ /**
58
+ * This method is to update the union with a given sketch (lvalue)
59
+ * @param sketch to update the union with
60
+ */
61
+ void update(const cpc_sketch_alloc<A>& sketch);
62
+
63
+ /**
64
+ * This method is to update the union with a given sketch (rvalue)
65
+ * @param sketch to update the union with
66
+ */
67
+ void update(cpc_sketch_alloc<A>&& sketch);
68
+
69
+ /**
70
+ * This method produces a copy of the current state of the union as a sketch.
71
+ * @return the result of the union
72
+ */
73
+ cpc_sketch_alloc<A> get_result() const;
74
+
75
+ private:
76
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> AllocU8;
77
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
78
+ typedef typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>> AllocCpc;
79
+
80
+ uint8_t lg_k;
81
+ uint64_t seed;
82
+ cpc_sketch_alloc<A>* accumulator;
83
+ vector_u64<A> bit_matrix;
84
+
85
+ template<typename S> void internal_update(S&& sketch); // to support both rvalue and lvalue
86
+
87
+ cpc_sketch_alloc<A> get_result_from_accumulator() const;
88
+ cpc_sketch_alloc<A> get_result_from_bit_matrix() const;
89
+
90
+ void switch_to_bit_matrix();
91
+ void walk_table_updating_sketch(const u32_table<A>& table);
92
+ void or_table_into_matrix(const u32_table<A>& table);
93
+ void or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k);
94
+ void or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k);
95
+ void reduce_k(uint8_t new_lg_k);
96
+ };
97
+
98
+ } /* namespace datasketches */
99
+
100
+ #include "cpc_union_impl.hpp"
101
+
102
+ #endif
@@ -0,0 +1,346 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CPC_UNION_IMPL_HPP_
21
+ #define CPC_UNION_IMPL_HPP_
22
+
23
+ #include "count_zeros.hpp"
24
+
25
+ namespace datasketches {
26
+
27
+ template<typename A>
28
+ cpc_union_alloc<A>::cpc_union_alloc(uint8_t lg_k, uint64_t seed):
29
+ lg_k(lg_k),
30
+ seed(seed),
31
+ accumulator(nullptr),
32
+ bit_matrix()
33
+ {
34
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
35
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
36
+ }
37
+ accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed);
38
+ }
39
+
40
+ template<typename A>
41
+ cpc_union_alloc<A>::cpc_union_alloc(const cpc_union_alloc<A>& other):
42
+ lg_k(other.lg_k),
43
+ seed(other.seed),
44
+ accumulator(other.accumulator),
45
+ bit_matrix(other.bit_matrix)
46
+ {
47
+ if (accumulator != nullptr) {
48
+ accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
49
+ }
50
+ }
51
+
52
+ template<typename A>
53
+ cpc_union_alloc<A>::cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept:
54
+ lg_k(other.lg_k),
55
+ seed(other.seed),
56
+ accumulator(other.accumulator),
57
+ bit_matrix(std::move(other.bit_matrix))
58
+ {
59
+ other.accumulator = nullptr;
60
+ }
61
+
62
+ template<typename A>
63
+ cpc_union_alloc<A>::~cpc_union_alloc() {
64
+ if (accumulator != nullptr) {
65
+ accumulator->~cpc_sketch_alloc<A>();
66
+ AllocCpc().deallocate(accumulator, 1);
67
+ }
68
+ }
69
+
70
+ template<typename A>
71
+ cpc_union_alloc<A>& cpc_union_alloc<A>::operator=(const cpc_union_alloc<A>& other) {
72
+ cpc_union_alloc<A> copy(other);
73
+ std::swap(lg_k, copy.lg_k);
74
+ seed = copy.seed;
75
+ std::swap(accumulator, copy.accumulator);
76
+ bit_matrix = std::move(copy.bit_matrix);
77
+ return *this;
78
+ }
79
+
80
+ template<typename A>
81
+ cpc_union_alloc<A>& cpc_union_alloc<A>::operator=(cpc_union_alloc<A>&& other) noexcept {
82
+ std::swap(lg_k, other.lg_k);
83
+ seed = other.seed;
84
+ std::swap(accumulator, other.accumulator);
85
+ bit_matrix = std::move(other.bit_matrix);
86
+ return *this;
87
+ }
88
+
89
+ template<typename A>
90
+ void cpc_union_alloc<A>::update(const cpc_sketch_alloc<A>& sketch) {
91
+ internal_update(sketch);
92
+ }
93
+
94
+ template<typename A>
95
+ void cpc_union_alloc<A>::update(cpc_sketch_alloc<A>&& sketch) {
96
+ internal_update(std::forward<cpc_sketch_alloc<A>>(sketch));
97
+ }
98
+
99
+ template<typename A>
100
+ template<typename S>
101
+ void cpc_union_alloc<A>::internal_update(S&& sketch) {
102
+ const uint16_t seed_hash_union = compute_seed_hash(seed);
103
+ const uint16_t seed_hash_sketch = compute_seed_hash(sketch.seed);
104
+ if (seed_hash_union != seed_hash_sketch) {
105
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash_union) + ", "
106
+ + std::to_string(seed_hash_sketch));
107
+ }
108
+ const auto src_flavor = sketch.determine_flavor();
109
+ if (cpc_sketch_alloc<A>::flavor::EMPTY == src_flavor) return;
110
+
111
+ if (sketch.get_lg_k() < lg_k) reduce_k(sketch.get_lg_k());
112
+ if (sketch.get_lg_k() < lg_k) throw std::logic_error("sketch lg_k < union lg_k");
113
+
114
+ if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit matrix are absent");
115
+
116
+ if (cpc_sketch_alloc<A>::flavor::SPARSE == src_flavor && accumulator != nullptr) { // Case A
117
+ if (bit_matrix.size() > 0) throw std::logic_error("union bit_matrix is not expected");
118
+ const auto initial_dest_flavor = accumulator->determine_flavor();
119
+ if (cpc_sketch_alloc<A>::flavor::EMPTY != initial_dest_flavor &&
120
+ cpc_sketch_alloc<A>::flavor::SPARSE != initial_dest_flavor) throw std::logic_error("wrong flavor");
121
+
122
+ // The following partially fixes the snowplow problem provided that the K's are equal.
123
+ if (cpc_sketch_alloc<A>::flavor::EMPTY == initial_dest_flavor && lg_k == sketch.get_lg_k()) {
124
+ *accumulator = std::forward<S>(sketch);
125
+ return;
126
+ }
127
+
128
+ walk_table_updating_sketch(sketch.surprising_value_table);
129
+ const auto final_dst_flavor = accumulator->determine_flavor();
130
+ // if the accumulator has graduated beyond sparse, switch to a bit matrix representation
131
+ if (final_dst_flavor != cpc_sketch_alloc<A>::flavor::EMPTY && final_dst_flavor != cpc_sketch_alloc<A>::flavor::SPARSE) {
132
+ switch_to_bit_matrix();
133
+ }
134
+ return;
135
+ }
136
+
137
+ if (cpc_sketch_alloc<A>::flavor::SPARSE == src_flavor && bit_matrix.size() > 0) { // Case B
138
+ if (accumulator != nullptr) throw std::logic_error("union accumulator != null");
139
+ or_table_into_matrix(sketch.surprising_value_table);
140
+ return;
141
+ }
142
+
143
+ if (cpc_sketch_alloc<A>::flavor::HYBRID != src_flavor && cpc_sketch_alloc<A>::flavor::PINNED != src_flavor
144
+ && cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor");
145
+
146
+ // source is past SPARSE mode, so make sure that dest is a bit matrix
147
+ if (accumulator != nullptr) {
148
+ if (bit_matrix.size() > 0) throw std::logic_error("union bit matrix is not expected");
149
+ const auto dst_flavor = accumulator->determine_flavor();
150
+ if (cpc_sketch_alloc<A>::flavor::EMPTY != dst_flavor && cpc_sketch_alloc<A>::flavor::SPARSE != dst_flavor) {
151
+ throw std::logic_error("wrong flavor");
152
+ }
153
+ switch_to_bit_matrix();
154
+ }
155
+ if (bit_matrix.size() == 0) throw std::logic_error("union bit_matrix is expected");
156
+
157
+ if (cpc_sketch_alloc<A>::flavor::HYBRID == src_flavor || cpc_sketch_alloc<A>::flavor::PINNED == src_flavor) { // Case C
158
+ or_window_into_matrix(sketch.sliding_window, sketch.window_offset, sketch.get_lg_k());
159
+ or_table_into_matrix(sketch.surprising_value_table);
160
+ return;
161
+ }
162
+
163
+ // SLIDING mode involves inverted logic, so we can't just walk the source sketch.
164
+ // Instead, we convert it to a bitMatrix that can be OR'ed into the destination.
165
+ if (cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D
166
+ vector_u64<A> src_matrix = sketch.build_bit_matrix();
167
+ or_matrix_into_matrix(src_matrix, sketch.get_lg_k());
168
+ }
169
+
170
+ template<typename A>
171
+ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result() const {
172
+ if (accumulator != nullptr) {
173
+ if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
174
+ return get_result_from_accumulator();
175
+ }
176
+ if (bit_matrix.size() == 0) throw std::logic_error("bit_matrix is expected");
177
+ return get_result_from_bit_matrix();
178
+ }
179
+
180
+ template<typename A>
181
+ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
182
+ if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
183
+ if (accumulator->get_num_coupons() == 0) {
184
+ return cpc_sketch_alloc<A>(lg_k, seed);
185
+ }
186
+ if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
187
+ cpc_sketch_alloc<A> copy(*accumulator);
188
+ copy.was_merged = true;
189
+ return copy;
190
+ }
191
+
192
+ template<typename A>
193
+ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
194
+ const uint64_t k = 1 << lg_k;
195
+ const uint64_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
196
+
197
+ const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
198
+ if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
199
+ && flavor != cpc_sketch_alloc<A>::flavor::SLIDING) throw std::logic_error("wrong flavor");
200
+
201
+ const uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
202
+
203
+ vector_u8<A> sliding_window(k);
204
+ // don't need to zero the window's memory
205
+
206
+ // dynamically growing caused snowplow effect
207
+ uint8_t table_lg_size = lg_k - 4; // K/16; in some cases this will end up being oversized
208
+ if (table_lg_size < 2) table_lg_size = 2;
209
+ u32_table<A> table(table_lg_size, 6 + lg_k);
210
+
211
+ // the following should work even when the offset is zero
212
+ const uint64_t mask_for_clearing_window = (static_cast<uint64_t>(0xff) << offset) ^ UINT64_MAX;
213
+ const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << offset) - 1;
214
+ uint64_t all_surprises_ored = 0;
215
+
216
+ // The snowplow effect was caused by processing the rows in order,
217
+ // but we have fixed it by using a sufficiently large hash table.
218
+ for (unsigned i = 0; i < k; i++) {
219
+ uint64_t pattern = bit_matrix[i];
220
+ sliding_window[i] = (pattern >> offset) & 0xff;
221
+ pattern &= mask_for_clearing_window;
222
+ pattern ^= mask_for_flipping_early_zone; // this flipping converts surprising 0's to 1's
223
+ all_surprises_ored |= pattern;
224
+ while (pattern != 0) {
225
+ const uint8_t col = count_trailing_zeros_in_u64(pattern);
226
+ pattern = pattern ^ (static_cast<uint64_t>(1) << col); // erase the 1
227
+ const uint32_t row_col = (i << 6) | col;
228
+ bool is_novel = table.maybe_insert(row_col);
229
+ if (!is_novel) throw std::logic_error("is_novel != true");
230
+ }
231
+ }
232
+
233
+ // at this point we could shrink an oversized hash table, but the relative waste isn't very big
234
+
235
+ uint8_t first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored);
236
+ if (first_interesting_column > offset) first_interesting_column = offset; // corner case
237
+
238
+ // HIP-related fields will contain zeros, and that is okay
239
+ return cpc_sketch_alloc<A>(lg_k, num_coupons, first_interesting_column, std::move(table), std::move(sliding_window), false, 0, 0, seed);
240
+ }
241
+
242
+ template<typename A>
243
+ void cpc_union_alloc<A>::switch_to_bit_matrix() {
244
+ bit_matrix = accumulator->build_bit_matrix();
245
+ accumulator->~cpc_sketch_alloc<A>();
246
+ AllocCpc().deallocate(accumulator, 1);
247
+ accumulator = nullptr;
248
+ }
249
+
250
+ template<typename A>
251
+ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
252
+ const uint32_t* slots = table.get_slots();
253
+ const size_t num_slots = 1 << table.get_lg_size();
254
+ const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
255
+
256
+ // Using a golden ratio stride fixes the snowplow effect.
257
+ const double golden = 0.6180339887498949025;
258
+ size_t stride = static_cast<size_t>(golden * static_cast<double>(num_slots));
259
+ if (stride < 2) throw std::logic_error("stride < 2");
260
+ if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
261
+ if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
262
+
263
+ for (size_t i = 0, j = 0; i < num_slots; i++, j += stride) {
264
+ j &= num_slots - 1;
265
+ const uint32_t row_col = slots[j];
266
+ if (row_col != UINT32_MAX) {
267
+ accumulator->row_col_update(row_col & dst_mask);
268
+ }
269
+ }
270
+ }
271
+
272
+ template<typename A>
273
+ void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
274
+ const uint32_t* slots = table.get_slots();
275
+ const size_t num_slots = 1 << table.get_lg_size();
276
+ const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
277
+ for (size_t i = 0; i < num_slots; i++) {
278
+ const uint32_t row_col = slots[i];
279
+ if (row_col != UINT32_MAX) {
280
+ const uint8_t col = row_col & 63;
281
+ const size_t row = row_col >> 6;
282
+ bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
283
+ }
284
+ }
285
+ }
286
+
287
+ template<typename A>
288
+ void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
289
+ if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
290
+ const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
291
+ const size_t src_k = 1 << src_lg_k;
292
+ for (size_t src_row = 0; src_row < src_k; src_row++) {
293
+ bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
294
+ }
295
+ }
296
+
297
+ template<typename A>
298
+ void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
299
+ if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
300
+ const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
301
+ const size_t src_k = 1 << src_lg_k;
302
+ for (size_t src_row = 0; src_row < src_k; src_row++) {
303
+ bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
304
+ }
305
+ }
306
+
307
+ template<typename A>
308
+ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
309
+ if (new_lg_k >= lg_k) throw std::logic_error("new LgK >= union lgK");
310
+ if (accumulator == nullptr && bit_matrix.size() == 0) throw std::logic_error("both accumulator and bit_matrix are absent");
311
+
312
+ if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix
313
+ if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
314
+ vector_u64<A> old_matrix = std::move(bit_matrix);
315
+ const uint8_t old_lg_k = lg_k;
316
+ const size_t new_k = 1 << new_lg_k;
317
+ bit_matrix = vector_u64<A>(new_k, 0);
318
+ lg_k = new_lg_k;
319
+ or_matrix_into_matrix(old_matrix, old_lg_k);
320
+ return;
321
+ }
322
+
323
+ if (accumulator != nullptr) { // downsample the unioner's sketch
324
+ if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
325
+ if (!accumulator->is_empty()) {
326
+ cpc_sketch_alloc<A> old_accumulator(*accumulator);
327
+ *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed);
328
+ walk_table_updating_sketch(old_accumulator.surprising_value_table);
329
+ }
330
+ lg_k = new_lg_k;
331
+
332
+ const auto final_new_flavor = accumulator->determine_flavor();
333
+ // if the new sketch has graduated beyond sparse, convert to bit_matrix
334
+ if (final_new_flavor != cpc_sketch_alloc<A>::flavor::EMPTY &&
335
+ final_new_flavor != cpc_sketch_alloc<A>::flavor::SPARSE) {
336
+ switch_to_bit_matrix();
337
+ }
338
+ return;
339
+ }
340
+
341
+ throw std::logic_error("invalid state");
342
+ }
343
+
344
+ } /* namespace datasketches */
345
+
346
+ #endif
@@ -0,0 +1,137 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef CPC_UTIL_HPP_
21
+ #define CPC_UTIL_HPP_
22
+
23
+ #include <stdexcept>
24
+
25
+ namespace datasketches {
26
+
27
+ static inline uint16_t compute_seed_hash(uint64_t seed) {
28
+ HashState hashes;
29
+ MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
30
+ return hashes.h1 & 0xffff;
31
+ }
32
+
33
// Returns x / y rounded up to the next integer (ceiling division).
// Throws std::invalid_argument if y is zero.
static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
  if (y == 0) throw std::invalid_argument("divide_longs_rounding_up: bad argument");
  const uint64_t whole = x / y;
  const uint64_t remainder = x % y;
  // an exact division needs no adjustment; anything else rounds up by one
  return (remainder == 0) ? whole : whole + 1;
}
39
+
40
// Returns floor(log2(x)), i.e. the index of the highest set bit of x.
// Throws std::invalid_argument if x is zero.
static inline uint64_t long_floor_log2_of_long(uint64_t x) {
  if (x < 1) throw std::invalid_argument("long_floor_log2_of_long: bad argument");
  // Count how many times x can be halved before it reaches zero.
  // Shifting x down (instead of growing a probe value up) cannot overflow,
  // so the loop terminates for every input, including values above 2^63.
  uint64_t p = 0;
  while ((x >>= 1) != 0) {
    ++p;
  }
  return p;
}
51
+
52
// Counts the set bits in an array of 64-bit words using
// Wegner's Bit-Counting Algorithm, CACM 3 (1960), p. 322.
// This place-holder version was inadequate because it caused the cost of the
// post-merge get_result() operation to be O(C) instead of O(K). It did have the
// advantage of being very simple and trustworthy during initial testing.
static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
  uint64_t count = 0;
  const uint64_t* const end = array + length;
  for (const uint64_t* word = array; word != end; ++word) {
    // each step clears the lowest set bit, so the inner loop runs once per set bit
    for (uint64_t remaining = *word; remaining != 0; remaining &= (remaining - 1)) {
      ++count;
    }
  }
  return count;
}
74
+
75
// Returns the number of set bits in a 64-bit word.
// Note: this is an adaptation of the Java code,
// which is apparently a variation of Figure 5-2 in "Hacker's Delight"
// by Henry S. Warren.
static inline uint64_t warren_bit_count(uint64_t i) {
  const uint64_t every_other_bit = 0x5555555555555555ULL;
  const uint64_t low_pair_of_each_nibble = 0x3333333333333333ULL;
  const uint64_t low_nibble_of_each_byte = 0x0f0f0f0f0f0f0f0fULL;

  uint64_t sums = i - ((i >> 1) & every_other_bit);                                 // 2-bit partial sums
  sums = (sums & low_pair_of_each_nibble) + ((sums >> 2) & low_pair_of_each_nibble); // 4-bit partial sums
  sums = (sums + (sums >> 4)) & low_nibble_of_each_byte;                             // one sum per byte
  // fold the per-byte sums together; the total ends up in the lowest byte
  sums += sums >> 8;
  sums += sums >> 16;
  sums += sums >> 32;
  return sums & 0x7f;
}
87
+
88
+ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
89
+ uint64_t count = 0;
90
+ for (size_t i = 0; i < length; i++) {
91
+ count += warren_bit_count(array[i]);
92
+ }
93
+ return count;
94
+ }
95
+
96
+ // This code is Figure 5-9 in "Hacker's Delight" by Henry S. Warren.
97
+
98
+ #define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
99
+
100
+ static inline uint64_t count_bits_set_in_matrix(const uint64_t* a, size_t length) {
101
+ if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
102
+ uint64_t total = 0;
103
+ uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
104
+ fours = twos = ones = 0;
105
+
106
+ for (size_t i = 0; i <= length - 8; i = i + 8) {
107
+ CSA(twos_a, ones, ones, a[i+0], a[i+1]);
108
+ CSA(twos_b, ones, ones, a[i+2], a[i+3]);
109
+ CSA(fours_a, twos, twos, twos_a, twos_b);
110
+
111
+ CSA(twos_a, ones, ones, a[i+4], a[i+5]);
112
+ CSA(twos_b, ones, ones, a[i+6], a[i+7]);
113
+ CSA(fours_b, twos, twos, twos_a, twos_b);
114
+
115
+ CSA(eights, fours, fours, fours_a, fours_b);
116
+
117
+ total += warren_bit_count(eights);
118
+ }
119
+ total = 8 * total + 4 * warren_bit_count(fours) + 2 * warren_bit_count(twos) + warren_bit_count(ones);
120
+
121
+ // Because I still don't fully trust this fancy version
122
+ // assert(total == wegner_count_bits_set_in_matrix(A, length));
123
+ //if (total != wegner_count_bits_set_in_matrix(a, length)) throw std::logic_error("count_bits_set_in_matrix error");
124
+
125
+ return total;
126
+ }
127
+
128
+ // Here are some timings made with quickTestMerge.c
129
+ // for the "5 5" case:
130
+
131
+ // Wegner CountBitsTime 29.3
132
+ // Warren CountBitsTime 5.3
133
+ // CSA CountBitsTime 4.3
134
+
135
+ } /* namespace datasketches */
136
+
137
+ #endif