datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,48 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _HARMONICNUMBERS_HPP_
21
+ #define _HARMONICNUMBERS_HPP_
22
+
23
+ #include <cstdint>
24
+ #include <memory>
25
+
26
+ namespace datasketches {
27
+
28
/**
 * Collection of static estimation helpers. Only declarations live here; this
 * header includes "HarmonicNumbers-internal.hpp" after the namespace closes.
 *
 * @tparam A allocator type, defaulting to std::allocator<char>
 */
template<typename A = std::allocator<char>>
class HarmonicNumbers {
  public:
    /**
     * Estimator for a flat, randomly accessed bit map (similar to a Bloom filter).
     *
     * @param bitVectorLength length of the bit vector in bits; must be > 0
     * @param numBitsSet number of bits set in the bit vector; must be >= 0 and
     *        <= bitVectorLength
     * @return the estimate
     */
    static double getBitMapEstimate(int bitVectorLength, int numBitsSet);

  private:
    // Internal helper taking a single 64-bit argument; see the -internal header.
    static double harmonicNumber(uint64_t x_i);
};
43
+
44
+ }
45
+
46
+ #include "HarmonicNumbers-internal.hpp"
47
+
48
+ #endif /* _HARMONICNUMBERS_HPP_ */
@@ -0,0 +1,335 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _HLL4ARRAY_INTERNAL_HPP_
21
+ #define _HLL4ARRAY_INTERNAL_HPP_
22
+
23
+ #include "Hll4Array.hpp"
24
+
25
+ #include <cstring>
26
+ #include <memory>
27
+ #include <stdexcept>
28
+ #include <string>
29
+
30
+ namespace datasketches {
31
+
32
+ template<typename A>
33
+ Hll4Array<A>::Hll4Array(const int lgConfigK, const bool startFullSize) :
34
+ HllArray<A>(lgConfigK, target_hll_type::HLL_4, startFullSize) {
35
+ const int numBytes = this->hll4ArrBytes(lgConfigK);
36
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
37
+ this->hllByteArr = uint8Alloc().allocate(numBytes);
38
+ std::fill(this->hllByteArr, this->hllByteArr + numBytes, 0);
39
+ auxHashMap = nullptr;
40
+ }
41
+
42
+ template<typename A>
43
+ Hll4Array<A>::Hll4Array(const Hll4Array<A>& that) :
44
+ HllArray<A>(that)
45
+ {
46
+ // can determine hllByteArr size in parent class, no need to allocate here
47
+ // but parent class doesn't handle the auxHashMap
48
+ if (that.auxHashMap != nullptr) {
49
+ auxHashMap = that.auxHashMap->copy();
50
+ } else {
51
+ auxHashMap = nullptr;
52
+ }
53
+ }
54
+
55
+ template<typename A>
56
+ Hll4Array<A>::~Hll4Array() {
57
+ // hllByteArr deleted in parent
58
+ if (auxHashMap != nullptr) {
59
+ AuxHashMap<A>::make_deleter()(auxHashMap);
60
+ }
61
+ }
62
+
63
+ template<typename A>
64
+ std::function<void(HllSketchImpl<A>*)> Hll4Array<A>::get_deleter() const {
65
+ return [](HllSketchImpl<A>* ptr) {
66
+ typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
67
+ Hll4Array<A>* hll = static_cast<Hll4Array<A>*>(ptr);
68
+ hll->~Hll4Array();
69
+ hll4Alloc().deallocate(hll, 1);
70
+ };
71
+ }
72
+
73
+ template<typename A>
74
+ Hll4Array<A>* Hll4Array<A>::copy() const {
75
+ typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
76
+ return new (hll4Alloc().allocate(1)) Hll4Array<A>(*this);
77
+ }
78
+
79
+ template<typename A>
80
+ int Hll4Array<A>::getUpdatableSerializationBytes() const {
81
+ AuxHashMap<A>* auxHashMap = getAuxHashMap();
82
+ int auxBytes;
83
+ if (auxHashMap == nullptr) {
84
+ auxBytes = 4 << HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK];
85
+ } else {
86
+ auxBytes = 4 << auxHashMap->getLgAuxArrInts();
87
+ }
88
+ return HllUtil<A>::HLL_BYTE_ARR_START + getHllByteArrBytes() + auxBytes;
89
+ }
90
+
91
+ template<typename A>
92
+ int Hll4Array<A>::getHllByteArrBytes() const {
93
+ return this->hll4ArrBytes(this->lgConfigK);
94
+ }
95
+
96
+ template<typename A>
97
+ AuxHashMap<A>* Hll4Array<A>::getAuxHashMap() const {
98
+ return auxHashMap;
99
+ }
100
+
101
+ template<typename A>
102
+ void Hll4Array<A>::putAuxHashMap(AuxHashMap<A>* auxHashMap) {
103
+ this->auxHashMap = auxHashMap;
104
+ }
105
+
106
+ template<typename A>
107
+ uint8_t Hll4Array<A>::getSlot(int slotNo) const {
108
+ const uint8_t byte = this->hllByteArr[slotNo >> 1];
109
+ if ((slotNo & 1) > 0) { // odd?
110
+ return byte >> 4;
111
+ }
112
+ return byte & HllUtil<A>::loNibbleMask;
113
+ }
114
+
115
+ template<typename A>
116
+ uint8_t Hll4Array<A>::get_value(uint32_t index) const {
117
+ const uint8_t value = getSlot(index);
118
+ if (value != HllUtil<A>::AUX_TOKEN) return value + this->curMin;
119
+ return auxHashMap->mustFindValueFor(index);
120
+ }
121
+
122
+ template<typename A>
123
+ HllSketchImpl<A>* Hll4Array<A>::couponUpdate(const int coupon) {
124
+ internalCouponUpdate(coupon);
125
+ return this;
126
+ }
127
+
128
+ template<typename A>
129
+ void Hll4Array<A>::internalCouponUpdate(const int coupon) {
130
+ const int newValue = HllUtil<A>::getValue(coupon);
131
+ if (newValue <= this->curMin) {
132
+ return; // quick rejection, but only works for large N
133
+ }
134
+ const int configKmask = (1 << this->lgConfigK) - 1;
135
+ const int slotNo = HllUtil<A>::getLow26(coupon) & configKmask;
136
+ internalHll4Update(slotNo, newValue);
137
+ }
138
+
139
+ template<typename A>
140
+ void Hll4Array<A>::putSlot(int slotNo, uint8_t newValue) {
141
+ const int byteno = slotNo >> 1;
142
+ const uint8_t oldValue = this->hllByteArr[byteno];
143
+ if ((slotNo & 1) == 0) { // set low nibble
144
+ this->hllByteArr[byteno]
145
+ = ((oldValue & HllUtil<A>::hiNibbleMask) | (newValue & HllUtil<A>::loNibbleMask));
146
+ } else { // set high nibble
147
+ this->hllByteArr[byteno]
148
+ = ((oldValue & HllUtil<A>::loNibbleMask) | ((newValue << 4) & HllUtil<A>::hiNibbleMask));
149
+ }
150
+ }
151
+
152
+ //In C: two-registers.c Line 836 in "hhb_abstract_set_slot_if_new_value_bigger" non-sparse
153
+ template<typename A>
154
+ void Hll4Array<A>::internalHll4Update(const int slotNo, const int newVal) {
155
+
156
+ const int rawStoredOldValue = getSlot(slotNo); // could be a 0
157
+ // this is provably a LB:
158
+ const int lbOnOldValue = rawStoredOldValue + this->curMin; // lower bound, could be 0
159
+
160
+ if (newVal > lbOnOldValue) { // 842
161
+ // Note: if an AUX_TOKEN exists, then auxHashMap must already exist
162
+ // 846: rawStoredOldValue == AUX_TOKEN
163
+ const int actualOldValue = (rawStoredOldValue < HllUtil<A>::AUX_TOKEN)
164
+ ? (lbOnOldValue) : (auxHashMap->mustFindValueFor(slotNo));
165
+
166
+ if (newVal > actualOldValue) { // 848: actualOldValue could still be 0; newValue > 0
167
+ // we know that the array will change, but we haven't actually updated yet
168
+ this->hipAndKxQIncrementalUpdate(actualOldValue, newVal);
169
+
170
+ // newVal >= curMin
171
+
172
+ const int shiftedNewValue = newVal - this->curMin; // 874
173
+ // redundant since we know newVal >= curMin,
174
+ // and lgConfigK bounds do not allow overflowing an int
175
+ //assert(shiftedNewValue >= 0);
176
+
177
+ if (rawStoredOldValue == HllUtil<A>::AUX_TOKEN) { // 879
178
+ // Given that we have an AUX_TOKEN, there are 4 cases for how to
179
+ // actually modify the data structure
180
+
181
+ if (shiftedNewValue >= HllUtil<A>::AUX_TOKEN) { // case 1: 881
182
+ // the byte array already contains aux token
183
+ // This is the case where old and new values are both exceptions.
184
+ // The 4-bit array already is AUX_TOKEN, only need to update auxHashMap
185
+ auxHashMap->mustReplace(slotNo, newVal);
186
+ }
187
+ else { // case 2: 885
188
+ // This is the hypothetical case where the old value is an exception and the new one is not,
189
+ // which is impossible given that curMin has not changed here and newVal > oldValue
190
+ }
191
+ } else { // rawStoredOldValue != AUX_TOKEN
192
+ if (shiftedNewValue >= HllUtil<A>::AUX_TOKEN) { // case 3: 892
193
+ // This is the case where the old value is not an exception and the new value is.
194
+ // The AUX_TOKEN must be stored in the 4-bit array and the new value
195
+ // added to the exception table
196
+ putSlot(slotNo, HllUtil<A>::AUX_TOKEN);
197
+ if (auxHashMap == nullptr) {
198
+ auxHashMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK);
199
+ }
200
+ auxHashMap->mustAdd(slotNo, newVal);
201
+ }
202
+ else { // case 4: 897
203
+ // This is the case where neither the old value nor the new value is an exception.
204
+ // We just overwrite the 4-bit array with the shifted new value.
205
+ putSlot(slotNo, shiftedNewValue);
206
+ }
207
+ }
208
+
209
+ // we just increased a pair value, so it might be time to change curMin
210
+ if (actualOldValue == this->curMin) { // 908
211
+ this->decNumAtCurMin();
212
+ while (this->numAtCurMin == 0) {
213
+ shiftToBiggerCurMin(); // increases curMin by 1, builds a new aux table
214
+ // shifts values in 4-bit table and recounts curMin
215
+ }
216
+ }
217
+ } // end newVal <= actualOldValue
218
+ } // end newValue <= lbOnOldValue -> return, no need to update array
219
+ }
220
+
221
+ // This scheme only works with two double registers (2 kxq values).
222
+ // HipAccum, kxq0 and kxq1 remain untouched.
223
+ // This changes curMin, numAtCurMin, hllByteArr and auxMap.
224
+ // Entering this routine assumes that all slots have valid values > 0 and <= 15.
225
+ // An AuxHashMap must exist if any values in the current hllByteArray are already 15.
226
+ // In C: again-two-registers.c Lines 710 "hhb_shift_to_bigger_curmin"
227
+ template<typename A>
228
+ void Hll4Array<A>::shiftToBiggerCurMin() {
229
+ const int newCurMin = this->curMin + 1;
230
+ const int configK = 1 << this->lgConfigK;
231
+ const int configKmask = configK - 1;
232
+
233
+ int numAtNewCurMin = 0;
234
+ int numAuxTokens = 0;
235
+
236
+ // Walk through the slots of 4-bit array decrementing stored values by one unless it
237
+ // equals AUX_TOKEN, where it is left alone but counted to be checked later.
238
+ // If oldStoredValue is 0 it is an error.
239
+ // If the decremented value is 0, we increment numAtNewCurMin.
240
+ // Because getNibble is masked to 4 bits oldStoredValue can never be > 15 or negative
241
+ for (int i = 0; i < configK; i++) { //724
242
+ int oldStoredValue = getSlot(i);
243
+ if (oldStoredValue == 0) {
244
+ throw std::runtime_error("Array slots cannot be 0 at this point.");
245
+ }
246
+ if (oldStoredValue < HllUtil<A>::AUX_TOKEN) {
247
+ putSlot(i, --oldStoredValue);
248
+ if (oldStoredValue == 0) { numAtNewCurMin++; }
249
+ } else { //oldStoredValue == AUX_TOKEN
250
+ numAuxTokens++;
251
+ if (auxHashMap == nullptr) {
252
+ throw std::logic_error("auxHashMap cannot be null at this point");
253
+ }
254
+ }
255
+ }
256
+
257
+ // If old AuxHashMap exists, walk through it updating some slots and build a new AuxHashMap
258
+ // if needed.
259
+ AuxHashMap<A>* newAuxMap = nullptr;
260
+ if (auxHashMap != nullptr) {
261
+ int slotNum;
262
+ int oldActualVal;
263
+ int newShiftedVal;
264
+
265
+ for (auto coupon: *auxHashMap) {
266
+ slotNum = HllUtil<A>::getLow26(coupon) & configKmask;
267
+ oldActualVal = HllUtil<A>::getValue(coupon);
268
+ newShiftedVal = oldActualVal - newCurMin;
269
+ if (newShiftedVal < 0) {
270
+ throw std::logic_error("oldActualVal < newCurMin when incrementing curMin");
271
+ }
272
+
273
+ if (getSlot(slotNum) != HllUtil<A>::AUX_TOKEN) {
274
+ throw std::logic_error("getSlot(slotNum) != AUX_TOKEN for item in auxiliary hash map");
275
+ }
276
+ // Array slot != AUX_TOKEN at getSlot(slotNum);
277
+ if (newShiftedVal < HllUtil<A>::AUX_TOKEN) { // 756
278
+ if (newShiftedVal != 14) {
279
+ throw std::logic_error("newShiftedVal != 14 for item in old auxHashMap despite curMin increment");
280
+ }
281
+ // The former exception value isn't one anymore, so it stays out of new AuxHashMap.
282
+ // Correct the AUX_TOKEN value in the HLL array to the newShiftedVal (14).
283
+ putSlot(slotNum, newShiftedVal);
284
+ numAuxTokens--;
285
+ } else { //newShiftedVal >= AUX_TOKEN
286
+ // the former exception remains an exception, so must be added to the newAuxMap
287
+ if (newAuxMap == nullptr) {
288
+ newAuxMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK);
289
+ }
290
+ newAuxMap->mustAdd(slotNum, oldActualVal);
291
+ }
292
+ } //end scan of oldAuxMap
293
+ } //end if (auxHashMap != null)
294
+ else { // oldAuxMap == null
295
+ if (numAuxTokens != 0) {
296
+ throw std::logic_error("No auxiliary hash map, but numAuxTokens != 0");
297
+ }
298
+ }
299
+
300
+ if (newAuxMap != nullptr) {
301
+ if (newAuxMap->getAuxCount() != numAuxTokens) {
302
+ throw std::runtime_error("Inconsistent counts: auxCount: " + std::to_string(newAuxMap->getAuxCount())
303
+ + ", HLL tokesn: " + std::to_string(numAuxTokens));
304
+ }
305
+ }
306
+
307
+ if (auxHashMap != nullptr) {
308
+ AuxHashMap<A>::make_deleter()(auxHashMap);
309
+ }
310
+ auxHashMap = newAuxMap;
311
+
312
+ this->curMin = newCurMin;
313
+ this->numAtCurMin = numAtNewCurMin;
314
+ }
315
+
316
+ template<typename A>
317
+ typename HllArray<A>::const_iterator Hll4Array<A>::begin(bool all) const {
318
+ return typename HllArray<A>::const_iterator(this->hllByteArr, 1 << this->lgConfigK, 0, this->tgtHllType, auxHashMap, this->curMin, all);
319
+ }
320
+
321
+ template<typename A>
322
+ typename HllArray<A>::const_iterator Hll4Array<A>::end() const {
323
+ return typename HllArray<A>::const_iterator(this->hllByteArr, 1 << this->lgConfigK, 1 << this->lgConfigK, this->tgtHllType, auxHashMap, this->curMin, false);
324
+ }
325
+
326
+ template<typename A>
327
+ void Hll4Array<A>::mergeHll(const HllArray<A>& src) {
328
+ for (auto coupon: src) {
329
+ internalCouponUpdate(coupon);
330
+ }
331
+ }
332
+
333
+ }
334
+
335
+ #endif // _HLL4ARRAY_INTERNAL_HPP_
@@ -0,0 +1,69 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _HLL4ARRAY_HPP_
21
+ #define _HLL4ARRAY_HPP_
22
+
23
+ #include "AuxHashMap.hpp"
24
+ #include "HllArray.hpp"
25
+
26
+ namespace datasketches {
27
+
28
+ template<typename A> // forward declaration only; no definition appears in this header
29
+ class Hll4Iterator; // NOTE(review): not referenced by the Hll4Array declaration below (begin()/end() return HllArray<A>::const_iterator) -- possibly a leftover, confirm before removing
30
+
31
+ template<typename A> // A is passed straight through to HllArray, HllSketchImpl and AuxHashMap; presumably an allocator type -- TODO confirm
32
+ class Hll4Array final : public HllArray<A> { // HllArray subclass that additionally carries an AuxHashMap pointer (member auxHashMap); final, so it cannot be derived from
33
+ public:
34
+ explicit Hll4Array(int lgConfigK, bool startFullSize); // lgConfigK is presumably stored as the lgConfigK that begin()/end() shift by (1 << lgConfigK); meaning of startFullSize is not visible in this header
35
+ explicit Hll4Array(const Hll4Array<A>& that); // copy constructor; being explicit, only direct-initialization (Hll4Array<A> b(a)) compiles
36
+
37
+ virtual ~Hll4Array(); // NOTE(review): auxHashMap is a raw pointer; presumably released here -- confirm in the implementation
38
+ virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const; // returns a callable taking an HllSketchImpl<A>*; NOTE(review): <functional> is not included directly by this header, it must arrive via the two project includes
39
+
40
+ virtual Hll4Array* copy() const; // returns a raw Hll4Array*; presumably a new object whose ownership passes to the caller -- confirm
41
+
42
+ inline uint8_t getSlot(int slotNo) const; // presumably the read counterpart of putSlot for slot slotNo
43
+ inline void putSlot(int slotNo, uint8_t value); // writes value into the HLL array at slotNo (shiftToBiggerCurMin uses it to overwrite an AUX_TOKEN slot)
44
+ inline uint8_t get_value(uint32_t index) const; // NOTE(review): how this differs from getSlot is not visible from this header
45
+
46
+ virtual int getUpdatableSerializationBytes() const; // a byte count, going by the name; the computation is not visible here
47
+ virtual int getHllByteArrBytes() const; // a byte count, going by the name; the computation is not visible here
48
+
49
+ virtual HllSketchImpl<A>* couponUpdate(int coupon) final; // marked final; which object the returned HllSketchImpl<A>* refers to is not visible here
50
+ void mergeHll(const HllArray<A>& src); // iterates src and passes every yielded value to internalCouponUpdate
51
+
52
+ virtual AuxHashMap<A>* getAuxHashMap() const; // presumably returns the auxHashMap member -- confirm
53
+ // does *not* delete old map if overwriting
54
+ void putAuxHashMap(AuxHashMap<A>* auxHashMap); // per the note above, the caller stays responsible for any previously installed map
55
+
56
+ virtual typename HllArray<A>::const_iterator begin(bool all = false) const; // iterator built with 0 as its position argument; `all` defaults to false and is forwarded to the iterator
57
+ virtual typename HllArray<A>::const_iterator end() const; // iterator built with 1 << lgConfigK as its position argument and all == false
58
+
59
+ private:
60
+ void internalCouponUpdate(int coupon); // per-coupon update; mergeHll calls it once for each value iterated from the source
61
+ void internalHll4Update(int slotNo, int newVal); // NOTE(review): contract not visible from this header
62
+ void shiftToBiggerCurMin(); // ends by swapping in a rebuilt aux map and storing the new curMin / numAtCurMin
63
+
64
+ AuxHashMap<A>* auxHashMap; // raw pointer, may be nullptr; shiftToBiggerCurMin releases the old map through AuxHashMap<A>::make_deleter() before replacing it
65
+ };
66
+
67
+ }
68
+
69
+ #endif /* _HLL4ARRAY_HPP_ */