datasketches 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/LICENSE +310 -0
  4. data/NOTICE +11 -0
  5. data/README.md +126 -0
  6. data/ext/datasketches/cpc_wrapper.cpp +50 -0
  7. data/ext/datasketches/ext.cpp +12 -0
  8. data/ext/datasketches/extconf.rb +11 -0
  9. data/ext/datasketches/hll_wrapper.cpp +69 -0
  10. data/lib/datasketches.rb +9 -0
  11. data/lib/datasketches/version.rb +3 -0
  12. data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
  13. data/vendor/datasketches-cpp/LICENSE +311 -0
  14. data/vendor/datasketches-cpp/MANIFEST.in +19 -0
  15. data/vendor/datasketches-cpp/NOTICE +11 -0
  16. data/vendor/datasketches-cpp/README.md +42 -0
  17. data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
  18. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
  19. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
  20. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
  21. data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
  24. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
  25. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
  26. data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
  27. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
  28. data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
  29. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
  30. data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
  31. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
  32. data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
  33. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
  34. data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
  35. data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
  36. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
  37. data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
  38. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
  39. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
  40. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
  41. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
  42. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
  43. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
  44. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
  45. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
  46. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
  47. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
  48. data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
  49. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
  50. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
  51. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
  52. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
  53. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
  54. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
  55. data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
  59. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
  60. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
  63. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  64. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  65. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  66. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
  67. data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
  68. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
  69. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
  70. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
  71. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
  72. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
  73. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
  74. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
  75. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
  76. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
  77. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
  78. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
  79. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
  80. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
  81. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
  82. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
  83. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
  84. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
  85. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
  86. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
  87. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
  88. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
  89. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
  90. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
  91. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
  92. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
  93. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
  94. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
  95. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
  96. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
  97. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
  98. data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
  99. data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
  100. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
  101. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
  102. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
  103. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
  104. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
  105. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
  106. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
  107. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
  108. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
  109. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
  110. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
  111. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  112. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  113. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  114. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  115. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  116. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  117. data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
  118. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
  119. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
  120. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
  121. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
  122. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
  123. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
  124. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
  125. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
  126. data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
  127. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  128. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
  129. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
  130. data/vendor/datasketches-cpp/pyproject.toml +17 -0
  131. data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
  132. data/vendor/datasketches-cpp/python/README.md +78 -0
  133. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
  134. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
  135. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
  136. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
  137. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
  138. data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
  139. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
  140. data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
  141. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
  142. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
  143. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
  144. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
  145. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
  146. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
  147. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
  148. data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
  149. data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
  150. data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
  151. data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
  152. data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
  153. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
  154. data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
  155. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
  156. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
  157. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
  158. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
  159. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
  160. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
  161. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
  162. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
  163. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
  164. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  165. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  166. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  167. data/vendor/datasketches-cpp/setup.py +94 -0
  168. data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
  169. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
  170. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
  171. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
  172. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
  173. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
  174. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
  175. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
  176. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
  177. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
  178. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
  179. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  180. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  181. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  182. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
  183. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
  184. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
  185. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
  188. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
  189. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
  190. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
  191. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
  192. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
  193. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
  194. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
  195. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
  196. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
  197. data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
  198. data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
  199. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
  200. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
  201. data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
  202. data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
  203. data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
  204. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
  205. data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
  206. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
  207. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
  208. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
  209. data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
  210. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
  211. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
  212. data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
  213. data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
  214. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
  215. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
  216. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
  217. data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
  218. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
  219. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
  220. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
  221. data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
  222. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
  223. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
  224. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
  225. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
  226. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
  227. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
  228. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  230. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  231. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
  232. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
  233. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
  234. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
  238. data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
  239. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
  240. data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
  241. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
  242. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
  243. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
  244. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
  245. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
  246. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
  247. metadata +302 -0
@@ -0,0 +1,114 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REVERSE_PURGE_HASH_MAP_HPP_
21
+ #define REVERSE_PURGE_HASH_MAP_HPP_
22
+
23
#include <cstddef>
#include <iterator>
#include <memory>
25
+
26
+ namespace datasketches {
27
+
28
+ /*
29
+ * This is a specialized linear-probing hash map with a reverse purge operation
30
+ * that removes all entries in the map with values that are not positive.
31
+ * Based on Java implementation here:
32
+ * https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ReversePurgeItemHashMap.java
33
+ * author Alexander Saydakov
34
+ */
35
+
36
// Linear-probing hash map with three parallel arrays: keys, values and per-slot states.
// A state of zero marks an empty slot; any positive state marks an active slot.
//
// Template parameters:
//   K - key type
//   V - value type (defaults to uint64_t)
//   H - hash functor applied to keys
//   E - key equality functor
//   A - allocator for keys; rebound to allocate the values and states arrays
template<
  typename K,
  typename V = uint64_t,
  typename H = std::hash<K>,
  typename E = std::equal_to<K>,
  typename A = std::allocator<K>
>
class reverse_purge_hash_map {
public:
  // allocators derived from the key allocator
  using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
  using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;

  // iterator over the active entries
  class iterator;

  // Creates an empty map with 2^lg_size slots; lg_max_size is stored for later use.
  reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size);
  reverse_purge_hash_map(const reverse_purge_hash_map& other);
  reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
  ~reverse_purge_hash_map();

  // NOTE(review): a by-value overload next to an rvalue-reference overload looks
  // ambiguous when assigning from an rvalue -- confirm against callers.
  reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
  reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);

  // Adjusts the value of an existing key or inserts a new key.
  // Returns 0 unless a new key was added, in which case the result of the
  // resize-or-purge step is returned.
  V adjust_or_insert(const K& key, V value);
  V adjust_or_insert(K&& key, V value);

  // Returns the value stored for the key, or 0 if the key is not present.
  V get(const K& key) const;

  // accessors
  uint8_t get_lg_cur_size() const;
  uint8_t get_lg_max_size() const;
  uint32_t get_capacity() const;   // table size scaled by LOAD_FACTOR
  uint32_t get_num_active() const; // number of active slots

  iterator begin() const;
  iterator end() const;

private:
  static constexpr double LOAD_FACTOR = 0.75;
  static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
  static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge

  // the declaration order of the data members below is also their initialization order
  uint8_t lg_cur_size;  // log2 of the current number of slots
  uint8_t lg_max_size;
  uint32_t num_active;  // number of slots with a positive state
  K* keys;
  V* values;
  uint16_t* states;

  inline bool is_active(uint32_t probe) const;
  void subtract_and_keep_positive_only(V amount);
  void hash_delete(uint32_t probe);
  uint32_t internal_adjust_or_insert(const K& key, V value);
  V resize_or_purge_if_needed();
  void resize(uint8_t lg_new_size);
  V purge();
};
78
+
79
+ // This iterator uses strides based on golden ratio to avoid clustering during merge
80
+ template<typename K, typename V, typename H, typename E, typename A>
81
+ class reverse_purge_hash_map<K, V, H, E, A>::iterator: public std::iterator<std::input_iterator_tag, K> {
82
+ public:
83
+ friend class reverse_purge_hash_map<K, V, H, E, A>;
84
+ iterator& operator++() {
85
+ ++count;
86
+ if (count < map->num_active) {
87
+ const uint32_t mask = (1 << map->lg_cur_size) - 1;
88
+ do {
89
+ index = (index + stride) & mask;
90
+ } while (!map->is_active(index));
91
+ }
92
+ return *this;
93
+ }
94
+ iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
95
+ bool operator==(const iterator& rhs) const { return count == rhs.count; }
96
+ bool operator!=(const iterator& rhs) const { return count != rhs.count; }
97
+ const std::pair<K&, V> operator*() const {
98
+ return std::pair<K&, V>(map->keys[index], map->values[index]);
99
+ }
100
+ private:
101
+ static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
102
+ const reverse_purge_hash_map<K, V, H, E, A>* map;
103
+ uint32_t index;
104
+ uint32_t count;
105
+ uint32_t stride;
106
+ iterator(const reverse_purge_hash_map<K, V, H, E, A>* map, uint32_t index, uint32_t count):
107
+ map(map), index(index), count(count), stride(static_cast<uint32_t>((1 << map->lg_cur_size) * GOLDEN_RATIO_RECIPROCAL) | 1) {}
108
+ };
109
+
110
+ } /* namespace datasketches */
111
+
112
+ #include "reverse_purge_hash_map_impl.hpp"
113
+
114
+ #endif
@@ -0,0 +1,345 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef REVERSE_PURGE_HASH_MAP_IMPL_HPP_
21
+ #define REVERSE_PURGE_HASH_MAP_IMPL_HPP_
22
+
23
+ #include <memory>
24
+ #include <algorithm>
25
+ #include <iterator>
26
+ #include <cmath>
27
+
28
+ #include "MurmurHash3.h"
29
+
30
+ namespace datasketches {
31
+
32
+ // clang++ seems to require this declaration for CMAKE_BUILD_TYPE='Debug"
33
+ template<typename K, typename V, typename H, typename E, typename A>
34
+ constexpr uint32_t reverse_purge_hash_map<K, V, H, E, A>::MAX_SAMPLE_SIZE;
35
+
36
+ template<typename K, typename V, typename H, typename E, typename A>
37
+ reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(uint8_t lg_cur_size, uint8_t lg_max_size):
38
+ lg_cur_size(lg_cur_size),
39
+ lg_max_size(lg_max_size),
40
+ num_active(0),
41
+ keys(A().allocate(1 << lg_cur_size)),
42
+ values(AllocV().allocate(1 << lg_cur_size)),
43
+ states(AllocU16().allocate(1 << lg_cur_size))
44
+ {
45
+ std::fill(states, &states[1 << lg_cur_size], 0);
46
+ }
47
+
48
+ template<typename K, typename V, typename H, typename E, typename A>
49
+ reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(const reverse_purge_hash_map<K, V, H, E, A>& other):
50
+ lg_cur_size(other.lg_cur_size),
51
+ lg_max_size(other.lg_max_size),
52
+ num_active(other.num_active),
53
+ keys(A().allocate(1 << lg_cur_size)),
54
+ values(AllocV().allocate(1 << lg_cur_size)),
55
+ states(AllocU16().allocate(1 << lg_cur_size))
56
+ {
57
+ const uint32_t size = 1 << lg_cur_size;
58
+ if (num_active > 0) {
59
+ auto num = num_active;
60
+ for (uint32_t i = 0; i < size; i++) {
61
+ if (other.states[i] > 0) {
62
+ new (&keys[i]) K(other.keys[i]);
63
+ values[i] = other.values[i];
64
+ }
65
+ if (--num == 0) break;
66
+ }
67
+ }
68
+ std::copy(&other.states[0], &other.states[size], states);
69
+ }
70
+
71
+ template<typename K, typename V, typename H, typename E, typename A>
72
+ reverse_purge_hash_map<K, V, H, E, A>::reverse_purge_hash_map(reverse_purge_hash_map<K, V, H, E, A>&& other) noexcept:
73
+ lg_cur_size(other.lg_cur_size),
74
+ lg_max_size(other.lg_max_size),
75
+ num_active(other.num_active),
76
+ keys(nullptr),
77
+ values(nullptr),
78
+ states(nullptr)
79
+ {
80
+ std::swap(keys, other.keys);
81
+ std::swap(values, other.values);
82
+ std::swap(states, other.states);
83
+ other.num_active = 0;
84
+ }
85
+
86
+ template<typename K, typename V, typename H, typename E, typename A>
87
+ reverse_purge_hash_map<K, V, H, E, A>::~reverse_purge_hash_map() {
88
+ const uint32_t size = 1 << lg_cur_size;
89
+ if (num_active > 0) {
90
+ for (uint32_t i = 0; i < size; i++) {
91
+ if (is_active(i)) {
92
+ keys[i].~K();
93
+ if (--num_active == 0) break;
94
+ }
95
+ }
96
+ }
97
+ if (keys != nullptr)
98
+ A().deallocate(keys, size);
99
+ if (values != nullptr)
100
+ AllocV().deallocate(values, size);
101
+ if (states != nullptr)
102
+ AllocU16().deallocate(states, size);
103
+ }
104
+
105
+ template<typename K, typename V, typename H, typename E, typename A>
106
+ reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A> other) {
107
+ std::swap(lg_cur_size, other.lg_cur_size);
108
+ std::swap(lg_max_size, other.lg_max_size);
109
+ std::swap(num_active, other.num_active);
110
+ std::swap(keys, other.keys);
111
+ std::swap(values, other.values);
112
+ std::swap(states, other.states);
113
+ return *this;
114
+ }
115
+
116
+ template<typename K, typename V, typename H, typename E, typename A>
117
+ reverse_purge_hash_map<K, V, H, E, A>& reverse_purge_hash_map<K, V, H, E, A>::operator=(reverse_purge_hash_map<K, V, H, E, A>&& other) {
118
+ std::swap(lg_cur_size, other.lg_cur_size);
119
+ std::swap(lg_max_size, other.lg_max_size);
120
+ std::swap(num_active, other.num_active);
121
+ std::swap(keys, other.keys);
122
+ std::swap(values, other.values);
123
+ std::swap(states, other.states);
124
+ return *this;
125
+ }
126
+
127
+ template<typename K, typename V, typename H, typename E, typename A>
128
+ V reverse_purge_hash_map<K, V, H, E, A>::adjust_or_insert(const K& key, V value) {
129
+ const uint32_t num_active_before = num_active;
130
+ const uint32_t index = internal_adjust_or_insert(key, value);
131
+ if (num_active > num_active_before) {
132
+ new (&keys[index]) K(key);
133
+ return resize_or_purge_if_needed();
134
+ }
135
+ return 0;
136
+ }
137
+
138
+ template<typename K, typename V, typename H, typename E, typename A>
139
+ V reverse_purge_hash_map<K, V, H, E, A>::adjust_or_insert(K&& key, V value) {
140
+ const uint32_t num_active_before = num_active;
141
+ const uint32_t index = internal_adjust_or_insert(key, value);
142
+ if (num_active > num_active_before) {
143
+ new (&keys[index]) K(std::move(key));
144
+ return resize_or_purge_if_needed();
145
+ }
146
+ return 0;
147
+ }
148
+
149
+ template<typename K, typename V, typename H, typename E, typename A>
150
+ V reverse_purge_hash_map<K, V, H, E, A>::get(const K& key) const {
151
+ const uint32_t mask = (1 << lg_cur_size) - 1;
152
+ uint32_t probe = fmix64(H()(key)) & mask;
153
+ while (is_active(probe)) {
154
+ if (E()(keys[probe], key)) return values[probe];
155
+ probe = (probe + 1) & mask;
156
+ }
157
+ return 0;
158
+ }
159
+
160
+ template<typename K, typename V, typename H, typename E, typename A>
161
+ uint8_t reverse_purge_hash_map<K, V, H, E, A>::get_lg_cur_size() const {
162
+ return lg_cur_size;
163
+ }
164
+
165
+ template<typename K, typename V, typename H, typename E, typename A>
166
+ uint8_t reverse_purge_hash_map<K, V, H, E, A>::get_lg_max_size() const {
167
+ return lg_max_size;
168
+ }
169
+
170
+ template<typename K, typename V, typename H, typename E, typename A>
171
+ uint32_t reverse_purge_hash_map<K, V, H, E, A>::get_capacity() const {
172
+ return (1 << lg_cur_size) * LOAD_FACTOR;
173
+ }
174
+
175
+ template<typename K, typename V, typename H, typename E, typename A>
176
+ uint32_t reverse_purge_hash_map<K, V, H, E, A>::get_num_active() const {
177
+ return num_active;
178
+ }
179
+
180
+ template<typename K, typename V, typename H, typename E, typename A>
181
+ typename reverse_purge_hash_map<K, V, H, E, A>::iterator reverse_purge_hash_map<K, V, H, E, A>::begin() const {
182
+ const uint32_t size = 1 << lg_cur_size;
183
+ uint32_t i = 0;
184
+ while (i < size && !is_active(i)) i++;
185
+ return reverse_purge_hash_map<K, V, H, E, A>::iterator(this, i, 0);
186
+ }
187
+
188
+ template<typename K, typename V, typename H, typename E, typename A>
189
+ typename reverse_purge_hash_map<K, V, H, E, A>::iterator reverse_purge_hash_map<K, V, H, E, A>::end() const {
190
+ return reverse_purge_hash_map<K, V, H, E, A>::iterator(this, 1 << lg_cur_size, num_active);
191
+ }
192
+
193
+ template<typename K, typename V, typename H, typename E, typename A>
194
+ bool reverse_purge_hash_map<K, V, H, E, A>::is_active(uint32_t index) const {
195
+ return states[index] > 0;
196
+ }
197
+
198
+ template<typename K, typename V, typename H, typename E, typename A>
199
+ void reverse_purge_hash_map<K, V, H, E, A>::subtract_and_keep_positive_only(V amount) {
200
+ // starting from the back, find the first empty cell,
201
+ // which establishes the high end of a cluster.
202
+ uint32_t first_probe = (1 << lg_cur_size) - 1;
203
+ while (is_active(first_probe)) first_probe--;
204
+ // when we find the next non-empty cell, we know we are at the high end of a cluster
205
+ // work towards the front, delete any non-positive entries.
206
+ for (uint32_t probe = first_probe; probe-- > 0;) {
207
+ if (is_active(probe)) {
208
+ if (values[probe] <= amount) {
209
+ hash_delete(probe); // does the work of deletion and moving higher items towards the front
210
+ num_active--;
211
+ } else {
212
+ values[probe] -= amount;
213
+ }
214
+ }
215
+ }
216
+ // now work on the first cluster that was skipped
217
+ for (uint32_t probe = (1 << lg_cur_size); probe-- > first_probe;) {
218
+ if (is_active(probe)) {
219
+ if (values[probe] <= amount) {
220
+ hash_delete(probe);
221
+ num_active--;
222
+ } else {
223
+ values[probe] -= amount;
224
+ }
225
+ }
226
+ }
227
+ }
228
+
229
+ template<typename K, typename V, typename H, typename E, typename A>
230
+ void reverse_purge_hash_map<K, V, H, E, A>::hash_delete(uint32_t delete_index) {
231
+ // Looks ahead in the table to search for another
232
+ // item to move to this location
233
+ // if none are found, the status is changed
234
+ states[delete_index] = 0; // mark as empty
235
+ keys[delete_index].~K();
236
+ uint32_t drift = 1;
237
+ const uint32_t mask = (1 << lg_cur_size) - 1;
238
+ uint32_t probe = (delete_index + drift) & mask; // map length must be a power of 2
239
+ // advance until we find a free location replacing locations as needed
240
+ while (is_active(probe)) {
241
+ if (states[probe] > drift) {
242
+ // move current element
243
+ new (&keys[delete_index]) K(std::move(keys[probe]));
244
+ values[delete_index] = values[probe];
245
+ states[delete_index] = states[probe] - drift;
246
+ states[probe] = 0; // mark as empty
247
+ keys[probe].~K();
248
+ drift = 0;
249
+ delete_index = probe;
250
+ }
251
+ probe = (probe + 1) & mask;
252
+ drift++;
253
+ // only used for theoretical analysis
254
+ if (drift >= DRIFT_LIMIT) throw std::logic_error("drift: " + std::to_string(drift) + " >= DRIFT_LIMIT");
255
+ }
256
+ }
257
+
258
+ template<typename K, typename V, typename H, typename E, typename A>
259
+ uint32_t reverse_purge_hash_map<K, V, H, E, A>::internal_adjust_or_insert(const K& key, V value) {
260
+ const uint32_t mask = (1 << lg_cur_size) - 1;
261
+ uint32_t index = fmix64(H()(key)) & mask;
262
+ uint16_t drift = 1;
263
+ while (is_active(index)) {
264
+ if (E()(keys[index], key)) {
265
+ // adjusting the value of an existing key
266
+ values[index] += value;
267
+ return index;
268
+ }
269
+ index = (index + 1) & mask;
270
+ drift++;
271
+ // only used for theoretical analysis
272
+ if (drift >= DRIFT_LIMIT) throw std::logic_error("drift limit reached");
273
+ }
274
+ // adding the key and value to the table
275
+ if (num_active > get_capacity()) {
276
+ throw std::logic_error("num_active " + std::to_string(num_active) + " > capacity " + std::to_string(get_capacity()));
277
+ }
278
+ values[index] = value;
279
+ states[index] = drift;
280
+ num_active++;
281
+ return index;
282
+ }
283
+
284
+ template<typename K, typename V, typename H, typename E, typename A>
285
+ V reverse_purge_hash_map<K, V, H, E, A>::resize_or_purge_if_needed() {
286
+ if (num_active > get_capacity()) {
287
+ if (lg_cur_size < lg_max_size) { // can grow
288
+ resize(lg_cur_size + 1);
289
+ } else { // at target size, must purge
290
+ const V offset = purge();
291
+ if (num_active > get_capacity()) {
292
+ throw std::logic_error("purge did not reduce number of active items");
293
+ }
294
+ return offset;
295
+ }
296
+ }
297
+ return 0;
298
+ }
299
+
300
+ template<typename K, typename V, typename H, typename E, typename A>
301
+ void reverse_purge_hash_map<K, V, H, E, A>::resize(uint8_t lg_new_size) {
302
+ const uint32_t old_size = 1 << lg_cur_size;
303
+ K* old_keys = keys;
304
+ V* old_values = values;
305
+ uint16_t* old_states = states;
306
+ const uint32_t new_size = 1 << lg_new_size;
307
+ keys = A().allocate(new_size);
308
+ values = AllocV().allocate(new_size);
309
+ states = AllocU16().allocate(new_size);
310
+ std::fill(states, &states[new_size], 0);
311
+ num_active = 0;
312
+ lg_cur_size = lg_new_size;
313
+ for (uint32_t i = 0; i < old_size; i++) {
314
+ if (old_states[i] > 0) {
315
+ adjust_or_insert(std::move(old_keys[i]), old_values[i]);
316
+ old_keys[i].~K();
317
+ }
318
+ }
319
+ A().deallocate(old_keys, old_size);
320
+ AllocV().deallocate(old_values, old_size);
321
+ AllocU16().deallocate(old_states, old_size);
322
+ }
323
+
324
+ template<typename K, typename V, typename H, typename E, typename A>
325
+ V reverse_purge_hash_map<K, V, H, E, A>::purge() {
326
+ const uint32_t limit = std::min(MAX_SAMPLE_SIZE, num_active);
327
+ uint32_t num_samples = 0;
328
+ uint32_t i = 0;
329
+ V* samples = AllocV().allocate(limit);
330
+ while (num_samples < limit) {
331
+ if (is_active(i)) {
332
+ samples[num_samples++] = values[i];
333
+ }
334
+ i++;
335
+ }
336
+ std::nth_element(&samples[0], &samples[num_samples / 2], &samples[num_samples]);
337
+ const V median = samples[num_samples / 2];
338
+ AllocV().deallocate(samples, limit);
339
+ subtract_and_keep_positive_only(median);
340
+ return median;
341
+ }
342
+
343
+ } /* namespace datasketches */
344
+
345
+ # endif