datasketches 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +3 -3
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +23 -20
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/README.md +1 -3
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +15 -10
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  21. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  23. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  24. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  25. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +2 -2
  26. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  27. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  28. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  29. data/vendor/datasketches-cpp/{python/src/__init__.py → count/CMakeLists.txt} +25 -1
  30. data/vendor/datasketches-cpp/count/include/count_min.hpp +405 -0
  31. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +497 -0
  32. data/vendor/datasketches-cpp/{MANIFEST.in → count/test/CMakeLists.txt} +23 -20
  33. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  34. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +303 -0
  35. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  36. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  37. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  38. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  39. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  40. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +14 -11
  41. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  42. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  43. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  44. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  45. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  46. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  47. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  48. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_serialize_for_java.cpp +38 -0
  49. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  50. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  51. data/vendor/datasketches-cpp/{tox.ini → density/CMakeLists.txt} +24 -8
  52. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +256 -0
  53. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  54. data/vendor/datasketches-cpp/{python/datasketches/__init__.py → density/test/CMakeLists.txt} +15 -3
  55. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  56. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  57. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  58. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  59. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  60. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  61. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  62. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  63. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  64. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +94 -61
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +20 -8
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  76. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  77. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -18
  78. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  79. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  80. data/vendor/datasketches-cpp/hll/include/hll.hpp +79 -65
  81. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  82. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  83. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  84. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  85. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  86. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  87. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  88. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +79 -53
  89. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +61 -132
  90. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  91. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  92. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  93. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +5 -40
  94. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +76 -54
  95. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +66 -136
  96. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  97. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  98. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  99. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +15 -39
  100. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  101. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -4
  102. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +105 -26
  103. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +50 -111
  104. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  105. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  106. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  107. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  108. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  109. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  110. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  111. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  112. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  113. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +89 -32
  114. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +33 -19
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +13 -10
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +23 -19
  117. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  118. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  119. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  120. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  121. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  122. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  123. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -51
  124. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  125. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  126. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -20
  127. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  128. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  129. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  130. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  131. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  132. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  133. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  134. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  135. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  137. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  140. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  141. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  142. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +180 -33
  143. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  144. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  145. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +10 -10
  146. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  147. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +21 -6
  148. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +13 -3
  149. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +15 -1
  150. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  151. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  152. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  153. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +39 -188
  154. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  155. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  157. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  158. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  159. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  160. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection_impl.hpp +31 -0
  161. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  162. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  163. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  164. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  165. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  166. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  167. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  168. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  169. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  170. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  171. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  172. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  173. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  174. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  175. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  176. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  177. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  178. data/vendor/datasketches-cpp/{python/src/datasketches.cpp → tuple/test/tuple_sketch_serialize_for_java.cpp} +16 -30
  179. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  180. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  181. metadata +61 -79
  182. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  183. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  184. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  185. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  186. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  187. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  188. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  189. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  190. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  191. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  192. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  193. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -81
  194. data/vendor/datasketches-cpp/python/README.md +0 -85
  195. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -104
  196. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  197. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  198. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  199. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  200. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  201. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  202. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  203. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -90
  204. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -128
  205. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -134
  206. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -210
  207. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  208. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -111
  209. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -204
  210. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -215
  211. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -172
  212. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  213. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  214. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  215. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  216. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -110
  217. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -130
  218. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -125
  219. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -126
  220. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -126
  221. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -146
  222. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  223. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -125
  224. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  225. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  230. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  231. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  232. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  233. data/vendor/datasketches-cpp/setup.py +0 -110
  234. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  238. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  239. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  240. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  241. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  242. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  243. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  244. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  245. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -0,0 +1,497 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef COUNT_MIN_IMPL_HPP_
21
+ #define COUNT_MIN_IMPL_HPP_
22
+
23
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>

#include "MurmurHash3.h"
#include "count_min.hpp"
#include "memory_operations.hpp"
31
+
32
+ namespace datasketches {
33
+
34
+ template<typename W, typename A>
35
+ count_min_sketch<W,A>::count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed, const A& allocator):
36
+ _allocator(allocator),
37
+ _num_hashes(num_hashes),
38
+ _num_buckets(num_buckets),
39
+ _sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
40
+ _seed(seed),
41
+ _total_weight(0) {
42
+ if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
43
+
44
+ // This check is to ensure later compatibility with a Java implementation whose maximum size can only
45
+ // be 2^31-1. We check only against 2^30 for simplicity.
46
+ if (num_buckets * num_hashes >= 1 << 30) {
47
+ throw std::invalid_argument("These parameters generate a sketch that exceeds 2^30 elements."
48
+ "Try reducing either the number of buckets or the number of hash functions.");
49
+ }
50
+
51
+ std::default_random_engine rng(_seed);
52
+ std::uniform_int_distribution<uint64_t> extra_hash_seeds(0, std::numeric_limits<uint64_t>::max());
53
+ hash_seeds.reserve(num_hashes);
54
+
55
+ for (uint64_t i=0; i < num_hashes; ++i) {
56
+ hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
57
+ }
58
+ }
59
+
60
+ template<typename W, typename A>
61
+ uint8_t count_min_sketch<W,A>::get_num_hashes() const {
62
+ return _num_hashes;
63
+ }
64
+
65
+ template<typename W, typename A>
66
+ uint32_t count_min_sketch<W,A>::get_num_buckets() const {
67
+ return _num_buckets;
68
+ }
69
+
70
+ template<typename W, typename A>
71
+ uint64_t count_min_sketch<W,A>::get_seed() const {
72
+ return _seed;
73
+ }
74
+
75
+ template<typename W, typename A>
76
+ double count_min_sketch<W,A>::get_relative_error() const {
77
+ return exp(1.0) / double(_num_buckets);
78
+ }
79
+
80
+ template<typename W, typename A>
81
+ W count_min_sketch<W,A>::get_total_weight() const {
82
+ return _total_weight;
83
+ }
84
+
85
+ template<typename W, typename A>
86
+ uint32_t count_min_sketch<W,A>::suggest_num_buckets(double relative_error) {
87
+ /*
88
+ * Function to help users select a number of buckets for a given error.
89
+ * TODO: Change this when we use only power of 2 buckets.
90
+ */
91
+ if (relative_error < 0.) {
92
+ throw std::invalid_argument("Relative error must be at least 0.");
93
+ }
94
+ return static_cast<uint32_t>(ceil(exp(1.0) / relative_error));
95
+ }
96
+
97
+ template<typename W, typename A>
98
+ uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence) {
99
+ /*
100
+ * Function to help users select a number of hashes for a given confidence
101
+ * e.g. confidence = 1 - failure probability
102
+ * failure probability == delta in the literature.
103
+ */
104
+ if (confidence < 0. || confidence > 1.0) {
105
+ throw std::invalid_argument("Confidence must be between 0 and 1.0 (inclusive).");
106
+ }
107
+ return std::min<uint8_t>(ceil(log(1.0 / (1.0 - confidence))), UINT8_MAX);
108
+ }
109
+
110
+ template<typename W, typename A>
111
+ std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const {
112
+ /*
113
+ * Returns the hash locations for the input item using the original hashing
114
+ * scheme from [1].
115
+ * Generate _num_hashes separate hashes from calls to murmurmhash.
116
+ * This could be optimized by keeping both of the 64bit parts of the hash
117
+ * function, rather than generating a new one for every level.
118
+ *
119
+ *
120
+ * Postscript.
121
+ * Note that a tradeoff can be achieved over the update time and space
122
+ * complexity of the sketch by using a combinatorial hashing scheme from
123
+ * https://github.com/Claudenw/BloomFilter/wiki/Bloom-Filters----An-overview
124
+ * https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
125
+ */
126
+ uint64_t bucket_index;
127
+ std::vector<uint64_t> sketch_update_locations;
128
+ sketch_update_locations.reserve(_num_hashes);
129
+
130
+ uint64_t hash_seed_index = 0;
131
+ for (const auto &it: hash_seeds) {
132
+ HashState hashes;
133
+ MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
134
+ uint64_t hash = hashes.h1;
135
+ bucket_index = hash % _num_buckets;
136
+ sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index);
137
+ hash_seed_index += 1;
138
+ }
139
+ return sketch_update_locations;
140
+ }
141
+
142
+ template<typename W, typename A>
143
+ W count_min_sketch<W,A>::get_estimate(uint64_t item) const {return get_estimate(&item, sizeof(item));}
144
+
145
+ template<typename W, typename A>
146
+ W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&item, sizeof(item));}
147
+
148
+ template<typename W, typename A>
149
+ W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
150
+ if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
151
+ return get_estimate(item.c_str(), item.length());
152
+ }
153
+
154
+ template<typename W, typename A>
155
+ W count_min_sketch<W,A>::get_estimate(const void* item, size_t size) const {
156
+ /*
157
+ * Returns the estimated frequency of the item
158
+ */
159
+ std::vector<uint64_t> hash_locations = get_hashes(item, size);
160
+ std::vector<W> estimates;
161
+ for (const auto h: hash_locations) {
162
+ estimates.push_back(_sketch_array[h]);
163
+ }
164
+ return *std::min_element(estimates.begin(), estimates.end());
165
+ }
166
+
167
+ template<typename W, typename A>
168
+ void count_min_sketch<W,A>::update(uint64_t item, W weight) {
169
+ update(&item, sizeof(item), weight);
170
+ }
171
+
172
+ template<typename W, typename A>
173
+ void count_min_sketch<W,A>::update(int64_t item, W weight) {
174
+ update(&item, sizeof(item), weight);
175
+ }
176
+
177
+ template<typename W, typename A>
178
+ void count_min_sketch<W,A>::update(const std::string& item, W weight) {
179
+ if (item.empty()) return;
180
+ update(item.c_str(), item.length(), weight);
181
+ }
182
+
183
+ template<typename W, typename A>
184
+ void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
185
+ /*
186
+ * Gets the item's hash locations and then increments the sketch in those
187
+ * locations by the weight.
188
+ */
189
+ _total_weight += weight >= 0 ? weight : -weight;
190
+ std::vector<uint64_t> hash_locations = get_hashes(item, size);
191
+ for (const auto h: hash_locations) {
192
+ _sketch_array[h] += weight;
193
+ }
194
+ }
195
+
196
+ template<typename W, typename A>
197
+ W count_min_sketch<W,A>::get_upper_bound(uint64_t item) const {return get_upper_bound(&item, sizeof(item));}
198
+
199
+ template<typename W, typename A>
200
+ W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_bound(&item, sizeof(item));}
201
+
202
+ template<typename W, typename A>
203
+ W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
204
+ if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
205
+ return get_upper_bound(item.c_str(), item.length());
206
+ }
207
+
208
+ template<typename W, typename A>
209
+ W count_min_sketch<W,A>::get_upper_bound(const void* item, size_t size) const {
210
+ return static_cast<W>(get_estimate(item, size) + get_relative_error() * get_total_weight());
211
+ }
212
+
213
+ template<typename W, typename A>
214
+ W count_min_sketch<W,A>::get_lower_bound(uint64_t item) const {return get_lower_bound(&item, sizeof(item));}
215
+
216
+ template<typename W, typename A>
217
+ W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_bound(&item, sizeof(item));}
218
+
219
+ template<typename W, typename A>
220
+ W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
221
+ if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
222
+ return get_lower_bound(item.c_str(), item.length());
223
+ }
224
+
225
+ template<typename W, typename A>
226
+ W count_min_sketch<W,A>::get_lower_bound(const void* item, size_t size) const {
227
+ return get_estimate(item, size);
228
+ }
229
+
230
+ template<typename W, typename A>
231
+ void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch) {
232
+ /*
233
+ * Merges this sketch into other_sketch sketch by elementwise summing of buckets
234
+ */
235
+ if (this == &other_sketch) {
236
+ throw std::invalid_argument( "Cannot merge a sketch with itself." );
237
+ }
238
+
239
+ bool acceptable_config =
240
+ (get_num_hashes() == other_sketch.get_num_hashes()) &&
241
+ (get_num_buckets() == other_sketch.get_num_buckets()) &&
242
+ (get_seed() == other_sketch.get_seed());
243
+ if (!acceptable_config) {
244
+ throw std::invalid_argument( "Incompatible sketch configuration." );
245
+ }
246
+
247
+ // Merge step - iterate over the other vector and add the weights to this sketch
248
+ auto it = _sketch_array.begin(); // This is a std::vector iterator.
249
+ auto other_it = other_sketch.begin(); //This is a const iterator over the other sketch.
250
+ while (it != _sketch_array.end()) {
251
+ *it += *other_it;
252
+ ++it;
253
+ ++other_it;
254
+ }
255
+ _total_weight += other_sketch.get_total_weight();
256
+ }
257
+
258
+ // Iterators
259
+ template<typename W, typename A>
260
+ typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::begin() const {
261
+ return _sketch_array.begin();
262
+ }
263
+
264
+ template<typename W, typename A>
265
+ typename count_min_sketch<W,A>::const_iterator count_min_sketch<W,A>::end() const {
266
+ return _sketch_array.end();
267
+ }
268
+
269
+ template<typename W, typename A>
270
+ void count_min_sketch<W,A>::serialize(std::ostream& os) const {
271
+ // Long 0
272
+ //const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
273
+ const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
274
+ const uint8_t ser_ver = SERIAL_VERSION_1;
275
+ const uint8_t family_id = FAMILY_ID;
276
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
277
+ const uint32_t unused32 = NULL_32;
278
+ write(os, preamble_longs);
279
+ write(os, ser_ver);
280
+ write(os, family_id);
281
+ write(os, flags_byte);
282
+ write(os, unused32);
283
+
284
+ // Long 1
285
+ const uint32_t nbuckets = _num_buckets;
286
+ const uint8_t nhashes = _num_hashes;
287
+ const uint16_t seed_hash(compute_seed_hash(_seed));
288
+ const uint8_t unused8 = NULL_8;
289
+ write(os, nbuckets);
290
+ write(os, nhashes);
291
+ write(os, seed_hash);
292
+ write(os, unused8);
293
+ if (is_empty()) return; // sketch is empty, no need to write further bytes.
294
+
295
+ // Long 2
296
+ write(os, _total_weight);
297
+
298
+ // Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
299
+ auto it = _sketch_array.begin();
300
+ while (it != _sketch_array.end()) {
301
+ write(os, *it);
302
+ ++it;
303
+ }
304
+ }
305
+
306
+ template<typename W, typename A>
307
+ auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) -> count_min_sketch {
308
+
309
+ // First 8 bytes are 4 bytes of preamble and 4 unused bytes.
310
+ const auto preamble_longs = read<uint8_t>(is);
311
+ const auto serial_version = read<uint8_t>(is);
312
+ const auto family_id = read<uint8_t>(is);
313
+ const auto flags_byte = read<uint8_t>(is);
314
+ read<uint32_t>(is); // 4 unused bytes
315
+
316
+ check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
317
+
318
+ // Sketch parameters
319
+ const auto nbuckets = read<uint32_t>(is);
320
+ const auto nhashes = read<uint8_t>(is);
321
+ const auto seed_hash = read<uint16_t>(is);
322
+ read<uint8_t>(is); // 1 unused byte
323
+
324
+ if (seed_hash != compute_seed_hash(seed)) {
325
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
326
+ + std::to_string(compute_seed_hash(seed)));
327
+ }
328
+ count_min_sketch c(nhashes, nbuckets, seed, allocator);
329
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
330
+ if (is_empty == 1) return c; // sketch is empty, no need to read further.
331
+
332
+ // Set the sketch weight and read in the sketch values
333
+ const auto weight = read<W>(is);
334
+ c._total_weight += weight;
335
+ read(is, c._sketch_array.data(), sizeof(W) * c._sketch_array.size());
336
+
337
+ return c;
338
+ }
339
+
340
+ template<typename W, typename A>
341
+ size_t count_min_sketch<W,A>::get_serialized_size_bytes() const {
342
+ // The header is always 2 longs, whether empty or full
343
+ const size_t preamble_longs = PREAMBLE_LONGS_SHORT;
344
+
345
+ // If the sketch is empty, we're done. Otherwise, we need the total weight
346
+ // held by the sketch as well as a data table of size (num_buckets * num_hashes)
347
+ return (preamble_longs * sizeof(uint64_t)) + (is_empty() ? 0 : sizeof(W) * (1 + _num_buckets * _num_hashes));
348
+ }
349
+
350
+ template<typename W, typename A>
351
+ auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
352
+ vector_bytes bytes(header_size_bytes + get_serialized_size_bytes(), 0, _allocator);
353
+ uint8_t *ptr = bytes.data() + header_size_bytes;
354
+
355
+ // Long 0
356
+ const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
357
+ ptr += copy_to_mem(preamble_longs, ptr);
358
+ const uint8_t ser_ver = SERIAL_VERSION_1;
359
+ ptr += copy_to_mem(ser_ver, ptr);
360
+ const uint8_t family_id = FAMILY_ID;
361
+ ptr += copy_to_mem(family_id, ptr);
362
+ const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
363
+ ptr += copy_to_mem(flags_byte, ptr);
364
+ const uint32_t unused32 = NULL_32;
365
+ ptr += copy_to_mem(unused32, ptr);
366
+
367
+ // Long 1
368
+ const uint32_t nbuckets = _num_buckets;
369
+ const uint8_t nhashes = _num_hashes;
370
+ const uint16_t seed_hash(compute_seed_hash(_seed));
371
+ const uint8_t null_characters_8 = NULL_8;
372
+ ptr += copy_to_mem(nbuckets, ptr);
373
+ ptr += copy_to_mem(nhashes, ptr);
374
+ ptr += copy_to_mem(seed_hash, ptr);
375
+ ptr += copy_to_mem(null_characters_8, ptr);
376
+ if (is_empty()) return bytes; // sketch is empty, no need to write further bytes.
377
+
378
+ // Long 2
379
+ const W t_weight = _total_weight;
380
+ ptr += copy_to_mem(t_weight, ptr);
381
+
382
+ // Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
383
+ auto it = _sketch_array.begin();
384
+ while (it != _sketch_array.end()) {
385
+ ptr += copy_to_mem(*it, ptr);
386
+ ++it;
387
+ }
388
+
389
+ return bytes;
390
+ }
391
+
392
+ template<typename W, typename A>
393
+ auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) -> count_min_sketch {
394
+ ensure_minimum_memory(size, PREAMBLE_LONGS_SHORT * sizeof(uint64_t));
395
+
396
+ const char* ptr = static_cast<const char*>(bytes);
397
+
398
+ // First 8 bytes are 4 bytes of preamble and 4 unused bytes.
399
+ uint8_t preamble_longs;
400
+ ptr += copy_from_mem(ptr, preamble_longs);
401
+ uint8_t serial_version;
402
+ ptr += copy_from_mem(ptr, serial_version);
403
+ uint8_t family_id;
404
+ ptr += copy_from_mem(ptr, family_id);
405
+ uint8_t flags_byte;
406
+ ptr += copy_from_mem(ptr, flags_byte);
407
+ ptr += sizeof(uint32_t);
408
+
409
+ check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
410
+
411
+ // Second 8 bytes are the sketch parameters with a final, unused byte.
412
+ uint32_t nbuckets;
413
+ uint8_t nhashes;
414
+ uint16_t seed_hash;
415
+ ptr += copy_from_mem(ptr, nbuckets);
416
+ ptr += copy_from_mem(ptr, nhashes);
417
+ ptr += copy_from_mem(ptr, seed_hash);
418
+ ptr += sizeof(uint8_t);
419
+
420
+ if (seed_hash != compute_seed_hash(seed)) {
421
+ throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
422
+ + std::to_string(compute_seed_hash(seed)));
423
+ }
424
+ count_min_sketch c(nhashes, nbuckets, seed, allocator);
425
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
426
+ if (is_empty) return c; // sketch is empty, no need to read further.
427
+
428
+ ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));
429
+
430
+ // Long 2 is the weight.
431
+ W weight;
432
+ ptr += copy_from_mem(ptr, weight);
433
+ c._total_weight += weight;
434
+
435
+ // All remaining bytes are the sketch table entries.
436
+ for (size_t i = 0; i<c._num_buckets*c._num_hashes; ++i) {
437
+ ptr += copy_from_mem(ptr, c._sketch_array[i]);
438
+ }
439
+ return c;
440
+ }
441
+
442
+ template<typename W, typename A>
443
+ bool count_min_sketch<W,A>::is_empty() const {
444
+ return _total_weight == 0;
445
+ }
446
+
447
+ template<typename W, typename A>
448
+ string<A> count_min_sketch<W,A>::to_string() const {
449
+ // count the number of used entries in the sketch
450
+ uint64_t num_nonzero = 0;
451
+ for (const auto entry: _sketch_array) {
452
+ if (entry != static_cast<W>(0.0))
453
+ ++num_nonzero;
454
+ }
455
+
456
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
457
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
458
+ std::ostringstream os;
459
+ os << "### Count Min sketch summary:" << std::endl;
460
+ os << " num hashes : " << static_cast<uint32_t>(_num_hashes) << std::endl;
461
+ os << " num buckets : " << _num_buckets << std::endl;
462
+ os << " capacity bins : " << _sketch_array.size() << std::endl;
463
+ os << " filled bins : " << num_nonzero << std::endl;
464
+ os << " pct filled : " << std::setprecision(3) << (num_nonzero * 100.0) / _sketch_array.size() << "%" << std::endl;
465
+ os << "### End sketch summary" << std::endl;
466
+
467
+ return string<A>(os.str().c_str(), _allocator);
468
+ }
469
+
470
+ template<typename W, typename A>
471
+ void count_min_sketch<W,A>::check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte) {
472
+ const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
473
+
474
+ const uint8_t sw = (empty ? 1 : 0) + (2 * serial_version) + (4 * family_id) + (32 * (preamble_longs & 0x3F));
475
+ bool valid = true;
476
+
477
+ switch (sw) { // exhaustive list and description of all valid cases
478
+ case 138 : break; // !empty, ser_ver==1, family==18, preLongs=2;
479
+ case 139 : break; // empty, ser_ver==1, family==18, preLongs=2;
480
+ //case 170 : break; // !empty, ser_ver==1, family==18, preLongs=3;
481
+ default : // all other case values are invalid
482
+ valid = false;
483
+ }
484
+
485
+ if (!valid) {
486
+ std::ostringstream os;
487
+ os << "Possible sketch corruption. Inconsistent state: "
488
+ << "preamble_longs = " << static_cast<uint32_t>(preamble_longs)
489
+ << ", empty = " << (empty ? "true" : "false")
490
+ << ", serialization_version = " << static_cast<uint32_t>(serial_version);
491
+ throw std::invalid_argument(os.str());
492
+ }
493
+ }
494
+
495
+ } /* namespace datasketches */
496
+
497
+ #endif
@@ -15,25 +15,28 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- global-include CMakeLists.txt
19
- global-include *.cpp
20
- global-include *.c
21
- global-include *.hpp
22
- global-include *.h
23
- global-include *.bin
24
- global-include *.in
18
+ add_executable(count_min_test)
25
19
 
26
- graft cmake
27
- graft common
28
- graft cpc
29
- graft fi
30
- graft hll
31
- graft kll
32
- graft req
33
- graft theta
34
- graft tuple
35
- graft sampling
36
- graft python
20
+ target_link_libraries(count_min_test count common_test_lib)
37
21
 
38
- # exclusions appear after including subdirectories
39
- prune build
22
+ set_target_properties(count_min_test PROPERTIES
23
+ CXX_STANDARD_REQUIRED YES
24
+ )
25
+
26
+ file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" COUNT_TEST_BINARY_PATH)
27
+ string(APPEND COUNT_TEST_BINARY_PATH "/")
28
+ target_compile_definitions(count_min_test
29
+ PRIVATE
30
+ TEST_BINARY_INPUT_PATH="${COUNT_TEST_BINARY_PATH}"
31
+ )
32
+
33
+ add_test(
34
+ NAME count_min_test
35
+ COMMAND count_min_test
36
+ )
37
+
38
+ target_sources(count_min_test
39
+ PRIVATE
40
+ count_min_test.cpp
41
+ count_min_allocation_test.cpp
42
+ )
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+ #include <vector>
22
+ #include <cstring>
23
+ #include <sstream>
24
+ #include <fstream>
25
+
26
+ #include "count_min.hpp"
27
+ #include "common_defs.hpp"
28
+ #include "test_allocator.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ using count_min_sketch_test_alloc = count_min_sketch<uint64_t, test_allocator<uint64_t>>;
33
+ using alloc = test_allocator<uint64_t>;
34
+
35
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]") {
36
+ test_allocator_total_bytes = 0;
37
+ test_allocator_net_allocations = 0;
38
+ {
39
+ uint8_t n_hashes = 1;
40
+ uint32_t n_buckets = 5;
41
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
42
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
43
+ c.serialize(s);
44
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
45
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes());
46
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets());
47
+ REQUIRE(c.get_seed() == d.get_seed());
48
+ uint64_t zero = 0;
49
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
50
+ REQUIRE(c.get_total_weight() == d.get_total_weight());
51
+
52
+ // Check that all entries are equal and 0
53
+ for (auto di: d) {
54
+ REQUIRE(di == 0);
55
+ }
56
+ }
57
+ REQUIRE(test_allocator_total_bytes == 0);
58
+ REQUIRE(test_allocator_net_allocations == 0);
59
+ }
60
+
61
+ TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]") {
62
+ test_allocator_total_bytes = 0;
63
+ test_allocator_net_allocations = 0;
64
+ {
65
+ uint8_t n_hashes = 3;
66
+ uint32_t n_buckets = 1024;
67
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
68
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
69
+ for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
70
+ c.serialize(s);
71
+ count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
72
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes());
73
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets());
74
+ REQUIRE(c.get_seed() == d.get_seed());
75
+ REQUIRE(c.get_total_weight() == d.get_total_weight());
76
+ for (uint64_t i = 0; i < 10; ++i) {
77
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i));
78
+ }
79
+
80
+ auto c_it = c.begin();
81
+ auto d_it = d.begin();
82
+ while (c_it != c.end()) {
83
+ REQUIRE(*c_it == *d_it);
84
+ ++c_it;
85
+ ++d_it;
86
+ }
87
+ }
88
+ REQUIRE(test_allocator_total_bytes == 0);
89
+ REQUIRE(test_allocator_net_allocations == 0);
90
+ }
91
+
92
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]") {
93
+ test_allocator_total_bytes = 0;
94
+ test_allocator_net_allocations = 0;
95
+ {
96
+ uint8_t n_hashes = 3;
97
+ uint32_t n_buckets = 32;
98
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
99
+ auto bytes = c.serialize();
100
+
101
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
102
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
103
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes());
104
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets());
105
+ REQUIRE(c.get_seed() == d.get_seed());
106
+ uint64_t zero = 0;
107
+ REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
108
+ REQUIRE(c.get_total_weight() == d.get_total_weight());
109
+
110
+ // Check that all entries are equal and 0
111
+ for (auto di: d) {
112
+ REQUIRE(di == 0);
113
+ }
114
+ }
115
+ REQUIRE(test_allocator_total_bytes == 0);
116
+ REQUIRE(test_allocator_net_allocations == 0);
117
+ }
118
+
119
+ TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]") {
120
+ test_allocator_total_bytes = 0;
121
+ test_allocator_net_allocations = 0;
122
+ {
123
+ uint8_t n_hashes = 5;
124
+ uint32_t n_buckets = 64;
125
+ count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
126
+ for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
127
+
128
+ auto bytes = c.serialize();
129
+ REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
130
+ auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
131
+
132
+ REQUIRE(c.get_num_hashes() == d.get_num_hashes());
133
+ REQUIRE(c.get_num_buckets() == d.get_num_buckets());
134
+ REQUIRE(c.get_seed() == d.get_seed());
135
+ REQUIRE(c.get_total_weight() == d.get_total_weight());
136
+
137
+ // Check that all entries are equal
138
+ auto c_it = c.begin();
139
+ auto d_it = d.begin();
140
+ while (c_it != c.end()) {
141
+ REQUIRE(*c_it == *d_it);
142
+ ++c_it;
143
+ ++d_it;
144
+ }
145
+
146
+ // Check that the estimates agree
147
+ for (uint64_t i = 0; i < 10; ++i) {
148
+ REQUIRE(c.get_estimate(i) == d.get_estimate(i));
149
+ }
150
+ }
151
+ REQUIRE(test_allocator_total_bytes == 0);
152
+ REQUIRE(test_allocator_net_allocations == 0);
153
+ }
154
+
155
+ } // namespace datasketches