datasketches 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +6 -0
  3. data/NOTICE +1 -1
  4. data/README.md +0 -2
  5. data/ext/datasketches/cpc_wrapper.cpp +2 -2
  6. data/ext/datasketches/kll_wrapper.cpp +0 -10
  7. data/lib/datasketches/version.rb +1 -1
  8. data/lib/datasketches.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  10. data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
  11. data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
  12. data/vendor/datasketches-cpp/Doxyfile +2827 -0
  13. data/vendor/datasketches-cpp/LICENSE +0 -76
  14. data/vendor/datasketches-cpp/NOTICE +1 -1
  15. data/vendor/datasketches-cpp/README.md +1 -3
  16. data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
  18. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
  19. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
  20. data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
  21. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
  22. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
  23. data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
  24. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  25. data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
  26. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
  27. data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
  28. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
  29. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
  30. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
  31. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
  32. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
  33. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
  34. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
  35. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
  36. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
  37. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
  38. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
  42. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
  43. data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
  46. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
  47. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
  48. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
  49. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
  50. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
  51. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
  52. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
  53. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
  54. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
  55. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
  56. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
  57. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
  58. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
  59. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
  60. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
  61. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
  62. data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
  63. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
  64. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
  65. data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
  66. data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
  67. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
  69. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
  70. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
  71. data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
  73. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
  77. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
  80. data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
  86. data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
  87. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
  88. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
  89. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
  90. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
  91. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
  92. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
  94. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
  95. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
  96. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
  97. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
  98. data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
  99. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
  100. data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
  101. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
  102. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
  103. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
  107. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
  108. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
  110. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
  114. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
  115. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
  116. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
  117. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
  118. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
  120. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
  121. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
  122. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
  124. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
  125. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
  126. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
  127. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
  130. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
  131. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
  132. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
  133. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
  134. data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
  135. data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
  136. data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
  137. data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
  138. data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
  139. data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
  140. data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
  141. data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
  142. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
  143. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
  144. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
  145. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
  146. data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
  147. data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
  148. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
  149. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
  150. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
  151. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
  152. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
  153. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
  154. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  155. metadata +47 -93
  156. data/vendor/datasketches-cpp/MANIFEST.in +0 -39
  157. data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
  158. data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
  159. data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
  160. data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
  161. data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
  162. data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
  163. data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
  164. data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
  165. data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
  166. data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
  167. data/vendor/datasketches-cpp/pyproject.toml +0 -23
  168. data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
  169. data/vendor/datasketches-cpp/python/README.md +0 -85
  170. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
  171. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
  172. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
  173. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
  174. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
  175. data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
  176. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
  177. data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
  178. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
  179. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
  180. data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
  181. data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
  182. data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
  183. data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
  184. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
  185. data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
  186. data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
  187. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
  188. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
  189. data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
  190. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
  191. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
  192. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
  193. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
  194. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
  195. data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
  196. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
  197. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
  198. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
  199. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
  200. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
  201. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
  202. data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
  203. data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
  204. data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
  205. data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
  206. data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
  207. data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
  208. data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
  209. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
  210. data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
  211. data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
  212. data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
  213. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
  214. data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
  215. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  216. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  217. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  218. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  219. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  220. data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
  221. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
  222. data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
  223. data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
  224. data/vendor/datasketches-cpp/setup.py +0 -110
  225. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
  226. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
  227. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  228. data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
  229. data/vendor/datasketches-cpp/tox.ini +0 -26
  230. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
  231. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
  232. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
  233. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
  234. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
  235. data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
  236. data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
  237. data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -26,45 +26,39 @@
26
26
 
27
27
  namespace datasketches {
28
28
 
29
+ /// CPC constants
29
30
  namespace cpc_constants {
30
- const uint8_t MIN_LG_K = 4;
31
- const uint8_t MAX_LG_K = 26;
32
- const uint8_t DEFAULT_LG_K = 11;
31
+ /// min log2 of K
32
+ const uint8_t MIN_LG_K = 4;
33
+ /// max log2 of K
34
+ const uint8_t MAX_LG_K = 26;
35
+ /// default log2 of K
36
+ const uint8_t DEFAULT_LG_K = 11;
33
37
  }
34
38
 
35
- // TODO: Redundant and deprecated. Will be removed in next major version release.
36
- static const uint8_t CPC_MIN_LG_K = cpc_constants::MIN_LG_K;
37
- static const uint8_t CPC_MAX_LG_K = cpc_constants::MAX_LG_K;
38
- static const uint8_t CPC_DEFAULT_LG_K = cpc_constants::DEFAULT_LG_K;
39
-
40
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
41
- template<typename A> using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
42
- template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
43
- template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
44
-
45
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
46
- template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
47
- template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
48
-
49
39
  // forward declaration
50
40
  template<typename A> class u32_table;
51
41
 
52
42
  template<typename A>
53
43
  struct compressed_state {
44
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
45
+
54
46
  explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
55
47
  window_data(allocator), window_data_words(0) {}
56
- vector_u32<A> table_data;
48
+ vector_u32 table_data;
57
49
  uint32_t table_data_words;
58
50
  uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
59
- vector_u32<A> window_data;
51
+ vector_u32 window_data;
60
52
  uint32_t window_data_words;
61
53
  };
62
54
 
63
55
  template<typename A>
64
56
  struct uncompressed_state {
57
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
58
+
65
59
  explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
66
60
  u32_table<A> table;
67
- vector_u8<A> window;
61
+ vector_bytes window;
68
62
  };
69
63
 
70
64
  } /* namespace datasketches */
@@ -47,6 +47,9 @@ inline cpc_compressor<A>& get_compressor();
47
47
  template<typename A>
48
48
  class cpc_compressor {
49
49
  public:
50
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
51
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
52
+
50
53
  void compress(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
51
54
  void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
52
55
 
@@ -126,17 +129,17 @@ private:
126
129
  uint16_t* make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values);
127
130
  void validate_decoding_table(const uint16_t* decoding_table, const uint16_t* encoding_table) const;
128
131
 
129
- void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
132
+ void compress_surprising_values(const vector_u32& pairs, uint8_t lg_k, compressed_state<A>& result) const;
130
133
  void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
131
134
 
132
- vector_u32<A> uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;
133
- void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
135
+ vector_u32 uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;
136
+ void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_bytes& window, uint8_t lg_k, uint32_t num_coupons) const;
134
137
 
135
138
  static size_t safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits);
136
139
  static size_t safe_length_for_compressed_window_buf(uint32_t k);
137
140
  static uint8_t determine_pseudo_phase(uint8_t lg_k, uint32_t c);
138
141
 
139
- static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
142
+ static inline vector_u32 tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
140
143
  static inline uint8_t golomb_choose_number_of_base_bits(uint32_t k, uint64_t count);
141
144
  };
142
145
 
@@ -183,7 +183,7 @@ void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompress
183
183
  template<typename A>
184
184
  void cpc_compressor<A>::compress_sparse_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
185
185
  if (source.sliding_window.size() > 0) throw std::logic_error("unexpected sliding window");
186
- vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
186
+ vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
187
187
  u32_table<A>::introspective_insertion_sort(pairs.data(), 0, pairs.size());
188
188
  compress_surprising_values(pairs, source.get_lg_k(), result);
189
189
  }
@@ -192,7 +192,7 @@ template<typename A>
192
192
  void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
193
193
  if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
194
194
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
195
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
195
+ vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
196
196
  lg_k, source.table_data.get_allocator());
197
197
  target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
198
198
  }
@@ -204,12 +204,12 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
204
204
  if (source.sliding_window.size() == 0) throw std::logic_error("no sliding window");
205
205
  if (source.window_offset != 0) throw std::logic_error("window_offset != 0");
206
206
  const uint32_t k = 1 << source.get_lg_k();
207
- vector_u32<A> pairs_from_table = source.surprising_value_table.unwrapping_get_items();
207
+ vector_u32 pairs_from_table = source.surprising_value_table.unwrapping_get_items();
208
208
  const uint32_t num_pairs_from_table = static_cast<uint32_t>(pairs_from_table.size());
209
209
  if (num_pairs_from_table > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, num_pairs_from_table);
210
210
  const uint32_t num_pairs_from_window = source.get_num_coupons() - num_pairs_from_table; // because the window offset is zero
211
211
 
212
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
212
+ vector_u32 all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
213
213
 
214
214
  u32_table<A>::merge(
215
215
  pairs_from_table.data(), 0, pairs_from_table.size(),
@@ -224,7 +224,7 @@ template<typename A>
224
224
  void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
225
225
  if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
226
226
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
227
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
227
+ vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
228
228
  lg_k, source.table_data.get_allocator());
229
229
 
230
230
  // In the hybrid flavor, some of these pairs actually
@@ -250,7 +250,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
250
250
  template<typename A>
251
251
  void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
252
252
  compress_sliding_window(source.sliding_window.data(), source.get_lg_k(), source.get_num_coupons(), result);
253
- vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
253
+ vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
254
254
  if (pairs.size() > 0) {
255
255
  // Here we subtract 8 from the column indices. Because they are stored in the low 6 bits
256
256
  // of each row_col pair, and because no column index is less than 8 for a "Pinned" sketch,
@@ -277,7 +277,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
277
277
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
278
278
  } else {
279
279
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
280
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
280
+ vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
281
281
  lg_k, source.table_data.get_allocator());
282
282
  // undo the compressor's 8-column shift
283
283
  for (uint32_t i = 0; i < num_pairs; i++) {
@@ -291,7 +291,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
291
291
  template<typename A>
292
292
  void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
293
293
  compress_sliding_window(source.sliding_window.data(), source.get_lg_k(), source.get_num_coupons(), result);
294
- vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
294
+ vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
295
295
  if (pairs.size() > 0) {
296
296
  // Here we apply a complicated transformation to the column indices, which
297
297
  // changes the implied ordering of the pairs, so we must do it before sorting.
@@ -330,7 +330,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
330
330
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
331
331
  } else {
332
332
  if (source.table_data.size() == 0) throw std::logic_error("table is expected");
333
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
333
+ vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
334
334
  lg_k, source.table_data.get_allocator());
335
335
 
336
336
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -356,7 +356,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
356
356
  }
357
357
 
358
358
  template<typename A>
359
- void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const {
359
+ void cpc_compressor<A>::compress_surprising_values(const vector_u32& pairs, uint8_t lg_k, compressed_state<A>& result) const {
360
360
  const uint32_t k = 1 << lg_k;
361
361
  const uint32_t num_pairs = static_cast<uint32_t>(pairs.size());
362
362
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
@@ -374,10 +374,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
374
374
  }
375
375
 
376
376
  template<typename A>
377
- vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
378
- uint8_t lg_k, const A& allocator) const {
377
+ auto cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
378
+ uint8_t lg_k, const A& allocator) const -> vector_u32 {
379
379
  const uint32_t k = 1 << lg_k;
380
- vector_u32<A> pairs(num_pairs, 0, allocator);
380
+ vector_u32 pairs(num_pairs, 0, allocator);
381
381
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
382
382
  low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
383
383
  return pairs;
@@ -399,7 +399,7 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
399
399
  }
400
400
 
401
401
  template<typename A>
402
- void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window,
402
+ void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_bytes& window,
403
403
  uint8_t lg_k, uint32_t num_coupons) const {
404
404
  const uint32_t k = 1 << lg_k;
405
405
  window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
@@ -722,10 +722,10 @@ void write_unary(
722
722
  // The empty space that this leaves at the beginning of the output array
723
723
  // will be filled in later by the caller.
724
724
  template<typename A>
725
- vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
726
- uint32_t empty_space, const A& allocator) {
725
+ auto cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
726
+ uint32_t empty_space, const A& allocator) -> vector_u32 {
727
727
  const size_t output_length = empty_space + num_pairs_to_get;
728
- vector_u32<A> pairs(output_length, 0, allocator);
728
+ vector_u32 pairs(output_length, 0, allocator);
729
729
  size_t pair_index = empty_space;
730
730
  for (unsigned row_index = 0; row_index < k; row_index++) {
731
731
  uint8_t byte = window[row_index];
@@ -33,58 +33,58 @@
33
33
 
34
34
  namespace datasketches {
35
35
 
36
- /*
37
- * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
38
- *
39
- * This is a very compact (in serialized form) distinct counting sketch.
40
- * The theory is described in the following paper:
41
- * https://arxiv.org/abs/1708.06839
42
- *
43
- * author Kevin Lang
44
- * author Alexander Saydakov
45
- */
46
-
47
- // forward-declarations
36
+ // forward declarations
48
37
  template<typename A> class cpc_sketch_alloc;
49
38
  template<typename A> class cpc_union_alloc;
50
39
 
51
- // alias with default allocator for convenience
40
+ /// CPC sketch alias with default allocator
52
41
  using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
53
42
 
54
- // allocation and initialization of global decompression (decoding) tables
55
- // call this before anything else if you want to control the initialization time
56
- // for instance, to have this happen outside of a transaction context
57
- // otherwise initialization happens on the first use (serialization or deserialization)
58
- // it is safe to call more than once assuming no race conditions
59
- // this is not thread safe! neither is the rest of the library
43
+ /**
44
+ * Allocation and initialization of global decompression (decoding) tables.
45
+ * Call this before anything else if you want to control the initialization time.
46
+ * For instance, to have this happen outside of a transaction context.
47
+ * Otherwise initialization happens on the first use (serialization or deserialization).
48
+ * It is safe to call more than once assuming no race conditions.
49
+ * This is not thread safe! Neither is the rest of the library.
50
+ */
60
51
  template<typename A> void cpc_init();
61
52
 
53
+ /**
54
+ * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
55
+ *
56
+ * This is a very compact (in serialized form) distinct counting sketch.
57
+ * The theory is described in the following paper:
58
+ * https://arxiv.org/abs/1708.06839
59
+ *
60
+ * @author Kevin Lang
61
+ * @author Alexander Saydakov
62
+ */
62
63
  template<typename A>
63
64
  class cpc_sketch_alloc {
64
65
  public:
66
+ using allocator_type = A;
67
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
68
+ using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
69
+
65
70
  /**
66
71
  * Creates an instance of the sketch given the lg_k parameter and hash seed.
67
72
  * @param lg_k base 2 logarithm of the number of bins in the sketch
68
73
  * @param seed for hash function
74
+ * @param allocator instance of an allocator
69
75
  */
70
76
  explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
71
77
 
72
- using allocator_type = A;
78
+ /// @return allocator
73
79
  A get_allocator() const;
74
80
 
75
- /**
76
- * @return configured lg_k of this sketch
77
- */
81
+ /// @return configured lg_k of this sketch
78
82
  uint8_t get_lg_k() const;
79
83
 
80
- /**
81
- * @return true if this sketch represents an empty set
82
- */
84
+ /// @return true if this sketch represents an empty set
83
85
  bool is_empty() const;
84
86
 
85
- /**
86
- * @return estimate of the distinct count of the input stream
87
- */
87
+ /// @return estimate of the distinct count of the input stream
88
88
  double get_estimate() const;
89
89
 
90
90
  /**
@@ -189,13 +189,14 @@ public:
189
189
  * Otherwise two sketches that should represent overlapping sets will be disjoint
190
190
  * For instance, for signed 32-bit values call update(int32_t) method above,
191
191
  * which does widening conversion to int64_t, if compatibility with Java is expected
192
- * @param data pointer to the data
193
- * @param length of the data in bytes
192
+ * @param value pointer to the data
193
+ * @param size of the data in bytes
194
194
  */
195
195
  void update(const void* value, size_t size);
196
196
 
197
197
  /**
198
198
  * Returns a human-readable summary of this sketch
199
+ * @return a human-readable summary of this sketch
199
200
  */
200
201
  string<A> to_string() const;
201
202
 
@@ -205,16 +206,13 @@ public:
205
206
  */
206
207
  void serialize(std::ostream& os) const;
207
208
 
208
- // This is a convenience alias for users
209
- // The type returned by the following serialize method
210
- using vector_bytes = vector_u8<A>;
211
-
212
209
  /**
213
210
  * This method serializes the sketch as a vector of bytes.
214
211
  * An optional header can be reserved in front of the sketch.
215
212
  * It is an uninitialized space of a given size.
216
213
  * This header is used in Datasketches PostgreSQL extension.
217
214
  * @param header_size_bytes space to reserve in front of the sketch
215
+ * @return serialized sketch as a vector of bytes
218
216
  */
219
217
  vector_bytes serialize(unsigned header_size_bytes = 0) const;
220
218
 
@@ -222,6 +220,7 @@ public:
222
220
  * This method deserializes a sketch from a given stream.
223
221
  * @param is input stream
224
222
  * @param seed the seed for the hash function that was used to create the sketch
223
+ * @param allocator instance of an Allocator
225
224
  * @return an instance of a sketch
226
225
  */
227
226
  static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
@@ -231,6 +230,7 @@ public:
231
230
  * @param bytes pointer to the array of bytes
232
231
  * @param size the size of the array
233
232
  * @param seed the seed for the hash function that was used to create the sketch
233
+ * @param allocator instance of an Allocator
234
234
  * @return an instance of the sketch
235
235
  */
236
236
  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
@@ -246,10 +246,10 @@ public:
246
246
  */
247
247
  static size_t get_max_serialized_size_bytes(uint8_t lg_k);
248
248
 
249
- // for internal use
249
+ /// @private for internal use
250
250
  uint32_t get_num_coupons() const;
251
251
 
252
- // for debugging
252
+ /// @private for debugging
253
253
  // this should catch some forms of corruption during serialization-deserialization
254
254
  bool validate() const;
255
255
 
@@ -276,7 +276,7 @@ private:
276
276
  uint32_t num_coupons; // the number of coupons collected so far
277
277
 
278
278
  u32_table<A> surprising_value_table;
279
- vector_u8<A> sliding_window;
279
+ vector_bytes sliding_window;
280
280
  uint8_t window_offset; // derivable from num_coupons, but made explicit for speed
281
281
  uint8_t first_interesting_column; // This is part of a speed optimization
282
282
 
@@ -285,7 +285,7 @@ private:
285
285
 
286
286
  // for deserialization and cpc_union::get_result()
287
287
  cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table,
288
- vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
288
+ vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
289
289
 
290
290
  inline void row_col_update(uint32_t row_col);
291
291
  inline void update_sparse(uint32_t row_col);
@@ -308,7 +308,7 @@ private:
308
308
  static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c);
309
309
 
310
310
  // this produces a full-size k-by-64 bit matrix
311
- vector_u64<A> build_bit_matrix() const;
311
+ vector_u64 build_bit_matrix() const;
312
312
 
313
313
  static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window);
314
314
  inline void write_hip(std::ostream& os) const;
@@ -315,7 +315,7 @@ void cpc_sketch_alloc<A>::move_window() {
315
315
  const uint32_t k = 1 << lg_k;
316
316
 
317
317
  // Construct the full-sized bit matrix that corresponds to the sketch
318
- vector_u64<A> bit_matrix = build_bit_matrix();
318
+ vector_u64 bit_matrix = build_bit_matrix();
319
319
 
320
320
  // refresh the KXP register on every 8th window shift.
321
321
  if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
@@ -458,7 +458,7 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
458
458
  }
459
459
 
460
460
  template<typename A>
461
- vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
461
+ auto cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
462
462
  compressed_state<A> compressed(sliding_window.get_allocator());
463
463
  compressed.table_data_words = 0;
464
464
  compressed.table_num_entries = 0;
@@ -469,7 +469,7 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
469
469
  const bool has_window = compressed.window_data.size() > 0;
470
470
  const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
471
471
  const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
472
- vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
472
+ vector_bytes bytes(size, 0, sliding_window.get_allocator());
473
473
  uint8_t* ptr = bytes.data() + header_size_bytes;
474
474
  ptr += copy_to_mem(preamble_ints, ptr);
475
475
  const uint8_t serial_version = SERIAL_VERSION;
@@ -712,15 +712,18 @@ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
712
712
  template<typename A>
713
713
  size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
714
714
  check_lg_k(lg_k);
715
- if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
715
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) {
716
+ return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - cpc_constants::MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
717
+ }
716
718
  const uint32_t k = 1 << lg_k;
717
719
  return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
718
720
  }
719
721
 
720
722
  template<typename A>
721
723
  void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
722
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
723
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
724
+ if (lg_k < cpc_constants::MIN_LG_K || lg_k > cpc_constants::MAX_LG_K) {
725
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(cpc_constants::MIN_LG_K) + " and <= "
726
+ + std::to_string(cpc_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
724
727
  }
725
728
  }
726
729
 
@@ -731,14 +734,14 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
731
734
 
732
735
  template<typename A>
733
736
  bool cpc_sketch_alloc<A>::validate() const {
734
- vector_u64<A> bit_matrix = build_bit_matrix();
737
+ vector_u64 bit_matrix = build_bit_matrix();
735
738
  const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
736
739
  return num_bits_set == num_coupons;
737
740
  }
738
741
 
739
742
  template<typename A>
740
743
  cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column,
741
- u32_table<A>&& table, vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
744
+ u32_table<A>&& table, vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
742
745
  lg_k(lg_k),
743
746
  seed(seed),
744
747
  was_merged(!has_hip),
@@ -800,14 +803,14 @@ uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c)
800
803
  }
801
804
 
802
805
  template<typename A>
803
- vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
806
+ auto cpc_sketch_alloc<A>::build_bit_matrix() const -> vector_u64 {
804
807
  const uint32_t k = 1 << lg_k;
805
808
  if (window_offset > 56) throw std::logic_error("offset > 56");
806
809
 
807
810
  // Fill the matrix with default rows in which the "early zone" is filled with ones.
808
811
  // This is essential for the routine's O(k) time cost (as opposed to O(C)).
809
812
  const uint64_t default_row = (static_cast<uint64_t>(1) << window_offset) - 1;
810
- vector_u64<A> matrix(k, default_row, sliding_window.get_allocator());
813
+ vector_u64 matrix(k, default_row, sliding_window.get_allocator());
811
814
 
812
815
  if (num_coupons == 0) return matrix;
813
816
 
@@ -27,31 +27,55 @@
27
27
 
28
28
  namespace datasketches {
29
29
 
30
- /*
30
+ /// CPC union alias with default allocator
31
+ using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
32
+
33
+ /**
31
34
  * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union
32
35
  *
33
36
  * author Kevin Lang
34
37
  * author Alexander Saydakov
35
38
  */
36
-
37
- // alias with default allocator for convenience
38
- using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
39
-
40
39
  template<typename A>
41
40
  class cpc_union_alloc {
42
41
  public:
42
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
43
+ using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
44
+
43
45
  /**
44
46
  * Creates an instance of the union given the lg_k parameter and hash seed.
45
47
  * @param lg_k base 2 logarithm of the number of bins in the sketch
46
48
  * @param seed for hash function
49
+ * @param allocator instance of an allocator
47
50
  */
48
51
  explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
49
52
 
53
+ /**
54
+ * Copy constructor
55
+ * @param other union to be copied
56
+ */
50
57
  cpc_union_alloc(const cpc_union_alloc<A>& other);
58
+
59
+ /**
60
+ * Move constructor
61
+ * @param other union to be moved
62
+ */
51
63
  cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
64
+
52
65
  ~cpc_union_alloc();
53
66
 
67
+ /**
68
+ * Copy assignment
69
+ * @param other union to be copied
70
+ * @return reference to this union
71
+ */
54
72
  cpc_union_alloc<A>& operator=(const cpc_union_alloc<A>& other);
73
+
74
+ /**
75
+ * Move assignment
76
+ * @param other union to be moved
77
+ * @return reference to this union
78
+ */
55
79
  cpc_union_alloc<A>& operator=(cpc_union_alloc<A>&& other) noexcept;
56
80
 
57
81
  /**
@@ -73,14 +97,14 @@ public:
73
97
  cpc_sketch_alloc<A> get_result() const;
74
98
 
75
99
  private:
76
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> AllocU8;
77
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
78
- typedef typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>> AllocCpc;
100
+ using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
101
+ using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
102
+ using AllocCpc = typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>>;
79
103
 
80
104
  uint8_t lg_k;
81
105
  uint64_t seed;
82
106
  cpc_sketch_alloc<A>* accumulator;
83
- vector_u64<A> bit_matrix;
107
+ vector_u64 bit_matrix;
84
108
 
85
109
  template<typename S> void internal_update(S&& sketch); // to support both rvalue and lvalue
86
110
 
@@ -90,8 +114,8 @@ private:
90
114
  void switch_to_bit_matrix();
91
115
  void walk_table_updating_sketch(const u32_table<A>& table);
92
116
  void or_table_into_matrix(const u32_table<A>& table);
93
- void or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k);
94
- void or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k);
117
+ void or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k);
118
+ void or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k);
95
119
  void reduce_k(uint8_t new_lg_k);
96
120
  };
97
121
 
@@ -33,8 +33,8 @@ seed(seed),
33
33
  accumulator(nullptr),
34
34
  bit_matrix(allocator)
35
35
  {
36
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
37
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
36
+ if (lg_k < cpc_constants::MIN_LG_K || lg_k > cpc_constants::MAX_LG_K) {
37
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(cpc_constants::MIN_LG_K) + " and <= " + std::to_string(cpc_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
38
38
  }
39
39
  accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
40
40
  }
@@ -166,7 +166,7 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
166
166
  // SLIDING mode involves inverted logic, so we can't just walk the source sketch.
167
167
  // Instead, we convert it to a bitMatrix that can be OR'ed into the destination.
168
168
  if (cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D
169
- vector_u64<A> src_matrix = sketch.build_bit_matrix();
169
+ vector_u64 src_matrix = sketch.build_bit_matrix();
170
170
  or_matrix_into_matrix(src_matrix, sketch.get_lg_k());
171
171
  }
172
172
 
@@ -203,7 +203,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
203
203
 
204
204
  const uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
205
205
 
206
- vector_u8<A> sliding_window(k, 0, bit_matrix.get_allocator());
206
+ vector_bytes sliding_window(k, 0, bit_matrix.get_allocator());
207
207
  // don't need to zero the window's memory
208
208
 
209
209
  // dynamically growing caused snowplow effect
@@ -289,7 +289,7 @@ void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
289
289
  }
290
290
 
291
291
  template<typename A>
292
- void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
292
+ void cpc_union_alloc<A>::or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k) {
293
293
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
294
294
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
295
295
  const uint32_t src_k = 1 << src_lg_k;
@@ -299,7 +299,7 @@ void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_windo
299
299
  }
300
300
 
301
301
  template<typename A>
302
- void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
302
+ void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k) {
303
303
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
304
304
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
305
305
  const uint32_t src_k = 1 << src_lg_k;
@@ -315,10 +315,10 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
315
315
 
316
316
  if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix
317
317
  if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
318
- vector_u64<A> old_matrix = std::move(bit_matrix);
318
+ vector_u64 old_matrix = std::move(bit_matrix);
319
319
  const uint8_t old_lg_k = lg_k;
320
320
  const uint32_t new_k = 1 << new_lg_k;
321
- bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
321
+ bit_matrix = vector_u64(new_k, 0, old_matrix.get_allocator());
322
322
  lg_k = new_lg_k;
323
323
  or_matrix_into_matrix(old_matrix, old_lg_k);
324
324
  return;
@@ -38,6 +38,7 @@ static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
38
38
  template<typename A>
39
39
  class u32_table {
40
40
  public:
41
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
41
42
 
42
43
  u32_table(const A& allocator);
43
44
  u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
@@ -54,7 +55,7 @@ public:
54
55
 
55
56
  static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
56
57
 
57
- vector_u32<A> unwrapping_get_items() const;
58
+ vector_u32 unwrapping_get_items() const;
58
59
 
59
60
  static void merge(
60
61
  const uint32_t* arr_a, size_t start_a, size_t length_a, // input
@@ -70,7 +71,7 @@ private:
70
71
  uint8_t lg_size; // log2 of number of slots
71
72
  uint8_t num_valid_bits;
72
73
  uint32_t num_items;
73
- vector_u32<A> slots;
74
+ vector_u32 slots;
74
75
 
75
76
  inline uint32_t lookup(uint32_t item) const;
76
77
  inline void must_insert(uint32_t item);