datasketches 0.1.2 → 0.2.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (205)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +17 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/ext/datasketches/cpc_wrapper.cpp +12 -13
  6. data/ext/datasketches/ext.cpp +1 -1
  7. data/ext/datasketches/ext.h +4 -0
  8. data/ext/datasketches/extconf.rb +1 -1
  9. data/ext/datasketches/fi_wrapper.cpp +6 -8
  10. data/ext/datasketches/hll_wrapper.cpp +13 -14
  11. data/ext/datasketches/kll_wrapper.cpp +28 -76
  12. data/ext/datasketches/theta_wrapper.cpp +27 -41
  13. data/ext/datasketches/vo_wrapper.cpp +4 -6
  14. data/lib/datasketches/version.rb +1 -1
  15. data/vendor/datasketches-cpp/CMakeLists.txt +10 -0
  16. data/vendor/datasketches-cpp/LICENSE +40 -3
  17. data/vendor/datasketches-cpp/NOTICE +1 -1
  18. data/vendor/datasketches-cpp/README.md +4 -4
  19. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +18 -7
  20. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  21. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  22. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  23. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  24. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  25. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +12 -0
  26. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  27. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +24 -0
  28. data/vendor/datasketches-cpp/common/test/integration_test.cpp +77 -0
  29. data/vendor/datasketches-cpp/common/test/test_allocator.hpp +9 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +13 -3
  31. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +20 -20
  32. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +116 -105
  33. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +22 -6
  34. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +140 -101
  35. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +2 -2
  36. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +20 -20
  37. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -16
  38. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +6 -6
  39. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +10 -10
  40. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +21 -21
  41. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -0
  42. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  43. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +237 -0
  44. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  45. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  46. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +15 -10
  47. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +102 -105
  48. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +19 -13
  49. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +141 -125
  50. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +15 -12
  51. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +5 -5
  52. data/vendor/datasketches-cpp/hll/CMakeLists.txt +3 -0
  53. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +81 -109
  54. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +25 -24
  55. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  56. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +5 -5
  57. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +89 -105
  58. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +13 -13
  59. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +130 -165
  60. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +21 -22
  61. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  62. data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +2 -2
  63. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  64. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +2 -2
  65. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +88 -83
  66. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  67. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +34 -45
  68. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +7 -8
  69. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +41 -52
  70. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +7 -8
  71. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +220 -251
  72. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +42 -42
  73. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +36 -38
  74. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  75. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +15 -14
  76. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +47 -44
  77. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +62 -87
  78. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +121 -128
  79. data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +1 -1
  80. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  81. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  82. data/vendor/datasketches-cpp/hll/include/hll.hpp +25 -53
  83. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +8 -8
  84. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +36 -36
  85. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +28 -28
  86. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  87. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +37 -37
  88. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +57 -61
  89. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  90. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  91. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  92. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  93. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  94. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  95. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +40 -25
  96. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +50 -6
  97. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +164 -136
  98. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  99. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  100. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  101. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +11 -10
  102. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +178 -88
  103. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  104. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  105. data/vendor/datasketches-cpp/python/CMakeLists.txt +12 -6
  106. data/vendor/datasketches-cpp/python/README.md +52 -49
  107. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  108. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  109. data/vendor/datasketches-cpp/python/src/datasketches.cpp +2 -0
  110. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -6
  111. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +4 -2
  112. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +246 -0
  113. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +38 -28
  114. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  115. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  116. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -2
  117. data/vendor/datasketches-cpp/python/tests/kll_test.py +5 -5
  118. data/vendor/datasketches-cpp/python/tests/req_test.py +126 -0
  119. data/vendor/datasketches-cpp/python/tests/theta_test.py +28 -3
  120. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  121. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  122. data/vendor/datasketches-cpp/req/CMakeLists.txt +60 -0
  123. data/vendor/datasketches-cpp/{tuple/include/theta_a_not_b_experimental_impl.hpp → req/include/req_common.hpp} +18 -8
  124. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +137 -0
  125. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +488 -0
  126. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +69 -0
  127. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +60 -0
  128. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +395 -0
  129. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +810 -0
  130. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +43 -0
  131. data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
  132. data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
  133. data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
  134. data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
  135. data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
  136. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +128 -0
  137. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +494 -0
  138. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +19 -13
  139. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +130 -127
  140. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -5
  141. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +41 -49
  142. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -0
  143. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +96 -0
  144. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  145. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -44
  146. data/vendor/datasketches-cpp/setup.py +11 -6
  147. data/vendor/datasketches-cpp/theta/CMakeLists.txt +30 -3
  148. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_sampled_sets.hpp +3 -2
  149. data/vendor/datasketches-cpp/{tuple → theta}/include/bounds_on_ratios_in_theta_sketched_sets.hpp +1 -1
  150. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  151. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  152. data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +12 -29
  153. data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +5 -46
  154. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_comparators.hpp +0 -0
  155. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_constants.hpp +11 -4
  156. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_helpers.hpp +0 -0
  157. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +26 -28
  158. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base.hpp +0 -0
  159. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_intersection_base_impl.hpp +0 -0
  160. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +8 -90
  161. data/vendor/datasketches-cpp/{tuple/test/theta_union_experimental_test.cpp → theta/include/theta_jaccard_similarity.hpp} +11 -18
  162. data/vendor/datasketches-cpp/{tuple/include/jaccard_similarity.hpp → theta/include/theta_jaccard_similarity_base.hpp} +24 -36
  163. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base.hpp +0 -0
  164. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_set_difference_base_impl.hpp +5 -0
  165. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +163 -256
  166. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +250 -651
  167. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +27 -60
  168. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base.hpp +1 -1
  169. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_union_base_impl.hpp +6 -1
  170. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +13 -69
  171. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base.hpp +10 -21
  172. data/vendor/datasketches-cpp/{tuple → theta}/include/theta_update_sketch_base_impl.hpp +44 -30
  173. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  174. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  175. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  176. data/vendor/datasketches-cpp/{tuple → theta}/test/theta_jaccard_similarity_test.cpp +60 -5
  177. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +74 -235
  178. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  179. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +3 -35
  180. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  181. data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +38 -0
  182. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -13
  183. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +57 -70
  184. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +1 -6
  185. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  186. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +18 -21
  187. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +13 -16
  188. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +7 -6
  189. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +3 -3
  190. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  191. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +13 -16
  192. metadata +51 -36
  193. data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +0 -53
  194. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +0 -78
  195. data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +0 -43
  196. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +0 -393
  197. data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +0 -481
  198. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +0 -88
  199. data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +0 -47
  200. data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +0 -250
  201. data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
  202. data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
  203. data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
  204. data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +0 -224
  205. data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +0 -247
@@ -33,10 +33,14 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
33
33
  const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
34
34
 
35
35
  template<typename T, typename W, typename H, typename E, typename S, typename A>
36
- frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size):
36
+ frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size, const A& allocator):
37
37
  total_weight(0),
38
38
  offset(0),
39
- map(std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE), std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE))
39
+ map(
40
+ std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
41
+ std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE),
42
+ allocator
43
+ )
40
44
  {
41
45
  if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
42
46
  }
@@ -61,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
61
65
  void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
62
66
  if (other.is_empty()) return;
63
67
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
64
- for (auto &it: other.map) {
68
+ for (auto it: other.map) {
65
69
  update(it.first, it.second);
66
70
  }
67
71
  offset += other.offset;
@@ -72,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
72
76
  void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
73
77
  if (other.is_empty()) return;
74
78
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
75
- for (auto &it: other.map) {
79
+ for (auto it: other.map) {
76
80
  update(std::move(it.first), it.second);
77
81
  }
78
82
  offset += other.offset;
@@ -142,8 +146,8 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
142
146
  template<typename T, typename W, typename H, typename E, typename S, typename A>
143
147
  typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
144
148
  frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
145
- vector_row items;
146
- for (auto &it: map) {
149
+ vector_row items(map.get_allocator());
150
+ for (auto it: map) {
147
151
  const W lb = it.second;
148
152
  const W ub = it.second + offset;
149
153
  if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
@@ -158,43 +162,45 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
158
162
  template<typename T, typename W, typename H, typename E, typename S, typename A>
159
163
  void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
160
164
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
161
- os.write((char*)&preamble_longs, sizeof(preamble_longs));
165
+ write(os, preamble_longs);
162
166
  const uint8_t serial_version = SERIAL_VERSION;
163
- os.write((char*)&serial_version, sizeof(serial_version));
167
+ write(os, serial_version);
164
168
  const uint8_t family = FAMILY_ID;
165
- os.write((char*)&family, sizeof(family));
169
+ write(os, family);
166
170
  const uint8_t lg_max_size = map.get_lg_max_size();
167
- os.write((char*)&lg_max_size, sizeof(lg_max_size));
171
+ write(os, lg_max_size);
168
172
  const uint8_t lg_cur_size = map.get_lg_cur_size();
169
- os.write((char*)&lg_cur_size, sizeof(lg_cur_size));
173
+ write(os, lg_cur_size);
170
174
  const uint8_t flags_byte(
171
175
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
172
176
  );
173
- os.write((char*)&flags_byte, sizeof(flags_byte));
177
+ write(os, flags_byte);
174
178
  const uint16_t unused16 = 0;
175
- os.write((char*)&unused16, sizeof(unused16));
179
+ write(os, unused16);
176
180
  if (!is_empty()) {
177
181
  const uint32_t num_items = map.get_num_active();
178
- os.write((char*)&num_items, sizeof(num_items));
182
+ write(os, num_items);
179
183
  const uint32_t unused32 = 0;
180
- os.write((char*)&unused32, sizeof(unused32));
181
- os.write((char*)&total_weight, sizeof(total_weight));
182
- os.write((char*)&offset, sizeof(offset));
184
+ write(os, unused32);
185
+ write(os, total_weight);
186
+ write(os, offset);
183
187
 
184
188
  // copy active items and their weights to use batch serialization
185
- typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
186
- W* weights = AllocW().allocate(num_items);
187
- T* items = A().allocate(num_items);
189
+ using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
190
+ AllocW aw(map.get_allocator());
191
+ W* weights = aw.allocate(num_items);
192
+ A alloc(map.get_allocator());
193
+ T* items = alloc.allocate(num_items);
188
194
  uint32_t i = 0;
189
- for (auto &it: map) {
195
+ for (auto it: map) {
190
196
  new (&items[i]) T(it.first);
191
197
  weights[i++] = it.second;
192
198
  }
193
- os.write((char*)weights, sizeof(W) * num_items);
194
- AllocW().deallocate(weights, num_items);
199
+ write(os, weights, sizeof(W) * num_items);
200
+ aw.deallocate(weights, num_items);
195
201
  S().serialize(os, items, num_items);
196
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
197
- A().deallocate(items, num_items);
202
+ for (i = 0; i < num_items; i++) items[i].~T();
203
+ alloc.deallocate(items, num_items);
198
204
  }
199
205
  }
200
206
 
@@ -202,56 +208,56 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
202
208
  size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
203
209
  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
204
210
  size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
205
- for (auto &it: map) size += S().size_of_item(it.first);
211
+ for (auto it: map) size += S().size_of_item(it.first);
206
212
  return size;
207
213
  }
208
214
 
209
215
  template<typename T, typename W, typename H, typename E, typename S, typename A>
210
- vector_u8<A> frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes) const {
216
+ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
211
217
  const size_t size = header_size_bytes + get_serialized_size_bytes();
212
- vector_u8<A> bytes(size);
218
+ vector_bytes bytes(size, 0, map.get_allocator());
213
219
  uint8_t* ptr = bytes.data() + header_size_bytes;
214
220
  uint8_t* end_ptr = ptr + size;
215
221
 
216
222
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
217
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
223
+ ptr += copy_to_mem(preamble_longs, ptr);
218
224
  const uint8_t serial_version = SERIAL_VERSION;
219
- ptr += copy_to_mem(&serial_version, ptr, sizeof(uint8_t));
225
+ ptr += copy_to_mem(serial_version, ptr);
220
226
  const uint8_t family = FAMILY_ID;
221
- ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
227
+ ptr += copy_to_mem(family, ptr);
222
228
  const uint8_t lg_max_size = map.get_lg_max_size();
223
- ptr += copy_to_mem(&lg_max_size, ptr, sizeof(uint8_t));
229
+ ptr += copy_to_mem(lg_max_size, ptr);
224
230
  const uint8_t lg_cur_size = map.get_lg_cur_size();
225
- ptr += copy_to_mem(&lg_cur_size, ptr, sizeof(uint8_t));
231
+ ptr += copy_to_mem(lg_cur_size, ptr);
226
232
  const uint8_t flags_byte(
227
233
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
228
234
  );
229
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(uint8_t));
230
- const uint16_t unused16 = 0;
231
- ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
235
+ ptr += copy_to_mem(flags_byte, ptr);
236
+ ptr += sizeof(uint16_t); // unused
232
237
  if (!is_empty()) {
233
238
  const uint32_t num_items = map.get_num_active();
234
- ptr += copy_to_mem(&num_items, ptr, sizeof(uint32_t));
235
- const uint32_t unused32 = 0;
236
- ptr += copy_to_mem(&unused32, ptr, sizeof(uint32_t));
237
- ptr += copy_to_mem(&total_weight, ptr, sizeof(total_weight));
238
- ptr += copy_to_mem(&offset, ptr, sizeof(offset));
239
+ ptr += copy_to_mem(num_items, ptr);
240
+ ptr += sizeof(uint32_t); // unused
241
+ ptr += copy_to_mem(total_weight, ptr);
242
+ ptr += copy_to_mem(offset, ptr);
239
243
 
240
244
  // copy active items and their weights to use batch serialization
241
- typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
242
- W* weights = AllocW().allocate(num_items);
243
- T* items = A().allocate(num_items);
245
+ using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
246
+ AllocW aw(map.get_allocator());
247
+ W* weights = aw.allocate(num_items);
248
+ A alloc(map.get_allocator());
249
+ T* items = alloc.allocate(num_items);
244
250
  uint32_t i = 0;
245
- for (auto &it: map) {
251
+ for (auto it: map) {
246
252
  new (&items[i]) T(it.first);
247
253
  weights[i++] = it.second;
248
254
  }
249
255
  ptr += copy_to_mem(weights, ptr, sizeof(W) * num_items);
250
- AllocW().deallocate(weights, num_items);
256
+ aw.deallocate(weights, num_items);
251
257
  const size_t bytes_remaining = end_ptr - ptr;
252
258
  ptr += S().serialize(ptr, bytes_remaining, items, num_items);
253
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
254
- A().deallocate(items, num_items);
259
+ for (i = 0; i < num_items; i++) items[i].~T();
260
+ alloc.deallocate(items, num_items);
255
261
  }
256
262
  return bytes;
257
263
  }
@@ -259,37 +265,32 @@ vector_u8<A> frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_
259
265
  template<typename T, typename W, typename H, typename E, typename S, typename A>
260
266
  class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
261
267
  public:
262
- items_deleter(uint32_t num, bool destroy): num(num), destroy(destroy) {}
263
- void set_destroy(bool destroy) { this->destroy = destroy; }
264
- void operator() (T* ptr) const {
268
+ items_deleter(uint32_t num, bool destroy, const A& allocator):
269
+ allocator_(allocator), num_(num), destroy_(destroy) {}
270
+ void set_destroy(bool destroy) { destroy_ = destroy; }
271
+ void operator() (T* ptr) {
265
272
  if (ptr != nullptr) {
266
- if (destroy) {
267
- for (uint32_t i = 0; i < num; ++i) ptr[i].~T();
273
+ if (destroy_) {
274
+ for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
268
275
  }
269
- A().deallocate(ptr, num);
276
+ allocator_.deallocate(ptr, num_);
270
277
  }
271
278
  }
272
279
  private:
273
- uint32_t num;
274
- bool destroy;
280
+ A allocator_;
281
+ uint32_t num_;
282
+ bool destroy_;
275
283
  };
276
284
 
277
285
  template<typename T, typename W, typename H, typename E, typename S, typename A>
278
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is) {
279
- uint8_t preamble_longs;
280
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
281
- uint8_t serial_version;
282
- is.read((char*)&serial_version, sizeof(serial_version));
283
- uint8_t family_id;
284
- is.read((char*)&family_id, sizeof(family_id));
285
- uint8_t lg_max_size;
286
- is.read((char*)&lg_max_size, sizeof(lg_max_size));
287
- uint8_t lg_cur_size;
288
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
289
- uint8_t flags_byte;
290
- is.read((char*)&flags_byte, sizeof(flags_byte));
291
- uint16_t unused16;
292
- is.read((char*)&unused16, sizeof(unused16));
286
+ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
287
+ const auto preamble_longs = read<uint8_t>(is);
288
+ const auto serial_version = read<uint8_t>(is);
289
+ const auto family_id = read<uint8_t>(is);
290
+ const auto lg_max_size = read<uint8_t>(is);
291
+ const auto lg_cur_size = read<uint8_t>(is);
292
+ const auto flags_byte = read<uint8_t>(is);
293
+ read<uint16_t>(is); // unused
293
294
 
294
295
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
295
296
 
@@ -298,22 +299,19 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
298
299
  check_family_id(family_id);
299
300
  check_size(lg_cur_size, lg_max_size);
300
301
 
301
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
302
+ frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
302
303
  if (!is_empty) {
303
- uint32_t num_items;
304
- is.read((char*)&num_items, sizeof(num_items));
305
- uint32_t unused32;
306
- is.read((char*)&unused32, sizeof(unused32));
307
- W total_weight;
308
- is.read((char*)&total_weight, sizeof(total_weight));
309
- W offset;
310
- is.read((char*)&offset, sizeof(offset));
304
+ const auto num_items = read<uint32_t>(is);
305
+ read<uint32_t>(is); // unused
306
+ const auto total_weight = read<W>(is);
307
+ const auto offset = read<W>(is);
311
308
 
312
309
  // batch deserialization with intermediate array of items and weights
313
- typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
314
- std::vector<W, AllocW> weights(num_items);
315
- is.read((char*)weights.data(), sizeof(W) * num_items);
316
- std::unique_ptr<T, items_deleter> items(A().allocate(num_items), items_deleter(num_items, false));
310
+ using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
311
+ std::vector<W, AllocW> weights(num_items, 0, allocator);
312
+ read(is, weights.data(), sizeof(W) * num_items);
313
+ A alloc(allocator);
314
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
317
315
  S().deserialize(is, items.get(), num_items);
318
316
  items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
319
317
  for (uint32_t i = 0; i < num_items; i++) {
@@ -328,24 +326,23 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
328
326
  }
329
327
 
330
328
  template<typename T, typename W, typename H, typename E, typename S, typename A>
331
- frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size) {
329
+ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
332
330
  ensure_minimum_memory(size, 8);
333
331
  const char* ptr = static_cast<const char*>(bytes);
334
332
  const char* base = static_cast<const char*>(bytes);
335
333
  uint8_t preamble_longs;
336
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(uint8_t));
334
+ ptr += copy_from_mem(ptr, preamble_longs);
337
335
  uint8_t serial_version;
338
- ptr += copy_from_mem(ptr, &serial_version, sizeof(uint8_t));
336
+ ptr += copy_from_mem(ptr, serial_version);
339
337
  uint8_t family_id;
340
- ptr += copy_from_mem(ptr, &family_id, sizeof(uint8_t));
338
+ ptr += copy_from_mem(ptr, family_id);
341
339
  uint8_t lg_max_size;
342
- ptr += copy_from_mem(ptr, &lg_max_size, sizeof(uint8_t));
340
+ ptr += copy_from_mem(ptr, lg_max_size);
343
341
  uint8_t lg_cur_size;
344
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(uint8_t));
342
+ ptr += copy_from_mem(ptr, lg_cur_size);
345
343
  uint8_t flags_byte;
346
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(uint8_t));
347
- uint16_t unused16;
348
- ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
344
+ ptr += copy_from_mem(ptr, flags_byte);
345
+ ptr += sizeof(uint16_t); // unused
349
346
 
350
347
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
351
348
 
@@ -353,25 +350,25 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
353
350
  check_serial_version(serial_version);
354
351
  check_family_id(family_id);
355
352
  check_size(lg_cur_size, lg_max_size);
356
- ensure_minimum_memory(size, 1 << preamble_longs);
353
+ ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
357
354
 
358
- frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
355
+ frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
359
356
  if (!is_empty) {
360
357
  uint32_t num_items;
361
- ptr += copy_from_mem(ptr, &num_items, sizeof(uint32_t));
362
- uint32_t unused32;
363
- ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
358
+ ptr += copy_from_mem(ptr, num_items);
359
+ ptr += sizeof(uint32_t); // unused
364
360
  W total_weight;
365
- ptr += copy_from_mem(ptr, &total_weight, sizeof(total_weight));
361
+ ptr += copy_from_mem(ptr, total_weight);
366
362
  W offset;
367
- ptr += copy_from_mem(ptr, &offset, sizeof(offset));
363
+ ptr += copy_from_mem(ptr, offset);
368
364
 
369
365
  ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
370
366
  // batch deserialization with intermediate array of items and weights
371
- typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
372
- std::vector<W, AllocW> weights(num_items);
367
+ using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
368
+ std::vector<W, AllocW> weights(num_items, 0, allocator);
373
369
  ptr += copy_from_mem(ptr, weights.data(), sizeof(W) * num_items);
374
- std::unique_ptr<T, items_deleter> items(A().allocate(num_items), items_deleter(num_items, false));
370
+ A alloc(allocator);
371
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
375
372
  const size_t bytes_remaining = size - (ptr - base);
376
373
  ptr += S().deserialize(ptr, bytes_remaining, items.get(), num_items);
377
374
  items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
@@ -434,14 +431,14 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
434
431
  os << "### End sketch summary" << std::endl;
435
432
  if (print_items) {
436
433
  vector_row items;
437
- for (auto &it: map) {
434
+ for (auto it: map) {
438
435
  items.push_back(row(&it.first, it.second, offset));
439
436
  }
440
437
  // sort by estimate in descending order
441
438
  std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
442
439
  os << "### Items in descending order by estimate" << std::endl;
443
440
  os << " item, estimate, lower bound, upper bound" << std::endl;
444
- for (auto &it: items) {
441
+ for (auto it: items) {
445
442
  os << " " << it.get_item() << ", " << it.get_estimate() << ", "
446
443
  << it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
447
444
  }
@@ -39,33 +39,39 @@ public:
39
39
  using AllocV = typename std::allocator_traits<A>::template rebind_alloc<V>;
40
40
  using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
41
41
 
42
- reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size);
42
+ reverse_purge_hash_map(uint8_t lg_size, uint8_t lg_max_size, const A& allocator);
43
43
  reverse_purge_hash_map(const reverse_purge_hash_map& other);
44
44
  reverse_purge_hash_map(reverse_purge_hash_map&& other) noexcept;
45
45
  ~reverse_purge_hash_map();
46
46
  reverse_purge_hash_map& operator=(reverse_purge_hash_map other);
47
47
  reverse_purge_hash_map& operator=(reverse_purge_hash_map&& other);
48
- V adjust_or_insert(const K& key, V value);
49
- V adjust_or_insert(K&& key, V value);
48
+
49
+ template<typename FwdK>
50
+ V adjust_or_insert(FwdK&& key, V value);
51
+
50
52
  V get(const K& key) const;
51
53
  uint8_t get_lg_cur_size() const;
52
54
  uint8_t get_lg_max_size() const;
53
55
  uint32_t get_capacity() const;
54
56
  uint32_t get_num_active() const;
57
+ const A& get_allocator() const;
58
+
55
59
  class iterator;
56
60
  iterator begin() const;
57
61
  iterator end() const;
62
+
58
63
  private:
59
64
  static constexpr double LOAD_FACTOR = 0.75;
60
65
  static constexpr uint16_t DRIFT_LIMIT = 1024; // used only for stress testing
61
66
  static constexpr uint32_t MAX_SAMPLE_SIZE = 1024; // number of samples to compute approximate median during purge
62
67
 
63
- uint8_t lg_cur_size;
64
- uint8_t lg_max_size;
65
- uint32_t num_active;
66
- K* keys;
67
- V* values;
68
- uint16_t* states;
68
+ A allocator_;
69
+ uint8_t lg_cur_size_;
70
+ uint8_t lg_max_size_;
71
+ uint32_t num_active_;
72
+ K* keys_;
73
+ V* values_;
74
+ uint16_t* states_;
69
75
 
70
76
  inline bool is_active(uint32_t probe) const;
71
77
  void subtract_and_keep_positive_only(V amount);
@@ -83,8 +89,8 @@ public:
83
89
  friend class reverse_purge_hash_map<K, V, H, E, A>;
84
90
  iterator& operator++() {
85
91
  ++count;
86
- if (count < map->num_active) {
87
- const uint32_t mask = (1 << map->lg_cur_size) - 1;
92
+ if (count < map->num_active_) {
93
+ const uint32_t mask = (1 << map->lg_cur_size_) - 1;
88
94
  do {
89
95
  index = (index + stride) & mask;
90
96
  } while (!map->is_active(index));
@@ -95,7 +101,7 @@ public:
95
101
  bool operator==(const iterator& rhs) const { return count == rhs.count; }
96
102
  bool operator!=(const iterator& rhs) const { return count != rhs.count; }
97
103
  const std::pair<K&, V> operator*() const {
98
- return std::pair<K&, V>(map->keys[index], map->values[index]);
104
+ return std::pair<K&, V>(map->keys_[index], map->values_[index]);
99
105
  }
100
106
  private:
101
107
  static constexpr double GOLDEN_RATIO_RECIPROCAL = 0.6180339887498949; // = (sqrt(5) - 1) / 2
@@ -104,7 +110,7 @@ private:
104
110
  uint32_t count;
105
111
  uint32_t stride;
106
112
  iterator(const reverse_purge_hash_map<K, V, H, E, A>* map, uint32_t index, uint32_t count):
107
- map(map), index(index), count(count), stride(static_cast<uint32_t>((1 << map->lg_cur_size) * GOLDEN_RATIO_RECIPROCAL) | 1) {}
113
+ map(map), index(index), count(count), stride(static_cast<uint32_t>((1 << map->lg_cur_size_) * GOLDEN_RATIO_RECIPROCAL) | 1) {}
108
114
  };
109
115
 
110
116
  } /* namespace datasketches */