datasketches 0.2.0 → 0.2.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -65,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
65
65
  void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
66
66
  if (other.is_empty()) return;
67
67
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
68
- for (auto &it: other.map) {
68
+ for (auto it: other.map) {
69
69
  update(it.first, it.second);
70
70
  }
71
71
  offset += other.offset;
@@ -76,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
76
76
  void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
77
77
  if (other.is_empty()) return;
78
78
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
79
- for (auto &it: other.map) {
79
+ for (auto it: other.map) {
80
80
  update(std::move(it.first), it.second);
81
81
  }
82
82
  offset += other.offset;
@@ -147,7 +147,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
147
147
  typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
148
148
  frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
149
149
  vector_row items(map.get_allocator());
150
- for (auto &it: map) {
150
+ for (auto it: map) {
151
151
  const W lb = it.second;
152
152
  const W ub = it.second + offset;
153
153
  if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
@@ -162,28 +162,28 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
162
162
  template<typename T, typename W, typename H, typename E, typename S, typename A>
163
163
  void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
164
164
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
165
- os.write((char*)&preamble_longs, sizeof(preamble_longs));
165
+ write(os, preamble_longs);
166
166
  const uint8_t serial_version = SERIAL_VERSION;
167
- os.write((char*)&serial_version, sizeof(serial_version));
167
+ write(os, serial_version);
168
168
  const uint8_t family = FAMILY_ID;
169
- os.write((char*)&family, sizeof(family));
169
+ write(os, family);
170
170
  const uint8_t lg_max_size = map.get_lg_max_size();
171
- os.write((char*)&lg_max_size, sizeof(lg_max_size));
171
+ write(os, lg_max_size);
172
172
  const uint8_t lg_cur_size = map.get_lg_cur_size();
173
- os.write((char*)&lg_cur_size, sizeof(lg_cur_size));
173
+ write(os, lg_cur_size);
174
174
  const uint8_t flags_byte(
175
175
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
176
176
  );
177
- os.write((char*)&flags_byte, sizeof(flags_byte));
177
+ write(os, flags_byte);
178
178
  const uint16_t unused16 = 0;
179
- os.write((char*)&unused16, sizeof(unused16));
179
+ write(os, unused16);
180
180
  if (!is_empty()) {
181
181
  const uint32_t num_items = map.get_num_active();
182
- os.write((char*)&num_items, sizeof(num_items));
182
+ write(os, num_items);
183
183
  const uint32_t unused32 = 0;
184
- os.write((char*)&unused32, sizeof(unused32));
185
- os.write((char*)&total_weight, sizeof(total_weight));
186
- os.write((char*)&offset, sizeof(offset));
184
+ write(os, unused32);
185
+ write(os, total_weight);
186
+ write(os, offset);
187
187
 
188
188
  // copy active items and their weights to use batch serialization
189
189
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
@@ -192,14 +192,14 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const
192
192
  A alloc(map.get_allocator());
193
193
  T* items = alloc.allocate(num_items);
194
194
  uint32_t i = 0;
195
- for (auto &it: map) {
195
+ for (auto it: map) {
196
196
  new (&items[i]) T(it.first);
197
197
  weights[i++] = it.second;
198
198
  }
199
- os.write((char*)weights, sizeof(W) * num_items);
199
+ write(os, weights, sizeof(W) * num_items);
200
200
  aw.deallocate(weights, num_items);
201
201
  S().serialize(os, items, num_items);
202
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
202
+ for (i = 0; i < num_items; i++) items[i].~T();
203
203
  alloc.deallocate(items, num_items);
204
204
  }
205
205
  }
@@ -208,7 +208,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
208
208
  size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
209
209
  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
210
210
  size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
211
- for (auto &it: map) size += S().size_of_item(it.first);
211
+ for (auto it: map) size += S().size_of_item(it.first);
212
212
  return size;
213
213
  }
214
214
 
@@ -220,28 +220,26 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
220
220
  uint8_t* end_ptr = ptr + size;
221
221
 
222
222
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
223
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
223
+ ptr += copy_to_mem(preamble_longs, ptr);
224
224
  const uint8_t serial_version = SERIAL_VERSION;
225
- ptr += copy_to_mem(&serial_version, ptr, sizeof(uint8_t));
225
+ ptr += copy_to_mem(serial_version, ptr);
226
226
  const uint8_t family = FAMILY_ID;
227
- ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
227
+ ptr += copy_to_mem(family, ptr);
228
228
  const uint8_t lg_max_size = map.get_lg_max_size();
229
- ptr += copy_to_mem(&lg_max_size, ptr, sizeof(uint8_t));
229
+ ptr += copy_to_mem(lg_max_size, ptr);
230
230
  const uint8_t lg_cur_size = map.get_lg_cur_size();
231
- ptr += copy_to_mem(&lg_cur_size, ptr, sizeof(uint8_t));
231
+ ptr += copy_to_mem(lg_cur_size, ptr);
232
232
  const uint8_t flags_byte(
233
233
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
234
234
  );
235
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(uint8_t));
236
- const uint16_t unused16 = 0;
237
- ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
235
+ ptr += copy_to_mem(flags_byte, ptr);
236
+ ptr += sizeof(uint16_t); // unused
238
237
  if (!is_empty()) {
239
238
  const uint32_t num_items = map.get_num_active();
240
- ptr += copy_to_mem(&num_items, ptr, sizeof(uint32_t));
241
- const uint32_t unused32 = 0;
242
- ptr += copy_to_mem(&unused32, ptr, sizeof(uint32_t));
243
- ptr += copy_to_mem(&total_weight, ptr, sizeof(total_weight));
244
- ptr += copy_to_mem(&offset, ptr, sizeof(offset));
239
+ ptr += copy_to_mem(num_items, ptr);
240
+ ptr += sizeof(uint32_t); // unused
241
+ ptr += copy_to_mem(total_weight, ptr);
242
+ ptr += copy_to_mem(offset, ptr);
245
243
 
246
244
  // copy active items and their weights to use batch serialization
247
245
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
@@ -250,7 +248,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
250
248
  A alloc(map.get_allocator());
251
249
  T* items = alloc.allocate(num_items);
252
250
  uint32_t i = 0;
253
- for (auto &it: map) {
251
+ for (auto it: map) {
254
252
  new (&items[i]) T(it.first);
255
253
  weights[i++] = it.second;
256
254
  }
@@ -258,7 +256,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
258
256
  aw.deallocate(weights, num_items);
259
257
  const size_t bytes_remaining = end_ptr - ptr;
260
258
  ptr += S().serialize(ptr, bytes_remaining, items, num_items);
261
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
259
+ for (i = 0; i < num_items; i++) items[i].~T();
262
260
  alloc.deallocate(items, num_items);
263
261
  }
264
262
  return bytes;
@@ -268,38 +266,31 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
268
266
  class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
269
267
  public:
270
268
  items_deleter(uint32_t num, bool destroy, const A& allocator):
271
- allocator(allocator), num(num), destroy(destroy) {}
272
- void set_destroy(bool destroy) { this->destroy = destroy; }
269
+ allocator_(allocator), num_(num), destroy_(destroy) {}
270
+ void set_destroy(bool destroy) { destroy_ = destroy; }
273
271
  void operator() (T* ptr) {
274
272
  if (ptr != nullptr) {
275
- if (destroy) {
276
- for (uint32_t i = 0; i < num; ++i) ptr[i].~T();
273
+ if (destroy_) {
274
+ for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
277
275
  }
278
- allocator.deallocate(ptr, num);
276
+ allocator_.deallocate(ptr, num_);
279
277
  }
280
278
  }
281
279
  private:
282
- A allocator;
283
- uint32_t num;
284
- bool destroy;
280
+ A allocator_;
281
+ uint32_t num_;
282
+ bool destroy_;
285
283
  };
286
284
 
287
285
  template<typename T, typename W, typename H, typename E, typename S, typename A>
288
286
  frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
289
- uint8_t preamble_longs;
290
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
291
- uint8_t serial_version;
292
- is.read((char*)&serial_version, sizeof(serial_version));
293
- uint8_t family_id;
294
- is.read((char*)&family_id, sizeof(family_id));
295
- uint8_t lg_max_size;
296
- is.read((char*)&lg_max_size, sizeof(lg_max_size));
297
- uint8_t lg_cur_size;
298
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
299
- uint8_t flags_byte;
300
- is.read((char*)&flags_byte, sizeof(flags_byte));
301
- uint16_t unused16;
302
- is.read((char*)&unused16, sizeof(unused16));
287
+ const auto preamble_longs = read<uint8_t>(is);
288
+ const auto serial_version = read<uint8_t>(is);
289
+ const auto family_id = read<uint8_t>(is);
290
+ const auto lg_max_size = read<uint8_t>(is);
291
+ const auto lg_cur_size = read<uint8_t>(is);
292
+ const auto flags_byte = read<uint8_t>(is);
293
+ read<uint16_t>(is); // unused
303
294
 
304
295
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
305
296
 
@@ -310,19 +301,15 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
310
301
 
311
302
  frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
312
303
  if (!is_empty) {
313
- uint32_t num_items;
314
- is.read((char*)&num_items, sizeof(num_items));
315
- uint32_t unused32;
316
- is.read((char*)&unused32, sizeof(unused32));
317
- W total_weight;
318
- is.read((char*)&total_weight, sizeof(total_weight));
319
- W offset;
320
- is.read((char*)&offset, sizeof(offset));
304
+ const auto num_items = read<uint32_t>(is);
305
+ read<uint32_t>(is); // unused
306
+ const auto total_weight = read<W>(is);
307
+ const auto offset = read<W>(is);
321
308
 
322
309
  // batch deserialization with intermediate array of items and weights
323
310
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
324
311
  std::vector<W, AllocW> weights(num_items, 0, allocator);
325
- is.read((char*)weights.data(), sizeof(W) * num_items);
312
+ read(is, weights.data(), sizeof(W) * num_items);
326
313
  A alloc(allocator);
327
314
  std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
328
315
  S().deserialize(is, items.get(), num_items);
@@ -344,19 +331,18 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
344
331
  const char* ptr = static_cast<const char*>(bytes);
345
332
  const char* base = static_cast<const char*>(bytes);
346
333
  uint8_t preamble_longs;
347
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(uint8_t));
334
+ ptr += copy_from_mem(ptr, preamble_longs);
348
335
  uint8_t serial_version;
349
- ptr += copy_from_mem(ptr, &serial_version, sizeof(uint8_t));
336
+ ptr += copy_from_mem(ptr, serial_version);
350
337
  uint8_t family_id;
351
- ptr += copy_from_mem(ptr, &family_id, sizeof(uint8_t));
338
+ ptr += copy_from_mem(ptr, family_id);
352
339
  uint8_t lg_max_size;
353
- ptr += copy_from_mem(ptr, &lg_max_size, sizeof(uint8_t));
340
+ ptr += copy_from_mem(ptr, lg_max_size);
354
341
  uint8_t lg_cur_size;
355
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(uint8_t));
342
+ ptr += copy_from_mem(ptr, lg_cur_size);
356
343
  uint8_t flags_byte;
357
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(uint8_t));
358
- uint16_t unused16;
359
- ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
344
+ ptr += copy_from_mem(ptr, flags_byte);
345
+ ptr += sizeof(uint16_t); // unused
360
346
 
361
347
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
362
348
 
@@ -364,18 +350,17 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
364
350
  check_serial_version(serial_version);
365
351
  check_family_id(family_id);
366
352
  check_size(lg_cur_size, lg_max_size);
367
- ensure_minimum_memory(size, 1 << preamble_longs);
353
+ ensure_minimum_memory(size, preamble_longs * sizeof(uint64_t));
368
354
 
369
355
  frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
370
356
  if (!is_empty) {
371
357
  uint32_t num_items;
372
- ptr += copy_from_mem(ptr, &num_items, sizeof(uint32_t));
373
- uint32_t unused32;
374
- ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
358
+ ptr += copy_from_mem(ptr, num_items);
359
+ ptr += sizeof(uint32_t); // unused
375
360
  W total_weight;
376
- ptr += copy_from_mem(ptr, &total_weight, sizeof(total_weight));
361
+ ptr += copy_from_mem(ptr, total_weight);
377
362
  W offset;
378
- ptr += copy_from_mem(ptr, &offset, sizeof(offset));
363
+ ptr += copy_from_mem(ptr, offset);
379
364
 
380
365
  ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
381
366
  // batch deserialization with intermediate array of items and weights
@@ -436,7 +421,9 @@ void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, ui
436
421
 
437
422
  template<typename T, typename W, typename H, typename E, typename S, typename A>
438
423
  string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) const {
439
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
424
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
425
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
426
+ std::ostringstream os;
440
427
  os << "### Frequent items sketch summary:" << std::endl;
441
428
  os << " lg cur map size : " << (int) map.get_lg_cur_size() << std::endl;
442
429
  os << " lg max map size : " << (int) map.get_lg_max_size() << std::endl;
@@ -446,20 +433,20 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
446
433
  os << "### End sketch summary" << std::endl;
447
434
  if (print_items) {
448
435
  vector_row items;
449
- for (auto &it: map) {
436
+ for (auto it: map) {
450
437
  items.push_back(row(&it.first, it.second, offset));
451
438
  }
452
439
  // sort by estimate in descending order
453
440
  std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
454
441
  os << "### Items in descending order by estimate" << std::endl;
455
442
  os << " item, estimate, lower bound, upper bound" << std::endl;
456
- for (auto &it: items) {
443
+ for (auto it: items) {
457
444
  os << " " << it.get_item() << ", " << it.get_estimate() << ", "
458
445
  << it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
459
446
  }
460
447
  os << "### End items" << std::endl;
461
448
  }
462
- return os.str();
449
+ return string<A>(os.str().c_str(), map.get_allocator());
463
450
  }
464
451
 
465
452
  // version for integral signed type
@@ -39,15 +39,15 @@ allocator_(allocator),
39
39
  lg_cur_size_(lg_cur_size),
40
40
  lg_max_size_(lg_max_size),
41
41
  num_active_(0),
42
- keys_(allocator_.allocate(1 << lg_cur_size)),
42
+ keys_(allocator_.allocate(1ULL << lg_cur_size)),
43
43
  values_(nullptr),
44
44
  states_(nullptr)
45
45
  {
46
46
  AllocV av(allocator_);
47
- values_ = av.allocate(1 << lg_cur_size);
47
+ values_ = av.allocate(1ULL << lg_cur_size);
48
48
  AllocU16 au16(allocator_);
49
- states_ = au16.allocate(1 << lg_cur_size);
50
- std::fill(states_, states_ + (1 << lg_cur_size), 0);
49
+ states_ = au16.allocate(1ULL << lg_cur_size);
50
+ std::fill(states_, states_ + (1ULL << lg_cur_size), static_cast<uint16_t>(0));
51
51
  }
52
52
 
53
53
  template<typename K, typename V, typename H, typename E, typename A>
@@ -56,14 +56,14 @@ allocator_(other.allocator_),
56
56
  lg_cur_size_(other.lg_cur_size_),
57
57
  lg_max_size_(other.lg_max_size_),
58
58
  num_active_(other.num_active_),
59
- keys_(allocator_.allocate(1 << lg_cur_size_)),
59
+ keys_(allocator_.allocate(1ULL << lg_cur_size_)),
60
60
  values_(nullptr),
61
61
  states_(nullptr)
62
62
  {
63
63
  AllocV av(allocator_);
64
- values_ = av.allocate(1 << lg_cur_size_);
64
+ values_ = av.allocate(1ULL << lg_cur_size_);
65
65
  AllocU16 au16(allocator_);
66
- states_ = au16.allocate(1 << lg_cur_size_);
66
+ states_ = au16.allocate(1ULL << lg_cur_size_);
67
67
  const uint32_t size = 1 << lg_cur_size_;
68
68
  if (num_active_ > 0) {
69
69
  auto num = num_active_;
@@ -177,7 +177,7 @@ uint8_t reverse_purge_hash_map<K, V, H, E, A>::get_lg_max_size() const {
177
177
 
178
178
  template<typename K, typename V, typename H, typename E, typename A>
179
179
  uint32_t reverse_purge_hash_map<K, V, H, E, A>::get_capacity() const {
180
- return (1 << lg_cur_size_) * LOAD_FACTOR;
180
+ return static_cast<uint32_t>((1 << lg_cur_size_) * LOAD_FACTOR);
181
181
  }
182
182
 
183
183
  template<typename K, typename V, typename H, typename E, typename A>
@@ -246,7 +246,7 @@ void reverse_purge_hash_map<K, V, H, E, A>::hash_delete(uint32_t delete_index) {
246
246
  // if none are found, the status is changed
247
247
  states_[delete_index] = 0; // mark as empty
248
248
  keys_[delete_index].~K();
249
- uint32_t drift = 1;
249
+ uint16_t drift = 1;
250
250
  const uint32_t mask = (1 << lg_cur_size_) - 1;
251
251
  uint32_t probe = (delete_index + drift) & mask; // map length must be a power of 2
252
252
  // advance until we find a free location replacing locations as needed
@@ -322,7 +322,7 @@ void reverse_purge_hash_map<K, V, H, E, A>::resize(uint8_t lg_new_size) {
322
322
  values_ = av.allocate(new_size);
323
323
  AllocU16 au16(allocator_);
324
324
  states_ = au16.allocate(new_size);
325
- std::fill(states_, states_ + new_size, 0);
325
+ std::fill(states_, states_ + new_size, static_cast<uint16_t>(0));
326
326
  num_active_ = 0;
327
327
  lg_cur_size_ = lg_new_size;
328
328
  for (uint32_t i = 0; i < old_size; i++) {
@@ -39,8 +39,8 @@ TEST_CASE("reverse purge hash map: one item", "[frequent_items_sketch]") {
39
39
  TEST_CASE("reverse purge hash map: iterator", "[frequent_items_sketch]") {
40
40
  reverse_purge_hash_map<int> map(3, 4, std::allocator<int>());
41
41
  for (int i = 0; i < 11; i++) map.adjust_or_insert(i, 1); // this should fit with no purge
42
- int sum = 0;
43
- for (auto &it: map) sum += it.second;
42
+ uint64_t sum = 0;
43
+ for (auto it: map) sum += it.second;
44
44
  REQUIRE(sum == 11);
45
45
  }
46
46
 
@@ -32,64 +32,41 @@ target_include_directories(hll
32
32
  target_link_libraries(hll INTERFACE common)
33
33
  target_compile_features(hll INTERFACE cxx_std_11)
34
34
 
35
- # TODO: would be useful if this didn't need to be reproduced in target_sources(), too
36
- set(hll_HEADERS "")
37
- list(APPEND hll_HEADERS "include/hll.hpp;include/AuxHashMap.hpp;include/CompositeInterpolationXTable.hpp")
38
- list(APPEND hll_HEADERS "include/hll.private.hpp;include/HllSketchImplFactory.hpp")
39
- list(APPEND hll_HEADERS "include/CouponHashSet.hpp;include/CouponList.hpp")
40
- list(APPEND hll_HEADERS "include/CubicInterpolation.hpp;include/HarmonicNumbers.hpp;include/Hll4Array.hpp")
41
- list(APPEND hll_HEADERS "include/Hll6Array.hpp;include/Hll8Array.hpp;include/HllArray.hpp")
42
- list(APPEND hll_HEADERS "include/HllSketchImpl.hpp")
43
- list(APPEND hll_HEADERS "include/HllUtil.hpp;include/coupon_iterator.hpp")
44
- list(APPEND hll_HEADERS "include/RelativeErrorTables.hpp;include/AuxHashMap-internal.hpp")
45
- list(APPEND hll_HEADERS "include/CompositeInterpolationXTable-internal.hpp")
46
- list(APPEND hll_HEADERS "include/CouponHashSet-internal.hpp;include/CouponList-internal.hpp")
47
- list(APPEND hll_HEADERS "include/CubicInterpolation-internal.hpp;include/HarmonicNumbers-internal.hpp")
48
- list(APPEND hll_HEADERS "include/Hll4Array-internal.hpp;include/Hll6Array-internal.hpp")
49
- list(APPEND hll_HEADERS "include/Hll8Array-internal.hpp;include/HllArray-internal.hpp")
50
- list(APPEND hll_HEADERS "include/HllSketch-internal.hpp")
51
- list(APPEND hll_HEADERS "include/HllSketchImpl-internal.hpp;include/HllUnion-internal.hpp")
52
- list(APPEND hll_HEADERS "include/coupon_iterator-internal.hpp;include/RelativeErrorTables-internal.hpp")
53
-
54
35
  install(TARGETS hll
55
36
  EXPORT ${PROJECT_NAME}
56
37
  )
57
38
 
58
- install(FILES ${hll_HEADERS}
39
+ install(FILES
40
+ include/hll.hpp
41
+ include/AuxHashMap.hpp
42
+ include/CompositeInterpolationXTable.hpp
43
+ include/hll.private.hpp
44
+ include/HllSketchImplFactory.hpp
45
+ include/CouponHashSet.hpp
46
+ include/CouponList.hpp
47
+ include/CubicInterpolation.hpp
48
+ include/HarmonicNumbers.hpp
49
+ include/Hll4Array.hpp
50
+ include/Hll6Array.hpp
51
+ include/Hll8Array.hpp
52
+ include/HllArray.hpp
53
+ include/HllSketchImpl.hpp
54
+ include/HllUtil.hpp
55
+ include/coupon_iterator.hpp
56
+ include/RelativeErrorTables.hpp
57
+ include/AuxHashMap-internal.hpp
58
+ include/CompositeInterpolationXTable-internal.hpp
59
+ include/CouponHashSet-internal.hpp
60
+ include/CouponList-internal.hpp
61
+ include/CubicInterpolation-internal.hpp
62
+ include/HarmonicNumbers-internal.hpp
63
+ include/Hll4Array-internal.hpp
64
+ include/Hll6Array-internal.hpp
65
+ include/Hll8Array-internal.hpp
66
+ include/HllArray-internal.hpp
67
+ include/HllSketch-internal.hpp
68
+ include/HllSketchImpl-internal.hpp
69
+ include/HllUnion-internal.hpp
70
+ include/coupon_iterator-internal.hpp
71
+ include/RelativeErrorTables-internal.hpp
59
72
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
60
-
61
- target_sources(hll
62
- INTERFACE
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/hll.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/hll.private.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/AuxHashMap.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CompositeInterpolationXTable.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CouponHashSet.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CouponList.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CubicInterpolation.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HarmonicNumbers.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll4Array.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll6Array.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll8Array.hpp
74
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllArray.hpp
75
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllSketchImpl.hpp
76
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllSketchImplFactory.hpp
77
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllUtil.hpp
78
- ${CMAKE_CURRENT_SOURCE_DIR}/include/RelativeErrorTables.hpp
79
- ${CMAKE_CURRENT_SOURCE_DIR}/include/coupon_iterator.hpp
80
- ${CMAKE_CURRENT_SOURCE_DIR}/include/AuxHashMap-internal.hpp
81
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CompositeInterpolationXTable-internal.hpp
82
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CouponHashSet-internal.hpp
83
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CouponList-internal.hpp
84
- ${CMAKE_CURRENT_SOURCE_DIR}/include/CubicInterpolation-internal.hpp
85
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HarmonicNumbers-internal.hpp
86
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll4Array-internal.hpp
87
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll6Array-internal.hpp
88
- ${CMAKE_CURRENT_SOURCE_DIR}/include/Hll8Array-internal.hpp
89
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllArray-internal.hpp
90
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllSketch-internal.hpp
91
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllSketchImpl-internal.hpp
92
- ${CMAKE_CURRENT_SOURCE_DIR}/include/HllUnion-internal.hpp
93
- ${CMAKE_CURRENT_SOURCE_DIR}/include/RelativeErrorTables-internal.hpp
94
- ${CMAKE_CURRENT_SOURCE_DIR}/include/coupon_iterator-internal.hpp
95
- )