datasketches 0.2.0 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -53,9 +53,7 @@ first_interesting_column(0),
53
53
  kxp(1 << lg_k),
54
54
  hip_est_accum(0)
55
55
  {
56
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
57
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
58
- }
56
+ check_lg_k(lg_k);
59
57
  }
60
58
 
61
59
  template<typename A>
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
176
174
 
177
175
  static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
178
176
  if (lg_k > 26) throw std::logic_error("lg_k > 26");
179
- const uint64_t k = 1 << lg_k;
177
+ const uint32_t k = 1 << lg_k;
180
178
  uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
181
179
  if (col > 63) col = 63; // clip so that 0 <= col <= 63
182
180
  const uint32_t row = hash0 & (k - 1);
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
188
186
  }
189
187
 
190
188
  template<typename A>
191
- void cpc_sketch_alloc<A>::update(const void* value, int size) {
189
+ void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
192
190
  HashState hashes;
193
191
  MurmurHash3_x64_128(value, size, seed, hashes);
194
192
  row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
208
206
 
209
207
  template<typename A>
210
208
  void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
211
- const uint64_t k = 1 << lg_k;
209
+ const uint32_t k = 1 << lg_k;
212
210
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
213
211
  if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
214
212
  bool is_novel = surprising_value_table.maybe_insert(row_col);
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
224
222
  template<typename A>
225
223
  void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
226
224
  if (window_offset > 56) throw std::logic_error("wrong window offset");
227
- const uint64_t k = 1 << lg_k;
225
+ const uint32_t k = 1 << lg_k;
228
226
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
229
227
  if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
230
228
  const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
266
264
  // Call this whenever a new coupon has been collected.
267
265
  template<typename A>
268
266
  void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
269
- const uint64_t k = 1 << lg_k;
267
+ const uint32_t k = 1 << lg_k;
270
268
  const uint8_t col = row_col & 63;
271
269
  const double one_over_p = static_cast<double>(k) / kxp;
272
270
  hip_est_accum += one_over_p;
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
276
274
  // In terms of flavor, this promotes SPARSE to HYBRID
277
275
  template<typename A>
278
276
  void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
279
- const uint64_t k = 1 << lg_k;
277
+ const uint32_t k = 1 << lg_k;
280
278
  const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
281
279
  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
282
280
 
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
285
283
  u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
286
284
 
287
285
  const uint32_t* old_slots = surprising_value_table.get_slots();
288
- const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
286
+ const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
289
287
 
290
288
  if (window_offset != 0) throw std::logic_error("window_offset != 0");
291
289
 
292
- for (size_t i = 0; i < old_num_slots; i++) {
290
+ for (uint32_t i = 0; i < old_num_slots; i++) {
293
291
  const uint32_t row_col = old_slots[i];
294
292
  if (row_col != UINT32_MAX) {
295
293
  const uint8_t col = row_col & 63;
296
294
  if (col < 8) {
297
- const size_t row = row_col >> 6;
295
+ const uint32_t row = row_col >> 6;
298
296
  sliding_window[row] |= 1 << col;
299
297
  } else {
300
298
  // cannot use u32_table::must_insert(), because it doesn't provide for growth
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
314
312
  if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
315
313
 
316
314
  if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
317
- const uint64_t k = 1 << lg_k;
315
+ const uint32_t k = 1 << lg_k;
318
316
 
319
317
  // Construct the full-sized bit matrix that corresponds to the sketch
320
318
  vector_u64<A> bit_matrix = build_bit_matrix();
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
328
326
  const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
329
327
  uint64_t all_surprises_ored = 0;
330
328
 
331
- for (size_t i = 0; i < k; i++) {
329
+ for (uint32_t i = 0; i < k; i++) {
332
330
  uint64_t pattern = bit_matrix[i];
333
331
  sliding_window[i] = (pattern >> new_offset) & 0xff;
334
332
  pattern &= mask_for_clearing_window;
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
357
355
  // so that it will reflect changes that were previously outside the mantissa.
358
356
  template<typename A>
359
357
  void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
360
- const uint64_t k = 1 << lg_k;
358
+ const uint32_t k = 1 << lg_k;
361
359
 
362
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
363
361
  double byte_sums[8]; // allocating on the stack
@@ -383,7 +381,9 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
383
381
 
384
382
  template<typename A>
385
383
  string<A> cpc_sketch_alloc<A>::to_string() const {
386
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
384
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
385
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
386
+ std::ostringstream os;
387
387
  os << "### CPC sketch summary:" << std::endl;
388
388
  os << " lg_k : " << std::to_string(lg_k) << std::endl;
389
389
  os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
@@ -394,14 +394,14 @@ string<A> cpc_sketch_alloc<A>::to_string() const {
394
394
  os << " HIP estimate : " << hip_est_accum << std::endl;
395
395
  os << " kxp : " << kxp << std::endl;
396
396
  }
397
- os << " intresting col : " << std::to_string(first_interesting_column) << std::endl;
397
+ os << " interesting col: " << std::to_string(first_interesting_column) << std::endl;
398
398
  os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
399
399
  os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
400
400
  if (sliding_window.size() > 0) {
401
401
  os << " window offset : " << std::to_string(window_offset) << std::endl;
402
402
  }
403
403
  os << "### End sketch summary" << std::endl;
404
- return os.str();
404
+ return string<A>(os.str().c_str(), sliding_window.get_allocator());
405
405
  }
406
406
 
407
407
  template<typename A>
@@ -415,44 +415,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
415
415
  const bool has_table = compressed.table_data.size() > 0;
416
416
  const bool has_window = compressed.window_data.size() > 0;
417
417
  const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
418
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
418
+ write(os, preamble_ints);
419
419
  const uint8_t serial_version = SERIAL_VERSION;
420
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
420
+ write(os, serial_version);
421
421
  const uint8_t family = FAMILY;
422
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
423
- os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
424
- os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
422
+ write(os, family);
423
+ write(os, lg_k);
424
+ write(os, first_interesting_column);
425
425
  const uint8_t flags_byte(
426
426
  (1 << flags::IS_COMPRESSED)
427
427
  | (has_hip ? 1 << flags::HAS_HIP : 0)
428
428
  | (has_table ? 1 << flags::HAS_TABLE : 0)
429
429
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
430
430
  );
431
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
431
+ write(os, flags_byte);
432
432
  const uint16_t seed_hash(compute_seed_hash(seed));
433
- os.write((char*)&seed_hash, sizeof(seed_hash));
433
+ write(os, seed_hash);
434
434
  if (!is_empty()) {
435
- os.write((char*)&num_coupons, sizeof(num_coupons));
435
+ write(os, num_coupons);
436
436
  if (has_table && has_window) {
437
437
  // if there is no window it is the same as number of coupons
438
- os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
438
+ write(os, compressed.table_num_entries);
439
439
  // HIP values can be in two different places in the sequence of fields
440
440
  // this is the first HIP decision point
441
441
  if (has_hip) write_hip(os);
442
442
  }
443
443
  if (has_table) {
444
- os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
444
+ write(os, compressed.table_data_words);
445
445
  }
446
446
  if (has_window) {
447
- os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
447
+ write(os, compressed.window_data_words);
448
448
  }
449
449
  // this is the second HIP decision point
450
450
  if (has_hip && !(has_table && has_window)) write_hip(os);
451
451
  if (has_window) {
452
- os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
452
+ write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
453
453
  }
454
454
  if (has_table) {
455
- os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
455
+ write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
456
456
  }
457
457
  }
458
458
  }
@@ -471,36 +471,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
471
471
  const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
472
472
  vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
473
473
  uint8_t* ptr = bytes.data() + header_size_bytes;
474
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
474
+ ptr += copy_to_mem(preamble_ints, ptr);
475
475
  const uint8_t serial_version = SERIAL_VERSION;
476
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
476
+ ptr += copy_to_mem(serial_version, ptr);
477
477
  const uint8_t family = FAMILY;
478
- ptr += copy_to_mem(&family, ptr, sizeof(family));
479
- ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
480
- ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
478
+ ptr += copy_to_mem(family, ptr);
479
+ ptr += copy_to_mem(lg_k, ptr);
480
+ ptr += copy_to_mem(first_interesting_column, ptr);
481
481
  const uint8_t flags_byte(
482
482
  (1 << flags::IS_COMPRESSED)
483
483
  | (has_hip ? 1 << flags::HAS_HIP : 0)
484
484
  | (has_table ? 1 << flags::HAS_TABLE : 0)
485
485
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
486
486
  );
487
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
487
+ ptr += copy_to_mem(flags_byte, ptr);
488
488
  const uint16_t seed_hash = compute_seed_hash(seed);
489
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
489
+ ptr += copy_to_mem(seed_hash, ptr);
490
490
  if (!is_empty()) {
491
- ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
491
+ ptr += copy_to_mem(num_coupons, ptr);
492
492
  if (has_table && has_window) {
493
493
  // if there is no window it is the same as number of coupons
494
- ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
494
+ ptr += copy_to_mem(compressed.table_num_entries, ptr);
495
495
  // HIP values can be in two different places in the sequence of fields
496
496
  // this is the first HIP decision point
497
497
  if (has_hip) ptr += copy_hip_to_mem(ptr);
498
498
  }
499
499
  if (has_table) {
500
- ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
500
+ ptr += copy_to_mem(compressed.table_data_words, ptr);
501
501
  }
502
502
  if (has_window) {
503
- ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
503
+ ptr += copy_to_mem(compressed.window_data_words, ptr);
504
504
  }
505
505
  // this is the second HIP decision point
506
506
  if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
@@ -517,20 +517,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
517
517
 
518
518
  template<typename A>
519
519
  cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
520
- uint8_t preamble_ints;
521
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
522
- uint8_t serial_version;
523
- is.read((char*)&serial_version, sizeof(serial_version));
524
- uint8_t family_id;
525
- is.read((char*)&family_id, sizeof(family_id));
526
- uint8_t lg_k;
527
- is.read((char*)&lg_k, sizeof(lg_k));
528
- uint8_t first_interesting_column;
529
- is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
530
- uint8_t flags_byte;
531
- is.read((char*)&flags_byte, sizeof(flags_byte));
532
- uint16_t seed_hash;
533
- is.read((char*)&seed_hash, sizeof(seed_hash));
520
+ const auto preamble_ints = read<uint8_t>(is);
521
+ const auto serial_version = read<uint8_t>(is);
522
+ const auto family_id = read<uint8_t>(is);
523
+ const auto lg_k = read<uint8_t>(is);
524
+ const auto first_interesting_column = read<uint8_t>(is);
525
+ const auto flags_byte = read<uint8_t>(is);
526
+ const auto seed_hash = read<uint16_t>(is);
534
527
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
535
528
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
536
529
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -542,31 +535,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
542
535
  double kxp = 0;
543
536
  double hip_est_accum = 0;
544
537
  if (has_table || has_window) {
545
- is.read((char*)&num_coupons, sizeof(num_coupons));
538
+ num_coupons = read<uint32_t>(is);
546
539
  if (has_table && has_window) {
547
- is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
540
+ compressed.table_num_entries = read<uint32_t>(is);
548
541
  if (has_hip) {
549
- is.read((char*)&kxp, sizeof(kxp));
550
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
542
+ kxp = read<double>(is);
543
+ hip_est_accum = read<double>(is);
551
544
  }
552
545
  }
553
546
  if (has_table) {
554
- is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
547
+ compressed.table_data_words = read<uint32_t>(is);
555
548
  }
556
549
  if (has_window) {
557
- is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
550
+ compressed.window_data_words = read<uint32_t>(is);
558
551
  }
559
552
  if (has_hip && !(has_table && has_window)) {
560
- is.read((char*)&kxp, sizeof(kxp));
561
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
553
+ kxp = read<double>(is);
554
+ hip_est_accum = read<double>(is);
562
555
  }
563
556
  if (has_window) {
564
557
  compressed.window_data.resize(compressed.window_data_words);
565
- is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
558
+ read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
566
559
  }
567
560
  if (has_table) {
568
561
  compressed.table_data.resize(compressed.table_data_words);
569
- is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
562
+ read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
570
563
  }
571
564
  if (!has_window) compressed.table_num_entries = num_coupons;
572
565
  }
@@ -602,19 +595,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
602
595
  const char* ptr = static_cast<const char*>(bytes);
603
596
  const char* base = static_cast<const char*>(bytes);
604
597
  uint8_t preamble_ints;
605
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
598
+ ptr += copy_from_mem(ptr, preamble_ints);
606
599
  uint8_t serial_version;
607
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
600
+ ptr += copy_from_mem(ptr, serial_version);
608
601
  uint8_t family_id;
609
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
602
+ ptr += copy_from_mem(ptr, family_id);
610
603
  uint8_t lg_k;
611
- ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
604
+ ptr += copy_from_mem(ptr, lg_k);
612
605
  uint8_t first_interesting_column;
613
- ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
606
+ ptr += copy_from_mem(ptr, first_interesting_column);
614
607
  uint8_t flags_byte;
615
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
608
+ ptr += copy_from_mem(ptr, flags_byte);
616
609
  uint16_t seed_hash;
617
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
610
+ ptr += copy_from_mem(ptr, seed_hash);
618
611
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
619
612
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
620
613
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -628,28 +621,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
628
621
  double hip_est_accum = 0;
629
622
  if (has_table || has_window) {
630
623
  check_memory_size(ptr - base + sizeof(num_coupons), size);
631
- ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
624
+ ptr += copy_from_mem(ptr, num_coupons);
632
625
  if (has_table && has_window) {
633
626
  check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
634
- ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
627
+ ptr += copy_from_mem(ptr, compressed.table_num_entries);
635
628
  if (has_hip) {
636
629
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
637
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
638
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
630
+ ptr += copy_from_mem(ptr, kxp);
631
+ ptr += copy_from_mem(ptr, hip_est_accum);
639
632
  }
640
633
  }
641
634
  if (has_table) {
642
635
  check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
643
- ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
636
+ ptr += copy_from_mem(ptr, compressed.table_data_words);
644
637
  }
645
638
  if (has_window) {
646
639
  check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
647
- ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
640
+ ptr += copy_from_mem(ptr, compressed.window_data_words);
648
641
  }
649
642
  if (has_hip && !(has_table && has_window)) {
650
643
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
651
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
652
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
644
+ ptr += copy_from_mem(ptr, kxp);
645
+ ptr += copy_from_mem(ptr, hip_est_accum);
653
646
  }
654
647
  if (has_window) {
655
648
  compressed.window_data.resize(compressed.window_data_words);
@@ -688,6 +681,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
688
681
  std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
689
682
  }
690
683
 
684
+ /*
685
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
686
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
687
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
688
+ * worst-case space for the preamble, which is added by the function.
689
+ */
690
+ static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
691
+ static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
692
+ 24, // lg_k = 4
693
+ 36, // lg_k = 5
694
+ 56, // lg_k = 6
695
+ 100, // lg_k = 7
696
+ 180, // lg_k = 8
697
+ 344, // lg_k = 9
698
+ 660, // lg_k = 10
699
+ 1292, // lg_k = 11
700
+ 2540, // lg_k = 12
701
+ 5020, // lg_k = 13
702
+ 9968, // lg_k = 14
703
+ 19836, // lg_k = 15
704
+ 39532, // lg_k = 16
705
+ 78880, // lg_k = 17
706
+ 157516, // lg_k = 18
707
+ 314656 // lg_k = 19
708
+ };
709
+ static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
710
+ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
711
+
712
+ template<typename A>
713
+ size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
714
+ check_lg_k(lg_k);
715
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
716
+ const uint32_t k = 1 << lg_k;
717
+ return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
718
+ }
719
+
720
+ template<typename A>
721
+ void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
722
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
723
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
724
+ }
725
+ }
726
+
691
727
  template<typename A>
692
728
  uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
693
729
  return num_coupons;
@@ -696,7 +732,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
696
732
  template<typename A>
697
733
  bool cpc_sketch_alloc<A>::validate() const {
698
734
  vector_u64<A> bit_matrix = build_bit_matrix();
699
- const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
735
+ const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
700
736
  return num_bits_set == num_coupons;
701
737
  }
702
738
 
@@ -744,7 +780,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
744
780
 
745
781
  template<typename A>
746
782
  typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
747
- const uint64_t k = 1 << lg_k;
783
+ const uint32_t k = 1 << lg_k;
748
784
  const uint64_t c2 = c << 1;
749
785
  const uint64_t c8 = c << 3;
750
786
  const uint64_t c32 = c << 5;
@@ -757,15 +793,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
757
793
 
758
794
  template<typename A>
759
795
  uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
760
- const uint64_t k = 1 << lg_k;
796
+ const uint32_t k = 1 << lg_k;
761
797
  const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
762
798
  if (tmp < 0) return 0;
763
- return tmp >> (lg_k + 3); // tmp / 8K
799
+ return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
764
800
  }
765
801
 
766
802
  template<typename A>
767
803
  vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
768
- const size_t k = 1 << lg_k;
804
+ const uint32_t k = 1 << lg_k;
769
805
  if (window_offset > 56) throw std::logic_error("offset > 56");
770
806
 
771
807
  // Fill the matrix with default rows in which the "early zone" is filled with ones.
@@ -782,12 +818,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
782
818
  }
783
819
 
784
820
  const uint32_t* slots = surprising_value_table.get_slots();
785
- const size_t num_slots = 1 << surprising_value_table.get_lg_size();
821
+ const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
786
822
  for (size_t i = 0; i < num_slots; i++) {
787
823
  const uint32_t row_col = slots[i];
788
824
  if (row_col != UINT32_MAX) {
789
825
  const uint8_t col = row_col & 63;
790
- const size_t row = row_col >> 6;
826
+ const uint32_t row = row_col >> 6;
791
827
  // Flip the specified matrix bit from its default value.
792
828
  // In the "early" zone the bit changes from 1 to 0.
793
829
  // In the "late" zone the bit changes from 0 to 1.
@@ -799,8 +835,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
799
835
 
800
836
  template<typename A>
801
837
  void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
802
- os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
803
- os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
838
+ write(os, kxp);
839
+ write(os, hip_est_accum);
804
840
  }
805
841
 
806
842
  template<typename A>
@@ -45,7 +45,7 @@ public:
45
45
  * @param lg_k base 2 logarithm of the number of bins in the sketch
46
46
  * @param seed for hash function
47
47
  */
48
- explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
48
+ explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
49
49
 
50
50
  cpc_union_alloc(const cpc_union_alloc<A>& other);
51
51
  cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
@@ -34,7 +34,7 @@ bit_matrix(allocator)
34
34
  if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
35
35
  throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
36
36
  }
37
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
37
+ accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
38
38
  }
39
39
 
40
40
  template<typename A>
@@ -45,7 +45,7 @@ accumulator(other.accumulator),
45
45
  bit_matrix(other.bit_matrix)
46
46
  {
47
47
  if (accumulator != nullptr) {
48
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
48
+ accumulator = new (AllocCpc(accumulator->get_allocator()).allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
49
49
  }
50
50
  }
51
51
 
@@ -62,8 +62,9 @@ bit_matrix(std::move(other.bit_matrix))
62
62
  template<typename A>
63
63
  cpc_union_alloc<A>::~cpc_union_alloc() {
64
64
  if (accumulator != nullptr) {
65
+ AllocCpc allocator(accumulator->get_allocator());
65
66
  accumulator->~cpc_sketch_alloc<A>();
66
- AllocCpc().deallocate(accumulator, 1);
67
+ allocator.deallocate(accumulator, 1);
67
68
  }
68
69
  }
69
70
 
@@ -181,7 +182,7 @@ template<typename A>
181
182
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
182
183
  if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
183
184
  if (accumulator->get_num_coupons() == 0) {
184
- return cpc_sketch_alloc<A>(lg_k, seed);
185
+ return cpc_sketch_alloc<A>(lg_k, seed, accumulator->get_allocator());
185
186
  }
186
187
  if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
187
188
  cpc_sketch_alloc<A> copy(*accumulator);
@@ -191,8 +192,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
191
192
 
192
193
  template<typename A>
193
194
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
194
- const uint64_t k = 1 << lg_k;
195
- const uint64_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
195
+ const uint32_t k = 1 << lg_k;
196
+ const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
196
197
 
197
198
  const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
198
199
  if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
@@ -215,7 +216,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
215
216
 
216
217
  // The snowplow effect was caused by processing the rows in order,
217
218
  // but we have fixed it by using a sufficiently large hash table.
218
- for (unsigned i = 0; i < k; i++) {
219
+ for (uint32_t i = 0; i < k; i++) {
219
220
  uint64_t pattern = bit_matrix[i];
220
221
  sliding_window[i] = (pattern >> offset) & 0xff;
221
222
  pattern &= mask_for_clearing_window;
@@ -242,25 +243,26 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
242
243
  template<typename A>
243
244
  void cpc_union_alloc<A>::switch_to_bit_matrix() {
244
245
  bit_matrix = accumulator->build_bit_matrix();
246
+ AllocCpc allocator(accumulator->get_allocator());
245
247
  accumulator->~cpc_sketch_alloc<A>();
246
- AllocCpc().deallocate(accumulator, 1);
248
+ allocator.deallocate(accumulator, 1);
247
249
  accumulator = nullptr;
248
250
  }
249
251
 
250
252
  template<typename A>
251
253
  void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
252
254
  const uint32_t* slots = table.get_slots();
253
- const size_t num_slots = 1 << table.get_lg_size();
255
+ const uint32_t num_slots = 1 << table.get_lg_size();
254
256
  const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
255
257
 
256
258
  // Using a golden ratio stride fixes the snowplow effect.
257
259
  const double golden = 0.6180339887498949025;
258
- size_t stride = static_cast<size_t>(golden * static_cast<double>(num_slots));
260
+ uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
259
261
  if (stride < 2) throw std::logic_error("stride < 2");
260
262
  if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
261
263
  if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
262
264
 
263
- for (size_t i = 0, j = 0; i < num_slots; i++, j += stride) {
265
+ for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
264
266
  j &= num_slots - 1;
265
267
  const uint32_t row_col = slots[j];
266
268
  if (row_col != UINT32_MAX) {
@@ -272,13 +274,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
272
274
  template<typename A>
273
275
  void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
274
276
  const uint32_t* slots = table.get_slots();
275
- const size_t num_slots = 1 << table.get_lg_size();
277
+ const uint32_t num_slots = 1 << table.get_lg_size();
276
278
  const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
277
- for (size_t i = 0; i < num_slots; i++) {
279
+ for (uint32_t i = 0; i < num_slots; i++) {
278
280
  const uint32_t row_col = slots[i];
279
281
  if (row_col != UINT32_MAX) {
280
282
  const uint8_t col = row_col & 63;
281
- const size_t row = row_col >> 6;
283
+ const uint32_t row = row_col >> 6;
282
284
  bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
283
285
  }
284
286
  }
@@ -288,8 +290,8 @@ template<typename A>
288
290
  void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
289
291
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
290
292
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
291
- const size_t src_k = 1 << src_lg_k;
292
- for (size_t src_row = 0; src_row < src_k; src_row++) {
293
+ const uint32_t src_k = 1 << src_lg_k;
294
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
293
295
  bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
294
296
  }
295
297
  }
@@ -298,8 +300,8 @@ template<typename A>
298
300
  void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
299
301
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
300
302
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
301
- const size_t src_k = 1 << src_lg_k;
302
- for (size_t src_row = 0; src_row < src_k; src_row++) {
303
+ const uint32_t src_k = 1 << src_lg_k;
304
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
303
305
  bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
304
306
  }
305
307
  }
@@ -313,7 +315,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
313
315
  if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
314
316
  vector_u64<A> old_matrix = std::move(bit_matrix);
315
317
  const uint8_t old_lg_k = lg_k;
316
- const size_t new_k = 1 << new_lg_k;
318
+ const uint32_t new_k = 1 << new_lg_k;
317
319
  bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
318
320
  lg_k = new_lg_k;
319
321
  or_matrix_into_matrix(old_matrix, old_lg_k);
@@ -324,7 +326,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
324
326
  if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
325
327
  if (!accumulator->is_empty()) {
326
328
  cpc_sketch_alloc<A> old_accumulator(*accumulator);
327
- *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed);
329
+ *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed, old_accumulator.get_allocator());
328
330
  walk_table_updating_sketch(old_accumulator.surprising_value_table);
329
331
  }
330
332
  lg_k = new_lg_k;