datasketches 0.2.0 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (170) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +16 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +7 -7
  6. data/ext/datasketches/extconf.rb +1 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +31 -3
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  16. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  17. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  18. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  19. data/vendor/datasketches-cpp/common/include/common_defs.hpp +26 -0
  20. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  21. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  22. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  26. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +15 -2
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +126 -90
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +22 -20
  31. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  32. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  33. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  34. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  35. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  36. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  37. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  38. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  39. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  40. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +69 -82
  41. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  42. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  43. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  44. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  45. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  46. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  47. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  48. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  49. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  50. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  51. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  52. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  53. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  54. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  55. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  56. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  57. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  58. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  59. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  60. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  61. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  62. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +34 -32
  63. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  64. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  65. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  66. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  67. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  68. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  69. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  70. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  71. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  72. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  73. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  74. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  75. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  76. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  77. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  78. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  79. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  80. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  81. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  82. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  83. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  84. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  85. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +41 -4
  86. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +76 -64
  87. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  88. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  89. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  90. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +133 -46
  91. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  92. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  93. data/vendor/datasketches-cpp/python/CMakeLists.txt +10 -6
  94. data/vendor/datasketches-cpp/python/README.md +50 -50
  95. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  96. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  97. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  98. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +1 -1
  99. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +8 -8
  100. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +11 -5
  101. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  102. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  103. data/vendor/datasketches-cpp/python/tests/kll_test.py +2 -2
  104. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  105. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  106. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  107. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  108. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  109. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  110. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  111. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  112. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +13 -11
  113. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  114. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  115. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +10 -5
  116. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +61 -64
  117. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +42 -48
  118. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  119. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  120. data/vendor/datasketches-cpp/setup.py +10 -7
  121. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  122. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  123. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  124. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +137 -0
  125. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  126. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  127. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  128. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  129. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  130. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  131. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  132. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +73 -15
  133. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +247 -103
  134. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +10 -5
  135. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  136. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -3
  137. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  138. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +11 -5
  139. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +70 -37
  140. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  141. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  142. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  144. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  145. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  146. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  147. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  148. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  149. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  150. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +437 -1
  151. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +41 -9
  152. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  153. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  154. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +50 -63
  155. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  156. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  157. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  158. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +84 -78
  159. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  160. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  161. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +17 -1
  162. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  163. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  164. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  165. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  166. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +66 -28
  167. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +19 -12
  168. metadata +18 -7
  169. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  170. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -53,9 +53,7 @@ first_interesting_column(0),
53
53
  kxp(1 << lg_k),
54
54
  hip_est_accum(0)
55
55
  {
56
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
57
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
58
- }
56
+ check_lg_k(lg_k);
59
57
  }
60
58
 
61
59
  template<typename A>
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
176
174
 
177
175
  static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
178
176
  if (lg_k > 26) throw std::logic_error("lg_k > 26");
179
- const uint64_t k = 1 << lg_k;
177
+ const uint32_t k = 1 << lg_k;
180
178
  uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
181
179
  if (col > 63) col = 63; // clip so that 0 <= col <= 63
182
180
  const uint32_t row = hash0 & (k - 1);
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
188
186
  }
189
187
 
190
188
  template<typename A>
191
- void cpc_sketch_alloc<A>::update(const void* value, int size) {
189
+ void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
192
190
  HashState hashes;
193
191
  MurmurHash3_x64_128(value, size, seed, hashes);
194
192
  row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
208
206
 
209
207
  template<typename A>
210
208
  void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
211
- const uint64_t k = 1 << lg_k;
209
+ const uint32_t k = 1 << lg_k;
212
210
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
213
211
  if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
214
212
  bool is_novel = surprising_value_table.maybe_insert(row_col);
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
224
222
  template<typename A>
225
223
  void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
226
224
  if (window_offset > 56) throw std::logic_error("wrong window offset");
227
- const uint64_t k = 1 << lg_k;
225
+ const uint32_t k = 1 << lg_k;
228
226
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
229
227
  if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
230
228
  const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
266
264
  // Call this whenever a new coupon has been collected.
267
265
  template<typename A>
268
266
  void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
269
- const uint64_t k = 1 << lg_k;
267
+ const uint32_t k = 1 << lg_k;
270
268
  const uint8_t col = row_col & 63;
271
269
  const double one_over_p = static_cast<double>(k) / kxp;
272
270
  hip_est_accum += one_over_p;
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
276
274
  // In terms of flavor, this promotes SPARSE to HYBRID
277
275
  template<typename A>
278
276
  void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
279
- const uint64_t k = 1 << lg_k;
277
+ const uint32_t k = 1 << lg_k;
280
278
  const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
281
279
  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
282
280
 
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
285
283
  u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
286
284
 
287
285
  const uint32_t* old_slots = surprising_value_table.get_slots();
288
- const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
286
+ const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
289
287
 
290
288
  if (window_offset != 0) throw std::logic_error("window_offset != 0");
291
289
 
292
- for (size_t i = 0; i < old_num_slots; i++) {
290
+ for (uint32_t i = 0; i < old_num_slots; i++) {
293
291
  const uint32_t row_col = old_slots[i];
294
292
  if (row_col != UINT32_MAX) {
295
293
  const uint8_t col = row_col & 63;
296
294
  if (col < 8) {
297
- const size_t row = row_col >> 6;
295
+ const uint32_t row = row_col >> 6;
298
296
  sliding_window[row] |= 1 << col;
299
297
  } else {
300
298
  // cannot use u32_table::must_insert(), because it doesn't provide for growth
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
314
312
  if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
315
313
 
316
314
  if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
317
- const uint64_t k = 1 << lg_k;
315
+ const uint32_t k = 1 << lg_k;
318
316
 
319
317
  // Construct the full-sized bit matrix that corresponds to the sketch
320
318
  vector_u64<A> bit_matrix = build_bit_matrix();
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
328
326
  const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
329
327
  uint64_t all_surprises_ored = 0;
330
328
 
331
- for (size_t i = 0; i < k; i++) {
329
+ for (uint32_t i = 0; i < k; i++) {
332
330
  uint64_t pattern = bit_matrix[i];
333
331
  sliding_window[i] = (pattern >> new_offset) & 0xff;
334
332
  pattern &= mask_for_clearing_window;
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
357
355
  // so that it will reflect changes that were previously outside the mantissa.
358
356
  template<typename A>
359
357
  void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
360
- const uint64_t k = 1 << lg_k;
358
+ const uint32_t k = 1 << lg_k;
361
359
 
362
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
363
361
  double byte_sums[8]; // allocating on the stack
@@ -383,7 +381,9 @@ void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
383
381
 
384
382
  template<typename A>
385
383
  string<A> cpc_sketch_alloc<A>::to_string() const {
386
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
384
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
385
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
386
+ std::ostringstream os;
387
387
  os << "### CPC sketch summary:" << std::endl;
388
388
  os << " lg_k : " << std::to_string(lg_k) << std::endl;
389
389
  os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
@@ -394,14 +394,14 @@ string<A> cpc_sketch_alloc<A>::to_string() const {
394
394
  os << " HIP estimate : " << hip_est_accum << std::endl;
395
395
  os << " kxp : " << kxp << std::endl;
396
396
  }
397
- os << " intresting col : " << std::to_string(first_interesting_column) << std::endl;
397
+ os << " interesting col: " << std::to_string(first_interesting_column) << std::endl;
398
398
  os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
399
399
  os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
400
400
  if (sliding_window.size() > 0) {
401
401
  os << " window offset : " << std::to_string(window_offset) << std::endl;
402
402
  }
403
403
  os << "### End sketch summary" << std::endl;
404
- return os.str();
404
+ return string<A>(os.str().c_str(), sliding_window.get_allocator());
405
405
  }
406
406
 
407
407
  template<typename A>
@@ -415,44 +415,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
415
415
  const bool has_table = compressed.table_data.size() > 0;
416
416
  const bool has_window = compressed.window_data.size() > 0;
417
417
  const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
418
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
418
+ write(os, preamble_ints);
419
419
  const uint8_t serial_version = SERIAL_VERSION;
420
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
420
+ write(os, serial_version);
421
421
  const uint8_t family = FAMILY;
422
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
423
- os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
424
- os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
422
+ write(os, family);
423
+ write(os, lg_k);
424
+ write(os, first_interesting_column);
425
425
  const uint8_t flags_byte(
426
426
  (1 << flags::IS_COMPRESSED)
427
427
  | (has_hip ? 1 << flags::HAS_HIP : 0)
428
428
  | (has_table ? 1 << flags::HAS_TABLE : 0)
429
429
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
430
430
  );
431
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
431
+ write(os, flags_byte);
432
432
  const uint16_t seed_hash(compute_seed_hash(seed));
433
- os.write((char*)&seed_hash, sizeof(seed_hash));
433
+ write(os, seed_hash);
434
434
  if (!is_empty()) {
435
- os.write((char*)&num_coupons, sizeof(num_coupons));
435
+ write(os, num_coupons);
436
436
  if (has_table && has_window) {
437
437
  // if there is no window it is the same as number of coupons
438
- os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
438
+ write(os, compressed.table_num_entries);
439
439
  // HIP values can be in two different places in the sequence of fields
440
440
  // this is the first HIP decision point
441
441
  if (has_hip) write_hip(os);
442
442
  }
443
443
  if (has_table) {
444
- os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
444
+ write(os, compressed.table_data_words);
445
445
  }
446
446
  if (has_window) {
447
- os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
447
+ write(os, compressed.window_data_words);
448
448
  }
449
449
  // this is the second HIP decision point
450
450
  if (has_hip && !(has_table && has_window)) write_hip(os);
451
451
  if (has_window) {
452
- os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
452
+ write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
453
453
  }
454
454
  if (has_table) {
455
- os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
455
+ write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
456
456
  }
457
457
  }
458
458
  }
@@ -471,36 +471,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
471
471
  const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
472
472
  vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
473
473
  uint8_t* ptr = bytes.data() + header_size_bytes;
474
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
474
+ ptr += copy_to_mem(preamble_ints, ptr);
475
475
  const uint8_t serial_version = SERIAL_VERSION;
476
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
476
+ ptr += copy_to_mem(serial_version, ptr);
477
477
  const uint8_t family = FAMILY;
478
- ptr += copy_to_mem(&family, ptr, sizeof(family));
479
- ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
480
- ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
478
+ ptr += copy_to_mem(family, ptr);
479
+ ptr += copy_to_mem(lg_k, ptr);
480
+ ptr += copy_to_mem(first_interesting_column, ptr);
481
481
  const uint8_t flags_byte(
482
482
  (1 << flags::IS_COMPRESSED)
483
483
  | (has_hip ? 1 << flags::HAS_HIP : 0)
484
484
  | (has_table ? 1 << flags::HAS_TABLE : 0)
485
485
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
486
486
  );
487
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
487
+ ptr += copy_to_mem(flags_byte, ptr);
488
488
  const uint16_t seed_hash = compute_seed_hash(seed);
489
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
489
+ ptr += copy_to_mem(seed_hash, ptr);
490
490
  if (!is_empty()) {
491
- ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
491
+ ptr += copy_to_mem(num_coupons, ptr);
492
492
  if (has_table && has_window) {
493
493
  // if there is no window it is the same as number of coupons
494
- ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
494
+ ptr += copy_to_mem(compressed.table_num_entries, ptr);
495
495
  // HIP values can be in two different places in the sequence of fields
496
496
  // this is the first HIP decision point
497
497
  if (has_hip) ptr += copy_hip_to_mem(ptr);
498
498
  }
499
499
  if (has_table) {
500
- ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
500
+ ptr += copy_to_mem(compressed.table_data_words, ptr);
501
501
  }
502
502
  if (has_window) {
503
- ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
503
+ ptr += copy_to_mem(compressed.window_data_words, ptr);
504
504
  }
505
505
  // this is the second HIP decision point
506
506
  if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
@@ -517,20 +517,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
517
517
 
518
518
  template<typename A>
519
519
  cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
520
- uint8_t preamble_ints;
521
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
522
- uint8_t serial_version;
523
- is.read((char*)&serial_version, sizeof(serial_version));
524
- uint8_t family_id;
525
- is.read((char*)&family_id, sizeof(family_id));
526
- uint8_t lg_k;
527
- is.read((char*)&lg_k, sizeof(lg_k));
528
- uint8_t first_interesting_column;
529
- is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
530
- uint8_t flags_byte;
531
- is.read((char*)&flags_byte, sizeof(flags_byte));
532
- uint16_t seed_hash;
533
- is.read((char*)&seed_hash, sizeof(seed_hash));
520
+ const auto preamble_ints = read<uint8_t>(is);
521
+ const auto serial_version = read<uint8_t>(is);
522
+ const auto family_id = read<uint8_t>(is);
523
+ const auto lg_k = read<uint8_t>(is);
524
+ const auto first_interesting_column = read<uint8_t>(is);
525
+ const auto flags_byte = read<uint8_t>(is);
526
+ const auto seed_hash = read<uint16_t>(is);
534
527
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
535
528
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
536
529
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -542,31 +535,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
542
535
  double kxp = 0;
543
536
  double hip_est_accum = 0;
544
537
  if (has_table || has_window) {
545
- is.read((char*)&num_coupons, sizeof(num_coupons));
538
+ num_coupons = read<uint32_t>(is);
546
539
  if (has_table && has_window) {
547
- is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
540
+ compressed.table_num_entries = read<uint32_t>(is);
548
541
  if (has_hip) {
549
- is.read((char*)&kxp, sizeof(kxp));
550
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
542
+ kxp = read<double>(is);
543
+ hip_est_accum = read<double>(is);
551
544
  }
552
545
  }
553
546
  if (has_table) {
554
- is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
547
+ compressed.table_data_words = read<uint32_t>(is);
555
548
  }
556
549
  if (has_window) {
557
- is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
550
+ compressed.window_data_words = read<uint32_t>(is);
558
551
  }
559
552
  if (has_hip && !(has_table && has_window)) {
560
- is.read((char*)&kxp, sizeof(kxp));
561
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
553
+ kxp = read<double>(is);
554
+ hip_est_accum = read<double>(is);
562
555
  }
563
556
  if (has_window) {
564
557
  compressed.window_data.resize(compressed.window_data_words);
565
- is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
558
+ read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
566
559
  }
567
560
  if (has_table) {
568
561
  compressed.table_data.resize(compressed.table_data_words);
569
- is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
562
+ read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
570
563
  }
571
564
  if (!has_window) compressed.table_num_entries = num_coupons;
572
565
  }
@@ -602,19 +595,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
602
595
  const char* ptr = static_cast<const char*>(bytes);
603
596
  const char* base = static_cast<const char*>(bytes);
604
597
  uint8_t preamble_ints;
605
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
598
+ ptr += copy_from_mem(ptr, preamble_ints);
606
599
  uint8_t serial_version;
607
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
600
+ ptr += copy_from_mem(ptr, serial_version);
608
601
  uint8_t family_id;
609
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
602
+ ptr += copy_from_mem(ptr, family_id);
610
603
  uint8_t lg_k;
611
- ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
604
+ ptr += copy_from_mem(ptr, lg_k);
612
605
  uint8_t first_interesting_column;
613
- ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
606
+ ptr += copy_from_mem(ptr, first_interesting_column);
614
607
  uint8_t flags_byte;
615
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
608
+ ptr += copy_from_mem(ptr, flags_byte);
616
609
  uint16_t seed_hash;
617
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
610
+ ptr += copy_from_mem(ptr, seed_hash);
618
611
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
619
612
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
620
613
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -628,28 +621,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
628
621
  double hip_est_accum = 0;
629
622
  if (has_table || has_window) {
630
623
  check_memory_size(ptr - base + sizeof(num_coupons), size);
631
- ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
624
+ ptr += copy_from_mem(ptr, num_coupons);
632
625
  if (has_table && has_window) {
633
626
  check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
634
- ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
627
+ ptr += copy_from_mem(ptr, compressed.table_num_entries);
635
628
  if (has_hip) {
636
629
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
637
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
638
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
630
+ ptr += copy_from_mem(ptr, kxp);
631
+ ptr += copy_from_mem(ptr, hip_est_accum);
639
632
  }
640
633
  }
641
634
  if (has_table) {
642
635
  check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
643
- ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
636
+ ptr += copy_from_mem(ptr, compressed.table_data_words);
644
637
  }
645
638
  if (has_window) {
646
639
  check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
647
- ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
640
+ ptr += copy_from_mem(ptr, compressed.window_data_words);
648
641
  }
649
642
  if (has_hip && !(has_table && has_window)) {
650
643
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
651
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
652
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
644
+ ptr += copy_from_mem(ptr, kxp);
645
+ ptr += copy_from_mem(ptr, hip_est_accum);
653
646
  }
654
647
  if (has_window) {
655
648
  compressed.window_data.resize(compressed.window_data_words);
@@ -688,6 +681,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
688
681
  std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
689
682
  }
690
683
 
684
+ /*
685
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
686
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
687
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
688
+ * worst-case space for the preamble, which is added by the function.
689
+ */
690
+ static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
691
+ static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
692
+ 24, // lg_k = 4
693
+ 36, // lg_k = 5
694
+ 56, // lg_k = 6
695
+ 100, // lg_k = 7
696
+ 180, // lg_k = 8
697
+ 344, // lg_k = 9
698
+ 660, // lg_k = 10
699
+ 1292, // lg_k = 11
700
+ 2540, // lg_k = 12
701
+ 5020, // lg_k = 13
702
+ 9968, // lg_k = 14
703
+ 19836, // lg_k = 15
704
+ 39532, // lg_k = 16
705
+ 78880, // lg_k = 17
706
+ 157516, // lg_k = 18
707
+ 314656 // lg_k = 19
708
+ };
709
+ static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
710
+ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
711
+
712
+ template<typename A>
713
+ size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
714
+ check_lg_k(lg_k);
715
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
716
+ const uint32_t k = 1 << lg_k;
717
+ return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
718
+ }
719
+
720
+ template<typename A>
721
+ void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
722
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
723
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
724
+ }
725
+ }
726
+
691
727
  template<typename A>
692
728
  uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
693
729
  return num_coupons;
@@ -696,7 +732,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
696
732
  template<typename A>
697
733
  bool cpc_sketch_alloc<A>::validate() const {
698
734
  vector_u64<A> bit_matrix = build_bit_matrix();
699
- const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
735
+ const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
700
736
  return num_bits_set == num_coupons;
701
737
  }
702
738
 
@@ -744,7 +780,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
744
780
 
745
781
  template<typename A>
746
782
  typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
747
- const uint64_t k = 1 << lg_k;
783
+ const uint32_t k = 1 << lg_k;
748
784
  const uint64_t c2 = c << 1;
749
785
  const uint64_t c8 = c << 3;
750
786
  const uint64_t c32 = c << 5;
@@ -757,15 +793,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
757
793
 
758
794
  template<typename A>
759
795
  uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
760
- const uint64_t k = 1 << lg_k;
796
+ const uint32_t k = 1 << lg_k;
761
797
  const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
762
798
  if (tmp < 0) return 0;
763
- return tmp >> (lg_k + 3); // tmp / 8K
799
+ return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
764
800
  }
765
801
 
766
802
  template<typename A>
767
803
  vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
768
- const size_t k = 1 << lg_k;
804
+ const uint32_t k = 1 << lg_k;
769
805
  if (window_offset > 56) throw std::logic_error("offset > 56");
770
806
 
771
807
  // Fill the matrix with default rows in which the "early zone" is filled with ones.
@@ -782,12 +818,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
782
818
  }
783
819
 
784
820
  const uint32_t* slots = surprising_value_table.get_slots();
785
- const size_t num_slots = 1 << surprising_value_table.get_lg_size();
821
+ const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
786
822
  for (size_t i = 0; i < num_slots; i++) {
787
823
  const uint32_t row_col = slots[i];
788
824
  if (row_col != UINT32_MAX) {
789
825
  const uint8_t col = row_col & 63;
790
- const size_t row = row_col >> 6;
826
+ const uint32_t row = row_col >> 6;
791
827
  // Flip the specified matrix bit from its default value.
792
828
  // In the "early" zone the bit changes from 1 to 0.
793
829
  // In the "late" zone the bit changes from 0 to 1.
@@ -799,8 +835,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
799
835
 
800
836
  template<typename A>
801
837
  void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
802
- os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
803
- os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
838
+ write(os, kxp);
839
+ write(os, hip_est_accum);
804
840
  }
805
841
 
806
842
  template<typename A>
@@ -45,7 +45,7 @@ public:
45
45
  * @param lg_k base 2 logarithm of the number of bins in the sketch
46
46
  * @param seed for hash function
47
47
  */
48
- explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
48
+ explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
49
49
 
50
50
  cpc_union_alloc(const cpc_union_alloc<A>& other);
51
51
  cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
@@ -34,7 +34,7 @@ bit_matrix(allocator)
34
34
  if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
35
35
  throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
36
36
  }
37
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
37
+ accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
38
38
  }
39
39
 
40
40
  template<typename A>
@@ -45,7 +45,7 @@ accumulator(other.accumulator),
45
45
  bit_matrix(other.bit_matrix)
46
46
  {
47
47
  if (accumulator != nullptr) {
48
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
48
+ accumulator = new (AllocCpc(accumulator->get_allocator()).allocate(1)) cpc_sketch_alloc<A>(*other.accumulator);
49
49
  }
50
50
  }
51
51
 
@@ -62,8 +62,9 @@ bit_matrix(std::move(other.bit_matrix))
62
62
  template<typename A>
63
63
  cpc_union_alloc<A>::~cpc_union_alloc() {
64
64
  if (accumulator != nullptr) {
65
+ AllocCpc allocator(accumulator->get_allocator());
65
66
  accumulator->~cpc_sketch_alloc<A>();
66
- AllocCpc().deallocate(accumulator, 1);
67
+ allocator.deallocate(accumulator, 1);
67
68
  }
68
69
  }
69
70
 
@@ -181,7 +182,7 @@ template<typename A>
181
182
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
182
183
  if (lg_k != accumulator->get_lg_k()) throw std::logic_error("lg_k != accumulator->lg_k");
183
184
  if (accumulator->get_num_coupons() == 0) {
184
- return cpc_sketch_alloc<A>(lg_k, seed);
185
+ return cpc_sketch_alloc<A>(lg_k, seed, accumulator->get_allocator());
185
186
  }
186
187
  if (accumulator->determine_flavor() != cpc_sketch_alloc<A>::flavor::SPARSE) throw std::logic_error("wrong flavor");
187
188
  cpc_sketch_alloc<A> copy(*accumulator);
@@ -191,8 +192,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
191
192
 
192
193
  template<typename A>
193
194
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
194
- const uint64_t k = 1 << lg_k;
195
- const uint64_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
195
+ const uint32_t k = 1 << lg_k;
196
+ const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
196
197
 
197
198
  const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
198
199
  if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
@@ -215,7 +216,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
215
216
 
216
217
  // The snowplow effect was caused by processing the rows in order,
217
218
  // but we have fixed it by using a sufficiently large hash table.
218
- for (unsigned i = 0; i < k; i++) {
219
+ for (uint32_t i = 0; i < k; i++) {
219
220
  uint64_t pattern = bit_matrix[i];
220
221
  sliding_window[i] = (pattern >> offset) & 0xff;
221
222
  pattern &= mask_for_clearing_window;
@@ -242,25 +243,26 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
242
243
  template<typename A>
243
244
  void cpc_union_alloc<A>::switch_to_bit_matrix() {
244
245
  bit_matrix = accumulator->build_bit_matrix();
246
+ AllocCpc allocator(accumulator->get_allocator());
245
247
  accumulator->~cpc_sketch_alloc<A>();
246
- AllocCpc().deallocate(accumulator, 1);
248
+ allocator.deallocate(accumulator, 1);
247
249
  accumulator = nullptr;
248
250
  }
249
251
 
250
252
  template<typename A>
251
253
  void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
252
254
  const uint32_t* slots = table.get_slots();
253
- const size_t num_slots = 1 << table.get_lg_size();
255
+ const uint32_t num_slots = 1 << table.get_lg_size();
254
256
  const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
255
257
 
256
258
  // Using a golden ratio stride fixes the snowplow effect.
257
259
  const double golden = 0.6180339887498949025;
258
- size_t stride = static_cast<size_t>(golden * static_cast<double>(num_slots));
260
+ uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
259
261
  if (stride < 2) throw std::logic_error("stride < 2");
260
262
  if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
261
263
  if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
262
264
 
263
- for (size_t i = 0, j = 0; i < num_slots; i++, j += stride) {
265
+ for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
264
266
  j &= num_slots - 1;
265
267
  const uint32_t row_col = slots[j];
266
268
  if (row_col != UINT32_MAX) {
@@ -272,13 +274,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
272
274
  template<typename A>
273
275
  void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
274
276
  const uint32_t* slots = table.get_slots();
275
- const size_t num_slots = 1 << table.get_lg_size();
277
+ const uint32_t num_slots = 1 << table.get_lg_size();
276
278
  const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
277
- for (size_t i = 0; i < num_slots; i++) {
279
+ for (uint32_t i = 0; i < num_slots; i++) {
278
280
  const uint32_t row_col = slots[i];
279
281
  if (row_col != UINT32_MAX) {
280
282
  const uint8_t col = row_col & 63;
281
- const size_t row = row_col >> 6;
283
+ const uint32_t row = row_col >> 6;
282
284
  bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
283
285
  }
284
286
  }
@@ -288,8 +290,8 @@ template<typename A>
288
290
  void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
289
291
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
290
292
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
291
- const size_t src_k = 1 << src_lg_k;
292
- for (size_t src_row = 0; src_row < src_k; src_row++) {
293
+ const uint32_t src_k = 1 << src_lg_k;
294
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
293
295
  bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
294
296
  }
295
297
  }
@@ -298,8 +300,8 @@ template<typename A>
298
300
  void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
299
301
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
300
302
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
301
- const size_t src_k = 1 << src_lg_k;
302
- for (size_t src_row = 0; src_row < src_k; src_row++) {
303
+ const uint32_t src_k = 1 << src_lg_k;
304
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
303
305
  bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
304
306
  }
305
307
  }
@@ -313,7 +315,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
313
315
  if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
314
316
  vector_u64<A> old_matrix = std::move(bit_matrix);
315
317
  const uint8_t old_lg_k = lg_k;
316
- const size_t new_k = 1 << new_lg_k;
318
+ const uint32_t new_k = 1 << new_lg_k;
317
319
  bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
318
320
  lg_k = new_lg_k;
319
321
  or_matrix_into_matrix(old_matrix, old_lg_k);
@@ -324,7 +326,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
324
326
  if (bit_matrix.size() > 0) throw std::logic_error("bit_matrix is not expected");
325
327
  if (!accumulator->is_empty()) {
326
328
  cpc_sketch_alloc<A> old_accumulator(*accumulator);
327
- *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed);
329
+ *accumulator = cpc_sketch_alloc<A>(new_lg_k, seed, old_accumulator.get_allocator());
328
330
  walk_table_updating_sketch(old_accumulator.surprising_value_table);
329
331
  }
330
332
  lg_k = new_lg_k;