datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -191,8 +191,8 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_accumulator() const {
191
191
 
192
192
  template<typename A>
193
193
  cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
194
- const uint64_t k = 1 << lg_k;
195
- const uint64_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
194
+ const uint32_t k = 1 << lg_k;
195
+ const uint32_t num_coupons = count_bits_set_in_matrix(bit_matrix.data(), k);
196
196
 
197
197
  const auto flavor = cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons);
198
198
  if (flavor != cpc_sketch_alloc<A>::flavor::HYBRID && flavor != cpc_sketch_alloc<A>::flavor::PINNED
@@ -215,7 +215,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
215
215
 
216
216
  // The snowplow effect was caused by processing the rows in order,
217
217
  // but we have fixed it by using a sufficiently large hash table.
218
- for (unsigned i = 0; i < k; i++) {
218
+ for (uint32_t i = 0; i < k; i++) {
219
219
  uint64_t pattern = bit_matrix[i];
220
220
  sliding_window[i] = (pattern >> offset) & 0xff;
221
221
  pattern &= mask_for_clearing_window;
@@ -250,17 +250,17 @@ void cpc_union_alloc<A>::switch_to_bit_matrix() {
250
250
  template<typename A>
251
251
  void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
252
252
  const uint32_t* slots = table.get_slots();
253
- const size_t num_slots = 1 << table.get_lg_size();
253
+ const uint32_t num_slots = 1 << table.get_lg_size();
254
254
  const uint64_t dst_mask = (((1 << accumulator->get_lg_k()) - 1) << 6) | 63; // downsamples when dst lgK < src LgK
255
255
 
256
256
  // Using a golden ratio stride fixes the snowplow effect.
257
257
  const double golden = 0.6180339887498949025;
258
- size_t stride = static_cast<size_t>(golden * static_cast<double>(num_slots));
258
+ uint32_t stride = static_cast<uint32_t>(golden * static_cast<double>(num_slots));
259
259
  if (stride < 2) throw std::logic_error("stride < 2");
260
260
  if (stride == ((stride >> 1) << 1)) stride += 1; // force the stride to be odd
261
261
  if (stride < 3 || stride >= num_slots) throw std::out_of_range("stride out of range");
262
262
 
263
- for (size_t i = 0, j = 0; i < num_slots; i++, j += stride) {
263
+ for (uint32_t i = 0, j = 0; i < num_slots; i++, j += stride) {
264
264
  j &= num_slots - 1;
265
265
  const uint32_t row_col = slots[j];
266
266
  if (row_col != UINT32_MAX) {
@@ -272,13 +272,13 @@ void cpc_union_alloc<A>::walk_table_updating_sketch(const u32_table<A>& table) {
272
272
  template<typename A>
273
273
  void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
274
274
  const uint32_t* slots = table.get_slots();
275
- const size_t num_slots = 1 << table.get_lg_size();
275
+ const uint32_t num_slots = 1 << table.get_lg_size();
276
276
  const uint64_t dest_mask = (1 << lg_k) - 1; // downsamples when dst lgK < sr LgK
277
- for (size_t i = 0; i < num_slots; i++) {
277
+ for (uint32_t i = 0; i < num_slots; i++) {
278
278
  const uint32_t row_col = slots[i];
279
279
  if (row_col != UINT32_MAX) {
280
280
  const uint8_t col = row_col & 63;
281
- const size_t row = row_col >> 6;
281
+ const uint32_t row = row_col >> 6;
282
282
  bit_matrix[row & dest_mask] |= static_cast<uint64_t>(1) << col; // set the bit
283
283
  }
284
284
  }
@@ -288,8 +288,8 @@ template<typename A>
288
288
  void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
289
289
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
290
290
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
291
- const size_t src_k = 1 << src_lg_k;
292
- for (size_t src_row = 0; src_row < src_k; src_row++) {
291
+ const uint32_t src_k = 1 << src_lg_k;
292
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
293
293
  bit_matrix[src_row & dst_mask] |= static_cast<uint64_t>(sliding_window[src_row]) << offset;
294
294
  }
295
295
  }
@@ -298,8 +298,8 @@ template<typename A>
298
298
  void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
299
299
  if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
300
300
  const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
301
- const size_t src_k = 1 << src_lg_k;
302
- for (size_t src_row = 0; src_row < src_k; src_row++) {
301
+ const uint32_t src_k = 1 << src_lg_k;
302
+ for (uint32_t src_row = 0; src_row < src_k; src_row++) {
303
303
  bit_matrix[src_row & dst_mask] |= src_matrix[src_row];
304
304
  }
305
305
  }
@@ -313,7 +313,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
313
313
  if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
314
314
  vector_u64<A> old_matrix = std::move(bit_matrix);
315
315
  const uint8_t old_lg_k = lg_k;
316
- const size_t new_k = 1 << new_lg_k;
316
+ const uint32_t new_k = 1 << new_lg_k;
317
317
  bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
318
318
  lg_k = new_lg_k;
319
319
  or_matrix_into_matrix(old_matrix, old_lg_k);
@@ -31,9 +31,9 @@ static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
31
31
  else return quotient + 1;
32
32
  }
33
33
 
34
- static inline uint64_t long_floor_log2_of_long(uint64_t x) {
35
- if (x < 1) throw std::invalid_argument("long_floor_log2_of_long: bad argument");
36
- uint64_t p = 0;
34
+ static inline uint8_t floor_log2_of_long(uint64_t x) {
35
+ if (x < 1) throw std::invalid_argument("floor_log2_of_long: bad argument");
36
+ uint8_t p = 0;
37
37
  uint64_t y = 1;
38
38
  while (true) {
39
39
  if (y == x) return p;
@@ -69,7 +69,7 @@ static inline uint64_t wegner_count_bits_set_in_matrix(const uint64_t* array, si
69
69
  // Note: this is an adaptation of the Java code,
70
70
  // which is apparently a variation of Figure 5-2 in "Hacker's Delight"
71
71
  // by Henry S. Warren.
72
- static inline uint64_t warren_bit_count(uint64_t i) {
72
+ static inline uint32_t warren_bit_count(uint64_t i) {
73
73
  i = i - ((i >> 1) & 0x5555555555555555ULL);
74
74
  i = (i & 0x3333333333333333ULL) + ((i >> 2) & 0x3333333333333333ULL);
75
75
  i = (i + (i >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
@@ -79,9 +79,9 @@ static inline uint64_t warren_bit_count(uint64_t i) {
79
79
  return i & 0x7f;
80
80
  }
81
81
 
82
- static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, size_t length) {
83
- uint64_t count = 0;
84
- for (size_t i = 0; i < length; i++) {
82
+ static inline uint32_t warren_count_bits_set_in_matrix(const uint64_t* array, uint32_t length) {
83
+ uint32_t count = 0;
84
+ for (uint32_t i = 0; i < length; i++) {
85
85
  count += warren_bit_count(array[i]);
86
86
  }
87
87
  return count;
@@ -91,13 +91,13 @@ static inline uint64_t warren_count_bits_set_in_matrix(const uint64_t* array, si
91
91
 
92
92
  #define CSA(h,l,a,b,c) {uint64_t u = a ^ b; uint64_t v = c; h = (a & b) | (u & v); l = u ^ v;}
93
93
 
94
- static inline uint64_t count_bits_set_in_matrix(const uint64_t* a, size_t length) {
94
+ static inline uint32_t count_bits_set_in_matrix(const uint64_t* a, uint32_t length) {
95
95
  if ((length & 0x7) != 0) throw std::invalid_argument("the length of the array must be a multiple of 8");
96
- uint64_t total = 0;
96
+ uint32_t total = 0;
97
97
  uint64_t ones, twos, twos_a, twos_b, fours, fours_a, fours_b, eights;
98
98
  fours = twos = ones = 0;
99
99
 
100
- for (size_t i = 0; i <= length - 8; i = i + 8) {
100
+ for (uint32_t i = 0; i <= length - 8; i += 8) {
101
101
  CSA(twos_a, ones, ones, a[i+0], a[i+1]);
102
102
  CSA(twos_b, ones, ones, a[i+2], a[i+3]);
103
103
  CSA(fours_a, twos, twos, twos_a, twos_b);
@@ -245,12 +245,12 @@ static inline double icon_exponential_approximation(double k, double c) {
245
245
  return (0.7940236163830469 * k * pow(2.0, c / k));
246
246
  }
247
247
 
248
- static inline double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
248
+ static inline double compute_icon_estimate(uint8_t lg_k, uint32_t c) {
249
249
  if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
250
250
  if (c < 2) return ((c == 0) ? 0.0 : 1.0);
251
- const size_t k = 1 << lg_k;
252
- const double double_k = k;
253
- const double double_c = c;
251
+ const uint32_t k = 1 << lg_k;
252
+ const double double_k = static_cast<double>(k);
253
+ const double double_c = static_cast<double>(c);
254
254
  // Differing thresholds ensure that the approximated estimator is monotonically increasing.
255
255
  const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
256
256
  if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
@@ -29,11 +29,11 @@
29
29
 
30
30
  namespace datasketches {
31
31
 
32
- static const uint64_t U32_TABLE_UPSIZE_NUMER = 3LL;
33
- static const uint64_t U32_TABLE_UPSIZE_DENOM = 4LL;
32
+ static const uint32_t U32_TABLE_UPSIZE_NUMER = 3LL;
33
+ static const uint32_t U32_TABLE_UPSIZE_DENOM = 4LL;
34
34
 
35
- static const uint64_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
36
- static const uint64_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
35
+ static const uint32_t U32_TABLE_DOWNSIZE_NUMER = 1LL;
36
+ static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
37
37
 
38
38
  template<typename A>
39
39
  class u32_table {
@@ -42,7 +42,7 @@ public:
42
42
  u32_table(const A& allocator);
43
43
  u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
44
44
 
45
- inline size_t get_num_items() const;
45
+ inline uint32_t get_num_items() const;
46
46
  inline const uint32_t* get_slots() const;
47
47
  inline uint8_t get_lg_size() const;
48
48
  inline void clear();
@@ -52,7 +52,7 @@ public:
52
52
  // returns true iff the item was present and was therefore removed from the table
53
53
  inline bool maybe_delete(uint32_t item);
54
54
 
55
- static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator);
55
+ static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
56
56
 
57
57
  vector_u32<A> unwrapping_get_items() const;
58
58
 
@@ -69,10 +69,10 @@ private:
69
69
 
70
70
  uint8_t lg_size; // log2 of number of slots
71
71
  uint8_t num_valid_bits;
72
- size_t num_items;
72
+ uint32_t num_items;
73
73
  vector_u32<A> slots;
74
74
 
75
- inline size_t lookup(uint32_t item) const;
75
+ inline uint32_t lookup(uint32_t item) const;
76
76
  inline void must_insert(uint32_t item);
77
77
  inline void rebuild(uint8_t new_lg_size);
78
78
  };
@@ -41,14 +41,14 @@ u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& alloca
41
41
  lg_size(lg_size),
42
42
  num_valid_bits(num_valid_bits),
43
43
  num_items(0),
44
- slots(1 << lg_size, UINT32_MAX, allocator)
44
+ slots(1ULL << lg_size, UINT32_MAX, allocator)
45
45
  {
46
46
  if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
47
47
  if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
48
48
  }
49
49
 
50
50
  template<typename A>
51
- size_t u32_table<A>::get_num_items() const {
51
+ uint32_t u32_table<A>::get_num_items() const {
52
52
  return num_items;
53
53
  }
54
54
 
@@ -70,7 +70,7 @@ void u32_table<A>::clear() {
70
70
 
71
71
  template<typename A>
72
72
  bool u32_table<A>::maybe_insert(uint32_t item) {
73
- const size_t index = lookup(item);
73
+ const uint32_t index = lookup(item);
74
74
  if (slots[index] == item) return false;
75
75
  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
76
76
  slots[index] = item;
@@ -83,7 +83,7 @@ bool u32_table<A>::maybe_insert(uint32_t item) {
83
83
 
84
84
  template<typename A>
85
85
  bool u32_table<A>::maybe_delete(uint32_t item) {
86
- const size_t index = lookup(item);
86
+ const uint32_t index = lookup(item);
87
87
  if (slots[index] == UINT32_MAX) return false;
88
88
  if (slots[index] != item) throw std::logic_error("item does not exist");
89
89
  if (num_items == 0) throw std::logic_error("delete error");
@@ -110,7 +110,7 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
110
110
 
111
111
  // this one is specifically tailored to be a part of fm85 decompression scheme
112
112
  template<typename A>
113
- u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator) {
113
+ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator) {
114
114
  uint8_t lg_num_slots = 2;
115
115
  while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
116
116
  u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
@@ -124,11 +124,11 @@ u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pai
124
124
  }
125
125
 
126
126
  template<typename A>
127
- size_t u32_table<A>::lookup(uint32_t item) const {
128
- const size_t size = 1 << lg_size;
129
- const size_t mask = size - 1;
127
+ uint32_t u32_table<A>::lookup(uint32_t item) const {
128
+ const uint32_t size = 1 << lg_size;
129
+ const uint32_t mask = size - 1;
130
130
  const uint8_t shift = num_valid_bits - lg_size;
131
- size_t probe = item >> shift;
131
+ uint32_t probe = item >> shift;
132
132
  if (probe > mask) throw std::logic_error("probe out of range");
133
133
  while (slots[probe] != item && slots[probe] != UINT32_MAX) {
134
134
  probe = (probe + 1) & mask;
@@ -139,7 +139,7 @@ size_t u32_table<A>::lookup(uint32_t item) const {
139
139
  // counts and resizing must be handled by the caller
140
140
  template<typename A>
141
141
  void u32_table<A>::must_insert(uint32_t item) {
142
- const size_t index = lookup(item);
142
+ const uint32_t index = lookup(item);
143
143
  if (slots[index] == item) throw std::logic_error("item exists");
144
144
  if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
145
145
  slots[index] = item;
@@ -148,13 +148,13 @@ void u32_table<A>::must_insert(uint32_t item) {
148
148
  template<typename A>
149
149
  void u32_table<A>::rebuild(uint8_t new_lg_size) {
150
150
  if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
151
- const size_t old_size = 1 << lg_size;
152
- const size_t new_size = 1 << new_lg_size;
151
+ const uint32_t old_size = 1 << lg_size;
152
+ const uint32_t new_size = 1 << new_lg_size;
153
153
  if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
154
154
  vector_u32<A> old_slots = std::move(slots);
155
155
  slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
156
156
  lg_size = new_lg_size;
157
- for (size_t i = 0; i < old_size; i++) {
157
+ for (uint32_t i = 0; i < old_size; i++) {
158
158
  if (old_slots[i] != UINT32_MAX) {
159
159
  must_insert(old_slots[i]);
160
160
  }
@@ -170,7 +170,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
170
170
  template<typename A>
171
171
  vector_u32<A> u32_table<A>::unwrapping_get_items() const {
172
172
  if (num_items == 0) return vector_u32<A>(slots.get_allocator());
173
- const size_t table_size = 1 << lg_size;
173
+ const uint32_t table_size = 1 << lg_size;
174
174
  vector_u32<A> result(num_items, 0, slots.get_allocator());
175
175
  size_t i = 0;
176
176
  size_t l = 0;
@@ -27,38 +27,38 @@ namespace datasketches {
27
27
  typedef u32_table<std::allocator<void>> table;
28
28
 
29
29
  TEST_CASE("cpc sketch: compress and decompress pairs", "[cpc_sketch]") {
30
- const int N = 200;
31
- const int MAXWORDS = 1000;
30
+ const size_t N = 200;
31
+ const size_t MAXWORDS = 1000;
32
32
 
33
33
  HashState twoHashes;
34
34
  uint32_t pairArray[N];
35
35
  uint32_t pairArray2[N];
36
36
  uint64_t value = 35538947; // some arbitrary starting value
37
37
  const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio
38
- for (int i = 0; i < N; i++) {
38
+ for (size_t i = 0; i < N; i++) {
39
39
  MurmurHash3_x64_128(&value, sizeof(value), 0, twoHashes);
40
40
  uint32_t rand = twoHashes.h1 & 0xffff;
41
41
  pairArray[i] = rand;
42
42
  value += golden64;
43
43
  }
44
44
  //table::knuth_shell_sort3(pairArray, 0, N - 1); // unsigned numerical sort
45
- std::sort(pairArray, &pairArray[N]);
45
+ std::sort(pairArray, pairArray + N);
46
46
  uint32_t prev = UINT32_MAX;
47
- int nxt = 0;
48
- for (int i = 0; i < N; i++) { // uniquify
47
+ uint32_t nxt = 0;
48
+ for (size_t i = 0; i < N; i++) { // uniquify
49
49
  if (pairArray[i] != prev) {
50
50
  prev = pairArray[i];
51
51
  pairArray[nxt++] = pairArray[i];
52
52
  }
53
53
  }
54
- int numPairs = nxt;
54
+ uint32_t numPairs = nxt;
55
55
 
56
56
  uint32_t compressedWords[MAXWORDS];
57
57
 
58
- for (size_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
59
- size_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
58
+ for (uint8_t numBaseBits = 0; numBaseBits <= 11; numBaseBits++) {
59
+ uint32_t numWordsWritten = get_compressor<std::allocator<void>>().low_level_compress_pairs(pairArray, numPairs, numBaseBits, compressedWords);
60
60
  get_compressor<std::allocator<void>>().low_level_uncompress_pairs(pairArray2, numPairs, numBaseBits, compressedWords, numWordsWritten);
61
- for (int i = 0; i < numPairs; i++) {
61
+ for (size_t i = 0; i < numPairs; i++) {
62
62
  REQUIRE(pairArray[i] == pairArray2[i]);
63
63
  }
64
64
  }
@@ -283,6 +283,26 @@ TEST_CASE("cpc sketch: serialize deserialize sliding, bytes", "[cpc_sketch]") {
283
283
  REQUIRE(deserialized.validate());
284
284
  }
285
285
 
286
+ TEST_CASE("cpc sketch: serialize deserialize sliding huge", "[cpc_sketch]") {
287
+ cpc_sketch sketch(26);
288
+ const int n = 10000000;
289
+ for (int i = 0; i < n; i++) sketch.update(i);
290
+ REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.001));
291
+ auto bytes = sketch.serialize();
292
+ cpc_sketch deserialized = cpc_sketch::deserialize(bytes.data(), bytes.size());
293
+ REQUIRE(deserialized.is_empty() == sketch.is_empty());
294
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
295
+ REQUIRE(deserialized.validate());
296
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 7), std::out_of_range);
297
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), 15), std::out_of_range);
298
+ REQUIRE_THROWS_AS(cpc_sketch::deserialize(bytes.data(), bytes.size() - 1), std::out_of_range);
299
+
300
+ // updating again with the same values should not change the sketch
301
+ for (int i = 0; i < n; i++) deserialized.update(i);
302
+ REQUIRE(deserialized.get_estimate() == sketch.get_estimate());
303
+ REQUIRE(deserialized.validate());
304
+ }
305
+
286
306
  TEST_CASE("cpc sketch: copy", "[cpc_sketch]") {
287
307
  cpc_sketch s1(11);
288
308
  s1.update(1);
@@ -378,4 +398,9 @@ TEST_CASE("cpc sketch: update string equivalence", "[cpc_sketch]") {
378
398
  REQUIRE(sketch.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));
379
399
  }
380
400
 
401
+ TEST_CASE("cpc sketch: max serialized size", "[cpc_sketch]") {
402
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(4) == 24 + 40);
403
+ REQUIRE(cpc_sketch::get_max_serialized_size_bytes(26) == static_cast<size_t>((0.6 * (1 << 26)) + 40));
404
+ }
405
+
381
406
  } /* namespace datasketches */
@@ -81,7 +81,7 @@ TEST_CASE("cpc union: large", "[cpc_union]") {
81
81
  cpc_union u(11);
82
82
  for (int i = 0; i < 1000; i++) {
83
83
  cpc_sketch tmp(11);
84
- for (int i = 0; i < 10000; i++) {
84
+ for (int j = 0; j < 10000; j++) {
85
85
  s.update(key);
86
86
  tmp.update(key);
87
87
  key++;
@@ -65,7 +65,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
65
65
  void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
66
66
  if (other.is_empty()) return;
67
67
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
68
- for (auto &it: other.map) {
68
+ for (auto it: other.map) {
69
69
  update(it.first, it.second);
70
70
  }
71
71
  offset += other.offset;
@@ -76,7 +76,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
76
76
  void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
77
77
  if (other.is_empty()) return;
78
78
  const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
79
- for (auto &it: other.map) {
79
+ for (auto it: other.map) {
80
80
  update(std::move(it.first), it.second);
81
81
  }
82
82
  offset += other.offset;
@@ -147,7 +147,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
147
147
  typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
148
148
  frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
149
149
  vector_row items(map.get_allocator());
150
- for (auto &it: map) {
150
+ for (auto it: map) {
151
151
  const W lb = it.second;
152
152
  const W ub = it.second + offset;
153
153
  if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
@@ -162,28 +162,28 @@ frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error
162
162
  template<typename T, typename W, typename H, typename E, typename S, typename A>
163
163
  void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
164
164
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
165
- os.write((char*)&preamble_longs, sizeof(preamble_longs));
165
+ write(os, preamble_longs);
166
166
  const uint8_t serial_version = SERIAL_VERSION;
167
- os.write((char*)&serial_version, sizeof(serial_version));
167
+ write(os, serial_version);
168
168
  const uint8_t family = FAMILY_ID;
169
- os.write((char*)&family, sizeof(family));
169
+ write(os, family);
170
170
  const uint8_t lg_max_size = map.get_lg_max_size();
171
- os.write((char*)&lg_max_size, sizeof(lg_max_size));
171
+ write(os, lg_max_size);
172
172
  const uint8_t lg_cur_size = map.get_lg_cur_size();
173
- os.write((char*)&lg_cur_size, sizeof(lg_cur_size));
173
+ write(os, lg_cur_size);
174
174
  const uint8_t flags_byte(
175
175
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
176
176
  );
177
- os.write((char*)&flags_byte, sizeof(flags_byte));
177
+ write(os, flags_byte);
178
178
  const uint16_t unused16 = 0;
179
- os.write((char*)&unused16, sizeof(unused16));
179
+ write(os, unused16);
180
180
  if (!is_empty()) {
181
181
  const uint32_t num_items = map.get_num_active();
182
- os.write((char*)&num_items, sizeof(num_items));
182
+ write(os, num_items);
183
183
  const uint32_t unused32 = 0;
184
- os.write((char*)&unused32, sizeof(unused32));
185
- os.write((char*)&total_weight, sizeof(total_weight));
186
- os.write((char*)&offset, sizeof(offset));
184
+ write(os, unused32);
185
+ write(os, total_weight);
186
+ write(os, offset);
187
187
 
188
188
  // copy active items and their weights to use batch serialization
189
189
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
@@ -192,14 +192,14 @@ void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const
192
192
  A alloc(map.get_allocator());
193
193
  T* items = alloc.allocate(num_items);
194
194
  uint32_t i = 0;
195
- for (auto &it: map) {
195
+ for (auto it: map) {
196
196
  new (&items[i]) T(it.first);
197
197
  weights[i++] = it.second;
198
198
  }
199
- os.write((char*)weights, sizeof(W) * num_items);
199
+ write(os, weights, sizeof(W) * num_items);
200
200
  aw.deallocate(weights, num_items);
201
201
  S().serialize(os, items, num_items);
202
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
202
+ for (i = 0; i < num_items; i++) items[i].~T();
203
203
  alloc.deallocate(items, num_items);
204
204
  }
205
205
  }
@@ -208,7 +208,7 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
208
208
  size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
209
209
  if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
210
210
  size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
211
- for (auto &it: map) size += S().size_of_item(it.first);
211
+ for (auto it: map) size += S().size_of_item(it.first);
212
212
  return size;
213
213
  }
214
214
 
@@ -220,28 +220,26 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
220
220
  uint8_t* end_ptr = ptr + size;
221
221
 
222
222
  const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
223
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
223
+ ptr += copy_to_mem(preamble_longs, ptr);
224
224
  const uint8_t serial_version = SERIAL_VERSION;
225
- ptr += copy_to_mem(&serial_version, ptr, sizeof(uint8_t));
225
+ ptr += copy_to_mem(serial_version, ptr);
226
226
  const uint8_t family = FAMILY_ID;
227
- ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
227
+ ptr += copy_to_mem(family, ptr);
228
228
  const uint8_t lg_max_size = map.get_lg_max_size();
229
- ptr += copy_to_mem(&lg_max_size, ptr, sizeof(uint8_t));
229
+ ptr += copy_to_mem(lg_max_size, ptr);
230
230
  const uint8_t lg_cur_size = map.get_lg_cur_size();
231
- ptr += copy_to_mem(&lg_cur_size, ptr, sizeof(uint8_t));
231
+ ptr += copy_to_mem(lg_cur_size, ptr);
232
232
  const uint8_t flags_byte(
233
233
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
234
234
  );
235
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(uint8_t));
236
- const uint16_t unused16 = 0;
237
- ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
235
+ ptr += copy_to_mem(flags_byte, ptr);
236
+ ptr += sizeof(uint16_t); // unused
238
237
  if (!is_empty()) {
239
238
  const uint32_t num_items = map.get_num_active();
240
- ptr += copy_to_mem(&num_items, ptr, sizeof(uint32_t));
241
- const uint32_t unused32 = 0;
242
- ptr += copy_to_mem(&unused32, ptr, sizeof(uint32_t));
243
- ptr += copy_to_mem(&total_weight, ptr, sizeof(total_weight));
244
- ptr += copy_to_mem(&offset, ptr, sizeof(offset));
239
+ ptr += copy_to_mem(num_items, ptr);
240
+ ptr += sizeof(uint32_t); // unused
241
+ ptr += copy_to_mem(total_weight, ptr);
242
+ ptr += copy_to_mem(offset, ptr);
245
243
 
246
244
  // copy active items and their weights to use batch serialization
247
245
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
@@ -250,7 +248,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
250
248
  A alloc(map.get_allocator());
251
249
  T* items = alloc.allocate(num_items);
252
250
  uint32_t i = 0;
253
- for (auto &it: map) {
251
+ for (auto it: map) {
254
252
  new (&items[i]) T(it.first);
255
253
  weights[i++] = it.second;
256
254
  }
@@ -258,7 +256,7 @@ auto frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_byt
258
256
  aw.deallocate(weights, num_items);
259
257
  const size_t bytes_remaining = end_ptr - ptr;
260
258
  ptr += S().serialize(ptr, bytes_remaining, items, num_items);
261
- for (unsigned i = 0; i < num_items; i++) items[i].~T();
259
+ for (i = 0; i < num_items; i++) items[i].~T();
262
260
  alloc.deallocate(items, num_items);
263
261
  }
264
262
  return bytes;
@@ -268,38 +266,31 @@ template<typename T, typename W, typename H, typename E, typename S, typename A>
268
266
  class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
269
267
  public:
270
268
  items_deleter(uint32_t num, bool destroy, const A& allocator):
271
- allocator(allocator), num(num), destroy(destroy) {}
272
- void set_destroy(bool destroy) { this->destroy = destroy; }
269
+ allocator_(allocator), num_(num), destroy_(destroy) {}
270
+ void set_destroy(bool destroy) { destroy_ = destroy; }
273
271
  void operator() (T* ptr) {
274
272
  if (ptr != nullptr) {
275
- if (destroy) {
276
- for (uint32_t i = 0; i < num; ++i) ptr[i].~T();
273
+ if (destroy_) {
274
+ for (uint32_t i = 0; i < num_; ++i) ptr[i].~T();
277
275
  }
278
- allocator.deallocate(ptr, num);
276
+ allocator_.deallocate(ptr, num_);
279
277
  }
280
278
  }
281
279
  private:
282
- A allocator;
283
- uint32_t num;
284
- bool destroy;
280
+ A allocator_;
281
+ uint32_t num_;
282
+ bool destroy_;
285
283
  };
286
284
 
287
285
  template<typename T, typename W, typename H, typename E, typename S, typename A>
288
286
  frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is, const A& allocator) {
289
- uint8_t preamble_longs;
290
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
291
- uint8_t serial_version;
292
- is.read((char*)&serial_version, sizeof(serial_version));
293
- uint8_t family_id;
294
- is.read((char*)&family_id, sizeof(family_id));
295
- uint8_t lg_max_size;
296
- is.read((char*)&lg_max_size, sizeof(lg_max_size));
297
- uint8_t lg_cur_size;
298
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
299
- uint8_t flags_byte;
300
- is.read((char*)&flags_byte, sizeof(flags_byte));
301
- uint16_t unused16;
302
- is.read((char*)&unused16, sizeof(unused16));
287
+ const auto preamble_longs = read<uint8_t>(is);
288
+ const auto serial_version = read<uint8_t>(is);
289
+ const auto family_id = read<uint8_t>(is);
290
+ const auto lg_max_size = read<uint8_t>(is);
291
+ const auto lg_cur_size = read<uint8_t>(is);
292
+ const auto flags_byte = read<uint8_t>(is);
293
+ read<uint16_t>(is); // unused
303
294
 
304
295
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
305
296
 
@@ -310,19 +301,15 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
310
301
 
311
302
  frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
312
303
  if (!is_empty) {
313
- uint32_t num_items;
314
- is.read((char*)&num_items, sizeof(num_items));
315
- uint32_t unused32;
316
- is.read((char*)&unused32, sizeof(unused32));
317
- W total_weight;
318
- is.read((char*)&total_weight, sizeof(total_weight));
319
- W offset;
320
- is.read((char*)&offset, sizeof(offset));
304
+ const auto num_items = read<uint32_t>(is);
305
+ read<uint32_t>(is); // unused
306
+ const auto total_weight = read<W>(is);
307
+ const auto offset = read<W>(is);
321
308
 
322
309
  // batch deserialization with intermediate array of items and weights
323
310
  using AllocW = typename std::allocator_traits<A>::template rebind_alloc<W>;
324
311
  std::vector<W, AllocW> weights(num_items, 0, allocator);
325
- is.read((char*)weights.data(), sizeof(W) * num_items);
312
+ read(is, weights.data(), sizeof(W) * num_items);
326
313
  A alloc(allocator);
327
314
  std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(num_items, false, alloc));
328
315
  S().deserialize(is, items.get(), num_items);
@@ -344,19 +331,18 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
344
331
  const char* ptr = static_cast<const char*>(bytes);
345
332
  const char* base = static_cast<const char*>(bytes);
346
333
  uint8_t preamble_longs;
347
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(uint8_t));
334
+ ptr += copy_from_mem(ptr, preamble_longs);
348
335
  uint8_t serial_version;
349
- ptr += copy_from_mem(ptr, &serial_version, sizeof(uint8_t));
336
+ ptr += copy_from_mem(ptr, serial_version);
350
337
  uint8_t family_id;
351
- ptr += copy_from_mem(ptr, &family_id, sizeof(uint8_t));
338
+ ptr += copy_from_mem(ptr, family_id);
352
339
  uint8_t lg_max_size;
353
- ptr += copy_from_mem(ptr, &lg_max_size, sizeof(uint8_t));
340
+ ptr += copy_from_mem(ptr, lg_max_size);
354
341
  uint8_t lg_cur_size;
355
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(uint8_t));
342
+ ptr += copy_from_mem(ptr, lg_cur_size);
356
343
  uint8_t flags_byte;
357
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(uint8_t));
358
- uint16_t unused16;
359
- ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
344
+ ptr += copy_from_mem(ptr, flags_byte);
345
+ ptr += sizeof(uint16_t); // unused
360
346
 
361
347
  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
362
348
 
@@ -364,18 +350,17 @@ frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>:
364
350
  check_serial_version(serial_version);
365
351
  check_family_id(family_id);
366
352
  check_size(lg_cur_size, lg_max_size);
367
- ensure_minimum_memory(size, 1 << preamble_longs);
353
+ ensure_minimum_memory(size, 1ULL << preamble_longs);
368
354
 
369
355
  frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size, allocator);
370
356
  if (!is_empty) {
371
357
  uint32_t num_items;
372
- ptr += copy_from_mem(ptr, &num_items, sizeof(uint32_t));
373
- uint32_t unused32;
374
- ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
358
+ ptr += copy_from_mem(ptr, num_items);
359
+ ptr += sizeof(uint32_t); // unused
375
360
  W total_weight;
376
- ptr += copy_from_mem(ptr, &total_weight, sizeof(total_weight));
361
+ ptr += copy_from_mem(ptr, total_weight);
377
362
  W offset;
378
- ptr += copy_from_mem(ptr, &offset, sizeof(offset));
363
+ ptr += copy_from_mem(ptr, offset);
379
364
 
380
365
  ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
381
366
  // batch deserialization with intermediate array of items and weights
@@ -446,14 +431,14 @@ string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) c
446
431
  os << "### End sketch summary" << std::endl;
447
432
  if (print_items) {
448
433
  vector_row items;
449
- for (auto &it: map) {
434
+ for (auto it: map) {
450
435
  items.push_back(row(&it.first, it.second, offset));
451
436
  }
452
437
  // sort by estimate in descending order
453
438
  std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
454
439
  os << "### Items in descending order by estimate" << std::endl;
455
440
  os << " item, estimate, lower bound, upper bound" << std::endl;
456
- for (auto &it: items) {
441
+ for (auto it: items) {
457
442
  os << " " << it.get_item() << ", " << it.get_estimate() << ", "
458
443
  << it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
459
444
  }