datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -49,12 +49,12 @@ cpc_compressor<A>::~cpc_compressor() {
49
49
  }
50
50
 
51
51
  template<typename A>
52
- uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, int length) {
52
+ uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, unsigned length) {
53
53
  uint8_t* inverse = new uint8_t[length]; // use new for global initialization
54
- for (int i = 0; i < length; i++) {
55
- inverse[permu[i]] = i;
54
+ for (unsigned i = 0; i < length; i++) {
55
+ inverse[permu[i]] = static_cast<uint8_t>(i);
56
56
  }
57
- for (int i = 0; i < length; i++) {
57
+ for (unsigned i = 0; i < length; i++) {
58
58
  if (permu[inverse[i]] != i) throw std::logic_error("inverse permutation error");
59
59
  }
60
60
  return inverse;
@@ -64,17 +64,17 @@ uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, int l
64
64
  of length at most 12, this builds a size-4096 decoding table */
65
65
  // The second argument is typically 256, but can be other values such as 65.
66
66
  template<typename A>
67
- uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table, int num_byte_values) {
67
+ uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values) {
68
68
  uint16_t* decoding_table = new uint16_t[4096]; // use new for global initialization
69
- for (int byte_value = 0; byte_value < num_byte_values; byte_value++) {
70
- const int encoding_entry = encoding_table[byte_value];
71
- const int code_value = encoding_entry & 0xfff;
72
- const int code_length = encoding_entry >> 12;
73
- const int decoding_entry = (code_length << 8) | byte_value;
74
- const int garbage_length = 12 - code_length;
75
- const int num_copies = 1 << garbage_length;
76
- for (int garbage_bits = 0; garbage_bits < num_copies; garbage_bits++) {
77
- const int extended_code_value = code_value | (garbage_bits << code_length);
69
+ for (unsigned byte_value = 0; byte_value < num_byte_values; byte_value++) {
70
+ const uint16_t encoding_entry = encoding_table[byte_value];
71
+ const uint16_t code_value = encoding_entry & 0xfff;
72
+ const uint8_t code_length = encoding_entry >> 12;
73
+ const uint16_t decoding_entry = static_cast<uint16_t>((code_length << 8) | byte_value);
74
+ const uint8_t garbage_length = 12 - code_length;
75
+ const uint32_t num_copies = 1 << garbage_length;
76
+ for (uint32_t garbage_bits = 0; garbage_bits < num_copies; garbage_bits++) {
77
+ const uint16_t extended_code_value = static_cast<uint16_t>(code_value | (garbage_bits << code_length));
78
78
  decoding_table[extended_code_value & 0xfff] = decoding_entry;
79
79
  }
80
80
  }
@@ -157,7 +157,7 @@ void cpc_compressor<A>::compress(const cpc_sketch_alloc<A>& source, compressed_s
157
157
  }
158
158
 
159
159
  template<typename A>
160
- void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
160
+ void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
161
161
  switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
162
162
  case cpc_sketch_alloc<A>::flavor::EMPTY:
163
163
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
@@ -202,16 +202,17 @@ template<typename A>
202
202
  void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
203
203
  if (source.sliding_window.size() == 0) throw std::logic_error("no sliding window");
204
204
  if (source.window_offset != 0) throw std::logic_error("window_offset != 0");
205
- const size_t k = 1 << source.get_lg_k();
205
+ const uint32_t k = 1 << source.get_lg_k();
206
206
  vector_u32<A> pairs_from_table = source.surprising_value_table.unwrapping_get_items();
207
- if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
208
- const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
207
+ const uint32_t num_pairs_from_table = static_cast<uint32_t>(pairs_from_table.size());
208
+ if (num_pairs_from_table > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, num_pairs_from_table);
209
+ const uint32_t num_pairs_from_window = source.get_num_coupons() - num_pairs_from_table; // because the window offset is zero
209
210
 
210
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
211
+ vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
211
212
 
212
213
  u32_table<A>::merge(
213
214
  pairs_from_table.data(), 0, pairs_from_table.size(),
214
- all_pairs.data(), pairs_from_table.size(), num_pairs_from_window,
215
+ all_pairs.data(), num_pairs_from_table, num_pairs_from_window,
215
216
  all_pairs.data(), 0
216
217
  ); // note the overlapping subarray trick
217
218
 
@@ -228,15 +229,15 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
228
229
  // In the hybrid flavor, some of these pairs actually
229
230
  // belong in the window, so we will separate them out,
230
231
  // moving the "true" pairs to the bottom of the array.
231
- const size_t k = 1 << lg_k;
232
+ const uint32_t k = 1 << lg_k;
232
233
  target.window.resize(k, 0); // important: zero the memory
233
- size_t next_true_pair = 0;
234
- for (size_t i = 0; i < source.table_num_entries; i++) {
234
+ uint32_t next_true_pair = 0;
235
+ for (uint32_t i = 0; i < source.table_num_entries; i++) {
235
236
  const uint32_t row_col = pairs[i];
236
237
  if (row_col == UINT32_MAX) throw std::logic_error("empty marker is not expected");
237
238
  const uint8_t col = row_col & 63;
238
239
  if (col < 8) {
239
- const size_t row = row_col >> 6;
240
+ const uint32_t row = row_col >> 6;
240
241
  target.window[row] |= 1 << col; // set the window bit
241
242
  } else {
242
243
  pairs[next_true_pair++] = row_col; // move true pair down
@@ -270,7 +271,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
270
271
  uint8_t lg_k, uint32_t num_coupons) const {
271
272
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
272
273
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
273
- const size_t num_pairs = source.table_num_entries;
274
+ const uint32_t num_pairs = source.table_num_entries;
274
275
  if (num_pairs == 0) {
275
276
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
276
277
  } else {
@@ -278,7 +279,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
278
279
  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
279
280
  lg_k, source.table_data.get_allocator());
280
281
  // undo the compressor's 8-column shift
281
- for (size_t i = 0; i < num_pairs; i++) {
282
+ for (uint32_t i = 0; i < num_pairs; i++) {
282
283
  if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
283
284
  pairs[i] += 8;
284
285
  }
@@ -302,7 +303,7 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
302
303
 
303
304
  for (size_t i = 0; i < pairs.size(); i++) {
304
305
  const uint32_t row_col = pairs[i];
305
- const size_t row = row_col >> 6;
306
+ const uint32_t row = row_col >> 6;
306
307
  uint8_t col = row_col & 63;
307
308
  // first rotate the columns into a canonical configuration: new = ((old - (offset+8)) + 64) mod 64
308
309
  col = (col + 56 - offset) & 63;
@@ -322,7 +323,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
322
323
  uint8_t lg_k, uint32_t num_coupons) const {
323
324
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
324
325
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
325
- const size_t num_pairs = source.table_num_entries;
326
+ const uint32_t num_pairs = source.table_num_entries;
326
327
  if (num_pairs == 0) {
327
328
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
328
329
  } else {
@@ -337,9 +338,9 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
337
338
  uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
338
339
  if (offset > 56) throw std::out_of_range("offset out of range");
339
340
 
340
- for (size_t i = 0; i < num_pairs; i++) {
341
+ for (uint32_t i = 0; i < num_pairs; i++) {
341
342
  const uint32_t row_col = pairs[i];
342
- const size_t row = row_col >> 6;
343
+ const uint32_t row = row_col >> 6;
343
344
  uint8_t col = row_col & 63;
344
345
  // first undo the permutation
345
346
  col = permutation[col];
@@ -354,25 +355,26 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
354
355
 
355
356
  template<typename A>
356
357
  void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const {
357
- const size_t k = 1 << lg_k;
358
- const uint64_t num_base_bits = golomb_choose_number_of_base_bits(k + pairs.size(), pairs.size());
359
- const uint64_t table_len = safe_length_for_compressed_pair_buf(k, pairs.size(), num_base_bits);
358
+ const uint32_t k = 1 << lg_k;
359
+ const uint32_t num_pairs = static_cast<uint32_t>(pairs.size());
360
+ const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
361
+ const uint64_t table_len = safe_length_for_compressed_pair_buf(k, num_pairs, num_base_bits);
360
362
  result.table_data.resize(table_len);
361
363
 
362
- size_t csv_length = low_level_compress_pairs(pairs.data(), pairs.size(), num_base_bits, result.table_data.data());
364
+ uint32_t csv_length = low_level_compress_pairs(pairs.data(), static_cast<uint32_t>(pairs.size()), num_base_bits, result.table_data.data());
363
365
 
364
366
  // At this point we could free the unused portion of the compression output buffer,
365
367
  // but it is not necessary if it is temporary
366
368
  // Note: realloc caused strange timing spikes for lgK = 11 and 12.
367
369
 
368
370
  result.table_data_words = csv_length;
369
- result.table_num_entries = pairs.size();
371
+ result.table_num_entries = num_pairs;
370
372
  }
371
373
 
372
374
  template<typename A>
373
- vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
375
+ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
374
376
  uint8_t lg_k, const A& allocator) const {
375
- const size_t k = 1 << lg_k;
377
+ const uint32_t k = 1 << lg_k;
376
378
  vector_u32<A> pairs(num_pairs, 0, allocator);
377
379
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
378
380
  low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
@@ -381,7 +383,7 @@ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* da
381
383
 
382
384
  template<typename A>
383
385
  void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const {
384
- const size_t k = 1 << lg_k;
386
+ const uint32_t k = 1 << lg_k;
385
387
  const size_t window_buf_len = safe_length_for_compressed_window_buf(k);
386
388
  target.window_data.resize(window_buf_len);
387
389
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -391,20 +393,20 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
391
393
  // but it is not necessary if it is temporary
392
394
  // Note: realloc caused strange timing spikes for lgK = 11 and 12.
393
395
 
394
- target.window_data_words = data_words;
396
+ target.window_data_words = static_cast<uint32_t>(data_words);
395
397
  }
396
398
 
397
399
  template<typename A>
398
- void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
400
+ void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window,
399
401
  uint8_t lg_k, uint32_t num_coupons) const {
400
- const size_t k = 1 << lg_k;
402
+ const uint32_t k = 1 << lg_k;
401
403
  window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
402
404
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
403
405
  low_level_uncompress_bytes(window.data(), k, decoding_tables_for_high_entropy_byte[pseudo_phase], data, data_words);
404
406
  }
405
407
 
406
408
  template<typename A>
407
- size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits) {
409
+ size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits) {
408
410
  // Long ybits = k + numPairs; // simpler and safer UB
409
411
  // The following tighter UB on ybits is based on page 198
410
412
  // of the textbook "Managing Gigabytes" by Witten, Moffat, and Bell.
@@ -422,14 +424,14 @@ size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint64_t k, size_t
422
424
  // So the 12-bit lookahead is the tight constraint, but there are at least (2 + B) bits emitted,
423
425
  // so we would be safe with max (0, 10 - B) bits of padding at the end of the bitstream.
424
426
  template<typename A>
425
- size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(uint64_t k) { // measured in 32-bit words
427
+ size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(uint32_t k) { // measured in 32-bit words
426
428
  const size_t bits = 12 * k + 11; // 11 bits of padding, due to 12-bit lookahead, with 1 bit certainly present.
427
429
  return divide_longs_rounding_up(bits, 32);
428
430
  }
429
431
 
430
432
  template<typename A>
431
- uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint64_t c) {
432
- const size_t k = 1 << lg_k;
433
+ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint32_t c) {
434
+ const uint32_t k = 1 << lg_k;
433
435
  // This mid-range logic produces pseudo-phases. They are used to select encoding tables.
434
436
  // The thresholds were chosen by hand after looking at plots of measured compression.
435
437
  if (1000 * c < 2375 * k) {
@@ -450,7 +452,7 @@ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint64_t c) {
450
452
  }
451
453
  }
452
454
 
453
- static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr, size_t& wordindex) {
455
+ static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr, uint32_t& wordindex) {
454
456
  if (bufbits >= 32) {
455
457
  wordarr[wordindex++] = bitbuf & 0xffffffff;
456
458
  bitbuf = bitbuf >> 32;
@@ -458,7 +460,7 @@ static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32
458
460
  }
459
461
  }
460
462
 
461
- static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr, size_t& wordindex, uint8_t minbits) {
463
+ static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr, uint32_t& wordindex, uint8_t minbits) {
462
464
  if (bufbits < minbits) {
463
465
  bitbuf |= static_cast<uint64_t>(wordarr[wordindex++]) << bufbits;
464
466
  bufbits += 32;
@@ -468,20 +470,20 @@ static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const u
468
470
  // This returns the number of compressed words that were actually used.
469
471
  // It is the caller's responsibility to ensure that the compressed_words array is long enough.
470
472
  template<typename A>
471
- size_t cpc_compressor<A>::low_level_compress_bytes(
473
+ uint32_t cpc_compressor<A>::low_level_compress_bytes(
472
474
  const uint8_t* byte_array, // input
473
- size_t num_bytes_to_encode,
475
+ uint32_t num_bytes_to_encode,
474
476
  const uint16_t* encoding_table,
475
477
  uint32_t* compressed_words // output
476
478
  ) const {
477
479
  uint64_t bitbuf = 0; // bits are packed into this first, then are flushed to compressed_words
478
480
  uint8_t bufbits = 0; // number of bits currently in bitbuf; must be between 0 and 31
479
- size_t next_word_index = 0;
481
+ uint32_t next_word_index = 0;
480
482
 
481
- for (size_t byte_index = 0; byte_index < num_bytes_to_encode; byte_index++) {
482
- const uint64_t code_info = encoding_table[byte_array[byte_index]];
483
+ for (uint32_t byte_index = 0; byte_index < num_bytes_to_encode; byte_index++) {
484
+ const uint16_t code_info = encoding_table[byte_array[byte_index]];
483
485
  const uint64_t code_val = code_info & 0xfff;
484
- const int code_len = code_info >> 12;
486
+ const uint8_t code_len = code_info >> 12;
485
487
  bitbuf |= (code_val << bufbits);
486
488
  bufbits += code_len;
487
489
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
@@ -502,12 +504,12 @@ size_t cpc_compressor<A>::low_level_compress_bytes(
502
504
  template<typename A>
503
505
  void cpc_compressor<A>::low_level_uncompress_bytes(
504
506
  uint8_t* byte_array, // output
505
- size_t num_bytes_to_decode,
507
+ uint32_t num_bytes_to_decode,
506
508
  const uint16_t* decoding_table,
507
509
  const uint32_t* compressed_words, // input
508
- size_t num_compressed_words
510
+ uint32_t num_compressed_words
509
511
  ) const {
510
- size_t word_index = 0;
512
+ uint32_t word_index = 0;
511
513
  uint64_t bitbuf = 0;
512
514
  uint8_t bufbits = 0;
513
515
 
@@ -515,7 +517,7 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
515
517
  if (decoding_table == nullptr) throw std::logic_error("decoding_table == NULL");
516
518
  if (compressed_words == nullptr) throw std::logic_error("compressed_words == NULL");
517
519
 
518
- for (size_t byte_index = 0; byte_index < num_bytes_to_decode; byte_index++) {
520
+ for (uint32_t byte_index = 0; byte_index < num_bytes_to_decode; byte_index++) {
519
521
  maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
520
522
 
521
523
  const size_t peek12 = bitbuf & 0xfff; // These 12 bits will include an entire Huffman codeword.
@@ -533,14 +535,14 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
533
535
 
534
536
  static inline uint64_t read_unary(
535
537
  const uint32_t* compressed_words,
536
- size_t& next_word_index,
538
+ uint32_t& next_word_index,
537
539
  uint64_t& bitbuf,
538
540
  uint8_t& bufbits
539
541
  );
540
542
 
541
543
  static inline void write_unary(
542
544
  uint32_t* compressed_words,
543
- size_t& next_word_index_ptr,
545
+ uint32_t& next_word_index_ptr,
544
546
  uint64_t& bit_buf_ptr,
545
547
  uint8_t& buf_bits_ptr,
546
548
  uint64_t value
@@ -551,38 +553,38 @@ static inline void write_unary(
551
553
 
552
554
  // returns the number of compressed_words actually used
553
555
  template<typename A>
554
- size_t cpc_compressor<A>::low_level_compress_pairs(
556
+ uint32_t cpc_compressor<A>::low_level_compress_pairs(
555
557
  const uint32_t* pair_array, // input
556
- size_t num_pairs_to_encode,
557
- size_t num_base_bits,
558
+ uint32_t num_pairs_to_encode,
559
+ uint8_t num_base_bits,
558
560
  uint32_t* compressed_words // output
559
561
  ) const {
560
562
  uint64_t bitbuf = 0;
561
563
  uint8_t bufbits = 0;
562
- size_t next_word_index = 0;
564
+ uint32_t next_word_index = 0;
563
565
  const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
564
- uint64_t predicted_row_index = 0;
565
- uint16_t predicted_col_index = 0;
566
+ uint32_t predicted_row_index = 0;
567
+ uint8_t predicted_col_index = 0;
566
568
 
567
- for (size_t pair_index = 0; pair_index < num_pairs_to_encode; pair_index++) {
569
+ for (uint32_t pair_index = 0; pair_index < num_pairs_to_encode; pair_index++) {
568
570
  const uint32_t row_col = pair_array[pair_index];
569
- const uint64_t row_index = row_col >> 6;
570
- const uint16_t col_index = row_col & 63;
571
+ const uint32_t row_index = row_col >> 6;
572
+ const uint8_t col_index = row_col & 63;
571
573
 
572
574
  if (row_index != predicted_row_index) predicted_col_index = 0;
573
575
 
574
576
  if (row_index < predicted_row_index) throw std::logic_error("row_index < predicted_row_index");
575
577
  if (col_index < predicted_col_index) throw std::logic_error("col_index < predicted_col_index");
576
578
 
577
- const uint64_t y_delta = row_index - predicted_row_index;
578
- const uint16_t x_delta = col_index - predicted_col_index;
579
+ const uint32_t y_delta = row_index - predicted_row_index;
580
+ const uint8_t x_delta = col_index - predicted_col_index;
579
581
 
580
582
  predicted_row_index = row_index;
581
583
  predicted_col_index = col_index + 1;
582
584
 
583
- const uint64_t code_info = length_limited_unary_encoding_table65[x_delta];
585
+ const uint16_t code_info = length_limited_unary_encoding_table65[x_delta];
584
586
  const uint64_t code_val = code_info & 0xfff;
585
- const uint8_t code_len = code_info >> 12;
587
+ const uint8_t code_len = static_cast<uint8_t>(code_info >> 12);
586
588
  bitbuf |= code_val << bufbits;
587
589
  bufbits += code_len;
588
590
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
@@ -614,29 +616,29 @@ size_t cpc_compressor<A>::low_level_compress_pairs(
614
616
  template<typename A>
615
617
  void cpc_compressor<A>::low_level_uncompress_pairs(
616
618
  uint32_t* pair_array, // output
617
- size_t num_pairs_to_decode,
618
- size_t num_base_bits,
619
+ uint32_t num_pairs_to_decode,
620
+ uint8_t num_base_bits,
619
621
  const uint32_t* compressed_words, // input
620
- size_t num_compressed_words
622
+ uint32_t num_compressed_words
621
623
  ) const {
622
- size_t word_index = 0;
624
+ uint32_t word_index = 0;
623
625
  uint64_t bitbuf = 0;
624
626
  uint8_t bufbits = 0;
625
627
  const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
626
- uint64_t predicted_row_index = 0;
627
- uint16_t predicted_col_index = 0;
628
+ uint32_t predicted_row_index = 0;
629
+ uint8_t predicted_col_index = 0;
628
630
 
629
631
  // for each pair we need to read:
630
632
  // x_delta (12-bit length-limited unary)
631
633
  // y_delta_hi (unary)
632
634
  // y_delta_lo (basebits)
633
635
 
634
- for (size_t pair_index = 0; pair_index < num_pairs_to_decode; pair_index++) {
636
+ for (uint32_t pair_index = 0; pair_index < num_pairs_to_decode; pair_index++) {
635
637
  maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
636
638
  const size_t peek12 = bitbuf & 0xfff;
637
639
  const uint16_t lookup = length_limited_unary_decoding_table65[peek12];
638
- const int code_word_length = lookup >> 8;
639
- const int16_t x_delta = lookup & 0xff;
640
+ const uint8_t code_word_length = lookup >> 8;
641
+ const int8_t x_delta = lookup & 0xff;
640
642
  bitbuf >>= code_word_length;
641
643
  bufbits -= code_word_length;
642
644
 
@@ -650,8 +652,8 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
650
652
 
651
653
  // Now that we have x_delta and y_delta, we can compute the pair's row and column
652
654
  if (y_delta > 0) predicted_col_index = 0;
653
- const uint64_t row_index = predicted_row_index + y_delta;
654
- const uint16_t col_index = predicted_col_index + x_delta;
655
+ const uint32_t row_index = static_cast<uint32_t>(predicted_row_index + y_delta);
656
+ const uint8_t col_index = predicted_col_index + x_delta;
655
657
  const uint32_t row_col = (row_index << 6) | col_index;
656
658
  pair_array[pair_index] = row_col;
657
659
  predicted_row_index = row_index;
@@ -662,7 +664,7 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
662
664
 
663
665
  uint64_t read_unary(
664
666
  const uint32_t* compressed_words,
665
- size_t& next_word_index,
667
+ uint32_t& next_word_index,
666
668
  uint64_t& bitbuf,
667
669
  uint8_t& bufbits
668
670
  ) {
@@ -689,7 +691,7 @@ uint64_t read_unary(
689
691
 
690
692
  void write_unary(
691
693
  uint32_t* compressed_words,
692
- size_t& next_word_index,
694
+ uint32_t& next_word_index,
693
695
  uint64_t& bitbuf,
694
696
  uint8_t& bufbits,
695
697
  uint64_t value
@@ -709,9 +711,9 @@ void write_unary(
709
711
 
710
712
  if (remaining > 15) throw std::out_of_range("remaining out of range");
711
713
 
712
- const uint64_t the_unary_code = 1 << remaining;
714
+ const uint64_t the_unary_code = 1ULL << remaining;
713
715
  bitbuf |= the_unary_code << bufbits;
714
- bufbits += 1 + remaining;
716
+ bufbits += static_cast<uint8_t>(remaining + 1);
715
717
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
716
718
  }
717
719
 
@@ -738,12 +740,12 @@ vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* win
738
740
  // returns an integer that is between
739
741
  // zero and ceiling(log_2(k)) - 1, inclusive
740
742
  template<typename A>
741
- uint64_t cpc_compressor<A>::golomb_choose_number_of_base_bits(uint64_t k, uint64_t count) {
743
+ uint8_t cpc_compressor<A>::golomb_choose_number_of_base_bits(uint32_t k, uint64_t count) {
742
744
  if (k < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: k < 1");
743
745
  if (count < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: count < 1");
744
746
  const uint64_t quotient = (k - count) / count; // integer division
745
747
  if (quotient == 0) return 0;
746
- else return long_floor_log2_of_long(quotient);
748
+ else return floor_log2_of_long(quotient);
747
749
  }
748
750
 
749
751
  } /* namespace datasketches */