datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -49,12 +49,12 @@ cpc_compressor<A>::~cpc_compressor() {
49
49
  }
50
50
 
51
51
  template<typename A>
52
- uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, int length) {
52
+ uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, unsigned length) {
53
53
  uint8_t* inverse = new uint8_t[length]; // use new for global initialization
54
- for (int i = 0; i < length; i++) {
55
- inverse[permu[i]] = i;
54
+ for (unsigned i = 0; i < length; i++) {
55
+ inverse[permu[i]] = static_cast<uint8_t>(i);
56
56
  }
57
- for (int i = 0; i < length; i++) {
57
+ for (unsigned i = 0; i < length; i++) {
58
58
  if (permu[inverse[i]] != i) throw std::logic_error("inverse permutation error");
59
59
  }
60
60
  return inverse;
@@ -64,17 +64,17 @@ uint8_t* cpc_compressor<A>::make_inverse_permutation(const uint8_t* permu, int l
64
64
  of length at most 12, this builds a size-4096 decoding table */
65
65
  // The second argument is typically 256, but can be other values such as 65.
66
66
  template<typename A>
67
- uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table, int num_byte_values) {
67
+ uint16_t* cpc_compressor<A>::make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values) {
68
68
  uint16_t* decoding_table = new uint16_t[4096]; // use new for global initialization
69
- for (int byte_value = 0; byte_value < num_byte_values; byte_value++) {
70
- const int encoding_entry = encoding_table[byte_value];
71
- const int code_value = encoding_entry & 0xfff;
72
- const int code_length = encoding_entry >> 12;
73
- const int decoding_entry = (code_length << 8) | byte_value;
74
- const int garbage_length = 12 - code_length;
75
- const int num_copies = 1 << garbage_length;
76
- for (int garbage_bits = 0; garbage_bits < num_copies; garbage_bits++) {
77
- const int extended_code_value = code_value | (garbage_bits << code_length);
69
+ for (unsigned byte_value = 0; byte_value < num_byte_values; byte_value++) {
70
+ const uint16_t encoding_entry = encoding_table[byte_value];
71
+ const uint16_t code_value = encoding_entry & 0xfff;
72
+ const uint8_t code_length = encoding_entry >> 12;
73
+ const uint16_t decoding_entry = static_cast<uint16_t>((code_length << 8) | byte_value);
74
+ const uint8_t garbage_length = 12 - code_length;
75
+ const uint32_t num_copies = 1 << garbage_length;
76
+ for (uint32_t garbage_bits = 0; garbage_bits < num_copies; garbage_bits++) {
77
+ const uint16_t extended_code_value = static_cast<uint16_t>(code_value | (garbage_bits << code_length));
78
78
  decoding_table[extended_code_value & 0xfff] = decoding_entry;
79
79
  }
80
80
  }
@@ -157,7 +157,7 @@ void cpc_compressor<A>::compress(const cpc_sketch_alloc<A>& source, compressed_s
157
157
  }
158
158
 
159
159
  template<typename A>
160
- void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
160
+ void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
161
161
  switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
162
162
  case cpc_sketch_alloc<A>::flavor::EMPTY:
163
163
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
@@ -202,16 +202,17 @@ template<typename A>
202
202
  void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
203
203
  if (source.sliding_window.size() == 0) throw std::logic_error("no sliding window");
204
204
  if (source.window_offset != 0) throw std::logic_error("window_offset != 0");
205
- const size_t k = 1 << source.get_lg_k();
205
+ const uint32_t k = 1 << source.get_lg_k();
206
206
  vector_u32<A> pairs_from_table = source.surprising_value_table.unwrapping_get_items();
207
- if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
208
- const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
207
+ const uint32_t num_pairs_from_table = static_cast<uint32_t>(pairs_from_table.size());
208
+ if (num_pairs_from_table > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, num_pairs_from_table);
209
+ const uint32_t num_pairs_from_window = source.get_num_coupons() - num_pairs_from_table; // because the window offset is zero
209
210
 
210
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
211
+ vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
211
212
 
212
213
  u32_table<A>::merge(
213
214
  pairs_from_table.data(), 0, pairs_from_table.size(),
214
- all_pairs.data(), pairs_from_table.size(), num_pairs_from_window,
215
+ all_pairs.data(), num_pairs_from_table, num_pairs_from_window,
215
216
  all_pairs.data(), 0
216
217
  ); // note the overlapping subarray trick
217
218
 
@@ -228,15 +229,15 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
228
229
  // In the hybrid flavor, some of these pairs actually
229
230
  // belong in the window, so we will separate them out,
230
231
  // moving the "true" pairs to the bottom of the array.
231
- const size_t k = 1 << lg_k;
232
+ const uint32_t k = 1 << lg_k;
232
233
  target.window.resize(k, 0); // important: zero the memory
233
- size_t next_true_pair = 0;
234
- for (size_t i = 0; i < source.table_num_entries; i++) {
234
+ uint32_t next_true_pair = 0;
235
+ for (uint32_t i = 0; i < source.table_num_entries; i++) {
235
236
  const uint32_t row_col = pairs[i];
236
237
  if (row_col == UINT32_MAX) throw std::logic_error("empty marker is not expected");
237
238
  const uint8_t col = row_col & 63;
238
239
  if (col < 8) {
239
- const size_t row = row_col >> 6;
240
+ const uint32_t row = row_col >> 6;
240
241
  target.window[row] |= 1 << col; // set the window bit
241
242
  } else {
242
243
  pairs[next_true_pair++] = row_col; // move true pair down
@@ -270,7 +271,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
270
271
  uint8_t lg_k, uint32_t num_coupons) const {
271
272
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
272
273
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
273
- const size_t num_pairs = source.table_num_entries;
274
+ const uint32_t num_pairs = source.table_num_entries;
274
275
  if (num_pairs == 0) {
275
276
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
276
277
  } else {
@@ -278,7 +279,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
278
279
  vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
279
280
  lg_k, source.table_data.get_allocator());
280
281
  // undo the compressor's 8-column shift
281
- for (size_t i = 0; i < num_pairs; i++) {
282
+ for (uint32_t i = 0; i < num_pairs; i++) {
282
283
  if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
283
284
  pairs[i] += 8;
284
285
  }
@@ -302,7 +303,7 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
302
303
 
303
304
  for (size_t i = 0; i < pairs.size(); i++) {
304
305
  const uint32_t row_col = pairs[i];
305
- const size_t row = row_col >> 6;
306
+ const uint32_t row = row_col >> 6;
306
307
  uint8_t col = row_col & 63;
307
308
  // first rotate the columns into a canonical configuration: new = ((old - (offset+8)) + 64) mod 64
308
309
  col = (col + 56 - offset) & 63;
@@ -322,7 +323,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
322
323
  uint8_t lg_k, uint32_t num_coupons) const {
323
324
  if (source.window_data.size() == 0) throw std::logic_error("window is expected");
324
325
  uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
325
- const size_t num_pairs = source.table_num_entries;
326
+ const uint32_t num_pairs = source.table_num_entries;
326
327
  if (num_pairs == 0) {
327
328
  target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
328
329
  } else {
@@ -337,9 +338,9 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
337
338
  uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
338
339
  if (offset > 56) throw std::out_of_range("offset out of range");
339
340
 
340
- for (size_t i = 0; i < num_pairs; i++) {
341
+ for (uint32_t i = 0; i < num_pairs; i++) {
341
342
  const uint32_t row_col = pairs[i];
342
- const size_t row = row_col >> 6;
343
+ const uint32_t row = row_col >> 6;
343
344
  uint8_t col = row_col & 63;
344
345
  // first undo the permutation
345
346
  col = permutation[col];
@@ -354,25 +355,26 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
354
355
 
355
356
  template<typename A>
356
357
  void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const {
357
- const size_t k = 1 << lg_k;
358
- const uint64_t num_base_bits = golomb_choose_number_of_base_bits(k + pairs.size(), pairs.size());
359
- const uint64_t table_len = safe_length_for_compressed_pair_buf(k, pairs.size(), num_base_bits);
358
+ const uint32_t k = 1 << lg_k;
359
+ const uint32_t num_pairs = static_cast<uint32_t>(pairs.size());
360
+ const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
361
+ const uint64_t table_len = safe_length_for_compressed_pair_buf(k, num_pairs, num_base_bits);
360
362
  result.table_data.resize(table_len);
361
363
 
362
- size_t csv_length = low_level_compress_pairs(pairs.data(), pairs.size(), num_base_bits, result.table_data.data());
364
+ uint32_t csv_length = low_level_compress_pairs(pairs.data(), static_cast<uint32_t>(pairs.size()), num_base_bits, result.table_data.data());
363
365
 
364
366
  // At this point we could free the unused portion of the compression output buffer,
365
367
  // but it is not necessary if it is temporary
366
368
  // Note: realloc caused strange timing spikes for lgK = 11 and 12.
367
369
 
368
370
  result.table_data_words = csv_length;
369
- result.table_num_entries = pairs.size();
371
+ result.table_num_entries = num_pairs;
370
372
  }
371
373
 
372
374
  template<typename A>
373
- vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
375
+ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
374
376
  uint8_t lg_k, const A& allocator) const {
375
- const size_t k = 1 << lg_k;
377
+ const uint32_t k = 1 << lg_k;
376
378
  vector_u32<A> pairs(num_pairs, 0, allocator);
377
379
  const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
378
380
  low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
@@ -381,7 +383,7 @@ vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* da
381
383
 
382
384
  template<typename A>
383
385
  void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const {
384
- const size_t k = 1 << lg_k;
386
+ const uint32_t k = 1 << lg_k;
385
387
  const size_t window_buf_len = safe_length_for_compressed_window_buf(k);
386
388
  target.window_data.resize(window_buf_len);
387
389
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -391,20 +393,20 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
391
393
  // but it is not necessary if it is temporary
392
394
  // Note: realloc caused strange timing spikes for lgK = 11 and 12.
393
395
 
394
- target.window_data_words = data_words;
396
+ target.window_data_words = static_cast<uint32_t>(data_words);
395
397
  }
396
398
 
397
399
  template<typename A>
398
- void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
400
+ void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window,
399
401
  uint8_t lg_k, uint32_t num_coupons) const {
400
- const size_t k = 1 << lg_k;
402
+ const uint32_t k = 1 << lg_k;
401
403
  window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
402
404
  const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
403
405
  low_level_uncompress_bytes(window.data(), k, decoding_tables_for_high_entropy_byte[pseudo_phase], data, data_words);
404
406
  }
405
407
 
406
408
  template<typename A>
407
- size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits) {
409
+ size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits) {
408
410
  // Long ybits = k + numPairs; // simpler and safer UB
409
411
  // The following tighter UB on ybits is based on page 198
410
412
  // of the textbook "Managing Gigabytes" by Witten, Moffat, and Bell.
@@ -422,14 +424,14 @@ size_t cpc_compressor<A>::safe_length_for_compressed_pair_buf(uint64_t k, size_t
422
424
  // So the 12-bit lookahead is the tight constraint, but there are at least (2 + B) bits emitted,
423
425
  // so we would be safe with max (0, 10 - B) bits of padding at the end of the bitstream.
424
426
  template<typename A>
425
- size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(uint64_t k) { // measured in 32-bit words
427
+ size_t cpc_compressor<A>::safe_length_for_compressed_window_buf(uint32_t k) { // measured in 32-bit words
426
428
  const size_t bits = 12 * k + 11; // 11 bits of padding, due to 12-bit lookahead, with 1 bit certainly present.
427
429
  return divide_longs_rounding_up(bits, 32);
428
430
  }
429
431
 
430
432
  template<typename A>
431
- uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint64_t c) {
432
- const size_t k = 1 << lg_k;
433
+ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint32_t c) {
434
+ const uint32_t k = 1 << lg_k;
433
435
  // This mid-range logic produces pseudo-phases. They are used to select encoding tables.
434
436
  // The thresholds were chosen by hand after looking at plots of measured compression.
435
437
  if (1000 * c < 2375 * k) {
@@ -450,7 +452,7 @@ uint8_t cpc_compressor<A>::determine_pseudo_phase(uint8_t lg_k, uint64_t c) {
450
452
  }
451
453
  }
452
454
 
453
- static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr, size_t& wordindex) {
455
+ static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32_t* wordarr, uint32_t& wordindex) {
454
456
  if (bufbits >= 32) {
455
457
  wordarr[wordindex++] = bitbuf & 0xffffffff;
456
458
  bitbuf = bitbuf >> 32;
@@ -458,7 +460,7 @@ static inline void maybe_flush_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, uint32
458
460
  }
459
461
  }
460
462
 
461
- static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr, size_t& wordindex, uint8_t minbits) {
463
+ static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const uint32_t* wordarr, uint32_t& wordindex, uint8_t minbits) {
462
464
  if (bufbits < minbits) {
463
465
  bitbuf |= static_cast<uint64_t>(wordarr[wordindex++]) << bufbits;
464
466
  bufbits += 32;
@@ -468,20 +470,20 @@ static inline void maybe_fill_bitbuf(uint64_t& bitbuf, uint8_t& bufbits, const u
468
470
  // This returns the number of compressed words that were actually used.
469
471
  // It is the caller's responsibility to ensure that the compressed_words array is long enough.
470
472
  template<typename A>
471
- size_t cpc_compressor<A>::low_level_compress_bytes(
473
+ uint32_t cpc_compressor<A>::low_level_compress_bytes(
472
474
  const uint8_t* byte_array, // input
473
- size_t num_bytes_to_encode,
475
+ uint32_t num_bytes_to_encode,
474
476
  const uint16_t* encoding_table,
475
477
  uint32_t* compressed_words // output
476
478
  ) const {
477
479
  uint64_t bitbuf = 0; // bits are packed into this first, then are flushed to compressed_words
478
480
  uint8_t bufbits = 0; // number of bits currently in bitbuf; must be between 0 and 31
479
- size_t next_word_index = 0;
481
+ uint32_t next_word_index = 0;
480
482
 
481
- for (size_t byte_index = 0; byte_index < num_bytes_to_encode; byte_index++) {
482
- const uint64_t code_info = encoding_table[byte_array[byte_index]];
483
+ for (uint32_t byte_index = 0; byte_index < num_bytes_to_encode; byte_index++) {
484
+ const uint16_t code_info = encoding_table[byte_array[byte_index]];
483
485
  const uint64_t code_val = code_info & 0xfff;
484
- const int code_len = code_info >> 12;
486
+ const uint8_t code_len = code_info >> 12;
485
487
  bitbuf |= (code_val << bufbits);
486
488
  bufbits += code_len;
487
489
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
@@ -502,12 +504,12 @@ size_t cpc_compressor<A>::low_level_compress_bytes(
502
504
  template<typename A>
503
505
  void cpc_compressor<A>::low_level_uncompress_bytes(
504
506
  uint8_t* byte_array, // output
505
- size_t num_bytes_to_decode,
507
+ uint32_t num_bytes_to_decode,
506
508
  const uint16_t* decoding_table,
507
509
  const uint32_t* compressed_words, // input
508
- size_t num_compressed_words
510
+ uint32_t num_compressed_words
509
511
  ) const {
510
- size_t word_index = 0;
512
+ uint32_t word_index = 0;
511
513
  uint64_t bitbuf = 0;
512
514
  uint8_t bufbits = 0;
513
515
 
@@ -515,7 +517,7 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
515
517
  if (decoding_table == nullptr) throw std::logic_error("decoding_table == NULL");
516
518
  if (compressed_words == nullptr) throw std::logic_error("compressed_words == NULL");
517
519
 
518
- for (size_t byte_index = 0; byte_index < num_bytes_to_decode; byte_index++) {
520
+ for (uint32_t byte_index = 0; byte_index < num_bytes_to_decode; byte_index++) {
519
521
  maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
520
522
 
521
523
  const size_t peek12 = bitbuf & 0xfff; // These 12 bits will include an entire Huffman codeword.
@@ -533,14 +535,14 @@ void cpc_compressor<A>::low_level_uncompress_bytes(
533
535
 
534
536
  static inline uint64_t read_unary(
535
537
  const uint32_t* compressed_words,
536
- size_t& next_word_index,
538
+ uint32_t& next_word_index,
537
539
  uint64_t& bitbuf,
538
540
  uint8_t& bufbits
539
541
  );
540
542
 
541
543
  static inline void write_unary(
542
544
  uint32_t* compressed_words,
543
- size_t& next_word_index_ptr,
545
+ uint32_t& next_word_index_ptr,
544
546
  uint64_t& bit_buf_ptr,
545
547
  uint8_t& buf_bits_ptr,
546
548
  uint64_t value
@@ -551,38 +553,38 @@ static inline void write_unary(
551
553
 
552
554
  // returns the number of compressed_words actually used
553
555
  template<typename A>
554
- size_t cpc_compressor<A>::low_level_compress_pairs(
556
+ uint32_t cpc_compressor<A>::low_level_compress_pairs(
555
557
  const uint32_t* pair_array, // input
556
- size_t num_pairs_to_encode,
557
- size_t num_base_bits,
558
+ uint32_t num_pairs_to_encode,
559
+ uint8_t num_base_bits,
558
560
  uint32_t* compressed_words // output
559
561
  ) const {
560
562
  uint64_t bitbuf = 0;
561
563
  uint8_t bufbits = 0;
562
- size_t next_word_index = 0;
564
+ uint32_t next_word_index = 0;
563
565
  const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
564
- uint64_t predicted_row_index = 0;
565
- uint16_t predicted_col_index = 0;
566
+ uint32_t predicted_row_index = 0;
567
+ uint8_t predicted_col_index = 0;
566
568
 
567
- for (size_t pair_index = 0; pair_index < num_pairs_to_encode; pair_index++) {
569
+ for (uint32_t pair_index = 0; pair_index < num_pairs_to_encode; pair_index++) {
568
570
  const uint32_t row_col = pair_array[pair_index];
569
- const uint64_t row_index = row_col >> 6;
570
- const uint16_t col_index = row_col & 63;
571
+ const uint32_t row_index = row_col >> 6;
572
+ const uint8_t col_index = row_col & 63;
571
573
 
572
574
  if (row_index != predicted_row_index) predicted_col_index = 0;
573
575
 
574
576
  if (row_index < predicted_row_index) throw std::logic_error("row_index < predicted_row_index");
575
577
  if (col_index < predicted_col_index) throw std::logic_error("col_index < predicted_col_index");
576
578
 
577
- const uint64_t y_delta = row_index - predicted_row_index;
578
- const uint16_t x_delta = col_index - predicted_col_index;
579
+ const uint32_t y_delta = row_index - predicted_row_index;
580
+ const uint8_t x_delta = col_index - predicted_col_index;
579
581
 
580
582
  predicted_row_index = row_index;
581
583
  predicted_col_index = col_index + 1;
582
584
 
583
- const uint64_t code_info = length_limited_unary_encoding_table65[x_delta];
585
+ const uint16_t code_info = length_limited_unary_encoding_table65[x_delta];
584
586
  const uint64_t code_val = code_info & 0xfff;
585
- const uint8_t code_len = code_info >> 12;
587
+ const uint8_t code_len = static_cast<uint8_t>(code_info >> 12);
586
588
  bitbuf |= code_val << bufbits;
587
589
  bufbits += code_len;
588
590
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
@@ -614,29 +616,29 @@ size_t cpc_compressor<A>::low_level_compress_pairs(
614
616
  template<typename A>
615
617
  void cpc_compressor<A>::low_level_uncompress_pairs(
616
618
  uint32_t* pair_array, // output
617
- size_t num_pairs_to_decode,
618
- size_t num_base_bits,
619
+ uint32_t num_pairs_to_decode,
620
+ uint8_t num_base_bits,
619
621
  const uint32_t* compressed_words, // input
620
- size_t num_compressed_words
622
+ uint32_t num_compressed_words
621
623
  ) const {
622
- size_t word_index = 0;
624
+ uint32_t word_index = 0;
623
625
  uint64_t bitbuf = 0;
624
626
  uint8_t bufbits = 0;
625
627
  const uint64_t golomb_lo_mask = (1 << num_base_bits) - 1;
626
- uint64_t predicted_row_index = 0;
627
- uint16_t predicted_col_index = 0;
628
+ uint32_t predicted_row_index = 0;
629
+ uint8_t predicted_col_index = 0;
628
630
 
629
631
  // for each pair we need to read:
630
632
  // x_delta (12-bit length-limited unary)
631
633
  // y_delta_hi (unary)
632
634
  // y_delta_lo (basebits)
633
635
 
634
- for (size_t pair_index = 0; pair_index < num_pairs_to_decode; pair_index++) {
636
+ for (uint32_t pair_index = 0; pair_index < num_pairs_to_decode; pair_index++) {
635
637
  maybe_fill_bitbuf(bitbuf, bufbits, compressed_words, word_index, 12); // ensure 12 bits in bit buffer
636
638
  const size_t peek12 = bitbuf & 0xfff;
637
639
  const uint16_t lookup = length_limited_unary_decoding_table65[peek12];
638
- const int code_word_length = lookup >> 8;
639
- const int16_t x_delta = lookup & 0xff;
640
+ const uint8_t code_word_length = lookup >> 8;
641
+ const int8_t x_delta = lookup & 0xff;
640
642
  bitbuf >>= code_word_length;
641
643
  bufbits -= code_word_length;
642
644
 
@@ -650,8 +652,8 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
650
652
 
651
653
  // Now that we have x_delta and y_delta, we can compute the pair's row and column
652
654
  if (y_delta > 0) predicted_col_index = 0;
653
- const uint64_t row_index = predicted_row_index + y_delta;
654
- const uint16_t col_index = predicted_col_index + x_delta;
655
+ const uint32_t row_index = static_cast<uint32_t>(predicted_row_index + y_delta);
656
+ const uint8_t col_index = predicted_col_index + x_delta;
655
657
  const uint32_t row_col = (row_index << 6) | col_index;
656
658
  pair_array[pair_index] = row_col;
657
659
  predicted_row_index = row_index;
@@ -662,7 +664,7 @@ void cpc_compressor<A>::low_level_uncompress_pairs(
662
664
 
663
665
  uint64_t read_unary(
664
666
  const uint32_t* compressed_words,
665
- size_t& next_word_index,
667
+ uint32_t& next_word_index,
666
668
  uint64_t& bitbuf,
667
669
  uint8_t& bufbits
668
670
  ) {
@@ -689,7 +691,7 @@ uint64_t read_unary(
689
691
 
690
692
  void write_unary(
691
693
  uint32_t* compressed_words,
692
- size_t& next_word_index,
694
+ uint32_t& next_word_index,
693
695
  uint64_t& bitbuf,
694
696
  uint8_t& bufbits,
695
697
  uint64_t value
@@ -709,9 +711,9 @@ void write_unary(
709
711
 
710
712
  if (remaining > 15) throw std::out_of_range("remaining out of range");
711
713
 
712
- const uint64_t the_unary_code = 1 << remaining;
714
+ const uint64_t the_unary_code = 1ULL << remaining;
713
715
  bitbuf |= the_unary_code << bufbits;
714
- bufbits += 1 + remaining;
716
+ bufbits += static_cast<uint8_t>(remaining + 1);
715
717
  maybe_flush_bitbuf(bitbuf, bufbits, compressed_words, next_word_index);
716
718
  }
717
719
 
@@ -738,12 +740,12 @@ vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* win
738
740
  // returns an integer that is between
739
741
  // zero and ceiling(log_2(k)) - 1, inclusive
740
742
  template<typename A>
741
- uint64_t cpc_compressor<A>::golomb_choose_number_of_base_bits(uint64_t k, uint64_t count) {
743
+ uint8_t cpc_compressor<A>::golomb_choose_number_of_base_bits(uint32_t k, uint64_t count) {
742
744
  if (k < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: k < 1");
743
745
  if (count < 1) throw std::invalid_argument("golomb_choose_number_of_base_bits: count < 1");
744
746
  const uint64_t quotient = (k - count) / count; // integer division
745
747
  if (quotient == 0) return 0;
746
- else return long_floor_log2_of_long(quotient);
748
+ else return floor_log2_of_long(quotient);
747
749
  }
748
750
 
749
751
  } /* namespace datasketches */