datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -192,7 +192,7 @@ public:
192
192
  * @param value pointer to the data
193
193
  * @param size length of the data in bytes
194
194
  */
195
- void update(const void* value, int size);
195
+ void update(const void* value, size_t size);
196
196
 
197
197
  /**
198
198
  * Returns a human-readable summary of this sketch
@@ -235,6 +235,17 @@ public:
235
235
  */
236
236
  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
237
237
 
238
+ /**
239
+ * The actual size of a compressed CPC sketch has a small random variance, but the following
240
+ * empirically measured size should be large enough for at least 99.9 percent of sketches.
241
+ *
242
+ * <p>For small values of <i>n</i> the size can be much smaller.
243
+ *
244
+ * @param lg_k the given value of lg_k.
245
+ * @return the estimated maximum compressed serialized size of a sketch.
246
+ */
247
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
248
+
238
249
  // for internal use
239
250
  uint32_t get_num_coupons() const;
240
251
 
@@ -303,6 +314,8 @@ private:
303
314
  inline void write_hip(std::ostream& os) const;
304
315
  inline size_t copy_hip_to_mem(void* dst) const;
305
316
 
317
+ static void check_lg_k(uint8_t lg_k);
318
+
306
319
  friend cpc_compressor<A>;
307
320
  friend cpc_union_alloc<A>;
308
321
  };
@@ -53,9 +53,7 @@ first_interesting_column(0),
53
53
  kxp(1 << lg_k),
54
54
  hip_est_accum(0)
55
55
  {
56
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
57
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
58
- }
56
+ check_lg_k(lg_k);
59
57
  }
60
58
 
61
59
  template<typename A>
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
176
174
 
177
175
  static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
178
176
  if (lg_k > 26) throw std::logic_error("lg_k > 26");
179
- const uint64_t k = 1 << lg_k;
177
+ const uint32_t k = 1 << lg_k;
180
178
  uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
181
179
  if (col > 63) col = 63; // clip so that 0 <= col <= 63
182
180
  const uint32_t row = hash0 & (k - 1);
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
188
186
  }
189
187
 
190
188
  template<typename A>
191
- void cpc_sketch_alloc<A>::update(const void* value, int size) {
189
+ void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
192
190
  HashState hashes;
193
191
  MurmurHash3_x64_128(value, size, seed, hashes);
194
192
  row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
208
206
 
209
207
  template<typename A>
210
208
  void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
211
- const uint64_t k = 1 << lg_k;
209
+ const uint32_t k = 1 << lg_k;
212
210
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
213
211
  if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
214
212
  bool is_novel = surprising_value_table.maybe_insert(row_col);
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
224
222
  template<typename A>
225
223
  void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
226
224
  if (window_offset > 56) throw std::logic_error("wrong window offset");
227
- const uint64_t k = 1 << lg_k;
225
+ const uint32_t k = 1 << lg_k;
228
226
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
229
227
  if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C >= 3K/32, in other words flavor >= HYBRID
230
228
  const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
266
264
  // Call this whenever a new coupon has been collected.
267
265
  template<typename A>
268
266
  void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
269
- const uint64_t k = 1 << lg_k;
267
+ const uint32_t k = 1 << lg_k;
270
268
  const uint8_t col = row_col & 63;
271
269
  const double one_over_p = static_cast<double>(k) / kxp;
272
270
  hip_est_accum += one_over_p;
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
276
274
  // In terms of flavor, this promotes SPARSE to HYBRID
277
275
  template<typename A>
278
276
  void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
279
- const uint64_t k = 1 << lg_k;
277
+ const uint32_t k = 1 << lg_k;
280
278
  const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
281
279
  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
282
280
 
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
285
283
  u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
286
284
 
287
285
  const uint32_t* old_slots = surprising_value_table.get_slots();
288
- const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
286
+ const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
289
287
 
290
288
  if (window_offset != 0) throw std::logic_error("window_offset != 0");
291
289
 
292
- for (size_t i = 0; i < old_num_slots; i++) {
290
+ for (uint32_t i = 0; i < old_num_slots; i++) {
293
291
  const uint32_t row_col = old_slots[i];
294
292
  if (row_col != UINT32_MAX) {
295
293
  const uint8_t col = row_col & 63;
296
294
  if (col < 8) {
297
- const size_t row = row_col >> 6;
295
+ const uint32_t row = row_col >> 6;
298
296
  sliding_window[row] |= 1 << col;
299
297
  } else {
300
298
  // cannot use u32_table::must_insert(), because it doesn't provide for growth
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
314
312
  if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
315
313
 
316
314
  if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
317
- const uint64_t k = 1 << lg_k;
315
+ const uint32_t k = 1 << lg_k;
318
316
 
319
317
  // Construct the full-sized bit matrix that corresponds to the sketch
320
318
  vector_u64<A> bit_matrix = build_bit_matrix();
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
328
326
  const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
329
327
  uint64_t all_surprises_ored = 0;
330
328
 
331
- for (size_t i = 0; i < k; i++) {
329
+ for (uint32_t i = 0; i < k; i++) {
332
330
  uint64_t pattern = bit_matrix[i];
333
331
  sliding_window[i] = (pattern >> new_offset) & 0xff;
334
332
  pattern &= mask_for_clearing_window;
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
357
355
  // so that it will reflect changes that were previously outside the mantissa.
358
356
  template<typename A>
359
357
  void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
360
- const uint64_t k = 1 << lg_k;
358
+ const uint32_t k = 1 << lg_k;
361
359
 
362
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
363
361
  double byte_sums[8]; // allocating on the stack
@@ -415,44 +413,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
415
413
  const bool has_table = compressed.table_data.size() > 0;
416
414
  const bool has_window = compressed.window_data.size() > 0;
417
415
  const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
418
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
416
+ write(os, preamble_ints);
419
417
  const uint8_t serial_version = SERIAL_VERSION;
420
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
418
+ write(os, serial_version);
421
419
  const uint8_t family = FAMILY;
422
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
423
- os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
424
- os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
420
+ write(os, family);
421
+ write(os, lg_k);
422
+ write(os, first_interesting_column);
425
423
  const uint8_t flags_byte(
426
424
  (1 << flags::IS_COMPRESSED)
427
425
  | (has_hip ? 1 << flags::HAS_HIP : 0)
428
426
  | (has_table ? 1 << flags::HAS_TABLE : 0)
429
427
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
430
428
  );
431
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
429
+ write(os, flags_byte);
432
430
  const uint16_t seed_hash(compute_seed_hash(seed));
433
- os.write((char*)&seed_hash, sizeof(seed_hash));
431
+ write(os, seed_hash);
434
432
  if (!is_empty()) {
435
- os.write((char*)&num_coupons, sizeof(num_coupons));
433
+ write(os, num_coupons);
436
434
  if (has_table && has_window) {
437
435
  // if there is no window it is the same as number of coupons
438
- os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
436
+ write(os, compressed.table_num_entries);
439
437
  // HIP values can be in two different places in the sequence of fields
440
438
  // this is the first HIP decision point
441
439
  if (has_hip) write_hip(os);
442
440
  }
443
441
  if (has_table) {
444
- os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
442
+ write(os, compressed.table_data_words);
445
443
  }
446
444
  if (has_window) {
447
- os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
445
+ write(os, compressed.window_data_words);
448
446
  }
449
447
  // this is the second HIP decision point
450
448
  if (has_hip && !(has_table && has_window)) write_hip(os);
451
449
  if (has_window) {
452
- os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
450
+ write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
453
451
  }
454
452
  if (has_table) {
455
- os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
453
+ write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
456
454
  }
457
455
  }
458
456
  }
@@ -471,36 +469,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
471
469
  const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
472
470
  vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
473
471
  uint8_t* ptr = bytes.data() + header_size_bytes;
474
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
472
+ ptr += copy_to_mem(preamble_ints, ptr);
475
473
  const uint8_t serial_version = SERIAL_VERSION;
476
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
474
+ ptr += copy_to_mem(serial_version, ptr);
477
475
  const uint8_t family = FAMILY;
478
- ptr += copy_to_mem(&family, ptr, sizeof(family));
479
- ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
480
- ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
476
+ ptr += copy_to_mem(family, ptr);
477
+ ptr += copy_to_mem(lg_k, ptr);
478
+ ptr += copy_to_mem(first_interesting_column, ptr);
481
479
  const uint8_t flags_byte(
482
480
  (1 << flags::IS_COMPRESSED)
483
481
  | (has_hip ? 1 << flags::HAS_HIP : 0)
484
482
  | (has_table ? 1 << flags::HAS_TABLE : 0)
485
483
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
486
484
  );
487
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
485
+ ptr += copy_to_mem(flags_byte, ptr);
488
486
  const uint16_t seed_hash = compute_seed_hash(seed);
489
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
487
+ ptr += copy_to_mem(seed_hash, ptr);
490
488
  if (!is_empty()) {
491
- ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
489
+ ptr += copy_to_mem(num_coupons, ptr);
492
490
  if (has_table && has_window) {
493
491
  // if there is no window it is the same as number of coupons
494
- ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
492
+ ptr += copy_to_mem(compressed.table_num_entries, ptr);
495
493
  // HIP values can be in two different places in the sequence of fields
496
494
  // this is the first HIP decision point
497
495
  if (has_hip) ptr += copy_hip_to_mem(ptr);
498
496
  }
499
497
  if (has_table) {
500
- ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
498
+ ptr += copy_to_mem(compressed.table_data_words, ptr);
501
499
  }
502
500
  if (has_window) {
503
- ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
501
+ ptr += copy_to_mem(compressed.window_data_words, ptr);
504
502
  }
505
503
  // this is the second HIP decision point
506
504
  if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
@@ -517,20 +515,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
517
515
 
518
516
  template<typename A>
519
517
  cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
520
- uint8_t preamble_ints;
521
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
522
- uint8_t serial_version;
523
- is.read((char*)&serial_version, sizeof(serial_version));
524
- uint8_t family_id;
525
- is.read((char*)&family_id, sizeof(family_id));
526
- uint8_t lg_k;
527
- is.read((char*)&lg_k, sizeof(lg_k));
528
- uint8_t first_interesting_column;
529
- is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
530
- uint8_t flags_byte;
531
- is.read((char*)&flags_byte, sizeof(flags_byte));
532
- uint16_t seed_hash;
533
- is.read((char*)&seed_hash, sizeof(seed_hash));
518
+ const auto preamble_ints = read<uint8_t>(is);
519
+ const auto serial_version = read<uint8_t>(is);
520
+ const auto family_id = read<uint8_t>(is);
521
+ const auto lg_k = read<uint8_t>(is);
522
+ const auto first_interesting_column = read<uint8_t>(is);
523
+ const auto flags_byte = read<uint8_t>(is);
524
+ const auto seed_hash = read<uint16_t>(is);
534
525
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
535
526
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
536
527
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -542,31 +533,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
542
533
  double kxp = 0;
543
534
  double hip_est_accum = 0;
544
535
  if (has_table || has_window) {
545
- is.read((char*)&num_coupons, sizeof(num_coupons));
536
+ num_coupons = read<uint32_t>(is);
546
537
  if (has_table && has_window) {
547
- is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
538
+ compressed.table_num_entries = read<uint32_t>(is);
548
539
  if (has_hip) {
549
- is.read((char*)&kxp, sizeof(kxp));
550
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
540
+ kxp = read<double>(is);
541
+ hip_est_accum = read<double>(is);
551
542
  }
552
543
  }
553
544
  if (has_table) {
554
- is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
545
+ compressed.table_data_words = read<uint32_t>(is);
555
546
  }
556
547
  if (has_window) {
557
- is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
548
+ compressed.window_data_words = read<uint32_t>(is);
558
549
  }
559
550
  if (has_hip && !(has_table && has_window)) {
560
- is.read((char*)&kxp, sizeof(kxp));
561
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
551
+ kxp = read<double>(is);
552
+ hip_est_accum = read<double>(is);
562
553
  }
563
554
  if (has_window) {
564
555
  compressed.window_data.resize(compressed.window_data_words);
565
- is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
556
+ read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
566
557
  }
567
558
  if (has_table) {
568
559
  compressed.table_data.resize(compressed.table_data_words);
569
- is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
560
+ read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
570
561
  }
571
562
  if (!has_window) compressed.table_num_entries = num_coupons;
572
563
  }
@@ -602,19 +593,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
602
593
  const char* ptr = static_cast<const char*>(bytes);
603
594
  const char* base = static_cast<const char*>(bytes);
604
595
  uint8_t preamble_ints;
605
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
596
+ ptr += copy_from_mem(ptr, preamble_ints);
606
597
  uint8_t serial_version;
607
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
598
+ ptr += copy_from_mem(ptr, serial_version);
608
599
  uint8_t family_id;
609
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
600
+ ptr += copy_from_mem(ptr, family_id);
610
601
  uint8_t lg_k;
611
- ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
602
+ ptr += copy_from_mem(ptr, lg_k);
612
603
  uint8_t first_interesting_column;
613
- ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
604
+ ptr += copy_from_mem(ptr, first_interesting_column);
614
605
  uint8_t flags_byte;
615
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
606
+ ptr += copy_from_mem(ptr, flags_byte);
616
607
  uint16_t seed_hash;
617
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
608
+ ptr += copy_from_mem(ptr, seed_hash);
618
609
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
619
610
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
620
611
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -628,28 +619,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
628
619
  double hip_est_accum = 0;
629
620
  if (has_table || has_window) {
630
621
  check_memory_size(ptr - base + sizeof(num_coupons), size);
631
- ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
622
+ ptr += copy_from_mem(ptr, num_coupons);
632
623
  if (has_table && has_window) {
633
624
  check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
634
- ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
625
+ ptr += copy_from_mem(ptr, compressed.table_num_entries);
635
626
  if (has_hip) {
636
627
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
637
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
638
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
628
+ ptr += copy_from_mem(ptr, kxp);
629
+ ptr += copy_from_mem(ptr, hip_est_accum);
639
630
  }
640
631
  }
641
632
  if (has_table) {
642
633
  check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
643
- ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
634
+ ptr += copy_from_mem(ptr, compressed.table_data_words);
644
635
  }
645
636
  if (has_window) {
646
637
  check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
647
- ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
638
+ ptr += copy_from_mem(ptr, compressed.window_data_words);
648
639
  }
649
640
  if (has_hip && !(has_table && has_window)) {
650
641
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
651
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
652
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
642
+ ptr += copy_from_mem(ptr, kxp);
643
+ ptr += copy_from_mem(ptr, hip_est_accum);
653
644
  }
654
645
  if (has_window) {
655
646
  compressed.window_data.resize(compressed.window_data_words);
@@ -688,6 +679,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
688
679
  std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
689
680
  }
690
681
 
682
+ /*
683
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
684
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
685
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
686
+ * worst-case space for the preamble, which is added by the function.
687
+ */
688
+ static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
689
+ static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
690
+ 24, // lg_k = 4
691
+ 36, // lg_k = 5
692
+ 56, // lg_k = 6
693
+ 100, // lg_k = 7
694
+ 180, // lg_k = 8
695
+ 344, // lg_k = 9
696
+ 660, // lg_k = 10
697
+ 1292, // lg_k = 11
698
+ 2540, // lg_k = 12
699
+ 5020, // lg_k = 13
700
+ 9968, // lg_k = 14
701
+ 19836, // lg_k = 15
702
+ 39532, // lg_k = 16
703
+ 78880, // lg_k = 17
704
+ 157516, // lg_k = 18
705
+ 314656 // lg_k = 19
706
+ };
707
+ static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
708
+ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
709
+
710
+ template<typename A>
711
+ size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
712
+ check_lg_k(lg_k);
713
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
714
+ const uint32_t k = 1 << lg_k;
715
+ return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
716
+ }
717
+
718
+ template<typename A>
719
+ void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
720
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
721
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
722
+ }
723
+ }
724
+
691
725
  template<typename A>
692
726
  uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
693
727
  return num_coupons;
@@ -696,7 +730,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
696
730
  template<typename A>
697
731
  bool cpc_sketch_alloc<A>::validate() const {
698
732
  vector_u64<A> bit_matrix = build_bit_matrix();
699
- const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
733
+ const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
700
734
  return num_bits_set == num_coupons;
701
735
  }
702
736
 
@@ -744,7 +778,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
744
778
 
745
779
  template<typename A>
746
780
  typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
747
- const uint64_t k = 1 << lg_k;
781
+ const uint32_t k = 1 << lg_k;
748
782
  const uint64_t c2 = c << 1;
749
783
  const uint64_t c8 = c << 3;
750
784
  const uint64_t c32 = c << 5;
@@ -757,15 +791,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
757
791
 
758
792
  template<typename A>
759
793
  uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
760
- const uint64_t k = 1 << lg_k;
794
+ const uint32_t k = 1 << lg_k;
761
795
  const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
762
796
  if (tmp < 0) return 0;
763
- return tmp >> (lg_k + 3); // tmp / 8K
797
+ return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
764
798
  }
765
799
 
766
800
  template<typename A>
767
801
  vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
768
- const size_t k = 1 << lg_k;
802
+ const uint32_t k = 1 << lg_k;
769
803
  if (window_offset > 56) throw std::logic_error("offset > 56");
770
804
 
771
805
  // Fill the matrix with default rows in which the "early zone" is filled with ones.
@@ -782,12 +816,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
782
816
  }
783
817
 
784
818
  const uint32_t* slots = surprising_value_table.get_slots();
785
- const size_t num_slots = 1 << surprising_value_table.get_lg_size();
819
+ const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
786
820
  for (size_t i = 0; i < num_slots; i++) {
787
821
  const uint32_t row_col = slots[i];
788
822
  if (row_col != UINT32_MAX) {
789
823
  const uint8_t col = row_col & 63;
790
- const size_t row = row_col >> 6;
824
+ const uint32_t row = row_col >> 6;
791
825
  // Flip the specified matrix bit from its default value.
792
826
  // In the "early" zone the bit changes from 1 to 0.
793
827
  // In the "late" zone the bit changes from 0 to 1.
@@ -799,8 +833,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
799
833
 
800
834
  template<typename A>
801
835
  void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
802
- os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
803
- os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
836
+ write(os, kxp);
837
+ write(os, hip_est_accum);
804
838
  }
805
839
 
806
840
  template<typename A>