datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -192,7 +192,7 @@ public:
192
192
  * @param value pointer to the data
193
193
  * @param size length of the data in bytes
194
194
  */
195
- void update(const void* value, int size);
195
+ void update(const void* value, size_t size);
196
196
 
197
197
  /**
198
198
  * Returns a human-readable summary of this sketch
@@ -235,6 +235,17 @@ public:
235
235
  */
236
236
  static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
237
237
 
238
+ /**
239
+ * The actual size of a compressed CPC sketch has a small random variance, but the following
240
+ * empirically measured size should be large enough for at least 99.9 percent of sketches.
241
+ *
242
+ * <p>For small values of <i>n</i> the size can be much smaller.
243
+ *
244
+ * @param lg_k the given value of lg_k.
245
+ * @return the estimated maximum compressed serialized size of a sketch.
246
+ */
247
+ static size_t get_max_serialized_size_bytes(uint8_t lg_k);
248
+
238
249
  // for internal use
239
250
  uint32_t get_num_coupons() const;
240
251
 
@@ -303,6 +314,8 @@ private:
303
314
  inline void write_hip(std::ostream& os) const;
304
315
  inline size_t copy_hip_to_mem(void* dst) const;
305
316
 
317
+ static void check_lg_k(uint8_t lg_k);
318
+
306
319
  friend cpc_compressor<A>;
307
320
  friend cpc_union_alloc<A>;
308
321
  };
@@ -53,9 +53,7 @@ first_interesting_column(0),
53
53
  kxp(1 << lg_k),
54
54
  hip_est_accum(0)
55
55
  {
56
- if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
57
- throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
58
- }
56
+ check_lg_k(lg_k);
59
57
  }
60
58
 
61
59
  template<typename A>
@@ -176,7 +174,7 @@ void cpc_sketch_alloc<A>::update(float value) {
176
174
 
177
175
  static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
178
176
  if (lg_k > 26) throw std::logic_error("lg_k > 26");
179
- const uint64_t k = 1 << lg_k;
177
+ const uint32_t k = 1 << lg_k;
180
178
  uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
181
179
  if (col > 63) col = 63; // clip so that 0 <= col <= 63
182
180
  const uint32_t row = hash0 & (k - 1);
@@ -188,7 +186,7 @@ static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, u
188
186
  }
189
187
 
190
188
  template<typename A>
191
- void cpc_sketch_alloc<A>::update(const void* value, int size) {
189
+ void cpc_sketch_alloc<A>::update(const void* value, size_t size) {
192
190
  HashState hashes;
193
191
  MurmurHash3_x64_128(value, size, seed, hashes);
194
192
  row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
@@ -208,7 +206,7 @@ void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
208
206
 
209
207
  template<typename A>
210
208
  void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
211
- const uint64_t k = 1 << lg_k;
209
+ const uint32_t k = 1 << lg_k;
212
210
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
213
211
  if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
214
212
  bool is_novel = surprising_value_table.maybe_insert(row_col);
@@ -224,7 +222,7 @@ void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
224
222
  template<typename A>
225
223
  void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
226
224
  if (window_offset > 56) throw std::logic_error("wrong window offset");
227
- const uint64_t k = 1 << lg_k;
225
+ const uint32_t k = 1 << lg_k;
228
226
  const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
229
227
  if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C >= 3K/32, in other words flavor >= HYBRID
230
228
  const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
@@ -266,7 +264,7 @@ void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
266
264
  // Call this whenever a new coupon has been collected.
267
265
  template<typename A>
268
266
  void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
269
- const uint64_t k = 1 << lg_k;
267
+ const uint32_t k = 1 << lg_k;
270
268
  const uint8_t col = row_col & 63;
271
269
  const double one_over_p = static_cast<double>(k) / kxp;
272
270
  hip_est_accum += one_over_p;
@@ -276,7 +274,7 @@ void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
276
274
  // In terms of flavor, this promotes SPARSE to HYBRID
277
275
  template<typename A>
278
276
  void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
279
- const uint64_t k = 1 << lg_k;
277
+ const uint32_t k = 1 << lg_k;
280
278
  const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
281
279
  if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
282
280
 
@@ -285,16 +283,16 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
285
283
  u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
286
284
 
287
285
  const uint32_t* old_slots = surprising_value_table.get_slots();
288
- const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
286
+ const uint32_t old_num_slots = 1 << surprising_value_table.get_lg_size();
289
287
 
290
288
  if (window_offset != 0) throw std::logic_error("window_offset != 0");
291
289
 
292
- for (size_t i = 0; i < old_num_slots; i++) {
290
+ for (uint32_t i = 0; i < old_num_slots; i++) {
293
291
  const uint32_t row_col = old_slots[i];
294
292
  if (row_col != UINT32_MAX) {
295
293
  const uint8_t col = row_col & 63;
296
294
  if (col < 8) {
297
- const size_t row = row_col >> 6;
295
+ const uint32_t row = row_col >> 6;
298
296
  sliding_window[row] |= 1 << col;
299
297
  } else {
300
298
  // cannot use u32_table::must_insert(), because it doesn't provide for growth
@@ -314,7 +312,7 @@ void cpc_sketch_alloc<A>::move_window() {
314
312
  if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
315
313
 
316
314
  if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
317
- const uint64_t k = 1 << lg_k;
315
+ const uint32_t k = 1 << lg_k;
318
316
 
319
317
  // Construct the full-sized bit matrix that corresponds to the sketch
320
318
  vector_u64<A> bit_matrix = build_bit_matrix();
@@ -328,7 +326,7 @@ void cpc_sketch_alloc<A>::move_window() {
328
326
  const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
329
327
  uint64_t all_surprises_ored = 0;
330
328
 
331
- for (size_t i = 0; i < k; i++) {
329
+ for (uint32_t i = 0; i < k; i++) {
332
330
  uint64_t pattern = bit_matrix[i];
333
331
  sliding_window[i] = (pattern >> new_offset) & 0xff;
334
332
  pattern &= mask_for_clearing_window;
@@ -357,7 +355,7 @@ void cpc_sketch_alloc<A>::move_window() {
357
355
  // so that it will reflect changes that were previously outside the mantissa.
358
356
  template<typename A>
359
357
  void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
360
- const uint64_t k = 1 << lg_k;
358
+ const uint32_t k = 1 << lg_k;
361
359
 
362
360
  // for improved numerical accuracy, we separately sum the bytes of the U64's
363
361
  double byte_sums[8]; // allocating on the stack
@@ -415,44 +413,44 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
415
413
  const bool has_table = compressed.table_data.size() > 0;
416
414
  const bool has_window = compressed.window_data.size() > 0;
417
415
  const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
418
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
416
+ write(os, preamble_ints);
419
417
  const uint8_t serial_version = SERIAL_VERSION;
420
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
418
+ write(os, serial_version);
421
419
  const uint8_t family = FAMILY;
422
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
423
- os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
424
- os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
420
+ write(os, family);
421
+ write(os, lg_k);
422
+ write(os, first_interesting_column);
425
423
  const uint8_t flags_byte(
426
424
  (1 << flags::IS_COMPRESSED)
427
425
  | (has_hip ? 1 << flags::HAS_HIP : 0)
428
426
  | (has_table ? 1 << flags::HAS_TABLE : 0)
429
427
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
430
428
  );
431
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
429
+ write(os, flags_byte);
432
430
  const uint16_t seed_hash(compute_seed_hash(seed));
433
- os.write((char*)&seed_hash, sizeof(seed_hash));
431
+ write(os, seed_hash);
434
432
  if (!is_empty()) {
435
- os.write((char*)&num_coupons, sizeof(num_coupons));
433
+ write(os, num_coupons);
436
434
  if (has_table && has_window) {
437
435
  // if there is no window it is the same as number of coupons
438
- os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
436
+ write(os, compressed.table_num_entries);
439
437
  // HIP values can be in two different places in the sequence of fields
440
438
  // this is the first HIP decision point
441
439
  if (has_hip) write_hip(os);
442
440
  }
443
441
  if (has_table) {
444
- os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
442
+ write(os, compressed.table_data_words);
445
443
  }
446
444
  if (has_window) {
447
- os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
445
+ write(os, compressed.window_data_words);
448
446
  }
449
447
  // this is the second HIP decision point
450
448
  if (has_hip && !(has_table && has_window)) write_hip(os);
451
449
  if (has_window) {
452
- os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
450
+ write(os, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
453
451
  }
454
452
  if (has_table) {
455
- os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
453
+ write(os, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
456
454
  }
457
455
  }
458
456
  }
@@ -471,36 +469,36 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
471
469
  const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
472
470
  vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
473
471
  uint8_t* ptr = bytes.data() + header_size_bytes;
474
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
472
+ ptr += copy_to_mem(preamble_ints, ptr);
475
473
  const uint8_t serial_version = SERIAL_VERSION;
476
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
474
+ ptr += copy_to_mem(serial_version, ptr);
477
475
  const uint8_t family = FAMILY;
478
- ptr += copy_to_mem(&family, ptr, sizeof(family));
479
- ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
480
- ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
476
+ ptr += copy_to_mem(family, ptr);
477
+ ptr += copy_to_mem(lg_k, ptr);
478
+ ptr += copy_to_mem(first_interesting_column, ptr);
481
479
  const uint8_t flags_byte(
482
480
  (1 << flags::IS_COMPRESSED)
483
481
  | (has_hip ? 1 << flags::HAS_HIP : 0)
484
482
  | (has_table ? 1 << flags::HAS_TABLE : 0)
485
483
  | (has_window ? 1 << flags::HAS_WINDOW : 0)
486
484
  );
487
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
485
+ ptr += copy_to_mem(flags_byte, ptr);
488
486
  const uint16_t seed_hash = compute_seed_hash(seed);
489
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
487
+ ptr += copy_to_mem(seed_hash, ptr);
490
488
  if (!is_empty()) {
491
- ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
489
+ ptr += copy_to_mem(num_coupons, ptr);
492
490
  if (has_table && has_window) {
493
491
  // if there is no window it is the same as number of coupons
494
- ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
492
+ ptr += copy_to_mem(compressed.table_num_entries, ptr);
495
493
  // HIP values can be in two different places in the sequence of fields
496
494
  // this is the first HIP decision point
497
495
  if (has_hip) ptr += copy_hip_to_mem(ptr);
498
496
  }
499
497
  if (has_table) {
500
- ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
498
+ ptr += copy_to_mem(compressed.table_data_words, ptr);
501
499
  }
502
500
  if (has_window) {
503
- ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
501
+ ptr += copy_to_mem(compressed.window_data_words, ptr);
504
502
  }
505
503
  // this is the second HIP decision point
506
504
  if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
@@ -517,20 +515,13 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
517
515
 
518
516
  template<typename A>
519
517
  cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
520
- uint8_t preamble_ints;
521
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
522
- uint8_t serial_version;
523
- is.read((char*)&serial_version, sizeof(serial_version));
524
- uint8_t family_id;
525
- is.read((char*)&family_id, sizeof(family_id));
526
- uint8_t lg_k;
527
- is.read((char*)&lg_k, sizeof(lg_k));
528
- uint8_t first_interesting_column;
529
- is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
530
- uint8_t flags_byte;
531
- is.read((char*)&flags_byte, sizeof(flags_byte));
532
- uint16_t seed_hash;
533
- is.read((char*)&seed_hash, sizeof(seed_hash));
518
+ const auto preamble_ints = read<uint8_t>(is);
519
+ const auto serial_version = read<uint8_t>(is);
520
+ const auto family_id = read<uint8_t>(is);
521
+ const auto lg_k = read<uint8_t>(is);
522
+ const auto first_interesting_column = read<uint8_t>(is);
523
+ const auto flags_byte = read<uint8_t>(is);
524
+ const auto seed_hash = read<uint16_t>(is);
534
525
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
535
526
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
536
527
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -542,31 +533,31 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
542
533
  double kxp = 0;
543
534
  double hip_est_accum = 0;
544
535
  if (has_table || has_window) {
545
- is.read((char*)&num_coupons, sizeof(num_coupons));
536
+ num_coupons = read<uint32_t>(is);
546
537
  if (has_table && has_window) {
547
- is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
538
+ compressed.table_num_entries = read<uint32_t>(is);
548
539
  if (has_hip) {
549
- is.read((char*)&kxp, sizeof(kxp));
550
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
540
+ kxp = read<double>(is);
541
+ hip_est_accum = read<double>(is);
551
542
  }
552
543
  }
553
544
  if (has_table) {
554
- is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
545
+ compressed.table_data_words = read<uint32_t>(is);
555
546
  }
556
547
  if (has_window) {
557
- is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
548
+ compressed.window_data_words = read<uint32_t>(is);
558
549
  }
559
550
  if (has_hip && !(has_table && has_window)) {
560
- is.read((char*)&kxp, sizeof(kxp));
561
- is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
551
+ kxp = read<double>(is);
552
+ hip_est_accum = read<double>(is);
562
553
  }
563
554
  if (has_window) {
564
555
  compressed.window_data.resize(compressed.window_data_words);
565
- is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
556
+ read(is, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
566
557
  }
567
558
  if (has_table) {
568
559
  compressed.table_data.resize(compressed.table_data_words);
569
- is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
560
+ read(is, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
570
561
  }
571
562
  if (!has_window) compressed.table_num_entries = num_coupons;
572
563
  }
@@ -602,19 +593,19 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
602
593
  const char* ptr = static_cast<const char*>(bytes);
603
594
  const char* base = static_cast<const char*>(bytes);
604
595
  uint8_t preamble_ints;
605
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
596
+ ptr += copy_from_mem(ptr, preamble_ints);
606
597
  uint8_t serial_version;
607
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
598
+ ptr += copy_from_mem(ptr, serial_version);
608
599
  uint8_t family_id;
609
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
600
+ ptr += copy_from_mem(ptr, family_id);
610
601
  uint8_t lg_k;
611
- ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
602
+ ptr += copy_from_mem(ptr, lg_k);
612
603
  uint8_t first_interesting_column;
613
- ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
604
+ ptr += copy_from_mem(ptr, first_interesting_column);
614
605
  uint8_t flags_byte;
615
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
606
+ ptr += copy_from_mem(ptr, flags_byte);
616
607
  uint16_t seed_hash;
617
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
608
+ ptr += copy_from_mem(ptr, seed_hash);
618
609
  const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
619
610
  const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
620
611
  const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
@@ -628,28 +619,28 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
628
619
  double hip_est_accum = 0;
629
620
  if (has_table || has_window) {
630
621
  check_memory_size(ptr - base + sizeof(num_coupons), size);
631
- ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
622
+ ptr += copy_from_mem(ptr, num_coupons);
632
623
  if (has_table && has_window) {
633
624
  check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
634
- ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
625
+ ptr += copy_from_mem(ptr, compressed.table_num_entries);
635
626
  if (has_hip) {
636
627
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
637
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
638
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
628
+ ptr += copy_from_mem(ptr, kxp);
629
+ ptr += copy_from_mem(ptr, hip_est_accum);
639
630
  }
640
631
  }
641
632
  if (has_table) {
642
633
  check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
643
- ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
634
+ ptr += copy_from_mem(ptr, compressed.table_data_words);
644
635
  }
645
636
  if (has_window) {
646
637
  check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
647
- ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
638
+ ptr += copy_from_mem(ptr, compressed.window_data_words);
648
639
  }
649
640
  if (has_hip && !(has_table && has_window)) {
650
641
  check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
651
- ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
652
- ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
642
+ ptr += copy_from_mem(ptr, kxp);
643
+ ptr += copy_from_mem(ptr, hip_est_accum);
653
644
  }
654
645
  if (has_window) {
655
646
  compressed.window_data.resize(compressed.window_data_words);
@@ -688,6 +679,49 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
688
679
  std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
689
680
  }
690
681
 
682
+ /*
683
+ * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
684
+ * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
685
+ * spaced over values of the quantity C/K between 3.0 and 8.0. This table does not include the
686
+ * worst-case space for the preamble, which is added by the function.
687
+ */
688
+ static const uint8_t CPC_EMPIRICAL_SIZE_MAX_LGK = 19;
689
+ static const size_t CPC_EMPIRICAL_MAX_SIZE_BYTES[] = {
690
+ 24, // lg_k = 4
691
+ 36, // lg_k = 5
692
+ 56, // lg_k = 6
693
+ 100, // lg_k = 7
694
+ 180, // lg_k = 8
695
+ 344, // lg_k = 9
696
+ 660, // lg_k = 10
697
+ 1292, // lg_k = 11
698
+ 2540, // lg_k = 12
699
+ 5020, // lg_k = 13
700
+ 9968, // lg_k = 14
701
+ 19836, // lg_k = 15
702
+ 39532, // lg_k = 16
703
+ 78880, // lg_k = 17
704
+ 157516, // lg_k = 18
705
+ 314656 // lg_k = 19
706
+ };
707
+ static const double CPC_EMPIRICAL_MAX_SIZE_FACTOR = 0.6; // 0.6 = 4.8 / 8.0
708
+ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
709
+
710
+ template<typename A>
711
+ size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
712
+ check_lg_k(lg_k);
713
+ if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - CPC_MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
714
+ const uint32_t k = 1 << lg_k;
715
+ return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
716
+ }
717
+
718
+ template<typename A>
719
+ void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
720
+ if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
721
+ throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
722
+ }
723
+ }
724
+
691
725
  template<typename A>
692
726
  uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
693
727
  return num_coupons;
@@ -696,7 +730,7 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
696
730
  template<typename A>
697
731
  bool cpc_sketch_alloc<A>::validate() const {
698
732
  vector_u64<A> bit_matrix = build_bit_matrix();
699
- const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
733
+ const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
700
734
  return num_bits_set == num_coupons;
701
735
  }
702
736
 
@@ -744,7 +778,7 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() con
744
778
 
745
779
  template<typename A>
746
780
  typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
747
- const uint64_t k = 1 << lg_k;
781
+ const uint32_t k = 1 << lg_k;
748
782
  const uint64_t c2 = c << 1;
749
783
  const uint64_t c8 = c << 3;
750
784
  const uint64_t c32 = c << 5;
@@ -757,15 +791,15 @@ typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8
757
791
 
758
792
  template<typename A>
759
793
  uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
760
- const uint64_t k = 1 << lg_k;
794
+ const uint32_t k = 1 << lg_k;
761
795
  const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
762
796
  if (tmp < 0) return 0;
763
- return tmp >> (lg_k + 3); // tmp / 8K
797
+ return static_cast<uint8_t>(tmp >> (lg_k + 3)); // tmp / 8K
764
798
  }
765
799
 
766
800
  template<typename A>
767
801
  vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
768
- const size_t k = 1 << lg_k;
802
+ const uint32_t k = 1 << lg_k;
769
803
  if (window_offset > 56) throw std::logic_error("offset > 56");
770
804
 
771
805
  // Fill the matrix with default rows in which the "early zone" is filled with ones.
@@ -782,12 +816,12 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
782
816
  }
783
817
 
784
818
  const uint32_t* slots = surprising_value_table.get_slots();
785
- const size_t num_slots = 1 << surprising_value_table.get_lg_size();
819
+ const uint32_t num_slots = 1 << surprising_value_table.get_lg_size();
786
820
  for (size_t i = 0; i < num_slots; i++) {
787
821
  const uint32_t row_col = slots[i];
788
822
  if (row_col != UINT32_MAX) {
789
823
  const uint8_t col = row_col & 63;
790
- const size_t row = row_col >> 6;
824
+ const uint32_t row = row_col >> 6;
791
825
  // Flip the specified matrix bit from its default value.
792
826
  // In the "early" zone the bit changes from 1 to 0.
793
827
  // In the "late" zone the bit changes from 0 to 1.
@@ -799,8 +833,8 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
799
833
 
800
834
  template<typename A>
801
835
  void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
802
- os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
803
- os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
836
+ write(os, kxp);
837
+ write(os, hip_est_accum);
804
838
  }
805
839
 
806
840
  template<typename A>