datasketches 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -334,7 +334,7 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
334
334
  num_bytes += (h_ / 8) + (h_ % 8 > 0);
335
335
  }
336
336
  // must iterate over the items
337
- for (auto& it: *this)
337
+ for (auto it: *this)
338
338
  num_bytes += S().size_of_item(it.first);
339
339
  return num_bytes;
340
340
  }
@@ -359,21 +359,21 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
359
359
  // first prelong
360
360
  uint8_t ser_ver(SER_VER);
361
361
  uint8_t family(FAMILY_ID);
362
- ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
363
- ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
364
- ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
365
- ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
366
- ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
362
+ ptr += copy_to_mem(first_byte, ptr);
363
+ ptr += copy_to_mem(ser_ver, ptr);
364
+ ptr += copy_to_mem(family, ptr);
365
+ ptr += copy_to_mem(flags, ptr);
366
+ ptr += copy_to_mem(k_, ptr);
367
367
 
368
368
  if (!empty) {
369
369
  // second and third prelongs
370
- ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
371
- ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
372
- ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
370
+ ptr += copy_to_mem(n_, ptr);
371
+ ptr += copy_to_mem(h_, ptr);
372
+ ptr += copy_to_mem(r_, ptr);
373
373
 
374
374
  // fourth prelong, if needed
375
375
  if (r_ > 0) {
376
- ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
376
+ ptr += copy_to_mem(total_wt_r_, ptr);
377
377
  }
378
378
 
379
379
  // first h_ weights
@@ -388,14 +388,14 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
388
388
  }
389
389
 
390
390
  if ((i & 0x7) == 0x7) {
391
- ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
391
+ ptr += copy_to_mem(val, ptr);
392
392
  val = 0;
393
393
  }
394
394
  }
395
395
 
396
396
  // write out any remaining values
397
397
  if ((h_ & 0x7) > 0) {
398
- ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
398
+ ptr += copy_to_mem(val, ptr);
399
399
  }
400
400
  }
401
401
 
@@ -428,25 +428,25 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
428
428
  // first prelong
429
429
  const uint8_t ser_ver(SER_VER);
430
430
  const uint8_t family(FAMILY_ID);
431
- os.write((char*)&first_byte, sizeof(uint8_t));
432
- os.write((char*)&ser_ver, sizeof(uint8_t));
433
- os.write((char*)&family, sizeof(uint8_t));
434
- os.write((char*)&flags, sizeof(uint8_t));
435
- os.write((char*)&k_, sizeof(uint32_t));
431
+ write(os, first_byte);
432
+ write(os, ser_ver);
433
+ write(os, family);
434
+ write(os, flags);
435
+ write(os, k_);
436
436
 
437
437
  if (!empty) {
438
438
  // second and third prelongs
439
- os.write((char*)&n_, sizeof(uint64_t));
440
- os.write((char*)&h_, sizeof(uint32_t));
441
- os.write((char*)&r_, sizeof(uint32_t));
439
+ write(os, n_);
440
+ write(os, h_);
441
+ write(os, r_);
442
442
 
443
443
  // fourth prelong, if needed
444
444
  if (r_ > 0) {
445
- os.write((char*)&total_wt_r_, sizeof(double));
445
+ write(os, total_wt_r_);
446
446
  }
447
447
 
448
448
  // write the first h_ weights
449
- os.write((char*)weights_, h_ * sizeof(double));
449
+ write(os, weights_, h_ * sizeof(double));
450
450
 
451
451
  // write the first h_ marks as packed bytes iff we have a gadget
452
452
  if (marks_ != nullptr) {
@@ -457,14 +457,14 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
457
457
  }
458
458
 
459
459
  if ((i & 0x7) == 0x7) {
460
- os.write((char*)&val, sizeof(uint8_t));
460
+ write(os, val);
461
461
  val = 0;
462
462
  }
463
463
  }
464
464
 
465
465
  // write out any remaining values
466
466
  if ((h_ & 0x7) > 0) {
467
- os.write((char*)&val, sizeof(uint8_t));
467
+ write(os, val);
468
468
  }
469
469
  }
470
470
 
@@ -481,17 +481,17 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
481
481
  const char* base = ptr;
482
482
  const char* end_ptr = ptr + size;
483
483
  uint8_t first_byte;
484
- ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
484
+ ptr += copy_from_mem(ptr, first_byte);
485
485
  uint8_t preamble_longs = first_byte & 0x3f;
486
486
  resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
487
487
  uint8_t serial_version;
488
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
488
+ ptr += copy_from_mem(ptr, serial_version);
489
489
  uint8_t family_id;
490
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
490
+ ptr += copy_from_mem(ptr, family_id);
491
491
  uint8_t flags;
492
- ptr += copy_from_mem(ptr, &flags, sizeof(flags));
492
+ ptr += copy_from_mem(ptr, flags);
493
493
  uint32_t k;
494
- ptr += copy_from_mem(ptr, &k, sizeof(k));
494
+ ptr += copy_from_mem(ptr, k);
495
495
 
496
496
  check_preamble_longs(preamble_longs, flags);
497
497
  check_family_and_serialization_version(family_id, serial_version);
@@ -507,16 +507,16 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
507
507
  // second and third prelongs
508
508
  uint64_t n;
509
509
  uint32_t h, r;
510
- ptr += copy_from_mem(ptr, &n, sizeof(n));
511
- ptr += copy_from_mem(ptr, &h, sizeof(h));
512
- ptr += copy_from_mem(ptr, &r, sizeof(r));
510
+ ptr += copy_from_mem(ptr, n);
511
+ ptr += copy_from_mem(ptr, h);
512
+ ptr += copy_from_mem(ptr, r);
513
513
 
514
514
  const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
515
515
 
516
516
  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
517
517
  double total_wt_r = 0.0;
518
518
  if (preamble_longs == PREAMBLE_LONGS_FULL) {
519
- ptr += copy_from_mem(ptr, &total_wt_r, sizeof(total_wt_r));
519
+ ptr += copy_from_mem(ptr, total_wt_r);
520
520
  if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
521
521
  throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
522
522
  "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
@@ -548,7 +548,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
548
548
  check_memory_size(ptr - base + size_marks, size);
549
549
  for (uint32_t i = 0; i < h; ++i) {
550
550
  if ((i & 0x7) == 0x0) { // should trigger on first iteration
551
- ptr += copy_from_mem(ptr, &val, sizeof(val));
551
+ ptr += copy_from_mem(ptr, val);
552
552
  }
553
553
  marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
554
554
  num_marks_in_h += (marks.get()[i] ? 1 : 0);
@@ -571,18 +571,13 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
571
571
 
572
572
  template<typename T, typename S, typename A>
573
573
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
574
- uint8_t first_byte;
575
- is.read((char*)&first_byte, sizeof(first_byte));
574
+ const auto first_byte = read<uint8_t>(is);
576
575
  uint8_t preamble_longs = first_byte & 0x3f;
577
- resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
578
- uint8_t serial_version;
579
- is.read((char*)&serial_version, sizeof(serial_version));
580
- uint8_t family_id;
581
- is.read((char*)&family_id, sizeof(family_id));
582
- uint8_t flags;
583
- is.read((char*)&flags, sizeof(flags));
584
- uint32_t k;
585
- is.read((char*)&k, sizeof(k));
576
+ const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
577
+ const auto serial_version = read<uint8_t>(is);
578
+ const auto family_id = read<uint8_t>(is);
579
+ const auto flags = read<uint8_t>(is);
580
+ const auto k = read<uint32_t>(is);
586
581
 
587
582
  check_preamble_longs(preamble_longs, flags);
588
583
  check_family_and_serialization_version(family_id, serial_version);
@@ -598,31 +593,27 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
598
593
  }
599
594
 
600
595
  // second and third prelongs
601
- uint64_t n;
602
- uint32_t h, r;
603
- is.read((char*)&n, sizeof(n));
604
- is.read((char*)&h, sizeof(h));
605
- is.read((char*)&r, sizeof(r));
596
+ const auto n = read<uint64_t>(is);
597
+ const auto h = read<uint32_t>(is);
598
+ const auto r = read<uint32_t>(is);
606
599
 
607
600
  const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
608
601
 
609
602
  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
610
603
  double total_wt_r = 0.0;
611
604
  if (preamble_longs == PREAMBLE_LONGS_FULL) {
612
- is.read((char*)&total_wt_r, sizeof(total_wt_r));
605
+ total_wt_r = read<double>(is);
613
606
  if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
614
607
  throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
615
608
  "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
616
609
  }
617
- } else {
618
- total_wt_r = 0.0;
619
610
  }
620
611
 
621
612
  // read the first h weights, fill remainder with -1.0
622
613
  std::unique_ptr<double, weights_deleter> weights(AllocDouble(allocator).allocate(array_size),
623
614
  weights_deleter(array_size, allocator));
624
615
  double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
625
- is.read((char*)wts, h * sizeof(double));
616
+ read(is, wts, h * sizeof(double));
626
617
  for (size_t i = 0; i < h; ++i) {
627
618
  if (!(wts[i] > 0.0)) {
628
619
  throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
@@ -638,7 +629,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
638
629
  uint8_t val = 0;
639
630
  for (uint32_t i = 0; i < h; ++i) {
640
631
  if ((i & 0x7) == 0x0) { // should trigger on first iteration
641
- is.read((char*)&val, sizeof(val));
632
+ val = read<uint8_t>(is);
642
633
  }
643
634
  marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
644
635
  num_marks_in_h += (marks.get()[i] ? 1 : 0);
@@ -1420,7 +1411,7 @@ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
1420
1411
  if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
1421
1412
  throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
1422
1413
 
1423
- size_t r_true_count = 0;
1414
+ uint32_t r_true_count = 0;
1424
1415
  ++idx; // skip the gap
1425
1416
  for (; idx < (k_ + 1); ++idx) {
1426
1417
  if (predicate(data_[idx])) {
@@ -30,8 +30,8 @@ namespace datasketches {
30
30
  template<typename T, typename S, typename A>
31
31
  var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
32
32
  n_(0),
33
- outer_tau_numer_(0),
34
- outer_tau_denom_(0.0),
33
+ outer_tau_numer_(0.0),
34
+ outer_tau_denom_(0),
35
35
  max_k_(max_k),
36
36
  gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
37
37
  {}
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
129
 
130
130
  template<typename T, typename S, typename A>
131
131
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
132
- uint8_t preamble_longs;
133
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
- uint8_t serial_version;
135
- is.read((char*)&serial_version, sizeof(serial_version));
136
- uint8_t family_id;
137
- is.read((char*)&family_id, sizeof(family_id));
138
- uint8_t flags;
139
- is.read((char*)&flags, sizeof(flags));
140
- uint32_t max_k;
141
- is.read((char*)&max_k, sizeof(max_k));
132
+ const auto preamble_longs = read<uint8_t>(is);
133
+ const auto serial_version = read<uint8_t>(is);
134
+ const auto family_id = read<uint8_t>(is);
135
+ const auto flags = read<uint8_t>(is);
136
+ const auto max_k = read<uint32_t>(is);
142
137
 
143
138
  check_preamble_longs(preamble_longs, flags);
144
139
  check_family_and_serialization_version(family_id, serial_version);
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
156
151
  return var_opt_union<T,S,A>(max_k);
157
152
  }
158
153
 
159
- uint64_t items_seen;
160
- is.read((char*)&items_seen, sizeof(items_seen));
161
- double outer_tau_numer;
162
- is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
163
- uint64_t outer_tau_denom;
164
- is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
154
+ const auto items_seen = read<uint64_t>(is);
155
+ const auto outer_tau_numer = read<double>(is);
156
+ const auto outer_tau_denom = read<uint64_t>(is);
165
157
 
166
158
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
167
159
 
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
176
168
  ensure_minimum_memory(size, 8);
177
169
  const char* ptr = static_cast<const char*>(bytes);
178
170
  uint8_t preamble_longs;
179
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
171
+ ptr += copy_from_mem(ptr, preamble_longs);
180
172
  uint8_t serial_version;
181
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
173
+ ptr += copy_from_mem(ptr, serial_version);
182
174
  uint8_t family_id;
183
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
175
+ ptr += copy_from_mem(ptr, family_id);
184
176
  uint8_t flags;
185
- ptr += copy_from_mem(ptr, &flags, sizeof(flags));
177
+ ptr += copy_from_mem(ptr, flags);
186
178
  uint32_t max_k;
187
- ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
179
+ ptr += copy_from_mem(ptr, max_k);
188
180
 
189
181
  check_preamble_longs(preamble_longs, flags);
190
182
  check_family_and_serialization_version(family_id, serial_version);
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
200
192
  }
201
193
 
202
194
  uint64_t items_seen;
203
- ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
195
+ ptr += copy_from_mem(ptr, items_seen);
204
196
  double outer_tau_numer;
205
- ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
197
+ ptr += copy_from_mem(ptr, outer_tau_numer);
206
198
  uint64_t outer_tau_denom;
207
- ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
199
+ ptr += copy_from_mem(ptr, outer_tau_denom);
208
200
 
209
201
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
202
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
238
230
  flags = 0;
239
231
  }
240
232
 
241
- os.write((char*) &preamble_longs, sizeof(uint8_t));
242
- os.write((char*) &serialization_version, sizeof(uint8_t));
243
- os.write((char*) &family_id, sizeof(uint8_t));
244
- os.write((char*) &flags, sizeof(uint8_t));
245
- os.write((char*) &max_k_, sizeof(uint32_t));
233
+ write(os, preamble_longs);
234
+ write(os, serialization_version);
235
+ write(os, family_id);
236
+ write(os, flags);
237
+ write(os, max_k_);
246
238
 
247
239
  if (!empty) {
248
- os.write((char*) &n_, sizeof(uint64_t));
249
- os.write((char*) &outer_tau_numer_, sizeof(double));
250
- os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
240
+ write(os, n_);
241
+ write(os, outer_tau_numer_);
242
+ write(os, outer_tau_denom_);
251
243
  gadget_.serialize(os);
252
244
  }
253
245
  }
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
275
267
  }
276
268
 
277
269
  // first prelong
278
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
279
- ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
280
- ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
281
- ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
282
- ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
270
+ ptr += copy_to_mem(preamble_longs, ptr);
271
+ ptr += copy_to_mem(serialization_version, ptr);
272
+ ptr += copy_to_mem(family_id, ptr);
273
+ ptr += copy_to_mem(flags, ptr);
274
+ ptr += copy_to_mem(max_k_, ptr);
283
275
 
284
276
  if (!empty) {
285
- ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
286
- ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
287
- ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
277
+ ptr += copy_to_mem(n_, ptr);
278
+ ptr += copy_to_mem(outer_tau_numer_, ptr);
279
+ ptr += copy_to_mem(outer_tau_denom_, ptr);
288
280
 
289
281
  auto gadget_bytes = gadget_.serialize();
290
282
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
71
71
 
72
72
  TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
73
73
  REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
74
- REQUIRE_THROWS_AS(var_opt_sketch<int>(1 << 31), std::invalid_argument); // aka k < 0
74
+ REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
75
75
  }
76
76
 
77
77
  TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
216
216
  // which covers about 10 orders of magnitude
217
217
  double w = std::exp(5 * N(rand));
218
218
  input_sum += w;
219
- sk.update(i, w);
219
+ sk.update(static_cast<int>(i), w);
220
220
  }
221
221
 
222
222
  double output_sum = 0.0;
223
- for (auto& it : sk) { // std::pair<int, weight>
223
+ for (auto it : sk) { // std::pair<int, weight>
224
224
  output_sum += it.second;
225
225
  }
226
226
 
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
350
350
  // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
351
351
  // added k-1 heavy items, leaving only 1 item left in R
352
352
  for (uint32_t i = 1; i <= k; ++i) {
353
- sk.update(-i, k + (i * wt_scale));
353
+ sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
354
354
  }
355
355
 
356
356
  auto it = sk.begin();
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
442
442
  // finally, a non-degenerate predicate
443
443
  // insert negative items with identical weights, filter for negative weights only
444
444
  for (uint32_t i = 1; i <= (k + 1); ++i) {
445
- sk.update(static_cast<int32_t>(-i), 1.0 * i);
445
+ sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
446
446
  total_weight += 1.0 * i;
447
447
  }
448
448
 
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
147
147
 
148
148
  TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
149
149
  REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
150
- REQUIRE_THROWS_AS(var_opt_union<int>(1<<31), std::invalid_argument);
150
+ REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
151
151
  }
152
152
 
153
153
  TEST_CASE("varopt union: bad family", "[var_opt_union]") {
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
179
179
  }
180
180
 
181
181
  TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
182
- uint64_t n = 4; // 2n < k
182
+ int n = 4; // 2n < k
183
183
  uint32_t k = 10;
184
184
  var_opt_sketch<int> sk1(k), sk2(k);
185
185
 
186
- for (uint64_t i = 1; i <= n; ++i) {
187
- sk1.update(i, i);
188
- sk2.update(static_cast<int64_t>(-i), i);
186
+ for (int i = 1; i <= n; ++i) {
187
+ sk1.update(i, static_cast<double>(i));
188
+ sk2.update(-i, static_cast<double>(i));
189
189
  }
190
190
 
191
191
  var_opt_union<int> u(k);
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
193
193
  u.update(sk2);
194
194
 
195
195
  var_opt_sketch<int> result = u.get_result();
196
- REQUIRE(result.get_n() == 2 * n);
196
+ REQUIRE(result.get_n() == 2ULL * n);
197
197
  REQUIRE(result.get_k() == k);
198
198
  }
199
199
 
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
204
204
  uint32_t k2 = 5;
205
205
  var_opt_sketch<int64_t> sk1(k1), sk2(k2);
206
206
  for (uint64_t i = 1; i <= n1; ++i) {
207
- sk1.update(i, i);
207
+ sk1.update(i, static_cast<double>(i));
208
208
  }
209
209
 
210
210
  for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
211
- sk2.update(static_cast<int64_t>(-i), i + 1000.0);
211
+ sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
212
212
  }
213
- sk2.update(-n2, 1000000.0);
213
+ sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
214
214
 
215
215
  var_opt_union<int64_t> u(k1);
216
216
  u.update(sk1);
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
258
258
  uint64_t n2 = 64;
259
259
 
260
260
  var_opt_sketch<float> sk(k_small);
261
- for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
262
- sk.update(-1, n1 * n1); // add a heavy item
261
+ for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
262
+ sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
263
263
 
264
264
  var_opt_union<float> u(k_max);
265
265
  u.update(sk);
266
266
 
267
267
  // another one, but different n to get a different per-item weight
268
268
  var_opt_sketch<float> sk2(k_small);
269
- for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
269
+ for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
270
270
  u.update(sk2);
271
271
 
272
272
  // should trigger migrate_marked_items_by_decreasing_k()