datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -334,7 +334,7 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
334
334
  num_bytes += (h_ / 8) + (h_ % 8 > 0);
335
335
  }
336
336
  // must iterate over the items
337
- for (auto& it: *this)
337
+ for (auto it: *this)
338
338
  num_bytes += S().size_of_item(it.first);
339
339
  return num_bytes;
340
340
  }
@@ -359,21 +359,21 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
359
359
  // first prelong
360
360
  uint8_t ser_ver(SER_VER);
361
361
  uint8_t family(FAMILY_ID);
362
- ptr += copy_to_mem(&first_byte, ptr, sizeof(uint8_t));
363
- ptr += copy_to_mem(&ser_ver, ptr, sizeof(uint8_t));
364
- ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
365
- ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
366
- ptr += copy_to_mem(&k_, ptr, sizeof(uint32_t));
362
+ ptr += copy_to_mem(first_byte, ptr);
363
+ ptr += copy_to_mem(ser_ver, ptr);
364
+ ptr += copy_to_mem(family, ptr);
365
+ ptr += copy_to_mem(flags, ptr);
366
+ ptr += copy_to_mem(k_, ptr);
367
367
 
368
368
  if (!empty) {
369
369
  // second and third prelongs
370
- ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
371
- ptr += copy_to_mem(&h_, ptr, sizeof(uint32_t));
372
- ptr += copy_to_mem(&r_, ptr, sizeof(uint32_t));
370
+ ptr += copy_to_mem(n_, ptr);
371
+ ptr += copy_to_mem(h_, ptr);
372
+ ptr += copy_to_mem(r_, ptr);
373
373
 
374
374
  // fourth prelong, if needed
375
375
  if (r_ > 0) {
376
- ptr += copy_to_mem(&total_wt_r_, ptr, sizeof(double));
376
+ ptr += copy_to_mem(total_wt_r_, ptr);
377
377
  }
378
378
 
379
379
  // first h_ weights
@@ -388,14 +388,14 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
388
388
  }
389
389
 
390
390
  if ((i & 0x7) == 0x7) {
391
- ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
391
+ ptr += copy_to_mem(val, ptr);
392
392
  val = 0;
393
393
  }
394
394
  }
395
395
 
396
396
  // write out any remaining values
397
397
  if ((h_ & 0x7) > 0) {
398
- ptr += copy_to_mem(&val, ptr, sizeof(uint8_t));
398
+ ptr += copy_to_mem(val, ptr);
399
399
  }
400
400
  }
401
401
 
@@ -428,25 +428,25 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
428
428
  // first prelong
429
429
  const uint8_t ser_ver(SER_VER);
430
430
  const uint8_t family(FAMILY_ID);
431
- os.write((char*)&first_byte, sizeof(uint8_t));
432
- os.write((char*)&ser_ver, sizeof(uint8_t));
433
- os.write((char*)&family, sizeof(uint8_t));
434
- os.write((char*)&flags, sizeof(uint8_t));
435
- os.write((char*)&k_, sizeof(uint32_t));
431
+ write(os, first_byte);
432
+ write(os, ser_ver);
433
+ write(os, family);
434
+ write(os, flags);
435
+ write(os, k_);
436
436
 
437
437
  if (!empty) {
438
438
  // second and third prelongs
439
- os.write((char*)&n_, sizeof(uint64_t));
440
- os.write((char*)&h_, sizeof(uint32_t));
441
- os.write((char*)&r_, sizeof(uint32_t));
439
+ write(os, n_);
440
+ write(os, h_);
441
+ write(os, r_);
442
442
 
443
443
  // fourth prelong, if needed
444
444
  if (r_ > 0) {
445
- os.write((char*)&total_wt_r_, sizeof(double));
445
+ write(os, total_wt_r_);
446
446
  }
447
447
 
448
448
  // write the first h_ weights
449
- os.write((char*)weights_, h_ * sizeof(double));
449
+ write(os, weights_, h_ * sizeof(double));
450
450
 
451
451
  // write the first h_ marks as packed bytes iff we have a gadget
452
452
  if (marks_ != nullptr) {
@@ -457,14 +457,14 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
457
457
  }
458
458
 
459
459
  if ((i & 0x7) == 0x7) {
460
- os.write((char*)&val, sizeof(uint8_t));
460
+ write(os, val);
461
461
  val = 0;
462
462
  }
463
463
  }
464
464
 
465
465
  // write out any remaining values
466
466
  if ((h_ & 0x7) > 0) {
467
- os.write((char*)&val, sizeof(uint8_t));
467
+ write(os, val);
468
468
  }
469
469
  }
470
470
 
@@ -481,17 +481,17 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
481
481
  const char* base = ptr;
482
482
  const char* end_ptr = ptr + size;
483
483
  uint8_t first_byte;
484
- ptr += copy_from_mem(ptr, &first_byte, sizeof(first_byte));
484
+ ptr += copy_from_mem(ptr, first_byte);
485
485
  uint8_t preamble_longs = first_byte & 0x3f;
486
486
  resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
487
487
  uint8_t serial_version;
488
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
488
+ ptr += copy_from_mem(ptr, serial_version);
489
489
  uint8_t family_id;
490
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
490
+ ptr += copy_from_mem(ptr, family_id);
491
491
  uint8_t flags;
492
- ptr += copy_from_mem(ptr, &flags, sizeof(flags));
492
+ ptr += copy_from_mem(ptr, flags);
493
493
  uint32_t k;
494
- ptr += copy_from_mem(ptr, &k, sizeof(k));
494
+ ptr += copy_from_mem(ptr, k);
495
495
 
496
496
  check_preamble_longs(preamble_longs, flags);
497
497
  check_family_and_serialization_version(family_id, serial_version);
@@ -507,16 +507,16 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
507
507
  // second and third prelongs
508
508
  uint64_t n;
509
509
  uint32_t h, r;
510
- ptr += copy_from_mem(ptr, &n, sizeof(n));
511
- ptr += copy_from_mem(ptr, &h, sizeof(h));
512
- ptr += copy_from_mem(ptr, &r, sizeof(r));
510
+ ptr += copy_from_mem(ptr, n);
511
+ ptr += copy_from_mem(ptr, h);
512
+ ptr += copy_from_mem(ptr, r);
513
513
 
514
514
  const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
515
515
 
516
516
  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
517
517
  double total_wt_r = 0.0;
518
518
  if (preamble_longs == PREAMBLE_LONGS_FULL) {
519
- ptr += copy_from_mem(ptr, &total_wt_r, sizeof(total_wt_r));
519
+ ptr += copy_from_mem(ptr, total_wt_r);
520
520
  if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
521
521
  throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
522
522
  "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
@@ -548,7 +548,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
548
548
  check_memory_size(ptr - base + size_marks, size);
549
549
  for (uint32_t i = 0; i < h; ++i) {
550
550
  if ((i & 0x7) == 0x0) { // should trigger on first iteration
551
- ptr += copy_from_mem(ptr, &val, sizeof(val));
551
+ ptr += copy_from_mem(ptr, val);
552
552
  }
553
553
  marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
554
554
  num_marks_in_h += (marks.get()[i] ? 1 : 0);
@@ -571,18 +571,13 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
571
571
 
572
572
  template<typename T, typename S, typename A>
573
573
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
574
- uint8_t first_byte;
575
- is.read((char*)&first_byte, sizeof(first_byte));
574
+ const auto first_byte = read<uint8_t>(is);
576
575
  uint8_t preamble_longs = first_byte & 0x3f;
577
- resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
578
- uint8_t serial_version;
579
- is.read((char*)&serial_version, sizeof(serial_version));
580
- uint8_t family_id;
581
- is.read((char*)&family_id, sizeof(family_id));
582
- uint8_t flags;
583
- is.read((char*)&flags, sizeof(flags));
584
- uint32_t k;
585
- is.read((char*)&k, sizeof(k));
576
+ const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
577
+ const auto serial_version = read<uint8_t>(is);
578
+ const auto family_id = read<uint8_t>(is);
579
+ const auto flags = read<uint8_t>(is);
580
+ const auto k = read<uint32_t>(is);
586
581
 
587
582
  check_preamble_longs(preamble_longs, flags);
588
583
  check_family_and_serialization_version(family_id, serial_version);
@@ -598,31 +593,27 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
598
593
  }
599
594
 
600
595
  // second and third prelongs
601
- uint64_t n;
602
- uint32_t h, r;
603
- is.read((char*)&n, sizeof(n));
604
- is.read((char*)&h, sizeof(h));
605
- is.read((char*)&r, sizeof(r));
596
+ const auto n = read<uint64_t>(is);
597
+ const auto h = read<uint32_t>(is);
598
+ const auto r = read<uint32_t>(is);
606
599
 
607
600
  const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
608
601
 
609
602
  // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
610
603
  double total_wt_r = 0.0;
611
604
  if (preamble_longs == PREAMBLE_LONGS_FULL) {
612
- is.read((char*)&total_wt_r, sizeof(total_wt_r));
605
+ total_wt_r = read<double>(is);
613
606
  if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
614
607
  throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
615
608
  "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
616
609
  }
617
- } else {
618
- total_wt_r = 0.0;
619
610
  }
620
611
 
621
612
  // read the first h weights, fill remainder with -1.0
622
613
  std::unique_ptr<double, weights_deleter> weights(AllocDouble(allocator).allocate(array_size),
623
614
  weights_deleter(array_size, allocator));
624
615
  double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
625
- is.read((char*)wts, h * sizeof(double));
616
+ read(is, wts, h * sizeof(double));
626
617
  for (size_t i = 0; i < h; ++i) {
627
618
  if (!(wts[i] > 0.0)) {
628
619
  throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
@@ -638,7 +629,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
638
629
  uint8_t val = 0;
639
630
  for (uint32_t i = 0; i < h; ++i) {
640
631
  if ((i & 0x7) == 0x0) { // should trigger on first iteration
641
- is.read((char*)&val, sizeof(val));
632
+ val = read<uint8_t>(is);
642
633
  }
643
634
  marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
644
635
  num_marks_in_h += (marks.get()[i] ? 1 : 0);
@@ -1420,7 +1411,7 @@ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
1420
1411
  if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
1421
1412
  throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
1422
1413
 
1423
- size_t r_true_count = 0;
1414
+ uint32_t r_true_count = 0;
1424
1415
  ++idx; // skip the gap
1425
1416
  for (; idx < (k_ + 1); ++idx) {
1426
1417
  if (predicate(data_[idx])) {
@@ -30,8 +30,8 @@ namespace datasketches {
30
30
  template<typename T, typename S, typename A>
31
31
  var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
32
32
  n_(0),
33
- outer_tau_numer_(0),
34
- outer_tau_denom_(0.0),
33
+ outer_tau_numer_(0.0),
34
+ outer_tau_denom_(0),
35
35
  max_k_(max_k),
36
36
  gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
37
37
  {}
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
129
 
130
130
  template<typename T, typename S, typename A>
131
131
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
132
- uint8_t preamble_longs;
133
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
134
- uint8_t serial_version;
135
- is.read((char*)&serial_version, sizeof(serial_version));
136
- uint8_t family_id;
137
- is.read((char*)&family_id, sizeof(family_id));
138
- uint8_t flags;
139
- is.read((char*)&flags, sizeof(flags));
140
- uint32_t max_k;
141
- is.read((char*)&max_k, sizeof(max_k));
132
+ const auto preamble_longs = read<uint8_t>(is);
133
+ const auto serial_version = read<uint8_t>(is);
134
+ const auto family_id = read<uint8_t>(is);
135
+ const auto flags = read<uint8_t>(is);
136
+ const auto max_k = read<uint32_t>(is);
142
137
 
143
138
  check_preamble_longs(preamble_longs, flags);
144
139
  check_family_and_serialization_version(family_id, serial_version);
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
156
151
  return var_opt_union<T,S,A>(max_k);
157
152
  }
158
153
 
159
- uint64_t items_seen;
160
- is.read((char*)&items_seen, sizeof(items_seen));
161
- double outer_tau_numer;
162
- is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
163
- uint64_t outer_tau_denom;
164
- is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
154
+ const auto items_seen = read<uint64_t>(is);
155
+ const auto outer_tau_numer = read<double>(is);
156
+ const auto outer_tau_denom = read<uint64_t>(is);
165
157
 
166
158
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
167
159
 
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
176
168
  ensure_minimum_memory(size, 8);
177
169
  const char* ptr = static_cast<const char*>(bytes);
178
170
  uint8_t preamble_longs;
179
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
171
+ ptr += copy_from_mem(ptr, preamble_longs);
180
172
  uint8_t serial_version;
181
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
173
+ ptr += copy_from_mem(ptr, serial_version);
182
174
  uint8_t family_id;
183
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
175
+ ptr += copy_from_mem(ptr, family_id);
184
176
  uint8_t flags;
185
- ptr += copy_from_mem(ptr, &flags, sizeof(flags));
177
+ ptr += copy_from_mem(ptr, flags);
186
178
  uint32_t max_k;
187
- ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
179
+ ptr += copy_from_mem(ptr, max_k);
188
180
 
189
181
  check_preamble_longs(preamble_longs, flags);
190
182
  check_family_and_serialization_version(family_id, serial_version);
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
200
192
  }
201
193
 
202
194
  uint64_t items_seen;
203
- ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
195
+ ptr += copy_from_mem(ptr, items_seen);
204
196
  double outer_tau_numer;
205
- ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
197
+ ptr += copy_from_mem(ptr, outer_tau_numer);
206
198
  uint64_t outer_tau_denom;
207
- ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
199
+ ptr += copy_from_mem(ptr, outer_tau_denom);
208
200
 
209
201
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
210
202
  var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
238
230
  flags = 0;
239
231
  }
240
232
 
241
- os.write((char*) &preamble_longs, sizeof(uint8_t));
242
- os.write((char*) &serialization_version, sizeof(uint8_t));
243
- os.write((char*) &family_id, sizeof(uint8_t));
244
- os.write((char*) &flags, sizeof(uint8_t));
245
- os.write((char*) &max_k_, sizeof(uint32_t));
233
+ write(os, preamble_longs);
234
+ write(os, serialization_version);
235
+ write(os, family_id);
236
+ write(os, flags);
237
+ write(os, max_k_);
246
238
 
247
239
  if (!empty) {
248
- os.write((char*) &n_, sizeof(uint64_t));
249
- os.write((char*) &outer_tau_numer_, sizeof(double));
250
- os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
240
+ write(os, n_);
241
+ write(os, outer_tau_numer_);
242
+ write(os, outer_tau_denom_);
251
243
  gadget_.serialize(os);
252
244
  }
253
245
  }
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
275
267
  }
276
268
 
277
269
  // first prelong
278
- ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
279
- ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
280
- ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
281
- ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
282
- ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
270
+ ptr += copy_to_mem(preamble_longs, ptr);
271
+ ptr += copy_to_mem(serialization_version, ptr);
272
+ ptr += copy_to_mem(family_id, ptr);
273
+ ptr += copy_to_mem(flags, ptr);
274
+ ptr += copy_to_mem(max_k_, ptr);
283
275
 
284
276
  if (!empty) {
285
- ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
286
- ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
287
- ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
277
+ ptr += copy_to_mem(n_, ptr);
278
+ ptr += copy_to_mem(outer_tau_numer_, ptr);
279
+ ptr += copy_to_mem(outer_tau_denom_, ptr);
288
280
 
289
281
  auto gadget_bytes = gadget_.serialize();
290
282
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
71
71
 
72
72
  TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
73
73
  REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
74
- REQUIRE_THROWS_AS(var_opt_sketch<int>(1 << 31), std::invalid_argument); // aka k < 0
74
+ REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
75
75
  }
76
76
 
77
77
  TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
216
216
  // which covers about 10 orders of magnitude
217
217
  double w = std::exp(5 * N(rand));
218
218
  input_sum += w;
219
- sk.update(i, w);
219
+ sk.update(static_cast<int>(i), w);
220
220
  }
221
221
 
222
222
  double output_sum = 0.0;
223
- for (auto& it : sk) { // std::pair<int, weight>
223
+ for (auto it : sk) { // std::pair<int, weight>
224
224
  output_sum += it.second;
225
225
  }
226
226
 
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
350
350
  // Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
351
351
  // added k-1 heavy items, leaving only 1 item left in R
352
352
  for (uint32_t i = 1; i <= k; ++i) {
353
- sk.update(-i, k + (i * wt_scale));
353
+ sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
354
354
  }
355
355
 
356
356
  auto it = sk.begin();
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
442
442
  // finally, a non-degenerate predicate
443
443
  // insert negative items with identical weights, filter for negative weights only
444
444
  for (uint32_t i = 1; i <= (k + 1); ++i) {
445
- sk.update(static_cast<int32_t>(-i), 1.0 * i);
445
+ sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
446
446
  total_weight += 1.0 * i;
447
447
  }
448
448
 
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
41
41
  static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
42
42
  var_opt_sketch<int> sk(k);
43
43
  for (uint64_t i = 0; i < n; ++i) {
44
- sk.update(i, 1.0);
44
+ sk.update(static_cast<int>(i), 1.0);
45
45
  }
46
46
  return sk;
47
47
  }
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
147
147
 
148
148
  TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
149
149
  REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
150
- REQUIRE_THROWS_AS(var_opt_union<int>(1<<31), std::invalid_argument);
150
+ REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
151
151
  }
152
152
 
153
153
  TEST_CASE("varopt union: bad family", "[var_opt_union]") {
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
179
179
  }
180
180
 
181
181
  TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
182
- uint64_t n = 4; // 2n < k
182
+ int n = 4; // 2n < k
183
183
  uint32_t k = 10;
184
184
  var_opt_sketch<int> sk1(k), sk2(k);
185
185
 
186
- for (uint64_t i = 1; i <= n; ++i) {
187
- sk1.update(i, i);
188
- sk2.update(static_cast<int64_t>(-i), i);
186
+ for (int i = 1; i <= n; ++i) {
187
+ sk1.update(i, static_cast<double>(i));
188
+ sk2.update(-i, static_cast<double>(i));
189
189
  }
190
190
 
191
191
  var_opt_union<int> u(k);
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
193
193
  u.update(sk2);
194
194
 
195
195
  var_opt_sketch<int> result = u.get_result();
196
- REQUIRE(result.get_n() == 2 * n);
196
+ REQUIRE(result.get_n() == 2ULL * n);
197
197
  REQUIRE(result.get_k() == k);
198
198
  }
199
199
 
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
204
204
  uint32_t k2 = 5;
205
205
  var_opt_sketch<int64_t> sk1(k1), sk2(k2);
206
206
  for (uint64_t i = 1; i <= n1; ++i) {
207
- sk1.update(i, i);
207
+ sk1.update(i, static_cast<double>(i));
208
208
  }
209
209
 
210
210
  for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
211
- sk2.update(static_cast<int64_t>(-i), i + 1000.0);
211
+ sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
212
212
  }
213
- sk2.update(-n2, 1000000.0);
213
+ sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
214
214
 
215
215
  var_opt_union<int64_t> u(k1);
216
216
  u.update(sk1);
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
258
258
  uint64_t n2 = 64;
259
259
 
260
260
  var_opt_sketch<float> sk(k_small);
261
- for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
262
- sk.update(-1, n1 * n1); // add a heavy item
261
+ for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
262
+ sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
263
263
 
264
264
  var_opt_union<float> u(k_max);
265
265
  u.update(sk);
266
266
 
267
267
  // another one, but different n to get a different per-item weight
268
268
  var_opt_sketch<float> sk2(k_small);
269
- for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
269
+ for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
270
270
  u.update(sk2);
271
271
 
272
272
  // should trigger migrate_marked_items_by_decreasing_k()