datasketches 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
@@ -334,7 +334,7 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
334
334
|
num_bytes += (h_ / 8) + (h_ % 8 > 0);
|
335
335
|
}
|
336
336
|
// must iterate over the items
|
337
|
-
for (auto
|
337
|
+
for (auto it: *this)
|
338
338
|
num_bytes += S().size_of_item(it.first);
|
339
339
|
return num_bytes;
|
340
340
|
}
|
@@ -359,21 +359,21 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
359
359
|
// first prelong
|
360
360
|
uint8_t ser_ver(SER_VER);
|
361
361
|
uint8_t family(FAMILY_ID);
|
362
|
-
ptr += copy_to_mem(
|
363
|
-
ptr += copy_to_mem(
|
364
|
-
ptr += copy_to_mem(
|
365
|
-
ptr += copy_to_mem(
|
366
|
-
ptr += copy_to_mem(
|
362
|
+
ptr += copy_to_mem(first_byte, ptr);
|
363
|
+
ptr += copy_to_mem(ser_ver, ptr);
|
364
|
+
ptr += copy_to_mem(family, ptr);
|
365
|
+
ptr += copy_to_mem(flags, ptr);
|
366
|
+
ptr += copy_to_mem(k_, ptr);
|
367
367
|
|
368
368
|
if (!empty) {
|
369
369
|
// second and third prelongs
|
370
|
-
ptr += copy_to_mem(
|
371
|
-
ptr += copy_to_mem(
|
372
|
-
ptr += copy_to_mem(
|
370
|
+
ptr += copy_to_mem(n_, ptr);
|
371
|
+
ptr += copy_to_mem(h_, ptr);
|
372
|
+
ptr += copy_to_mem(r_, ptr);
|
373
373
|
|
374
374
|
// fourth prelong, if needed
|
375
375
|
if (r_ > 0) {
|
376
|
-
ptr += copy_to_mem(
|
376
|
+
ptr += copy_to_mem(total_wt_r_, ptr);
|
377
377
|
}
|
378
378
|
|
379
379
|
// first h_ weights
|
@@ -388,14 +388,14 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
388
388
|
}
|
389
389
|
|
390
390
|
if ((i & 0x7) == 0x7) {
|
391
|
-
ptr += copy_to_mem(
|
391
|
+
ptr += copy_to_mem(val, ptr);
|
392
392
|
val = 0;
|
393
393
|
}
|
394
394
|
}
|
395
395
|
|
396
396
|
// write out any remaining values
|
397
397
|
if ((h_ & 0x7) > 0) {
|
398
|
-
ptr += copy_to_mem(
|
398
|
+
ptr += copy_to_mem(val, ptr);
|
399
399
|
}
|
400
400
|
}
|
401
401
|
|
@@ -428,25 +428,25 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
|
428
428
|
// first prelong
|
429
429
|
const uint8_t ser_ver(SER_VER);
|
430
430
|
const uint8_t family(FAMILY_ID);
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
431
|
+
write(os, first_byte);
|
432
|
+
write(os, ser_ver);
|
433
|
+
write(os, family);
|
434
|
+
write(os, flags);
|
435
|
+
write(os, k_);
|
436
436
|
|
437
437
|
if (!empty) {
|
438
438
|
// second and third prelongs
|
439
|
-
|
440
|
-
|
441
|
-
|
439
|
+
write(os, n_);
|
440
|
+
write(os, h_);
|
441
|
+
write(os, r_);
|
442
442
|
|
443
443
|
// fourth prelong, if needed
|
444
444
|
if (r_ > 0) {
|
445
|
-
|
445
|
+
write(os, total_wt_r_);
|
446
446
|
}
|
447
447
|
|
448
448
|
// write the first h_ weights
|
449
|
-
|
449
|
+
write(os, weights_, h_ * sizeof(double));
|
450
450
|
|
451
451
|
// write the first h_ marks as packed bytes iff we have a gadget
|
452
452
|
if (marks_ != nullptr) {
|
@@ -457,14 +457,14 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
|
457
457
|
}
|
458
458
|
|
459
459
|
if ((i & 0x7) == 0x7) {
|
460
|
-
|
460
|
+
write(os, val);
|
461
461
|
val = 0;
|
462
462
|
}
|
463
463
|
}
|
464
464
|
|
465
465
|
// write out any remaining values
|
466
466
|
if ((h_ & 0x7) > 0) {
|
467
|
-
|
467
|
+
write(os, val);
|
468
468
|
}
|
469
469
|
}
|
470
470
|
|
@@ -481,17 +481,17 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
481
481
|
const char* base = ptr;
|
482
482
|
const char* end_ptr = ptr + size;
|
483
483
|
uint8_t first_byte;
|
484
|
-
ptr += copy_from_mem(ptr,
|
484
|
+
ptr += copy_from_mem(ptr, first_byte);
|
485
485
|
uint8_t preamble_longs = first_byte & 0x3f;
|
486
486
|
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
487
487
|
uint8_t serial_version;
|
488
|
-
ptr += copy_from_mem(ptr,
|
488
|
+
ptr += copy_from_mem(ptr, serial_version);
|
489
489
|
uint8_t family_id;
|
490
|
-
ptr += copy_from_mem(ptr,
|
490
|
+
ptr += copy_from_mem(ptr, family_id);
|
491
491
|
uint8_t flags;
|
492
|
-
ptr += copy_from_mem(ptr,
|
492
|
+
ptr += copy_from_mem(ptr, flags);
|
493
493
|
uint32_t k;
|
494
|
-
ptr += copy_from_mem(ptr,
|
494
|
+
ptr += copy_from_mem(ptr, k);
|
495
495
|
|
496
496
|
check_preamble_longs(preamble_longs, flags);
|
497
497
|
check_family_and_serialization_version(family_id, serial_version);
|
@@ -507,16 +507,16 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
507
507
|
// second and third prelongs
|
508
508
|
uint64_t n;
|
509
509
|
uint32_t h, r;
|
510
|
-
ptr += copy_from_mem(ptr,
|
511
|
-
ptr += copy_from_mem(ptr,
|
512
|
-
ptr += copy_from_mem(ptr,
|
510
|
+
ptr += copy_from_mem(ptr, n);
|
511
|
+
ptr += copy_from_mem(ptr, h);
|
512
|
+
ptr += copy_from_mem(ptr, r);
|
513
513
|
|
514
514
|
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
515
515
|
|
516
516
|
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
517
517
|
double total_wt_r = 0.0;
|
518
518
|
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
519
|
-
ptr += copy_from_mem(ptr,
|
519
|
+
ptr += copy_from_mem(ptr, total_wt_r);
|
520
520
|
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
521
521
|
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
522
522
|
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
@@ -548,7 +548,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
548
548
|
check_memory_size(ptr - base + size_marks, size);
|
549
549
|
for (uint32_t i = 0; i < h; ++i) {
|
550
550
|
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
551
|
-
ptr += copy_from_mem(ptr,
|
551
|
+
ptr += copy_from_mem(ptr, val);
|
552
552
|
}
|
553
553
|
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
554
554
|
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
@@ -571,18 +571,13 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
571
571
|
|
572
572
|
template<typename T, typename S, typename A>
|
573
573
|
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
574
|
-
|
575
|
-
is.read((char*)&first_byte, sizeof(first_byte));
|
574
|
+
const auto first_byte = read<uint8_t>(is);
|
576
575
|
uint8_t preamble_longs = first_byte & 0x3f;
|
577
|
-
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
578
|
-
|
579
|
-
|
580
|
-
uint8_t
|
581
|
-
|
582
|
-
uint8_t flags;
|
583
|
-
is.read((char*)&flags, sizeof(flags));
|
584
|
-
uint32_t k;
|
585
|
-
is.read((char*)&k, sizeof(k));
|
576
|
+
const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
577
|
+
const auto serial_version = read<uint8_t>(is);
|
578
|
+
const auto family_id = read<uint8_t>(is);
|
579
|
+
const auto flags = read<uint8_t>(is);
|
580
|
+
const auto k = read<uint32_t>(is);
|
586
581
|
|
587
582
|
check_preamble_longs(preamble_longs, flags);
|
588
583
|
check_family_and_serialization_version(family_id, serial_version);
|
@@ -598,31 +593,27 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
|
|
598
593
|
}
|
599
594
|
|
600
595
|
// second and third prelongs
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
is.read((char*)&h, sizeof(h));
|
605
|
-
is.read((char*)&r, sizeof(r));
|
596
|
+
const auto n = read<uint64_t>(is);
|
597
|
+
const auto h = read<uint32_t>(is);
|
598
|
+
const auto r = read<uint32_t>(is);
|
606
599
|
|
607
600
|
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
608
601
|
|
609
602
|
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
610
603
|
double total_wt_r = 0.0;
|
611
604
|
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
612
|
-
|
605
|
+
total_wt_r = read<double>(is);
|
613
606
|
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
614
607
|
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
615
608
|
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
616
609
|
}
|
617
|
-
} else {
|
618
|
-
total_wt_r = 0.0;
|
619
610
|
}
|
620
611
|
|
621
612
|
// read the first h weights, fill remainder with -1.0
|
622
613
|
std::unique_ptr<double, weights_deleter> weights(AllocDouble(allocator).allocate(array_size),
|
623
614
|
weights_deleter(array_size, allocator));
|
624
615
|
double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
|
625
|
-
|
616
|
+
read(is, wts, h * sizeof(double));
|
626
617
|
for (size_t i = 0; i < h; ++i) {
|
627
618
|
if (!(wts[i] > 0.0)) {
|
628
619
|
throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
|
@@ -638,7 +629,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
|
|
638
629
|
uint8_t val = 0;
|
639
630
|
for (uint32_t i = 0; i < h; ++i) {
|
640
631
|
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
641
|
-
|
632
|
+
val = read<uint8_t>(is);
|
642
633
|
}
|
643
634
|
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
644
635
|
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
@@ -1420,7 +1411,7 @@ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
|
|
1420
1411
|
if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
|
1421
1412
|
throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
|
1422
1413
|
|
1423
|
-
|
1414
|
+
uint32_t r_true_count = 0;
|
1424
1415
|
++idx; // skip the gap
|
1425
1416
|
for (; idx < (k_ + 1); ++idx) {
|
1426
1417
|
if (predicate(data_[idx])) {
|
@@ -30,8 +30,8 @@ namespace datasketches {
|
|
30
30
|
template<typename T, typename S, typename A>
|
31
31
|
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
32
32
|
n_(0),
|
33
|
-
outer_tau_numer_(0),
|
34
|
-
outer_tau_denom_(0
|
33
|
+
outer_tau_numer_(0.0),
|
34
|
+
outer_tau_denom_(0),
|
35
35
|
max_k_(max_k),
|
36
36
|
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
37
37
|
{}
|
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
129
129
|
|
130
130
|
template<typename T, typename S, typename A>
|
131
131
|
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
132
|
-
|
133
|
-
|
134
|
-
uint8_t
|
135
|
-
|
136
|
-
|
137
|
-
is.read((char*)&family_id, sizeof(family_id));
|
138
|
-
uint8_t flags;
|
139
|
-
is.read((char*)&flags, sizeof(flags));
|
140
|
-
uint32_t max_k;
|
141
|
-
is.read((char*)&max_k, sizeof(max_k));
|
132
|
+
const auto preamble_longs = read<uint8_t>(is);
|
133
|
+
const auto serial_version = read<uint8_t>(is);
|
134
|
+
const auto family_id = read<uint8_t>(is);
|
135
|
+
const auto flags = read<uint8_t>(is);
|
136
|
+
const auto max_k = read<uint32_t>(is);
|
142
137
|
|
143
138
|
check_preamble_longs(preamble_longs, flags);
|
144
139
|
check_family_and_serialization_version(family_id, serial_version);
|
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
|
|
156
151
|
return var_opt_union<T,S,A>(max_k);
|
157
152
|
}
|
158
153
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
|
163
|
-
uint64_t outer_tau_denom;
|
164
|
-
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
154
|
+
const auto items_seen = read<uint64_t>(is);
|
155
|
+
const auto outer_tau_numer = read<double>(is);
|
156
|
+
const auto outer_tau_denom = read<uint64_t>(is);
|
165
157
|
|
166
158
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
167
159
|
|
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
176
168
|
ensure_minimum_memory(size, 8);
|
177
169
|
const char* ptr = static_cast<const char*>(bytes);
|
178
170
|
uint8_t preamble_longs;
|
179
|
-
ptr += copy_from_mem(ptr,
|
171
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
180
172
|
uint8_t serial_version;
|
181
|
-
ptr += copy_from_mem(ptr,
|
173
|
+
ptr += copy_from_mem(ptr, serial_version);
|
182
174
|
uint8_t family_id;
|
183
|
-
ptr += copy_from_mem(ptr,
|
175
|
+
ptr += copy_from_mem(ptr, family_id);
|
184
176
|
uint8_t flags;
|
185
|
-
ptr += copy_from_mem(ptr,
|
177
|
+
ptr += copy_from_mem(ptr, flags);
|
186
178
|
uint32_t max_k;
|
187
|
-
ptr += copy_from_mem(ptr,
|
179
|
+
ptr += copy_from_mem(ptr, max_k);
|
188
180
|
|
189
181
|
check_preamble_longs(preamble_longs, flags);
|
190
182
|
check_family_and_serialization_version(family_id, serial_version);
|
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
200
192
|
}
|
201
193
|
|
202
194
|
uint64_t items_seen;
|
203
|
-
ptr += copy_from_mem(ptr,
|
195
|
+
ptr += copy_from_mem(ptr, items_seen);
|
204
196
|
double outer_tau_numer;
|
205
|
-
ptr += copy_from_mem(ptr,
|
197
|
+
ptr += copy_from_mem(ptr, outer_tau_numer);
|
206
198
|
uint64_t outer_tau_denom;
|
207
|
-
ptr += copy_from_mem(ptr,
|
199
|
+
ptr += copy_from_mem(ptr, outer_tau_denom);
|
208
200
|
|
209
201
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
210
202
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
238
230
|
flags = 0;
|
239
231
|
}
|
240
232
|
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
233
|
+
write(os, preamble_longs);
|
234
|
+
write(os, serialization_version);
|
235
|
+
write(os, family_id);
|
236
|
+
write(os, flags);
|
237
|
+
write(os, max_k_);
|
246
238
|
|
247
239
|
if (!empty) {
|
248
|
-
|
249
|
-
|
250
|
-
|
240
|
+
write(os, n_);
|
241
|
+
write(os, outer_tau_numer_);
|
242
|
+
write(os, outer_tau_denom_);
|
251
243
|
gadget_.serialize(os);
|
252
244
|
}
|
253
245
|
}
|
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
|
|
275
267
|
}
|
276
268
|
|
277
269
|
// first prelong
|
278
|
-
ptr += copy_to_mem(
|
279
|
-
ptr += copy_to_mem(
|
280
|
-
ptr += copy_to_mem(
|
281
|
-
ptr += copy_to_mem(
|
282
|
-
ptr += copy_to_mem(
|
270
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
271
|
+
ptr += copy_to_mem(serialization_version, ptr);
|
272
|
+
ptr += copy_to_mem(family_id, ptr);
|
273
|
+
ptr += copy_to_mem(flags, ptr);
|
274
|
+
ptr += copy_to_mem(max_k_, ptr);
|
283
275
|
|
284
276
|
if (!empty) {
|
285
|
-
ptr += copy_to_mem(
|
286
|
-
ptr += copy_to_mem(
|
287
|
-
ptr += copy_to_mem(
|
277
|
+
ptr += copy_to_mem(n_, ptr);
|
278
|
+
ptr += copy_to_mem(outer_tau_numer_, ptr);
|
279
|
+
ptr += copy_to_mem(outer_tau_denom_, ptr);
|
288
280
|
|
289
281
|
auto gadget_bytes = gadget_.serialize();
|
290
282
|
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
42
42
|
var_opt_sketch<int> sk(k);
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
44
|
-
sk.update(i, 1.0);
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
45
45
|
}
|
46
46
|
return sk;
|
47
47
|
}
|
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
|
|
71
71
|
|
72
72
|
TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
|
73
73
|
REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
|
74
|
-
REQUIRE_THROWS_AS(var_opt_sketch<int>(
|
74
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
|
75
75
|
}
|
76
76
|
|
77
77
|
TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
|
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
216
216
|
// which covers about 10 orders of magnitude
|
217
217
|
double w = std::exp(5 * N(rand));
|
218
218
|
input_sum += w;
|
219
|
-
sk.update(i, w);
|
219
|
+
sk.update(static_cast<int>(i), w);
|
220
220
|
}
|
221
221
|
|
222
222
|
double output_sum = 0.0;
|
223
|
-
for (auto
|
223
|
+
for (auto it : sk) { // std::pair<int, weight>
|
224
224
|
output_sum += it.second;
|
225
225
|
}
|
226
226
|
|
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
|
|
350
350
|
// Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
|
351
351
|
// added k-1 heavy items, leaving only 1 item left in R
|
352
352
|
for (uint32_t i = 1; i <= k; ++i) {
|
353
|
-
sk.update(-i, k + (i * wt_scale));
|
353
|
+
sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
|
354
354
|
}
|
355
355
|
|
356
356
|
auto it = sk.begin();
|
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
|
|
442
442
|
// finally, a non-degenerate predicate
|
443
443
|
// insert negative items with identical weights, filter for negative weights only
|
444
444
|
for (uint32_t i = 1; i <= (k + 1); ++i) {
|
445
|
-
sk.update(static_cast<int32_t>(
|
445
|
+
sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
|
446
446
|
total_weight += 1.0 * i;
|
447
447
|
}
|
448
448
|
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
42
42
|
var_opt_sketch<int> sk(k);
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
44
|
-
sk.update(i, 1.0);
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
45
45
|
}
|
46
46
|
return sk;
|
47
47
|
}
|
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
|
|
147
147
|
|
148
148
|
TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
|
149
149
|
REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
|
150
|
-
REQUIRE_THROWS_AS(var_opt_union<int>(
|
150
|
+
REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
|
151
151
|
}
|
152
152
|
|
153
153
|
TEST_CASE("varopt union: bad family", "[var_opt_union]") {
|
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
|
|
179
179
|
}
|
180
180
|
|
181
181
|
TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
182
|
-
|
182
|
+
int n = 4; // 2n < k
|
183
183
|
uint32_t k = 10;
|
184
184
|
var_opt_sketch<int> sk1(k), sk2(k);
|
185
185
|
|
186
|
-
for (
|
187
|
-
sk1.update(i, i);
|
188
|
-
sk2.update(static_cast<
|
186
|
+
for (int i = 1; i <= n; ++i) {
|
187
|
+
sk1.update(i, static_cast<double>(i));
|
188
|
+
sk2.update(-i, static_cast<double>(i));
|
189
189
|
}
|
190
190
|
|
191
191
|
var_opt_union<int> u(k);
|
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
|
193
193
|
u.update(sk2);
|
194
194
|
|
195
195
|
var_opt_sketch<int> result = u.get_result();
|
196
|
-
REQUIRE(result.get_n() ==
|
196
|
+
REQUIRE(result.get_n() == 2ULL * n);
|
197
197
|
REQUIRE(result.get_k() == k);
|
198
198
|
}
|
199
199
|
|
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
|
|
204
204
|
uint32_t k2 = 5;
|
205
205
|
var_opt_sketch<int64_t> sk1(k1), sk2(k2);
|
206
206
|
for (uint64_t i = 1; i <= n1; ++i) {
|
207
|
-
sk1.update(i, i);
|
207
|
+
sk1.update(i, static_cast<double>(i));
|
208
208
|
}
|
209
209
|
|
210
210
|
for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
|
211
|
-
sk2.update(static_cast<int64_t>(
|
211
|
+
sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
|
212
212
|
}
|
213
|
-
sk2.update(-n2, 1000000.0);
|
213
|
+
sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
|
214
214
|
|
215
215
|
var_opt_union<int64_t> u(k1);
|
216
216
|
u.update(sk1);
|
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
|
|
258
258
|
uint64_t n2 = 64;
|
259
259
|
|
260
260
|
var_opt_sketch<float> sk(k_small);
|
261
|
-
for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
|
262
|
-
sk.update(-1, n1 * n1); // add a heavy item
|
261
|
+
for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
|
262
|
+
sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
|
263
263
|
|
264
264
|
var_opt_union<float> u(k_max);
|
265
265
|
u.update(sk);
|
266
266
|
|
267
267
|
// another one, but different n to get a different per-item weight
|
268
268
|
var_opt_sketch<float> sk2(k_small);
|
269
|
-
for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
|
269
|
+
for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
|
270
270
|
u.update(sk2);
|
271
271
|
|
272
272
|
// should trigger migrate_marked_items_by_decreasing_k()
|