datasketches 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -334,7 +334,7 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
|
334
334
|
num_bytes += (h_ / 8) + (h_ % 8 > 0);
|
|
335
335
|
}
|
|
336
336
|
// must iterate over the items
|
|
337
|
-
for (auto
|
|
337
|
+
for (auto it: *this)
|
|
338
338
|
num_bytes += S().size_of_item(it.first);
|
|
339
339
|
return num_bytes;
|
|
340
340
|
}
|
|
@@ -359,21 +359,21 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
|
359
359
|
// first prelong
|
|
360
360
|
uint8_t ser_ver(SER_VER);
|
|
361
361
|
uint8_t family(FAMILY_ID);
|
|
362
|
-
ptr += copy_to_mem(
|
|
363
|
-
ptr += copy_to_mem(
|
|
364
|
-
ptr += copy_to_mem(
|
|
365
|
-
ptr += copy_to_mem(
|
|
366
|
-
ptr += copy_to_mem(
|
|
362
|
+
ptr += copy_to_mem(first_byte, ptr);
|
|
363
|
+
ptr += copy_to_mem(ser_ver, ptr);
|
|
364
|
+
ptr += copy_to_mem(family, ptr);
|
|
365
|
+
ptr += copy_to_mem(flags, ptr);
|
|
366
|
+
ptr += copy_to_mem(k_, ptr);
|
|
367
367
|
|
|
368
368
|
if (!empty) {
|
|
369
369
|
// second and third prelongs
|
|
370
|
-
ptr += copy_to_mem(
|
|
371
|
-
ptr += copy_to_mem(
|
|
372
|
-
ptr += copy_to_mem(
|
|
370
|
+
ptr += copy_to_mem(n_, ptr);
|
|
371
|
+
ptr += copy_to_mem(h_, ptr);
|
|
372
|
+
ptr += copy_to_mem(r_, ptr);
|
|
373
373
|
|
|
374
374
|
// fourth prelong, if needed
|
|
375
375
|
if (r_ > 0) {
|
|
376
|
-
ptr += copy_to_mem(
|
|
376
|
+
ptr += copy_to_mem(total_wt_r_, ptr);
|
|
377
377
|
}
|
|
378
378
|
|
|
379
379
|
// first h_ weights
|
|
@@ -388,14 +388,14 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
|
388
388
|
}
|
|
389
389
|
|
|
390
390
|
if ((i & 0x7) == 0x7) {
|
|
391
|
-
ptr += copy_to_mem(
|
|
391
|
+
ptr += copy_to_mem(val, ptr);
|
|
392
392
|
val = 0;
|
|
393
393
|
}
|
|
394
394
|
}
|
|
395
395
|
|
|
396
396
|
// write out any remaining values
|
|
397
397
|
if ((h_ & 0x7) > 0) {
|
|
398
|
-
ptr += copy_to_mem(
|
|
398
|
+
ptr += copy_to_mem(val, ptr);
|
|
399
399
|
}
|
|
400
400
|
}
|
|
401
401
|
|
|
@@ -428,25 +428,25 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
|
|
428
428
|
// first prelong
|
|
429
429
|
const uint8_t ser_ver(SER_VER);
|
|
430
430
|
const uint8_t family(FAMILY_ID);
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
431
|
+
write(os, first_byte);
|
|
432
|
+
write(os, ser_ver);
|
|
433
|
+
write(os, family);
|
|
434
|
+
write(os, flags);
|
|
435
|
+
write(os, k_);
|
|
436
436
|
|
|
437
437
|
if (!empty) {
|
|
438
438
|
// second and third prelongs
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
439
|
+
write(os, n_);
|
|
440
|
+
write(os, h_);
|
|
441
|
+
write(os, r_);
|
|
442
442
|
|
|
443
443
|
// fourth prelong, if needed
|
|
444
444
|
if (r_ > 0) {
|
|
445
|
-
|
|
445
|
+
write(os, total_wt_r_);
|
|
446
446
|
}
|
|
447
447
|
|
|
448
448
|
// write the first h_ weights
|
|
449
|
-
|
|
449
|
+
write(os, weights_, h_ * sizeof(double));
|
|
450
450
|
|
|
451
451
|
// write the first h_ marks as packed bytes iff we have a gadget
|
|
452
452
|
if (marks_ != nullptr) {
|
|
@@ -457,14 +457,14 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
|
|
457
457
|
}
|
|
458
458
|
|
|
459
459
|
if ((i & 0x7) == 0x7) {
|
|
460
|
-
|
|
460
|
+
write(os, val);
|
|
461
461
|
val = 0;
|
|
462
462
|
}
|
|
463
463
|
}
|
|
464
464
|
|
|
465
465
|
// write out any remaining values
|
|
466
466
|
if ((h_ & 0x7) > 0) {
|
|
467
|
-
|
|
467
|
+
write(os, val);
|
|
468
468
|
}
|
|
469
469
|
}
|
|
470
470
|
|
|
@@ -481,17 +481,17 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
481
481
|
const char* base = ptr;
|
|
482
482
|
const char* end_ptr = ptr + size;
|
|
483
483
|
uint8_t first_byte;
|
|
484
|
-
ptr += copy_from_mem(ptr,
|
|
484
|
+
ptr += copy_from_mem(ptr, first_byte);
|
|
485
485
|
uint8_t preamble_longs = first_byte & 0x3f;
|
|
486
486
|
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
|
487
487
|
uint8_t serial_version;
|
|
488
|
-
ptr += copy_from_mem(ptr,
|
|
488
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
489
489
|
uint8_t family_id;
|
|
490
|
-
ptr += copy_from_mem(ptr,
|
|
490
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
491
491
|
uint8_t flags;
|
|
492
|
-
ptr += copy_from_mem(ptr,
|
|
492
|
+
ptr += copy_from_mem(ptr, flags);
|
|
493
493
|
uint32_t k;
|
|
494
|
-
ptr += copy_from_mem(ptr,
|
|
494
|
+
ptr += copy_from_mem(ptr, k);
|
|
495
495
|
|
|
496
496
|
check_preamble_longs(preamble_longs, flags);
|
|
497
497
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -507,16 +507,16 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
507
507
|
// second and third prelongs
|
|
508
508
|
uint64_t n;
|
|
509
509
|
uint32_t h, r;
|
|
510
|
-
ptr += copy_from_mem(ptr,
|
|
511
|
-
ptr += copy_from_mem(ptr,
|
|
512
|
-
ptr += copy_from_mem(ptr,
|
|
510
|
+
ptr += copy_from_mem(ptr, n);
|
|
511
|
+
ptr += copy_from_mem(ptr, h);
|
|
512
|
+
ptr += copy_from_mem(ptr, r);
|
|
513
513
|
|
|
514
514
|
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
|
515
515
|
|
|
516
516
|
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
|
517
517
|
double total_wt_r = 0.0;
|
|
518
518
|
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
|
519
|
-
ptr += copy_from_mem(ptr,
|
|
519
|
+
ptr += copy_from_mem(ptr, total_wt_r);
|
|
520
520
|
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
|
521
521
|
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
|
522
522
|
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
|
@@ -548,7 +548,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
548
548
|
check_memory_size(ptr - base + size_marks, size);
|
|
549
549
|
for (uint32_t i = 0; i < h; ++i) {
|
|
550
550
|
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
|
551
|
-
ptr += copy_from_mem(ptr,
|
|
551
|
+
ptr += copy_from_mem(ptr, val);
|
|
552
552
|
}
|
|
553
553
|
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
|
554
554
|
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
|
@@ -571,18 +571,13 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
571
571
|
|
|
572
572
|
template<typename T, typename S, typename A>
|
|
573
573
|
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
574
|
-
|
|
575
|
-
is.read((char*)&first_byte, sizeof(first_byte));
|
|
574
|
+
const auto first_byte = read<uint8_t>(is);
|
|
576
575
|
uint8_t preamble_longs = first_byte & 0x3f;
|
|
577
|
-
resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
uint8_t
|
|
581
|
-
|
|
582
|
-
uint8_t flags;
|
|
583
|
-
is.read((char*)&flags, sizeof(flags));
|
|
584
|
-
uint32_t k;
|
|
585
|
-
is.read((char*)&k, sizeof(k));
|
|
576
|
+
const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
|
577
|
+
const auto serial_version = read<uint8_t>(is);
|
|
578
|
+
const auto family_id = read<uint8_t>(is);
|
|
579
|
+
const auto flags = read<uint8_t>(is);
|
|
580
|
+
const auto k = read<uint32_t>(is);
|
|
586
581
|
|
|
587
582
|
check_preamble_longs(preamble_longs, flags);
|
|
588
583
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -598,31 +593,27 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
|
|
|
598
593
|
}
|
|
599
594
|
|
|
600
595
|
// second and third prelongs
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
is.read((char*)&h, sizeof(h));
|
|
605
|
-
is.read((char*)&r, sizeof(r));
|
|
596
|
+
const auto n = read<uint64_t>(is);
|
|
597
|
+
const auto h = read<uint32_t>(is);
|
|
598
|
+
const auto r = read<uint32_t>(is);
|
|
606
599
|
|
|
607
600
|
const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf);
|
|
608
601
|
|
|
609
602
|
// current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating
|
|
610
603
|
double total_wt_r = 0.0;
|
|
611
604
|
if (preamble_longs == PREAMBLE_LONGS_FULL) {
|
|
612
|
-
|
|
605
|
+
total_wt_r = read<double>(is);
|
|
613
606
|
if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) {
|
|
614
607
|
throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. "
|
|
615
608
|
"Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r));
|
|
616
609
|
}
|
|
617
|
-
} else {
|
|
618
|
-
total_wt_r = 0.0;
|
|
619
610
|
}
|
|
620
611
|
|
|
621
612
|
// read the first h weights, fill remainder with -1.0
|
|
622
613
|
std::unique_ptr<double, weights_deleter> weights(AllocDouble(allocator).allocate(array_size),
|
|
623
614
|
weights_deleter(array_size, allocator));
|
|
624
615
|
double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete
|
|
625
|
-
|
|
616
|
+
read(is, wts, h * sizeof(double));
|
|
626
617
|
for (size_t i = 0; i < h; ++i) {
|
|
627
618
|
if (!(wts[i] > 0.0)) {
|
|
628
619
|
throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i]));
|
|
@@ -638,7 +629,7 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
|
|
|
638
629
|
uint8_t val = 0;
|
|
639
630
|
for (uint32_t i = 0; i < h; ++i) {
|
|
640
631
|
if ((i & 0x7) == 0x0) { // should trigger on first iteration
|
|
641
|
-
|
|
632
|
+
val = read<uint8_t>(is);
|
|
642
633
|
}
|
|
643
634
|
marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1;
|
|
644
635
|
num_marks_in_h += (marks.get()[i] ? 1 : 0);
|
|
@@ -1420,7 +1411,7 @@ subset_summary var_opt_sketch<T, S, A>::estimate_subset_sum(P predicate) const {
|
|
|
1420
1411
|
if (effective_sampling_rate < 0.0 || effective_sampling_rate > 1.0)
|
|
1421
1412
|
throw std::logic_error("invalid sampling rate outside [0.0, 1.0]");
|
|
1422
1413
|
|
|
1423
|
-
|
|
1414
|
+
uint32_t r_true_count = 0;
|
|
1424
1415
|
++idx; // skip the gap
|
|
1425
1416
|
for (; idx < (k_ + 1); ++idx) {
|
|
1426
1417
|
if (predicate(data_[idx])) {
|
|
@@ -30,8 +30,8 @@ namespace datasketches {
|
|
|
30
30
|
template<typename T, typename S, typename A>
|
|
31
31
|
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k, const A& allocator) :
|
|
32
32
|
n_(0),
|
|
33
|
-
outer_tau_numer_(0),
|
|
34
|
-
outer_tau_denom_(0
|
|
33
|
+
outer_tau_numer_(0.0),
|
|
34
|
+
outer_tau_denom_(0),
|
|
35
35
|
max_k_(max_k),
|
|
36
36
|
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true, allocator)
|
|
37
37
|
{}
|
|
@@ -129,16 +129,11 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
|
129
129
|
|
|
130
130
|
template<typename T, typename S, typename A>
|
|
131
131
|
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
uint8_t
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
is.read((char*)&family_id, sizeof(family_id));
|
|
138
|
-
uint8_t flags;
|
|
139
|
-
is.read((char*)&flags, sizeof(flags));
|
|
140
|
-
uint32_t max_k;
|
|
141
|
-
is.read((char*)&max_k, sizeof(max_k));
|
|
132
|
+
const auto preamble_longs = read<uint8_t>(is);
|
|
133
|
+
const auto serial_version = read<uint8_t>(is);
|
|
134
|
+
const auto family_id = read<uint8_t>(is);
|
|
135
|
+
const auto flags = read<uint8_t>(is);
|
|
136
|
+
const auto max_k = read<uint32_t>(is);
|
|
142
137
|
|
|
143
138
|
check_preamble_longs(preamble_longs, flags);
|
|
144
139
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -156,12 +151,9 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
|
|
|
156
151
|
return var_opt_union<T,S,A>(max_k);
|
|
157
152
|
}
|
|
158
153
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
|
|
163
|
-
uint64_t outer_tau_denom;
|
|
164
|
-
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
|
154
|
+
const auto items_seen = read<uint64_t>(is);
|
|
155
|
+
const auto outer_tau_numer = read<double>(is);
|
|
156
|
+
const auto outer_tau_denom = read<uint64_t>(is);
|
|
165
157
|
|
|
166
158
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
|
167
159
|
|
|
@@ -176,15 +168,15 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
176
168
|
ensure_minimum_memory(size, 8);
|
|
177
169
|
const char* ptr = static_cast<const char*>(bytes);
|
|
178
170
|
uint8_t preamble_longs;
|
|
179
|
-
ptr += copy_from_mem(ptr,
|
|
171
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
|
180
172
|
uint8_t serial_version;
|
|
181
|
-
ptr += copy_from_mem(ptr,
|
|
173
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
182
174
|
uint8_t family_id;
|
|
183
|
-
ptr += copy_from_mem(ptr,
|
|
175
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
184
176
|
uint8_t flags;
|
|
185
|
-
ptr += copy_from_mem(ptr,
|
|
177
|
+
ptr += copy_from_mem(ptr, flags);
|
|
186
178
|
uint32_t max_k;
|
|
187
|
-
ptr += copy_from_mem(ptr,
|
|
179
|
+
ptr += copy_from_mem(ptr, max_k);
|
|
188
180
|
|
|
189
181
|
check_preamble_longs(preamble_longs, flags);
|
|
190
182
|
check_family_and_serialization_version(family_id, serial_version);
|
|
@@ -200,11 +192,11 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
200
192
|
}
|
|
201
193
|
|
|
202
194
|
uint64_t items_seen;
|
|
203
|
-
ptr += copy_from_mem(ptr,
|
|
195
|
+
ptr += copy_from_mem(ptr, items_seen);
|
|
204
196
|
double outer_tau_numer;
|
|
205
|
-
ptr += copy_from_mem(ptr,
|
|
197
|
+
ptr += copy_from_mem(ptr, outer_tau_numer);
|
|
206
198
|
uint64_t outer_tau_denom;
|
|
207
|
-
ptr += copy_from_mem(ptr,
|
|
199
|
+
ptr += copy_from_mem(ptr, outer_tau_denom);
|
|
208
200
|
|
|
209
201
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
|
210
202
|
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
|
@@ -238,16 +230,16 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
|
238
230
|
flags = 0;
|
|
239
231
|
}
|
|
240
232
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
233
|
+
write(os, preamble_longs);
|
|
234
|
+
write(os, serialization_version);
|
|
235
|
+
write(os, family_id);
|
|
236
|
+
write(os, flags);
|
|
237
|
+
write(os, max_k_);
|
|
246
238
|
|
|
247
239
|
if (!empty) {
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
240
|
+
write(os, n_);
|
|
241
|
+
write(os, outer_tau_numer_);
|
|
242
|
+
write(os, outer_tau_denom_);
|
|
251
243
|
gadget_.serialize(os);
|
|
252
244
|
}
|
|
253
245
|
}
|
|
@@ -275,16 +267,16 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
|
|
|
275
267
|
}
|
|
276
268
|
|
|
277
269
|
// first prelong
|
|
278
|
-
ptr += copy_to_mem(
|
|
279
|
-
ptr += copy_to_mem(
|
|
280
|
-
ptr += copy_to_mem(
|
|
281
|
-
ptr += copy_to_mem(
|
|
282
|
-
ptr += copy_to_mem(
|
|
270
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
271
|
+
ptr += copy_to_mem(serialization_version, ptr);
|
|
272
|
+
ptr += copy_to_mem(family_id, ptr);
|
|
273
|
+
ptr += copy_to_mem(flags, ptr);
|
|
274
|
+
ptr += copy_to_mem(max_k_, ptr);
|
|
283
275
|
|
|
284
276
|
if (!empty) {
|
|
285
|
-
ptr += copy_to_mem(
|
|
286
|
-
ptr += copy_to_mem(
|
|
287
|
-
ptr += copy_to_mem(
|
|
277
|
+
ptr += copy_to_mem(n_, ptr);
|
|
278
|
+
ptr += copy_to_mem(outer_tau_numer_, ptr);
|
|
279
|
+
ptr += copy_to_mem(outer_tau_denom_, ptr);
|
|
288
280
|
|
|
289
281
|
auto gadget_bytes = gadget_.serialize();
|
|
290
282
|
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
42
42
|
var_opt_sketch<int> sk(k);
|
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
|
44
|
-
sk.update(i, 1.0);
|
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
|
45
45
|
}
|
|
46
46
|
return sk;
|
|
47
47
|
}
|
|
@@ -71,7 +71,7 @@ static void check_if_equal(var_opt_sketch<T,S,A>& sk1, var_opt_sketch<T,S,A>& sk
|
|
|
71
71
|
|
|
72
72
|
TEST_CASE("varopt sketch: invalid k", "[var_opt_sketch]") {
|
|
73
73
|
REQUIRE_THROWS_AS(var_opt_sketch<int>(0), std::invalid_argument);
|
|
74
|
-
REQUIRE_THROWS_AS(var_opt_sketch<int>(
|
|
74
|
+
REQUIRE_THROWS_AS(var_opt_sketch<int>(1U << 31), std::invalid_argument); // aka k < 0
|
|
75
75
|
}
|
|
76
76
|
|
|
77
77
|
TEST_CASE("varopt sketch: bad serialization version", "[var_opt_sketch]") {
|
|
@@ -216,11 +216,11 @@ TEST_CASE("varopt sketch: cumulative weight", "[var_opt_sketch]") {
|
|
|
216
216
|
// which covers about 10 orders of magnitude
|
|
217
217
|
double w = std::exp(5 * N(rand));
|
|
218
218
|
input_sum += w;
|
|
219
|
-
sk.update(i, w);
|
|
219
|
+
sk.update(static_cast<int>(i), w);
|
|
220
220
|
}
|
|
221
221
|
|
|
222
222
|
double output_sum = 0.0;
|
|
223
|
-
for (auto
|
|
223
|
+
for (auto it : sk) { // std::pair<int, weight>
|
|
224
224
|
output_sum += it.second;
|
|
225
225
|
}
|
|
226
226
|
|
|
@@ -350,7 +350,7 @@ TEST_CASE("varopt sketch: pseudo-heavy update", "[var_opt_sketch]") {
|
|
|
350
350
|
// Last one should call update_pseudo_heavy_r_eq_1(), since we'll have
|
|
351
351
|
// added k-1 heavy items, leaving only 1 item left in R
|
|
352
352
|
for (uint32_t i = 1; i <= k; ++i) {
|
|
353
|
-
sk.update(-i, k + (i * wt_scale));
|
|
353
|
+
sk.update(-1 * static_cast<int>(i), k + (i * wt_scale));
|
|
354
354
|
}
|
|
355
355
|
|
|
356
356
|
auto it = sk.begin();
|
|
@@ -442,7 +442,7 @@ TEST_CASE("varopt sketch: estimate subset sum", "[var_opt_sketch]") {
|
|
|
442
442
|
// finally, a non-degenerate predicate
|
|
443
443
|
// insert negative items with identical weights, filter for negative weights only
|
|
444
444
|
for (uint32_t i = 1; i <= (k + 1); ++i) {
|
|
445
|
-
sk.update(static_cast<int32_t>(
|
|
445
|
+
sk.update(-1 * static_cast<int32_t>(i), static_cast<double>(i));
|
|
446
446
|
total_weight += 1.0 * i;
|
|
447
447
|
}
|
|
448
448
|
|
|
@@ -41,7 +41,7 @@ static constexpr double EPS = 1e-13;
|
|
|
41
41
|
static var_opt_sketch<int> create_unweighted_sketch(uint32_t k, uint64_t n) {
|
|
42
42
|
var_opt_sketch<int> sk(k);
|
|
43
43
|
for (uint64_t i = 0; i < n; ++i) {
|
|
44
|
-
sk.update(i, 1.0);
|
|
44
|
+
sk.update(static_cast<int>(i), 1.0);
|
|
45
45
|
}
|
|
46
46
|
return sk;
|
|
47
47
|
}
|
|
@@ -147,7 +147,7 @@ TEST_CASE("varopt union: bad serialization version", "[var_opt_union]") {
|
|
|
147
147
|
|
|
148
148
|
TEST_CASE("varopt union: invalid k", "[var_opt_union]") {
|
|
149
149
|
REQUIRE_THROWS_AS(var_opt_union<int>(0), std::invalid_argument);
|
|
150
|
-
REQUIRE_THROWS_AS(var_opt_union<int>(
|
|
150
|
+
REQUIRE_THROWS_AS(var_opt_union<int>(1U << 31), std::invalid_argument);
|
|
151
151
|
}
|
|
152
152
|
|
|
153
153
|
TEST_CASE("varopt union: bad family", "[var_opt_union]") {
|
|
@@ -179,13 +179,13 @@ TEST_CASE("varopt union: empty union", "[var_opt_union]") {
|
|
|
179
179
|
}
|
|
180
180
|
|
|
181
181
|
TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
|
182
|
-
|
|
182
|
+
int n = 4; // 2n < k
|
|
183
183
|
uint32_t k = 10;
|
|
184
184
|
var_opt_sketch<int> sk1(k), sk2(k);
|
|
185
185
|
|
|
186
|
-
for (
|
|
187
|
-
sk1.update(i, i);
|
|
188
|
-
sk2.update(static_cast<
|
|
186
|
+
for (int i = 1; i <= n; ++i) {
|
|
187
|
+
sk1.update(i, static_cast<double>(i));
|
|
188
|
+
sk2.update(-i, static_cast<double>(i));
|
|
189
189
|
}
|
|
190
190
|
|
|
191
191
|
var_opt_union<int> u(k);
|
|
@@ -193,7 +193,7 @@ TEST_CASE("varopt union: two exact sketches", "[var_opt_union]") {
|
|
|
193
193
|
u.update(sk2);
|
|
194
194
|
|
|
195
195
|
var_opt_sketch<int> result = u.get_result();
|
|
196
|
-
REQUIRE(result.get_n() ==
|
|
196
|
+
REQUIRE(result.get_n() == 2ULL * n);
|
|
197
197
|
REQUIRE(result.get_k() == k);
|
|
198
198
|
}
|
|
199
199
|
|
|
@@ -204,13 +204,13 @@ TEST_CASE("varopt union: heavy sampling sketch", "[var_opt_union]") {
|
|
|
204
204
|
uint32_t k2 = 5;
|
|
205
205
|
var_opt_sketch<int64_t> sk1(k1), sk2(k2);
|
|
206
206
|
for (uint64_t i = 1; i <= n1; ++i) {
|
|
207
|
-
sk1.update(i, i);
|
|
207
|
+
sk1.update(i, static_cast<double>(i));
|
|
208
208
|
}
|
|
209
209
|
|
|
210
210
|
for (uint64_t i = 1; i < n2; ++i) { // we'll add a very heavy one later
|
|
211
|
-
sk2.update(static_cast<int64_t>(
|
|
211
|
+
sk2.update(-1 * static_cast<int64_t>(i), i + 1000.0);
|
|
212
212
|
}
|
|
213
|
-
sk2.update(-n2, 1000000.0);
|
|
213
|
+
sk2.update(-1 * static_cast<int64_t>(n2), 1000000.0);
|
|
214
214
|
|
|
215
215
|
var_opt_union<int64_t> u(k1);
|
|
216
216
|
u.update(sk1);
|
|
@@ -258,15 +258,15 @@ TEST_CASE("varopt union: small sampling sketch", "[var_opt_union]") {
|
|
|
258
258
|
uint64_t n2 = 64;
|
|
259
259
|
|
|
260
260
|
var_opt_sketch<float> sk(k_small);
|
|
261
|
-
for (uint64_t i = 0; i < n1; ++i) { sk.update(i); }
|
|
262
|
-
sk.update(-1, n1 * n1); // add a heavy item
|
|
261
|
+
for (uint64_t i = 0; i < n1; ++i) { sk.update(static_cast<float>(i)); }
|
|
262
|
+
sk.update(-1.0f, static_cast<double>(n1 * n1)); // add a heavy item
|
|
263
263
|
|
|
264
264
|
var_opt_union<float> u(k_max);
|
|
265
265
|
u.update(sk);
|
|
266
266
|
|
|
267
267
|
// another one, but different n to get a different per-item weight
|
|
268
268
|
var_opt_sketch<float> sk2(k_small);
|
|
269
|
-
for (uint64_t i = 0; i < n2; ++i) { sk2.update(i); }
|
|
269
|
+
for (uint64_t i = 0; i < n2; ++i) { sk2.update(static_cast<float>(i)); }
|
|
270
270
|
u.update(sk2);
|
|
271
271
|
|
|
272
272
|
// should trigger migrate_marked_items_by_decreasing_k()
|