datasketches 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +17 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +254 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +595 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +447 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +9 -8
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +13 -3
@@ -24,11 +24,11 @@
|
|
24
24
|
#include <vector>
|
25
25
|
#include <stdexcept>
|
26
26
|
|
27
|
-
#include "serde.hpp"
|
28
27
|
#include "binomial_bounds.hpp"
|
29
28
|
#include "theta_helpers.hpp"
|
30
29
|
#include "count_zeros.hpp"
|
31
30
|
#include "bit_packing.hpp"
|
31
|
+
#include "memory_operations.hpp"
|
32
32
|
|
33
33
|
namespace datasketches {
|
34
34
|
|
@@ -341,6 +341,39 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
341
341
|
template<typename A>
|
342
342
|
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
343
343
|
|
344
|
+
template<typename A>
|
345
|
+
uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
|
346
|
+
if (compressed) {
|
347
|
+
return this->is_estimation_mode() ? 2 : 1;
|
348
|
+
}
|
349
|
+
return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
350
|
+
}
|
351
|
+
|
352
|
+
template<typename A>
|
353
|
+
size_t compact_theta_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
354
|
+
return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
|
355
|
+
}
|
356
|
+
|
357
|
+
template<typename A>
|
358
|
+
size_t compact_theta_sketch_alloc<A>::get_serialized_size_bytes(bool compressed) const {
|
359
|
+
if (compressed && is_suitable_for_compression()) {
|
360
|
+
return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
|
361
|
+
}
|
362
|
+
return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
|
363
|
+
}
|
364
|
+
|
365
|
+
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
366
|
+
template<typename A>
|
367
|
+
uint8_t compact_theta_sketch_alloc<A>::get_num_entries_bytes() const {
|
368
|
+
return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
369
|
+
}
|
370
|
+
|
371
|
+
template<typename A>
|
372
|
+
size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
|
373
|
+
const size_t compressed_bits = entry_bits * entries_.size();
|
374
|
+
return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
|
375
|
+
}
|
376
|
+
|
344
377
|
template<typename A>
|
345
378
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
346
379
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
@@ -366,12 +399,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
366
399
|
|
367
400
|
template<typename A>
|
368
401
|
auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
369
|
-
const
|
370
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
371
|
-
+ sizeof(uint64_t) * entries_.size();
|
402
|
+
const size_t size = get_serialized_size_bytes() + header_size_bytes;
|
372
403
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
373
404
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
374
|
-
|
405
|
+
const uint8_t preamble_longs = get_preamble_longs(false);
|
375
406
|
*ptr++ = preamble_longs;
|
376
407
|
*ptr++ = UNCOMPRESSED_SERIAL_VERSION;
|
377
408
|
*ptr++ = SKETCH_TYPE;
|
@@ -413,7 +444,7 @@ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
|
|
413
444
|
}
|
414
445
|
|
415
446
|
template<typename A>
|
416
|
-
uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
447
|
+
uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
|
417
448
|
// compression is based on leading zeros in deltas between ordered hash values
|
418
449
|
// assumes ordered sketch
|
419
450
|
uint64_t previous = 0;
|
@@ -423,16 +454,14 @@ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
|
423
454
|
ored |= delta;
|
424
455
|
previous = entry;
|
425
456
|
}
|
426
|
-
return count_leading_zeros_in_u64(ored);
|
457
|
+
return 64 - count_leading_zeros_in_u64(ored);
|
427
458
|
}
|
428
459
|
|
429
460
|
template<typename A>
|
430
461
|
void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
|
431
462
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
432
|
-
const uint8_t entry_bits =
|
433
|
-
|
434
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
435
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
463
|
+
const uint8_t entry_bits = compute_entry_bits();
|
464
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
436
465
|
|
437
466
|
write(os, preamble_longs);
|
438
467
|
write(os, COMPRESSED_SERIAL_VERSION);
|
@@ -477,25 +506,20 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
|
|
477
506
|
previous = entries_[i];
|
478
507
|
offset = pack_bits(delta, entry_bits, ptr, offset);
|
479
508
|
}
|
509
|
+
if (offset > 0) ++ptr;
|
480
510
|
write(os, buffer.data(), ptr - buffer.data());
|
481
511
|
}
|
482
512
|
}
|
483
513
|
|
484
514
|
template<typename A>
|
485
515
|
auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
|
486
|
-
const uint8_t
|
487
|
-
const uint8_t
|
488
|
-
const size_t
|
489
|
-
|
490
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
491
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
492
|
-
|
493
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
|
494
|
-
+ whole_bytes_to_hold_bits(compressed_bits);
|
516
|
+
const uint8_t entry_bits = compute_entry_bits();
|
517
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
518
|
+
const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
|
495
519
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
496
520
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
497
521
|
|
498
|
-
*ptr++ =
|
522
|
+
*ptr++ = get_preamble_longs(true);
|
499
523
|
*ptr++ = COMPRESSED_SERIAL_VERSION;
|
500
524
|
*ptr++ = SKETCH_TYPE;
|
501
525
|
*ptr++ = entry_bits;
|
@@ -30,29 +30,30 @@ static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
|
|
30
30
|
|
31
31
|
TEST_CASE("pack unpack bits") {
|
32
32
|
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
33
|
+
int n = 8;
|
33
34
|
const uint64_t mask = (1ULL << bits) - 1;
|
34
|
-
std::vector<uint64_t> input(
|
35
|
+
std::vector<uint64_t> input(n, 0);
|
35
36
|
const uint64_t igolden64 = IGOLDEN64;
|
36
37
|
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
37
|
-
for (int i = 0; i <
|
38
|
+
for (int i = 0; i < n; ++i) {
|
38
39
|
input[i] = value & mask;
|
39
40
|
value += igolden64;
|
40
41
|
}
|
41
|
-
std::vector<uint8_t> bytes(
|
42
|
+
std::vector<uint8_t> bytes(n * sizeof(uint64_t), 0);
|
42
43
|
uint8_t offset = 0;
|
43
44
|
uint8_t* ptr = bytes.data();
|
44
|
-
for (int i = 0; i <
|
45
|
+
for (int i = 0; i < n; ++i) {
|
45
46
|
offset = pack_bits(input[i], bits, ptr, offset);
|
46
47
|
}
|
47
48
|
|
48
|
-
std::vector<uint64_t> output(
|
49
|
+
std::vector<uint64_t> output(n, 0);
|
49
50
|
offset = 0;
|
50
51
|
const uint8_t* cptr = bytes.data();
|
51
|
-
for (int i = 0; i <
|
52
|
+
for (int i = 0; i < n; ++i) {
|
52
53
|
offset = unpack_bits(output[i], bits, cptr, offset);
|
53
54
|
}
|
54
|
-
for (int i = 0; i <
|
55
|
-
REQUIRE(
|
55
|
+
for (int i = 0; i < n; ++i) {
|
56
|
+
REQUIRE(input[i] == output[i]);
|
56
57
|
}
|
57
58
|
}
|
58
59
|
}
|
@@ -45,6 +45,23 @@ TEST_CASE("theta sketch", "[serde_compat]") {
|
|
45
45
|
}
|
46
46
|
}
|
47
47
|
|
48
|
+
TEST_CASE("theta sketch compressed", "[serde_compat]") {
|
49
|
+
const unsigned n_arr[] = {10, 100, 1000, 10000, 100000, 1000000};
|
50
|
+
for (const unsigned n: n_arr) {
|
51
|
+
std::ifstream is;
|
52
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
53
|
+
is.open(testBinaryInputPath + "theta_compressed_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
54
|
+
const auto sketch = compact_theta_sketch::deserialize(is);
|
55
|
+
REQUIRE(sketch.is_estimation_mode() == (n > 1000));
|
56
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
57
|
+
for (const auto hash: sketch) {
|
58
|
+
REQUIRE(hash < sketch.get_theta64());
|
59
|
+
}
|
60
|
+
REQUIRE(sketch.is_ordered());
|
61
|
+
REQUIRE(std::is_sorted(sketch.begin(), sketch.end()));
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
48
65
|
TEST_CASE("theta sketch non-empty no entries", "[serde_compat]") {
|
49
66
|
std::ifstream is;
|
50
67
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
@@ -43,7 +43,7 @@ TEST_CASE("theta sketch generate compressed", "[serialize_for_java]") {
|
|
43
43
|
REQUIRE_FALSE(sketch.is_empty());
|
44
44
|
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
45
45
|
std::ofstream os("theta_compressed_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
46
|
-
sketch.compact().
|
46
|
+
sketch.compact().serialize_compressed(os);
|
47
47
|
}
|
48
48
|
}
|
49
49
|
|
@@ -273,9 +273,11 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
|
|
273
273
|
for (int i = 0; i < n; i++) update_sketch.update(i);
|
274
274
|
|
275
275
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
276
|
-
update_sketch.compact()
|
277
|
-
|
276
|
+
auto compact_sketch = update_sketch.compact();
|
277
|
+
compact_sketch.serialize(s);
|
278
|
+
auto bytes = compact_sketch.serialize();
|
278
279
|
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
|
280
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes());
|
279
281
|
for (size_t i = 0; i < bytes.size(); ++i) {
|
280
282
|
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
|
281
283
|
}
|
@@ -515,12 +517,54 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
|
|
515
517
|
}
|
516
518
|
}
|
517
519
|
|
520
|
+
TEST_CASE("theta sketch: serialize deserialize small compressed", "[theta_sketch]") {
|
521
|
+
auto update_sketch = update_theta_sketch::builder().build();
|
522
|
+
for (int i = 0; i < 10; i++) update_sketch.update(i);
|
523
|
+
auto compact_sketch = update_sketch.compact();
|
524
|
+
|
525
|
+
auto bytes = compact_sketch.serialize_compressed();
|
526
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
527
|
+
{ // deserialize bytes
|
528
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
529
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
530
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
531
|
+
auto iter = deserialized_sketch.begin();
|
532
|
+
for (const auto key: compact_sketch) {
|
533
|
+
REQUIRE(*iter == key);
|
534
|
+
++iter;
|
535
|
+
}
|
536
|
+
}
|
537
|
+
{ // wrap bytes
|
538
|
+
auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
|
539
|
+
REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
540
|
+
REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
|
541
|
+
auto iter = wrapped_sketch.begin();
|
542
|
+
for (const auto key: compact_sketch) {
|
543
|
+
REQUIRE(*iter == key);
|
544
|
+
++iter;
|
545
|
+
}
|
546
|
+
}
|
547
|
+
|
548
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
549
|
+
compact_sketch.serialize_compressed(s);
|
550
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
551
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
552
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
553
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
554
|
+
auto iter = deserialized_sketch.begin();
|
555
|
+
for (const auto key: compact_sketch) {
|
556
|
+
REQUIRE(*iter == key);
|
557
|
+
++iter;
|
558
|
+
}
|
559
|
+
}
|
560
|
+
|
518
561
|
TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
519
562
|
auto update_sketch = update_theta_sketch::builder().build();
|
520
563
|
for (int i = 0; i < 10000; i++) update_sketch.update(i);
|
521
564
|
auto compact_sketch = update_sketch.compact();
|
522
565
|
|
523
566
|
auto bytes = compact_sketch.serialize_compressed();
|
567
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
524
568
|
{ // deserialize bytes
|
525
569
|
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
526
570
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
@@ -544,6 +588,7 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
544
588
|
|
545
589
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
546
590
|
compact_sketch.serialize_compressed(s);
|
591
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
547
592
|
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
548
593
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
549
594
|
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
@@ -554,4 +599,30 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
554
599
|
}
|
555
600
|
}
|
556
601
|
|
602
|
+
// The sketch reaches capacity for the first time at 2 * K * 15/16,
|
603
|
+
// but at that point it is still in exact mode, so the serialized size is not the maximum
|
604
|
+
// (theta is not serialized in the exact mode).
|
605
|
+
// So we need to catch the second time, but some updates will be ignored in the estimation mode,
|
606
|
+
// so we update more than enough times keeping track of the maximum.
|
607
|
+
// Potentially the exact number of updates to reach the peak can be figured out given this particular sequence,
|
608
|
+
// but not assuming that might be even better (say, in case we change the load factor or hash function
|
609
|
+
// or just out of principle not to rely on implementation details too much).
|
610
|
+
TEST_CASE("max serialized size", "[theta_sketch]") {
|
611
|
+
const uint8_t lg_k = 10;
|
612
|
+
auto sketch = update_theta_sketch::builder().set_lg_k(lg_k).build();
|
613
|
+
int value = 0;
|
614
|
+
|
615
|
+
// this will go over the first peak, which is not the highest
|
616
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) sketch.update(value++);
|
617
|
+
|
618
|
+
// this will go over the second peak keeping track of the max size
|
619
|
+
size_t max_size_bytes = 0;
|
620
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) {
|
621
|
+
sketch.update(value++);
|
622
|
+
auto bytes = sketch.compact().serialize();
|
623
|
+
max_size_bytes = std::max(max_size_bytes, bytes.size());
|
624
|
+
}
|
625
|
+
REQUIRE(max_size_bytes == compact_theta_sketch::get_max_serialized_size_bytes(lg_k));
|
626
|
+
}
|
627
|
+
|
557
628
|
} /* namespace datasketches */
|
@@ -381,6 +381,15 @@ public:
|
|
381
381
|
*/
|
382
382
|
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
|
383
383
|
|
384
|
+
/**
|
385
|
+
* Produces a Compact Tuple sketch from this sketch
|
386
|
+
* by applying a given predicate to each entry.
|
387
|
+
* @param predicate should return true for the entries to keep
|
388
|
+
* @return compact sketch with the entries retained according to the predicate
|
389
|
+
*/
|
390
|
+
template<typename Predicate>
|
391
|
+
compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
|
392
|
+
|
384
393
|
virtual iterator begin();
|
385
394
|
virtual iterator end();
|
386
395
|
virtual const_iterator begin() const;
|
@@ -480,6 +489,25 @@ public:
|
|
480
489
|
virtual uint32_t get_num_retained() const;
|
481
490
|
virtual uint16_t get_seed_hash() const;
|
482
491
|
|
492
|
+
/**
|
493
|
+
* Produces a Compact Tuple sketch from this sketch
|
494
|
+
* by applying a given predicate to each entry.
|
495
|
+
* @param predicate should return true for the entries to keep
|
496
|
+
* @return compact sketch with the entries retained according to the predicate
|
497
|
+
*/
|
498
|
+
template<typename Predicate>
|
499
|
+
compact_tuple_sketch filter(const Predicate& predicate) const;
|
500
|
+
|
501
|
+
/**
|
502
|
+
* Produces a Compact Tuple sketch from a given sketch (Update or Compact)
|
503
|
+
* by applying a given predicate to each entry.
|
504
|
+
* @param sketch input sketch
|
505
|
+
* @param predicate should return true for the entries to keep
|
506
|
+
* @return compact sketch with the entries retained according to the predicate
|
507
|
+
*/
|
508
|
+
template<typename Sketch, typename Predicate>
|
509
|
+
static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
|
510
|
+
|
483
511
|
/**
|
484
512
|
* This method serializes the sketch into a given stream in a binary form
|
485
513
|
* @param os output stream
|
@@ -579,7 +607,6 @@ protected:
|
|
579
607
|
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
|
580
608
|
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
|
581
609
|
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
|
582
|
-
|
583
610
|
};
|
584
611
|
|
585
612
|
/// Tuple base builder
|
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
|
|
258
258
|
return compact_tuple_sketch<S, A>(*this, ordered);
|
259
259
|
}
|
260
260
|
|
261
|
+
template<typename S, typename U, typename P, typename A>
|
262
|
+
template<typename Predicate>
|
263
|
+
compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
|
264
|
+
return compact_tuple_sketch<S, A>::filter(*this, predicate);
|
265
|
+
}
|
266
|
+
|
261
267
|
template<typename S, typename U, typename P, typename A>
|
262
268
|
void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
|
263
269
|
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
|
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
|
|
344
350
|
return seed_hash_;
|
345
351
|
}
|
346
352
|
|
353
|
+
template<typename S, typename A>
|
354
|
+
template<typename Predicate>
|
355
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
|
356
|
+
return filter(*this, predicate);
|
357
|
+
}
|
358
|
+
|
359
|
+
template<typename S, typename A>
|
360
|
+
template<typename Sketch, typename Predicate>
|
361
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
|
362
|
+
std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
|
363
|
+
entries.reserve(sketch.get_num_retained());
|
364
|
+
std::copy_if(
|
365
|
+
sketch.begin(),
|
366
|
+
sketch.end(),
|
367
|
+
std::back_inserter(entries),
|
368
|
+
[&predicate](const Entry& e) {return predicate(e.second);}
|
369
|
+
);
|
370
|
+
entries.shrink_to_fit();
|
371
|
+
return compact_tuple_sketch(
|
372
|
+
!sketch.is_estimation_mode() && entries.empty(),
|
373
|
+
sketch.is_ordered(),
|
374
|
+
sketch.get_seed_hash(),
|
375
|
+
sketch.get_theta64(),
|
376
|
+
std::move(entries)
|
377
|
+
);
|
378
|
+
}
|
379
|
+
|
347
380
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
348
381
|
template<typename S, typename A>
|
349
382
|
template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
|
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
|
|
310
310
|
REQUIRE(sketch.get_num_retained() == 3);
|
311
311
|
}
|
312
312
|
|
313
|
+
TEST_CASE("filter", "[tuple_sketch]") {
|
314
|
+
auto usk = update_tuple_sketch<int>::builder().build();
|
315
|
+
|
316
|
+
{ // empty update sketch
|
317
|
+
auto sk = usk.filter([](int){return true;});
|
318
|
+
REQUIRE(sk.is_empty());
|
319
|
+
REQUIRE(sk.is_ordered());
|
320
|
+
REQUIRE(sk.get_num_retained() == 0);
|
321
|
+
}
|
322
|
+
|
323
|
+
{ // empty compact sketch
|
324
|
+
auto sk = usk.compact().filter([](int){return true;});
|
325
|
+
REQUIRE(sk.is_empty());
|
326
|
+
REQUIRE(sk.is_ordered());
|
327
|
+
REQUIRE(sk.get_num_retained() == 0);
|
328
|
+
}
|
329
|
+
|
330
|
+
usk.update(1, 1);
|
331
|
+
usk.update(1, 1);
|
332
|
+
usk.update(2, 1);
|
333
|
+
usk.update(2, 1);
|
334
|
+
usk.update(3, 1);
|
335
|
+
|
336
|
+
{ // exact mode update sketch
|
337
|
+
auto sk = usk.filter([](int v){return v > 1;});
|
338
|
+
REQUIRE_FALSE(sk.is_empty());
|
339
|
+
REQUIRE_FALSE(sk.is_ordered());
|
340
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
341
|
+
REQUIRE(sk.get_num_retained() == 2);
|
342
|
+
}
|
343
|
+
|
344
|
+
{ // exact mode compact sketch
|
345
|
+
auto sk = usk.compact().filter([](int v){return v > 1;});
|
346
|
+
REQUIRE_FALSE(sk.is_empty());
|
347
|
+
REQUIRE(sk.is_ordered());
|
348
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
349
|
+
REQUIRE(sk.get_num_retained() == 2);
|
350
|
+
}
|
351
|
+
|
352
|
+
// only keys 1 and 2 had values of 2, which will become 3 after this update
|
353
|
+
// some entries are discarded in estimation mode, but these happen to survive
|
354
|
+
// the process is deterministic, so the test will always work
|
355
|
+
for (int i = 0; i < 10000; ++i) usk.update(i, 1);
|
356
|
+
|
357
|
+
{ // estimation mode update sketch
|
358
|
+
auto sk = usk.filter([](int v){return v > 2;});
|
359
|
+
REQUIRE_FALSE(sk.is_empty());
|
360
|
+
REQUIRE_FALSE(sk.is_ordered());
|
361
|
+
REQUIRE(sk.is_estimation_mode());
|
362
|
+
REQUIRE(sk.get_num_retained() == 2);
|
363
|
+
}
|
364
|
+
|
365
|
+
{ // estimation mode compact sketch
|
366
|
+
auto sk = usk.compact().filter([](int v){return v > 2;});
|
367
|
+
REQUIRE_FALSE(sk.is_empty());
|
368
|
+
REQUIRE(sk.is_ordered());
|
369
|
+
REQUIRE(sk.is_estimation_mode());
|
370
|
+
REQUIRE(sk.get_num_retained() == 2);
|
371
|
+
}
|
372
|
+
}
|
373
|
+
|
313
374
|
} /* namespace datasketches */
|
@@ -1 +1 @@
|
|
1
|
-
5.0
|
1
|
+
5.1.0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -231,6 +231,16 @@ files:
|
|
231
231
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp
|
232
232
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp
|
233
233
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp
|
234
|
+
- vendor/datasketches-cpp/tdigest/CMakeLists.txt
|
235
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest.hpp
|
236
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp
|
237
|
+
- vendor/datasketches-cpp/tdigest/test/CMakeLists.txt
|
238
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp
|
239
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp
|
240
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk
|
241
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk
|
242
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp
|
243
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp
|
234
244
|
- vendor/datasketches-cpp/theta/CMakeLists.txt
|
235
245
|
- vendor/datasketches-cpp/theta/include/bit_packing.hpp
|
236
246
|
- vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
|
@@ -324,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
324
334
|
- !ruby/object:Gem::Version
|
325
335
|
version: '0'
|
326
336
|
requirements: []
|
327
|
-
rubygems_version: 3.5.
|
337
|
+
rubygems_version: 3.5.11
|
328
338
|
signing_key:
|
329
339
|
specification_version: 4
|
330
340
|
summary: Sketch data structures for Ruby
|