datasketches 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
@@ -417,6 +417,20 @@ public:
|
|
417
417
|
virtual uint32_t get_num_retained() const;
|
418
418
|
virtual uint16_t get_seed_hash() const;
|
419
419
|
|
420
|
+
/**
|
421
|
+
* Computes maximum serialized size in bytes
|
422
|
+
* @param lg_k nominal number of entries in the sketch
|
423
|
+
*/
|
424
|
+
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
425
|
+
|
426
|
+
/**
|
427
|
+
* Computes size in bytes required to serialize the current state of the sketch.
|
428
|
+
* Computing compressed size is expensive. It requires iterating over all retained hashes,
|
429
|
+
* and the actual serialization will have to look at them again.
|
430
|
+
* @param compressed if true compressed size is returned (if applicable)
|
431
|
+
*/
|
432
|
+
size_t get_serialized_size_bytes(bool compressed = false) const;
|
433
|
+
|
420
434
|
/**
|
421
435
|
* This method serializes the sketch into a given stream in a binary form
|
422
436
|
* @param os output stream
|
@@ -486,8 +500,11 @@ private:
|
|
486
500
|
uint64_t theta_;
|
487
501
|
std::vector<uint64_t, Allocator> entries_;
|
488
502
|
|
503
|
+
uint8_t get_preamble_longs(bool compressed) const;
|
489
504
|
bool is_suitable_for_compression() const;
|
490
|
-
uint8_t
|
505
|
+
uint8_t compute_entry_bits() const;
|
506
|
+
uint8_t get_num_entries_bytes() const;
|
507
|
+
size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
|
491
508
|
void serialize_version_4(std::ostream& os) const;
|
492
509
|
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
493
510
|
|
@@ -24,11 +24,11 @@
|
|
24
24
|
#include <vector>
|
25
25
|
#include <stdexcept>
|
26
26
|
|
27
|
-
#include "serde.hpp"
|
28
27
|
#include "binomial_bounds.hpp"
|
29
28
|
#include "theta_helpers.hpp"
|
30
29
|
#include "count_zeros.hpp"
|
31
30
|
#include "bit_packing.hpp"
|
31
|
+
#include "memory_operations.hpp"
|
32
32
|
|
33
33
|
namespace datasketches {
|
34
34
|
|
@@ -341,6 +341,39 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
341
341
|
template<typename A>
|
342
342
|
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
343
343
|
|
344
|
+
template<typename A>
|
345
|
+
uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
|
346
|
+
if (compressed) {
|
347
|
+
return this->is_estimation_mode() ? 2 : 1;
|
348
|
+
}
|
349
|
+
return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
350
|
+
}
|
351
|
+
|
352
|
+
template<typename A>
|
353
|
+
size_t compact_theta_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
354
|
+
return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
|
355
|
+
}
|
356
|
+
|
357
|
+
template<typename A>
|
358
|
+
size_t compact_theta_sketch_alloc<A>::get_serialized_size_bytes(bool compressed) const {
|
359
|
+
if (compressed && is_suitable_for_compression()) {
|
360
|
+
return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
|
361
|
+
}
|
362
|
+
return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
|
363
|
+
}
|
364
|
+
|
365
|
+
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
366
|
+
template<typename A>
|
367
|
+
uint8_t compact_theta_sketch_alloc<A>::get_num_entries_bytes() const {
|
368
|
+
return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
369
|
+
}
|
370
|
+
|
371
|
+
template<typename A>
|
372
|
+
size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
|
373
|
+
const size_t compressed_bits = entry_bits * entries_.size();
|
374
|
+
return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
|
375
|
+
}
|
376
|
+
|
344
377
|
template<typename A>
|
345
378
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
346
379
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
@@ -366,12 +399,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
366
399
|
|
367
400
|
template<typename A>
|
368
401
|
auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
369
|
-
const
|
370
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
371
|
-
+ sizeof(uint64_t) * entries_.size();
|
402
|
+
const size_t size = get_serialized_size_bytes() + header_size_bytes;
|
372
403
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
373
404
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
374
|
-
|
405
|
+
const uint8_t preamble_longs = get_preamble_longs(false);
|
375
406
|
*ptr++ = preamble_longs;
|
376
407
|
*ptr++ = UNCOMPRESSED_SERIAL_VERSION;
|
377
408
|
*ptr++ = SKETCH_TYPE;
|
@@ -413,7 +444,7 @@ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
|
|
413
444
|
}
|
414
445
|
|
415
446
|
template<typename A>
|
416
|
-
uint8_t compact_theta_sketch_alloc<A>::
|
447
|
+
uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
|
417
448
|
// compression is based on leading zeros in deltas between ordered hash values
|
418
449
|
// assumes ordered sketch
|
419
450
|
uint64_t previous = 0;
|
@@ -423,16 +454,14 @@ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
|
423
454
|
ored |= delta;
|
424
455
|
previous = entry;
|
425
456
|
}
|
426
|
-
return count_leading_zeros_in_u64(ored);
|
457
|
+
return 64 - count_leading_zeros_in_u64(ored);
|
427
458
|
}
|
428
459
|
|
429
460
|
template<typename A>
|
430
461
|
void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
|
431
462
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
432
|
-
const uint8_t entry_bits =
|
433
|
-
|
434
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
435
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
463
|
+
const uint8_t entry_bits = compute_entry_bits();
|
464
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
436
465
|
|
437
466
|
write(os, preamble_longs);
|
438
467
|
write(os, COMPRESSED_SERIAL_VERSION);
|
@@ -477,25 +506,20 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
|
|
477
506
|
previous = entries_[i];
|
478
507
|
offset = pack_bits(delta, entry_bits, ptr, offset);
|
479
508
|
}
|
509
|
+
if (offset > 0) ++ptr;
|
480
510
|
write(os, buffer.data(), ptr - buffer.data());
|
481
511
|
}
|
482
512
|
}
|
483
513
|
|
484
514
|
template<typename A>
|
485
515
|
auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
|
486
|
-
const uint8_t
|
487
|
-
const uint8_t
|
488
|
-
const size_t
|
489
|
-
|
490
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
491
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
492
|
-
|
493
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
|
494
|
-
+ whole_bytes_to_hold_bits(compressed_bits);
|
516
|
+
const uint8_t entry_bits = compute_entry_bits();
|
517
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
518
|
+
const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
|
495
519
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
496
520
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
497
521
|
|
498
|
-
*ptr++ =
|
522
|
+
*ptr++ = get_preamble_longs(true);
|
499
523
|
*ptr++ = COMPRESSED_SERIAL_VERSION;
|
500
524
|
*ptr++ = SKETCH_TYPE;
|
501
525
|
*ptr++ = entry_bits;
|
@@ -29,50 +29,53 @@ namespace datasketches {
|
|
29
29
|
static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
|
30
30
|
|
31
31
|
TEST_CASE("pack unpack bits") {
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
32
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
33
|
+
for (int m = 0; m < 10000; ++m) {
|
34
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
35
|
+
int n = 8;
|
36
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
37
|
+
std::vector<uint64_t> input(n, 0);
|
38
|
+
for (int i = 0; i < n; ++i) {
|
39
|
+
input[i] = value & mask;
|
40
|
+
value += IGOLDEN64;
|
41
|
+
}
|
42
|
+
std::vector<uint8_t> bytes(n * sizeof(uint64_t), 0);
|
43
|
+
uint8_t offset = 0;
|
44
|
+
uint8_t* ptr = bytes.data();
|
45
|
+
for (int i = 0; i < n; ++i) {
|
46
|
+
offset = pack_bits(input[i], bits, ptr, offset);
|
47
|
+
}
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
49
|
+
std::vector<uint64_t> output(n, 0);
|
50
|
+
offset = 0;
|
51
|
+
const uint8_t* cptr = bytes.data();
|
52
|
+
for (int i = 0; i < n; ++i) {
|
53
|
+
offset = unpack_bits(output[i], bits, cptr, offset);
|
54
|
+
}
|
55
|
+
for (int i = 0; i < n; ++i) {
|
56
|
+
REQUIRE(input[i] == output[i]);
|
57
|
+
}
|
56
58
|
}
|
57
59
|
}
|
58
60
|
}
|
59
61
|
|
60
62
|
TEST_CASE("pack unpack blocks") {
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
63
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
64
|
+
for (int n = 0; n < 10000; ++n) {
|
65
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
66
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
67
|
+
std::vector<uint64_t> input(8, 0);
|
68
|
+
for (int i = 0; i < 8; ++i) {
|
69
|
+
input[i] = value & mask;
|
70
|
+
value += IGOLDEN64;
|
71
|
+
}
|
72
|
+
std::vector<uint8_t> bytes(bits, 0);
|
73
|
+
pack_bits_block8(input.data(), bytes.data(), bits);
|
74
|
+
std::vector<uint64_t> output(8, 0);
|
75
|
+
unpack_bits_block8(output.data(), bytes.data(), bits);
|
76
|
+
for (int i = 0; i < 8; ++i) {
|
77
|
+
REQUIRE(input[i] == output[i]);
|
78
|
+
}
|
76
79
|
}
|
77
80
|
}
|
78
81
|
}
|
@@ -45,6 +45,23 @@ TEST_CASE("theta sketch", "[serde_compat]") {
|
|
45
45
|
}
|
46
46
|
}
|
47
47
|
|
48
|
+
TEST_CASE("theta sketch compressed", "[serde_compat]") {
|
49
|
+
const unsigned n_arr[] = {10, 100, 1000, 10000, 100000, 1000000};
|
50
|
+
for (const unsigned n: n_arr) {
|
51
|
+
std::ifstream is;
|
52
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
53
|
+
is.open(testBinaryInputPath + "theta_compressed_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
54
|
+
const auto sketch = compact_theta_sketch::deserialize(is);
|
55
|
+
REQUIRE(sketch.is_estimation_mode() == (n > 1000));
|
56
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
57
|
+
for (const auto hash: sketch) {
|
58
|
+
REQUIRE(hash < sketch.get_theta64());
|
59
|
+
}
|
60
|
+
REQUIRE(sketch.is_ordered());
|
61
|
+
REQUIRE(std::is_sorted(sketch.begin(), sketch.end()));
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
48
65
|
TEST_CASE("theta sketch non-empty no entries", "[serde_compat]") {
|
49
66
|
std::ifstream is;
|
50
67
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
@@ -43,7 +43,7 @@ TEST_CASE("theta sketch generate compressed", "[serialize_for_java]") {
|
|
43
43
|
REQUIRE_FALSE(sketch.is_empty());
|
44
44
|
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
45
45
|
std::ofstream os("theta_compressed_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
46
|
-
sketch.compact().
|
46
|
+
sketch.compact().serialize_compressed(os);
|
47
47
|
}
|
48
48
|
}
|
49
49
|
|
@@ -273,9 +273,11 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
|
|
273
273
|
for (int i = 0; i < n; i++) update_sketch.update(i);
|
274
274
|
|
275
275
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
276
|
-
update_sketch.compact()
|
277
|
-
|
276
|
+
auto compact_sketch = update_sketch.compact();
|
277
|
+
compact_sketch.serialize(s);
|
278
|
+
auto bytes = compact_sketch.serialize();
|
278
279
|
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
|
280
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes());
|
279
281
|
for (size_t i = 0; i < bytes.size(); ++i) {
|
280
282
|
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
|
281
283
|
}
|
@@ -515,12 +517,54 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
|
|
515
517
|
}
|
516
518
|
}
|
517
519
|
|
520
|
+
TEST_CASE("theta sketch: serialize deserialize small compressed", "[theta_sketch]") {
|
521
|
+
auto update_sketch = update_theta_sketch::builder().build();
|
522
|
+
for (int i = 0; i < 10; i++) update_sketch.update(i);
|
523
|
+
auto compact_sketch = update_sketch.compact();
|
524
|
+
|
525
|
+
auto bytes = compact_sketch.serialize_compressed();
|
526
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
527
|
+
{ // deserialize bytes
|
528
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
529
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
530
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
531
|
+
auto iter = deserialized_sketch.begin();
|
532
|
+
for (const auto key: compact_sketch) {
|
533
|
+
REQUIRE(*iter == key);
|
534
|
+
++iter;
|
535
|
+
}
|
536
|
+
}
|
537
|
+
{ // wrap bytes
|
538
|
+
auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
|
539
|
+
REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
540
|
+
REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
|
541
|
+
auto iter = wrapped_sketch.begin();
|
542
|
+
for (const auto key: compact_sketch) {
|
543
|
+
REQUIRE(*iter == key);
|
544
|
+
++iter;
|
545
|
+
}
|
546
|
+
}
|
547
|
+
|
548
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
549
|
+
compact_sketch.serialize_compressed(s);
|
550
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
551
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
552
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
553
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
554
|
+
auto iter = deserialized_sketch.begin();
|
555
|
+
for (const auto key: compact_sketch) {
|
556
|
+
REQUIRE(*iter == key);
|
557
|
+
++iter;
|
558
|
+
}
|
559
|
+
}
|
560
|
+
|
518
561
|
TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
519
562
|
auto update_sketch = update_theta_sketch::builder().build();
|
520
563
|
for (int i = 0; i < 10000; i++) update_sketch.update(i);
|
521
564
|
auto compact_sketch = update_sketch.compact();
|
522
565
|
|
523
566
|
auto bytes = compact_sketch.serialize_compressed();
|
567
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
524
568
|
{ // deserialize bytes
|
525
569
|
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
526
570
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
@@ -544,6 +588,7 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
544
588
|
|
545
589
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
546
590
|
compact_sketch.serialize_compressed(s);
|
591
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
547
592
|
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
548
593
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
549
594
|
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
@@ -554,4 +599,30 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
554
599
|
}
|
555
600
|
}
|
556
601
|
|
602
|
+
// The sketch reaches capacity for the first time at 2 * K * 15/16,
|
603
|
+
// but at that point it is still in exact mode, so the serialized size is not the maximum
|
604
|
+
// (theta is not serialized in the exact mode).
|
605
|
+
// So we need to catch the second time, but some updates will be ignored in the estimation mode,
|
606
|
+
// so we update more than enough times keeping track of the maximum.
|
607
|
+
// Potentially the exact number of updates to reach the peak can be figured out given this particular sequence,
|
608
|
+
// but not assuming that might be even better (say, in case we change the load factor or hash function
|
609
|
+
// or just out of principle not to rely on implementation details too much).
|
610
|
+
TEST_CASE("max serialized size", "[theta_sketch]") {
|
611
|
+
const uint8_t lg_k = 10;
|
612
|
+
auto sketch = update_theta_sketch::builder().set_lg_k(lg_k).build();
|
613
|
+
int value = 0;
|
614
|
+
|
615
|
+
// this will go over the first peak, which is not the highest
|
616
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) sketch.update(value++);
|
617
|
+
|
618
|
+
// this will go over the second peak, keeping track of the max size
|
619
|
+
size_t max_size_bytes = 0;
|
620
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) {
|
621
|
+
sketch.update(value++);
|
622
|
+
auto bytes = sketch.compact().serialize();
|
623
|
+
max_size_bytes = std::max(max_size_bytes, bytes.size());
|
624
|
+
}
|
625
|
+
REQUIRE(max_size_bytes == compact_theta_sketch::get_max_serialized_size_bytes(lg_k));
|
626
|
+
}
|
627
|
+
|
557
628
|
} /* namespace datasketches */
|
@@ -381,6 +381,15 @@ public:
|
|
381
381
|
*/
|
382
382
|
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
|
383
383
|
|
384
|
+
/**
|
385
|
+
* Produces a Compact Tuple sketch from this sketch
|
386
|
+
* by applying a given predicate to each entry.
|
387
|
+
* @param predicate should return true for the entries to keep
|
388
|
+
* @return compact sketch with the entries retained according to the predicate
|
389
|
+
*/
|
390
|
+
template<typename Predicate>
|
391
|
+
compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
|
392
|
+
|
384
393
|
virtual iterator begin();
|
385
394
|
virtual iterator end();
|
386
395
|
virtual const_iterator begin() const;
|
@@ -480,6 +489,25 @@ public:
|
|
480
489
|
virtual uint32_t get_num_retained() const;
|
481
490
|
virtual uint16_t get_seed_hash() const;
|
482
491
|
|
492
|
+
/**
|
493
|
+
* Produces a Compact Tuple sketch from this sketch
|
494
|
+
* by applying a given predicate to each entry.
|
495
|
+
* @param predicate should return true for the entries to keep
|
496
|
+
* @return compact sketch with the entries retained according to the predicate
|
497
|
+
*/
|
498
|
+
template<typename Predicate>
|
499
|
+
compact_tuple_sketch filter(const Predicate& predicate) const;
|
500
|
+
|
501
|
+
/**
|
502
|
+
* Produces a Compact Tuple sketch from a given sketch (Update or Compact)
|
503
|
+
* by applying a given predicate to each entry.
|
504
|
+
* @param sketch input sketch
|
505
|
+
* @param predicate should return true for the entries to keep
|
506
|
+
* @return compact sketch with the entries retained according to the predicate
|
507
|
+
*/
|
508
|
+
template<typename Sketch, typename Predicate>
|
509
|
+
static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
|
510
|
+
|
483
511
|
/**
|
484
512
|
* This method serializes the sketch into a given stream in a binary form
|
485
513
|
* @param os output stream
|
@@ -579,7 +607,6 @@ protected:
|
|
579
607
|
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
|
580
608
|
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
|
581
609
|
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
|
582
|
-
|
583
610
|
};
|
584
611
|
|
585
612
|
/// Tuple base builder
|
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
|
|
258
258
|
return compact_tuple_sketch<S, A>(*this, ordered);
|
259
259
|
}
|
260
260
|
|
261
|
+
template<typename S, typename U, typename P, typename A>
|
262
|
+
template<typename Predicate>
|
263
|
+
compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
|
264
|
+
return compact_tuple_sketch<S, A>::filter(*this, predicate);
|
265
|
+
}
|
266
|
+
|
261
267
|
template<typename S, typename U, typename P, typename A>
|
262
268
|
void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
|
263
269
|
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
|
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
|
|
344
350
|
return seed_hash_;
|
345
351
|
}
|
346
352
|
|
353
|
+
template<typename S, typename A>
|
354
|
+
template<typename Predicate>
|
355
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
|
356
|
+
return filter(*this, predicate);
|
357
|
+
}
|
358
|
+
|
359
|
+
template<typename S, typename A>
|
360
|
+
template<typename Sketch, typename Predicate>
|
361
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
|
362
|
+
std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
|
363
|
+
entries.reserve(sketch.get_num_retained());
|
364
|
+
std::copy_if(
|
365
|
+
sketch.begin(),
|
366
|
+
sketch.end(),
|
367
|
+
std::back_inserter(entries),
|
368
|
+
[&predicate](const Entry& e) {return predicate(e.second);}
|
369
|
+
);
|
370
|
+
entries.shrink_to_fit();
|
371
|
+
return compact_tuple_sketch(
|
372
|
+
!sketch.is_estimation_mode() && entries.empty(),
|
373
|
+
sketch.is_ordered(),
|
374
|
+
sketch.get_seed_hash(),
|
375
|
+
sketch.get_theta64(),
|
376
|
+
std::move(entries)
|
377
|
+
);
|
378
|
+
}
|
379
|
+
|
347
380
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
348
381
|
template<typename S, typename A>
|
349
382
|
template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
|
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
|
|
310
310
|
REQUIRE(sketch.get_num_retained() == 3);
|
311
311
|
}
|
312
312
|
|
313
|
+
TEST_CASE("filter", "[tuple_sketch]") {
|
314
|
+
auto usk = update_tuple_sketch<int>::builder().build();
|
315
|
+
|
316
|
+
{ // empty update sketch
|
317
|
+
auto sk = usk.filter([](int){return true;});
|
318
|
+
REQUIRE(sk.is_empty());
|
319
|
+
REQUIRE(sk.is_ordered());
|
320
|
+
REQUIRE(sk.get_num_retained() == 0);
|
321
|
+
}
|
322
|
+
|
323
|
+
{ // empty compact sketch
|
324
|
+
auto sk = usk.compact().filter([](int){return true;});
|
325
|
+
REQUIRE(sk.is_empty());
|
326
|
+
REQUIRE(sk.is_ordered());
|
327
|
+
REQUIRE(sk.get_num_retained() == 0);
|
328
|
+
}
|
329
|
+
|
330
|
+
usk.update(1, 1);
|
331
|
+
usk.update(1, 1);
|
332
|
+
usk.update(2, 1);
|
333
|
+
usk.update(2, 1);
|
334
|
+
usk.update(3, 1);
|
335
|
+
|
336
|
+
{ // exact mode update sketch
|
337
|
+
auto sk = usk.filter([](int v){return v > 1;});
|
338
|
+
REQUIRE_FALSE(sk.is_empty());
|
339
|
+
REQUIRE_FALSE(sk.is_ordered());
|
340
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
341
|
+
REQUIRE(sk.get_num_retained() == 2);
|
342
|
+
}
|
343
|
+
|
344
|
+
{ // exact mode compact sketch
|
345
|
+
auto sk = usk.compact().filter([](int v){return v > 1;});
|
346
|
+
REQUIRE_FALSE(sk.is_empty());
|
347
|
+
REQUIRE(sk.is_ordered());
|
348
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
349
|
+
REQUIRE(sk.get_num_retained() == 2);
|
350
|
+
}
|
351
|
+
|
352
|
+
// only keys 1 and 2 had values of 2, which will become 3 after this update
|
353
|
+
// some entries are discarded in estimation mode, but these happen to survive
|
354
|
+
// the process is deterministic, so the test will always work
|
355
|
+
for (int i = 0; i < 10000; ++i) usk.update(i, 1);
|
356
|
+
|
357
|
+
{ // estimation mode update sketch
|
358
|
+
auto sk = usk.filter([](int v){return v > 2;});
|
359
|
+
REQUIRE_FALSE(sk.is_empty());
|
360
|
+
REQUIRE_FALSE(sk.is_ordered());
|
361
|
+
REQUIRE(sk.is_estimation_mode());
|
362
|
+
REQUIRE(sk.get_num_retained() == 2);
|
363
|
+
}
|
364
|
+
|
365
|
+
{ // estimation mode compact sketch
|
366
|
+
auto sk = usk.compact().filter([](int v){return v > 2;});
|
367
|
+
REQUIRE_FALSE(sk.is_empty());
|
368
|
+
REQUIRE(sk.is_ordered());
|
369
|
+
REQUIRE(sk.is_estimation_mode());
|
370
|
+
REQUIRE(sk.get_num_retained() == 2);
|
371
|
+
}
|
372
|
+
}
|
373
|
+
|
313
374
|
} /* namespace datasketches */
|
@@ -1 +1 @@
|
|
1
|
-
5.0
|
1
|
+
5.2.0
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-16 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: rice
|
@@ -16,15 +15,14 @@ dependencies:
|
|
16
15
|
requirements:
|
17
16
|
- - ">="
|
18
17
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
18
|
+
version: 4.3.3
|
20
19
|
type: :runtime
|
21
20
|
prerelease: false
|
22
21
|
version_requirements: !ruby/object:Gem::Requirement
|
23
22
|
requirements:
|
24
23
|
- - ">="
|
25
24
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
27
|
-
description:
|
25
|
+
version: 4.3.3
|
28
26
|
email: andrew@ankane.org
|
29
27
|
executables: []
|
30
28
|
extensions:
|
@@ -72,6 +70,7 @@ files:
|
|
72
70
|
- vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp
|
73
71
|
- vendor/datasketches-cpp/common/include/serde.hpp
|
74
72
|
- vendor/datasketches-cpp/common/include/version.hpp.in
|
73
|
+
- vendor/datasketches-cpp/common/include/xxhash64.h
|
75
74
|
- vendor/datasketches-cpp/common/test/CMakeLists.txt
|
76
75
|
- vendor/datasketches-cpp/common/test/catch_runner.cpp
|
77
76
|
- vendor/datasketches-cpp/common/test/integration_test.cpp
|
@@ -124,6 +123,17 @@ files:
|
|
124
123
|
- vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp
|
125
124
|
- vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp
|
126
125
|
- vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp
|
126
|
+
- vendor/datasketches-cpp/filters/CMakeLists.txt
|
127
|
+
- vendor/datasketches-cpp/filters/include/bit_array_ops.hpp
|
128
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter.hpp
|
129
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp
|
130
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp
|
131
|
+
- vendor/datasketches-cpp/filters/test/CMakeLists.txt
|
132
|
+
- vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp
|
133
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp
|
134
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp
|
135
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp
|
136
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp
|
127
137
|
- vendor/datasketches-cpp/hll/CMakeLists.txt
|
128
138
|
- vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp
|
129
139
|
- vendor/datasketches-cpp/hll/include/AuxHashMap.hpp
|
@@ -231,6 +241,16 @@ files:
|
|
231
241
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp
|
232
242
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp
|
233
243
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp
|
244
|
+
- vendor/datasketches-cpp/tdigest/CMakeLists.txt
|
245
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest.hpp
|
246
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp
|
247
|
+
- vendor/datasketches-cpp/tdigest/test/CMakeLists.txt
|
248
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp
|
249
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp
|
250
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk
|
251
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk
|
252
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp
|
253
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp
|
234
254
|
- vendor/datasketches-cpp/theta/CMakeLists.txt
|
235
255
|
- vendor/datasketches-cpp/theta/include/bit_packing.hpp
|
236
256
|
- vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
|
@@ -309,7 +329,6 @@ homepage: https://github.com/ankane/datasketches-ruby
|
|
309
329
|
licenses:
|
310
330
|
- Apache-2.0
|
311
331
|
metadata: {}
|
312
|
-
post_install_message:
|
313
332
|
rdoc_options: []
|
314
333
|
require_paths:
|
315
334
|
- lib
|
@@ -324,8 +343,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
324
343
|
- !ruby/object:Gem::Version
|
325
344
|
version: '0'
|
326
345
|
requirements: []
|
327
|
-
rubygems_version: 3.
|
328
|
-
signing_key:
|
346
|
+
rubygems_version: 3.6.2
|
329
347
|
specification_version: 4
|
330
348
|
summary: Sketch data structures for Ruby
|
331
349
|
test_files: []
|