datasketches 0.4.2 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
@@ -417,6 +417,20 @@ public:
|
|
417
417
|
virtual uint32_t get_num_retained() const;
|
418
418
|
virtual uint16_t get_seed_hash() const;
|
419
419
|
|
420
|
+
/**
|
421
|
+
* Computes maximum serialized size in bytes
|
422
|
+
* @param lg_k nominal number of entries in the sketch
|
423
|
+
*/
|
424
|
+
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
425
|
+
|
426
|
+
/**
|
427
|
+
* Computes size in bytes required to serialize the current state of the sketch.
|
428
|
+
* Computing compressed size is expensive. It takes iterating over all retained hashes,
|
429
|
+
* and the actual serialization will have to look at them again.
|
430
|
+
* @param compressed if true compressed size is returned (if applicable)
|
431
|
+
*/
|
432
|
+
size_t get_serialized_size_bytes(bool compressed = false) const;
|
433
|
+
|
420
434
|
/**
|
421
435
|
* This method serializes the sketch into a given stream in a binary form
|
422
436
|
* @param os output stream
|
@@ -486,8 +500,11 @@ private:
|
|
486
500
|
uint64_t theta_;
|
487
501
|
std::vector<uint64_t, Allocator> entries_;
|
488
502
|
|
503
|
+
uint8_t get_preamble_longs(bool compressed) const;
|
489
504
|
bool is_suitable_for_compression() const;
|
490
|
-
uint8_t compute_min_leading_zeros() const;
|
505
|
+
uint8_t compute_entry_bits() const;
|
506
|
+
uint8_t get_num_entries_bytes() const;
|
507
|
+
size_t get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const;
|
491
508
|
void serialize_version_4(std::ostream& os) const;
|
492
509
|
vector_bytes serialize_version_4(unsigned header_size_bytes = 0) const;
|
493
510
|
|
@@ -24,11 +24,11 @@
|
|
24
24
|
#include <vector>
|
25
25
|
#include <stdexcept>
|
26
26
|
|
27
|
-
#include "serde.hpp"
|
28
27
|
#include "binomial_bounds.hpp"
|
29
28
|
#include "theta_helpers.hpp"
|
30
29
|
#include "count_zeros.hpp"
|
31
30
|
#include "bit_packing.hpp"
|
31
|
+
#include "memory_operations.hpp"
|
32
32
|
|
33
33
|
namespace datasketches {
|
34
34
|
|
@@ -341,6 +341,39 @@ auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
|
341
341
|
template<typename A>
|
342
342
|
void compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&) const {}
|
343
343
|
|
344
|
+
template<typename A>
|
345
|
+
uint8_t compact_theta_sketch_alloc<A>::get_preamble_longs(bool compressed) const {
|
346
|
+
if (compressed) {
|
347
|
+
return this->is_estimation_mode() ? 2 : 1;
|
348
|
+
}
|
349
|
+
return this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
350
|
+
}
|
351
|
+
|
352
|
+
template<typename A>
|
353
|
+
size_t compact_theta_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
354
|
+
return sizeof(uint64_t) * (3 + update_theta_sketch_alloc<A>::theta_table::get_capacity(lg_k + 1, lg_k));
|
355
|
+
}
|
356
|
+
|
357
|
+
template<typename A>
|
358
|
+
size_t compact_theta_sketch_alloc<A>::get_serialized_size_bytes(bool compressed) const {
|
359
|
+
if (compressed && is_suitable_for_compression()) {
|
360
|
+
return get_compressed_serialized_size_bytes(compute_entry_bits(), get_num_entries_bytes());
|
361
|
+
}
|
362
|
+
return sizeof(uint64_t) * get_preamble_longs(false) + sizeof(uint64_t) * entries_.size();
|
363
|
+
}
|
364
|
+
|
365
|
+
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
366
|
+
template<typename A>
|
367
|
+
uint8_t compact_theta_sketch_alloc<A>::get_num_entries_bytes() const {
|
368
|
+
return whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
369
|
+
}
|
370
|
+
|
371
|
+
template<typename A>
|
372
|
+
size_t compact_theta_sketch_alloc<A>::get_compressed_serialized_size_bytes(uint8_t entry_bits, uint8_t num_entries_bytes) const {
|
373
|
+
const size_t compressed_bits = entry_bits * entries_.size();
|
374
|
+
return sizeof(uint64_t) * get_preamble_longs(true) + num_entries_bytes + whole_bytes_to_hold_bits(compressed_bits);
|
375
|
+
}
|
376
|
+
|
344
377
|
template<typename A>
|
345
378
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
346
379
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
@@ -366,12 +399,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
366
399
|
|
367
400
|
template<typename A>
|
368
401
|
auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
369
|
-
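const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;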
|
370
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
|
371
|
-
+ sizeof(uint64_t) * entries_.size();
|
402
|
+
const size_t size = get_serialized_size_bytes() + header_size_bytes;
|
372
403
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
373
404
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
374
|
-
|
405
|
+
const uint8_t preamble_longs = get_preamble_longs(false);
|
375
406
|
*ptr++ = preamble_longs;
|
376
407
|
*ptr++ = UNCOMPRESSED_SERIAL_VERSION;
|
377
408
|
*ptr++ = SKETCH_TYPE;
|
@@ -413,7 +444,7 @@ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_by
|
|
413
444
|
}
|
414
445
|
|
415
446
|
template<typename A>
|
416
|
-
uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
447
|
+
uint8_t compact_theta_sketch_alloc<A>::compute_entry_bits() const {
|
417
448
|
// compression is based on leading zeros in deltas between ordered hash values
|
418
449
|
// assumes ordered sketch
|
419
450
|
uint64_t previous = 0;
|
@@ -423,16 +454,14 @@ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
|
423
454
|
ored |= delta;
|
424
455
|
previous = entry;
|
425
456
|
}
|
426
|
-
return count_leading_zeros_in_u64(ored);
|
457
|
+
return 64 - count_leading_zeros_in_u64(ored);
|
427
458
|
}
|
428
459
|
|
429
460
|
template<typename A>
|
430
461
|
void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
|
431
462
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
432
|
-
const uint8_t entry_bits = 64 - compute_min_leading_zeros();
|
433
|
-
|
434
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
435
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
463
|
+
const uint8_t entry_bits = compute_entry_bits();
|
464
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
436
465
|
|
437
466
|
write(os, preamble_longs);
|
438
467
|
write(os, COMPRESSED_SERIAL_VERSION);
|
@@ -477,25 +506,20 @@ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const
|
|
477
506
|
previous = entries_[i];
|
478
507
|
offset = pack_bits(delta, entry_bits, ptr, offset);
|
479
508
|
}
|
509
|
+
if (offset > 0) ++ptr;
|
480
510
|
write(os, buffer.data(), ptr - buffer.data());
|
481
511
|
}
|
482
512
|
}
|
483
513
|
|
484
514
|
template<typename A>
|
485
515
|
auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
|
486
|
-
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
487
|
-
const uint8_t entry_bits = 64 - compute_min_leading_zeros();
|
488
|
-
const size_t compressed_bits = entry_bits * entries_.size();
|
489
|
-
|
490
|
-
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
491
|
-
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(static_cast<uint32_t>(entries_.size())));
|
492
|
-
|
493
|
-
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
|
494
|
-
+ whole_bytes_to_hold_bits(compressed_bits);
|
516
|
+
const uint8_t entry_bits = compute_entry_bits();
|
517
|
+
const uint8_t num_entries_bytes = get_num_entries_bytes();
|
518
|
+
const size_t size = get_compressed_serialized_size_bytes(entry_bits, num_entries_bytes) + header_size_bytes;
|
495
519
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
496
520
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
497
521
|
|
498
|
-
*ptr++ = preamble_longs;
|
522
|
+
*ptr++ = get_preamble_longs(true);
|
499
523
|
*ptr++ = COMPRESSED_SERIAL_VERSION;
|
500
524
|
*ptr++ = SKETCH_TYPE;
|
501
525
|
*ptr++ = entry_bits;
|
@@ -29,50 +29,53 @@ namespace datasketches {
|
|
29
29
|
static const uint64_t IGOLDEN64 = 0x9e3779b97f4a7c13ULL;
|
30
30
|
|
31
31
|
TEST_CASE("pack unpack bits") {
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
32
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
33
|
+
for (int m = 0; m < 10000; ++m) {
|
34
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
35
|
+
int n = 8;
|
36
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
37
|
+
std::vector<uint64_t> input(n, 0);
|
38
|
+
for (int i = 0; i < n; ++i) {
|
39
|
+
input[i] = value & mask;
|
40
|
+
value += IGOLDEN64;
|
41
|
+
}
|
42
|
+
std::vector<uint8_t> bytes(n * sizeof(uint64_t), 0);
|
43
|
+
uint8_t offset = 0;
|
44
|
+
uint8_t* ptr = bytes.data();
|
45
|
+
for (int i = 0; i < n; ++i) {
|
46
|
+
offset = pack_bits(input[i], bits, ptr, offset);
|
47
|
+
}
|
47
48
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
49
|
+
std::vector<uint64_t> output(n, 0);
|
50
|
+
offset = 0;
|
51
|
+
const uint8_t* cptr = bytes.data();
|
52
|
+
for (int i = 0; i < n; ++i) {
|
53
|
+
offset = unpack_bits(output[i], bits, cptr, offset);
|
54
|
+
}
|
55
|
+
for (int i = 0; i < n; ++i) {
|
56
|
+
REQUIRE(input[i] == output[i]);
|
57
|
+
}
|
56
58
|
}
|
57
59
|
}
|
58
60
|
}
|
59
61
|
|
60
62
|
TEST_CASE("pack unpack blocks") {
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
63
|
+
uint64_t value = 0xaa55aa55aa55aa55ULL; // arbitrary starting value
|
64
|
+
for (int n = 0; n < 10000; ++n) {
|
65
|
+
for (uint8_t bits = 1; bits <= 63; ++bits) {
|
66
|
+
const uint64_t mask = (1ULL << bits) - 1;
|
67
|
+
std::vector<uint64_t> input(8, 0);
|
68
|
+
for (int i = 0; i < 8; ++i) {
|
69
|
+
input[i] = value & mask;
|
70
|
+
value += IGOLDEN64;
|
71
|
+
}
|
72
|
+
std::vector<uint8_t> bytes(bits, 0);
|
73
|
+
pack_bits_block8(input.data(), bytes.data(), bits);
|
74
|
+
std::vector<uint64_t> output(8, 0);
|
75
|
+
unpack_bits_block8(output.data(), bytes.data(), bits);
|
76
|
+
for (int i = 0; i < 8; ++i) {
|
77
|
+
REQUIRE(input[i] == output[i]);
|
78
|
+
}
|
76
79
|
}
|
77
80
|
}
|
78
81
|
}
|
@@ -45,6 +45,23 @@ TEST_CASE("theta sketch", "[serde_compat]") {
|
|
45
45
|
}
|
46
46
|
}
|
47
47
|
|
48
|
+
TEST_CASE("theta sketch compressed", "[serde_compat]") {
|
49
|
+
const unsigned n_arr[] = {10, 100, 1000, 10000, 100000, 1000000};
|
50
|
+
for (const unsigned n: n_arr) {
|
51
|
+
std::ifstream is;
|
52
|
+
is.exceptions(std::ios::failbit | std::ios::badbit);
|
53
|
+
is.open(testBinaryInputPath + "theta_compressed_n" + std::to_string(n) + "_java.sk", std::ios::binary);
|
54
|
+
const auto sketch = compact_theta_sketch::deserialize(is);
|
55
|
+
REQUIRE(sketch.is_estimation_mode() == (n > 1000));
|
56
|
+
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
57
|
+
for (const auto hash: sketch) {
|
58
|
+
REQUIRE(hash < sketch.get_theta64());
|
59
|
+
}
|
60
|
+
REQUIRE(sketch.is_ordered());
|
61
|
+
REQUIRE(std::is_sorted(sketch.begin(), sketch.end()));
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
48
65
|
TEST_CASE("theta sketch non-empty no entries", "[serde_compat]") {
|
49
66
|
std::ifstream is;
|
50
67
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
@@ -43,7 +43,7 @@ TEST_CASE("theta sketch generate compressed", "[serialize_for_java]") {
|
|
43
43
|
REQUIRE_FALSE(sketch.is_empty());
|
44
44
|
REQUIRE(sketch.get_estimate() == Approx(n).margin(n * 0.03));
|
45
45
|
std::ofstream os("theta_compressed_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
46
|
-
sketch.compact().
|
46
|
+
sketch.compact().serialize_compressed(os);
|
47
47
|
}
|
48
48
|
}
|
49
49
|
|
@@ -273,9 +273,11 @@ TEST_CASE("theta sketch: serialize deserialize stream and bytes equivalence", "[
|
|
273
273
|
for (int i = 0; i < n; i++) update_sketch.update(i);
|
274
274
|
|
275
275
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
276
|
-
update_sketch.compact()
|
277
|
-
|
276
|
+
auto compact_sketch = update_sketch.compact();
|
277
|
+
compact_sketch.serialize(s);
|
278
|
+
auto bytes = compact_sketch.serialize();
|
278
279
|
REQUIRE(bytes.size() == static_cast<size_t>(s.tellp()));
|
280
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes());
|
279
281
|
for (size_t i = 0; i < bytes.size(); ++i) {
|
280
282
|
REQUIRE(((char*)bytes.data())[i] == (char)s.get());
|
281
283
|
}
|
@@ -515,12 +517,54 @@ TEST_CASE("theta sketch: wrap compact v2 estimation from java", "[theta_sketch]"
|
|
515
517
|
}
|
516
518
|
}
|
517
519
|
|
520
|
+
TEST_CASE("theta sketch: serialize deserialize small compressed", "[theta_sketch]") {
|
521
|
+
auto update_sketch = update_theta_sketch::builder().build();
|
522
|
+
for (int i = 0; i < 10; i++) update_sketch.update(i);
|
523
|
+
auto compact_sketch = update_sketch.compact();
|
524
|
+
|
525
|
+
auto bytes = compact_sketch.serialize_compressed();
|
526
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
527
|
+
{ // deserialize bytes
|
528
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
529
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
530
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
531
|
+
auto iter = deserialized_sketch.begin();
|
532
|
+
for (const auto key: compact_sketch) {
|
533
|
+
REQUIRE(*iter == key);
|
534
|
+
++iter;
|
535
|
+
}
|
536
|
+
}
|
537
|
+
{ // wrap bytes
|
538
|
+
auto wrapped_sketch = wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size());
|
539
|
+
REQUIRE(wrapped_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
540
|
+
REQUIRE(wrapped_sketch.get_theta() == compact_sketch.get_theta());
|
541
|
+
auto iter = wrapped_sketch.begin();
|
542
|
+
for (const auto key: compact_sketch) {
|
543
|
+
REQUIRE(*iter == key);
|
544
|
+
++iter;
|
545
|
+
}
|
546
|
+
}
|
547
|
+
|
548
|
+
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
549
|
+
compact_sketch.serialize_compressed(s);
|
550
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
551
|
+
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
552
|
+
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
553
|
+
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
554
|
+
auto iter = deserialized_sketch.begin();
|
555
|
+
for (const auto key: compact_sketch) {
|
556
|
+
REQUIRE(*iter == key);
|
557
|
+
++iter;
|
558
|
+
}
|
559
|
+
}
|
560
|
+
|
518
561
|
TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
519
562
|
auto update_sketch = update_theta_sketch::builder().build();
|
520
563
|
for (int i = 0; i < 10000; i++) update_sketch.update(i);
|
521
564
|
auto compact_sketch = update_sketch.compact();
|
522
565
|
|
523
566
|
auto bytes = compact_sketch.serialize_compressed();
|
567
|
+
REQUIRE(bytes.size() == compact_sketch.get_serialized_size_bytes(true));
|
524
568
|
{ // deserialize bytes
|
525
569
|
auto deserialized_sketch = compact_theta_sketch::deserialize(bytes.data(), bytes.size());
|
526
570
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
@@ -544,6 +588,7 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
544
588
|
|
545
589
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
546
590
|
compact_sketch.serialize_compressed(s);
|
591
|
+
REQUIRE(static_cast<size_t>(s.tellp()) == compact_sketch.get_serialized_size_bytes(true));
|
547
592
|
auto deserialized_sketch = compact_theta_sketch::deserialize(s);
|
548
593
|
REQUIRE(deserialized_sketch.get_num_retained() == compact_sketch.get_num_retained());
|
549
594
|
REQUIRE(deserialized_sketch.get_theta() == compact_sketch.get_theta());
|
@@ -554,4 +599,30 @@ TEST_CASE("theta sketch: serialize deserialize compressed", "[theta_sketch]") {
|
|
554
599
|
}
|
555
600
|
}
|
556
601
|
|
602
|
+
// The sketch reaches capacity for the first time at 2 * K * 15/16,
|
603
|
+
// but at that point it is still in exact mode, so the serialized size is not the maximum
|
604
|
+
// (theta is not serialized in the exact mode).
|
605
|
+
// So we need to catch the second time, but some updates will be ignored in the estimation mode,
|
606
|
+
// so we update more than enough times keeping track of the maximum.
|
607
|
+
// Potentially the exact number of updates to reach the peak can be figured out given this particular sequence,
|
608
|
+
// but not assuming that might be even better (say, in case we change the load factor or hash function
|
609
|
+
// or just out of principle not to rely on implementation details too much).
|
610
|
+
TEST_CASE("max serialized size", "[theta_sketch]") {
|
611
|
+
const uint8_t lg_k = 10;
|
612
|
+
auto sketch = update_theta_sketch::builder().set_lg_k(lg_k).build();
|
613
|
+
int value = 0;
|
614
|
+
|
615
|
+
// this will go over the first peak, which is not the highest
|
616
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) sketch.update(value++);
|
617
|
+
|
618
|
+
// this will go over the second peak, keeping track of the max size
|
619
|
+
size_t max_size_bytes = 0;
|
620
|
+
for (int i = 0; i < (1 << lg_k) * 2; ++i) {
|
621
|
+
sketch.update(value++);
|
622
|
+
auto bytes = sketch.compact().serialize();
|
623
|
+
max_size_bytes = std::max(max_size_bytes, bytes.size());
|
624
|
+
}
|
625
|
+
REQUIRE(max_size_bytes == compact_theta_sketch::get_max_serialized_size_bytes(lg_k));
|
626
|
+
}
|
627
|
+
|
557
628
|
} /* namespace datasketches */
|
@@ -381,6 +381,15 @@ public:
|
|
381
381
|
*/
|
382
382
|
compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
|
383
383
|
|
384
|
+
/**
|
385
|
+
* Produces a Compact Tuple sketch from this sketch
|
386
|
+
* by applying a given predicate to each entry.
|
387
|
+
* @param predicate should return true for the entries to keep
|
388
|
+
* @return compact sketch with the entries retained according to the predicate
|
389
|
+
*/
|
390
|
+
template<typename Predicate>
|
391
|
+
compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const;
|
392
|
+
|
384
393
|
virtual iterator begin();
|
385
394
|
virtual iterator end();
|
386
395
|
virtual const_iterator begin() const;
|
@@ -480,6 +489,25 @@ public:
|
|
480
489
|
virtual uint32_t get_num_retained() const;
|
481
490
|
virtual uint16_t get_seed_hash() const;
|
482
491
|
|
492
|
+
/**
|
493
|
+
* Produces a Compact Tuple sketch from this sketch
|
494
|
+
* by applying a given predicate to each entry.
|
495
|
+
* @param predicate should return true for the entries to keep
|
496
|
+
* @return compact sketch with the entries retained according to the predicate
|
497
|
+
*/
|
498
|
+
template<typename Predicate>
|
499
|
+
compact_tuple_sketch filter(const Predicate& predicate) const;
|
500
|
+
|
501
|
+
/**
|
502
|
+
* Produces a Compact Tuple sketch from a given sketch (Update or Compact)
|
503
|
+
* by applying a given predicate to each entry.
|
504
|
+
* @param sketch input sketch
|
505
|
+
* @param predicate should return true for the entries to keep
|
506
|
+
* @return compact sketch with the entries retained according to the predicate
|
507
|
+
*/
|
508
|
+
template<typename Sketch, typename Predicate>
|
509
|
+
static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate);
|
510
|
+
|
483
511
|
/**
|
484
512
|
* This method serializes the sketch into a given stream in a binary form
|
485
513
|
* @param os output stream
|
@@ -579,7 +607,6 @@ protected:
|
|
579
607
|
template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base;
|
580
608
|
template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base;
|
581
609
|
compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
|
582
|
-
|
583
610
|
};
|
584
611
|
|
585
612
|
/// Tuple base builder
|
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered
|
|
258
258
|
return compact_tuple_sketch<S, A>(*this, ordered);
|
259
259
|
}
|
260
260
|
|
261
|
+
template<typename S, typename U, typename P, typename A>
|
262
|
+
template<typename Predicate>
|
263
|
+
compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const {
|
264
|
+
return compact_tuple_sketch<S, A>::filter(*this, predicate);
|
265
|
+
}
|
266
|
+
|
261
267
|
template<typename S, typename U, typename P, typename A>
|
262
268
|
void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const {
|
263
269
|
os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl;
|
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const {
|
|
344
350
|
return seed_hash_;
|
345
351
|
}
|
346
352
|
|
353
|
+
template<typename S, typename A>
|
354
|
+
template<typename Predicate>
|
355
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const {
|
356
|
+
return filter(*this, predicate);
|
357
|
+
}
|
358
|
+
|
359
|
+
template<typename S, typename A>
|
360
|
+
template<typename Sketch, typename Predicate>
|
361
|
+
compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) {
|
362
|
+
std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
|
363
|
+
entries.reserve(sketch.get_num_retained());
|
364
|
+
std::copy_if(
|
365
|
+
sketch.begin(),
|
366
|
+
sketch.end(),
|
367
|
+
std::back_inserter(entries),
|
368
|
+
[&predicate](const Entry& e) {return predicate(e.second);}
|
369
|
+
);
|
370
|
+
entries.shrink_to_fit();
|
371
|
+
return compact_tuple_sketch(
|
372
|
+
!sketch.is_estimation_mode() && entries.empty(),
|
373
|
+
sketch.is_ordered(),
|
374
|
+
sketch.get_seed_hash(),
|
375
|
+
sketch.get_theta64(),
|
376
|
+
std::move(entries)
|
377
|
+
);
|
378
|
+
}
|
379
|
+
|
347
380
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
348
381
|
template<typename S, typename A>
|
349
382
|
template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type>
|
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk
|
|
310
310
|
REQUIRE(sketch.get_num_retained() == 3);
|
311
311
|
}
|
312
312
|
|
313
|
+
TEST_CASE("filter", "[tuple_sketch]") {
|
314
|
+
auto usk = update_tuple_sketch<int>::builder().build();
|
315
|
+
|
316
|
+
{ // empty update sketch
|
317
|
+
auto sk = usk.filter([](int){return true;});
|
318
|
+
REQUIRE(sk.is_empty());
|
319
|
+
REQUIRE(sk.is_ordered());
|
320
|
+
REQUIRE(sk.get_num_retained() == 0);
|
321
|
+
}
|
322
|
+
|
323
|
+
{ // empty compact sketch
|
324
|
+
auto sk = usk.compact().filter([](int){return true;});
|
325
|
+
REQUIRE(sk.is_empty());
|
326
|
+
REQUIRE(sk.is_ordered());
|
327
|
+
REQUIRE(sk.get_num_retained() == 0);
|
328
|
+
}
|
329
|
+
|
330
|
+
usk.update(1, 1);
|
331
|
+
usk.update(1, 1);
|
332
|
+
usk.update(2, 1);
|
333
|
+
usk.update(2, 1);
|
334
|
+
usk.update(3, 1);
|
335
|
+
|
336
|
+
{ // exact mode update sketch
|
337
|
+
auto sk = usk.filter([](int v){return v > 1;});
|
338
|
+
REQUIRE_FALSE(sk.is_empty());
|
339
|
+
REQUIRE_FALSE(sk.is_ordered());
|
340
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
341
|
+
REQUIRE(sk.get_num_retained() == 2);
|
342
|
+
}
|
343
|
+
|
344
|
+
{ // exact mode compact sketch
|
345
|
+
auto sk = usk.compact().filter([](int v){return v > 1;});
|
346
|
+
REQUIRE_FALSE(sk.is_empty());
|
347
|
+
REQUIRE(sk.is_ordered());
|
348
|
+
REQUIRE_FALSE(sk.is_estimation_mode());
|
349
|
+
REQUIRE(sk.get_num_retained() == 2);
|
350
|
+
}
|
351
|
+
|
352
|
+
// only keys 1 and 2 had values of 2, which will become 3 after this update
|
353
|
+
// some entries are discarded in estimation mode, but these happen to survive
|
354
|
+
// the process is deterministic, so the test will always work
|
355
|
+
for (int i = 0; i < 10000; ++i) usk.update(i, 1);
|
356
|
+
|
357
|
+
{ // estimation mode update sketch
|
358
|
+
auto sk = usk.filter([](int v){return v > 2;});
|
359
|
+
REQUIRE_FALSE(sk.is_empty());
|
360
|
+
REQUIRE_FALSE(sk.is_ordered());
|
361
|
+
REQUIRE(sk.is_estimation_mode());
|
362
|
+
REQUIRE(sk.get_num_retained() == 2);
|
363
|
+
}
|
364
|
+
|
365
|
+
{ // estimation mode compact sketch
|
366
|
+
auto sk = usk.compact().filter([](int v){return v > 2;});
|
367
|
+
REQUIRE_FALSE(sk.is_empty());
|
368
|
+
REQUIRE(sk.is_ordered());
|
369
|
+
REQUIRE(sk.is_estimation_mode());
|
370
|
+
REQUIRE(sk.get_num_retained() == 2);
|
371
|
+
}
|
372
|
+
}
|
373
|
+
|
313
374
|
} /* namespace datasketches */
|
@@ -1 +1 @@
|
|
1
|
-
5.0
|
1
|
+
5.2.0
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datasketches
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-16 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: rice
|
@@ -16,15 +15,14 @@ dependencies:
|
|
16
15
|
requirements:
|
17
16
|
- - ">="
|
18
17
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
18
|
+
version: 4.3.3
|
20
19
|
type: :runtime
|
21
20
|
prerelease: false
|
22
21
|
version_requirements: !ruby/object:Gem::Requirement
|
23
22
|
requirements:
|
24
23
|
- - ">="
|
25
24
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
27
|
-
description:
|
25
|
+
version: 4.3.3
|
28
26
|
email: andrew@ankane.org
|
29
27
|
executables: []
|
30
28
|
extensions:
|
@@ -72,6 +70,7 @@ files:
|
|
72
70
|
- vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp
|
73
71
|
- vendor/datasketches-cpp/common/include/serde.hpp
|
74
72
|
- vendor/datasketches-cpp/common/include/version.hpp.in
|
73
|
+
- vendor/datasketches-cpp/common/include/xxhash64.h
|
75
74
|
- vendor/datasketches-cpp/common/test/CMakeLists.txt
|
76
75
|
- vendor/datasketches-cpp/common/test/catch_runner.cpp
|
77
76
|
- vendor/datasketches-cpp/common/test/integration_test.cpp
|
@@ -124,6 +123,17 @@ files:
|
|
124
123
|
- vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp
|
125
124
|
- vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp
|
126
125
|
- vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp
|
126
|
+
- vendor/datasketches-cpp/filters/CMakeLists.txt
|
127
|
+
- vendor/datasketches-cpp/filters/include/bit_array_ops.hpp
|
128
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter.hpp
|
129
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp
|
130
|
+
- vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp
|
131
|
+
- vendor/datasketches-cpp/filters/test/CMakeLists.txt
|
132
|
+
- vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp
|
133
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp
|
134
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp
|
135
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp
|
136
|
+
- vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp
|
127
137
|
- vendor/datasketches-cpp/hll/CMakeLists.txt
|
128
138
|
- vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp
|
129
139
|
- vendor/datasketches-cpp/hll/include/AuxHashMap.hpp
|
@@ -231,6 +241,16 @@ files:
|
|
231
241
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp
|
232
242
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp
|
233
243
|
- vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp
|
244
|
+
- vendor/datasketches-cpp/tdigest/CMakeLists.txt
|
245
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest.hpp
|
246
|
+
- vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp
|
247
|
+
- vendor/datasketches-cpp/tdigest/test/CMakeLists.txt
|
248
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp
|
249
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp
|
250
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk
|
251
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk
|
252
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp
|
253
|
+
- vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp
|
234
254
|
- vendor/datasketches-cpp/theta/CMakeLists.txt
|
235
255
|
- vendor/datasketches-cpp/theta/include/bit_packing.hpp
|
236
256
|
- vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp
|
@@ -309,7 +329,6 @@ homepage: https://github.com/ankane/datasketches-ruby
|
|
309
329
|
licenses:
|
310
330
|
- Apache-2.0
|
311
331
|
metadata: {}
|
312
|
-
post_install_message:
|
313
332
|
rdoc_options: []
|
314
333
|
require_paths:
|
315
334
|
- lib
|
@@ -324,8 +343,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
324
343
|
- !ruby/object:Gem::Version
|
325
344
|
version: '0'
|
326
345
|
requirements: []
|
327
|
-
rubygems_version: 3.
|
328
|
-
signing_key:
|
346
|
+
rubygems_version: 3.6.2
|
329
347
|
specification_version: 4
|
330
348
|
summary: Sketch data structures for Ruby
|
331
349
|
test_files: []
|