datasketches 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/cpc_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
- data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
- data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
- data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
- data/vendor/datasketches-cpp/python/README.md +5 -5
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
- data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
- data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
- data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
- data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
- data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
- data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +31 -3
@@ -27,7 +27,8 @@
|
|
27
27
|
#include "serde.hpp"
|
28
28
|
#include "binomial_bounds.hpp"
|
29
29
|
#include "theta_helpers.hpp"
|
30
|
-
#include "
|
30
|
+
#include "count_zeros.hpp"
|
31
|
+
#include "bit_packing.hpp"
|
31
32
|
|
32
33
|
namespace datasketches {
|
33
34
|
|
@@ -38,7 +39,8 @@ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
|
|
38
39
|
|
39
40
|
template<typename A>
|
40
41
|
double base_theta_sketch_alloc<A>::get_theta() const {
|
41
|
-
return static_cast<double>(get_theta64()) /
|
42
|
+
return static_cast<double>(get_theta64()) /
|
43
|
+
static_cast<double>(theta_constants::MAX_THETA);
|
42
44
|
}
|
43
45
|
|
44
46
|
template<typename A>
|
@@ -343,12 +345,9 @@ template<typename A>
|
|
343
345
|
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
344
346
|
const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
|
345
347
|
write(os, preamble_longs);
|
346
|
-
|
347
|
-
write(os,
|
348
|
-
|
349
|
-
write(os, type);
|
350
|
-
const uint16_t unused16 = 0;
|
351
|
-
write(os, unused16);
|
348
|
+
write(os, UNCOMPRESSED_SERIAL_VERSION);
|
349
|
+
write(os, SKETCH_TYPE);
|
350
|
+
write<uint16_t>(os, 0); // unused
|
352
351
|
const uint8_t flags_byte(
|
353
352
|
(1 << flags::IS_COMPACT) |
|
354
353
|
(1 << flags::IS_READ_ONLY) |
|
@@ -356,13 +355,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
356
355
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
357
356
|
);
|
358
357
|
write(os, flags_byte);
|
359
|
-
|
360
|
-
write(os, seed_hash);
|
358
|
+
write(os, get_seed_hash());
|
361
359
|
if (preamble_longs > 1) {
|
362
|
-
|
363
|
-
write(os,
|
364
|
-
const uint32_t unused32 = 0;
|
365
|
-
write(os, unused32);
|
360
|
+
write<uint32_t>(os, entries_.size());
|
361
|
+
write<uint32_t>(os, 0); // unused
|
366
362
|
}
|
367
363
|
if (this->is_estimation_mode()) write(os, this->theta_);
|
368
364
|
if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
|
@@ -376,11 +372,9 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
376
372
|
vector_bytes bytes(size, 0, entries_.get_allocator());
|
377
373
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
378
374
|
|
379
|
-
ptr
|
380
|
-
|
381
|
-
ptr
|
382
|
-
const uint8_t type = SKETCH_TYPE;
|
383
|
-
ptr += copy_to_mem(type, ptr);
|
375
|
+
*ptr++ = preamble_longs;
|
376
|
+
*ptr++ = UNCOMPRESSED_SERIAL_VERSION;
|
377
|
+
*ptr++ = SKETCH_TYPE;
|
384
378
|
ptr += sizeof(uint16_t); // unused
|
385
379
|
const uint8_t flags_byte(
|
386
380
|
(1 << flags::IS_COMPACT) |
|
@@ -388,12 +382,10 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
388
382
|
(this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
|
389
383
|
(this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
|
390
384
|
);
|
391
|
-
ptr
|
392
|
-
|
393
|
-
ptr += copy_to_mem(seed_hash, ptr);
|
385
|
+
*ptr++ = flags_byte;
|
386
|
+
ptr += copy_to_mem(get_seed_hash(), ptr);
|
394
387
|
if (preamble_longs > 1) {
|
395
|
-
|
396
|
-
ptr += copy_to_mem(num_entries, ptr);
|
388
|
+
ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
|
397
389
|
ptr += sizeof(uint32_t); // unused
|
398
390
|
}
|
399
391
|
if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
|
@@ -401,131 +393,342 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
|
|
401
393
|
return bytes;
|
402
394
|
}
|
403
395
|
|
396
|
+
template<typename A>
|
397
|
+
bool compact_theta_sketch_alloc<A>::is_suitable_for_compression() const {
|
398
|
+
if (!this->is_ordered() || entries_.size() == 0 ||
|
399
|
+
(entries_.size() == 1 && !this->is_estimation_mode())) return false;
|
400
|
+
return true;
|
401
|
+
}
|
402
|
+
|
403
|
+
template<typename A>
|
404
|
+
void compact_theta_sketch_alloc<A>::serialize_compressed(std::ostream& os) const {
|
405
|
+
if (is_suitable_for_compression()) return serialize_version_4(os);
|
406
|
+
return serialize(os);
|
407
|
+
}
|
408
|
+
|
409
|
+
template<typename A>
|
410
|
+
auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
|
411
|
+
if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
|
412
|
+
return serialize(header_size_bytes);
|
413
|
+
}
|
414
|
+
|
415
|
+
template<typename A>
|
416
|
+
uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
|
417
|
+
// compression is based on leading zeros in deltas between ordered hash values
|
418
|
+
// assumes ordered sketch
|
419
|
+
uint64_t previous = 0;
|
420
|
+
uint64_t ored = 0;
|
421
|
+
for (const uint64_t entry: entries_) {
|
422
|
+
const uint64_t delta = entry - previous;
|
423
|
+
ored |= delta;
|
424
|
+
previous = entry;
|
425
|
+
}
|
426
|
+
return count_leading_zeros_in_u64(ored);
|
427
|
+
}
|
428
|
+
|
429
|
+
template<typename A>
|
430
|
+
void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
|
431
|
+
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
432
|
+
const uint8_t entry_bits = 64 - compute_min_leading_zeros();
|
433
|
+
|
434
|
+
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
435
|
+
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
|
436
|
+
|
437
|
+
write(os, preamble_longs);
|
438
|
+
write(os, COMPRESSED_SERIAL_VERSION);
|
439
|
+
write(os, SKETCH_TYPE);
|
440
|
+
write(os, entry_bits);
|
441
|
+
write(os, num_entries_bytes);
|
442
|
+
const uint8_t flags_byte(
|
443
|
+
(1 << flags::IS_COMPACT) |
|
444
|
+
(1 << flags::IS_READ_ONLY) |
|
445
|
+
(1 << flags::IS_ORDERED)
|
446
|
+
);
|
447
|
+
write(os, flags_byte);
|
448
|
+
write(os, get_seed_hash());
|
449
|
+
if (this->is_estimation_mode()) write(os, this->theta_);
|
450
|
+
uint32_t num_entries = entries_.size();
|
451
|
+
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
452
|
+
write<uint8_t>(os, num_entries & 0xff);
|
453
|
+
num_entries >>= 8;
|
454
|
+
}
|
455
|
+
|
456
|
+
uint64_t previous = 0;
|
457
|
+
uint64_t deltas[8];
|
458
|
+
vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
|
459
|
+
|
460
|
+
// pack blocks of 8 deltas
|
461
|
+
unsigned i;
|
462
|
+
for (i = 0; i + 7 < entries_.size(); i += 8) {
|
463
|
+
for (unsigned j = 0; j < 8; ++j) {
|
464
|
+
deltas[j] = entries_[i + j] - previous;
|
465
|
+
previous = entries_[i + j];
|
466
|
+
}
|
467
|
+
pack_bits_block8(deltas, buffer.data(), entry_bits);
|
468
|
+
write(os, buffer.data(), buffer.size());
|
469
|
+
}
|
470
|
+
|
471
|
+
// pack extra deltas if fewer than 8 of them left
|
472
|
+
if (i < entries_.size()) {
|
473
|
+
uint8_t offset = 0;
|
474
|
+
uint8_t* ptr = buffer.data();
|
475
|
+
for (; i < entries_.size(); ++i) {
|
476
|
+
const uint64_t delta = entries_[i] - previous;
|
477
|
+
previous = entries_[i];
|
478
|
+
offset = pack_bits(delta, entry_bits, ptr, offset);
|
479
|
+
}
|
480
|
+
write(os, buffer.data(), ptr - buffer.data());
|
481
|
+
}
|
482
|
+
}
|
483
|
+
|
484
|
+
template<typename A>
|
485
|
+
auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
|
486
|
+
const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
|
487
|
+
const uint8_t entry_bits = 64 - compute_min_leading_zeros();
|
488
|
+
const size_t compressed_bits = entry_bits * entries_.size();
|
489
|
+
|
490
|
+
// store num_entries as whole bytes since whole-byte blocks will follow (most probably)
|
491
|
+
const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
|
492
|
+
|
493
|
+
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
|
494
|
+
+ whole_bytes_to_hold_bits(compressed_bits);
|
495
|
+
vector_bytes bytes(size, 0, entries_.get_allocator());
|
496
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
497
|
+
|
498
|
+
*ptr++ = preamble_longs;
|
499
|
+
*ptr++ = COMPRESSED_SERIAL_VERSION;
|
500
|
+
*ptr++ = SKETCH_TYPE;
|
501
|
+
*ptr++ = entry_bits;
|
502
|
+
*ptr++ = num_entries_bytes;
|
503
|
+
const uint8_t flags_byte(
|
504
|
+
(1 << flags::IS_COMPACT) |
|
505
|
+
(1 << flags::IS_READ_ONLY) |
|
506
|
+
(1 << flags::IS_ORDERED)
|
507
|
+
);
|
508
|
+
*ptr++ = flags_byte;
|
509
|
+
ptr += copy_to_mem(get_seed_hash(), ptr);
|
510
|
+
if (this->is_estimation_mode()) {
|
511
|
+
ptr += copy_to_mem(theta_, ptr);
|
512
|
+
}
|
513
|
+
uint32_t num_entries = entries_.size();
|
514
|
+
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
515
|
+
*ptr++ = num_entries & 0xff;
|
516
|
+
num_entries >>= 8;
|
517
|
+
}
|
518
|
+
|
519
|
+
uint64_t previous = 0;
|
520
|
+
uint64_t deltas[8];
|
521
|
+
|
522
|
+
// pack blocks of 8 deltas
|
523
|
+
unsigned i;
|
524
|
+
for (i = 0; i + 7 < entries_.size(); i += 8) {
|
525
|
+
for (unsigned j = 0; j < 8; ++j) {
|
526
|
+
deltas[j] = entries_[i + j] - previous;
|
527
|
+
previous = entries_[i + j];
|
528
|
+
}
|
529
|
+
pack_bits_block8(deltas, ptr, entry_bits);
|
530
|
+
ptr += entry_bits;
|
531
|
+
}
|
532
|
+
|
533
|
+
// pack extra deltas if fewer than 8 of them left
|
534
|
+
uint8_t offset = 0;
|
535
|
+
for (; i < entries_.size(); ++i) {
|
536
|
+
const uint64_t delta = entries_[i] - previous;
|
537
|
+
previous = entries_[i];
|
538
|
+
offset = pack_bits(delta, entry_bits, ptr, offset);
|
539
|
+
}
|
540
|
+
return bytes;
|
541
|
+
}
|
542
|
+
|
404
543
|
template<typename A>
|
405
544
|
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
|
406
545
|
const auto preamble_longs = read<uint8_t>(is);
|
407
546
|
const auto serial_version = read<uint8_t>(is);
|
408
547
|
const auto type = read<uint8_t>(is);
|
548
|
+
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
409
549
|
switch (serial_version) {
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
if (preamble_longs == 1) {
|
423
|
-
num_entries = 1;
|
424
|
-
} else {
|
425
|
-
num_entries = read<uint32_t>(is);
|
426
|
-
read<uint32_t>(is); // unused
|
427
|
-
if (preamble_longs > 2) {
|
428
|
-
theta = read<uint64_t>(is);
|
429
|
-
}
|
430
|
-
}
|
431
|
-
}
|
432
|
-
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
433
|
-
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
550
|
+
case 4:
|
551
|
+
return deserialize_v4(preamble_longs, is, seed, allocator);
|
552
|
+
case 3:
|
553
|
+
return deserialize_v3(preamble_longs, is, seed, allocator);
|
554
|
+
case 1:
|
555
|
+
return deserialize_v1(preamble_longs, is, seed, allocator);
|
556
|
+
case 2:
|
557
|
+
return deserialize_v2(preamble_longs, is, seed, allocator);
|
558
|
+
default:
|
559
|
+
throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
|
560
|
+
}
|
561
|
+
}
|
434
562
|
|
435
|
-
|
563
|
+
template<typename A>
|
564
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
|
565
|
+
uint8_t, std::istream& is, uint64_t seed, const A& allocator)
|
566
|
+
{
|
567
|
+
const auto seed_hash = compute_seed_hash(seed);
|
568
|
+
read<uint8_t>(is); // unused
|
569
|
+
read<uint32_t>(is); // unused
|
570
|
+
const auto num_entries = read<uint32_t>(is);
|
571
|
+
read<uint32_t>(is); //unused
|
572
|
+
const auto theta = read<uint64_t>(is);
|
573
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
574
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
575
|
+
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
576
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
577
|
+
return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
|
578
|
+
}
|
579
|
+
|
580
|
+
template<typename A>
|
581
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
|
582
|
+
uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
|
583
|
+
{
|
584
|
+
read<uint8_t>(is); // unused
|
585
|
+
read<uint16_t>(is); // unused
|
586
|
+
const uint16_t seed_hash = read<uint16_t>(is);
|
587
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
588
|
+
if (preamble_longs == 1) {
|
589
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
590
|
+
std::vector<uint64_t, A> entries(0, 0, allocator);
|
591
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
592
|
+
} else if (preamble_longs == 2) {
|
593
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
594
|
+
read<uint32_t>(is); // unused
|
595
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
596
|
+
if (num_entries == 0) {
|
597
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
598
|
+
}
|
599
|
+
read(is, entries.data(), entries.size() * sizeof(uint64_t));
|
600
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
601
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
602
|
+
} else if (preamble_longs == 3) {
|
603
|
+
const uint32_t num_entries = read<uint32_t>(is);
|
604
|
+
read<uint32_t>(is); // unused
|
605
|
+
const auto theta = read<uint64_t>(is);
|
606
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
607
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
608
|
+
if (is_empty) {
|
436
609
|
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
437
|
-
return compact_theta_sketch_alloc(
|
610
|
+
return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
|
611
|
+
} else {
|
612
|
+
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
613
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
614
|
+
return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
|
615
|
+
}
|
616
|
+
} else {
|
617
|
+
throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
|
438
618
|
}
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
619
|
+
}
|
620
|
+
|
621
|
+
template<typename A>
|
622
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
|
623
|
+
uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
|
624
|
+
{
|
625
|
+
read<uint16_t>(is); // unused
|
626
|
+
const auto flags_byte = read<uint8_t>(is);
|
627
|
+
const auto seed_hash = read<uint16_t>(is);
|
628
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
629
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
630
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
631
|
+
uint32_t num_entries = 0;
|
632
|
+
if (!is_empty) {
|
633
|
+
if (preamble_longs == 1) {
|
634
|
+
num_entries = 1;
|
635
|
+
} else {
|
636
|
+
num_entries = read<uint32_t>(is);
|
443
637
|
read<uint32_t>(is); // unused
|
444
|
-
|
445
|
-
|
446
|
-
const auto theta = read<uint64_t>(is);
|
447
|
-
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
448
|
-
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
449
|
-
if (!is_empty)
|
450
|
-
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
451
|
-
if (!is.good())
|
452
|
-
throw std::runtime_error("error reading from std::istream");
|
453
|
-
return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
|
638
|
+
if (preamble_longs > 2) theta = read<uint64_t>(is);
|
639
|
+
}
|
454
640
|
}
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
|
487
|
-
} else {
|
488
|
-
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
489
|
-
if (!is.good())
|
490
|
-
throw std::runtime_error("error reading from std::istream");
|
491
|
-
return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
|
492
|
-
}
|
493
|
-
} else {
|
494
|
-
throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
|
495
|
-
}
|
641
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
642
|
+
if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
643
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
644
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
645
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
646
|
+
}
|
647
|
+
|
648
|
+
template<typename A>
|
649
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
|
650
|
+
uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
|
651
|
+
{
|
652
|
+
const auto entry_bits = read<uint8_t>(is);
|
653
|
+
const auto num_entries_bytes = read<uint8_t>(is);
|
654
|
+
const auto flags_byte = read<uint8_t>(is);
|
655
|
+
const auto seed_hash = read<uint16_t>(is);
|
656
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
657
|
+
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
658
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
659
|
+
if (preamble_longs > 1) theta = read<uint64_t>(is);
|
660
|
+
uint32_t num_entries = 0;
|
661
|
+
for (unsigned i = 0; i < num_entries_bytes; ++i) {
|
662
|
+
num_entries |= read<uint8_t>(is) << (i << 3);
|
663
|
+
}
|
664
|
+
vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
|
665
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
666
|
+
|
667
|
+
// unpack blocks of 8 deltas
|
668
|
+
unsigned i;
|
669
|
+
for (i = 0; i + 7 < num_entries; i += 8) {
|
670
|
+
read(is, buffer.data(), buffer.size());
|
671
|
+
unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
|
496
672
|
}
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
673
|
+
// unpack extra deltas if fewer than 8 of them left
|
674
|
+
if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
|
675
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
676
|
+
const uint8_t* ptr = buffer.data();
|
677
|
+
uint8_t offset = 0;
|
678
|
+
for (; i < num_entries; ++i) {
|
679
|
+
offset = unpack_bits(entries[i], entry_bits, ptr, offset);
|
503
680
|
}
|
681
|
+
// undo deltas
|
682
|
+
uint64_t previous = 0;
|
683
|
+
for (i = 0; i < num_entries; ++i) {
|
684
|
+
entries[i] += previous;
|
685
|
+
previous = entries[i];
|
686
|
+
}
|
687
|
+
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
688
|
+
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
504
689
|
}
|
505
690
|
|
506
691
|
template<typename A>
|
507
692
|
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
508
693
|
auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
|
509
|
-
|
694
|
+
if (data.entry_bits == 64) { // versions 1 to 3
|
695
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
|
696
|
+
return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
|
697
|
+
std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
|
698
|
+
} else { // version 4
|
699
|
+
std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
|
700
|
+
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
|
701
|
+
// unpack blocks of 8 deltas
|
702
|
+
unsigned i;
|
703
|
+
for (i = 0; i + 7 < data.num_entries; i += 8) {
|
704
|
+
unpack_bits_block8(&entries[i], ptr, data.entry_bits);
|
705
|
+
ptr += data.entry_bits;
|
706
|
+
}
|
707
|
+
// unpack extra deltas if fewer than 8 of them left
|
708
|
+
uint8_t offset = 0;
|
709
|
+
for (; i < data.num_entries; ++i) {
|
710
|
+
offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
|
711
|
+
}
|
712
|
+
// undo deltas
|
713
|
+
uint64_t previous = 0;
|
714
|
+
for (i = 0; i < data.num_entries; ++i) {
|
715
|
+
entries[i] += previous;
|
716
|
+
previous = entries[i];
|
717
|
+
}
|
718
|
+
return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
|
719
|
+
}
|
510
720
|
}
|
511
721
|
|
512
722
|
// wrapped compact sketch
|
513
723
|
|
514
724
|
template<typename A>
|
515
|
-
wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(
|
516
|
-
|
517
|
-
is_empty_(is_empty),
|
518
|
-
is_ordered_(is_ordered),
|
519
|
-
seed_hash_(seed_hash),
|
520
|
-
num_entries_(num_entries),
|
521
|
-
theta_(theta),
|
522
|
-
entries_(entries)
|
725
|
+
wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
|
726
|
+
data_(data)
|
523
727
|
{}
|
524
728
|
|
525
729
|
template<typename A>
|
526
730
|
const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
|
527
|
-
|
528
|
-
return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
|
731
|
+
return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
|
529
732
|
}
|
530
733
|
|
531
734
|
template<typename A>
|
@@ -535,37 +738,37 @@ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
|
|
535
738
|
|
536
739
|
template<typename A>
|
537
740
|
bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
|
538
|
-
return
|
741
|
+
return data_.is_empty;
|
539
742
|
}
|
540
743
|
|
541
744
|
template<typename A>
|
542
745
|
bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
|
543
|
-
return
|
746
|
+
return data_.is_ordered;
|
544
747
|
}
|
545
748
|
|
546
749
|
template<typename A>
|
547
750
|
uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
|
548
|
-
return
|
751
|
+
return data_.theta;
|
549
752
|
}
|
550
753
|
|
551
754
|
template<typename A>
|
552
755
|
uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
|
553
|
-
return
|
756
|
+
return data_.num_entries;
|
554
757
|
}
|
555
758
|
|
556
759
|
template<typename A>
|
557
760
|
uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
|
558
|
-
return
|
761
|
+
return data_.seed_hash;
|
559
762
|
}
|
560
763
|
|
561
764
|
template<typename A>
|
562
765
|
auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
|
563
|
-
return
|
766
|
+
return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
|
564
767
|
}
|
565
768
|
|
566
769
|
template<typename A>
|
567
770
|
auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
|
568
|
-
return
|
771
|
+
return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
|
569
772
|
}
|
570
773
|
|
571
774
|
template<typename A>
|
@@ -574,12 +777,109 @@ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&)
|
|
574
777
|
template<typename A>
|
575
778
|
void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
|
576
779
|
os << "### Retained entries" << std::endl;
|
577
|
-
for (const auto
|
780
|
+
for (const auto hash: *this) {
|
578
781
|
os << hash << std::endl;
|
579
782
|
}
|
580
783
|
os << "### End retained entries" << std::endl;
|
581
784
|
}
|
582
785
|
|
786
|
+
// assumes index == 0 or index == num_entries
|
787
|
+
template<typename Allocator>
|
788
|
+
wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
|
789
|
+
const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
|
790
|
+
ptr_(ptr),
|
791
|
+
entry_bits_(entry_bits),
|
792
|
+
num_entries_(num_entries),
|
793
|
+
index_(index),
|
794
|
+
previous_(0),
|
795
|
+
is_block_mode_(num_entries_ >= 8),
|
796
|
+
buf_i_(0),
|
797
|
+
offset_(0)
|
798
|
+
{
|
799
|
+
if (entry_bits == 64) { // no compression
|
800
|
+
ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
|
801
|
+
} else if (index < num_entries) {
|
802
|
+
if (is_block_mode_) {
|
803
|
+
unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
|
804
|
+
ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
|
805
|
+
for (int i = 0; i < 8; ++i) {
|
806
|
+
buffer_[i] += previous_;
|
807
|
+
previous_ = buffer_[i];
|
808
|
+
}
|
809
|
+
} else {
|
810
|
+
offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
|
811
|
+
buffer_[0] += previous_;
|
812
|
+
previous_ = buffer_[0];
|
813
|
+
}
|
814
|
+
}
|
815
|
+
}
|
816
|
+
|
817
|
+
template<typename Allocator>
|
818
|
+
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
|
819
|
+
if (entry_bits_ == 64) { // no compression
|
820
|
+
ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
|
821
|
+
return *this;
|
822
|
+
}
|
823
|
+
++index_;
|
824
|
+
if (index_ < num_entries_) {
|
825
|
+
if (is_block_mode_) {
|
826
|
+
++buf_i_;
|
827
|
+
if (buf_i_ == 8) {
|
828
|
+
buf_i_ = 0;
|
829
|
+
if (index_ + 8 < num_entries_) {
|
830
|
+
unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
|
831
|
+
ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
|
832
|
+
for (int i = 0; i < 8; ++i) {
|
833
|
+
buffer_[i] += previous_;
|
834
|
+
previous_ = buffer_[i];
|
835
|
+
}
|
836
|
+
} else {
|
837
|
+
is_block_mode_ = false;
|
838
|
+
offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
|
839
|
+
buffer_[0] += previous_;
|
840
|
+
previous_ = buffer_[0];
|
841
|
+
}
|
842
|
+
}
|
843
|
+
} else {
|
844
|
+
offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
|
845
|
+
buffer_[0] += previous_;
|
846
|
+
previous_ = buffer_[0];
|
847
|
+
}
|
848
|
+
}
|
849
|
+
return *this;
|
850
|
+
}
|
851
|
+
|
852
|
+
template<typename Allocator>
|
853
|
+
auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
|
854
|
+
const_iterator tmp(*this);
|
855
|
+
operator++();
|
856
|
+
return tmp;
|
857
|
+
}
|
858
|
+
|
859
|
+
template<typename Allocator>
|
860
|
+
bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
|
861
|
+
if (entry_bits_ == 64) return ptr_ != other.ptr_;
|
862
|
+
return index_ != other.index_;
|
863
|
+
}
|
864
|
+
|
865
|
+
template<typename Allocator>
|
866
|
+
bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
|
867
|
+
if (entry_bits_ == 64) return ptr_ == other.ptr_;
|
868
|
+
return index_ == other.index_;
|
869
|
+
}
|
870
|
+
|
871
|
+
template<typename Allocator>
|
872
|
+
const uint64_t& wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const {
|
873
|
+
if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
|
874
|
+
return buffer_[buf_i_];
|
875
|
+
}
|
876
|
+
|
877
|
+
template<typename Allocator>
|
878
|
+
const uint64_t* wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const {
|
879
|
+
if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
|
880
|
+
return buffer_ + buf_i_;
|
881
|
+
}
|
882
|
+
|
583
883
|
} /* namespace datasketches */
|
584
884
|
|
585
885
|
#endif
|