datasketches 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -27,7 +27,8 @@
27
27
  #include "serde.hpp"
28
28
  #include "binomial_bounds.hpp"
29
29
  #include "theta_helpers.hpp"
30
- #include "compact_theta_sketch_parser.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "bit_packing.hpp"
31
32
 
32
33
  namespace datasketches {
33
34
 
@@ -38,7 +39,8 @@ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
38
39
 
39
40
  template<typename A>
40
41
  double base_theta_sketch_alloc<A>::get_theta() const {
41
- return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
42
+ return static_cast<double>(get_theta64()) /
43
+ static_cast<double>(theta_constants::MAX_THETA);
42
44
  }
43
45
 
44
46
  template<typename A>
@@ -343,12 +345,9 @@ template<typename A>
343
345
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
344
346
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
345
347
  write(os, preamble_longs);
346
- const uint8_t serial_version = SERIAL_VERSION;
347
- write(os, serial_version);
348
- const uint8_t type = SKETCH_TYPE;
349
- write(os, type);
350
- const uint16_t unused16 = 0;
351
- write(os, unused16);
348
+ write(os, UNCOMPRESSED_SERIAL_VERSION);
349
+ write(os, SKETCH_TYPE);
350
+ write<uint16_t>(os, 0); // unused
352
351
  const uint8_t flags_byte(
353
352
  (1 << flags::IS_COMPACT) |
354
353
  (1 << flags::IS_READ_ONLY) |
@@ -356,13 +355,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
356
355
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
357
356
  );
358
357
  write(os, flags_byte);
359
- const uint16_t seed_hash = get_seed_hash();
360
- write(os, seed_hash);
358
+ write(os, get_seed_hash());
361
359
  if (preamble_longs > 1) {
362
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
363
- write(os, num_entries);
364
- const uint32_t unused32 = 0;
365
- write(os, unused32);
360
+ write<uint32_t>(os, entries_.size());
361
+ write<uint32_t>(os, 0); // unused
366
362
  }
367
363
  if (this->is_estimation_mode()) write(os, this->theta_);
368
364
  if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
@@ -376,11 +372,9 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
376
372
  vector_bytes bytes(size, 0, entries_.get_allocator());
377
373
  uint8_t* ptr = bytes.data() + header_size_bytes;
378
374
 
379
- ptr += copy_to_mem(preamble_longs, ptr);
380
- const uint8_t serial_version = SERIAL_VERSION;
381
- ptr += copy_to_mem(serial_version, ptr);
382
- const uint8_t type = SKETCH_TYPE;
383
- ptr += copy_to_mem(type, ptr);
375
+ *ptr++ = preamble_longs;
376
+ *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
377
+ *ptr++ = SKETCH_TYPE;
384
378
  ptr += sizeof(uint16_t); // unused
385
379
  const uint8_t flags_byte(
386
380
  (1 << flags::IS_COMPACT) |
@@ -388,12 +382,10 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
388
382
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
389
383
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
390
384
  );
391
- ptr += copy_to_mem(flags_byte, ptr);
392
- const uint16_t seed_hash = get_seed_hash();
393
- ptr += copy_to_mem(seed_hash, ptr);
385
+ *ptr++ = flags_byte;
386
+ ptr += copy_to_mem(get_seed_hash(), ptr);
394
387
  if (preamble_longs > 1) {
395
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
396
- ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
397
389
  ptr += sizeof(uint32_t); // unused
398
390
  }
399
391
  if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
@@ -401,131 +393,342 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
401
393
  return bytes;
402
394
  }
403
395
 
396
+ template<typename A>
397
+ bool compact_theta_sketch_alloc<A>::is_suitable_for_compression() const {
398
+ if (!this->is_ordered() || entries_.size() == 0 ||
399
+ (entries_.size() == 1 && !this->is_estimation_mode())) return false;
400
+ return true;
401
+ }
402
+
403
+ template<typename A>
404
+ void compact_theta_sketch_alloc<A>::serialize_compressed(std::ostream& os) const {
405
+ if (is_suitable_for_compression()) return serialize_version_4(os);
406
+ return serialize(os);
407
+ }
408
+
409
+ template<typename A>
410
+ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
411
+ if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
412
+ return serialize(header_size_bytes);
413
+ }
414
+
415
+ template<typename A>
416
+ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
417
+ // compression is based on leading zeros in deltas between ordered hash values
418
+ // assumes ordered sketch
419
+ uint64_t previous = 0;
420
+ uint64_t ored = 0;
421
+ for (const uint64_t entry: entries_) {
422
+ const uint64_t delta = entry - previous;
423
+ ored |= delta;
424
+ previous = entry;
425
+ }
426
+ return count_leading_zeros_in_u64(ored);
427
+ }
428
+
429
+ template<typename A>
430
+ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
431
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
432
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
433
+
434
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
435
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
436
+
437
+ write(os, preamble_longs);
438
+ write(os, COMPRESSED_SERIAL_VERSION);
439
+ write(os, SKETCH_TYPE);
440
+ write(os, entry_bits);
441
+ write(os, num_entries_bytes);
442
+ const uint8_t flags_byte(
443
+ (1 << flags::IS_COMPACT) |
444
+ (1 << flags::IS_READ_ONLY) |
445
+ (1 << flags::IS_ORDERED)
446
+ );
447
+ write(os, flags_byte);
448
+ write(os, get_seed_hash());
449
+ if (this->is_estimation_mode()) write(os, this->theta_);
450
+ uint32_t num_entries = entries_.size();
451
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
452
+ write<uint8_t>(os, num_entries & 0xff);
453
+ num_entries >>= 8;
454
+ }
455
+
456
+ uint64_t previous = 0;
457
+ uint64_t deltas[8];
458
+ vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
459
+
460
+ // pack blocks of 8 deltas
461
+ unsigned i;
462
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
463
+ for (unsigned j = 0; j < 8; ++j) {
464
+ deltas[j] = entries_[i + j] - previous;
465
+ previous = entries_[i + j];
466
+ }
467
+ pack_bits_block8(deltas, buffer.data(), entry_bits);
468
+ write(os, buffer.data(), buffer.size());
469
+ }
470
+
471
+ // pack extra deltas if fewer than 8 of them left
472
+ if (i < entries_.size()) {
473
+ uint8_t offset = 0;
474
+ uint8_t* ptr = buffer.data();
475
+ for (; i < entries_.size(); ++i) {
476
+ const uint64_t delta = entries_[i] - previous;
477
+ previous = entries_[i];
478
+ offset = pack_bits(delta, entry_bits, ptr, offset);
479
+ }
480
+ write(os, buffer.data(), ptr - buffer.data());
481
+ }
482
+ }
483
+
484
+ template<typename A>
485
+ auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
486
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
487
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
488
+ const size_t compressed_bits = entry_bits * entries_.size();
489
+
490
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
491
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
492
+
493
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
494
+ + whole_bytes_to_hold_bits(compressed_bits);
495
+ vector_bytes bytes(size, 0, entries_.get_allocator());
496
+ uint8_t* ptr = bytes.data() + header_size_bytes;
497
+
498
+ *ptr++ = preamble_longs;
499
+ *ptr++ = COMPRESSED_SERIAL_VERSION;
500
+ *ptr++ = SKETCH_TYPE;
501
+ *ptr++ = entry_bits;
502
+ *ptr++ = num_entries_bytes;
503
+ const uint8_t flags_byte(
504
+ (1 << flags::IS_COMPACT) |
505
+ (1 << flags::IS_READ_ONLY) |
506
+ (1 << flags::IS_ORDERED)
507
+ );
508
+ *ptr++ = flags_byte;
509
+ ptr += copy_to_mem(get_seed_hash(), ptr);
510
+ if (this->is_estimation_mode()) {
511
+ ptr += copy_to_mem(theta_, ptr);
512
+ }
513
+ uint32_t num_entries = entries_.size();
514
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
515
+ *ptr++ = num_entries & 0xff;
516
+ num_entries >>= 8;
517
+ }
518
+
519
+ uint64_t previous = 0;
520
+ uint64_t deltas[8];
521
+
522
+ // pack blocks of 8 deltas
523
+ unsigned i;
524
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
525
+ for (unsigned j = 0; j < 8; ++j) {
526
+ deltas[j] = entries_[i + j] - previous;
527
+ previous = entries_[i + j];
528
+ }
529
+ pack_bits_block8(deltas, ptr, entry_bits);
530
+ ptr += entry_bits;
531
+ }
532
+
533
+ // pack extra deltas if fewer than 8 of them left
534
+ uint8_t offset = 0;
535
+ for (; i < entries_.size(); ++i) {
536
+ const uint64_t delta = entries_[i] - previous;
537
+ previous = entries_[i];
538
+ offset = pack_bits(delta, entry_bits, ptr, offset);
539
+ }
540
+ return bytes;
541
+ }
542
+
404
543
  template<typename A>
405
544
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
406
545
  const auto preamble_longs = read<uint8_t>(is);
407
546
  const auto serial_version = read<uint8_t>(is);
408
547
  const auto type = read<uint8_t>(is);
548
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
409
549
  switch (serial_version) {
410
- case SERIAL_VERSION: {
411
- read<uint16_t>(is); // unused
412
- const auto flags_byte = read<uint8_t>(is);
413
- const auto seed_hash = read<uint16_t>(is);
414
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
415
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
416
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
417
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
418
-
419
- uint64_t theta = theta_constants::MAX_THETA;
420
- uint32_t num_entries = 0;
421
- if (!is_empty) {
422
- if (preamble_longs == 1) {
423
- num_entries = 1;
424
- } else {
425
- num_entries = read<uint32_t>(is);
426
- read<uint32_t>(is); // unused
427
- if (preamble_longs > 2) {
428
- theta = read<uint64_t>(is);
429
- }
430
- }
431
- }
432
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
433
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
550
+ case 4:
551
+ return deserialize_v4(preamble_longs, is, seed, allocator);
552
+ case 3:
553
+ return deserialize_v3(preamble_longs, is, seed, allocator);
554
+ case 1:
555
+ return deserialize_v1(preamble_longs, is, seed, allocator);
556
+ case 2:
557
+ return deserialize_v2(preamble_longs, is, seed, allocator);
558
+ default:
559
+ throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
560
+ }
561
+ }
434
562
 
435
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
563
+ template<typename A>
564
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
565
+ uint8_t, std::istream& is, uint64_t seed, const A& allocator)
566
+ {
567
+ const auto seed_hash = compute_seed_hash(seed);
568
+ read<uint8_t>(is); // unused
569
+ read<uint32_t>(is); // unused
570
+ const auto num_entries = read<uint32_t>(is);
571
+ read<uint32_t>(is); //unused
572
+ const auto theta = read<uint64_t>(is);
573
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
574
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
575
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
576
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
577
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
578
+ }
579
+
580
+ template<typename A>
581
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
582
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
583
+ {
584
+ read<uint8_t>(is); // unused
585
+ read<uint16_t>(is); // unused
586
+ const uint16_t seed_hash = read<uint16_t>(is);
587
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
588
+ if (preamble_longs == 1) {
589
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
590
+ std::vector<uint64_t, A> entries(0, 0, allocator);
591
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
592
+ } else if (preamble_longs == 2) {
593
+ const uint32_t num_entries = read<uint32_t>(is);
594
+ read<uint32_t>(is); // unused
595
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
596
+ if (num_entries == 0) {
597
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
598
+ }
599
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
600
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
601
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
602
+ } else if (preamble_longs == 3) {
603
+ const uint32_t num_entries = read<uint32_t>(is);
604
+ read<uint32_t>(is); // unused
605
+ const auto theta = read<uint64_t>(is);
606
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
607
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
608
+ if (is_empty) {
436
609
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
437
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
610
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
611
+ } else {
612
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
613
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
614
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
615
+ }
616
+ } else {
617
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
438
618
  }
439
- case 1: {
440
- const auto seed_hash = compute_seed_hash(seed);
441
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
442
- read<uint8_t>(is); // unused
619
+ }
620
+
621
+ template<typename A>
622
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
623
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
624
+ {
625
+ read<uint16_t>(is); // unused
626
+ const auto flags_byte = read<uint8_t>(is);
627
+ const auto seed_hash = read<uint16_t>(is);
628
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
629
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
630
+ uint64_t theta = theta_constants::MAX_THETA;
631
+ uint32_t num_entries = 0;
632
+ if (!is_empty) {
633
+ if (preamble_longs == 1) {
634
+ num_entries = 1;
635
+ } else {
636
+ num_entries = read<uint32_t>(is);
443
637
  read<uint32_t>(is); // unused
444
- const auto num_entries = read<uint32_t>(is);
445
- read<uint32_t>(is); //unused
446
- const auto theta = read<uint64_t>(is);
447
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
448
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
449
- if (!is_empty)
450
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
451
- if (!is.good())
452
- throw std::runtime_error("error reading from std::istream");
453
- return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
638
+ if (preamble_longs > 2) theta = read<uint64_t>(is);
639
+ }
454
640
  }
455
- case 2: {
456
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
- read<uint8_t>(is); // unused
458
- read<uint16_t>(is); // unused
459
- const uint16_t seed_hash = read<uint16_t>(is);
460
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
461
- if (preamble_longs == 1) {
462
- if (!is.good())
463
- throw std::runtime_error("error reading from std::istream");
464
- std::vector<uint64_t, A> entries(0, 0, allocator);
465
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
466
- } else if (preamble_longs == 2) {
467
- const uint32_t num_entries = read<uint32_t>(is);
468
- read<uint32_t>(is); // unused
469
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
470
- if (num_entries == 0) {
471
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
472
- }
473
- read(is, entries.data(), entries.size() * sizeof(uint64_t));
474
- if (!is.good())
475
- throw std::runtime_error("error reading from std::istream");
476
- return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
477
- } else if (preamble_longs == 3) {
478
- const uint32_t num_entries = read<uint32_t>(is);
479
- read<uint32_t>(is); // unused
480
- const auto theta = read<uint64_t>(is);
481
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
482
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
483
- if (is_empty) {
484
- if (!is.good())
485
- throw std::runtime_error("error reading from std::istream");
486
- return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
487
- } else {
488
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
489
- if (!is.good())
490
- throw std::runtime_error("error reading from std::istream");
491
- return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
492
- }
493
- } else {
494
- throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
495
- }
641
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
642
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
643
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
644
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
645
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
646
+ }
647
+
648
+ template<typename A>
649
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
650
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
651
+ {
652
+ const auto entry_bits = read<uint8_t>(is);
653
+ const auto num_entries_bytes = read<uint8_t>(is);
654
+ const auto flags_byte = read<uint8_t>(is);
655
+ const auto seed_hash = read<uint16_t>(is);
656
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
657
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
658
+ uint64_t theta = theta_constants::MAX_THETA;
659
+ if (preamble_longs > 1) theta = read<uint64_t>(is);
660
+ uint32_t num_entries = 0;
661
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
662
+ num_entries |= read<uint8_t>(is) << (i << 3);
663
+ }
664
+ vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
665
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
666
+
667
+ // unpack blocks of 8 deltas
668
+ unsigned i;
669
+ for (i = 0; i + 7 < num_entries; i += 8) {
670
+ read(is, buffer.data(), buffer.size());
671
+ unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
496
672
  }
497
- default:
498
- // this should always fail since the valid cases are handled above
499
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
500
- // this throw is never reached, because check_serial_version will throw an informative exception.
501
- // This is only here to avoid a compiler warning about a path without a return value.
502
- throw std::invalid_argument("unexpected sketch serialization version");
673
+ // unpack extra deltas if fewer than 8 of them left
674
+ if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
675
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
676
+ const uint8_t* ptr = buffer.data();
677
+ uint8_t offset = 0;
678
+ for (; i < num_entries; ++i) {
679
+ offset = unpack_bits(entries[i], entry_bits, ptr, offset);
503
680
  }
681
+ // undo deltas
682
+ uint64_t previous = 0;
683
+ for (i = 0; i < num_entries; ++i) {
684
+ entries[i] += previous;
685
+ previous = entries[i];
686
+ }
687
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
688
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
504
689
  }
505
690
 
506
691
  template<typename A>
507
692
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
508
693
  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
509
- return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
694
+ if (data.entry_bits == 64) { // versions 1 to 3
695
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
696
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
697
+ std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
698
+ } else { // version 4
699
+ std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
700
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
701
+ // unpack blocks of 8 deltas
702
+ unsigned i;
703
+ for (i = 0; i + 7 < data.num_entries; i += 8) {
704
+ unpack_bits_block8(&entries[i], ptr, data.entry_bits);
705
+ ptr += data.entry_bits;
706
+ }
707
+ // unpack extra deltas if fewer than 8 of them left
708
+ uint8_t offset = 0;
709
+ for (; i < data.num_entries; ++i) {
710
+ offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
711
+ }
712
+ // undo deltas
713
+ uint64_t previous = 0;
714
+ for (i = 0; i < data.num_entries; ++i) {
715
+ entries[i] += previous;
716
+ previous = entries[i];
717
+ }
718
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
719
+ }
510
720
  }
511
721
 
512
722
  // wrapped compact sketch
513
723
 
514
724
  template<typename A>
515
- wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
516
- uint64_t theta, const uint64_t* entries):
517
- is_empty_(is_empty),
518
- is_ordered_(is_ordered),
519
- seed_hash_(seed_hash),
520
- num_entries_(num_entries),
521
- theta_(theta),
522
- entries_(entries)
725
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
726
+ data_(data)
523
727
  {}
524
728
 
525
729
  template<typename A>
526
730
  const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
527
- auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
528
- return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
731
+ return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
529
732
  }
530
733
 
531
734
  template<typename A>
@@ -535,37 +738,37 @@ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
535
738
 
536
739
  // Returns the is_empty flag. The added ('+') line reads it from the stored `data_` record
  // instead of the removed `is_empty_` member.
  template<typename A>
537
740
  bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
538
- return is_empty_;
741
+ return data_.is_empty;
539
742
  }
540
743
 
541
744
  // Returns the is_ordered flag. The added ('+') line reads it from the stored `data_` record
  // instead of the removed `is_ordered_` member.
  template<typename A>
542
745
  bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
543
- return is_ordered_;
746
+ return data_.is_ordered;
544
747
  }
545
748
 
546
749
  // Returns theta as a 64-bit unsigned integer. The added ('+') line reads it from the stored
  // `data_` record instead of the removed `theta_` member.
  template<typename A>
547
750
  uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
548
- return theta_;
751
+ return data_.theta;
549
752
  }
550
753
 
551
754
  // Returns the number of retained entries. The removed ('-') line cast `num_entries_` to
  // uint32_t; the added ('+') line returns data_.num_entries with no cast.
  // NOTE(review): presumably data_type::num_entries is already uint32_t - confirm, otherwise the
  // conversion is now implicit.
  template<typename A>
552
755
  uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
553
- return static_cast<uint32_t>(num_entries_);
756
+ return data_.num_entries;
554
757
  }
555
758
 
556
759
  // Returns the 16-bit seed hash. The added ('+') line reads it from the stored `data_` record
  // instead of the removed `seed_hash_` member.
  template<typename A>
557
760
  uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
558
- return seed_hash_;
761
+ return data_.seed_hash;
559
762
  }
560
763
 
561
764
  // Returns an iterator at the first entry. The removed ('-') line returned the raw `entries_`
  // pointer; the added ('+') line constructs a const_iterator from the start pointer, the
  // per-entry bit width, the entry count, and index 0.
  template<typename A>
562
765
  auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
563
- return entries_;
766
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
564
767
  }
565
768
 
566
769
  // Returns the past-the-end iterator. The removed ('-') line returned `entries_ + num_entries_`;
  // the added ('+') line constructs a const_iterator with the same arguments as begin() except
  // that the index equals the entry count.
  template<typename A>
567
770
  auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
568
- return entries_ + num_entries_;
771
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
569
772
  }
570
773
 
571
774
  template<typename A>
@@ -574,12 +777,109 @@ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&)
574
777
  // Writes the retained entries to `os`: a header line, then every value produced by iterating
  // over *this on its own line, then a footer line. The added ('+') line takes the loop variable
  // by value where the removed ('-') line bound it by const reference.
  template<typename A>
575
778
  void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
576
779
  os << "### Retained entries" << std::endl;
577
- for (const auto& hash: *this) {
780
+ for (const auto hash: *this) {
578
781
  os << hash << std::endl;
579
782
  }
580
783
  os << "### End retained entries" << std::endl;
581
784
  }
582
785
 
786
+ // assumes index == 0 or index == num_entries
// Constructs an iterator over `num_entries` entries that start at `ptr`, each `entry_bits` wide.
//  - entry_bits == 64: entries are read in place as uint64_t, so ptr_ is simply moved to `index`.
//  - otherwise, when index < num_entries, the first value(s) are decoded into buffer_: eight at
//    once when num_entries >= 8, a single one otherwise. Each decoded value has the running
//    `previous_` added to it and then becomes the new `previous_`.
//  - otherwise (index == num_entries) nothing is decoded; only index_ identifies the position.
// NOTE(review): is_block_mode_ is initialised from the member num_entries_, not the parameter.
// Members are initialised in declaration order, so this relies on num_entries_ being declared
// before is_block_mode_ in the class - confirm in the header.
787
+ template<typename Allocator>
788
+ wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
789
+ const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
790
+ ptr_(ptr),
791
+ entry_bits_(entry_bits),
792
+ num_entries_(num_entries),
793
+ index_(index),
794
+ previous_(0),
795
+ is_block_mode_(num_entries_ >= 8),
796
+ buf_i_(0),
797
+ offset_(0)
798
+ {
799
+ if (entry_bits == 64) { // no compression
800
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
801
+ } else if (index < num_entries) {
802
+ if (is_block_mode_) {
803
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
// eight values of entry_bits_ bits each occupy exactly entry_bits_ bytes
804
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
805
+ for (int i = 0; i < 8; ++i) {
806
+ buffer_[i] += previous_;
807
+ previous_ = buffer_[i];
808
+ }
809
+ } else {
// ptr_ is passed as a reference to pointer; presumably unpack_bits advances it and returns the
// new bit offset within the current byte - confirm against unpack_bits
810
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
811
+ buffer_[0] += previous_;
812
+ previous_ = buffer_[0];
813
+ }
814
+ }
815
+ }
816
+
817
// Pre-increment: moves to the next entry and returns *this.
//  - entry_bits_ == 64: only ptr_ is advanced by one uint64_t; index_ is left untouched (the
//    comparison operators use ptr_ in this mode).
//  - otherwise index_ is incremented and, while it is still below num_entries_, the next value
//    is made available in buffer_: in block mode by stepping buf_i_ through the eight decoded
//    values and refilling when they are used up, in single mode by decoding one value into
//    buffer_[0]. Every decoded value has the running `previous_` added to it.
+ template<typename Allocator>
818
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
819
+ if (entry_bits_ == 64) { // no compression
820
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
821
+ return *this;
822
+ }
823
+ ++index_;
824
+ if (index_ < num_entries_) {
825
+ if (is_block_mode_) {
826
+ ++buf_i_;
827
+ if (buf_i_ == 8) {
828
+ buf_i_ = 0;
// NOTE(review): the comparison is strict, so when exactly eight entries remain the iterator
// leaves block mode and decodes them one at a time, whereas the bulk unpack loop earlier in
// this file uses `i + 7 < num_entries`. Presumably both paths read the same bits - confirm.
829
+ if (index_ + 8 < num_entries_) {
830
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
// eight values of entry_bits_ bits each occupy exactly entry_bits_ bytes
831
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
832
+ for (int i = 0; i < 8; ++i) {
833
+ buffer_[i] += previous_;
834
+ previous_ = buffer_[i];
835
+ }
836
+ } else {
// switch to single mode for the rest; buf_i_ is already 0, so operator* reads buffer_[0]
837
+ is_block_mode_ = false;
838
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
839
+ buffer_[0] += previous_;
840
+ previous_ = buffer_[0];
841
+ }
842
+ }
843
+ } else {
844
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
845
+ buffer_[0] += previous_;
846
+ previous_ = buffer_[0];
847
+ }
848
+ }
849
+ return *this;
850
+ }
851
+
852
// Post-increment: copies the iterator, advances *this with the pre-increment operator, and
// returns the copy taken before the advance.
+ template<typename Allocator>
853
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
854
+ const_iterator tmp(*this);
855
+ operator++();
856
+ return tmp;
857
+ }
858
+
859
// Inequality: compares ptr_ when this iterator's entry_bits_ is 64, and index_ otherwise.
// Only this iterator's entry_bits_ is consulted; `other` is presumably an iterator over the
// same sketch.
+ template<typename Allocator>
860
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
861
+ if (entry_bits_ == 64) return ptr_ != other.ptr_;
862
+ return index_ != other.index_;
863
+ }
864
+
865
// Equality: compares ptr_ when this iterator's entry_bits_ is 64, and index_ otherwise.
// Only this iterator's entry_bits_ is consulted; `other` is presumably an iterator over the
// same sketch.
+ template<typename Allocator>
866
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
867
+ if (entry_bits_ == 64) return ptr_ == other.ptr_;
868
+ return index_ == other.index_;
869
+ }
870
+
871
// Dereference: with entry_bits_ == 64 returns a reference to the uint64_t that ptr_ points at;
// otherwise returns a reference to the decoded value at buffer_[buf_i_]. In the second case
// the reference points into the iterator's own buffer_, which operator++ overwrites.
+ template<typename Allocator>
872
+ const uint64_t& wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const {
873
+ if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
874
+ return buffer_[buf_i_];
875
+ }
876
+
877
// Member access: with entry_bits_ == 64 returns ptr_ as a uint64_t pointer; otherwise returns
// the address of the decoded value at buffer_[buf_i_], which lies inside the iterator's own
// buffer_.
+ template<typename Allocator>
878
+ const uint64_t* wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const {
879
+ if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
880
+ return buffer_ + buf_i_;
881
+ }
882
+
583
883
  } /* namespace datasketches */
584
884
 
585
885
  #endif