datasketches 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (113)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/cpc_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +22 -20
  6. data/vendor/datasketches-cpp/NOTICE +1 -1
  7. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +25 -27
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +8 -6
  9. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +11 -0
  10. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +5 -4
  11. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
  12. data/vendor/datasketches-cpp/common/test/integration_test.cpp +6 -0
  13. data/vendor/datasketches-cpp/count/CMakeLists.txt +42 -0
  14. data/vendor/datasketches-cpp/count/include/count_min.hpp +351 -0
  15. data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +517 -0
  16. data/vendor/datasketches-cpp/count/test/CMakeLists.txt +43 -0
  17. data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +155 -0
  18. data/vendor/datasketches-cpp/count/test/count_min_test.cpp +306 -0
  19. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +3 -3
  20. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +1 -1
  21. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +16 -8
  22. data/vendor/datasketches-cpp/density/CMakeLists.txt +42 -0
  23. data/vendor/datasketches-cpp/density/include/density_sketch.hpp +236 -0
  24. data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +543 -0
  25. data/vendor/datasketches-cpp/density/test/CMakeLists.txt +35 -0
  26. data/vendor/datasketches-cpp/density/test/density_sketch_test.cpp +244 -0
  27. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +9 -3
  28. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +19 -11
  29. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +2 -5
  30. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +19 -7
  31. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +1 -1
  32. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +98 -42
  33. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +2 -0
  34. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +92 -59
  35. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +16 -6
  36. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +3 -21
  37. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +8 -0
  38. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +14 -6
  39. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +1 -1
  40. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +8 -2
  41. data/vendor/datasketches-cpp/hll/include/hll.hpp +9 -8
  42. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +7 -1
  43. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -1
  44. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +8 -3
  45. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +2 -2
  46. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +2 -2
  47. data/vendor/datasketches-cpp/python/CMakeLists.txt +6 -0
  48. data/vendor/datasketches-cpp/python/README.md +5 -5
  49. data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +87 -0
  50. data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +35 -0
  51. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +15 -9
  52. data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +77 -0
  53. data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +205 -0
  54. data/vendor/datasketches-cpp/python/datasketches/__init__.py +17 -1
  55. data/vendor/datasketches-cpp/python/include/kernel_function.hpp +98 -0
  56. data/vendor/datasketches-cpp/python/include/py_object_lt.hpp +37 -0
  57. data/vendor/datasketches-cpp/python/include/py_object_ostream.hpp +48 -0
  58. data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +104 -0
  59. data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +136 -0
  60. data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +101 -0
  61. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +16 -30
  62. data/vendor/datasketches-cpp/python/src/datasketches.cpp +6 -0
  63. data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +95 -0
  64. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +127 -73
  65. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +28 -36
  66. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +108 -160
  67. data/vendor/datasketches-cpp/python/src/py_serde.cpp +5 -4
  68. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +99 -148
  69. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +117 -178
  70. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +67 -73
  71. data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +215 -0
  72. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +1 -1
  73. data/vendor/datasketches-cpp/python/tests/count_min_test.py +86 -0
  74. data/vendor/datasketches-cpp/python/tests/cpc_test.py +10 -10
  75. data/vendor/datasketches-cpp/python/tests/density_test.py +93 -0
  76. data/vendor/datasketches-cpp/python/tests/fi_test.py +41 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +19 -20
  78. data/vendor/datasketches-cpp/python/tests/kll_test.py +40 -6
  79. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +39 -5
  80. data/vendor/datasketches-cpp/python/tests/req_test.py +38 -5
  81. data/vendor/datasketches-cpp/python/tests/theta_test.py +16 -14
  82. data/vendor/datasketches-cpp/python/tests/tuple_test.py +206 -0
  83. data/vendor/datasketches-cpp/python/tests/vo_test.py +7 -0
  84. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +8 -3
  85. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +4 -4
  86. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +1 -1
  87. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +0 -2
  88. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +8 -3
  89. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +2 -2
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +20 -6
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +30 -16
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +5 -1
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +19 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +33 -14
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -2
  96. data/vendor/datasketches-cpp/setup.py +1 -1
  97. data/vendor/datasketches-cpp/theta/CMakeLists.txt +1 -0
  98. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +6279 -0
  99. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +14 -8
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +60 -46
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +4 -2
  102. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +58 -10
  103. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +430 -130
  104. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -9
  105. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +16 -4
  106. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +2 -2
  107. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  108. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +80 -0
  109. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +42 -3
  110. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -0
  111. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +2 -1
  112. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  113. metadata +31 -3
@@ -27,7 +27,8 @@
27
27
  #include "serde.hpp"
28
28
  #include "binomial_bounds.hpp"
29
29
  #include "theta_helpers.hpp"
30
- #include "compact_theta_sketch_parser.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "bit_packing.hpp"
31
32
 
32
33
  namespace datasketches {
33
34
 
@@ -38,7 +39,8 @@ bool base_theta_sketch_alloc<A>::is_estimation_mode() const {
38
39
 
39
40
  template<typename A>
40
41
  double base_theta_sketch_alloc<A>::get_theta() const {
41
- return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
42
+ return static_cast<double>(get_theta64()) /
43
+ static_cast<double>(theta_constants::MAX_THETA);
42
44
  }
43
45
 
44
46
  template<typename A>
@@ -343,12 +345,9 @@ template<typename A>
343
345
  void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
344
346
  const uint8_t preamble_longs = this->is_estimation_mode() ? 3 : this->is_empty() || entries_.size() == 1 ? 1 : 2;
345
347
  write(os, preamble_longs);
346
- const uint8_t serial_version = SERIAL_VERSION;
347
- write(os, serial_version);
348
- const uint8_t type = SKETCH_TYPE;
349
- write(os, type);
350
- const uint16_t unused16 = 0;
351
- write(os, unused16);
348
+ write(os, UNCOMPRESSED_SERIAL_VERSION);
349
+ write(os, SKETCH_TYPE);
350
+ write<uint16_t>(os, 0); // unused
352
351
  const uint8_t flags_byte(
353
352
  (1 << flags::IS_COMPACT) |
354
353
  (1 << flags::IS_READ_ONLY) |
@@ -356,13 +355,10 @@ void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
356
355
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
357
356
  );
358
357
  write(os, flags_byte);
359
- const uint16_t seed_hash = get_seed_hash();
360
- write(os, seed_hash);
358
+ write(os, get_seed_hash());
361
359
  if (preamble_longs > 1) {
362
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
363
- write(os, num_entries);
364
- const uint32_t unused32 = 0;
365
- write(os, unused32);
360
+ write<uint32_t>(os, entries_.size());
361
+ write<uint32_t>(os, 0); // unused
366
362
  }
367
363
  if (this->is_estimation_mode()) write(os, this->theta_);
368
364
  if (entries_.size() > 0) write(os, entries_.data(), entries_.size() * sizeof(uint64_t));
@@ -376,11 +372,9 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
376
372
  vector_bytes bytes(size, 0, entries_.get_allocator());
377
373
  uint8_t* ptr = bytes.data() + header_size_bytes;
378
374
 
379
- ptr += copy_to_mem(preamble_longs, ptr);
380
- const uint8_t serial_version = SERIAL_VERSION;
381
- ptr += copy_to_mem(serial_version, ptr);
382
- const uint8_t type = SKETCH_TYPE;
383
- ptr += copy_to_mem(type, ptr);
375
+ *ptr++ = preamble_longs;
376
+ *ptr++ = UNCOMPRESSED_SERIAL_VERSION;
377
+ *ptr++ = SKETCH_TYPE;
384
378
  ptr += sizeof(uint16_t); // unused
385
379
  const uint8_t flags_byte(
386
380
  (1 << flags::IS_COMPACT) |
@@ -388,12 +382,10 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
388
382
  (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
389
383
  (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
390
384
  );
391
- ptr += copy_to_mem(flags_byte, ptr);
392
- const uint16_t seed_hash = get_seed_hash();
393
- ptr += copy_to_mem(seed_hash, ptr);
385
+ *ptr++ = flags_byte;
386
+ ptr += copy_to_mem(get_seed_hash(), ptr);
394
387
  if (preamble_longs > 1) {
395
- const uint32_t num_entries = static_cast<uint32_t>(entries_.size());
396
- ptr += copy_to_mem(num_entries, ptr);
388
+ ptr += copy_to_mem<uint32_t>(entries_.size(), ptr);
397
389
  ptr += sizeof(uint32_t); // unused
398
390
  }
399
391
  if (this->is_estimation_mode()) ptr += copy_to_mem(theta_, ptr);
@@ -401,131 +393,342 @@ auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const
401
393
  return bytes;
402
394
  }
403
395
 
396
+ template<typename A>
397
+ bool compact_theta_sketch_alloc<A>::is_suitable_for_compression() const {
398
+ if (!this->is_ordered() || entries_.size() == 0 ||
399
+ (entries_.size() == 1 && !this->is_estimation_mode())) return false;
400
+ return true;
401
+ }
402
+
403
+ template<typename A>
404
+ void compact_theta_sketch_alloc<A>::serialize_compressed(std::ostream& os) const {
405
+ if (is_suitable_for_compression()) return serialize_version_4(os);
406
+ return serialize(os);
407
+ }
408
+
409
+ template<typename A>
410
+ auto compact_theta_sketch_alloc<A>::serialize_compressed(unsigned header_size_bytes) const -> vector_bytes {
411
+ if (is_suitable_for_compression()) return serialize_version_4(header_size_bytes);
412
+ return serialize(header_size_bytes);
413
+ }
414
+
415
+ template<typename A>
416
+ uint8_t compact_theta_sketch_alloc<A>::compute_min_leading_zeros() const {
417
+ // compression is based on leading zeros in deltas between ordered hash values
418
+ // assumes ordered sketch
419
+ uint64_t previous = 0;
420
+ uint64_t ored = 0;
421
+ for (const uint64_t entry: entries_) {
422
+ const uint64_t delta = entry - previous;
423
+ ored |= delta;
424
+ previous = entry;
425
+ }
426
+ return count_leading_zeros_in_u64(ored);
427
+ }
428
+
429
+ template<typename A>
430
+ void compact_theta_sketch_alloc<A>::serialize_version_4(std::ostream& os) const {
431
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
432
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
433
+
434
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
435
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
436
+
437
+ write(os, preamble_longs);
438
+ write(os, COMPRESSED_SERIAL_VERSION);
439
+ write(os, SKETCH_TYPE);
440
+ write(os, entry_bits);
441
+ write(os, num_entries_bytes);
442
+ const uint8_t flags_byte(
443
+ (1 << flags::IS_COMPACT) |
444
+ (1 << flags::IS_READ_ONLY) |
445
+ (1 << flags::IS_ORDERED)
446
+ );
447
+ write(os, flags_byte);
448
+ write(os, get_seed_hash());
449
+ if (this->is_estimation_mode()) write(os, this->theta_);
450
+ uint32_t num_entries = entries_.size();
451
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
452
+ write<uint8_t>(os, num_entries & 0xff);
453
+ num_entries >>= 8;
454
+ }
455
+
456
+ uint64_t previous = 0;
457
+ uint64_t deltas[8];
458
+ vector_bytes buffer(entry_bits, 0, entries_.get_allocator()); // block of 8 entries takes entry_bits bytes
459
+
460
+ // pack blocks of 8 deltas
461
+ unsigned i;
462
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
463
+ for (unsigned j = 0; j < 8; ++j) {
464
+ deltas[j] = entries_[i + j] - previous;
465
+ previous = entries_[i + j];
466
+ }
467
+ pack_bits_block8(deltas, buffer.data(), entry_bits);
468
+ write(os, buffer.data(), buffer.size());
469
+ }
470
+
471
+ // pack extra deltas if fewer than 8 of them left
472
+ if (i < entries_.size()) {
473
+ uint8_t offset = 0;
474
+ uint8_t* ptr = buffer.data();
475
+ for (; i < entries_.size(); ++i) {
476
+ const uint64_t delta = entries_[i] - previous;
477
+ previous = entries_[i];
478
+ offset = pack_bits(delta, entry_bits, ptr, offset);
479
+ }
480
+ write(os, buffer.data(), ptr - buffer.data());
481
+ }
482
+ }
483
+
484
+ template<typename A>
485
+ auto compact_theta_sketch_alloc<A>::serialize_version_4(unsigned header_size_bytes) const -> vector_bytes {
486
+ const uint8_t preamble_longs = this->is_estimation_mode() ? 2 : 1;
487
+ const uint8_t entry_bits = 64 - compute_min_leading_zeros();
488
+ const size_t compressed_bits = entry_bits * entries_.size();
489
+
490
+ // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
491
+ const uint8_t num_entries_bytes = whole_bytes_to_hold_bits<uint8_t>(32 - count_leading_zeros_in_u32(entries_.size()));
492
+
493
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + num_entries_bytes
494
+ + whole_bytes_to_hold_bits(compressed_bits);
495
+ vector_bytes bytes(size, 0, entries_.get_allocator());
496
+ uint8_t* ptr = bytes.data() + header_size_bytes;
497
+
498
+ *ptr++ = preamble_longs;
499
+ *ptr++ = COMPRESSED_SERIAL_VERSION;
500
+ *ptr++ = SKETCH_TYPE;
501
+ *ptr++ = entry_bits;
502
+ *ptr++ = num_entries_bytes;
503
+ const uint8_t flags_byte(
504
+ (1 << flags::IS_COMPACT) |
505
+ (1 << flags::IS_READ_ONLY) |
506
+ (1 << flags::IS_ORDERED)
507
+ );
508
+ *ptr++ = flags_byte;
509
+ ptr += copy_to_mem(get_seed_hash(), ptr);
510
+ if (this->is_estimation_mode()) {
511
+ ptr += copy_to_mem(theta_, ptr);
512
+ }
513
+ uint32_t num_entries = entries_.size();
514
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
515
+ *ptr++ = num_entries & 0xff;
516
+ num_entries >>= 8;
517
+ }
518
+
519
+ uint64_t previous = 0;
520
+ uint64_t deltas[8];
521
+
522
+ // pack blocks of 8 deltas
523
+ unsigned i;
524
+ for (i = 0; i + 7 < entries_.size(); i += 8) {
525
+ for (unsigned j = 0; j < 8; ++j) {
526
+ deltas[j] = entries_[i + j] - previous;
527
+ previous = entries_[i + j];
528
+ }
529
+ pack_bits_block8(deltas, ptr, entry_bits);
530
+ ptr += entry_bits;
531
+ }
532
+
533
+ // pack extra deltas if fewer than 8 of them left
534
+ uint8_t offset = 0;
535
+ for (; i < entries_.size(); ++i) {
536
+ const uint64_t delta = entries_[i] - previous;
537
+ previous = entries_[i];
538
+ offset = pack_bits(delta, entry_bits, ptr, offset);
539
+ }
540
+ return bytes;
541
+ }
542
+
404
543
  template<typename A>
405
544
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
406
545
  const auto preamble_longs = read<uint8_t>(is);
407
546
  const auto serial_version = read<uint8_t>(is);
408
547
  const auto type = read<uint8_t>(is);
548
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
409
549
  switch (serial_version) {
410
- case SERIAL_VERSION: {
411
- read<uint16_t>(is); // unused
412
- const auto flags_byte = read<uint8_t>(is);
413
- const auto seed_hash = read<uint16_t>(is);
414
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
415
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
416
- const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
417
- if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
418
-
419
- uint64_t theta = theta_constants::MAX_THETA;
420
- uint32_t num_entries = 0;
421
- if (!is_empty) {
422
- if (preamble_longs == 1) {
423
- num_entries = 1;
424
- } else {
425
- num_entries = read<uint32_t>(is);
426
- read<uint32_t>(is); // unused
427
- if (preamble_longs > 2) {
428
- theta = read<uint64_t>(is);
429
- }
430
- }
431
- }
432
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
433
- if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
550
+ case 4:
551
+ return deserialize_v4(preamble_longs, is, seed, allocator);
552
+ case 3:
553
+ return deserialize_v3(preamble_longs, is, seed, allocator);
554
+ case 1:
555
+ return deserialize_v1(preamble_longs, is, seed, allocator);
556
+ case 2:
557
+ return deserialize_v2(preamble_longs, is, seed, allocator);
558
+ default:
559
+ throw std::invalid_argument("unexpected sketch serialization version " + std::to_string(serial_version));
560
+ }
561
+ }
434
562
 
435
- const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
563
+ template<typename A>
564
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v1(
565
+ uint8_t, std::istream& is, uint64_t seed, const A& allocator)
566
+ {
567
+ const auto seed_hash = compute_seed_hash(seed);
568
+ read<uint8_t>(is); // unused
569
+ read<uint32_t>(is); // unused
570
+ const auto num_entries = read<uint32_t>(is);
571
+ read<uint32_t>(is); //unused
572
+ const auto theta = read<uint64_t>(is);
573
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
574
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
575
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
576
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
577
+ return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
578
+ }
579
+
580
+ template<typename A>
581
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v2(
582
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
583
+ {
584
+ read<uint8_t>(is); // unused
585
+ read<uint16_t>(is); // unused
586
+ const uint16_t seed_hash = read<uint16_t>(is);
587
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
588
+ if (preamble_longs == 1) {
589
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
590
+ std::vector<uint64_t, A> entries(0, 0, allocator);
591
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
592
+ } else if (preamble_longs == 2) {
593
+ const uint32_t num_entries = read<uint32_t>(is);
594
+ read<uint32_t>(is); // unused
595
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
596
+ if (num_entries == 0) {
597
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
598
+ }
599
+ read(is, entries.data(), entries.size() * sizeof(uint64_t));
600
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
601
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
602
+ } else if (preamble_longs == 3) {
603
+ const uint32_t num_entries = read<uint32_t>(is);
604
+ read<uint32_t>(is); // unused
605
+ const auto theta = read<uint64_t>(is);
606
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
607
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
608
+ if (is_empty) {
436
609
  if (!is.good()) throw std::runtime_error("error reading from std::istream");
437
- return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
610
+ return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
611
+ } else {
612
+ read(is, entries.data(), sizeof(uint64_t) * entries.size());
613
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
614
+ return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
615
+ }
616
+ } else {
617
+ throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
438
618
  }
439
- case 1: {
440
- const auto seed_hash = compute_seed_hash(seed);
441
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
442
- read<uint8_t>(is); // unused
619
+ }
620
+
621
+ template<typename A>
622
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v3(
623
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
624
+ {
625
+ read<uint16_t>(is); // unused
626
+ const auto flags_byte = read<uint8_t>(is);
627
+ const auto seed_hash = read<uint16_t>(is);
628
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
629
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
630
+ uint64_t theta = theta_constants::MAX_THETA;
631
+ uint32_t num_entries = 0;
632
+ if (!is_empty) {
633
+ if (preamble_longs == 1) {
634
+ num_entries = 1;
635
+ } else {
636
+ num_entries = read<uint32_t>(is);
443
637
  read<uint32_t>(is); // unused
444
- const auto num_entries = read<uint32_t>(is);
445
- read<uint32_t>(is); //unused
446
- const auto theta = read<uint64_t>(is);
447
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
448
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
449
- if (!is_empty)
450
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
451
- if (!is.good())
452
- throw std::runtime_error("error reading from std::istream");
453
- return compact_theta_sketch_alloc(is_empty, true, seed_hash, theta, std::move(entries));
638
+ if (preamble_longs > 2) theta = read<uint64_t>(is);
639
+ }
454
640
  }
455
- case 2: {
456
- checker<true>::check_sketch_type(type, SKETCH_TYPE);
457
- read<uint8_t>(is); // unused
458
- read<uint16_t>(is); // unused
459
- const uint16_t seed_hash = read<uint16_t>(is);
460
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
461
- if (preamble_longs == 1) {
462
- if (!is.good())
463
- throw std::runtime_error("error reading from std::istream");
464
- std::vector<uint64_t, A> entries(0, 0, allocator);
465
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
466
- } else if (preamble_longs == 2) {
467
- const uint32_t num_entries = read<uint32_t>(is);
468
- read<uint32_t>(is); // unused
469
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
470
- if (num_entries == 0) {
471
- return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
472
- }
473
- read(is, entries.data(), entries.size() * sizeof(uint64_t));
474
- if (!is.good())
475
- throw std::runtime_error("error reading from std::istream");
476
- return compact_theta_sketch_alloc(false, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
477
- } else if (preamble_longs == 3) {
478
- const uint32_t num_entries = read<uint32_t>(is);
479
- read<uint32_t>(is); // unused
480
- const auto theta = read<uint64_t>(is);
481
- bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
482
- std::vector<uint64_t, A> entries(num_entries, 0, allocator);
483
- if (is_empty) {
484
- if (!is.good())
485
- throw std::runtime_error("error reading from std::istream");
486
- return compact_theta_sketch_alloc(true, true, seed_hash, theta, std::move(entries));
487
- } else {
488
- read(is, entries.data(), sizeof(uint64_t) * entries.size());
489
- if (!is.good())
490
- throw std::runtime_error("error reading from std::istream");
491
- return compact_theta_sketch_alloc(false, true, seed_hash, theta, std::move(entries));
492
- }
493
- } else {
494
- throw std::invalid_argument(std::to_string(preamble_longs) + " longs of premable, but expected 1, 2, or 3");
495
- }
641
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
642
+ if (!is_empty) read(is, entries.data(), sizeof(uint64_t) * entries.size());
643
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
644
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
645
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
646
+ }
647
+
648
+ template<typename A>
649
+ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize_v4(
650
+ uint8_t preamble_longs, std::istream& is, uint64_t seed, const A& allocator)
651
+ {
652
+ const auto entry_bits = read<uint8_t>(is);
653
+ const auto num_entries_bytes = read<uint8_t>(is);
654
+ const auto flags_byte = read<uint8_t>(is);
655
+ const auto seed_hash = read<uint16_t>(is);
656
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
657
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
658
+ uint64_t theta = theta_constants::MAX_THETA;
659
+ if (preamble_longs > 1) theta = read<uint64_t>(is);
660
+ uint32_t num_entries = 0;
661
+ for (unsigned i = 0; i < num_entries_bytes; ++i) {
662
+ num_entries |= read<uint8_t>(is) << (i << 3);
663
+ }
664
+ vector_bytes buffer(entry_bits, 0, allocator); // block of 8 entries takes entry_bits bytes
665
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
666
+
667
+ // unpack blocks of 8 deltas
668
+ unsigned i;
669
+ for (i = 0; i + 7 < num_entries; i += 8) {
670
+ read(is, buffer.data(), buffer.size());
671
+ unpack_bits_block8(&entries[i], buffer.data(), entry_bits);
496
672
  }
497
- default:
498
- // this should always fail since the valid cases are handled above
499
- checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
500
- // this throw is never reached, because check_serial_version will throw an informative exception.
501
- // This is only here to avoid a compiler warning about a path without a return value.
502
- throw std::invalid_argument("unexpected sketch serialization version");
673
+ // unpack extra deltas if fewer than 8 of them left
674
+ if (i < num_entries) read(is, buffer.data(), whole_bytes_to_hold_bits((num_entries - i) * entry_bits));
675
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
676
+ const uint8_t* ptr = buffer.data();
677
+ uint8_t offset = 0;
678
+ for (; i < num_entries; ++i) {
679
+ offset = unpack_bits(entries[i], entry_bits, ptr, offset);
503
680
  }
681
+ // undo deltas
682
+ uint64_t previous = 0;
683
+ for (i = 0; i < num_entries; ++i) {
684
+ entries[i] += previous;
685
+ previous = entries[i];
686
+ }
687
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
688
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
504
689
  }
505
690
 
506
691
  template<typename A>
507
692
  compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
508
693
  auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
509
- return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
694
+ if (data.entry_bits == 64) { // versions 1 to 3
695
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(data.entries_start_ptr);
696
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta,
697
+ std::vector<uint64_t, A>(entries, entries + data.num_entries, allocator));
698
+ } else { // version 4
699
+ std::vector<uint64_t, A> entries(data.num_entries, 0, allocator);
700
+ const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data.entries_start_ptr);
701
+ // unpack blocks of 8 deltas
702
+ unsigned i;
703
+ for (i = 0; i + 7 < data.num_entries; i += 8) {
704
+ unpack_bits_block8(&entries[i], ptr, data.entry_bits);
705
+ ptr += data.entry_bits;
706
+ }
707
+ // unpack extra deltas if fewer than 8 of them left
708
+ uint8_t offset = 0;
709
+ for (; i < data.num_entries; ++i) {
710
+ offset = unpack_bits(entries[i], data.entry_bits, ptr, offset);
711
+ }
712
+ // undo deltas
713
+ uint64_t previous = 0;
714
+ for (i = 0; i < data.num_entries; ++i) {
715
+ entries[i] += previous;
716
+ previous = entries[i];
717
+ }
718
+ return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::move(entries));
719
+ }
510
720
  }
511
721
 
512
722
  // wrapped compact sketch
513
723
 
514
724
  template<typename A>
515
- wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint32_t num_entries,
516
- uint64_t theta, const uint64_t* entries):
517
- is_empty_(is_empty),
518
- is_ordered_(is_ordered),
519
- seed_hash_(seed_hash),
520
- num_entries_(num_entries),
521
- theta_(theta),
522
- entries_(entries)
725
+ wrapped_compact_theta_sketch_alloc<A>::wrapped_compact_theta_sketch_alloc(const data_type& data):
726
+ data_(data)
523
727
  {}
524
728
 
525
729
  template<typename A>
526
730
  const wrapped_compact_theta_sketch_alloc<A> wrapped_compact_theta_sketch_alloc<A>::wrap(const void* bytes, size_t size, uint64_t seed, bool dump_on_error) {
527
- auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error);
528
- return wrapped_compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.num_entries, data.theta, data.entries);
731
+ return wrapped_compact_theta_sketch_alloc(compact_theta_sketch_parser<true>::parse(bytes, size, seed, dump_on_error));
529
732
  }
530
733
 
531
734
  template<typename A>
@@ -535,37 +738,37 @@ A wrapped_compact_theta_sketch_alloc<A>::get_allocator() const {
535
738
 
536
739
  template<typename A>
537
740
  bool wrapped_compact_theta_sketch_alloc<A>::is_empty() const {
538
- return is_empty_;
741
+ return data_.is_empty;
539
742
  }
540
743
 
541
744
  template<typename A>
542
745
  bool wrapped_compact_theta_sketch_alloc<A>::is_ordered() const {
543
- return is_ordered_;
746
+ return data_.is_ordered;
544
747
  }
545
748
 
546
749
  template<typename A>
547
750
  uint64_t wrapped_compact_theta_sketch_alloc<A>::get_theta64() const {
548
- return theta_;
751
+ return data_.theta;
549
752
  }
550
753
 
551
754
  template<typename A>
552
755
  uint32_t wrapped_compact_theta_sketch_alloc<A>::get_num_retained() const {
553
- return static_cast<uint32_t>(num_entries_);
756
+ return data_.num_entries;
554
757
  }
555
758
 
556
759
  template<typename A>
557
760
  uint16_t wrapped_compact_theta_sketch_alloc<A>::get_seed_hash() const {
558
- return seed_hash_;
761
+ return data_.seed_hash;
559
762
  }
560
763
 
561
764
  template<typename A>
562
765
  auto wrapped_compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
563
- return entries_;
766
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, 0);
564
767
  }
565
768
 
566
769
  template<typename A>
567
770
  auto wrapped_compact_theta_sketch_alloc<A>::end() const -> const_iterator {
568
- return entries_ + num_entries_;
771
+ return const_iterator(data_.entries_start_ptr, data_.entry_bits, data_.num_entries, data_.num_entries);
569
772
  }
570
773
 
571
774
  template<typename A>
@@ -574,12 +777,109 @@ void wrapped_compact_theta_sketch_alloc<A>::print_specifics(std::ostringstream&)
574
777
  template<typename A>
575
778
  void wrapped_compact_theta_sketch_alloc<A>::print_items(std::ostringstream& os) const {
576
779
  os << "### Retained entries" << std::endl;
577
- for (const auto& hash: *this) {
780
+ for (const auto hash: *this) {
578
781
  os << hash << std::endl;
579
782
  }
580
783
  os << "### End retained entries" << std::endl;
581
784
  }
582
785
 
786
+ // assumes index == 0 or index == num_entries
787
+ template<typename Allocator>
788
+ wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::const_iterator(
789
+ const void* ptr, uint8_t entry_bits, uint32_t num_entries, uint32_t index):
790
+ ptr_(ptr),
791
+ entry_bits_(entry_bits),
792
+ num_entries_(num_entries),
793
+ index_(index),
794
+ previous_(0),
795
+ is_block_mode_(num_entries_ >= 8),
796
+ buf_i_(0),
797
+ offset_(0)
798
+ {
799
+ if (entry_bits == 64) { // no compression
800
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr) + index;
801
+ } else if (index < num_entries) {
802
+ if (is_block_mode_) {
803
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
804
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
805
+ for (int i = 0; i < 8; ++i) {
806
+ buffer_[i] += previous_;
807
+ previous_ = buffer_[i];
808
+ }
809
+ } else {
810
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
811
+ buffer_[0] += previous_;
812
+ previous_ = buffer_[0];
813
+ }
814
+ }
815
+ }
816
+
817
+ template<typename Allocator>
818
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++() -> const_iterator& {
819
+ if (entry_bits_ == 64) { // no compression
820
+ ptr_ = reinterpret_cast<const uint64_t*>(ptr_) + 1;
821
+ return *this;
822
+ }
823
+ ++index_;
824
+ if (index_ < num_entries_) {
825
+ if (is_block_mode_) {
826
+ ++buf_i_;
827
+ if (buf_i_ == 8) {
828
+ buf_i_ = 0;
829
+ if (index_ + 8 < num_entries_) {
830
+ unpack_bits_block8(buffer_, reinterpret_cast<const uint8_t*>(ptr_), entry_bits_);
831
+ ptr_ = reinterpret_cast<const uint8_t*>(ptr_) + entry_bits_;
832
+ for (int i = 0; i < 8; ++i) {
833
+ buffer_[i] += previous_;
834
+ previous_ = buffer_[i];
835
+ }
836
+ } else {
837
+ is_block_mode_ = false;
838
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
839
+ buffer_[0] += previous_;
840
+ previous_ = buffer_[0];
841
+ }
842
+ }
843
+ } else {
844
+ offset_ = unpack_bits(buffer_[0], entry_bits_, reinterpret_cast<const uint8_t*&>(ptr_), offset_);
845
+ buffer_[0] += previous_;
846
+ previous_ = buffer_[0];
847
+ }
848
+ }
849
+ return *this;
850
+ }
851
+
852
+ template<typename Allocator>
853
+ auto wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator++(int) -> const_iterator {
854
+ const_iterator tmp(*this);
855
+ operator++();
856
+ return tmp;
857
+ }
858
+
859
+ template<typename Allocator>
860
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator!=(const const_iterator& other) const {
861
+ if (entry_bits_ == 64) return ptr_ != other.ptr_;
862
+ return index_ != other.index_;
863
+ }
864
+
865
+ template<typename Allocator>
866
+ bool wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator==(const const_iterator& other) const {
867
+ if (entry_bits_ == 64) return ptr_ == other.ptr_;
868
+ return index_ == other.index_;
869
+ }
870
+
871
+ template<typename Allocator>
872
+ const uint64_t& wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator*() const {
873
+ if (entry_bits_ == 64) return *reinterpret_cast<const uint64_t*>(ptr_);
874
+ return buffer_[buf_i_];
875
+ }
876
+
877
+ template<typename Allocator>
878
+ const uint64_t* wrapped_compact_theta_sketch_alloc<Allocator>::const_iterator::operator->() const {
879
+ if (entry_bits_ == 64) return reinterpret_cast<const uint64_t*>(ptr_);
880
+ return buffer_ + buf_i_;
881
+ }
882
+
583
883
  } /* namespace datasketches */
584
884
 
585
885
  #endif