@nxtedition/rocksdb 8.0.1 → 8.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  2. package/deps/rocksdb/rocksdb/Makefile +2 -2
  3. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
  8. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +20 -146
  9. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
  10. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
  11. package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
  12. package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
  33. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
  35. package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
  36. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
  37. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
  38. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
  47. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
  48. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
  49. package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
  50. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
  51. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
  52. package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
  53. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
  54. package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
  56. package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
  57. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  58. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
  59. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
  60. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
  62. package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
  63. package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
  65. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
  68. package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
  69. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
  74. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
  75. package/deps/rocksdb/rocksdb/db/version_set.cc +212 -35
  76. package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
  77. package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
  78. package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
  82. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
  83. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
  84. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
  85. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
  86. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
  87. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
  88. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
  89. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
  90. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
  91. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  94. package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
  95. package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
  96. package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
  97. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  98. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  99. package/deps/rocksdb/rocksdb/src.mk +2 -1
  100. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
  103. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
  105. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
  106. package/deps/rocksdb/rocksdb/table/format.cc +24 -20
  107. package/deps/rocksdb/rocksdb/table/format.h +5 -2
  108. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
  109. package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
  110. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
  111. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  112. package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
  113. package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
  114. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
  115. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
  116. package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
  117. package/deps/rocksdb/rocksdb/util/status.cc +7 -0
  118. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
  119. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
  120. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
  121. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
  122. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
  123. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
  124. package/deps/rocksdb/rocksdb.gyp +2 -1
  125. package/package.json +1 -1
  126. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  127. package/prebuilds/linux-x64/node.napi.node +0 -0
  128. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
  129. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
@@ -39,6 +39,8 @@
39
39
  #include "db/table_cache.h"
40
40
  #include "db/version_builder.h"
41
41
  #include "db/version_edit_handler.h"
42
+ #include "table/compaction_merging_iterator.h"
43
+
42
44
  #if USE_COROUTINES
43
45
  #include "folly/experimental/coro/BlockingWait.h"
44
46
  #include "folly/experimental/coro/Collect.h"
@@ -1771,8 +1773,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
1771
1773
  file->stats.num_reads_sampled.load(std::memory_order_relaxed),
1772
1774
  file->being_compacted, file->temperature,
1773
1775
  file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
1774
- file->TryGetFileCreationTime(), file->file_checksum,
1775
- file->file_checksum_func_name);
1776
+ file->TryGetFileCreationTime(), file->epoch_number,
1777
+ file->file_checksum, file->file_checksum_func_name);
1776
1778
  files.back().num_entries = file->num_entries;
1777
1779
  files.back().num_deletions = file->num_deletions;
1778
1780
  level_size += file->fd.GetFileSize();
@@ -2036,7 +2038,8 @@ VersionStorageInfo::VersionStorageInfo(
2036
2038
  const InternalKeyComparator* internal_comparator,
2037
2039
  const Comparator* user_comparator, int levels,
2038
2040
  CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
2039
- bool _force_consistency_checks)
2041
+ bool _force_consistency_checks,
2042
+ EpochNumberRequirement epoch_number_requirement)
2040
2043
  : internal_comparator_(internal_comparator),
2041
2044
  user_comparator_(user_comparator),
2042
2045
  // cfd is nullptr if Version is dummy
@@ -2064,7 +2067,8 @@ VersionStorageInfo::VersionStorageInfo(
2064
2067
  current_num_samples_(0),
2065
2068
  estimated_compaction_needed_bytes_(0),
2066
2069
  finalized_(false),
2067
- force_consistency_checks_(_force_consistency_checks) {
2070
+ force_consistency_checks_(_force_consistency_checks),
2071
+ epoch_number_requirement_(epoch_number_requirement) {
2068
2072
  if (ref_vstorage != nullptr) {
2069
2073
  accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
2070
2074
  accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
@@ -2085,7 +2089,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
2085
2089
  const FileOptions& file_opt,
2086
2090
  const MutableCFOptions mutable_cf_options,
2087
2091
  const std::shared_ptr<IOTracer>& io_tracer,
2088
- uint64_t version_number)
2092
+ uint64_t version_number,
2093
+ EpochNumberRequirement epoch_number_requirement)
2089
2094
  : env_(vset->env_),
2090
2095
  clock_(vset->clock_),
2091
2096
  cfd_(column_family_data),
@@ -2104,7 +2109,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
2104
2109
  (cfd_ == nullptr || cfd_->current() == nullptr)
2105
2110
  ? nullptr
2106
2111
  : cfd_->current()->storage_info(),
2107
- cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
2112
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks,
2113
+ epoch_number_requirement),
2108
2114
  vset_(vset),
2109
2115
  next_(this),
2110
2116
  prev_(this),
@@ -2539,16 +2545,19 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2539
2545
  }
2540
2546
  f = fp.GetNextFileInLevel();
2541
2547
  }
2542
- if (s.ok() && mget_tasks.size() > 0) {
2548
+ if (mget_tasks.size() > 0) {
2543
2549
  RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
2544
2550
  mget_tasks.size());
2545
2551
  // Collect all results so far
2546
2552
  std::vector<Status> statuses = folly::coro::blockingWait(
2547
2553
  folly::coro::collectAllRange(std::move(mget_tasks))
2548
2554
  .scheduleOn(&range->context()->executor()));
2549
- for (Status stat : statuses) {
2550
- if (!stat.ok()) {
2551
- s = stat;
2555
+ if (s.ok()) {
2556
+ for (Status stat : statuses) {
2557
+ if (!stat.ok()) {
2558
+ s = std::move(stat);
2559
+ break;
2560
+ }
2552
2561
  }
2553
2562
  }
2554
2563
 
@@ -2794,6 +2803,9 @@ Status Version::MultiGetAsync(
2794
2803
  unsigned int num_tasks_queued = 0;
2795
2804
  to_process.pop_front();
2796
2805
  if (batch->IsSearchEnded() || batch->GetRange().empty()) {
2806
+ // If to_process is empty, i.e. no more batches to look at, then we need to
2807
+ // schedule the enqueued coroutines and wait for them. Otherwise, we
2808
+ // skip this batch and move to the next one in to_process.
2797
2809
  if (!to_process.empty()) {
2798
2810
  continue;
2799
2811
  }
@@ -2802,9 +2814,6 @@ Status Version::MultiGetAsync(
2802
2814
  // to_process
2803
2815
  s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
2804
2816
  to_process, num_tasks_queued, mget_stats);
2805
- if (!s.ok()) {
2806
- break;
2807
- }
2808
2817
  // If ProcessBatch didn't enqueue any coroutine tasks, it means all
2809
2818
  // keys were filtered out. So put the batch back in to_process to
2810
2819
  // lookup in the next level
@@ -2815,8 +2824,10 @@ Status Version::MultiGetAsync(
2815
2824
  waiting.emplace_back(idx);
2816
2825
  }
2817
2826
  }
2818
- if (to_process.empty()) {
2819
- if (s.ok() && mget_tasks.size() > 0) {
2827
+ // If ProcessBatch() returned an error, then schedule the enqueued
2828
+ // coroutines and wait for them, then abort the MultiGet.
2829
+ if (to_process.empty() || !s.ok()) {
2830
+ if (mget_tasks.size() > 0) {
2820
2831
  assert(waiting.size());
2821
2832
  RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
2822
2833
  // Collect all results so far
@@ -2824,10 +2835,12 @@ Status Version::MultiGetAsync(
2824
2835
  folly::coro::collectAllRange(std::move(mget_tasks))
2825
2836
  .scheduleOn(&range->context()->executor()));
2826
2837
  mget_tasks.clear();
2827
- for (Status stat : statuses) {
2828
- if (!stat.ok()) {
2829
- s = stat;
2830
- break;
2838
+ if (s.ok()) {
2839
+ for (Status stat : statuses) {
2840
+ if (!stat.ok()) {
2841
+ s = std::move(stat);
2842
+ break;
2843
+ }
2831
2844
  }
2832
2845
  }
2833
2846
 
@@ -2850,6 +2863,9 @@ Status Version::MultiGetAsync(
2850
2863
  assert(!s.ok() || waiting.size() == 0);
2851
2864
  }
2852
2865
  }
2866
+ if (!s.ok()) {
2867
+ break;
2868
+ }
2853
2869
  }
2854
2870
 
2855
2871
  uint64_t num_levels = 0;
@@ -4270,6 +4286,74 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
4270
4286
  return scratch->buffer;
4271
4287
  }
4272
4288
 
4289
+ bool VersionStorageInfo::HasMissingEpochNumber() const {
4290
+ for (int level = 0; level < num_levels_; ++level) {
4291
+ for (const FileMetaData* f : files_[level]) {
4292
+ if (f->epoch_number == kUnknownEpochNumber) {
4293
+ return true;
4294
+ }
4295
+ }
4296
+ }
4297
+ return false;
4298
+ }
4299
+
4300
+ uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const {
4301
+ uint64_t max_epoch_number = kUnknownEpochNumber;
4302
+ for (int level = 0; level < num_levels_; ++level) {
4303
+ for (const FileMetaData* f : files_[level]) {
4304
+ max_epoch_number = std::max(max_epoch_number, f->epoch_number);
4305
+ }
4306
+ }
4307
+ return max_epoch_number;
4308
+ }
4309
+
4310
+ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) {
4311
+ cfd->ResetNextEpochNumber();
4312
+
4313
+ bool reserve_epoch_num_for_file_ingested_behind =
4314
+ cfd->ioptions()->allow_ingest_behind;
4315
+ if (reserve_epoch_num_for_file_ingested_behind) {
4316
+ uint64_t reserved_epoch_number = cfd->NewEpochNumber();
4317
+ assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind);
4318
+ ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(),
4319
+ "[%s]CF has reserved epoch number %" PRIu64
4320
+ " for files ingested "
4321
+ "behind since `Options::allow_ingest_behind` is true",
4322
+ cfd->GetName().c_str(), reserved_epoch_number);
4323
+ }
4324
+
4325
+ if (HasMissingEpochNumber()) {
4326
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing);
4327
+ assert(num_levels_ >= 1);
4328
+
4329
+ for (int level = num_levels_ - 1; level >= 1; --level) {
4330
+ auto& files_at_level = files_[level];
4331
+ if (files_at_level.empty()) {
4332
+ continue;
4333
+ }
4334
+ uint64_t next_epoch_number = cfd->NewEpochNumber();
4335
+ for (FileMetaData* f : files_at_level) {
4336
+ f->epoch_number = next_epoch_number;
4337
+ }
4338
+ }
4339
+
4340
+ for (auto file_meta_iter = files_[0].rbegin();
4341
+ file_meta_iter != files_[0].rend(); file_meta_iter++) {
4342
+ FileMetaData* f = *file_meta_iter;
4343
+ f->epoch_number = cfd->NewEpochNumber();
4344
+ }
4345
+
4346
+ ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(),
4347
+ "[%s]CF's epoch numbers are inferred based on seqno",
4348
+ cfd->GetName().c_str());
4349
+ epoch_number_requirement_ = EpochNumberRequirement::kMustPresent;
4350
+ } else {
4351
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent);
4352
+ cfd->SetNextEpochNumber(
4353
+ std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber()));
4354
+ }
4355
+ }
4356
+
4273
4357
  uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
4274
4358
  uint64_t result = 0;
4275
4359
  std::vector<FileMetaData*> overlaps;
@@ -4967,10 +5051,15 @@ Status VersionSet::ProcessManifestWrites(
4967
5051
  if (!descriptor_log_ ||
4968
5052
  manifest_file_size_ > db_options_->max_manifest_file_size) {
4969
5053
  TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
5054
+ TEST_SYNC_POINT_CALLBACK(
5055
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest", nullptr);
4970
5056
  new_descriptor_log = true;
4971
5057
  } else {
4972
5058
  pending_manifest_file_number_ = manifest_file_number_;
4973
5059
  }
5060
+ TEST_SYNC_POINT_CALLBACK(
5061
+ "VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot",
5062
+ &new_descriptor_log);
4974
5063
 
4975
5064
  // Local cached copy of state variable(s). WriteCurrentStateToManifest()
4976
5065
  // reads its content after releasing db mutex to avoid race with
@@ -5099,6 +5188,7 @@ Status VersionSet::ProcessManifestWrites(
5099
5188
  break;
5100
5189
  }
5101
5190
  }
5191
+
5102
5192
  if (s.ok()) {
5103
5193
  io_s = SyncManifest(db_options_, descriptor_log_->file());
5104
5194
  manifest_io_status = io_s;
@@ -5506,7 +5596,8 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
5506
5596
  Status VersionSet::Recover(
5507
5597
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
5508
5598
  std::string* db_id, bool no_error_if_files_missing) {
5509
- // Read "CURRENT" file, which contains a pointer to the current manifest file
5599
+ // Read "CURRENT" file, which contains a pointer to the current manifest
5600
+ // file
5510
5601
  std::string manifest_path;
5511
5602
  Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
5512
5603
  &manifest_file_number_);
@@ -5540,7 +5631,8 @@ Status VersionSet::Recover(
5540
5631
  true /* checksum */, 0 /* log_number */);
5541
5632
  VersionEditHandler handler(
5542
5633
  read_only, column_families, const_cast<VersionSet*>(this),
5543
- /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
5634
+ /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
5635
+ EpochNumberRequirement::kMightMissing);
5544
5636
  handler.Iterate(reader, &log_read_status);
5545
5637
  s = handler.status();
5546
5638
  if (s.ok()) {
@@ -5549,6 +5641,9 @@ Status VersionSet::Recover(
5549
5641
  assert(current_manifest_file_size != 0);
5550
5642
  handler.GetDbId(db_id);
5551
5643
  }
5644
+ if (s.ok()) {
5645
+ RecoverEpochNumbers();
5646
+ }
5552
5647
  }
5553
5648
 
5554
5649
  if (s.ok()) {
@@ -5708,7 +5803,8 @@ Status VersionSet::TryRecoverFromOneManifest(
5708
5803
  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
5709
5804
  /*checksum=*/true, /*log_num=*/0);
5710
5805
  VersionEditHandlerPointInTime handler_pit(
5711
- read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
5806
+ read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
5807
+ EpochNumberRequirement::kMightMissing);
5712
5808
 
5713
5809
  handler_pit.Iterate(reader, &s);
5714
5810
 
@@ -5717,7 +5813,21 @@ Status VersionSet::TryRecoverFromOneManifest(
5717
5813
  assert(nullptr != has_missing_table_file);
5718
5814
  *has_missing_table_file = handler_pit.HasMissingFiles();
5719
5815
 
5720
- return handler_pit.status();
5816
+ s = handler_pit.status();
5817
+ if (s.ok()) {
5818
+ RecoverEpochNumbers();
5819
+ }
5820
+ return s;
5821
+ }
5822
+
5823
+ void VersionSet::RecoverEpochNumbers() {
5824
+ for (auto cfd : *column_family_set_) {
5825
+ if (cfd->IsDropped()) {
5826
+ continue;
5827
+ }
5828
+ assert(cfd->initialized());
5829
+ cfd->RecoverEpochNumbers();
5830
+ }
5721
5831
  }
5722
5832
 
5723
5833
  Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
@@ -6037,6 +6147,22 @@ Status VersionSet::WriteCurrentStateToManifest(
6037
6147
  }
6038
6148
  }
6039
6149
 
6150
+ // New manifest should rollover the WAL deletion record from previous
6151
+ // manifest. Otherwise, when an addition record of a deleted WAL gets added to
6152
+ // this new manifest later (which can happen in, e.g., SyncWAL()), this new
6153
+ // manifest creates an illusion that such WAL hasn't been deleted.
6154
+ VersionEdit wal_deletions;
6155
+ wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
6156
+ std::string wal_deletions_record;
6157
+ if (!wal_deletions.EncodeTo(&wal_deletions_record)) {
6158
+ return Status::Corruption("Unable to Encode VersionEdit: " +
6159
+ wal_deletions.DebugString(true));
6160
+ }
6161
+ io_s = log->AddRecord(wal_deletions_record);
6162
+ if (!io_s.ok()) {
6163
+ return io_s;
6164
+ }
6165
+
6040
6166
  for (auto cfd : *column_family_set_) {
6041
6167
  assert(cfd);
6042
6168
 
@@ -6088,7 +6214,7 @@ Status VersionSet::WriteCurrentStateToManifest(
6088
6214
  f->fd.smallest_seqno, f->fd.largest_seqno,
6089
6215
  f->marked_for_compaction, f->temperature,
6090
6216
  f->oldest_blob_file_number, f->oldest_ancester_time,
6091
- f->file_creation_time, f->file_checksum,
6217
+ f->file_creation_time, f->epoch_number, f->file_checksum,
6092
6218
  f->file_checksum_func_name, f->unique_id);
6093
6219
  }
6094
6220
  }
@@ -6460,6 +6586,14 @@ InternalIterator* VersionSet::MakeInputIterator(
6460
6586
  c->num_input_levels() - 1
6461
6587
  : c->num_input_levels());
6462
6588
  InternalIterator** list = new InternalIterator*[space];
6589
+ // First item in the pair is a pointer to range tombstones.
6590
+ // Second item is a pointer to a member of a LevelIterator,
6591
+ // that will be initialized to where CompactionMergingIterator stores
6592
+ // pointer to its range tombstones. This is used by LevelIterator
6593
+ // to update pointer to range tombstones as it traverses different SST files.
6594
+ std::vector<
6595
+ std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
6596
+ range_tombstones;
6463
6597
  size_t num = 0;
6464
6598
  for (size_t which = 0; which < c->num_input_levels(); which++) {
6465
6599
  if (c->input_levels(which)->num_files != 0) {
@@ -6480,7 +6614,7 @@ InternalIterator* VersionSet::MakeInputIterator(
6480
6614
  end.value(), fmd.smallest.user_key()) < 0) {
6481
6615
  continue;
6482
6616
  }
6483
-
6617
+ TruncatedRangeDelIterator* range_tombstone_iter = nullptr;
6484
6618
  list[num++] = cfd->table_cache()->NewIterator(
6485
6619
  read_options, file_options_compactions,
6486
6620
  cfd->internal_comparator(), fmd, range_del_agg,
@@ -6493,10 +6627,13 @@ InternalIterator* VersionSet::MakeInputIterator(
6493
6627
  MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
6494
6628
  /*smallest_compaction_key=*/nullptr,
6495
6629
  /*largest_compaction_key=*/nullptr,
6496
- /*allow_unprepared_value=*/false);
6630
+ /*allow_unprepared_value=*/false,
6631
+ /*range_del_iter=*/&range_tombstone_iter);
6632
+ range_tombstones.emplace_back(range_tombstone_iter, nullptr);
6497
6633
  }
6498
6634
  } else {
6499
6635
  // Create concatenating iterator for the files from this level
6636
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
6500
6637
  list[num++] = new LevelIterator(
6501
6638
  cfd->table_cache(), read_options, file_options_compactions,
6502
6639
  cfd->internal_comparator(), c->input_levels(which),
@@ -6505,14 +6642,15 @@ InternalIterator* VersionSet::MakeInputIterator(
6505
6642
  /*no per level latency histogram=*/nullptr,
6506
6643
  TableReaderCaller::kCompaction, /*skip_filters=*/false,
6507
6644
  /*level=*/static_cast<int>(c->level(which)), range_del_agg,
6508
- c->boundaries(which));
6645
+ c->boundaries(which), false, &tombstone_iter_ptr);
6646
+ range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
6509
6647
  }
6510
6648
  }
6511
6649
  }
6512
6650
  assert(num <= space);
6513
- InternalIterator* result =
6514
- NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
6515
- static_cast<int>(num));
6651
+ InternalIterator* result = NewCompactionMergingIterator(
6652
+ &c->column_family_data()->internal_comparator(), list,
6653
+ static_cast<int>(num), range_tombstones);
6516
6654
  delete[] list;
6517
6655
  return result;
6518
6656
  }
@@ -6579,6 +6717,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
6579
6717
  filemetadata.temperature = file->temperature;
6580
6718
  filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
6581
6719
  filemetadata.file_creation_time = file->TryGetFileCreationTime();
6720
+ filemetadata.epoch_number = file->epoch_number;
6582
6721
  metadata->push_back(filemetadata);
6583
6722
  }
6584
6723
  }
@@ -6704,8 +6843,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
6704
6843
  return all_versions_blob_file_size;
6705
6844
  }
6706
6845
 
6707
- Status VersionSet::VerifyFileMetadata(const std::string& fpath,
6708
- const FileMetaData& meta) const {
6846
+ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
6847
+ const std::string& fpath, int level,
6848
+ const FileMetaData& meta) {
6709
6849
  uint64_t fsize = 0;
6710
6850
  Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
6711
6851
  if (status.ok()) {
@@ -6713,6 +6853,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath,
6713
6853
  status = Status::Corruption("File size mismatch: " + fpath);
6714
6854
  }
6715
6855
  }
6856
+ if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
6857
+ assert(cfd);
6858
+ TableCache* table_cache = cfd->table_cache();
6859
+ assert(table_cache);
6860
+
6861
+ const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
6862
+ assert(cf_opts);
6863
+ std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
6864
+ size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
6865
+
6866
+ const FileOptions& file_opts = file_options();
6867
+
6868
+ Version* version = cfd->current();
6869
+ assert(version);
6870
+ VersionStorageInfo& storage_info = version->storage_info_;
6871
+ const InternalKeyComparator* icmp = storage_info.InternalComparator();
6872
+ assert(icmp);
6873
+
6874
+ InternalStats* internal_stats = cfd->internal_stats();
6875
+
6876
+ FileMetaData meta_copy = meta;
6877
+ status = table_cache->FindTable(
6878
+ ReadOptions(), file_opts, *icmp, meta_copy,
6879
+ &(meta_copy.table_reader_handle), pe,
6880
+ /*no_io=*/false, /*record_read_stats=*/true,
6881
+ internal_stats->GetFileReadHist(level), false, level,
6882
+ /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
6883
+ meta_copy.temperature);
6884
+ if (meta_copy.table_reader_handle) {
6885
+ table_cache->ReleaseHandle(meta_copy.table_reader_handle);
6886
+ }
6887
+ }
6716
6888
  return status;
6717
6889
  }
6718
6890
 
@@ -6748,12 +6920,17 @@ Status ReactiveVersionSet::Recover(
6748
6920
  log::Reader* reader = manifest_reader->get();
6749
6921
  assert(reader);
6750
6922
 
6751
- manifest_tailer_.reset(new ManifestTailer(
6752
- column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
6923
+ manifest_tailer_.reset(
6924
+ new ManifestTailer(column_families, const_cast<ReactiveVersionSet*>(this),
6925
+ io_tracer_, EpochNumberRequirement::kMightMissing));
6753
6926
 
6754
6927
  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
6755
6928
 
6756
- return manifest_tailer_->status();
6929
+ s = manifest_tailer_->status();
6930
+ if (s.ok()) {
6931
+ RecoverEpochNumbers();
6932
+ }
6933
+ return s;
6757
6934
  }
6758
6935
 
6759
6936
  Status ReactiveVersionSet::ReadAndApply(
@@ -116,6 +116,10 @@ extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
116
116
  extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
117
117
  const std::vector<FileMetaData*>& files,
118
118
  Arena* arena);
119
+ enum EpochNumberRequirement {
120
+ kMightMissing,
121
+ kMustPresent,
122
+ };
119
123
 
120
124
  // Information of the storage associated with each Version, including number of
121
125
  // levels of LSM tree, files information at each level, files marked for
@@ -126,7 +130,9 @@ class VersionStorageInfo {
126
130
  const Comparator* user_comparator, int num_levels,
127
131
  CompactionStyle compaction_style,
128
132
  VersionStorageInfo* src_vstorage,
129
- bool _force_consistency_checks);
133
+ bool _force_consistency_checks,
134
+ EpochNumberRequirement epoch_number_requirement =
135
+ EpochNumberRequirement::kMustPresent);
130
136
  // No copying allowed
131
137
  VersionStorageInfo(const VersionStorageInfo&) = delete;
132
138
  void operator=(const VersionStorageInfo&) = delete;
@@ -319,6 +325,17 @@ class VersionStorageInfo {
319
325
  return files_[level];
320
326
  }
321
327
 
328
+ bool HasMissingEpochNumber() const;
329
+ uint64_t GetMaxEpochNumberOfFiles() const;
330
+ EpochNumberRequirement GetEpochNumberRequirement() const {
331
+ return epoch_number_requirement_;
332
+ }
333
+ void SetEpochNumberRequirement(
334
+ EpochNumberRequirement epoch_number_requirement) {
335
+ epoch_number_requirement_ = epoch_number_requirement;
336
+ }
337
+ void RecoverEpochNumbers(ColumnFamilyData* cfd);
338
+
322
339
  class FileLocation {
323
340
  public:
324
341
  FileLocation() = default;
@@ -440,6 +457,11 @@ class VersionStorageInfo {
440
457
  return files_marked_for_compaction_;
441
458
  }
442
459
 
460
+ void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
461
+ f->marked_for_compaction = true;
462
+ files_marked_for_compaction_.emplace_back(level, f);
463
+ }
464
+
443
465
  // REQUIRES: ComputeCompactionScore has been called
444
466
  // REQUIRES: DB mutex held during access
445
467
  const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
@@ -723,6 +745,8 @@ class VersionStorageInfo {
723
745
  // is compiled in release mode
724
746
  bool force_consistency_checks_;
725
747
 
748
+ EpochNumberRequirement epoch_number_requirement_;
749
+
726
750
  friend class Version;
727
751
  friend class VersionSet;
728
752
  };
@@ -1047,7 +1071,9 @@ class Version {
1047
1071
  Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
1048
1072
  MutableCFOptions mutable_cf_options,
1049
1073
  const std::shared_ptr<IOTracer>& io_tracer,
1050
- uint64_t version_number = 0);
1074
+ uint64_t version_number = 0,
1075
+ EpochNumberRequirement epoch_number_requirement =
1076
+ EpochNumberRequirement::kMustPresent);
1051
1077
 
1052
1078
  ~Version();
1053
1079
 
@@ -1188,6 +1214,10 @@ class VersionSet {
1188
1214
  const std::vector<ColumnFamilyDescriptor>& column_families,
1189
1215
  bool read_only, std::string* db_id, bool* has_missing_table_file);
1190
1216
 
1217
+ // Recover the next epoch number of each CF and the epoch number
1218
+ // of their files (if missing)
1219
+ void RecoverEpochNumbers();
1220
+
1191
1221
  // Reads a manifest file and returns a list of column families in
1192
1222
  // column_families.
1193
1223
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
@@ -1501,8 +1531,8 @@ class VersionSet {
1501
1531
  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
1502
1532
  const VersionEdit* edit);
1503
1533
 
1504
- Status VerifyFileMetadata(const std::string& fpath,
1505
- const FileMetaData& meta) const;
1534
+ Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
1535
+ int level, const FileMetaData& meta);
1506
1536
 
1507
1537
  // Protected by DB mutex.
1508
1538
  WalSet wals_;