@nxtedition/rocksdb 8.0.0 → 8.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/BUILDING.md +2 -2
  2. package/binding.cc +2 -7
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -9
  4. package/deps/rocksdb/rocksdb/Makefile +2 -2
  5. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
  10. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +57 -146
  11. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
  12. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
  13. package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
  14. package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
  15. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
  35. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  36. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
  37. package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
  38. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
  39. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
  49. package/deps/rocksdb/rocksdb/db/db_iter.cc +57 -36
  50. package/deps/rocksdb/rocksdb/db/db_iter.h +2 -1
  51. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
  52. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
  53. package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
  54. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
  55. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
  56. package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
  57. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
  58. package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
  60. package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
  61. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  62. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
  63. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
  64. package/deps/rocksdb/rocksdb/db/memtable.cc +55 -9
  65. package/deps/rocksdb/rocksdb/db/merge_helper.cc +76 -102
  66. package/deps/rocksdb/rocksdb/db/merge_helper.h +2 -11
  67. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
  68. package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
  69. package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
  70. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
  71. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  72. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
  73. package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
  74. package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
  75. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
  77. package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
  78. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
  79. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
  80. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
  81. package/deps/rocksdb/rocksdb/db/version_set.cc +219 -38
  82. package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
  83. package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
  84. package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +122 -61
  85. package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
  89. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
  90. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +1 -1
  91. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
  92. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
  93. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
  94. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
  95. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
  96. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
  97. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
  98. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
  99. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  100. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
  101. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +3 -0
  102. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  103. package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
  104. package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
  105. package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
  106. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  107. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  108. package/deps/rocksdb/rocksdb/src.mk +2 -1
  109. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
  110. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +2 -10
  111. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +12 -29
  112. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
  113. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +0 -39
  114. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +0 -1
  115. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
  116. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
  117. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
  118. package/deps/rocksdb/rocksdb/table/format.cc +24 -20
  119. package/deps/rocksdb/rocksdb/table/format.h +5 -2
  120. package/deps/rocksdb/rocksdb/table/get_context.cc +52 -11
  121. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
  122. package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
  123. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
  124. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  125. package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
  126. package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
  127. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
  128. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
  129. package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
  130. package/deps/rocksdb/rocksdb/util/crc32c.cc +1 -1
  131. package/deps/rocksdb/rocksdb/util/status.cc +7 -0
  132. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
  133. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
  134. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
  135. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
  136. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
  137. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
  138. package/deps/rocksdb/rocksdb.gyp +2 -1
  139. package/package.json +1 -1
  140. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  141. package/prebuilds/linux-x64/node.napi.node +0 -0
  142. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
  143. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
  144. package/max_rev_operator.h +0 -100
@@ -39,6 +39,8 @@
39
39
  #include "db/table_cache.h"
40
40
  #include "db/version_builder.h"
41
41
  #include "db/version_edit_handler.h"
42
+ #include "table/compaction_merging_iterator.h"
43
+
42
44
  #if USE_COROUTINES
43
45
  #include "folly/experimental/coro/BlockingWait.h"
44
46
  #include "folly/experimental/coro/Collect.h"
@@ -1771,8 +1773,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
1771
1773
  file->stats.num_reads_sampled.load(std::memory_order_relaxed),
1772
1774
  file->being_compacted, file->temperature,
1773
1775
  file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
1774
- file->TryGetFileCreationTime(), file->file_checksum,
1775
- file->file_checksum_func_name);
1776
+ file->TryGetFileCreationTime(), file->epoch_number,
1777
+ file->file_checksum, file->file_checksum_func_name);
1776
1778
  files.back().num_entries = file->num_entries;
1777
1779
  files.back().num_deletions = file->num_deletions;
1778
1780
  level_size += file->fd.GetFileSize();
@@ -2036,7 +2038,8 @@ VersionStorageInfo::VersionStorageInfo(
2036
2038
  const InternalKeyComparator* internal_comparator,
2037
2039
  const Comparator* user_comparator, int levels,
2038
2040
  CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
2039
- bool _force_consistency_checks)
2041
+ bool _force_consistency_checks,
2042
+ EpochNumberRequirement epoch_number_requirement)
2040
2043
  : internal_comparator_(internal_comparator),
2041
2044
  user_comparator_(user_comparator),
2042
2045
  // cfd is nullptr if Version is dummy
@@ -2064,7 +2067,8 @@ VersionStorageInfo::VersionStorageInfo(
2064
2067
  current_num_samples_(0),
2065
2068
  estimated_compaction_needed_bytes_(0),
2066
2069
  finalized_(false),
2067
- force_consistency_checks_(_force_consistency_checks) {
2070
+ force_consistency_checks_(_force_consistency_checks),
2071
+ epoch_number_requirement_(epoch_number_requirement) {
2068
2072
  if (ref_vstorage != nullptr) {
2069
2073
  accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
2070
2074
  accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
@@ -2085,7 +2089,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
2085
2089
  const FileOptions& file_opt,
2086
2090
  const MutableCFOptions mutable_cf_options,
2087
2091
  const std::shared_ptr<IOTracer>& io_tracer,
2088
- uint64_t version_number)
2092
+ uint64_t version_number,
2093
+ EpochNumberRequirement epoch_number_requirement)
2089
2094
  : env_(vset->env_),
2090
2095
  clock_(vset->clock_),
2091
2096
  cfd_(column_family_data),
@@ -2104,7 +2109,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
2104
2109
  (cfd_ == nullptr || cfd_->current() == nullptr)
2105
2110
  ? nullptr
2106
2111
  : cfd_->current()->storage_info(),
2107
- cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
2112
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks,
2113
+ epoch_number_requirement),
2108
2114
  vset_(vset),
2109
2115
  next_(this),
2110
2116
  prev_(this),
@@ -2384,15 +2390,19 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k,
2384
2390
  }
2385
2391
  // merge_operands are in saver and we hit the beginning of the key history
2386
2392
  // do a final merge of nullptr and operands;
2387
- std::string* str_value = value != nullptr ? value->GetSelf() : nullptr;
2388
- if (str_value || columns) {
2393
+ if (value || columns) {
2394
+ std::string result;
2389
2395
  *status = MergeHelper::TimedFullMerge(
2390
2396
  merge_operator_, user_key, nullptr, merge_context->GetOperands(),
2391
- str_value, columns, info_log_, db_statistics_, clock_,
2397
+ &result, info_log_, db_statistics_, clock_,
2392
2398
  /* result_operand */ nullptr, /* update_num_ops_stats */ true);
2393
2399
  if (status->ok()) {
2394
2400
  if (LIKELY(value != nullptr)) {
2401
+ *(value->GetSelf()) = std::move(result);
2395
2402
  value->PinSelf();
2403
+ } else {
2404
+ assert(columns != nullptr);
2405
+ columns->SetPlainValue(result);
2396
2406
  }
2397
2407
  }
2398
2408
  }
@@ -2535,16 +2545,19 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
2535
2545
  }
2536
2546
  f = fp.GetNextFileInLevel();
2537
2547
  }
2538
- if (s.ok() && mget_tasks.size() > 0) {
2548
+ if (mget_tasks.size() > 0) {
2539
2549
  RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
2540
2550
  mget_tasks.size());
2541
2551
  // Collect all results so far
2542
2552
  std::vector<Status> statuses = folly::coro::blockingWait(
2543
2553
  folly::coro::collectAllRange(std::move(mget_tasks))
2544
2554
  .scheduleOn(&range->context()->executor()));
2545
- for (Status stat : statuses) {
2546
- if (!stat.ok()) {
2547
- s = stat;
2555
+ if (s.ok()) {
2556
+ for (Status stat : statuses) {
2557
+ if (!stat.ok()) {
2558
+ s = std::move(stat);
2559
+ break;
2560
+ }
2548
2561
  }
2549
2562
  }
2550
2563
 
@@ -2790,6 +2803,9 @@ Status Version::MultiGetAsync(
2790
2803
  unsigned int num_tasks_queued = 0;
2791
2804
  to_process.pop_front();
2792
2805
  if (batch->IsSearchEnded() || batch->GetRange().empty()) {
2806
+ // If to_process is empty, i.e. no more batches to look at, then we need
2807
+ // to schedule the enqueued coroutines and wait for them. Otherwise, we
2808
+ // skip this batch and move to the next one in to_process.
2793
2809
  if (!to_process.empty()) {
2794
2810
  continue;
2795
2811
  }
@@ -2798,9 +2814,6 @@ Status Version::MultiGetAsync(
2798
2814
  // to_process
2799
2815
  s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
2800
2816
  to_process, num_tasks_queued, mget_stats);
2801
- if (!s.ok()) {
2802
- break;
2803
- }
2804
2817
  // If ProcessBatch didn't enqueue any coroutine tasks, it means all
2805
2818
  // keys were filtered out. So put the batch back in to_process to
2806
2819
  // lookup in the next level
@@ -2811,8 +2824,10 @@ Status Version::MultiGetAsync(
2811
2824
  waiting.emplace_back(idx);
2812
2825
  }
2813
2826
  }
2814
- if (to_process.empty()) {
2815
- if (s.ok() && mget_tasks.size() > 0) {
2827
+ // If ProcessBatch() returned an error, then schedule the enqueued
2828
+ // coroutines and wait for them, then abort the MultiGet.
2829
+ if (to_process.empty() || !s.ok()) {
2830
+ if (mget_tasks.size() > 0) {
2816
2831
  assert(waiting.size());
2817
2832
  RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
2818
2833
  // Collect all results so far
@@ -2820,10 +2835,12 @@ Status Version::MultiGetAsync(
2820
2835
  folly::coro::collectAllRange(std::move(mget_tasks))
2821
2836
  .scheduleOn(&range->context()->executor()));
2822
2837
  mget_tasks.clear();
2823
- for (Status stat : statuses) {
2824
- if (!stat.ok()) {
2825
- s = stat;
2826
- break;
2838
+ if (s.ok()) {
2839
+ for (Status stat : statuses) {
2840
+ if (!stat.ok()) {
2841
+ s = std::move(stat);
2842
+ break;
2843
+ }
2827
2844
  }
2828
2845
  }
2829
2846
 
@@ -2846,6 +2863,9 @@ Status Version::MultiGetAsync(
2846
2863
  assert(!s.ok() || waiting.size() == 0);
2847
2864
  }
2848
2865
  }
2866
+ if (!s.ok()) {
2867
+ break;
2868
+ }
2849
2869
  }
2850
2870
 
2851
2871
  uint64_t num_levels = 0;
@@ -4266,6 +4286,74 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
4266
4286
  return scratch->buffer;
4267
4287
  }
4268
4288
 
4289
+ bool VersionStorageInfo::HasMissingEpochNumber() const {
4290
+ for (int level = 0; level < num_levels_; ++level) {
4291
+ for (const FileMetaData* f : files_[level]) {
4292
+ if (f->epoch_number == kUnknownEpochNumber) {
4293
+ return true;
4294
+ }
4295
+ }
4296
+ }
4297
+ return false;
4298
+ }
4299
+
4300
+ uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const {
4301
+ uint64_t max_epoch_number = kUnknownEpochNumber;
4302
+ for (int level = 0; level < num_levels_; ++level) {
4303
+ for (const FileMetaData* f : files_[level]) {
4304
+ max_epoch_number = std::max(max_epoch_number, f->epoch_number);
4305
+ }
4306
+ }
4307
+ return max_epoch_number;
4308
+ }
4309
+
4310
+ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) {
4311
+ cfd->ResetNextEpochNumber();
4312
+
4313
+ bool reserve_epoch_num_for_file_ingested_behind =
4314
+ cfd->ioptions()->allow_ingest_behind;
4315
+ if (reserve_epoch_num_for_file_ingested_behind) {
4316
+ uint64_t reserved_epoch_number = cfd->NewEpochNumber();
4317
+ assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind);
4318
+ ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(),
4319
+ "[%s]CF has reserved epoch number %" PRIu64
4320
+ " for files ingested "
4321
+ "behind since `Options::allow_ingest_behind` is true",
4322
+ cfd->GetName().c_str(), reserved_epoch_number);
4323
+ }
4324
+
4325
+ if (HasMissingEpochNumber()) {
4326
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing);
4327
+ assert(num_levels_ >= 1);
4328
+
4329
+ for (int level = num_levels_ - 1; level >= 1; --level) {
4330
+ auto& files_at_level = files_[level];
4331
+ if (files_at_level.empty()) {
4332
+ continue;
4333
+ }
4334
+ uint64_t next_epoch_number = cfd->NewEpochNumber();
4335
+ for (FileMetaData* f : files_at_level) {
4336
+ f->epoch_number = next_epoch_number;
4337
+ }
4338
+ }
4339
+
4340
+ for (auto file_meta_iter = files_[0].rbegin();
4341
+ file_meta_iter != files_[0].rend(); file_meta_iter++) {
4342
+ FileMetaData* f = *file_meta_iter;
4343
+ f->epoch_number = cfd->NewEpochNumber();
4344
+ }
4345
+
4346
+ ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(),
4347
+ "[%s]CF's epoch numbers are inferred based on seqno",
4348
+ cfd->GetName().c_str());
4349
+ epoch_number_requirement_ = EpochNumberRequirement::kMustPresent;
4350
+ } else {
4351
+ assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent);
4352
+ cfd->SetNextEpochNumber(
4353
+ std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber()));
4354
+ }
4355
+ }
4356
+
4269
4357
  uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
4270
4358
  uint64_t result = 0;
4271
4359
  std::vector<FileMetaData*> overlaps;
@@ -4963,10 +5051,15 @@ Status VersionSet::ProcessManifestWrites(
4963
5051
  if (!descriptor_log_ ||
4964
5052
  manifest_file_size_ > db_options_->max_manifest_file_size) {
4965
5053
  TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
5054
+ TEST_SYNC_POINT_CALLBACK(
5055
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest", nullptr);
4966
5056
  new_descriptor_log = true;
4967
5057
  } else {
4968
5058
  pending_manifest_file_number_ = manifest_file_number_;
4969
5059
  }
5060
+ TEST_SYNC_POINT_CALLBACK(
5061
+ "VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot",
5062
+ &new_descriptor_log);
4970
5063
 
4971
5064
  // Local cached copy of state variable(s). WriteCurrentStateToManifest()
4972
5065
  // reads its content after releasing db mutex to avoid race with
@@ -5095,6 +5188,7 @@ Status VersionSet::ProcessManifestWrites(
5095
5188
  break;
5096
5189
  }
5097
5190
  }
5191
+
5098
5192
  if (s.ok()) {
5099
5193
  io_s = SyncManifest(db_options_, descriptor_log_->file());
5100
5194
  manifest_io_status = io_s;
@@ -5502,7 +5596,8 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
5502
5596
  Status VersionSet::Recover(
5503
5597
  const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
5504
5598
  std::string* db_id, bool no_error_if_files_missing) {
5505
- // Read "CURRENT" file, which contains a pointer to the current manifest file
5599
+ // Read "CURRENT" file, which contains a pointer to the current manifest
5600
+ // file
5506
5601
  std::string manifest_path;
5507
5602
  Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
5508
5603
  &manifest_file_number_);
@@ -5536,7 +5631,8 @@ Status VersionSet::Recover(
5536
5631
  true /* checksum */, 0 /* log_number */);
5537
5632
  VersionEditHandler handler(
5538
5633
  read_only, column_families, const_cast<VersionSet*>(this),
5539
- /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
5634
+ /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
5635
+ EpochNumberRequirement::kMightMissing);
5540
5636
  handler.Iterate(reader, &log_read_status);
5541
5637
  s = handler.status();
5542
5638
  if (s.ok()) {
@@ -5545,6 +5641,9 @@ Status VersionSet::Recover(
5545
5641
  assert(current_manifest_file_size != 0);
5546
5642
  handler.GetDbId(db_id);
5547
5643
  }
5644
+ if (s.ok()) {
5645
+ RecoverEpochNumbers();
5646
+ }
5548
5647
  }
5549
5648
 
5550
5649
  if (s.ok()) {
@@ -5704,7 +5803,8 @@ Status VersionSet::TryRecoverFromOneManifest(
5704
5803
  log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
5705
5804
  /*checksum=*/true, /*log_num=*/0);
5706
5805
  VersionEditHandlerPointInTime handler_pit(
5707
- read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
5806
+ read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
5807
+ EpochNumberRequirement::kMightMissing);
5708
5808
 
5709
5809
  handler_pit.Iterate(reader, &s);
5710
5810
 
@@ -5713,7 +5813,21 @@ Status VersionSet::TryRecoverFromOneManifest(
5713
5813
  assert(nullptr != has_missing_table_file);
5714
5814
  *has_missing_table_file = handler_pit.HasMissingFiles();
5715
5815
 
5716
- return handler_pit.status();
5816
+ s = handler_pit.status();
5817
+ if (s.ok()) {
5818
+ RecoverEpochNumbers();
5819
+ }
5820
+ return s;
5821
+ }
5822
+
5823
+ void VersionSet::RecoverEpochNumbers() {
5824
+ for (auto cfd : *column_family_set_) {
5825
+ if (cfd->IsDropped()) {
5826
+ continue;
5827
+ }
5828
+ assert(cfd->initialized());
5829
+ cfd->RecoverEpochNumbers();
5830
+ }
5717
5831
  }
5718
5832
 
5719
5833
  Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
@@ -6033,6 +6147,22 @@ Status VersionSet::WriteCurrentStateToManifest(
6033
6147
  }
6034
6148
  }
6035
6149
 
6150
+ // New manifest should rollover the WAL deletion record from previous
6151
+ // manifest. Otherwise, when an addition record of a deleted WAL gets added to
6152
+ // this new manifest later (which can happen in, e.g., SyncWAL()), this new
6153
+ // manifest creates an illusion that such WAL hasn't been deleted.
6154
+ VersionEdit wal_deletions;
6155
+ wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
6156
+ std::string wal_deletions_record;
6157
+ if (!wal_deletions.EncodeTo(&wal_deletions_record)) {
6158
+ return Status::Corruption("Unable to Encode VersionEdit: " +
6159
+ wal_deletions.DebugString(true));
6160
+ }
6161
+ io_s = log->AddRecord(wal_deletions_record);
6162
+ if (!io_s.ok()) {
6163
+ return io_s;
6164
+ }
6165
+
6036
6166
  for (auto cfd : *column_family_set_) {
6037
6167
  assert(cfd);
6038
6168
 
@@ -6084,7 +6214,7 @@ Status VersionSet::WriteCurrentStateToManifest(
6084
6214
  f->fd.smallest_seqno, f->fd.largest_seqno,
6085
6215
  f->marked_for_compaction, f->temperature,
6086
6216
  f->oldest_blob_file_number, f->oldest_ancester_time,
6087
- f->file_creation_time, f->file_checksum,
6217
+ f->file_creation_time, f->epoch_number, f->file_checksum,
6088
6218
  f->file_checksum_func_name, f->unique_id);
6089
6219
  }
6090
6220
  }
@@ -6456,6 +6586,14 @@ InternalIterator* VersionSet::MakeInputIterator(
6456
6586
  c->num_input_levels() - 1
6457
6587
  : c->num_input_levels());
6458
6588
  InternalIterator** list = new InternalIterator*[space];
6589
+ // First item in the pair is a pointer to range tombstones.
6590
+ // Second item is a pointer to a member of a LevelIterator,
6591
+ // that will be initialized to where CompactionMergingIterator stores
6592
+ // pointer to its range tombstones. This is used by LevelIterator
6593
+ // to update pointer to range tombstones as it traverses different SST files.
6594
+ std::vector<
6595
+ std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
6596
+ range_tombstones;
6459
6597
  size_t num = 0;
6460
6598
  for (size_t which = 0; which < c->num_input_levels(); which++) {
6461
6599
  if (c->input_levels(which)->num_files != 0) {
@@ -6476,7 +6614,7 @@ InternalIterator* VersionSet::MakeInputIterator(
6476
6614
  end.value(), fmd.smallest.user_key()) < 0) {
6477
6615
  continue;
6478
6616
  }
6479
-
6617
+ TruncatedRangeDelIterator* range_tombstone_iter = nullptr;
6480
6618
  list[num++] = cfd->table_cache()->NewIterator(
6481
6619
  read_options, file_options_compactions,
6482
6620
  cfd->internal_comparator(), fmd, range_del_agg,
@@ -6489,10 +6627,13 @@ InternalIterator* VersionSet::MakeInputIterator(
6489
6627
  MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
6490
6628
  /*smallest_compaction_key=*/nullptr,
6491
6629
  /*largest_compaction_key=*/nullptr,
6492
- /*allow_unprepared_value=*/false);
6630
+ /*allow_unprepared_value=*/false,
6631
+ /*range_del_iter=*/&range_tombstone_iter);
6632
+ range_tombstones.emplace_back(range_tombstone_iter, nullptr);
6493
6633
  }
6494
6634
  } else {
6495
6635
  // Create concatenating iterator for the files from this level
6636
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
6496
6637
  list[num++] = new LevelIterator(
6497
6638
  cfd->table_cache(), read_options, file_options_compactions,
6498
6639
  cfd->internal_comparator(), c->input_levels(which),
@@ -6501,14 +6642,15 @@ InternalIterator* VersionSet::MakeInputIterator(
6501
6642
  /*no per level latency histogram=*/nullptr,
6502
6643
  TableReaderCaller::kCompaction, /*skip_filters=*/false,
6503
6644
  /*level=*/static_cast<int>(c->level(which)), range_del_agg,
6504
- c->boundaries(which));
6645
+ c->boundaries(which), false, &tombstone_iter_ptr);
6646
+ range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
6505
6647
  }
6506
6648
  }
6507
6649
  }
6508
6650
  assert(num <= space);
6509
- InternalIterator* result =
6510
- NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
6511
- static_cast<int>(num));
6651
+ InternalIterator* result = NewCompactionMergingIterator(
6652
+ &c->column_family_data()->internal_comparator(), list,
6653
+ static_cast<int>(num), range_tombstones);
6512
6654
  delete[] list;
6513
6655
  return result;
6514
6656
  }
@@ -6575,6 +6717,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
6575
6717
  filemetadata.temperature = file->temperature;
6576
6718
  filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
6577
6719
  filemetadata.file_creation_time = file->TryGetFileCreationTime();
6720
+ filemetadata.epoch_number = file->epoch_number;
6578
6721
  metadata->push_back(filemetadata);
6579
6722
  }
6580
6723
  }
@@ -6700,8 +6843,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
6700
6843
  return all_versions_blob_file_size;
6701
6844
  }
6702
6845
 
6703
- Status VersionSet::VerifyFileMetadata(const std::string& fpath,
6704
- const FileMetaData& meta) const {
6846
+ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
6847
+ const std::string& fpath, int level,
6848
+ const FileMetaData& meta) {
6705
6849
  uint64_t fsize = 0;
6706
6850
  Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
6707
6851
  if (status.ok()) {
@@ -6709,6 +6853,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath,
6709
6853
  status = Status::Corruption("File size mismatch: " + fpath);
6710
6854
  }
6711
6855
  }
6856
+ if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
6857
+ assert(cfd);
6858
+ TableCache* table_cache = cfd->table_cache();
6859
+ assert(table_cache);
6860
+
6861
+ const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
6862
+ assert(cf_opts);
6863
+ std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
6864
+ size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
6865
+
6866
+ const FileOptions& file_opts = file_options();
6867
+
6868
+ Version* version = cfd->current();
6869
+ assert(version);
6870
+ VersionStorageInfo& storage_info = version->storage_info_;
6871
+ const InternalKeyComparator* icmp = storage_info.InternalComparator();
6872
+ assert(icmp);
6873
+
6874
+ InternalStats* internal_stats = cfd->internal_stats();
6875
+
6876
+ FileMetaData meta_copy = meta;
6877
+ status = table_cache->FindTable(
6878
+ ReadOptions(), file_opts, *icmp, meta_copy,
6879
+ &(meta_copy.table_reader_handle), pe,
6880
+ /*no_io=*/false, /*record_read_stats=*/true,
6881
+ internal_stats->GetFileReadHist(level), false, level,
6882
+ /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
6883
+ meta_copy.temperature);
6884
+ if (meta_copy.table_reader_handle) {
6885
+ table_cache->ReleaseHandle(meta_copy.table_reader_handle);
6886
+ }
6887
+ }
6712
6888
  return status;
6713
6889
  }
6714
6890
 
@@ -6744,12 +6920,17 @@ Status ReactiveVersionSet::Recover(
6744
6920
  log::Reader* reader = manifest_reader->get();
6745
6921
  assert(reader);
6746
6922
 
6747
- manifest_tailer_.reset(new ManifestTailer(
6748
- column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
6923
+ manifest_tailer_.reset(
6924
+ new ManifestTailer(column_families, const_cast<ReactiveVersionSet*>(this),
6925
+ io_tracer_, EpochNumberRequirement::kMightMissing));
6749
6926
 
6750
6927
  manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
6751
6928
 
6752
- return manifest_tailer_->status();
6929
+ s = manifest_tailer_->status();
6930
+ if (s.ok()) {
6931
+ RecoverEpochNumbers();
6932
+ }
6933
+ return s;
6753
6934
  }
6754
6935
 
6755
6936
  Status ReactiveVersionSet::ReadAndApply(
@@ -116,6 +116,10 @@ extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
116
116
  extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
117
117
  const std::vector<FileMetaData*>& files,
118
118
  Arena* arena);
119
+ enum EpochNumberRequirement {
120
+ kMightMissing,
121
+ kMustPresent,
122
+ };
119
123
 
120
124
  // Information of the storage associated with each Version, including number of
121
125
  // levels of LSM tree, files information at each level, files marked for
@@ -126,7 +130,9 @@ class VersionStorageInfo {
126
130
  const Comparator* user_comparator, int num_levels,
127
131
  CompactionStyle compaction_style,
128
132
  VersionStorageInfo* src_vstorage,
129
- bool _force_consistency_checks);
133
+ bool _force_consistency_checks,
134
+ EpochNumberRequirement epoch_number_requirement =
135
+ EpochNumberRequirement::kMustPresent);
130
136
  // No copying allowed
131
137
  VersionStorageInfo(const VersionStorageInfo&) = delete;
132
138
  void operator=(const VersionStorageInfo&) = delete;
@@ -319,6 +325,17 @@ class VersionStorageInfo {
319
325
  return files_[level];
320
326
  }
321
327
 
328
+ bool HasMissingEpochNumber() const;
329
+ uint64_t GetMaxEpochNumberOfFiles() const;
330
+ EpochNumberRequirement GetEpochNumberRequirement() const {
331
+ return epoch_number_requirement_;
332
+ }
333
+ void SetEpochNumberRequirement(
334
+ EpochNumberRequirement epoch_number_requirement) {
335
+ epoch_number_requirement_ = epoch_number_requirement;
336
+ }
337
+ void RecoverEpochNumbers(ColumnFamilyData* cfd);
338
+
322
339
  class FileLocation {
323
340
  public:
324
341
  FileLocation() = default;
@@ -440,6 +457,11 @@ class VersionStorageInfo {
440
457
  return files_marked_for_compaction_;
441
458
  }
442
459
 
460
+ void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
461
+ f->marked_for_compaction = true;
462
+ files_marked_for_compaction_.emplace_back(level, f);
463
+ }
464
+
443
465
  // REQUIRES: ComputeCompactionScore has been called
444
466
  // REQUIRES: DB mutex held during access
445
467
  const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
@@ -723,6 +745,8 @@ class VersionStorageInfo {
723
745
  // is compiled in release mode
724
746
  bool force_consistency_checks_;
725
747
 
748
+ EpochNumberRequirement epoch_number_requirement_;
749
+
726
750
  friend class Version;
727
751
  friend class VersionSet;
728
752
  };
@@ -1047,7 +1071,9 @@ class Version {
1047
1071
  Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
1048
1072
  MutableCFOptions mutable_cf_options,
1049
1073
  const std::shared_ptr<IOTracer>& io_tracer,
1050
- uint64_t version_number = 0);
1074
+ uint64_t version_number = 0,
1075
+ EpochNumberRequirement epoch_number_requirement =
1076
+ EpochNumberRequirement::kMustPresent);
1051
1077
 
1052
1078
  ~Version();
1053
1079
 
@@ -1188,6 +1214,10 @@ class VersionSet {
1188
1214
  const std::vector<ColumnFamilyDescriptor>& column_families,
1189
1215
  bool read_only, std::string* db_id, bool* has_missing_table_file);
1190
1216
 
1217
+ // Recover the next epoch number of each CF and the epoch numbers
1218
+ // of its files (if missing)
1219
+ void RecoverEpochNumbers();
1220
+
1191
1221
  // Reads a manifest file and returns a list of column families in
1192
1222
  // column_families.
1193
1223
  static Status ListColumnFamilies(std::vector<std::string>* column_families,
@@ -1501,8 +1531,8 @@ class VersionSet {
1501
1531
  ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
1502
1532
  const VersionEdit* edit);
1503
1533
 
1504
- Status VerifyFileMetadata(const std::string& fpath,
1505
- const FileMetaData& meta) const;
1534
+ Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
1535
+ int level, const FileMetaData& meta);
1506
1536
 
1507
1537
  // Protected by DB mutex.
1508
1538
  WalSet wals_;