@nxtedition/rocksdb 12.1.3 → 12.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. package/binding.cc +12 -13
  2. package/binding.gyp +0 -4
  3. package/deps/rocksdb/rocksdb/Makefile +10 -5
  4. package/deps/rocksdb/rocksdb/TARGETS +9 -7
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
  6. package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
  13. package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
  15. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
  16. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
  19. package/deps/rocksdb/rocksdb/db/c.cc +9 -0
  20. package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
  21. package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
  22. package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
  40. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
  54. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
  56. package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
  57. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
  58. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
  59. package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
  60. package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
  61. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  62. package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
  63. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
  65. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
  66. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
  67. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
  68. package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
  70. package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
  71. package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
  72. package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
  73. package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  75. package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
  78. package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
  79. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
  80. package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
  81. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
  85. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
  86. package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
  87. package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
  88. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
  89. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
  90. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
  91. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
  92. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
  94. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
  95. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
  96. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
  97. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
  98. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  100. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  101. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
  102. package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
  103. package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
  104. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  105. package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
  106. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  107. package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
  108. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
  109. package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
  110. package/deps/rocksdb/rocksdb/port/port.h +16 -0
  111. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
  112. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
  113. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
  114. package/deps/rocksdb/rocksdb/table/format.cc +32 -4
  115. package/deps/rocksdb/rocksdb/table/format.h +12 -1
  116. package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
  117. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
  118. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
  119. package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
  120. package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
  121. package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
  122. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
  123. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
  124. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
  125. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
  126. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
  127. package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
  128. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
  129. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
  130. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
  131. package/package.json +1 -1
  132. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  133. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -43,6 +43,14 @@ uint64_t DBImpl::GetObsoleteSstFilesSize() {
43
43
  return versions_->GetObsoleteSstFilesSize();
44
44
  }
45
45
 
46
+ uint64_t DBImpl::MinOptionsFileNumberToKeep() {
47
+ mutex_.AssertHeld();
48
+ if (!min_options_file_numbers_.empty()) {
49
+ return *min_options_file_numbers_.begin();
50
+ }
51
+ return std::numeric_limits<uint64_t>::max();
52
+ }
53
+
46
54
  Status DBImpl::DisableFileDeletions() {
47
55
  Status s;
48
56
  int my_disable_delete_obsolete_files;
@@ -147,6 +155,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
147
155
  // here but later find newer generated unfinalized files while scanning.
148
156
  job_context->min_pending_output = MinObsoleteSstNumberToKeep();
149
157
  job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine();
158
+ job_context->min_options_file_number = MinOptionsFileNumberToKeep();
150
159
 
151
160
  // Get obsolete files. This function will also update the list of
152
161
  // pending files in VersionSet().
@@ -498,7 +507,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
498
507
  dbname_);
499
508
 
500
509
  // File numbers of most recent two OPTIONS file in candidate_files (found in
501
- // previos FindObsoleteFiles(full_scan=true))
510
+ // previous FindObsoleteFiles(full_scan=true))
502
511
  // At this point, there must not be any duplicate file numbers in
503
512
  // candidate_files.
504
513
  uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
@@ -519,6 +528,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
519
528
  }
520
529
  }
521
530
 
531
+ // For remote compactions, we need to keep OPTIONS file that may get
532
+ // referenced by the remote worker
533
+
534
+ optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number);
535
+
522
536
  // Close WALs before trying to delete them.
523
537
  for (const auto w : state.logs_to_free) {
524
538
  // TODO: maybe check the return value of Close.
@@ -722,6 +736,38 @@ void DBImpl::DeleteObsoleteFiles() {
722
736
  mutex_.Lock();
723
737
  }
724
738
 
739
+ VersionEdit GetDBRecoveryEditForObsoletingMemTables(
740
+ VersionSet* vset, const ColumnFamilyData& cfd,
741
+ const autovector<VersionEdit*>& edit_list,
742
+ const autovector<MemTable*>& memtables, LogsWithPrepTracker* prep_tracker) {
743
+ VersionEdit wal_deletion_edit;
744
+ uint64_t min_wal_number_to_keep = 0;
745
+ assert(edit_list.size() > 0);
746
+ if (vset->db_options()->allow_2pc) {
747
+ // Note that if mempurge is successful, the edit_list will
748
+ // not be applicable (contains info of new min_log number to keep,
749
+ // and level 0 file path of SST file created during normal flush,
750
+ // so both pieces of information are irrelevant after a successful
751
+ // mempurge operation).
752
+ min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
753
+ vset, cfd, edit_list, memtables, prep_tracker);
754
+
755
+ // We piggyback the information of earliest log file to keep in the
756
+ // manifest entry for the last file flushed.
757
+ } else {
758
+ min_wal_number_to_keep =
759
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfd, edit_list);
760
+ }
761
+
762
+ wal_deletion_edit.SetMinLogNumberToKeep(min_wal_number_to_keep);
763
+ if (vset->db_options()->track_and_verify_wals_in_manifest) {
764
+ if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
765
+ wal_deletion_edit.DeleteWalsBefore(min_wal_number_to_keep);
766
+ }
767
+ }
768
+ return wal_deletion_edit;
769
+ }
770
+
725
771
  uint64_t FindMinPrepLogReferencedByMemTable(
726
772
  VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
727
773
  uint64_t min_log = 0;
@@ -921,59 +967,60 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
921
967
  }
922
968
 
923
969
  void DBImpl::SetDBId(std::string&& id, bool read_only,
924
- RecoveryContext* recovery_ctx) {
970
+ VersionEdit* version_edit) {
925
971
  assert(db_id_.empty());
926
972
  assert(!id.empty());
927
973
  db_id_ = std::move(id);
928
- if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
929
- assert(recovery_ctx != nullptr);
974
+ if (!read_only && version_edit) {
975
+ assert(version_edit != nullptr);
930
976
  assert(versions_->GetColumnFamilySet() != nullptr);
931
- VersionEdit edit;
932
- edit.SetDBId(db_id_);
977
+ version_edit->SetDBId(db_id_);
933
978
  versions_->db_id_ = db_id_;
934
- recovery_ctx->UpdateVersionEdits(
935
- versions_->GetColumnFamilySet()->GetDefault(), edit);
936
979
  }
937
980
  }
938
981
 
939
982
  Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
940
- RecoveryContext* recovery_ctx) {
983
+ bool is_new_db, VersionEdit* version_edit) {
941
984
  Status s;
942
- // Check for the IDENTITY file and create it if not there or
943
- // broken or not matching manifest
944
- std::string db_id_in_file;
945
- s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
946
- if (s.ok()) {
947
- s = GetDbIdentityFromIdentityFile(&db_id_in_file);
948
- if (s.ok() && !db_id_in_file.empty()) {
949
- if (db_id_.empty()) {
950
- // Loaded from file and wasn't already known from manifest
951
- SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
952
- return s;
953
- } else if (db_id_ == db_id_in_file) {
954
- // Loaded from file and matches manifest
955
- return s;
985
+ if (!is_new_db) {
986
+ // Check for the IDENTITY file and create it if not there or
987
+ // broken or not matching manifest
988
+ std::string db_id_in_file;
989
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
990
+ if (s.ok()) {
991
+ s = GetDbIdentityFromIdentityFile(&db_id_in_file);
992
+ if (s.ok() && !db_id_in_file.empty()) {
993
+ if (db_id_.empty()) {
994
+ // Loaded from file and wasn't already known from manifest
995
+ SetDBId(std::move(db_id_in_file), read_only, version_edit);
996
+ return s;
997
+ } else if (db_id_ == db_id_in_file) {
998
+ // Loaded from file and matches manifest
999
+ return s;
1000
+ }
956
1001
  }
957
1002
  }
958
- }
959
- if (s.IsNotFound()) {
960
- s = Status::OK();
961
- }
962
- if (!s.ok()) {
963
- assert(s.IsIOError());
964
- return s;
1003
+ if (s.IsNotFound()) {
1004
+ s = Status::OK();
1005
+ }
1006
+ if (!s.ok()) {
1007
+ assert(s.IsIOError());
1008
+ return s;
1009
+ }
965
1010
  }
966
1011
  // Otherwise IDENTITY file is missing or no good.
967
1012
  // Generate new id if needed
968
1013
  if (db_id_.empty()) {
969
- SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
1014
+ SetDBId(env_->GenerateUniqueId(), read_only, version_edit);
970
1015
  }
971
1016
  // Persist it to IDENTITY file if allowed
972
- if (!read_only) {
1017
+ if (!read_only && immutable_db_options_.write_identity_file) {
973
1018
  s = SetIdentityFile(write_options, env_, dbname_,
974
1019
  immutable_db_options_.metadata_write_temperature,
975
1020
  db_id_);
976
1021
  }
1022
+ // NOTE: an obsolete IDENTITY file with write_identity_file=false is handled
1023
+ // elsewhere, so that it's only deleted after successful recovery
977
1024
  return s;
978
1025
  }
979
1026
 
@@ -289,28 +289,25 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
289
289
  "start_time and end_time cannot be the same");
290
290
  }
291
291
  }
292
+
293
+ if (!db_options.write_dbid_to_manifest && !db_options.write_identity_file) {
294
+ return Status::InvalidArgument(
295
+ "write_dbid_to_manifest and write_identity_file cannot both be false");
296
+ }
292
297
  return Status::OK();
293
298
  }
294
299
 
295
300
  Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
296
- VersionEdit new_db;
301
+ VersionEdit new_db_edit;
297
302
  const WriteOptions write_options(Env::IOActivity::kDBOpen);
298
- Status s = SetIdentityFile(write_options, env_, dbname_,
299
- immutable_db_options_.metadata_write_temperature);
303
+ Status s = SetupDBId(write_options, /*read_only=*/false, /*is_new_db=*/true,
304
+ &new_db_edit);
300
305
  if (!s.ok()) {
301
306
  return s;
302
307
  }
303
- if (immutable_db_options_.write_dbid_to_manifest) {
304
- std::string temp_db_id;
305
- s = GetDbIdentityFromIdentityFile(&temp_db_id);
306
- if (!s.ok()) {
307
- return s;
308
- }
309
- new_db.SetDBId(temp_db_id);
310
- }
311
- new_db.SetLogNumber(0);
312
- new_db.SetNextFile(2);
313
- new_db.SetLastSequence(0);
308
+ new_db_edit.SetLogNumber(0);
309
+ new_db_edit.SetNextFile(2);
310
+ new_db_edit.SetLastSequence(0);
314
311
 
315
312
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
316
313
  const std::string manifest = DescriptorFileName(dbname_, 1);
@@ -342,7 +339,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
342
339
  tmp_set.Contains(FileType::kDescriptorFile)));
343
340
  log::Writer log(std::move(file_writer), 0, false);
344
341
  std::string record;
345
- new_db.EncodeTo(&record);
342
+ new_db_edit.EncodeTo(&record);
346
343
  s = log.AddRecord(write_options, record);
347
344
  if (s.ok()) {
348
345
  s = SyncManifest(&immutable_db_options_, write_options, log.file());
@@ -528,7 +525,7 @@ Status DBImpl::Recover(
528
525
  }
529
526
  assert(s.ok());
530
527
  }
531
- assert(db_id_.empty());
528
+ assert(is_new_db || db_id_.empty());
532
529
  Status s;
533
530
  bool missing_table_file = false;
534
531
  if (!immutable_db_options_.best_efforts_recovery) {
@@ -674,7 +671,17 @@ Status DBImpl::Recover(
674
671
  }
675
672
  }
676
673
  }
677
- s = SetupDBId(write_options, read_only, recovery_ctx);
674
+ if (is_new_db) {
675
+ // Already set up DB ID in NewDB
676
+ } else if (immutable_db_options_.write_dbid_to_manifest && recovery_ctx) {
677
+ VersionEdit edit;
678
+ s = SetupDBId(write_options, read_only, is_new_db, &edit);
679
+ recovery_ctx->UpdateVersionEdits(
680
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
681
+ } else {
682
+ s = SetupDBId(write_options, read_only, is_new_db, nullptr);
683
+ }
684
+ assert(!s.ok() || !db_id_.empty());
678
685
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
679
686
  if (s.ok() && !read_only) {
680
687
  s = MaybeUpdateNextFileNumber(recovery_ctx);
@@ -1662,7 +1669,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1662
1669
  TableProperties table_properties;
1663
1670
  {
1664
1671
  ScopedArenaPtr<InternalIterator> iter(
1665
- mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena));
1672
+ mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena,
1673
+ /*prefix_extractor=*/nullptr));
1666
1674
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
1667
1675
  "[%s] [WriteLevel0TableForRecovery]"
1668
1676
  " Level-0 table #%" PRIu64 ": started",
@@ -1681,7 +1689,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1681
1689
  meta.oldest_ancester_time = current_time;
1682
1690
  meta.epoch_number = cfd->NewEpochNumber();
1683
1691
  {
1684
- auto write_hint = cfd->CalculateSSTWriteHint(0);
1692
+ auto write_hint =
1693
+ cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
1685
1694
  mutex_.Unlock();
1686
1695
 
1687
1696
  SequenceNumber earliest_write_conflict_snapshot;
@@ -1987,46 +1996,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
1987
1996
 
1988
1997
  void DBImpl::TrackExistingDataFiles(
1989
1998
  const std::vector<std::string>& existing_data_files) {
1990
- auto sfm = static_cast<SstFileManagerImpl*>(
1991
- immutable_db_options_.sst_file_manager.get());
1992
- assert(sfm);
1993
- std::vector<ColumnFamilyMetaData> metadata;
1994
- GetAllColumnFamilyMetaData(&metadata);
1995
-
1996
- std::unordered_set<std::string> referenced_files;
1997
- for (const auto& md : metadata) {
1998
- for (const auto& lmd : md.levels) {
1999
- for (const auto& fmd : lmd.files) {
2000
- // We're assuming that each sst file name exists in at most one of
2001
- // the paths.
2002
- std::string file_path =
2003
- fmd.directory + kFilePathSeparator + fmd.relative_filename;
2004
- sfm->OnAddFile(file_path, fmd.size).PermitUncheckedError();
2005
- referenced_files.insert(file_path);
2006
- }
2007
- }
2008
- for (const auto& bmd : md.blob_files) {
2009
- std::string name = bmd.blob_file_name;
2010
- // The BlobMetaData.blob_file_name may start with "/".
2011
- if (!name.empty() && name[0] == kFilePathSeparator) {
2012
- name = name.substr(1);
2013
- }
2014
- // We're assuming that each blob file name exists in at most one of
2015
- // the paths.
2016
- std::string file_path = bmd.blob_file_path + kFilePathSeparator + name;
2017
- sfm->OnAddFile(file_path, bmd.blob_file_size).PermitUncheckedError();
2018
- referenced_files.insert(file_path);
2019
- }
2020
- }
2021
-
2022
- for (const auto& file_path : existing_data_files) {
2023
- if (referenced_files.find(file_path) != referenced_files.end()) {
2024
- continue;
2025
- }
2026
- // There shouldn't be any duplicated files. In case there is, SstFileManager
2027
- // will take care of deduping it.
2028
- sfm->OnAddFile(file_path).PermitUncheckedError();
2029
- }
1999
+ TrackOrUntrackFiles(existing_data_files, /*track=*/true);
2030
2000
  }
2031
2001
 
2032
2002
  Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
@@ -2170,6 +2140,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2170
2140
  s = impl->LogAndApplyForRecovery(recovery_ctx);
2171
2141
  }
2172
2142
 
2143
+ if (s.ok() && !impl->immutable_db_options_.write_identity_file) {
2144
+ // On successful recovery, delete an obsolete IDENTITY file to avoid DB ID
2145
+ // inconsistency
2146
+ impl->env_->DeleteFile(IdentityFileName(impl->dbname_))
2147
+ .PermitUncheckedError();
2148
+ }
2149
+
2173
2150
  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
2174
2151
  impl->mutex_.AssertHeld();
2175
2152
  s = impl->InitPersistStatsColumnFamily();
@@ -12,7 +12,8 @@
12
12
  #include "logging/auto_roll_logger.h"
13
13
  #include "logging/logging.h"
14
14
  #include "monitoring/perf_context_imp.h"
15
- #include "rocksdb/configurable.h"
15
+ #include "rocksdb/convenience.h"
16
+ #include "rocksdb/utilities/options_util.h"
16
17
  #include "util/cast_util.h"
17
18
  #include "util/write_batch_util.h"
18
19
 
@@ -938,69 +939,101 @@ Status DB::OpenAndCompact(
938
939
  const std::string& output_directory, const std::string& input,
939
940
  std::string* output,
940
941
  const CompactionServiceOptionsOverride& override_options) {
942
+ // Check for cancellation
941
943
  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
942
944
  return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
943
945
  }
946
+
947
+ // 1. Deserialize Compaction Input
944
948
  CompactionServiceInput compaction_input;
945
949
  Status s = CompactionServiceInput::Read(input, &compaction_input);
946
950
  if (!s.ok()) {
947
951
  return s;
948
952
  }
949
953
 
950
- compaction_input.db_options.max_open_files = -1;
951
- compaction_input.db_options.compaction_service = nullptr;
952
- if (compaction_input.db_options.statistics) {
953
- compaction_input.db_options.statistics.reset();
954
+ // 2. Load the options
955
+ DBOptions db_options;
956
+ ConfigOptions config_options;
957
+ config_options.env = override_options.env;
958
+ std::vector<ColumnFamilyDescriptor> all_column_families;
959
+
960
+ std::string options_file_name =
961
+ OptionsFileName(name, compaction_input.options_file_number);
962
+
963
+ s = LoadOptionsFromFile(config_options, options_file_name, &db_options,
964
+ &all_column_families);
965
+ if (!s.ok()) {
966
+ return s;
954
967
  }
955
- compaction_input.db_options.env = override_options.env;
956
- compaction_input.db_options.file_checksum_gen_factory =
957
- override_options.file_checksum_gen_factory;
958
- compaction_input.db_options.statistics = override_options.statistics;
959
- compaction_input.column_family.options.comparator =
960
- override_options.comparator;
961
- compaction_input.column_family.options.merge_operator =
962
- override_options.merge_operator;
963
- compaction_input.column_family.options.compaction_filter =
964
- override_options.compaction_filter;
965
- compaction_input.column_family.options.compaction_filter_factory =
966
- override_options.compaction_filter_factory;
967
- compaction_input.column_family.options.prefix_extractor =
968
- override_options.prefix_extractor;
969
- compaction_input.column_family.options.table_factory =
970
- override_options.table_factory;
971
- compaction_input.column_family.options.sst_partitioner_factory =
972
- override_options.sst_partitioner_factory;
973
- compaction_input.column_family.options.table_properties_collector_factories =
974
- override_options.table_properties_collector_factories;
975
- compaction_input.db_options.listeners = override_options.listeners;
976
968
 
969
+ // 3. Override pointer configurations in DBOptions with
970
+ // CompactionServiceOptionsOverride
971
+ db_options.env = override_options.env;
972
+ db_options.file_checksum_gen_factory =
973
+ override_options.file_checksum_gen_factory;
974
+ db_options.statistics = override_options.statistics;
975
+ db_options.listeners = override_options.listeners;
976
+ db_options.compaction_service = nullptr;
977
+ // We will close the DB after the compaction anyway.
978
+ // Open as many files as needed for the compaction.
979
+ db_options.max_open_files = -1;
980
+
981
+ // 4. Filter CFs that are needed for OpenAndCompact()
982
+ // We do not need to open all column families for the remote compaction.
983
+ // Only open default CF + target CF. If target CF == default CF, we will open
984
+ // just the default CF (Due to current limitation, DB cannot open without the
985
+ // default CF)
977
986
  std::vector<ColumnFamilyDescriptor> column_families;
978
- column_families.push_back(compaction_input.column_family);
979
- // TODO: we have to open default CF, because of an implementation limitation,
980
- // currently we just use the same CF option from input, which is not collect
981
- // and open may fail.
982
- if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
983
- column_families.emplace_back(kDefaultColumnFamilyName,
984
- compaction_input.column_family.options);
987
+ for (auto& cf : all_column_families) {
988
+ if (cf.name == compaction_input.cf_name) {
989
+ cf.options.comparator = override_options.comparator;
990
+ cf.options.merge_operator = override_options.merge_operator;
991
+ cf.options.compaction_filter = override_options.compaction_filter;
992
+ cf.options.compaction_filter_factory =
993
+ override_options.compaction_filter_factory;
994
+ cf.options.prefix_extractor = override_options.prefix_extractor;
995
+ cf.options.table_factory = override_options.table_factory;
996
+ cf.options.sst_partitioner_factory =
997
+ override_options.sst_partitioner_factory;
998
+ cf.options.table_properties_collector_factories =
999
+ override_options.table_properties_collector_factories;
1000
+ column_families.emplace_back(cf);
1001
+ } else if (cf.name == kDefaultColumnFamilyName) {
1002
+ column_families.emplace_back(cf);
1003
+ }
985
1004
  }
986
1005
 
1006
+ // 5. Open db As Secondary
987
1007
  DB* db;
988
1008
  std::vector<ColumnFamilyHandle*> handles;
989
-
990
- s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
991
- column_families, &handles, &db);
1009
+ s = DB::OpenAsSecondary(db_options, name, output_directory, column_families,
1010
+ &handles, &db);
992
1011
  if (!s.ok()) {
993
1012
  return s;
994
1013
  }
1014
+ assert(db);
1015
+
1016
+ // 6. Find the handle of the Column Family that this will compact
1017
+ ColumnFamilyHandle* cfh = nullptr;
1018
+ for (auto* handle : handles) {
1019
+ if (compaction_input.cf_name == handle->GetName()) {
1020
+ cfh = handle;
1021
+ break;
1022
+ }
1023
+ }
1024
+ assert(cfh);
995
1025
 
1026
+ // 7. Run the compaction without installation.
1027
+ // Output will be stored in the directory specified by output_directory
996
1028
  CompactionServiceResult compaction_result;
997
1029
  DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
998
- assert(handles.size() > 0);
999
- s = db_secondary->CompactWithoutInstallation(
1000
- options, handles[0], compaction_input, &compaction_result);
1030
+ s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input,
1031
+ &compaction_result);
1001
1032
 
1033
+ // 8. Serialize the result
1002
1034
  Status serialization_status = compaction_result.Write(output);
1003
1035
 
1036
+ // 9. Close the db and return
1004
1037
  for (auto& handle : handles) {
1005
1038
  delete handle;
1006
1039
  }
@@ -656,7 +656,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
656
656
 
657
657
  if (!io_s.ok()) {
658
658
  // Check WriteToWAL status
659
- IOStatusCheck(io_s);
659
+ WALIOStatusCheck(io_s);
660
660
  }
661
661
  if (!w.CallbackFailed()) {
662
662
  if (!io_s.ok()) {
@@ -799,7 +799,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
799
799
 
800
800
  if (!io_s.ok()) {
801
801
  // Check WriteToWAL status
802
- IOStatusCheck(io_s);
802
+ WALIOStatusCheck(io_s);
803
803
  } else if (!w.CallbackFailed()) {
804
804
  WriteStatusCheck(w.status);
805
805
  }
@@ -1077,7 +1077,7 @@ Status DBImpl::WriteImplWALOnly(
1077
1077
  // This error checking and return is moved up to avoid using uninitialized
1078
1078
  // last_sequence.
1079
1079
  if (!io_s.ok()) {
1080
- IOStatusCheck(io_s);
1080
+ WALIOStatusCheck(io_s);
1081
1081
  write_thread->ExitAsBatchGroupLeader(write_group, status);
1082
1082
  return status;
1083
1083
  }
@@ -1175,7 +1175,7 @@ void DBImpl::WriteStatusCheck(const Status& status) {
1175
1175
  }
1176
1176
  }
1177
1177
 
1178
- void DBImpl::IOStatusCheck(const IOStatus& io_status) {
1178
+ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
1179
1179
  // Is setting bg_error_ enough here? This will at least stop
1180
1180
  // compaction and fail any further writes.
1181
1181
  if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
@@ -1183,7 +1183,8 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) {
1183
1183
  io_status.IsIOFenced()) {
1184
1184
  mutex_.Lock();
1185
1185
  // Maybe change the return status to void?
1186
- error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
1186
+ error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback,
1187
+ /*wal_related=*/true);
1187
1188
  mutex_.Unlock();
1188
1189
  } else {
1189
1190
  // Force writable file to be continue writable.
@@ -1789,13 +1790,13 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
1789
1790
  if (!immutable_db_options_.atomic_flush) {
1790
1791
  FlushRequest flush_req;
1791
1792
  GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req);
1792
- SchedulePendingFlush(flush_req);
1793
+ EnqueuePendingFlush(flush_req);
1793
1794
  }
1794
1795
  }
1795
1796
  if (immutable_db_options_.atomic_flush) {
1796
1797
  FlushRequest flush_req;
1797
1798
  GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req);
1798
- SchedulePendingFlush(flush_req);
1799
+ EnqueuePendingFlush(flush_req);
1799
1800
  }
1800
1801
  MaybeScheduleFlushOrCompaction();
1801
1802
  }
@@ -1881,13 +1882,13 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
1881
1882
  FlushRequest flush_req;
1882
1883
  GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager,
1883
1884
  &flush_req);
1884
- SchedulePendingFlush(flush_req);
1885
+ EnqueuePendingFlush(flush_req);
1885
1886
  }
1886
1887
  }
1887
1888
  if (immutable_db_options_.atomic_flush) {
1888
1889
  FlushRequest flush_req;
1889
1890
  GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req);
1890
- SchedulePendingFlush(flush_req);
1891
+ EnqueuePendingFlush(flush_req);
1891
1892
  }
1892
1893
  MaybeScheduleFlushOrCompaction();
1893
1894
  }
@@ -2163,12 +2164,12 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
2163
2164
  AssignAtomicFlushSeq(cfds);
2164
2165
  FlushRequest flush_req;
2165
2166
  GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req);
2166
- SchedulePendingFlush(flush_req);
2167
+ EnqueuePendingFlush(flush_req);
2167
2168
  } else {
2168
2169
  for (auto* cfd : cfds) {
2169
2170
  FlushRequest flush_req;
2170
2171
  GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req);
2171
- SchedulePendingFlush(flush_req);
2172
+ EnqueuePendingFlush(flush_req);
2172
2173
  }
2173
2174
  }
2174
2175
  MaybeScheduleFlushOrCompaction();
@@ -2326,7 +2327,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2326
2327
  // We may have lost data from the WritableFileBuffer in-memory buffer for
2327
2328
  // the current log, so treat it as a fatal error and set bg_error
2328
2329
  if (!io_s.ok()) {
2329
- error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
2330
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable,
2331
+ /*wal_related=*/true);
2330
2332
  } else {
2331
2333
  error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
2332
2334
  }
@@ -895,6 +895,81 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
895
895
  SyncPoint::GetInstance()->DisableProcessing();
896
896
  }
897
897
 
898
+ TEST_P(DBIOCorruptionTest, FooterReadCorruptionRetry) {
899
+ Random rnd(300);
900
+ bool retry = false;
901
+
902
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
903
+ "ReadFooterFromFileInternal:0", [&](void* arg) {
904
+ Slice* data = static_cast<Slice*>(arg);
905
+ if (!retry) {
906
+ std::memcpy(const_cast<char*>(data->data()),
907
+ rnd.RandomString(static_cast<int>(data->size())).c_str(),
908
+ data->size());
909
+ retry = true;
910
+ }
911
+ });
912
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
913
+
914
+ ASSERT_OK(Put("key1", "val1"));
915
+ Status s = Flush();
916
+ if (std::get<2>(GetParam())) {
917
+ ASSERT_OK(s);
918
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
919
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
920
+ 1);
921
+
922
+ std::string val;
923
+ ReadOptions ro;
924
+ ro.async_io = std::get<1>(GetParam());
925
+ ASSERT_OK(dbfull()->Get(ro, "key1", &val));
926
+ ASSERT_EQ(val, "val1");
927
+ } else {
928
+ ASSERT_NOK(s);
929
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
930
+ ASSERT_GT(stats()->getTickerCount(SST_FOOTER_CORRUPTION_COUNT), 0);
931
+ }
932
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
933
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
934
+ }
935
+
936
+ TEST_P(DBIOCorruptionTest, TablePropertiesCorruptionRetry) {
937
+ Random rnd(300);
938
+ bool retry = false;
939
+
940
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
941
+ "ReadTablePropertiesHelper:0", [&](void* arg) {
942
+ Slice* data = static_cast<Slice*>(arg);
943
+ if (!retry) {
944
+ std::memcpy(const_cast<char*>(data->data()),
945
+ rnd.RandomString(static_cast<int>(data->size())).c_str(),
946
+ data->size());
947
+ retry = true;
948
+ }
949
+ });
950
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
951
+
952
+ ASSERT_OK(Put("key1", "val1"));
953
+ Status s = Flush();
954
+ if (std::get<2>(GetParam())) {
955
+ ASSERT_OK(s);
956
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
957
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
958
+ 1);
959
+
960
+ std::string val;
961
+ ReadOptions ro;
962
+ ro.async_io = std::get<1>(GetParam());
963
+ ASSERT_OK(dbfull()->Get(ro, "key1", &val));
964
+ ASSERT_EQ(val, "val1");
965
+ } else {
966
+ ASSERT_NOK(s);
967
+ ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
968
+ }
969
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
970
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
971
+ }
972
+
898
973
  // The parameters are - 1. Use FS provided buffer, 2. Use async IO ReadOption,
899
974
  // 3. Retry with verify_and_reconstruct_read IOOption
900
975
  INSTANTIATE_TEST_CASE_P(DBIOCorruptionTest, DBIOCorruptionTest,