@nxtedition/rocksdb 7.0.0-alpha.6 → 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/binding.cc +37 -36
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +10 -3
  3. package/deps/rocksdb/rocksdb/Makefile +8 -1
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +50 -2
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +9 -3
  7. package/deps/rocksdb/rocksdb/cache/cache_test.cc +111 -33
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +71 -31
  9. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +31 -30
  10. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +21 -8
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +35 -38
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +22 -9
  13. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +48 -0
  14. package/deps/rocksdb/rocksdb/db/blob/db_blob_compaction_test.cc +78 -0
  15. package/deps/rocksdb/rocksdb/db/builder.cc +7 -5
  16. package/deps/rocksdb/rocksdb/db/c.cc +777 -108
  17. package/deps/rocksdb/rocksdb/db/c_test.c +290 -30
  18. package/deps/rocksdb/rocksdb/db/column_family.cc +13 -0
  19. package/deps/rocksdb/rocksdb/db/column_family_test.cc +24 -36
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +18 -4
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +24 -6
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +6 -9
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +38 -40
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +4 -4
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -17
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -5
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +253 -24
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +9 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +3 -2
  30. package/deps/rocksdb/rocksdb/db/corruption_test.cc +67 -10
  31. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +83 -7
  32. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +5 -2
  33. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +68 -0
  34. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +40 -1
  35. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +94 -23
  36. package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.h +17 -4
  37. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +263 -58
  38. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +186 -23
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +43 -14
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +24 -28
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +116 -83
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +13 -5
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +71 -34
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +8 -3
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +72 -33
  46. package/deps/rocksdb/rocksdb/db/db_readonly_with_timestamp_test.cc +629 -0
  47. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +438 -10
  48. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +43 -2
  49. package/deps/rocksdb/rocksdb/db/db_test.cc +41 -1
  50. package/deps/rocksdb/rocksdb/db/db_test2.cc +41 -12
  51. package/deps/rocksdb/rocksdb/db/db_test_util.h +1 -0
  52. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +90 -0
  53. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +109 -16
  54. package/deps/rocksdb/rocksdb/db/dbformat.h +1 -1
  55. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +54 -0
  56. package/deps/rocksdb/rocksdb/db/flush_job.cc +3 -3
  57. package/deps/rocksdb/rocksdb/db/log_reader.cc +22 -4
  58. package/deps/rocksdb/rocksdb/db/log_reader.h +4 -0
  59. package/deps/rocksdb/rocksdb/db/memtable.cc +4 -0
  60. package/deps/rocksdb/rocksdb/db/post_memtable_callback.h +25 -0
  61. package/deps/rocksdb/rocksdb/db/repair.cc +1 -1
  62. package/deps/rocksdb/rocksdb/db/repair_test.cc +3 -2
  63. package/deps/rocksdb/rocksdb/db/snapshot_impl.h +65 -2
  64. package/deps/rocksdb/rocksdb/db/transaction_log_impl.cc +3 -2
  65. package/deps/rocksdb/rocksdb/db/version_set.cc +52 -0
  66. package/deps/rocksdb/rocksdb/db/version_set.h +57 -43
  67. package/deps/rocksdb/rocksdb/db/wal_manager.cc +14 -4
  68. package/deps/rocksdb/rocksdb/db/wal_manager.h +16 -0
  69. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.cc +141 -0
  70. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization.h +55 -0
  71. package/deps/rocksdb/rocksdb/db/wide/wide_column_serialization_test.cc +292 -0
  72. package/deps/rocksdb/rocksdb/db/write_thread.h +6 -1
  73. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +42 -19
  75. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +28 -0
  76. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +6 -2
  77. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +11 -5
  78. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +18 -12
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +74 -167
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +4 -9
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +16 -9
  82. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +117 -10
  83. package/deps/rocksdb/rocksdb/env/composite_env.cc +7 -0
  84. package/deps/rocksdb/rocksdb/env/env.cc +4 -0
  85. package/deps/rocksdb/rocksdb/env/env_posix.cc +3 -3
  86. package/deps/rocksdb/rocksdb/env/env_test.cc +5 -5
  87. package/deps/rocksdb/rocksdb/env/file_system_tracer.cc +45 -0
  88. package/deps/rocksdb/rocksdb/env/file_system_tracer.h +14 -0
  89. package/deps/rocksdb/rocksdb/env/fs_posix.cc +1 -1
  90. package/deps/rocksdb/rocksdb/env/io_posix.cc +50 -24
  91. package/deps/rocksdb/rocksdb/env/io_posix.h +9 -7
  92. package/deps/rocksdb/rocksdb/env/mock_env.cc +9 -3
  93. package/deps/rocksdb/rocksdb/file/file_util.cc +4 -1
  94. package/deps/rocksdb/rocksdb/file/filename.cc +14 -0
  95. package/deps/rocksdb/rocksdb/file/line_file_reader.cc +9 -4
  96. package/deps/rocksdb/rocksdb/file/line_file_reader.h +3 -2
  97. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +157 -0
  98. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -1
  99. package/deps/rocksdb/rocksdb/file/sequence_file_reader.cc +68 -32
  100. package/deps/rocksdb/rocksdb/file/sequence_file_reader.h +20 -6
  101. package/deps/rocksdb/rocksdb/file/writable_file_writer.cc +10 -6
  102. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +4 -2
  103. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +16 -0
  104. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +231 -2
  105. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +4 -2
  106. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +3 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +13 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/io_status.h +4 -20
  109. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +1 -1
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +31 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/snapshot.h +2 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +4 -20
  113. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +2 -2
  114. package/deps/rocksdb/rocksdb/include/rocksdb/trace_record.h +1 -0
  115. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  116. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction.h +34 -0
  117. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +36 -0
  118. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +74 -0
  119. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +36 -3
  120. package/deps/rocksdb/rocksdb/logging/auto_roll_logger_test.cc +16 -3
  121. package/deps/rocksdb/rocksdb/logging/env_logger.h +3 -3
  122. package/deps/rocksdb/rocksdb/logging/log_buffer.cc +2 -2
  123. package/deps/rocksdb/rocksdb/logging/log_buffer.h +1 -1
  124. package/deps/rocksdb/rocksdb/logging/posix_logger.h +3 -3
  125. package/deps/rocksdb/rocksdb/memory/arena.cc +0 -1
  126. package/deps/rocksdb/rocksdb/microbench/db_basic_bench.cc +61 -73
  127. package/deps/rocksdb/rocksdb/monitoring/histogram.cc +6 -5
  128. package/deps/rocksdb/rocksdb/monitoring/histogram_test.cc +6 -0
  129. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +7 -3
  130. package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -0
  131. package/deps/rocksdb/rocksdb/options/cf_options.h +3 -0
  132. package/deps/rocksdb/rocksdb/options/options.cc +4 -1
  133. package/deps/rocksdb/rocksdb/options/options_helper.cc +1 -0
  134. package/deps/rocksdb/rocksdb/options/options_parser.cc +1 -1
  135. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +1 -0
  136. package/deps/rocksdb/rocksdb/options/options_test.cc +4 -0
  137. package/deps/rocksdb/rocksdb/port/port_posix.h +0 -2
  138. package/deps/rocksdb/rocksdb/port/sys_time.h +27 -11
  139. package/deps/rocksdb/rocksdb/port/win/env_win.cc +1 -1
  140. package/deps/rocksdb/rocksdb/port/win/io_win.cc +16 -0
  141. package/deps/rocksdb/rocksdb/port/win/io_win.h +11 -2
  142. package/deps/rocksdb/rocksdb/port/win/port_win.cc +1 -1
  143. package/deps/rocksdb/rocksdb/port/win/port_win.h +2 -16
  144. package/deps/rocksdb/rocksdb/port/win/win_jemalloc.cc +2 -2
  145. package/deps/rocksdb/rocksdb/port/win/win_logger.cc +2 -2
  146. package/deps/rocksdb/rocksdb/rocksdb.pc.in +4 -5
  147. package/deps/rocksdb/rocksdb/src.mk +3 -0
  148. package/deps/rocksdb/rocksdb/table/block_based/block_based_filter_block.cc +7 -5
  149. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +39 -43
  150. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +2 -4
  151. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +42 -34
  152. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +1 -7
  153. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +2 -2
  154. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +2 -2
  155. package/deps/rocksdb/rocksdb/table/block_based/block_prefix_index.cc +7 -13
  156. package/deps/rocksdb/rocksdb/table/block_based/block_prefix_index.h +9 -5
  157. package/deps/rocksdb/rocksdb/table/block_based/block_type.h +5 -2
  158. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +4 -4
  159. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +6 -2
  160. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +8 -5
  161. package/deps/rocksdb/rocksdb/table/block_based/hash_index_reader.cc +2 -2
  162. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +19 -14
  163. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +2 -0
  164. package/deps/rocksdb/rocksdb/table/format.h +1 -3
  165. package/deps/rocksdb/rocksdb/table/get_context.cc +5 -0
  166. package/deps/rocksdb/rocksdb/table/multiget_context.h +3 -0
  167. package/deps/rocksdb/rocksdb/table/scoped_arena_iterator.h +3 -4
  168. package/deps/rocksdb/rocksdb/table/table_test.cc +1 -1
  169. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +102 -6
  170. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +1 -0
  171. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +19 -2
  172. package/deps/rocksdb/rocksdb/tools/trace_analyzer_test.cc +2 -1
  173. package/deps/rocksdb/rocksdb/tools/trace_analyzer_tool.cc +2 -1
  174. package/deps/rocksdb/rocksdb/util/aligned_buffer.h +2 -4
  175. package/deps/rocksdb/rocksdb/util/autovector.h +11 -1
  176. package/deps/rocksdb/rocksdb/util/cleanable.cc +1 -0
  177. package/deps/rocksdb/rocksdb/util/compression.h +5 -7
  178. package/deps/rocksdb/rocksdb/util/file_reader_writer_test.cc +14 -8
  179. package/deps/rocksdb/rocksdb/util/string_util.cc +1 -1
  180. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +33 -63
  181. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +1 -1
  182. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +3 -2
  183. package/deps/rocksdb/rocksdb/utilities/counted_fs.cc +14 -0
  184. package/deps/rocksdb/rocksdb/utilities/counted_fs.h +7 -1
  185. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.cc +7 -0
  186. package/deps/rocksdb/rocksdb/utilities/fault_injection_env.h +1 -0
  187. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +8 -0
  188. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +3 -0
  189. package/deps/rocksdb/rocksdb/utilities/persistent_cache/hash_table_bench.cc +6 -4
  190. package/deps/rocksdb/rocksdb/utilities/persistent_cache/volatile_tier_impl.h +2 -3
  191. package/deps/rocksdb/rocksdb/utilities/transactions/optimistic_transaction_test.cc +34 -21
  192. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +31 -7
  193. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +1 -0
  194. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +63 -0
  195. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +40 -0
  196. package/deps/rocksdb/rocksdb/utilities/transactions/timestamped_snapshot_test.cc +426 -0
  197. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.cc +37 -0
  198. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_base.h +6 -0
  199. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +16 -18
  200. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.h +18 -0
  201. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +61 -0
  202. package/deps/rocksdb/rocksdb.gyp +1 -0
  203. package/index.js +5 -2
  204. package/package.json +1 -1
  205. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  206. package/prebuilds/darwin-x64/node.napi.node +0 -0
  207. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -32,6 +32,7 @@
32
32
  #include "db/log_writer.h"
33
33
  #include "db/logs_with_prep_tracker.h"
34
34
  #include "db/memtable_list.h"
35
+ #include "db/post_memtable_callback.h"
35
36
  #include "db/pre_release_callback.h"
36
37
  #include "db/range_del_aggregator.h"
37
38
  #include "db/read_callback.h"
@@ -113,6 +114,68 @@ class Directories {
113
114
 
114
115
  FSDirectory* GetDbDir() { return db_dir_.get(); }
115
116
 
117
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
118
+ // Close db_dir_, wal_dir_ and all data_dirs_; return the first real error
119
+ IOStatus s = IOStatus::OK();
120
+ IOStatus temp_s = IOStatus::OK();
121
+
122
+ // The default implementation of Close() in the Directory/FSDirectory class
123
+ // returns "NotSupported" status; the upper level interface should be able to
124
+ // handle this error so that Close() does not fail after upgrading when
125
+ // run on FileSystems that have not implemented `Directory::Close()` or
126
+ // `FSDirectory::Close()` yet
127
+
128
+ if (db_dir_) {
129
+ temp_s = db_dir_->Close(options, dbg);
130
+ if (!temp_s.ok()) {
131
+ if (temp_s.IsNotSupported()) {
132
+ temp_s.PermitUncheckedError();
133
+ } else {
134
+ s = temp_s;
135
+ }
136
+ }
137
+ }
138
+
139
+ if (!s.ok()) {
140
+ return s;
141
+ }
142
+
143
+ if (wal_dir_) {
144
+ temp_s = wal_dir_->Close(options, dbg);
145
+ if (!temp_s.ok()) {
146
+ if (temp_s.IsNotSupported()) {
147
+ temp_s.PermitUncheckedError();
148
+ } else {
149
+ s = temp_s;
150
+ }
151
+ }
152
+ }
153
+
154
+ if (!s.ok()) {
155
+ return s;
156
+ }
157
+
158
+ if (data_dirs_.size() > 0 && s.ok()) {
159
+ for (auto& data_dir_ptr : data_dirs_) {
160
+ if (data_dir_ptr) {
161
+ temp_s = data_dir_ptr->Close(options, dbg);
162
+ if (!temp_s.ok()) {
163
+ if (temp_s.IsNotSupported()) {
164
+ temp_s.PermitUncheckedError();
165
+ } else {
166
+ return temp_s;
167
+ }
168
+ }
169
+ }
170
+ }
171
+ }
172
+
173
+ // Mark temp_s as checked when temp_s is still the initial status
174
+ // (IOStatus::OK(), not checked yet)
175
+ temp_s.PermitUncheckedError();
176
+ return s;
177
+ }
178
+
116
179
  private:
117
180
  std::unique_ptr<FSDirectory> db_dir_;
118
181
  std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
@@ -283,6 +346,19 @@ class DBImpl : public DB {
283
346
 
284
347
  virtual const Snapshot* GetSnapshot() override;
285
348
  virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
349
+ // Create a timestamped snapshot. This snapshot can be shared by multiple
350
+ // readers. If any of them uses it for write conflict checking, then
351
+ // is_write_conflict_boundary is true. For simplicity, set it to true by
352
+ // default.
353
+ std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
354
+ SequenceNumber snapshot_seq, uint64_t ts);
355
+ std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
356
+ void ReleaseTimestampedSnapshotsOlderThan(
357
+ uint64_t ts, size_t* remaining_total_ss = nullptr);
358
+ Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
359
+ std::vector<std::shared_ptr<const Snapshot>>&
360
+ timestamped_snapshots) const;
361
+
286
362
  using DB::GetProperty;
287
363
  virtual bool GetProperty(ColumnFamilyHandle* column_family,
288
364
  const Slice& property, std::string* value) override;
@@ -1160,6 +1236,8 @@ class DBImpl : public DB {
1160
1236
  static void TEST_ResetDbSessionIdGen();
1161
1237
  static std::string GenerateDbSessionId(Env* env);
1162
1238
 
1239
+ bool seq_per_batch() const { return seq_per_batch_; }
1240
+
1163
1241
  protected:
1164
1242
  const std::string dbname_;
1165
1243
  // TODO(peterd): unify with VersionSet::db_id_
@@ -1183,6 +1261,9 @@ class DBImpl : public DB {
1183
1261
  InstrumentedMutex trace_mutex_;
1184
1262
  BlockCacheTracer block_cache_tracer_;
1185
1263
 
1264
+ // constant false canceled flag, used when the compaction is not manual
1265
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
1266
+
1186
1267
  // State below is protected by mutex_
1187
1268
  // With two_write_queues enabled, some of the variables that accessed during
1188
1269
  // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
@@ -1207,9 +1288,6 @@ class DBImpl : public DB {
1207
1288
  // only used for dynamically adjusting max_total_wal_size. it is a sum of
1208
1289
  // [write_buffer_size * max_write_buffer_number] over all column families
1209
1290
  uint64_t max_total_in_memory_state_;
1210
- // If true, we have only one (default) column family. We use this to optimize
1211
- // some code-paths
1212
- bool single_column_family_mode_;
1213
1291
 
1214
1292
  // The options to access storage files
1215
1293
  const FileOptions file_options_;
@@ -1240,6 +1318,39 @@ class DBImpl : public DB {
1240
1318
 
1241
1319
  std::atomic<bool> shutting_down_;
1242
1320
 
1321
+ // RecoveryContext stores the context about version edits along
1322
+ // with corresponding column_family_data and column_family_options.
1323
+ class RecoveryContext {
1324
+ public:
1325
+ ~RecoveryContext() {
1326
+ for (auto& edit_list : edit_lists_) {
1327
+ for (auto* edit : edit_list) {
1328
+ delete edit;
1329
+ }
1330
+ }
1331
+ }
1332
+
1333
+ void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
1334
+ assert(cfd != nullptr);
1335
+ if (map_.find(cfd->GetID()) == map_.end()) {
1336
+ uint32_t size = static_cast<uint32_t>(map_.size());
1337
+ map_.emplace(cfd->GetID(), size);
1338
+ cfds_.emplace_back(cfd);
1339
+ mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
1340
+ edit_lists_.emplace_back(autovector<VersionEdit*>());
1341
+ }
1342
+ uint32_t i = map_[cfd->GetID()];
1343
+ edit_lists_[i].emplace_back(new VersionEdit(edit));
1344
+ }
1345
+
1346
+ std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
1347
+ autovector<ColumnFamilyData*> cfds_;
1348
+ autovector<const MutableCFOptions*> mutable_cf_opts_;
1349
+ autovector<autovector<VersionEdit*>> edit_lists_;
1350
+ // files_to_delete_ contains sst files
1351
+ std::unordered_set<std::string> files_to_delete_;
1352
+ };
1353
+
1243
1354
  // Except in DB::Open(), WriteOptionsFile can only be called when:
1244
1355
  // Persist options to options file.
1245
1356
  // If need_mutex_lock = false, the method will lock DB mutex.
@@ -1309,7 +1420,8 @@ class DBImpl : public DB {
1309
1420
  uint64_t* log_used = nullptr, uint64_t log_ref = 0,
1310
1421
  bool disable_memtable = false, uint64_t* seq_used = nullptr,
1311
1422
  size_t batch_cnt = 0,
1312
- PreReleaseCallback* pre_release_callback = nullptr);
1423
+ PreReleaseCallback* pre_release_callback = nullptr,
1424
+ PostMemTableCallback* post_memtable_callback = nullptr);
1313
1425
 
1314
1426
  Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
1315
1427
  WriteCallback* callback = nullptr,
@@ -1356,16 +1468,19 @@ class DBImpl : public DB {
1356
1468
  // be made to the descriptor are added to *edit.
1357
1469
  // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
1358
1470
  // skipped.
1471
+ // recovery_ctx stores the context about version edits and all those
1472
+ // edits are persisted to new Manifest after successfully syncing the new WAL.
1359
1473
  virtual Status Recover(
1360
1474
  const std::vector<ColumnFamilyDescriptor>& column_families,
1361
1475
  bool read_only = false, bool error_if_wal_file_exists = false,
1362
1476
  bool error_if_data_exists_in_wals = false,
1363
- uint64_t* recovered_seq = nullptr);
1477
+ uint64_t* recovered_seq = nullptr,
1478
+ RecoveryContext* recovery_ctx = nullptr);
1364
1479
 
1365
1480
  virtual bool OwnTablesAndLogs() const { return true; }
1366
1481
 
1367
1482
  // Set DB identity file, and write DB ID to manifest if necessary.
1368
- Status SetDBId(bool read_only);
1483
+ Status SetDBId(bool read_only, RecoveryContext* recovery_ctx);
1369
1484
 
1370
1485
  // REQUIRES: db mutex held when calling this function, but the db mutex can
1371
1486
  // be released and re-acquired. Db mutex will be held when the function
@@ -1374,20 +1489,31 @@ class DBImpl : public DB {
1374
1489
  // not referenced in the MANIFEST (e.g.
1375
1490
  // 1. It's best effort recovery;
1376
1491
  // 2. The VersionEdits referencing the SST files are appended to
1377
- // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are
1492
+ // RecoveryContext, DB crashes when syncing the MANIFEST, the VersionEdits are
1378
1493
  // still not synced to MANIFEST during recovery.)
1379
- // We delete these SST files. In the
1494
+ // It stores the SST files to be deleted in RecoveryContext. In the
1380
1495
  // meantime, we find out the largest file number present in the paths, and
1381
1496
  // bump up the version set's next_file_number_ to be 1 + largest_file_number.
1382
- Status DeleteUnreferencedSstFiles();
1497
+ // recovery_ctx stores the context about version edits and files to be
1498
+ // deleted. All those edits are persisted to new Manifest after successfully
1499
+ // syncing the new WAL.
1500
+ Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
1383
1501
 
1384
1502
  // SetDbSessionId() should be called in the constructor DBImpl()
1385
1503
  // to ensure that db_session_id_ gets updated every time the DB is opened
1386
1504
  void SetDbSessionId();
1387
1505
 
1388
1506
  Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
1389
- Status FailIfTsSizesMismatch(const ColumnFamilyHandle* column_family,
1390
- const Slice& ts) const;
1507
+ Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
1508
+ bool ts_for_read) const;
1509
+
1510
+ // recovery_ctx stores the context about version edits and
1511
+ // LogAndApplyForRecovery persist all those edits to new Manifest after
1512
+ // successfully syncing new WAL.
1513
+ // LogAndApplyForRecovery should be called only once during recovery and it
1514
+ // should be called when RocksDB writes to a first new MANIFEST since this
1515
+ // recovery.
1516
+ Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
1391
1517
 
1392
1518
  private:
1393
1519
  friend class DB;
@@ -1526,7 +1652,11 @@ class DBImpl : public DB {
1526
1652
  output_path_id(_output_path_id),
1527
1653
  exclusive(_exclusive),
1528
1654
  disallow_trivial_move(_disallow_trivial_move),
1529
- canceled(_canceled) {}
1655
+ canceled(_canceled ? *_canceled : canceled_internal_storage) {}
1656
+ // When _canceled is not provided by the user, we assign the reference of
1657
+ // canceled_internal_storage to it to consolidate canceled and
1658
+ // manual_compaction_paused since DisableManualCompaction() might be
1659
+ // called
1530
1660
 
1531
1661
  ColumnFamilyData* cfd;
1532
1662
  int input_level;
@@ -1543,7 +1673,12 @@ class DBImpl : public DB {
1543
1673
  InternalKey* manual_end = nullptr; // how far we are compacting
1544
1674
  InternalKey tmp_storage; // Used to keep track of compaction progress
1545
1675
  InternalKey tmp_storage1; // Used to keep track of compaction progress
1546
- std::atomic<bool>* canceled; // Compaction canceled by the user?
1676
+
1677
+ // When the user provides a canceled pointer in CompactRangeOptions, the
1678
+ // above variable is the reference of the user-provided
1679
+ // `canceled`, otherwise, it is the reference of canceled_internal_storage
1680
+ std::atomic<bool> canceled_internal_storage = false;
1681
+ std::atomic<bool>& canceled; // Compaction canceled pointer reference
1547
1682
  };
1548
1683
  struct PrepickedCompaction {
1549
1684
  // background compaction takes ownership of `compaction`.
@@ -1645,7 +1780,8 @@ class DBImpl : public DB {
1645
1780
  // corrupted_log_found is set to true if we recover from a corrupted log file.
1646
1781
  Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
1647
1782
  SequenceNumber* next_sequence, bool read_only,
1648
- bool* corrupted_log_found);
1783
+ bool* corrupted_log_found,
1784
+ RecoveryContext* recovery_ctx);
1649
1785
 
1650
1786
  // The following two methods are used to flush a memtable to
1651
1787
  // storage. The first one is used at database RecoveryTime (when the
@@ -1789,12 +1925,13 @@ class DBImpl : public DB {
1789
1925
  IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
1790
1926
  uint64_t* log_used, uint64_t* log_size,
1791
1927
  Env::IOPriority rate_limiter_priority,
1792
- bool with_db_mutex = false, bool with_log_mutex = false);
1928
+ LogFileNumberSize& log_file_number_size);
1793
1929
 
1794
1930
  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
1795
1931
  log::Writer* log_writer, uint64_t* log_used,
1796
1932
  bool need_log_sync, bool need_log_dir_sync,
1797
- SequenceNumber sequence);
1933
+ SequenceNumber sequence,
1934
+ LogFileNumberSize& log_file_number_size);
1798
1935
 
1799
1936
  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
1800
1937
  uint64_t* log_used,
@@ -1919,10 +2056,24 @@ class DBImpl : public DB {
1919
2056
  SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
1920
2057
  bool lock = true);
1921
2058
 
2059
+ // If snapshot_seq != kMaxSequenceNumber, then this function can only be
2060
+ // called from the write thread that publishes sequence numbers to readers.
2061
+ // For 1) write-committed, or 2) write-prepared + one-write-queue, this will
2062
+ // be the write thread performing memtable writes. For write-prepared with
2063
+ // two write queues, this will be the write thread writing commit marker to
2064
+ // the WAL.
2065
+ // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
2066
+ // ensuring no writes to the database.
2067
+ std::pair<Status, std::shared_ptr<const SnapshotImpl>>
2068
+ CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
2069
+ bool lock = true);
2070
+
1922
2071
  uint64_t GetMaxTotalWalSize() const;
1923
2072
 
1924
2073
  FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
1925
2074
 
2075
+ Status MaybeReleaseTimestampedSnapshotsAndCheck();
2076
+
1926
2077
  Status CloseHelper();
1927
2078
 
1928
2079
  void WaitForBackgroundWork();
@@ -2124,11 +2275,7 @@ class DBImpl : public DB {
2124
2275
  // are protected by locking both mutex_ and log_write_mutex_, and reads must
2125
2276
  // be under either mutex_ or log_write_mutex_.
2126
2277
  std::deque<LogFileNumberSize> alive_log_files_;
2127
- // Caching the result of `alive_log_files_.back()` so that we do not have to
2128
- // call `alive_log_files_.back()` in the write thread (WriteToWAL()) which
2129
- // requires locking db mutex if log_mutex_ is not already held in
2130
- // two-write-queues mode.
2131
- std::deque<LogFileNumberSize>::reverse_iterator alive_log_files_tail_;
2278
+
2132
2279
  // Log files that aren't fully synced, and the current log file.
2133
2280
  // Synchronization:
2134
2281
  // - push_back() is done from write_thread_ with locked mutex_ and
@@ -2192,6 +2339,8 @@ class DBImpl : public DB {
2192
2339
 
2193
2340
  SnapshotList snapshots_;
2194
2341
 
2342
+ TimestampedSnapshotList timestamped_snapshots_;
2343
+
2195
2344
  // For each background job, pending_outputs_ keeps the current file number at
2196
2345
  // the time that background job started.
2197
2346
  // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
@@ -2479,8 +2628,9 @@ inline Status DBImpl::FailIfCfHasTs(
2479
2628
  return Status::OK();
2480
2629
  }
2481
2630
 
2482
- inline Status DBImpl::FailIfTsSizesMismatch(
2483
- const ColumnFamilyHandle* column_family, const Slice& ts) const {
2631
+ inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
2632
+ const Slice& ts,
2633
+ bool ts_for_read) const {
2484
2634
  if (!column_family) {
2485
2635
  return Status::InvalidArgument("column family handle cannot be null");
2486
2636
  }
@@ -2500,6 +2650,19 @@ inline Status DBImpl::FailIfTsSizesMismatch(
2500
2650
  << ts_sz << " given";
2501
2651
  return Status::InvalidArgument(oss.str());
2502
2652
  }
2653
+ if (ts_for_read) {
2654
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
2655
+ auto cfd = cfh->cfd();
2656
+ std::string current_ts_low = cfd->GetFullHistoryTsLow();
2657
+ if (!current_ts_low.empty() &&
2658
+ ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
2659
+ std::stringstream oss;
2660
+ oss << "Read timestamp: " << ts.ToString(true)
2661
+ << " is smaller than full_history_ts_low: "
2662
+ << Slice(current_ts_low).ToString(true) << std::endl;
2663
+ return Status::InvalidArgument(oss.str());
2664
+ }
2665
+ }
2503
2666
  return Status::OK();
2504
2667
  }
2505
2668
 
@@ -952,6 +952,8 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
952
952
  VersionEdit edit;
953
953
  edit.SetColumnFamily(cfd->GetID());
954
954
  edit.SetFullHistoryTsLow(ts_low);
955
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
956
+ &edit);
955
957
 
956
958
  InstrumentedMutexLock l(&mutex_);
957
959
  std::string current_ts_low = cfd->GetFullHistoryTsLow();
@@ -959,12 +961,25 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
959
961
  assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
960
962
  if (!current_ts_low.empty() &&
961
963
  ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
962
- return Status::InvalidArgument(
963
- "Cannot decrease full_history_timestamp_low");
964
+ return Status::InvalidArgument("Cannot decrease full_history_ts_low");
964
965
  }
965
966
 
966
- return versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
967
- &mutex_);
967
+ Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
968
+ &edit, &mutex_);
969
+ if (!s.ok()) {
970
+ return s;
971
+ }
972
+ current_ts_low = cfd->GetFullHistoryTsLow();
973
+ if (!current_ts_low.empty() &&
974
+ ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
975
+ std::stringstream oss;
976
+ oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
977
+ << " is set to be higher than the requested "
978
+ "timestamp: "
979
+ << Slice(ts_low).ToString(true) << std::endl;
980
+ return Status::TryAgain(oss.str());
981
+ }
982
+ return Status::OK();
968
983
  }
969
984
 
970
985
  Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
@@ -1217,6 +1232,10 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
1217
1232
 
1218
1233
  // Perform CompactFiles
1219
1234
  TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
1235
+ TEST_SYNC_POINT_CALLBACK(
1236
+ "TestCompactFiles:PausingManualCompaction:3",
1237
+ reinterpret_cast<void*>(
1238
+ const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
1220
1239
  {
1221
1240
  InstrumentedMutexLock l(&mutex_);
1222
1241
 
@@ -1372,7 +1391,7 @@ Status DBImpl::CompactFilesImpl(
1372
1391
  c->mutable_cf_options()->paranoid_file_checks,
1373
1392
  c->mutable_cf_options()->report_bg_io_stats, dbname_,
1374
1393
  &compaction_job_stats, Env::Priority::USER, io_tracer_,
1375
- &manual_compaction_paused_, nullptr, db_id_, db_session_id_,
1394
+ kManualCompactionCanceledFalse_, db_id_, db_session_id_,
1376
1395
  c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
1377
1396
  &blob_callback_);
1378
1397
 
@@ -1838,8 +1857,7 @@ Status DBImpl::RunManualCompaction(
1838
1857
  // and `CompactRangeOptions::canceled` might not work well together.
1839
1858
  while (bg_bottom_compaction_scheduled_ > 0 ||
1840
1859
  bg_compaction_scheduled_ > 0) {
1841
- if (manual_compaction_paused_ > 0 ||
1842
- (manual.canceled != nullptr && *manual.canceled == true)) {
1860
+ if (manual_compaction_paused_ > 0 || manual.canceled == true) {
1843
1861
  // Pretend the error came from compaction so the below cleanup/error
1844
1862
  // handling code can process it.
1845
1863
  manual.done = true;
@@ -2376,10 +2394,18 @@ Status DBImpl::EnableAutoCompaction(
2376
2394
  return s;
2377
2395
  }
2378
2396
 
2397
+ // NOTE: Calling DisableManualCompaction() may overwrite the
2398
+ // user-provided canceled variable in CompactRangeOptions
2379
2399
  void DBImpl::DisableManualCompaction() {
2380
2400
  InstrumentedMutexLock l(&mutex_);
2381
2401
  manual_compaction_paused_.fetch_add(1, std::memory_order_release);
2382
2402
 
2403
+ // Mark `canceled` as true when the cancellation is triggered by
2404
+ // manual_compaction_paused (may overwrite user-provided `canceled`)
2405
+ for (const auto& manual_compaction : manual_compaction_dequeue_) {
2406
+ manual_compaction->canceled = true;
2407
+ }
2408
+
2383
2409
  // Wake up manual compactions waiting to start.
2384
2410
  bg_cv_.SignalAll();
2385
2411
 
@@ -2392,6 +2418,11 @@ void DBImpl::DisableManualCompaction() {
2392
2418
  }
2393
2419
  }
2394
2420
 
2421
+ // NOTE: In contrast to DisableManualCompaction(), calling
2422
+ // EnableManualCompaction() does NOT overwrite the user-provided *canceled
2423
+ // variable to be false since there is NO CHANCE a canceled compaction
2424
+ // is uncanceled. In other words, a canceled compaction must have been
2425
+ // dropped out of the manual compaction queue, when we disable it.
2395
2426
  void DBImpl::EnableManualCompaction() {
2396
2427
  InstrumentedMutexLock l(&mutex_);
2397
2428
  assert(manual_compaction_paused_ > 0);
@@ -3037,10 +3068,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3037
3068
  if (shutting_down_.load(std::memory_order_acquire)) {
3038
3069
  status = Status::ShutdownInProgress();
3039
3070
  } else if (is_manual &&
3040
- manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
3041
- status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
3042
- } else if (is_manual && manual_compaction->canceled &&
3043
- manual_compaction->canceled->load(std::memory_order_acquire)) {
3071
+ manual_compaction->canceled.load(std::memory_order_acquire)) {
3044
3072
  status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
3045
3073
  }
3046
3074
  } else {
@@ -3357,6 +3385,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3357
3385
  GetSnapshotContext(job_context, &snapshot_seqs,
3358
3386
  &earliest_write_conflict_snapshot, &snapshot_checker);
3359
3387
  assert(is_snapshot_supported_ || snapshots_.empty());
3388
+
3360
3389
  CompactionJob compaction_job(
3361
3390
  job_context->job_id, c.get(), immutable_db_options_,
3362
3391
  mutable_db_options_, file_options_for_compaction_, versions_.get(),
@@ -3368,9 +3397,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3368
3397
  c->mutable_cf_options()->paranoid_file_checks,
3369
3398
  c->mutable_cf_options()->report_bg_io_stats, dbname_,
3370
3399
  &compaction_job_stats, thread_pri, io_tracer_,
3371
- is_manual ? &manual_compaction_paused_ : nullptr,
3372
- is_manual ? manual_compaction->canceled : nullptr, db_id_,
3373
- db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
3400
+ is_manual ? manual_compaction->canceled
3401
+ : kManualCompactionCanceledFalse_,
3402
+ db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
3374
3403
  c->trim_ts(), &blob_callback_);
3375
3404
  compaction_job.Prepare();
3376
3405
 
@@ -166,8 +166,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
166
166
  job_context->log_number = MinLogNumberToKeep();
167
167
  job_context->prev_log_number = versions_->prev_log_number();
168
168
 
169
- versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
170
169
  if (doing_the_full_scan) {
170
+ versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
171
171
  InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
172
172
  dbname_);
173
173
  std::set<std::string> paths;
@@ -242,6 +242,14 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
242
242
  log_file, immutable_db_options_.db_log_dir);
243
243
  }
244
244
  }
245
+ } else {
246
+ // Instead of filling job_context->sst_live and job_context->blob_live,
247
+ // directly remove files that show up in any Version. This is because
248
+ // candidate files tend to be a small percentage of all files, so it is
249
+ // usually cheaper to check them against every version, compared to
250
+ // building a map for all files.
251
+ versions_->RemoveLiveFiles(job_context->sst_delete_files,
252
+ job_context->blob_delete_files);
245
253
  }
246
254
 
247
255
  // logs_ is empty when called during recovery, in which case there can't yet
@@ -395,8 +403,10 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
395
403
  state.manifest_delete_files.size());
396
404
  // We may ignore the dbname when generating the file names.
397
405
  for (auto& file : state.sst_delete_files) {
398
- candidate_files.emplace_back(
399
- MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
406
+ if (!file.only_delete_metadata) {
407
+ candidate_files.emplace_back(
408
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
409
+ }
400
410
  if (file.metadata->table_reader_handle) {
401
411
  table_cache_->Release(file.metadata->table_reader_handle);
402
412
  }
@@ -863,7 +873,7 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
863
873
  return min_log_number_to_keep;
864
874
  }
865
875
 
866
- Status DBImpl::SetDBId(bool read_only) {
876
+ Status DBImpl::SetDBId(bool read_only, RecoveryContext* recovery_ctx) {
867
877
  Status s;
868
878
  // Happens when immutable_db_options_.write_dbid_to_manifest is set to true
869
879
  // the very first time.
@@ -890,14 +900,14 @@ Status DBImpl::SetDBId(bool read_only) {
890
900
  }
891
901
  s = GetDbIdentityFromIdentityFile(&db_id_);
892
902
  if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
903
+ assert(!read_only);
904
+ assert(recovery_ctx != nullptr);
905
+ assert(versions_->GetColumnFamilySet() != nullptr);
893
906
  VersionEdit edit;
894
907
  edit.SetDBId(db_id_);
895
- Options options;
896
- MutableCFOptions mutable_cf_options(options);
897
908
  versions_->db_id_ = db_id_;
898
- s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
899
- mutable_cf_options, &edit, &mutex_, nullptr,
900
- /* new_descriptor_log */ false);
909
+ recovery_ctx->UpdateVersionEdits(
910
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
901
911
  }
902
912
  } else if (!read_only) {
903
913
  s = SetIdentityFile(env_, dbname_, db_id_);
@@ -905,7 +915,7 @@ Status DBImpl::SetDBId(bool read_only) {
905
915
  return s;
906
916
  }
907
917
 
908
- Status DBImpl::DeleteUnreferencedSstFiles() {
918
+ Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
909
919
  mutex_.AssertHeld();
910
920
  std::vector<std::string> paths;
911
921
  paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
@@ -925,7 +935,6 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
925
935
 
926
936
  uint64_t next_file_number = versions_->current_next_file_number();
927
937
  uint64_t largest_file_number = next_file_number;
928
- std::set<std::string> files_to_delete;
929
938
  Status s;
930
939
  for (const auto& path : paths) {
931
940
  std::vector<std::string> files;
@@ -943,8 +952,9 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
943
952
  const std::string normalized_fpath = path + fname;
944
953
  largest_file_number = std::max(largest_file_number, number);
945
954
  if (type == kTableFile && number >= next_file_number &&
946
- files_to_delete.find(normalized_fpath) == files_to_delete.end()) {
947
- files_to_delete.insert(normalized_fpath);
955
+ recovery_ctx->files_to_delete_.find(normalized_fpath) ==
956
+ recovery_ctx->files_to_delete_.end()) {
957
+ recovery_ctx->files_to_delete_.emplace(normalized_fpath);
948
958
  }
949
959
  }
950
960
  }
@@ -961,21 +971,7 @@ Status DBImpl::DeleteUnreferencedSstFiles() {
961
971
  assert(versions_->GetColumnFamilySet());
962
972
  ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
963
973
  assert(default_cfd);
964
- s = versions_->LogAndApply(
965
- default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
966
- directories_.GetDbDir(), /*new_descriptor_log*/ false);
967
- if (!s.ok()) {
968
- return s;
969
- }
970
-
971
- mutex_.Unlock();
972
- for (const auto& fname : files_to_delete) {
973
- s = env_->DeleteFile(fname);
974
- if (!s.ok()) {
975
- break;
976
- }
977
- }
978
- mutex_.Lock();
974
+ recovery_ctx->UpdateVersionEdits(default_cfd, edit);
979
975
  return s;
980
976
  }
981
977