@nxtedition/rocksdb 11.0.2 → 11.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/binding.cc +133 -122
  2. package/deps/rocksdb/rocksdb/db/column_family_test.cc +15 -7
  3. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +4 -2
  4. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -4
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +11 -7
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +17 -11
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +15 -0
  8. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +155 -0
  9. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +564 -461
  10. package/deps/rocksdb/rocksdb/db/db_follower_test.cc +8 -4
  11. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +40 -24
  12. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +8 -1
  13. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +7 -4
  14. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +5 -0
  15. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -1
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +19 -1
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +20 -16
  18. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +27 -0
  19. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +10 -2
  20. package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +85 -0
  21. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +55 -2
  22. package/deps/rocksdb/rocksdb/db/db_test2.cc +231 -0
  23. package/deps/rocksdb/rocksdb/db/db_test_util.cc +5 -0
  24. package/deps/rocksdb/rocksdb/db/db_test_util.h +10 -1
  25. package/deps/rocksdb/rocksdb/db/db_universal_compaction_test.cc +0 -1
  26. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +175 -1
  27. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +64 -0
  28. package/deps/rocksdb/rocksdb/db/dbformat.h +5 -6
  29. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +8 -8
  30. package/deps/rocksdb/rocksdb/db/experimental.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +2 -4
  32. package/deps/rocksdb/rocksdb/db/flush_job.cc +7 -2
  33. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +4 -2
  34. package/deps/rocksdb/rocksdb/db/listener_test.cc +5 -5
  35. package/deps/rocksdb/rocksdb/db/log_writer.cc +12 -3
  36. package/deps/rocksdb/rocksdb/db/memtable.cc +83 -23
  37. package/deps/rocksdb/rocksdb/db/memtable.h +11 -3
  38. package/deps/rocksdb/rocksdb/db/memtable_list.cc +7 -5
  39. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +21 -0
  40. package/deps/rocksdb/rocksdb/db/version_builder.cc +462 -33
  41. package/deps/rocksdb/rocksdb/db/version_builder.h +70 -23
  42. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +95 -207
  43. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +54 -35
  44. package/deps/rocksdb/rocksdb/db/version_set.cc +13 -11
  45. package/deps/rocksdb/rocksdb/db/version_set_test.cc +313 -59
  46. package/deps/rocksdb/rocksdb/db/write_batch.cc +124 -64
  47. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -3
  48. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_compaction_filter.h +1 -1
  49. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +4 -1
  50. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +9 -0
  51. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_listener.h +4 -32
  52. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +7 -3
  53. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +60 -172
  54. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +57 -2
  55. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +23 -15
  56. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.h +2 -3
  57. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.cc +1 -1
  58. package/deps/rocksdb/rocksdb/db_stress_tool/expected_value.h +4 -1
  59. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +200 -92
  60. package/deps/rocksdb/rocksdb/env/file_system.cc +3 -3
  61. package/deps/rocksdb/rocksdb/file/delete_scheduler.cc +124 -23
  62. package/deps/rocksdb/rocksdb/file/delete_scheduler.h +61 -8
  63. package/deps/rocksdb/rocksdb/file/delete_scheduler_test.cc +141 -2
  64. package/deps/rocksdb/rocksdb/file/file_util.cc +17 -2
  65. package/deps/rocksdb/rocksdb/file/file_util.h +10 -0
  66. package/deps/rocksdb/rocksdb/file/filename.cc +11 -3
  67. package/deps/rocksdb/rocksdb/file/filename.h +2 -1
  68. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +18 -0
  69. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +27 -4
  70. package/deps/rocksdb/rocksdb/file/writable_file_writer.h +8 -1
  71. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +8 -13
  72. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +4 -0
  73. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +5 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +5 -2
  75. package/deps/rocksdb/rocksdb/include/rocksdb/filter_policy.h +2 -1
  76. package/deps/rocksdb/rocksdb/include/rocksdb/memtablerep.h +34 -0
  77. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table.h +27 -9
  80. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +2 -0
  81. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +12 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/transaction_db.h +21 -0
  83. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  84. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +29 -1
  85. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +102 -33
  86. package/deps/rocksdb/rocksdb/memtable/skiplistrep.cc +46 -3
  87. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +4 -0
  88. package/deps/rocksdb/rocksdb/options/cf_options.cc +6 -0
  89. package/deps/rocksdb/rocksdb/options/cf_options.h +2 -0
  90. package/deps/rocksdb/rocksdb/options/db_options.cc +15 -1
  91. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  92. package/deps/rocksdb/rocksdb/options/options_helper.cc +10 -0
  93. package/deps/rocksdb/rocksdb/options/options_parser.cc +3 -2
  94. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +9 -2
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +75 -35
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +6 -0
  97. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +4 -0
  98. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +8 -1
  99. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +40 -15
  100. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +98 -17
  101. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +14 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +21 -91
  103. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +13 -21
  104. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +14 -5
  105. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +62 -53
  106. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +60 -38
  107. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +175 -78
  108. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +65 -36
  109. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +25 -15
  110. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +13 -1
  111. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +18 -4
  112. package/deps/rocksdb/rocksdb/table/meta_blocks.h +4 -0
  113. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -0
  114. package/deps/rocksdb/rocksdb/utilities/blob_db/blob_db_test.cc +2 -2
  115. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +47 -18
  116. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.h +1 -2
  117. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +95 -0
  118. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +26 -15
  119. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.h +62 -19
  120. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.cc +73 -34
  121. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction.h +5 -0
  122. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +10 -3
  123. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +2 -1
  124. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.cc +8 -5
  125. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_util.h +7 -4
  126. package/deps/rocksdb/rocksdb/utilities/transactions/write_committed_transaction_ts_test.cc +225 -0
  127. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn.cc +2 -1
  128. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_txn_db.h +17 -0
  129. package/deps/rocksdb/rocksdb/utilities/transactions/write_unprepared_txn.cc +5 -2
  130. package/index.js +5 -17
  131. package/iterator.js +9 -1
  132. package/package.json +1 -1
  133. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  134. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -31,6 +31,7 @@ DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
31
31
  total_trash_size_(0),
32
32
  rate_bytes_per_sec_(rate_bytes_per_sec),
33
33
  pending_files_(0),
34
+ next_trash_bucket_(0),
34
35
  bytes_max_delete_chunk_(bytes_max_delete_chunk),
35
36
  closing_(false),
36
37
  cv_(&mu_),
@@ -66,10 +67,8 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
66
67
  total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) {
67
68
  // Rate limiting is disabled or trash size makes up more than
68
69
  // max_trash_db_ratio_ (default 25%) of the total DB size
69
- TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
70
- Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
70
+ Status s = DeleteFileImmediately(file_path, /*accounted=*/true);
71
71
  if (s.ok()) {
72
- s = sst_file_manager_->OnDeleteFile(file_path);
73
72
  ROCKS_LOG_INFO(info_log_,
74
73
  "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
75
74
  ", total_trash_size %" PRIu64 ", total_size %" PRIi64
@@ -77,15 +76,57 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
77
76
  file_path.c_str(), rate_bytes_per_sec_.load(),
78
77
  total_trash_size_.load(), total_size,
79
78
  max_trash_db_ratio_.load());
80
- InstrumentedMutexLock l(&mu_);
81
- RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
82
79
  }
83
80
  return s;
84
81
  }
82
+ return AddFileToDeletionQueue(file_path, dir_to_sync, /*bucket=*/std::nullopt,
83
+ /*accounted=*/true);
84
+ }
85
+
86
+ Status DeleteScheduler::DeleteUnaccountedFile(const std::string& file_path,
87
+ const std::string& dir_to_sync,
88
+ const bool force_bg,
89
+ std::optional<int32_t> bucket) {
90
+ uint64_t num_hard_links = 1;
91
+ fs_->NumFileLinks(file_path, IOOptions(), &num_hard_links, nullptr)
92
+ .PermitUncheckedError();
93
+
94
+ // We can tolerate rare races where we might immediately delete both links
95
+ // to a file.
96
+ if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && num_hard_links > 1)) {
97
+ Status s = DeleteFileImmediately(file_path, /*accounted=*/false);
98
+ if (s.ok()) {
99
+ ROCKS_LOG_INFO(info_log_,
100
+ "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64,
101
+ file_path.c_str(), rate_bytes_per_sec_.load());
102
+ }
103
+ return s;
104
+ }
105
+ return AddFileToDeletionQueue(file_path, dir_to_sync, bucket,
106
+ /*accounted=*/false);
107
+ }
85
108
 
109
+ Status DeleteScheduler::DeleteFileImmediately(const std::string& file_path,
110
+ bool accounted) {
111
+ TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
112
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteFile::cb",
113
+ const_cast<std::string*>(&file_path));
114
+ Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
115
+ if (s.ok()) {
116
+ s = OnDeleteFile(file_path, accounted);
117
+ InstrumentedMutexLock l(&mu_);
118
+ RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
119
+ }
120
+ return s;
121
+ }
122
+
123
+ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
124
+ const std::string& dir_to_sync,
125
+ std::optional<int32_t> bucket,
126
+ bool accounted) {
86
127
  // Move file to trash
87
128
  std::string trash_file;
88
- Status s = MarkAsTrash(file_path, &trash_file);
129
+ Status s = MarkAsTrash(file_path, accounted, &trash_file);
89
130
  ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
90
131
  s.ToString().c_str());
91
132
 
@@ -94,7 +135,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
94
135
  file_path.c_str(), s.ToString().c_str());
95
136
  s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
96
137
  if (s.ok()) {
97
- s = sst_file_manager_->OnDeleteFile(file_path);
138
+ s = OnDeleteFile(file_path, accounted);
98
139
  ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
99
140
  trash_file.c_str());
100
141
  InstrumentedMutexLock l(&mu_);
@@ -104,11 +145,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
104
145
  }
105
146
 
106
147
  // Update the total trash size
107
- uint64_t trash_file_size = 0;
108
- IOStatus io_s =
109
- fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
110
- if (io_s.ok()) {
111
- total_trash_size_.fetch_add(trash_file_size);
148
+ if (accounted) {
149
+ uint64_t trash_file_size = 0;
150
+ IOStatus io_s =
151
+ fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
152
+ if (io_s.ok()) {
153
+ total_trash_size_.fetch_add(trash_file_size);
154
+ }
112
155
  }
113
156
  //**TODO: What should we do if we failed to
114
157
  // get the file size?
@@ -117,8 +160,15 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
117
160
  {
118
161
  InstrumentedMutexLock l(&mu_);
119
162
  RecordTick(stats_.get(), FILES_MARKED_TRASH);
120
- queue_.emplace(trash_file, dir_to_sync);
163
+ queue_.emplace(trash_file, dir_to_sync, accounted, bucket);
121
164
  pending_files_++;
165
+ if (bucket.has_value()) {
166
+ auto iter = pending_files_in_buckets_.find(bucket.value());
167
+ assert(iter != pending_files_in_buckets_.end());
168
+ if (iter != pending_files_in_buckets_.end()) {
169
+ iter->second++;
170
+ }
171
+ }
122
172
  if (pending_files_ == 1) {
123
173
  cv_.SignalAll();
124
174
  }
@@ -177,7 +227,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
177
227
  }
178
228
 
179
229
  Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
180
- std::string* trash_file) {
230
+ bool accounted, std::string* trash_file) {
181
231
  // Sanity check of the path
182
232
  size_t idx = file_path.rfind('/');
183
233
  if (idx == std::string::npos || idx == file_path.size() - 1) {
@@ -211,7 +261,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
211
261
  }
212
262
  cnt++;
213
263
  }
214
- if (s.ok()) {
264
+ if (s.ok() && accounted) {
215
265
  s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
216
266
  }
217
267
  return s;
@@ -235,6 +285,8 @@ void DeleteScheduler::BackgroundEmptyTrash() {
235
285
  uint64_t total_deleted_bytes = 0;
236
286
  int64_t current_delete_rate = rate_bytes_per_sec_.load();
237
287
  while (!queue_.empty() && !closing_) {
288
+ // Satisfy static analysis.
289
+ std::optional<int32_t> bucket = std::nullopt;
238
290
  if (current_delete_rate != rate_bytes_per_sec_.load()) {
239
291
  // User changed the delete rate
240
292
  current_delete_rate = rate_bytes_per_sec_.load();
@@ -247,14 +299,17 @@ void DeleteScheduler::BackgroundEmptyTrash() {
247
299
  // Get new file to delete
248
300
  const FileAndDir& fad = queue_.front();
249
301
  std::string path_in_trash = fad.fname;
302
+ std::string dir_to_sync = fad.dir;
303
+ bool accounted = fad.accounted;
304
+ bucket = fad.bucket;
250
305
 
251
306
  // We don't need to hold the lock while deleting the file
252
307
  mu_.Unlock();
253
308
  uint64_t deleted_bytes = 0;
254
309
  bool is_complete = true;
255
310
  // Delete file from trash and update total_penalty value
256
- Status s =
257
- DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
311
+ Status s = DeleteTrashFile(path_in_trash, dir_to_sync, accounted,
312
+ &deleted_bytes, &is_complete);
258
313
  total_deleted_bytes += deleted_bytes;
259
314
  mu_.Lock();
260
315
  if (is_complete) {
@@ -288,12 +343,20 @@ void DeleteScheduler::BackgroundEmptyTrash() {
288
343
  TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
289
344
  &total_penalty);
290
345
 
346
+ int32_t pending_files_in_bucket = std::numeric_limits<int32_t>::max();
291
347
  if (is_complete) {
292
348
  pending_files_--;
349
+ if (bucket.has_value()) {
350
+ auto iter = pending_files_in_buckets_.find(bucket.value());
351
+ assert(iter != pending_files_in_buckets_.end());
352
+ if (iter != pending_files_in_buckets_.end()) {
353
+ pending_files_in_bucket = iter->second--;
354
+ }
355
+ }
293
356
  }
294
- if (pending_files_ == 0) {
295
- // Unblock WaitForEmptyTrash since there are no more files waiting
296
- // to be deleted
357
+ if (pending_files_ == 0 || pending_files_in_bucket == 0) {
358
+ // Unblock WaitForEmptyTrash or WaitForEmptyTrashBucket since there are
359
+ // no more files waiting to be deleted
297
360
  cv_.SignalAll();
298
361
  }
299
362
  }
@@ -302,12 +365,14 @@ void DeleteScheduler::BackgroundEmptyTrash() {
302
365
 
303
366
  Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
304
367
  const std::string& dir_to_sync,
305
- uint64_t* deleted_bytes,
368
+ bool accounted, uint64_t* deleted_bytes,
306
369
  bool* is_complete) {
307
370
  uint64_t file_size;
308
371
  Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
309
372
  *is_complete = true;
310
373
  TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
374
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteTrashFile::cb",
375
+ const_cast<std::string*>(&path_in_trash));
311
376
  if (s.ok()) {
312
377
  bool need_full_delete = true;
313
378
  if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
@@ -374,7 +439,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
374
439
  }
375
440
  if (s.ok()) {
376
441
  *deleted_bytes = file_size;
377
- s = sst_file_manager_->OnDeleteFile(path_in_trash);
442
+ s = OnDeleteFile(path_in_trash, accounted);
378
443
  }
379
444
  }
380
445
  }
@@ -384,12 +449,24 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
384
449
  path_in_trash.c_str(), s.ToString().c_str());
385
450
  *deleted_bytes = 0;
386
451
  } else {
387
- total_trash_size_.fetch_sub(*deleted_bytes);
452
+ if (accounted) {
453
+ total_trash_size_.fetch_sub(*deleted_bytes);
454
+ }
388
455
  }
389
456
 
390
457
  return s;
391
458
  }
392
459
 
460
+ Status DeleteScheduler::OnDeleteFile(const std::string& file_path,
461
+ bool accounted) {
462
+ if (accounted) {
463
+ return sst_file_manager_->OnDeleteFile(file_path);
464
+ }
465
+ TEST_SYNC_POINT_CALLBACK("DeleteScheduler::OnDeleteFile",
466
+ const_cast<std::string*>(&file_path));
467
+ return Status::OK();
468
+ }
469
+
393
470
  void DeleteScheduler::WaitForEmptyTrash() {
394
471
  InstrumentedMutexLock l(&mu_);
395
472
  while (pending_files_ > 0 && !closing_) {
@@ -397,6 +474,30 @@ void DeleteScheduler::WaitForEmptyTrash() {
397
474
  }
398
475
  }
399
476
 
477
+ std::optional<int32_t> DeleteScheduler::NewTrashBucket() {
478
+ if (rate_bytes_per_sec_.load() <= 0) {
479
+ return std::nullopt;
480
+ }
481
+ InstrumentedMutexLock l(&mu_);
482
+ int32_t bucket_number = next_trash_bucket_++;
483
+ pending_files_in_buckets_.emplace(bucket_number, 0);
484
+ return bucket_number;
485
+ }
486
+
487
+ void DeleteScheduler::WaitForEmptyTrashBucket(int32_t bucket) {
488
+ InstrumentedMutexLock l(&mu_);
489
+ if (bucket >= next_trash_bucket_) {
490
+ return;
491
+ }
492
+ auto iter = pending_files_in_buckets_.find(bucket);
493
+ while (iter != pending_files_in_buckets_.end() && iter->second > 0 &&
494
+ !closing_) {
495
+ cv_.Wait();
496
+ iter = pending_files_in_buckets_.find(bucket);
497
+ }
498
+ pending_files_in_buckets_.erase(bucket);
499
+ }
500
+
400
501
  void DeleteScheduler::MaybeCreateBackgroundThread() {
401
502
  if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
402
503
  bg_thread_.reset(
@@ -7,6 +7,7 @@
7
7
 
8
8
 
9
9
  #include <map>
10
+ #include <optional>
10
11
  #include <queue>
11
12
  #include <string>
12
13
  #include <thread>
@@ -48,16 +49,45 @@ class DeleteScheduler {
48
49
  MaybeCreateBackgroundThread();
49
50
  }
50
51
 
51
- // Mark file as trash directory and schedule its deletion. If force_bg is
52
- // set, it forces the file to always be deleted in the background thread,
53
- // except when rate limiting is disabled
52
+ // Delete an accounted file that is tracked by `SstFileManager` and should be
53
+ // tracked by this `DeleteScheduler` when it's deleted.
54
+ // The file is deleted immediately if slow deletion is disabled. If force_bg
55
+ // is not set and trash to db size ratio exceeded the configured threshold,
56
+ // it is immediately deleted too. In all other cases, the file will be moved
57
+ // to a trash directory and scheduled for deletion by a background thread.
54
58
  Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
55
59
  const bool force_bg = false);
56
60
 
57
- // Wait for all files being deleteing in the background to finish or for
61
+ // Delete an unaccounted file that is not tracked by `SstFileManager` and
62
+ // should not be tracked by this `DeleteScheduler` when it's deleted.
63
+ // The file is deleted immediately if slow deletion is disabled. If force_bg
64
+ // is not set and the file has more than 1 hard link, it is immediately
65
+ // deleted too. In all other cases, the file will be moved to a trash
66
+ // directory and scheduled for deletion by a background thread.
67
+ // This API also supports assigning a file to a specified bucket created by
68
+ // `NewTrashBucket` when deleting files in the background, so the caller can
69
+ // wait for a specific bucket to become empty by calling the
70
+ // `WaitForEmptyTrashBucket` API.
71
+ Status DeleteUnaccountedFile(const std::string& file_path,
72
+ const std::string& dir_to_sync,
73
+ const bool force_bg = false,
74
+ std::optional<int32_t> bucket = std::nullopt);
75
+
76
+ // Wait for all files being deleted in the background to finish or for
58
77
  // destructor to be called.
59
78
  void WaitForEmptyTrash();
60
79
 
80
+ // Creates a new trash bucket. A bucket is only created and returned when slow
81
+ // deletion is enabled.
82
+ // For each bucket that is created, the user should also call
83
+ // `WaitForEmptyTrashBucket` after scheduling file deletions to make sure the
84
+ // trash files are all cleared.
85
+ std::optional<int32_t> NewTrashBucket();
86
+
87
+ // Wait for all the files in the specified bucket to be deleted in the
88
+ // background or for the destructor to be called.
89
+ void WaitForEmptyTrashBucket(int32_t bucket);
90
+
61
91
  // Return a map containing errors that happened in BackgroundEmptyTrash
62
92
  // file_path => error status
63
93
  std::map<std::string, Status> GetBackgroundErrors();
@@ -87,12 +117,21 @@ class DeleteScheduler {
87
117
  }
88
118
 
89
119
  private:
90
- Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
120
+ Status DeleteFileImmediately(const std::string& file_path, bool accounted);
121
+
122
+ Status AddFileToDeletionQueue(const std::string& file_path,
123
+ const std::string& dir_to_sync,
124
+ std::optional<int32_t> bucket, bool accounted);
125
+
126
+ Status MarkAsTrash(const std::string& file_path, bool accounted,
127
+ std::string* path_in_trash);
91
128
 
92
129
  Status DeleteTrashFile(const std::string& path_in_trash,
93
- const std::string& dir_to_sync,
130
+ const std::string& dir_to_sync, bool accounted,
94
131
  uint64_t* deleted_bytes, bool* is_complete);
95
132
 
133
+ Status OnDeleteFile(const std::string& file_path, bool accounted);
134
+
96
135
  void BackgroundEmptyTrash();
97
136
 
98
137
  void MaybeCreateBackgroundThread();
@@ -104,19 +143,28 @@ class DeleteScheduler {
104
143
  std::atomic<uint64_t> total_trash_size_;
105
144
  // Maximum number of bytes that should be deleted per second
106
145
  std::atomic<int64_t> rate_bytes_per_sec_;
107
- // Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
146
+ // Mutex to protect queue_, pending_files_, next_trash_bucket_,
147
+ // pending_files_in_buckets_, bg_errors_, closing_, stats_
108
148
  InstrumentedMutex mu_;
109
149
 
110
150
  struct FileAndDir {
111
- FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
151
+ FileAndDir(const std::string& _fname, const std::string& _dir,
152
+ bool _accounted, std::optional<int32_t> _bucket)
153
+ : fname(_fname), dir(_dir), accounted(_accounted), bucket(_bucket) {}
112
154
  std::string fname;
113
155
  std::string dir; // empty will be skipped.
156
+ bool accounted;
157
+ std::optional<int32_t> bucket;
114
158
  };
115
159
 
116
160
  // Queue of trash files that need to be deleted
117
161
  std::queue<FileAndDir> queue_;
118
162
  // Number of trash files that are waiting to be deleted
119
163
  int32_t pending_files_;
164
+ // Next trash bucket that can be created
165
+ int32_t next_trash_bucket_;
166
+ // A mapping from trash bucket to number of pending files in the bucket
167
+ std::map<int32_t, int32_t> pending_files_in_buckets_;
120
168
  uint64_t bytes_max_delete_chunk_;
121
169
  // Errors that happened in BackgroundEmptyTrash (file_path => error)
122
170
  std::map<std::string, Status> bg_errors_;
@@ -127,6 +175,7 @@ class DeleteScheduler {
127
175
  // Condition variable signaled in these conditions
128
176
  // - pending_files_ value change from 0 => 1
129
177
  // - pending_files_ value change from 1 => 0
178
+ // - a value in pending_files_in_buckets_ changes from 1 => 0
130
179
  // - closing_ value is set to true
131
180
  InstrumentedCondVar cv_;
132
181
  // Background thread running BackgroundEmptyTrash
@@ -138,6 +187,10 @@ class DeleteScheduler {
138
187
  // If the trash size constitutes for more than this fraction of the total DB
139
188
  // size we will start deleting new files passed to DeleteScheduler
140
189
  // immediately
190
+ // Unaccounted files passed for deletion will not cause change in
191
+ // total_trash_size_ or affect the DeleteScheduler::total_trash_size_ over
192
+ // SstFileManager::total_size_ ratio. Their slow deletion is not subject to
193
+ // this configured threshold either.
141
194
  std::atomic<double> max_trash_db_ratio_;
142
195
  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
143
196
  std::shared_ptr<Statistics> stats_;
@@ -78,7 +78,7 @@ class DeleteSchedulerTest : public testing::Test {
78
78
  }
79
79
 
80
80
  std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
81
- size_t dummy_files_dirs_idx = 0) {
81
+ size_t dummy_files_dirs_idx = 0, bool track = true) {
82
82
  std::string file_path =
83
83
  dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
84
84
  std::unique_ptr<WritableFile> f;
@@ -86,7 +86,9 @@ class DeleteSchedulerTest : public testing::Test {
86
86
  std::string data(size, 'A');
87
87
  EXPECT_OK(f->Append(data));
88
88
  EXPECT_OK(f->Close());
89
- EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
89
+ if (track) {
90
+ EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
91
+ }
90
92
  return file_path;
91
93
  }
92
94
 
@@ -353,6 +355,8 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
353
355
  ASSERT_EQ(num_files,
354
356
  stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
355
357
 
358
+ ASSERT_FALSE(delete_scheduler_->NewTrashBucket().has_value());
359
+
356
360
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
357
361
  }
358
362
 
@@ -718,6 +722,141 @@ TEST_F(DeleteSchedulerTest, IsTrashCheck) {
718
722
  ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
719
723
  }
720
724
 
725
+ TEST_F(DeleteSchedulerTest, DeleteAccountedAndUnaccountedFiles) {
726
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / s
727
+ NewDeleteScheduler();
728
+
729
+ // Create 100 files, every file is 1 KB
730
+ int num_files = 100; // 100 files
731
+ uint64_t file_size = 1024; // 1 KB as a file size
732
+ std::vector<std::string> generated_files;
733
+ for (int i = 0; i < num_files; i++) {
734
+ std::string file_name = "file" + std::to_string(i) + ".data";
735
+ generated_files.push_back(NewDummyFile(file_name, file_size,
736
+ /*dummy_files_dirs_idx*/ 0,
737
+ /*track=*/false));
738
+ }
739
+
740
+ for (int i = 0; i < num_files; i++) {
741
+ if (i % 2) {
742
+ ASSERT_OK(sst_file_mgr_->OnAddFile(generated_files[i], file_size));
743
+ ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i], ""));
744
+ } else {
745
+ ASSERT_OK(
746
+ delete_scheduler_->DeleteUnaccountedFile(generated_files[i], ""));
747
+ }
748
+ }
749
+
750
+ delete_scheduler_->WaitForEmptyTrash();
751
+ ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
752
+ ASSERT_EQ(0, sst_file_mgr_->GetTotalSize());
753
+ }
754
+
755
+ TEST_F(DeleteSchedulerTest, ConcurrentlyDeleteUnaccountedFilesInBuckets) {
756
+ int bg_delete_file = 0;
757
+ int fg_delete_file = 0;
758
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
759
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
760
+ [&](void* /*arg*/) { bg_delete_file++; });
761
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
762
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
763
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / s
764
+ NewDeleteScheduler();
765
+
766
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
767
+ // Create 1000 files, every file is 1 KB
768
+ int num_files = 1000;
769
+ uint64_t file_size = 1024; // 1 KB as a file size
770
+ std::vector<std::string> generated_files;
771
+ for (int i = 0; i < num_files; i++) {
772
+ std::string file_name = "file" + std::to_string(i) + ".data";
773
+ generated_files.push_back(NewDummyFile(file_name, file_size,
774
+ /*dummy_files_dirs_idx*/ 0,
775
+ /*track=*/false));
776
+ }
777
+ // Concurrently delete files in different buckets and check all the buckets
778
+ // are empty.
779
+ int thread_cnt = 10;
780
+ int files_per_thread = 100;
781
+ std::atomic<int> thread_num(0);
782
+ std::vector<port::Thread> threads;
783
+ std::function<void()> delete_thread = [&]() {
784
+ std::optional<int32_t> bucket = delete_scheduler_->NewTrashBucket();
785
+ ASSERT_TRUE(bucket.has_value());
786
+ int idx = thread_num.fetch_add(1);
787
+ int range_start = idx * files_per_thread;
788
+ int range_end = range_start + files_per_thread;
789
+ for (int j = range_start; j < range_end; j++) {
790
+ ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(
791
+ generated_files[j], "", /*false_bg=*/false, bucket));
792
+ }
793
+ delete_scheduler_->WaitForEmptyTrashBucket(bucket.value());
794
+ };
795
+
796
+ for (int i = 0; i < thread_cnt; i++) {
797
+ threads.emplace_back(delete_thread);
798
+ }
799
+
800
+ for (size_t i = 0; i < threads.size(); i++) {
801
+ threads[i].join();
802
+ }
803
+
804
+ ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
805
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
806
+ ASSERT_EQ(1000, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
807
+ ASSERT_EQ(0, fg_delete_file);
808
+ ASSERT_EQ(1000, bg_delete_file);
809
+
810
+ // OK to re-check an already empty bucket
811
+ delete_scheduler_->WaitForEmptyTrashBucket(9);
812
+ // Waiting on an invalid (never created) bucket returns immediately too.
813
+ delete_scheduler_->WaitForEmptyTrashBucket(100);
814
+ std::optional<int32_t> next_bucket = delete_scheduler_->NewTrashBucket();
815
+ ASSERT_TRUE(next_bucket.has_value());
816
+ ASSERT_EQ(10, next_bucket.value());
817
+ delete_scheduler_->WaitForEmptyTrashBucket(10);
818
+
819
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
820
+ }
821
+
822
+ TEST_F(DeleteSchedulerTest,
823
+ ImmediatelyDeleteUnaccountedFilesWithRemainingLinks) {
824
+ int bg_delete_file = 0;
825
+ int fg_delete_file = 0;
826
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
827
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
828
+ [&](void* /*arg*/) { bg_delete_file++; });
829
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
830
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
831
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
832
+
833
+ rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec
834
+ NewDeleteScheduler();
835
+
836
+ std::string file1 = NewDummyFile("data_1", 500 * 1024,
837
+ /*dummy_files_dirs_idx*/ 0, /*track=*/false);
838
+ std::string file2 = NewDummyFile("data_2", 100 * 1024,
839
+ /*dummy_files_dirs_idx*/ 0, /*track=*/false);
840
+
841
+ ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
842
+ ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));
843
+
844
+ // Should delete in 4 batches if there are no hard links
845
+ ASSERT_OK(
846
+ delete_scheduler_->DeleteUnaccountedFile(file1, "", /*force_bg=*/false));
847
+ ASSERT_OK(
848
+ delete_scheduler_->DeleteUnaccountedFile(file2, "", /*force_bg=*/false));
849
+
850
+ delete_scheduler_->WaitForEmptyTrash();
851
+
852
+ ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
853
+ ASSERT_EQ(0, bg_delete_file);
854
+ ASSERT_EQ(2, fg_delete_file);
855
+ ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
856
+ ASSERT_EQ(2, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
857
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
858
+ }
859
+
721
860
  } // namespace ROCKSDB_NAMESPACE
722
861
 
723
862
  int main(int argc, char** argv) {
@@ -125,8 +125,8 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination,
125
125
  Status DeleteDBFile(const ImmutableDBOptions* db_options,
126
126
  const std::string& fname, const std::string& dir_to_sync,
127
127
  const bool force_bg, const bool force_fg) {
128
- SstFileManagerImpl* sfm =
129
- static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
128
+ SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
129
+ db_options->sst_file_manager.get());
130
130
  if (sfm && !force_fg) {
131
131
  return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
132
132
  } else {
@@ -134,6 +134,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
134
134
  }
135
135
  }
136
136
 
137
+ Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
138
+ const std::string& fname,
139
+ const std::string& dir_to_sync,
140
+ const bool force_bg, const bool force_fg,
141
+ std::optional<int32_t> bucket) {
142
+ SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
143
+ db_options->sst_file_manager.get());
144
+ if (sfm && !force_fg) {
145
+ return sfm->ScheduleUnaccountedFileDeletion(fname, dir_to_sync, force_bg,
146
+ bucket);
147
+ } else {
148
+ return db_options->env->DeleteFile(fname);
149
+ }
150
+ }
151
+
137
152
  // requested_checksum_func_name brings the function name of the checksum
138
153
  // generator in checksum_factory. Empty string is permitted, in which case the
139
154
  // name of the generator created by the factory is unchecked. When
@@ -55,6 +55,16 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
55
55
  const std::string& fname, const std::string& path_to_sync,
56
56
  const bool force_bg, const bool force_fg);
57
57
 
58
+ // Delete an unaccounted DB file that is not tracked by SstFileManager and will
59
+ // not be tracked by its DeleteScheduler when getting deleted.
60
+ // If a legitimate bucket is provided and this file is scheduled for slow
61
+ // deletion, it will be assigned to the specified trash bucket.
62
+ Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
63
+ const std::string& fname,
64
+ const std::string& dir_to_sync,
65
+ const bool force_bg, const bool force_fg,
66
+ std::optional<int32_t> bucket);
67
+
58
68
  // TODO(hx235): pass the whole DBOptions instead of its individual fields
59
69
  IOStatus GenerateOneFileChecksum(
60
70
  FileSystem* fs, const std::string& file_path,
@@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
388
388
 
389
389
  IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
390
390
  const std::string& dbname, uint64_t descriptor_number,
391
+ Temperature temp,
391
392
  FSDirectory* dir_contains_current_file) {
392
393
  // Remove leading "dbname/" and add newline to manifest file name
393
394
  std::string manifest = DescriptorFileName(dbname, descriptor_number);
@@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
397
398
  std::string tmp = TempFileName(dbname, descriptor_number);
398
399
  IOOptions opts;
399
400
  IOStatus s = PrepareIOFromWriteOptions(write_options, opts);
401
+ FileOptions file_opts;
402
+ file_opts.temperature = temp;
400
403
  if (s.ok()) {
401
- s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts);
404
+ s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts,
405
+ file_opts);
402
406
  }
403
407
  TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
404
408
  if (s.ok()) {
@@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
423
427
  }
424
428
 
425
429
  Status SetIdentityFile(const WriteOptions& write_options, Env* env,
426
- const std::string& dbname, const std::string& db_id) {
430
+ const std::string& dbname, Temperature temp,
431
+ const std::string& db_id) {
427
432
  std::string id;
428
433
  if (db_id.empty()) {
429
434
  id = env->GenerateUniqueId();
@@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env,
437
442
  Status s;
438
443
  IOOptions opts;
439
444
  s = PrepareIOFromWriteOptions(write_options, opts);
445
+ FileOptions file_opts;
446
+ file_opts.temperature = temp;
440
447
  if (s.ok()) {
441
- s = WriteStringToFile(env, id, tmp, true, &opts);
448
+ s = WriteStringToFile(env->GetFileSystem().get(), id, tmp,
449
+ /*should_sync=*/true, opts, file_opts);
442
450
  }
443
451
  if (s.ok()) {
444
452
  s = env->RenameFile(tmp, identify_file_name);