@nxtedition/rocksdb 7.0.37 → 7.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/binding.cc +17 -26
  2. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
  3. package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
  4. package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
  14. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
  15. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
  23. package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
  24. package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
  25. package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
  26. package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
  27. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
  28. package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
  29. package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
  30. package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
  32. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
  33. package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
  34. package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
  35. package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
  36. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
  37. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
  40. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
  41. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
  42. package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
  43. package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
  44. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
  45. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
  46. package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
  47. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
  48. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
  49. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
  50. package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
  51. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
  52. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
  54. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
  55. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
  56. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
  57. package/index.js +2 -2
  58. package/iterator.js +1 -1
  59. package/package.json +1 -1
  60. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  61. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -82,9 +82,10 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
82
82
  return false;
83
83
  }
84
84
 
85
- IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
85
+ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
86
+ VersionEdit* synced_wals) {
86
87
  TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
87
- mutex_.AssertHeld();
88
+ InstrumentedMutexLock l(&log_write_mutex_);
88
89
  autovector<log::Writer*, 1> logs_to_sync;
89
90
  uint64_t current_log_number = logfile_number_;
90
91
  while (logs_.front().number < current_log_number &&
@@ -100,7 +101,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
100
101
 
101
102
  IOStatus io_s;
102
103
  if (!logs_to_sync.empty()) {
103
- mutex_.Unlock();
104
+ log_write_mutex_.Unlock();
104
105
 
105
106
  assert(job_context);
106
107
 
@@ -128,12 +129,12 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
128
129
 
129
130
  TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
130
131
  /*arg=*/nullptr);
131
- mutex_.Lock();
132
+ log_write_mutex_.Lock();
132
133
 
133
134
  // "number <= current_log_number - 1" is equivalent to
134
135
  // "number < current_log_number".
135
136
  if (io_s.ok()) {
136
- io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true));
137
+ MarkLogsSynced(current_log_number - 1, true, synced_wals);
137
138
  } else {
138
139
  MarkLogsNotSynced(current_log_number - 1);
139
140
  }
@@ -220,8 +221,16 @@ Status DBImpl::FlushMemTableToOutputFile(
220
221
  bool need_cancel = false;
221
222
  IOStatus log_io_s = IOStatus::OK();
222
223
  if (needs_to_sync_closed_wals) {
223
- // SyncClosedLogs() may unlock and re-lock the db_mutex.
224
- log_io_s = SyncClosedLogs(job_context);
224
+ // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
225
+ // times.
226
+ VersionEdit synced_wals;
227
+ mutex_.Unlock();
228
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
229
+ mutex_.Lock();
230
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
231
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
232
+ }
233
+
225
234
  if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
226
235
  !log_io_s.IsColumnFamilyDropped()) {
227
236
  error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
@@ -474,7 +483,14 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
474
483
  if (logfile_number_ > 0) {
475
484
  // TODO (yanqin) investigate whether we should sync the closed logs for
476
485
  // single column family case.
477
- log_io_s = SyncClosedLogs(job_context);
486
+ VersionEdit synced_wals;
487
+ mutex_.Unlock();
488
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
489
+ mutex_.Lock();
490
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
491
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
492
+ }
493
+
478
494
  if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
479
495
  !log_io_s.IsColumnFamilyDropped()) {
480
496
  if (total_log_size_ > 0) {
@@ -1392,7 +1408,8 @@ Status DBImpl::CompactFilesImpl(
1392
1408
  &compaction_job_stats, Env::Priority::USER, io_tracer_,
1393
1409
  kManualCompactionCanceledFalse_, db_id_, db_session_id_,
1394
1410
  c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
1395
- &blob_callback_);
1411
+ &blob_callback_, &bg_compaction_scheduled_,
1412
+ &bg_bottom_compaction_scheduled_);
1396
1413
 
1397
1414
  // Creating a compaction influences the compaction score because the score
1398
1415
  // takes running compactions into account (by skipping files that are already
@@ -3314,7 +3331,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3314
3331
  if (start_level > 0) {
3315
3332
  auto vstorage = c->input_version()->storage_info();
3316
3333
  c->edit()->AddCompactCursor(
3317
- start_level, vstorage->GetNextCompactCursor(start_level));
3334
+ start_level,
3335
+ vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
3318
3336
  }
3319
3337
  }
3320
3338
  status = versions_->LogAndApply(c->column_family_data(),
@@ -3399,7 +3417,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3399
3417
  is_manual ? manual_compaction->canceled
3400
3418
  : kManualCompactionCanceledFalse_,
3401
3419
  db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
3402
- c->trim_ts(), &blob_callback_);
3420
+ c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
3421
+ &bg_bottom_compaction_scheduled_);
3403
3422
  compaction_job.Prepare();
3404
3423
 
3405
3424
  NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
@@ -223,7 +223,7 @@ void DBImpl::TEST_EndWrite(void* w) {
223
223
  }
224
224
 
225
225
  size_t DBImpl::TEST_LogsToFreeSize() {
226
- InstrumentedMutexLock l(&mutex_);
226
+ InstrumentedMutexLock l(&log_write_mutex_);
227
227
  return logs_to_free_.size();
228
228
  }
229
229
 
@@ -271,6 +271,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
271
271
 
272
272
  // logs_ is empty when called during recovery, in which case there can't yet
273
273
  // be any tracked obsolete logs
274
+ log_write_mutex_.Lock();
275
+
276
+ if (alive_log_files_.empty() || logs_.empty()) {
277
+ mutex_.AssertHeld();
278
+ // We may reach here if the db is DBImplSecondary
279
+ log_write_mutex_.Unlock();
280
+ return;
281
+ }
282
+
274
283
  if (!alive_log_files_.empty() && !logs_.empty()) {
275
284
  uint64_t min_log_number = job_context->log_number;
276
285
  size_t num_alive_log_files = alive_log_files_.size();
@@ -292,17 +301,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
292
301
  }
293
302
  job_context->size_log_to_delete += earliest.size;
294
303
  total_log_size_ -= earliest.size;
295
- if (two_write_queues_) {
296
- log_write_mutex_.Lock();
297
- }
298
304
  alive_log_files_.pop_front();
299
- if (two_write_queues_) {
300
- log_write_mutex_.Unlock();
301
- }
305
+
302
306
  // Current log should always stay alive since it can't have
303
307
  // number < MinLogNumber().
304
308
  assert(alive_log_files_.size());
305
309
  }
310
+ log_write_mutex_.Unlock();
311
+ mutex_.Unlock();
312
+ log_write_mutex_.Lock();
306
313
  while (!logs_.empty() && logs_.front().number < min_log_number) {
307
314
  auto& log = logs_.front();
308
315
  if (log.IsSyncing()) {
@@ -311,10 +318,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
311
318
  continue;
312
319
  }
313
320
  logs_to_free_.push_back(log.ReleaseWriter());
314
- {
315
- InstrumentedMutexLock wl(&log_write_mutex_);
316
- logs_.pop_front();
317
- }
321
+ logs_.pop_front();
318
322
  }
319
323
  // Current log cannot be obsolete.
320
324
  assert(!logs_.empty());
@@ -323,23 +327,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
323
327
  // We're just cleaning up for DB::Write().
324
328
  assert(job_context->logs_to_free.empty());
325
329
  job_context->logs_to_free = logs_to_free_;
330
+
331
+ logs_to_free_.clear();
332
+ log_write_mutex_.Unlock();
333
+ mutex_.Lock();
326
334
  job_context->log_recycle_files.assign(log_recycle_files_.begin(),
327
335
  log_recycle_files_.end());
328
- logs_to_free_.clear();
329
- }
330
-
331
- namespace {
332
- bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
333
- const JobContext::CandidateFileInfo& second) {
334
- if (first.file_name > second.file_name) {
335
- return true;
336
- } else if (first.file_name < second.file_name) {
337
- return false;
338
- } else {
339
- return (first.file_path > second.file_path);
340
- }
341
336
  }
342
- } // namespace
343
337
 
344
338
  // Delete obsolete files and log status and information of file deletion
345
339
  void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
@@ -445,7 +439,16 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
445
439
  // dedup state.candidate_files so we don't try to delete the same
446
440
  // file twice
447
441
  std::sort(candidate_files.begin(), candidate_files.end(),
448
- CompareCandidateFile);
442
+ [](const JobContext::CandidateFileInfo& lhs,
443
+ const JobContext::CandidateFileInfo& rhs) {
444
+ if (lhs.file_name > rhs.file_name) {
445
+ return true;
446
+ } else if (lhs.file_name < rhs.file_name) {
447
+ return false;
448
+ } else {
449
+ return (lhs.file_path > rhs.file_path);
450
+ }
451
+ });
449
452
  candidate_files.erase(
450
453
  std::unique(candidate_files.begin(), candidate_files.end()),
451
454
  candidate_files.end());
@@ -1459,9 +1459,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
1459
1459
  Status s;
1460
1460
  mutex_.AssertHeld();
1461
1461
  assert(immutable_db_options_.avoid_flush_during_recovery);
1462
- if (two_write_queues_) {
1463
- log_write_mutex_.Lock();
1464
- }
1465
1462
  // Mark these as alive so they'll be considered for deletion later by
1466
1463
  // FindObsoleteFiles()
1467
1464
  total_log_size_ = 0;
@@ -1486,9 +1483,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
1486
1483
  total_log_size_ += log.size;
1487
1484
  alive_log_files_.push_back(log);
1488
1485
  }
1489
- if (two_write_queues_) {
1490
- log_write_mutex_.Unlock();
1491
- }
1492
1486
  return s;
1493
1487
  }
1494
1488
 
@@ -1871,16 +1865,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
1871
1865
  }
1872
1866
 
1873
1867
  if (s.ok()) {
1874
- if (impl->two_write_queues_) {
1875
- impl->log_write_mutex_.Lock();
1876
- }
1877
1868
  impl->alive_log_files_.push_back(
1878
1869
  DBImpl::LogFileNumberSize(impl->logfile_number_));
1879
- if (impl->two_write_queues_) {
1880
- impl->log_write_mutex_.Unlock();
1881
- }
1882
- }
1883
- if (s.ok()) {
1884
1870
  // In WritePrepared there could be gap in sequence numbers. This breaks
1885
1871
  // the trick we use in kPointInTimeRecovery which assumes the first seq in
1886
1872
  // the log right after the corrupted log is one larger than the last seq
@@ -349,14 +349,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
349
349
  // when it finds suitable, and finish them in the same write batch.
350
350
  // This is how a write job could be done by the other writer.
351
351
  WriteContext write_context;
352
+ LogContext log_context(write_options.sync);
352
353
  WriteThread::WriteGroup write_group;
353
354
  bool in_parallel_group = false;
354
355
  uint64_t last_sequence = kMaxSequenceNumber;
355
356
 
356
- mutex_.Lock();
357
-
358
- bool need_log_sync = write_options.sync;
359
- bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
360
357
  assert(!two_write_queues_ || !disable_memtable);
361
358
  {
362
359
  // With concurrent writes we do preprocess only in the write thread that
@@ -366,7 +363,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
366
363
  // PreprocessWrite does its own perf timing.
367
364
  PERF_TIMER_STOP(write_pre_and_post_process_time);
368
365
 
369
- status = PreprocessWrite(write_options, &need_log_sync, &write_context);
366
+ status = PreprocessWrite(write_options, &log_context, &write_context);
370
367
  if (!two_write_queues_) {
371
368
  // Assign it after ::PreprocessWrite since the sequence might advance
372
369
  // inside it by WriteRecoverableState
@@ -376,13 +373,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
376
373
  PERF_TIMER_START(write_pre_and_post_process_time);
377
374
  }
378
375
 
379
- log::Writer* log_writer = logs_.back().writer;
380
- LogFileNumberSize& log_file_number_size = alive_log_files_.back();
381
-
382
- assert(log_writer->get_log_number() == log_file_number_size.number);
383
-
384
- mutex_.Unlock();
385
-
386
376
  // Add to log and apply to memtable. We can release the lock
387
377
  // during this phase since &w is currently responsible for logging
388
378
  // and protects against concurrent loggers and concurrent writes
@@ -477,10 +467,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
477
467
 
478
468
  if (!two_write_queues_) {
479
469
  if (status.ok() && !write_options.disableWAL) {
470
+ assert(log_context.log_file_number_size);
471
+ LogFileNumberSize& log_file_number_size =
472
+ *(log_context.log_file_number_size);
480
473
  PERF_TIMER_GUARD(write_wal_time);
481
- io_s = WriteToWAL(write_group, log_writer, log_used, need_log_sync,
482
- need_log_dir_sync, last_sequence + 1,
483
- log_file_number_size);
474
+ io_s =
475
+ WriteToWAL(write_group, log_context.writer, log_used,
476
+ log_context.need_log_sync, log_context.need_log_dir_sync,
477
+ last_sequence + 1, log_file_number_size);
484
478
  }
485
479
  } else {
486
480
  if (status.ok() && !write_options.disableWAL) {
@@ -582,14 +576,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
582
576
  assert(pre_release_cb_status.ok());
583
577
  }
584
578
 
585
- if (need_log_sync) {
586
- mutex_.Lock();
579
+ if (log_context.need_log_sync) {
580
+ VersionEdit synced_wals;
581
+ log_write_mutex_.Lock();
587
582
  if (status.ok()) {
588
- status = MarkLogsSynced(logfile_number_, need_log_dir_sync);
583
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
584
+ &synced_wals);
589
585
  } else {
590
586
  MarkLogsNotSynced(logfile_number_);
591
587
  }
592
- mutex_.Unlock();
588
+ log_write_mutex_.Unlock();
589
+ if (status.ok() && synced_wals.IsWalAddition()) {
590
+ InstrumentedMutexLock l(&mutex_);
591
+ status = ApplyWALToManifest(&synced_wals);
592
+ }
593
+
593
594
  // Requesting sync with two_write_queues_ is expected to be very rare. We
594
595
  // hence provide a simple implementation that is not necessarily efficient.
595
596
  if (two_write_queues_) {
@@ -652,19 +653,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
652
653
  if (w.callback && !w.callback->AllowWriteBatching()) {
653
654
  write_thread_.WaitForMemTableWriters();
654
655
  }
655
- mutex_.Lock();
656
- bool need_log_sync = !write_options.disableWAL && write_options.sync;
657
- bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
656
+ LogContext log_context(!write_options.disableWAL && write_options.sync);
658
657
  // PreprocessWrite does its own perf timing.
659
658
  PERF_TIMER_STOP(write_pre_and_post_process_time);
660
- w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
659
+ w.status = PreprocessWrite(write_options, &log_context, &write_context);
661
660
  PERF_TIMER_START(write_pre_and_post_process_time);
662
- log::Writer* log_writer = logs_.back().writer;
663
- LogFileNumberSize& log_file_number_size = alive_log_files_.back();
664
-
665
- assert(log_writer->get_log_number() == log_file_number_size.number);
666
-
667
- mutex_.Unlock();
668
661
 
669
662
  // This can set non-OK status if callback fail.
670
663
  last_batch_group_size_ =
@@ -727,9 +720,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
727
720
  wal_write_group.size - 1);
728
721
  RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
729
722
  }
723
+ assert(log_context.log_file_number_size);
724
+ LogFileNumberSize& log_file_number_size =
725
+ *(log_context.log_file_number_size);
730
726
  io_s =
731
- WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync,
732
- need_log_dir_sync, current_sequence, log_file_number_size);
727
+ WriteToWAL(wal_write_group, log_context.writer, log_used,
728
+ log_context.need_log_sync, log_context.need_log_dir_sync,
729
+ current_sequence, log_file_number_size);
733
730
  w.status = io_s;
734
731
  }
735
732
 
@@ -740,16 +737,20 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
740
737
  WriteStatusCheck(w.status);
741
738
  }
742
739
 
743
- if (need_log_sync) {
744
- mutex_.Lock();
740
+ VersionEdit synced_wals;
741
+ if (log_context.need_log_sync) {
742
+ InstrumentedMutexLock l(&log_write_mutex_);
745
743
  if (w.status.ok()) {
746
- w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync);
744
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
745
+ &synced_wals);
747
746
  } else {
748
747
  MarkLogsNotSynced(logfile_number_);
749
748
  }
750
- mutex_.Unlock();
751
749
  }
752
-
750
+ if (w.status.ok() && synced_wals.IsWalAddition()) {
751
+ InstrumentedMutexLock l(&mutex_);
752
+ w.status = ApplyWALToManifest(&synced_wals);
753
+ }
753
754
  write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
754
755
  }
755
756
 
@@ -893,9 +894,8 @@ Status DBImpl::WriteImplWALOnly(
893
894
  // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
894
895
  // without paying the cost of obtaining the mutex.
895
896
  if (status.ok()) {
896
- InstrumentedMutexLock l(&mutex_);
897
- bool need_log_sync = false;
898
- status = PreprocessWrite(write_options, &need_log_sync, &write_context);
897
+ LogContext log_context;
898
+ status = PreprocessWrite(write_options, &log_context, &write_context);
899
899
  WriteStatusCheckOnLocked(status);
900
900
  }
901
901
  if (!status.ok()) {
@@ -1057,9 +1057,8 @@ Status DBImpl::WriteImplWALOnly(
1057
1057
  void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
1058
1058
  // Is setting bg_error_ enough here? This will at least stop
1059
1059
  // compaction and fail any further writes.
1060
- // Caller must hold mutex_.
1060
+ InstrumentedMutexLock l(&mutex_);
1061
1061
  assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
1062
- mutex_.AssertHeld();
1063
1062
  if (immutable_db_options_.paranoid_checks && !status.ok() &&
1064
1063
  !status.IsBusy() && !status.IsIncomplete()) {
1065
1064
  // Maybe change the return status to void?
@@ -1110,13 +1109,13 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
1110
1109
  }
1111
1110
 
1112
1111
  Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1113
- bool* need_log_sync,
1112
+ LogContext* log_context,
1114
1113
  WriteContext* write_context) {
1115
- mutex_.AssertHeld();
1116
- assert(write_context != nullptr && need_log_sync != nullptr);
1114
+ assert(write_context != nullptr && log_context != nullptr);
1117
1115
  Status status;
1118
1116
 
1119
1117
  if (error_handler_.IsDBStopped()) {
1118
+ InstrumentedMutexLock l(&mutex_);
1120
1119
  status = error_handler_.GetBGError();
1121
1120
  }
1122
1121
 
@@ -1124,11 +1123,11 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1124
1123
 
1125
1124
  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
1126
1125
  assert(versions_);
1126
+ InstrumentedMutexLock l(&mutex_);
1127
1127
  const ColumnFamilySet* const column_families =
1128
1128
  versions_->GetColumnFamilySet();
1129
1129
  assert(column_families);
1130
1130
  size_t num_cfs = column_families->NumberOfColumnFamilies();
1131
-
1132
1131
  assert(num_cfs >= 1);
1133
1132
  if (num_cfs > 1) {
1134
1133
  WaitForPendingWrites();
@@ -1142,15 +1141,18 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1142
1141
  // thread is writing to another DB with the same write buffer, they may also
1143
1142
  // be flushed. We may end up with flushing much more DBs than needed. It's
1144
1143
  // suboptimal but still correct.
1144
+ InstrumentedMutexLock l(&mutex_);
1145
1145
  WaitForPendingWrites();
1146
1146
  status = HandleWriteBufferManagerFlush(write_context);
1147
1147
  }
1148
1148
 
1149
1149
  if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
1150
+ InstrumentedMutexLock l(&mutex_);
1150
1151
  status = TrimMemtableHistory(write_context);
1151
1152
  }
1152
1153
 
1153
1154
  if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
1155
+ InstrumentedMutexLock l(&mutex_);
1154
1156
  WaitForPendingWrites();
1155
1157
  status = ScheduleFlushes(write_context);
1156
1158
  }
@@ -1166,6 +1168,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1166
1168
  // for previous one. It might create a fairness issue that expiration
1167
1169
  // might happen for smaller writes but larger writes can go through.
1168
1170
  // Can optimize it if it is an issue.
1171
+ InstrumentedMutexLock l(&mutex_);
1169
1172
  status = DelayWrite(last_batch_group_size_, write_options);
1170
1173
  PERF_TIMER_START(write_pre_and_post_process_time);
1171
1174
  }
@@ -1180,11 +1183,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1180
1183
  if (write_options.no_slowdown) {
1181
1184
  status = Status::Incomplete("Write stall");
1182
1185
  } else {
1186
+ InstrumentedMutexLock l(&mutex_);
1183
1187
  WriteBufferManagerStallWrites();
1184
1188
  }
1185
1189
  }
1186
-
1187
- if (status.ok() && *need_log_sync) {
1190
+ InstrumentedMutexLock l(&log_write_mutex_);
1191
+ if (status.ok() && log_context->need_log_sync) {
1188
1192
  // Wait until the parallel syncs are finished. Any sync process has to sync
1189
1193
  // the front log too so it is enough to check the status of front()
1190
1194
  // We do a while loop since log_sync_cv_ is signalled when any sync is
@@ -1204,8 +1208,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
1204
1208
  log.PrepareForSync();
1205
1209
  }
1206
1210
  } else {
1207
- *need_log_sync = false;
1211
+ log_context->need_log_sync = false;
1208
1212
  }
1213
+ log_context->writer = logs_.back().writer;
1214
+ log_context->need_log_dir_sync =
1215
+ log_context->need_log_dir_sync && !log_dir_synced_;
1216
+ log_context->log_file_number_size = std::addressof(alive_log_files_.back());
1209
1217
 
1210
1218
  return status;
1211
1219
  }
@@ -1714,10 +1722,12 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
1714
1722
  }
1715
1723
 
1716
1724
  uint64_t DBImpl::GetMaxTotalWalSize() const {
1717
- mutex_.AssertHeld();
1718
- return mutable_db_options_.max_total_wal_size == 0
1719
- ? 4 * max_total_in_memory_state_
1720
- : mutable_db_options_.max_total_wal_size;
1725
+ uint64_t max_total_wal_size =
1726
+ max_total_wal_size_.load(std::memory_order_acquire);
1727
+ if (max_total_wal_size > 0) {
1728
+ return max_total_wal_size;
1729
+ }
1730
+ return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
1721
1731
  }
1722
1732
 
1723
1733
  // REQUIRES: mutex_ is held
@@ -2065,7 +2075,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2065
2075
  log_recycle_files_.pop_front();
2066
2076
  }
2067
2077
  if (s.ok() && creating_new_log) {
2068
- log_write_mutex_.Lock();
2078
+ InstrumentedMutexLock l(&log_write_mutex_);
2069
2079
  assert(new_log != nullptr);
2070
2080
  if (!logs_.empty()) {
2071
2081
  // Alway flush the buffer of the last log before switching to a new one
@@ -2089,7 +2099,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
2089
2099
  logs_.emplace_back(logfile_number_, new_log);
2090
2100
  alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
2091
2101
  }
2092
- log_write_mutex_.Unlock();
2093
2102
  }
2094
2103
 
2095
2104
  if (!s.ok()) {
@@ -492,7 +492,7 @@ TEST_F(DBTest, PutSingleDeleteGet) {
492
492
  ASSERT_EQ("v2", Get(1, "foo2"));
493
493
  ASSERT_OK(SingleDelete(1, "foo"));
494
494
  ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
495
- // Skip FIFO and universal compaction beccause they do not apply to the test
495
+ // Skip FIFO and universal compaction because they do not apply to the test
496
496
  // case. Skip MergePut because single delete does not get removed when it
497
497
  // encounters a merge.
498
498
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
@@ -645,7 +645,7 @@ TEST_F(DBTest, SingleDeleteFlush) {
645
645
 
646
646
  ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
647
647
  ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
648
- // Skip FIFO and universal compaction beccause they do not apply to the test
648
+ // Skip FIFO and universal compaction because they do not apply to the test
649
649
  // case. Skip MergePut because single delete does not get removed when it
650
650
  // encounters a merge.
651
651
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
@@ -668,7 +668,7 @@ TEST_F(DBTest, SingleDeletePutFlush) {
668
668
  ASSERT_OK(Flush(1));
669
669
 
670
670
  ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
671
- // Skip FIFO and universal compaction beccause they do not apply to the test
671
+ // Skip FIFO and universal compaction beccaus they do not apply to the test
672
672
  // case. Skip MergePut because single delete does not get removed when it
673
673
  // encounters a merge.
674
674
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
@@ -1993,7 +1993,7 @@ TEST_F(DBTest, UnremovableSingleDelete) {
1993
1993
  ASSERT_EQ("first", Get(1, "foo", snapshot));
1994
1994
  ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
1995
1995
  db_->ReleaseSnapshot(snapshot);
1996
- // Skip FIFO and universal compaction beccause they do not apply to the test
1996
+ // Skip FIFO and universal compaction because they do not apply to the test
1997
1997
  // case. Skip MergePut because single delete does not get removed when it
1998
1998
  // encounters a merge.
1999
1999
  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
@@ -2773,7 +2773,7 @@ INSTANTIATE_TEST_CASE_P(
2773
2773
  #endif // ROCKSDB_LITE
2774
2774
 
2775
2775
  // Group commit test:
2776
- #if !defined(TRAVIS) && !defined(OS_WIN)
2776
+ #if !defined(OS_WIN)
2777
2777
  // Disable this test temporarily on Travis and appveyor as it fails
2778
2778
  // intermittently. Github issue: #4151
2779
2779
  namespace {
@@ -2850,7 +2850,7 @@ TEST_F(DBTest, GroupCommitTest) {
2850
2850
  ASSERT_GT(hist_data.average, 0.0);
2851
2851
  } while (ChangeOptions(kSkipNoSeekToLast));
2852
2852
  }
2853
- #endif // TRAVIS
2853
+ #endif // OS_WIN
2854
2854
 
2855
2855
  namespace {
2856
2856
  using KVMap = std::map<std::string, std::string>;
@@ -358,6 +358,9 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
358
358
  RecoverFromNoSpace();
359
359
  }
360
360
  }
361
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
362
+ is_db_stopped_.store(true, std::memory_order_release);
363
+ }
361
364
  return bg_error_;
362
365
  }
363
366
 
@@ -736,6 +739,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
736
739
  // the bg_error and notify user.
737
740
  TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
738
741
  Status old_bg_error = bg_error_;
742
+ is_db_stopped_.store(false, std::memory_order_release);
739
743
  bg_error_ = Status::OK();
740
744
  bg_error_.PermitUncheckedError();
741
745
  EventHelpers::NotifyOnErrorRecoveryEnd(
@@ -792,6 +796,9 @@ void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
792
796
  if (bg_err.severity() > bg_error_.severity()) {
793
797
  bg_error_ = bg_err;
794
798
  }
799
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
800
+ is_db_stopped_.store(true, std::memory_order_release);
801
+ }
795
802
  return;
796
803
  }
797
804
 
@@ -38,6 +38,7 @@ class ErrorHandler {
38
38
  auto_recovery_(false),
39
39
  recovery_in_prog_(false),
40
40
  soft_error_no_bg_work_(false),
41
+ is_db_stopped_(false),
41
42
  bg_error_stats_(db_options.statistics) {
42
43
  // Clear the checked flag for uninitialized errors
43
44
  bg_error_.PermitUncheckedError();
@@ -59,16 +60,15 @@ class ErrorHandler {
59
60
 
60
61
  Status ClearBGError();
61
62
 
62
- bool IsDBStopped() {
63
- return !bg_error_.ok() &&
64
- bg_error_.severity() >= Status::Severity::kHardError;
65
- }
63
+ bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
66
64
 
67
- bool IsBGWorkStopped() {
68
- return !bg_error_.ok() &&
69
- (bg_error_.severity() >= Status::Severity::kHardError ||
70
- !auto_recovery_ || soft_error_no_bg_work_);
71
- }
65
+ bool IsBGWorkStopped() {
66
+ assert(db_mutex_);
67
+ db_mutex_->AssertHeld();
68
+ return !bg_error_.ok() &&
69
+ (bg_error_.severity() >= Status::Severity::kHardError ||
70
+ !auto_recovery_ || soft_error_no_bg_work_);
71
+ }
72
72
 
73
73
  bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
74
74
 
@@ -105,6 +105,7 @@ class ErrorHandler {
105
105
 
106
106
  // Used to store the context for recover, such as flush reason.
107
107
  DBRecoverContext recover_context_;
108
+ std::atomic<bool> is_db_stopped_;
108
109
 
109
110
  // The pointer of DB statistics.
110
111
  std::shared_ptr<Statistics> bg_error_stats_;
@@ -960,12 +960,19 @@ TEST_P(CompressionLogTest, Fragmentation) {
960
960
  return;
961
961
  }
962
962
  ASSERT_OK(SetupTestEnv());
963
- Write("small");
964
- Write(BigString("medium", 50000));
965
- Write(BigString("large", 100000));
966
- ASSERT_EQ("small", Read());
967
- ASSERT_EQ(BigString("medium", 50000), Read());
968
- ASSERT_EQ(BigString("large", 100000), Read());
963
+ Random rnd(301);
964
+ const std::vector<std::string> wal_entries = {
965
+ "small",
966
+ rnd.RandomBinaryString(3 * kBlockSize / 2), // Spans into block 2
967
+ rnd.RandomBinaryString(3 * kBlockSize), // Spans into block 5
968
+ };
969
+ for (const std::string& wal_entry : wal_entries) {
970
+ Write(wal_entry);
971
+ }
972
+
973
+ for (const std::string& wal_entry : wal_entries) {
974
+ ASSERT_EQ(wal_entry, Read());
975
+ }
969
976
  ASSERT_EQ("EOF", Read());
970
977
  }
971
978