@nxtedition/rocksdb 7.0.37 → 7.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/binding.cc +17 -26
  2. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
  3. package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
  4. package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
  14. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
  15. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
  23. package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
  24. package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
  25. package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
  26. package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
  27. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
  28. package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
  29. package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
  30. package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
  32. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
  33. package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
  34. package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
  35. package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
  36. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
  37. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
  40. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
  41. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
  42. package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
  43. package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
  44. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
  45. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
  46. package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
  47. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
  48. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
  49. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
  50. package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
  51. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
  52. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
  54. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
  55. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
  56. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
  57. package/index.js +2 -2
  58. package/iterator.js +1 -1
  59. package/package.json +1 -1
  60. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  61. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -161,8 +161,32 @@ class ChangeLevelConflictsWithAuto
161
161
  ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
162
162
  };
163
163
 
164
- namespace {
164
+ // Param = true: grab the compaction pressure token (enable
165
+ // parallel compactions)
166
+ // Param = false: Not grab the token (no parallel compactions)
167
+ class RoundRobinSubcompactionsAgainstPressureToken
168
+ : public DBCompactionTest,
169
+ public ::testing::WithParamInterface<bool> {
170
+ public:
171
+ RoundRobinSubcompactionsAgainstPressureToken() {
172
+ grab_pressure_token_ = GetParam();
173
+ }
174
+ bool grab_pressure_token_;
175
+ };
165
176
 
177
+ class RoundRobinSubcompactionsAgainstResources
178
+ : public DBCompactionTest,
179
+ public ::testing::WithParamInterface<std::tuple<int, int>> {
180
+ public:
181
+ RoundRobinSubcompactionsAgainstResources() {
182
+ total_low_pri_threads_ = std::get<0>(GetParam());
183
+ max_compaction_limits_ = std::get<1>(GetParam());
184
+ }
185
+ int total_low_pri_threads_;
186
+ int max_compaction_limits_;
187
+ };
188
+
189
+ namespace {
166
190
  class FlushedFileCollector : public EventListener {
167
191
  public:
168
192
  FlushedFileCollector() {}
@@ -5306,6 +5330,187 @@ TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
5306
5330
  }
5307
5331
  }
5308
5332
 
5333
+ TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
5334
+ const int kKeysPerBuffer = 100;
5335
+ Options options = CurrentOptions();
5336
+ options.num_levels = 4;
5337
+ options.max_bytes_for_level_multiplier = 2;
5338
+ options.level0_file_num_compaction_trigger = 4;
5339
+ options.target_file_size_base = kKeysPerBuffer * 1024;
5340
+ options.compaction_pri = CompactionPri::kRoundRobin;
5341
+ options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
5342
+ options.disable_auto_compactions = true;
5343
+ // Setup 7 threads but limited subcompactions so that
5344
+ // RoundRobin requires extra compactions from reserved threads
5345
+ options.max_subcompactions = 1;
5346
+ options.max_background_compactions = 7;
5347
+ options.max_compaction_bytes = 100000000;
5348
+ DestroyAndReopen(options);
5349
+ env_->SetBackgroundThreads(7, Env::LOW);
5350
+
5351
+ Random rnd(301);
5352
+ const std::vector<int> files_per_level = {0, 15, 25};
5353
+ for (int lvl = 2; lvl > 0; lvl--) {
5354
+ for (int i = 0; i < files_per_level[lvl]; i++) {
5355
+ for (int j = 0; j < kKeysPerBuffer; j++) {
5356
+ // Add (lvl-1) to ensure nearly equivalent number of files
5357
+ // in L2 are overlapped with files selected to compact from
5358
+ // L1
5359
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
5360
+ rnd.RandomString(1010)));
5361
+ }
5362
+ ASSERT_OK(Flush());
5363
+ }
5364
+ MoveFilesToLevel(lvl);
5365
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
5366
+ }
5367
+ // 15 files in L1; 25 files in L2
5368
+
5369
+ // This is a variable for making sure the following callback is called
5370
+ // and the assertions in it are indeed executed.
5371
+ bool num_planned_subcompactions_verified = false;
5372
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5373
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
5374
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
5375
+ if (grab_pressure_token_) {
5376
+ // 7 files are selected for round-robin under auto
5377
+ // compaction. The number of planned subcompaction is restricted by
5378
+ // the limited number of max_background_compactions
5379
+ ASSERT_EQ(num_planned_subcompactions, 7);
5380
+ } else {
5381
+ ASSERT_EQ(num_planned_subcompactions, 1);
5382
+ }
5383
+ num_planned_subcompactions_verified = true;
5384
+ });
5385
+
5386
+ // The following 3 dependencies have to be added to ensure the auto
5387
+ // compaction and the pressure token is correctly enabled. Same for
5388
+ // RoundRobinSubcompactionsUsingResources and
5389
+ // DBCompactionTest.RoundRobinSubcompactionsShrinkResources
5390
+ SyncPoint::GetInstance()->LoadDependency(
5391
+ {{"RoundRobinSubcompactionsAgainstPressureToken:0",
5392
+ "BackgroundCallCompaction:0"},
5393
+ {"CompactionJob::AcquireSubcompactionResources:0",
5394
+ "RoundRobinSubcompactionsAgainstPressureToken:1"},
5395
+ {"RoundRobinSubcompactionsAgainstPressureToken:2",
5396
+ "CompactionJob::AcquireSubcompactionResources:1"}});
5397
+ SyncPoint::GetInstance()->EnableProcessing();
5398
+
5399
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
5400
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
5401
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
5402
+ std::unique_ptr<WriteControllerToken> pressure_token;
5403
+ if (grab_pressure_token_) {
5404
+ pressure_token =
5405
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
5406
+ }
5407
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
5408
+
5409
+ ASSERT_OK(dbfull()->WaitForCompact());
5410
+ ASSERT_TRUE(num_planned_subcompactions_verified);
5411
+ SyncPoint::GetInstance()->DisableProcessing();
5412
+ SyncPoint::GetInstance()->ClearAllCallBacks();
5413
+ }
5414
+
5415
+ INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
5416
+ RoundRobinSubcompactionsAgainstPressureToken,
5417
+ testing::Bool());
5418
+
5419
+ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
5420
+ const int kKeysPerBuffer = 200;
5421
+ Options options = CurrentOptions();
5422
+ options.num_levels = 4;
5423
+ options.level0_file_num_compaction_trigger = 3;
5424
+ options.target_file_size_base = kKeysPerBuffer * 1024;
5425
+ options.compaction_pri = CompactionPri::kRoundRobin;
5426
+ options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
5427
+ options.disable_auto_compactions = true;
5428
+ options.max_subcompactions = 1;
5429
+ options.max_background_compactions = max_compaction_limits_;
5430
+ // Set a large number for max_compaction_bytes so that one round-robin
5431
+ // compaction is enough to make post-compaction L1 size less than
5432
+ // the maximum size (this test assumes only one round-robin compaction
5433
+ // is triggered by kLevelMaxLevelSize)
5434
+ options.max_compaction_bytes = 100000000;
5435
+
5436
+ DestroyAndReopen(options);
5437
+ env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
5438
+
5439
+ Random rnd(301);
5440
+ const std::vector<int> files_per_level = {0, 40, 100};
5441
+ for (int lvl = 2; lvl > 0; lvl--) {
5442
+ for (int i = 0; i < files_per_level[lvl]; i++) {
5443
+ for (int j = 0; j < kKeysPerBuffer; j++) {
5444
+ // Add (lvl-1) to ensure nearly equivalent number of files
5445
+ // in L2 are overlapped with files selected to compact from
5446
+ // L1
5447
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
5448
+ rnd.RandomString(1010)));
5449
+ }
5450
+ ASSERT_OK(Flush());
5451
+ }
5452
+ MoveFilesToLevel(lvl);
5453
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
5454
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
5455
+ }
5456
+
5457
+ // 40 files in L1; 100 files in L2
5458
+ // This is a variable for making sure the following callback is called
5459
+ // and the assertions in it are indeed executed.
5460
+ bool num_planned_subcompactions_verified = false;
5461
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
5462
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
5463
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
5464
+ // More than 10 files are selected for round-robin under auto
5465
+ // compaction. The number of planned subcompaction is restricted by
5466
+ // the minimum number between available threads and compaction limits
5467
+ ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
5468
+ std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
5469
+ num_planned_subcompactions_verified = true;
5470
+ });
5471
+ SyncPoint::GetInstance()->LoadDependency(
5472
+ {{"RoundRobinSubcompactionsAgainstResources:0",
5473
+ "BackgroundCallCompaction:0"},
5474
+ {"CompactionJob::AcquireSubcompactionResources:0",
5475
+ "RoundRobinSubcompactionsAgainstResources:1"},
5476
+ {"RoundRobinSubcompactionsAgainstResources:2",
5477
+ "CompactionJob::AcquireSubcompactionResources:1"},
5478
+ {"CompactionJob::ReleaseSubcompactionResources:0",
5479
+ "RoundRobinSubcompactionsAgainstResources:3"},
5480
+ {"RoundRobinSubcompactionsAgainstResources:4",
5481
+ "CompactionJob::ReleaseSubcompactionResources:1"}});
5482
+ SyncPoint::GetInstance()->EnableProcessing();
5483
+
5484
+ ASSERT_OK(dbfull()->WaitForCompact());
5485
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
5486
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
5487
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
5488
+ auto pressure_token =
5489
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
5490
+
5491
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
5492
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
5493
+ // We can reserve more threads now except one is being used
5494
+ ASSERT_EQ(total_low_pri_threads_ - 1,
5495
+ env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
5496
+ ASSERT_EQ(
5497
+ total_low_pri_threads_ - 1,
5498
+ env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
5499
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
5500
+ ASSERT_OK(dbfull()->WaitForCompact());
5501
+ ASSERT_TRUE(num_planned_subcompactions_verified);
5502
+ SyncPoint::GetInstance()->DisableProcessing();
5503
+ SyncPoint::GetInstance()->ClearAllCallBacks();
5504
+ }
5505
+
5506
+ INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
5507
+ RoundRobinSubcompactionsAgainstResources,
5508
+ ::testing::Values(std::make_tuple(1, 5),
5509
+ std::make_tuple(5, 1),
5510
+ std::make_tuple(10, 5),
5511
+ std::make_tuple(5, 10),
5512
+ std::make_tuple(10, 10)));
5513
+
5309
5514
  TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
5310
5515
  Options options = CurrentOptions();
5311
5516
  options.num_levels = 3;
@@ -5659,18 +5864,10 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
5659
5864
  for (int j = 0; j != kNumKeysPerFile; ++j) {
5660
5865
  ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
5661
5866
  }
5662
- if (0 == i) {
5663
- // When we reach here, the memtables have kNumKeysPerFile keys. Note that
5664
- // flush is not yet triggered. We need to write an extra key so that the
5665
- // write path will call PreprocessWrite and flush the previous key-value
5666
- // pairs to e flushed. After that, there will be the newest key in the
5667
- // memtable, and a bunch of L0 files. Since there is already one key in
5668
- // the memtable, then for i = 1, 2, ..., we do not have to write this
5669
- // extra key to trigger flush.
5670
- ASSERT_OK(Put("", ""));
5867
+ if (i > 0) {
5868
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5869
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
5671
5870
  }
5672
- ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
5673
- ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
5674
5871
  }
5675
5872
  // When we reach this point, there will be level0_stop_writes_trigger L0
5676
5873
  // files and one extra key (99) in memory, which overlaps with the external
@@ -185,7 +185,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
185
185
  log_dir_synced_(false),
186
186
  log_empty_(true),
187
187
  persist_stats_cf_handle_(nullptr),
188
- log_sync_cv_(&mutex_),
188
+ log_sync_cv_(&log_write_mutex_),
189
189
  total_log_size_(0),
190
190
  is_snapshot_supported_(true),
191
191
  write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
@@ -273,6 +273,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
273
273
  mutable_db_options_.Dump(immutable_db_options_.info_log.get());
274
274
  DumpSupportInfo(immutable_db_options_.info_log.get());
275
275
 
276
+ max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
277
+ std::memory_order_relaxed);
276
278
  if (write_buffer_manager_) {
277
279
  wbm_stall_.reset(new WBMStallInterface());
278
280
  }
@@ -625,26 +627,28 @@ Status DBImpl::CloseHelper() {
625
627
  job_context.Clean();
626
628
  mutex_.Lock();
627
629
  }
628
-
629
- for (auto l : logs_to_free_) {
630
- delete l;
631
- }
632
- for (auto& log : logs_) {
633
- uint64_t log_number = log.writer->get_log_number();
634
- Status s = log.ClearWriter();
635
- if (!s.ok()) {
636
- ROCKS_LOG_WARN(
637
- immutable_db_options_.info_log,
638
- "Unable to Sync WAL file %s with error -- %s",
639
- LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
640
- s.ToString().c_str());
641
- // Retain the first error
642
- if (ret.ok()) {
643
- ret = s;
630
+ {
631
+ InstrumentedMutexLock lock(&log_write_mutex_);
632
+ for (auto l : logs_to_free_) {
633
+ delete l;
634
+ }
635
+ for (auto& log : logs_) {
636
+ uint64_t log_number = log.writer->get_log_number();
637
+ Status s = log.ClearWriter();
638
+ if (!s.ok()) {
639
+ ROCKS_LOG_WARN(
640
+ immutable_db_options_.info_log,
641
+ "Unable to Sync WAL file %s with error -- %s",
642
+ LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
643
+ s.ToString().c_str());
644
+ // Retain the first error
645
+ if (ret.ok()) {
646
+ ret = s;
647
+ }
644
648
  }
645
649
  }
650
+ logs_.clear();
646
651
  }
647
- logs_.clear();
648
652
 
649
653
  // Table cache may have table handles holding blocks from the block cache.
650
654
  // We need to release them before the block cache is destroyed. The block
@@ -1108,6 +1112,7 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
1108
1112
  }
1109
1113
 
1110
1114
  void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
1115
+ mutex_.AssertHeld();
1111
1116
  if (!job_context->logs_to_free.empty()) {
1112
1117
  for (auto l : job_context->logs_to_free) {
1113
1118
  AddToLogsToFreeQueue(l);
@@ -1285,6 +1290,11 @@ Status DBImpl::SetDBOptions(
1285
1290
  new_options.stats_persist_period_sec);
1286
1291
  mutex_.Lock();
1287
1292
  }
1293
+ if (new_options.max_total_wal_size !=
1294
+ mutable_db_options_.max_total_wal_size) {
1295
+ max_total_wal_size_.store(new_options.max_total_wal_size,
1296
+ std::memory_order_release);
1297
+ }
1288
1298
  write_controller_.set_max_delayed_write_rate(
1289
1299
  new_options.delayed_write_rate);
1290
1300
  table_cache_.get()->SetCapacity(new_options.max_open_files == -1
@@ -1405,7 +1415,7 @@ Status DBImpl::SyncWAL() {
1405
1415
  uint64_t current_log_number;
1406
1416
 
1407
1417
  {
1408
- InstrumentedMutexLock l(&mutex_);
1418
+ InstrumentedMutexLock l(&log_write_mutex_);
1409
1419
  assert(!logs_.empty());
1410
1420
 
1411
1421
  // This SyncWAL() call only cares about logs up to this number.
@@ -1462,19 +1472,37 @@ Status DBImpl::SyncWAL() {
1462
1472
  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
1463
1473
 
1464
1474
  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
1475
+ VersionEdit synced_wals;
1465
1476
  {
1466
- InstrumentedMutexLock l(&mutex_);
1477
+ InstrumentedMutexLock l(&log_write_mutex_);
1467
1478
  if (status.ok()) {
1468
- status = MarkLogsSynced(current_log_number, need_log_dir_sync);
1479
+ MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
1469
1480
  } else {
1470
1481
  MarkLogsNotSynced(current_log_number);
1471
1482
  }
1472
1483
  }
1484
+ if (status.ok() && synced_wals.IsWalAddition()) {
1485
+ InstrumentedMutexLock l(&mutex_);
1486
+ status = ApplyWALToManifest(&synced_wals);
1487
+ }
1488
+
1473
1489
  TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
1474
1490
 
1475
1491
  return status;
1476
1492
  }
1477
1493
 
1494
+ Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
1495
+ // not empty, write to MANIFEST.
1496
+ mutex_.AssertHeld();
1497
+ Status status =
1498
+ versions_->LogAndApplyToDefaultColumnFamily(synced_wals, &mutex_);
1499
+ if (!status.ok() && versions_->io_status().IsIOError()) {
1500
+ status = error_handler_.SetBGError(versions_->io_status(),
1501
+ BackgroundErrorReason::kManifestWrite);
1502
+ }
1503
+ return status;
1504
+ }
1505
+
1478
1506
  Status DBImpl::LockWAL() {
1479
1507
  log_write_mutex_.Lock();
1480
1508
  auto cur_log_writer = logs_.back().writer;
@@ -1494,12 +1522,12 @@ Status DBImpl::UnlockWAL() {
1494
1522
  return Status::OK();
1495
1523
  }
1496
1524
 
1497
- Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
1498
- mutex_.AssertHeld();
1525
+ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
1526
+ VersionEdit* synced_wals) {
1527
+ log_write_mutex_.AssertHeld();
1499
1528
  if (synced_dir && logfile_number_ == up_to) {
1500
1529
  log_dir_synced_ = true;
1501
1530
  }
1502
- VersionEdit synced_wals;
1503
1531
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
1504
1532
  auto& wal = *it;
1505
1533
  assert(wal.IsSyncing());
@@ -1507,11 +1535,9 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
1507
1535
  if (logs_.size() > 1) {
1508
1536
  if (immutable_db_options_.track_and_verify_wals_in_manifest &&
1509
1537
  wal.GetPreSyncSize() > 0) {
1510
- synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1538
+ synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1511
1539
  }
1512
1540
  logs_to_free_.push_back(wal.ReleaseWriter());
1513
- // To modify logs_ both mutex_ and log_write_mutex_ must be held
1514
- InstrumentedMutexLock l(&log_write_mutex_);
1515
1541
  it = logs_.erase(it);
1516
1542
  } else {
1517
1543
  wal.FinishSync();
@@ -1520,22 +1546,11 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
1520
1546
  }
1521
1547
  assert(logs_.empty() || logs_[0].number > up_to ||
1522
1548
  (logs_.size() == 1 && !logs_[0].IsSyncing()));
1523
-
1524
- Status s;
1525
- if (synced_wals.IsWalAddition()) {
1526
- // not empty, write to MANIFEST.
1527
- s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_);
1528
- if (!s.ok() && versions_->io_status().IsIOError()) {
1529
- s = error_handler_.SetBGError(versions_->io_status(),
1530
- BackgroundErrorReason::kManifestWrite);
1531
- }
1532
- }
1533
1549
  log_sync_cv_.SignalAll();
1534
- return s;
1535
1550
  }
1536
1551
 
1537
1552
  void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
1538
- mutex_.AssertHeld();
1553
+ log_write_mutex_.AssertHeld();
1539
1554
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
1540
1555
  ++it) {
1541
1556
  auto& wal = *it;
@@ -998,6 +998,7 @@ class DBImpl : public DB {
998
998
  }
999
999
 
1000
1000
  void AddToLogsToFreeQueue(log::Writer* log_writer) {
1001
+ mutex_.AssertHeld();
1001
1002
  logs_to_free_queue_.push_back(log_writer);
1002
1003
  }
1003
1004
 
@@ -1298,7 +1299,7 @@ class DBImpl : public DB {
1298
1299
 
1299
1300
  // only used for dynamically adjusting max_total_wal_size. it is a sum of
1300
1301
  // [write_buffer_size * max_write_buffer_number] over all column families
1301
- uint64_t max_total_in_memory_state_;
1302
+ std::atomic<uint64_t> max_total_in_memory_state_;
1302
1303
 
1303
1304
  // The options to access storage files
1304
1305
  const FileOptions file_options_;
@@ -1648,6 +1649,15 @@ class DBImpl : public DB {
1648
1649
  uint64_t pre_sync_size = 0;
1649
1650
  };
1650
1651
 
1652
+ struct LogContext {
1653
+ explicit LogContext(bool need_sync = false)
1654
+ : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
1655
+ bool need_log_sync = false;
1656
+ bool need_log_dir_sync = false;
1657
+ log::Writer* writer = nullptr;
1658
+ LogFileNumberSize* log_file_number_size = nullptr;
1659
+ };
1660
+
1651
1661
  // PurgeFileInfo is a structure to hold information of files to be deleted in
1652
1662
  // purge_files_
1653
1663
  struct PurgeFileInfo {
@@ -1801,7 +1811,7 @@ class DBImpl : public DB {
1801
1811
  void ReleaseFileNumberFromPendingOutputs(
1802
1812
  std::unique_ptr<std::list<uint64_t>::iterator>& v);
1803
1813
 
1804
- IOStatus SyncClosedLogs(JobContext* job_context);
1814
+ IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
1805
1815
 
1806
1816
  // Flush the in-memory write buffer to storage. Switches to a new
1807
1817
  // log-file/memtable and writes a new descriptor iff successful. Then
@@ -1961,8 +1971,8 @@ class DBImpl : public DB {
1961
1971
  Status HandleWriteBufferManagerFlush(WriteContext* write_context);
1962
1972
 
1963
1973
  // REQUIRES: mutex locked
1964
- Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync,
1965
- WriteContext* write_context);
1974
+ Status PreprocessWrite(const WriteOptions& write_options,
1975
+ LogContext* log_context, WriteContext* write_context);
1966
1976
 
1967
1977
  // Merge write batches in the write group into merged_batch.
1968
1978
  // Returns OK if merge is successful.
@@ -2101,7 +2111,8 @@ class DBImpl : public DB {
2101
2111
  std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
2102
2112
 
2103
2113
  // helper function to call after some of the logs_ were synced
2104
- Status MarkLogsSynced(uint64_t up_to, bool synced_dir);
2114
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
2115
+ Status ApplyWALToManifest(VersionEdit* edit);
2105
2116
  // WALs with log number up to up_to are not synced successfully.
2106
2117
  void MarkLogsNotSynced(uint64_t up_to);
2107
2118
 
@@ -2307,8 +2318,9 @@ class DBImpl : public DB {
2307
2318
  // logfile_number_ is currently updated only in write_thread_, it can be read
2308
2319
  // from the same write_thread_ without any locks.
2309
2320
  uint64_t logfile_number_;
2310
- std::deque<uint64_t>
2311
- log_recycle_files_; // a list of log files that we can recycle
2321
+ // Log files that we can recycle. Must be protected by db mutex_.
2322
+ std::deque<uint64_t> log_recycle_files_;
2323
+ // Protected by log_write_mutex_.
2312
2324
  bool log_dir_synced_;
2313
2325
  // Without two_write_queues, read and writes to log_empty_ are protected by
2314
2326
  // mutex_. Since it is currently updated/read only in write_thread_, it can be
@@ -2322,26 +2334,93 @@ class DBImpl : public DB {
2322
2334
 
2323
2335
  bool persistent_stats_cfd_exists_ = true;
2324
2336
 
2325
- // Without two_write_queues, read and writes to alive_log_files_ are
2326
- // protected by mutex_. With two_write_queues_, writes
2327
- // are protected by locking both mutex_ and log_write_mutex_, and reads must
2328
- // be under either mutex_ or log_write_mutex_.
2337
+ // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
2338
+ // as follows:
2339
+ // 1. read by FindObsoleteFiles() which can be called in either application
2340
+ // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
2341
+ // held.
2342
+ // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
2343
+ // are held.
2344
+ // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
2345
+ // (actually called by Open()), only mutex_ is held because at this point,
2346
+ // the DB::Open() call has not returned success to application, and the
2347
+ // only other thread(s) that can conflict are bg threads calling
2348
+ // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
2349
+ // are held when accessing alive_log_files_.
2350
+ // 4. read by DBImpl::Open() is protected by mutex_.
2351
+ // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
2352
+ // held. This is done by the write group leader. Note that in the case of
2353
+ // two-write-queues, another WAL-only write thread can be writing to the
2354
+ // WAL concurrently. See 9.
2355
+ // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
2356
+ // done by write group leader.
2357
+ // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
2358
+ // two-write-queues. Only log_write_mutex_ is held to protect concurrent
2359
+ // pop_front() by FindObsoleteFiles().
2360
+ // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
2361
+ // is held to protect the data structure from concurrent pop_front() by
2362
+ // FindObsoleteFiles().
2363
+ // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
2364
+ // of two-write-queues. Only log_write_mutex_ is held. This suffices to
2365
+ // protect the data structure from concurrent push_back() by current
2366
+ // write group leader as well as pop_front() by FindObsoleteFiles().
2329
2367
  std::deque<LogFileNumberSize> alive_log_files_;
2330
2368
 
2331
2369
  // Log files that aren't fully synced, and the current log file.
2332
2370
  // Synchronization:
2333
- // - push_back() is done from write_thread_ with locked mutex_ and
2334
- // log_write_mutex_
2335
- // - pop_front() is done from any thread with locked mutex_ and
2336
- // log_write_mutex_
2337
- // - reads are done with either locked mutex_ or log_write_mutex_
2371
+ // 1. read by FindObsoleteFiles() which can be called either in application
2372
+ // thread or RocksDB bg threads. log_write_mutex_ is always held, while
2373
+ // some reads are performed without mutex_.
2374
+ // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
2375
+ // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
2376
+ // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
2377
+ // Note that at this point, DB::Open() has not returned success to
2378
+ // application, thus the only other thread(s) that can conflict are bg
2379
+ // threads calling FindObsoleteFiles(). See 1.
2380
+ // 5. iteration and clear() from CloseHelper() always hold log_write_mutex
2381
+ // and mutex_.
2382
+ // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
2383
+ // log_write_mutex_. These two can be called by application threads after
2384
+ // DB::Open() returns success to applications.
2385
+ // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
2386
+ // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
2387
+ // log_write_mutex_.
2388
+ // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
2389
+ // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
2390
+ // happen in bg flush threads after DB::Open() returns success to
2391
+ // applications.
2392
+ // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
2393
+ // holds only the log_write_mutex_. This is done by the write group
2394
+ // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
2395
+ // can happen concurrently. This is fine because log_write_mutex_ is used
2396
+ // by all parties. See 2, 5, 9.
2397
+ // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
2398
+ // log_write_mutex_. This happens in the write group leader.
2399
+ // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
2400
+ // log_write_mutex_. This happens in the write group leader. Can conflict
2401
+ // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
2402
+ // SyncClosedLogs(), etc. as well as application threads calling
2403
+ // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
2404
+ // require at least log_write_mutex_.
2405
+ // 14. iteration called in WriteToWAL(write_group) protected by
2406
+ // log_write_mutex_. This is done by write group leader when
2407
+ // two-write-queues is disabled and write needs to sync logs.
2408
+ // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
2409
+ // This can be done by the write group leader if two-write-queues is
2410
+ // enabled. It can also be done by another WAL-only write thread.
2411
+ //
2412
+ // Other observations:
2338
2413
  // - back() and items with getting_synced=true are not popped,
2339
2414
  // - The same thread that sets getting_synced=true will reset it.
2340
2415
  // - it follows that the object referred by back() can be safely read from
2341
- // the write_thread_ without using mutex
2416
+ // the write_thread_ without using mutex. Note that calling back() without
2417
+ // mutex may be unsafe because different implementations of deque::back() may
2418
+ // access other member variables of deque, causing undefined behaviors.
2419
+ // Generally, do not access stl containers without proper synchronization.
2342
2420
  // - it follows that the items with getting_synced=true can be safely read
2343
2421
  // from the same thread that has set getting_synced=true
2344
2422
  std::deque<LogWriterNumber> logs_;
2423
+
2345
2424
  // Signaled when getting_synced becomes false for some of the logs_.
2346
2425
  InstrumentedCondVar log_sync_cv_;
2347
2426
  // This is the app-level state that is written to the WAL but will be used
@@ -2356,7 +2435,7 @@ class DBImpl : public DB {
2356
2435
  std::atomic<uint64_t> total_log_size_;
2357
2436
 
2358
2437
  // If this is non-empty, we need to delete these log files in background
2359
- // threads. Protected by db mutex.
2438
+ // threads. Protected by log_write_mutex_.
2360
2439
  autovector<log::Writer*> logs_to_free_;
2361
2440
 
2362
2441
  bool is_snapshot_supported_;
@@ -2436,10 +2515,13 @@ class DBImpl : public DB {
2436
2515
  // JobContext. Current implementation tracks table and blob files only.
2437
2516
  std::unordered_set<uint64_t> files_grabbed_for_purge_;
2438
2517
 
2439
- // A queue to store log writers to close
2518
+ // A queue to store log writers to close. Protected by db mutex_.
2440
2519
  std::deque<log::Writer*> logs_to_free_queue_;
2520
+
2441
2521
  std::deque<SuperVersion*> superversions_to_free_queue_;
2522
+
2442
2523
  int unscheduled_flushes_;
2524
+
2443
2525
  int unscheduled_compactions_;
2444
2526
 
2445
2527
  // count how many background compactions are running or have been scheduled in
@@ -2592,6 +2674,7 @@ class DBImpl : public DB {
2592
2674
  InstrumentedCondVar atomic_flush_install_cv_;
2593
2675
 
2594
2676
  bool wal_in_db_path_;
2677
+ std::atomic<uint64_t> max_total_wal_size_;
2595
2678
 
2596
2679
  BlobFileCompletionCallback blob_callback_;
2597
2680