@nxtedition/rocksdb 7.0.37 → 7.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +17 -26
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
- package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
- package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
- package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
- package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
- package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
- package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
- package/index.js +2 -2
- package/iterator.js +1 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -161,8 +161,32 @@ class ChangeLevelConflictsWithAuto
|
|
|
161
161
|
ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
|
|
162
162
|
};
|
|
163
163
|
|
|
164
|
-
|
|
164
|
+
// Param = true: grab the compaction pressure token (enable
|
|
165
|
+
// parallel compactions)
|
|
166
|
+
// Param = false: Do not grab the token (no parallel compactions)
|
|
167
|
+
class RoundRobinSubcompactionsAgainstPressureToken
|
|
168
|
+
: public DBCompactionTest,
|
|
169
|
+
public ::testing::WithParamInterface<bool> {
|
|
170
|
+
public:
|
|
171
|
+
RoundRobinSubcompactionsAgainstPressureToken() {
|
|
172
|
+
grab_pressure_token_ = GetParam();
|
|
173
|
+
}
|
|
174
|
+
bool grab_pressure_token_;
|
|
175
|
+
};
|
|
165
176
|
|
|
177
|
+
class RoundRobinSubcompactionsAgainstResources
|
|
178
|
+
: public DBCompactionTest,
|
|
179
|
+
public ::testing::WithParamInterface<std::tuple<int, int>> {
|
|
180
|
+
public:
|
|
181
|
+
RoundRobinSubcompactionsAgainstResources() {
|
|
182
|
+
total_low_pri_threads_ = std::get<0>(GetParam());
|
|
183
|
+
max_compaction_limits_ = std::get<1>(GetParam());
|
|
184
|
+
}
|
|
185
|
+
int total_low_pri_threads_;
|
|
186
|
+
int max_compaction_limits_;
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
namespace {
|
|
166
190
|
class FlushedFileCollector : public EventListener {
|
|
167
191
|
public:
|
|
168
192
|
FlushedFileCollector() {}
|
|
@@ -5306,6 +5330,187 @@ TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
|
|
|
5306
5330
|
}
|
|
5307
5331
|
}
|
|
5308
5332
|
|
|
5333
|
+
TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
|
|
5334
|
+
const int kKeysPerBuffer = 100;
|
|
5335
|
+
Options options = CurrentOptions();
|
|
5336
|
+
options.num_levels = 4;
|
|
5337
|
+
options.max_bytes_for_level_multiplier = 2;
|
|
5338
|
+
options.level0_file_num_compaction_trigger = 4;
|
|
5339
|
+
options.target_file_size_base = kKeysPerBuffer * 1024;
|
|
5340
|
+
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
5341
|
+
options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
|
|
5342
|
+
options.disable_auto_compactions = true;
|
|
5343
|
+
// Setup 7 threads but limited subcompactions so that
|
|
5344
|
+
// RoundRobin requires extra compactions from reserved threads
|
|
5345
|
+
options.max_subcompactions = 1;
|
|
5346
|
+
options.max_background_compactions = 7;
|
|
5347
|
+
options.max_compaction_bytes = 100000000;
|
|
5348
|
+
DestroyAndReopen(options);
|
|
5349
|
+
env_->SetBackgroundThreads(7, Env::LOW);
|
|
5350
|
+
|
|
5351
|
+
Random rnd(301);
|
|
5352
|
+
const std::vector<int> files_per_level = {0, 15, 25};
|
|
5353
|
+
for (int lvl = 2; lvl > 0; lvl--) {
|
|
5354
|
+
for (int i = 0; i < files_per_level[lvl]; i++) {
|
|
5355
|
+
for (int j = 0; j < kKeysPerBuffer; j++) {
|
|
5356
|
+
// Add (lvl-1) to ensure nearly equivalent number of files
|
|
5357
|
+
// in L2 are overlapped with files selected to compact from
|
|
5358
|
+
// L1
|
|
5359
|
+
ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
|
|
5360
|
+
rnd.RandomString(1010)));
|
|
5361
|
+
}
|
|
5362
|
+
ASSERT_OK(Flush());
|
|
5363
|
+
}
|
|
5364
|
+
MoveFilesToLevel(lvl);
|
|
5365
|
+
ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
|
|
5366
|
+
}
|
|
5367
|
+
// 15 files in L1; 25 files in L2
|
|
5368
|
+
|
|
5369
|
+
// This is a variable for making sure the following callback is called
|
|
5370
|
+
// and the assertions in it are indeed executed.
|
|
5371
|
+
bool num_planned_subcompactions_verified = false;
|
|
5372
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
5373
|
+
"CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
|
|
5374
|
+
uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
|
|
5375
|
+
if (grab_pressure_token_) {
|
|
5376
|
+
// 7 files are selected for round-robin under auto
|
|
5377
|
+
// compaction. The number of planned subcompaction is restricted by
|
|
5378
|
+
// the limited number of max_background_compactions
|
|
5379
|
+
ASSERT_EQ(num_planned_subcompactions, 7);
|
|
5380
|
+
} else {
|
|
5381
|
+
ASSERT_EQ(num_planned_subcompactions, 1);
|
|
5382
|
+
}
|
|
5383
|
+
num_planned_subcompactions_verified = true;
|
|
5384
|
+
});
|
|
5385
|
+
|
|
5386
|
+
// The following 3 dependencies have to be added to ensure the auto
|
|
5387
|
+
// compaction and the pressure token is correctly enabled. Same for
|
|
5388
|
+
// RoundRobinSubcompactionsUsingResources and
|
|
5389
|
+
// DBCompactionTest.RoundRobinSubcompactionsShrinkResources
|
|
5390
|
+
SyncPoint::GetInstance()->LoadDependency(
|
|
5391
|
+
{{"RoundRobinSubcompactionsAgainstPressureToken:0",
|
|
5392
|
+
"BackgroundCallCompaction:0"},
|
|
5393
|
+
{"CompactionJob::AcquireSubcompactionResources:0",
|
|
5394
|
+
"RoundRobinSubcompactionsAgainstPressureToken:1"},
|
|
5395
|
+
{"RoundRobinSubcompactionsAgainstPressureToken:2",
|
|
5396
|
+
"CompactionJob::AcquireSubcompactionResources:1"}});
|
|
5397
|
+
SyncPoint::GetInstance()->EnableProcessing();
|
|
5398
|
+
|
|
5399
|
+
ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
|
|
5400
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
|
|
5401
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
|
|
5402
|
+
std::unique_ptr<WriteControllerToken> pressure_token;
|
|
5403
|
+
if (grab_pressure_token_) {
|
|
5404
|
+
pressure_token =
|
|
5405
|
+
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
|
5406
|
+
}
|
|
5407
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
|
|
5408
|
+
|
|
5409
|
+
ASSERT_OK(dbfull()->WaitForCompact());
|
|
5410
|
+
ASSERT_TRUE(num_planned_subcompactions_verified);
|
|
5411
|
+
SyncPoint::GetInstance()->DisableProcessing();
|
|
5412
|
+
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
5413
|
+
}
|
|
5414
|
+
|
|
5415
|
+
INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
|
|
5416
|
+
RoundRobinSubcompactionsAgainstPressureToken,
|
|
5417
|
+
testing::Bool());
|
|
5418
|
+
|
|
5419
|
+
TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
|
|
5420
|
+
const int kKeysPerBuffer = 200;
|
|
5421
|
+
Options options = CurrentOptions();
|
|
5422
|
+
options.num_levels = 4;
|
|
5423
|
+
options.level0_file_num_compaction_trigger = 3;
|
|
5424
|
+
options.target_file_size_base = kKeysPerBuffer * 1024;
|
|
5425
|
+
options.compaction_pri = CompactionPri::kRoundRobin;
|
|
5426
|
+
options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
|
|
5427
|
+
options.disable_auto_compactions = true;
|
|
5428
|
+
options.max_subcompactions = 1;
|
|
5429
|
+
options.max_background_compactions = max_compaction_limits_;
|
|
5430
|
+
// Set a large number for max_compaction_bytes so that one round-robin
|
|
5431
|
+
// compaction is enough to make post-compaction L1 size less than
|
|
5432
|
+
// the maximum size (this test assumes only one round-robin compaction
|
|
5433
|
+
// is triggered by kLevelMaxLevelSize)
|
|
5434
|
+
options.max_compaction_bytes = 100000000;
|
|
5435
|
+
|
|
5436
|
+
DestroyAndReopen(options);
|
|
5437
|
+
env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
|
|
5438
|
+
|
|
5439
|
+
Random rnd(301);
|
|
5440
|
+
const std::vector<int> files_per_level = {0, 40, 100};
|
|
5441
|
+
for (int lvl = 2; lvl > 0; lvl--) {
|
|
5442
|
+
for (int i = 0; i < files_per_level[lvl]; i++) {
|
|
5443
|
+
for (int j = 0; j < kKeysPerBuffer; j++) {
|
|
5444
|
+
// Add (lvl-1) to ensure nearly equivalent number of files
|
|
5445
|
+
// in L2 are overlapped with files selected to compact from
|
|
5446
|
+
// L1
|
|
5447
|
+
ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
|
|
5448
|
+
rnd.RandomString(1010)));
|
|
5449
|
+
}
|
|
5450
|
+
ASSERT_OK(Flush());
|
|
5451
|
+
}
|
|
5452
|
+
MoveFilesToLevel(lvl);
|
|
5453
|
+
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
5454
|
+
ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
|
|
5455
|
+
}
|
|
5456
|
+
|
|
5457
|
+
// 40 files in L1; 100 files in L2
|
|
5458
|
+
// This is a variable for making sure the following callback is called
|
|
5459
|
+
// and the assertions in it are indeed executed.
|
|
5460
|
+
bool num_planned_subcompactions_verified = false;
|
|
5461
|
+
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
5462
|
+
"CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
|
|
5463
|
+
uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
|
|
5464
|
+
// More than 10 files are selected for round-robin under auto
|
|
5465
|
+
// compaction. The number of planned subcompaction is restricted by
|
|
5466
|
+
// the minimum number between available threads and compaction limits
|
|
5467
|
+
ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
|
|
5468
|
+
std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
|
|
5469
|
+
num_planned_subcompactions_verified = true;
|
|
5470
|
+
});
|
|
5471
|
+
SyncPoint::GetInstance()->LoadDependency(
|
|
5472
|
+
{{"RoundRobinSubcompactionsAgainstResources:0",
|
|
5473
|
+
"BackgroundCallCompaction:0"},
|
|
5474
|
+
{"CompactionJob::AcquireSubcompactionResources:0",
|
|
5475
|
+
"RoundRobinSubcompactionsAgainstResources:1"},
|
|
5476
|
+
{"RoundRobinSubcompactionsAgainstResources:2",
|
|
5477
|
+
"CompactionJob::AcquireSubcompactionResources:1"},
|
|
5478
|
+
{"CompactionJob::ReleaseSubcompactionResources:0",
|
|
5479
|
+
"RoundRobinSubcompactionsAgainstResources:3"},
|
|
5480
|
+
{"RoundRobinSubcompactionsAgainstResources:4",
|
|
5481
|
+
"CompactionJob::ReleaseSubcompactionResources:1"}});
|
|
5482
|
+
SyncPoint::GetInstance()->EnableProcessing();
|
|
5483
|
+
|
|
5484
|
+
ASSERT_OK(dbfull()->WaitForCompact());
|
|
5485
|
+
ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
|
|
5486
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
|
|
5487
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
|
|
5488
|
+
auto pressure_token =
|
|
5489
|
+
dbfull()->TEST_write_controler().GetCompactionPressureToken();
|
|
5490
|
+
|
|
5491
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
|
|
5492
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
|
|
5493
|
+
// We can reserve more threads now except one is being used
|
|
5494
|
+
ASSERT_EQ(total_low_pri_threads_ - 1,
|
|
5495
|
+
env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
|
|
5496
|
+
ASSERT_EQ(
|
|
5497
|
+
total_low_pri_threads_ - 1,
|
|
5498
|
+
env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
|
|
5499
|
+
TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
|
|
5500
|
+
ASSERT_OK(dbfull()->WaitForCompact());
|
|
5501
|
+
ASSERT_TRUE(num_planned_subcompactions_verified);
|
|
5502
|
+
SyncPoint::GetInstance()->DisableProcessing();
|
|
5503
|
+
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
5504
|
+
}
|
|
5505
|
+
|
|
5506
|
+
INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
|
|
5507
|
+
RoundRobinSubcompactionsAgainstResources,
|
|
5508
|
+
::testing::Values(std::make_tuple(1, 5),
|
|
5509
|
+
std::make_tuple(5, 1),
|
|
5510
|
+
std::make_tuple(10, 5),
|
|
5511
|
+
std::make_tuple(5, 10),
|
|
5512
|
+
std::make_tuple(10, 10)));
|
|
5513
|
+
|
|
5309
5514
|
TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
|
|
5310
5515
|
Options options = CurrentOptions();
|
|
5311
5516
|
options.num_levels = 3;
|
|
@@ -5659,18 +5864,10 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
|
|
|
5659
5864
|
for (int j = 0; j != kNumKeysPerFile; ++j) {
|
|
5660
5865
|
ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
|
|
5661
5866
|
}
|
|
5662
|
-
if (
|
|
5663
|
-
|
|
5664
|
-
|
|
5665
|
-
// write path will call PreprocessWrite and flush the previous key-value
|
|
5666
|
-
// pairs to e flushed. After that, there will be the newest key in the
|
|
5667
|
-
// memtable, and a bunch of L0 files. Since there is already one key in
|
|
5668
|
-
// the memtable, then for i = 1, 2, ..., we do not have to write this
|
|
5669
|
-
// extra key to trigger flush.
|
|
5670
|
-
ASSERT_OK(Put("", ""));
|
|
5867
|
+
if (i > 0) {
|
|
5868
|
+
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
5869
|
+
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
|
|
5671
5870
|
}
|
|
5672
|
-
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
5673
|
-
ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1);
|
|
5674
5871
|
}
|
|
5675
5872
|
// When we reach this point, there will be level0_stop_writes_trigger L0
|
|
5676
5873
|
// files and one extra key (99) in memory, which overlaps with the external
|
|
@@ -185,7 +185,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
185
185
|
log_dir_synced_(false),
|
|
186
186
|
log_empty_(true),
|
|
187
187
|
persist_stats_cf_handle_(nullptr),
|
|
188
|
-
log_sync_cv_(&
|
|
188
|
+
log_sync_cv_(&log_write_mutex_),
|
|
189
189
|
total_log_size_(0),
|
|
190
190
|
is_snapshot_supported_(true),
|
|
191
191
|
write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
|
|
@@ -273,6 +273,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
|
|
|
273
273
|
mutable_db_options_.Dump(immutable_db_options_.info_log.get());
|
|
274
274
|
DumpSupportInfo(immutable_db_options_.info_log.get());
|
|
275
275
|
|
|
276
|
+
max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
|
|
277
|
+
std::memory_order_relaxed);
|
|
276
278
|
if (write_buffer_manager_) {
|
|
277
279
|
wbm_stall_.reset(new WBMStallInterface());
|
|
278
280
|
}
|
|
@@ -625,26 +627,28 @@ Status DBImpl::CloseHelper() {
|
|
|
625
627
|
job_context.Clean();
|
|
626
628
|
mutex_.Lock();
|
|
627
629
|
}
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
ret
|
|
630
|
+
{
|
|
631
|
+
InstrumentedMutexLock lock(&log_write_mutex_);
|
|
632
|
+
for (auto l : logs_to_free_) {
|
|
633
|
+
delete l;
|
|
634
|
+
}
|
|
635
|
+
for (auto& log : logs_) {
|
|
636
|
+
uint64_t log_number = log.writer->get_log_number();
|
|
637
|
+
Status s = log.ClearWriter();
|
|
638
|
+
if (!s.ok()) {
|
|
639
|
+
ROCKS_LOG_WARN(
|
|
640
|
+
immutable_db_options_.info_log,
|
|
641
|
+
"Unable to Sync WAL file %s with error -- %s",
|
|
642
|
+
LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
|
|
643
|
+
s.ToString().c_str());
|
|
644
|
+
// Retain the first error
|
|
645
|
+
if (ret.ok()) {
|
|
646
|
+
ret = s;
|
|
647
|
+
}
|
|
644
648
|
}
|
|
645
649
|
}
|
|
650
|
+
logs_.clear();
|
|
646
651
|
}
|
|
647
|
-
logs_.clear();
|
|
648
652
|
|
|
649
653
|
// Table cache may have table handles holding blocks from the block cache.
|
|
650
654
|
// We need to release them before the block cache is destroyed. The block
|
|
@@ -1108,6 +1112,7 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
|
|
|
1108
1112
|
}
|
|
1109
1113
|
|
|
1110
1114
|
void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
|
|
1115
|
+
mutex_.AssertHeld();
|
|
1111
1116
|
if (!job_context->logs_to_free.empty()) {
|
|
1112
1117
|
for (auto l : job_context->logs_to_free) {
|
|
1113
1118
|
AddToLogsToFreeQueue(l);
|
|
@@ -1285,6 +1290,11 @@ Status DBImpl::SetDBOptions(
|
|
|
1285
1290
|
new_options.stats_persist_period_sec);
|
|
1286
1291
|
mutex_.Lock();
|
|
1287
1292
|
}
|
|
1293
|
+
if (new_options.max_total_wal_size !=
|
|
1294
|
+
mutable_db_options_.max_total_wal_size) {
|
|
1295
|
+
max_total_wal_size_.store(new_options.max_total_wal_size,
|
|
1296
|
+
std::memory_order_release);
|
|
1297
|
+
}
|
|
1288
1298
|
write_controller_.set_max_delayed_write_rate(
|
|
1289
1299
|
new_options.delayed_write_rate);
|
|
1290
1300
|
table_cache_.get()->SetCapacity(new_options.max_open_files == -1
|
|
@@ -1405,7 +1415,7 @@ Status DBImpl::SyncWAL() {
|
|
|
1405
1415
|
uint64_t current_log_number;
|
|
1406
1416
|
|
|
1407
1417
|
{
|
|
1408
|
-
InstrumentedMutexLock l(&
|
|
1418
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
1409
1419
|
assert(!logs_.empty());
|
|
1410
1420
|
|
|
1411
1421
|
// This SyncWAL() call only cares about logs up to this number.
|
|
@@ -1462,19 +1472,37 @@ Status DBImpl::SyncWAL() {
|
|
|
1462
1472
|
TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
|
|
1463
1473
|
|
|
1464
1474
|
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
|
|
1475
|
+
VersionEdit synced_wals;
|
|
1465
1476
|
{
|
|
1466
|
-
InstrumentedMutexLock l(&
|
|
1477
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
1467
1478
|
if (status.ok()) {
|
|
1468
|
-
|
|
1479
|
+
MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
|
|
1469
1480
|
} else {
|
|
1470
1481
|
MarkLogsNotSynced(current_log_number);
|
|
1471
1482
|
}
|
|
1472
1483
|
}
|
|
1484
|
+
if (status.ok() && synced_wals.IsWalAddition()) {
|
|
1485
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1486
|
+
status = ApplyWALToManifest(&synced_wals);
|
|
1487
|
+
}
|
|
1488
|
+
|
|
1473
1489
|
TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
|
|
1474
1490
|
|
|
1475
1491
|
return status;
|
|
1476
1492
|
}
|
|
1477
1493
|
|
|
1494
|
+
Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
|
|
1495
|
+
// not empty, write to MANIFEST.
|
|
1496
|
+
mutex_.AssertHeld();
|
|
1497
|
+
Status status =
|
|
1498
|
+
versions_->LogAndApplyToDefaultColumnFamily(synced_wals, &mutex_);
|
|
1499
|
+
if (!status.ok() && versions_->io_status().IsIOError()) {
|
|
1500
|
+
status = error_handler_.SetBGError(versions_->io_status(),
|
|
1501
|
+
BackgroundErrorReason::kManifestWrite);
|
|
1502
|
+
}
|
|
1503
|
+
return status;
|
|
1504
|
+
}
|
|
1505
|
+
|
|
1478
1506
|
Status DBImpl::LockWAL() {
|
|
1479
1507
|
log_write_mutex_.Lock();
|
|
1480
1508
|
auto cur_log_writer = logs_.back().writer;
|
|
@@ -1494,12 +1522,12 @@ Status DBImpl::UnlockWAL() {
|
|
|
1494
1522
|
return Status::OK();
|
|
1495
1523
|
}
|
|
1496
1524
|
|
|
1497
|
-
|
|
1498
|
-
|
|
1525
|
+
void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
|
|
1526
|
+
VersionEdit* synced_wals) {
|
|
1527
|
+
log_write_mutex_.AssertHeld();
|
|
1499
1528
|
if (synced_dir && logfile_number_ == up_to) {
|
|
1500
1529
|
log_dir_synced_ = true;
|
|
1501
1530
|
}
|
|
1502
|
-
VersionEdit synced_wals;
|
|
1503
1531
|
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
|
|
1504
1532
|
auto& wal = *it;
|
|
1505
1533
|
assert(wal.IsSyncing());
|
|
@@ -1507,11 +1535,9 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
|
|
|
1507
1535
|
if (logs_.size() > 1) {
|
|
1508
1536
|
if (immutable_db_options_.track_and_verify_wals_in_manifest &&
|
|
1509
1537
|
wal.GetPreSyncSize() > 0) {
|
|
1510
|
-
synced_wals
|
|
1538
|
+
synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
|
|
1511
1539
|
}
|
|
1512
1540
|
logs_to_free_.push_back(wal.ReleaseWriter());
|
|
1513
|
-
// To modify logs_ both mutex_ and log_write_mutex_ must be held
|
|
1514
|
-
InstrumentedMutexLock l(&log_write_mutex_);
|
|
1515
1541
|
it = logs_.erase(it);
|
|
1516
1542
|
} else {
|
|
1517
1543
|
wal.FinishSync();
|
|
@@ -1520,22 +1546,11 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
|
|
|
1520
1546
|
}
|
|
1521
1547
|
assert(logs_.empty() || logs_[0].number > up_to ||
|
|
1522
1548
|
(logs_.size() == 1 && !logs_[0].IsSyncing()));
|
|
1523
|
-
|
|
1524
|
-
Status s;
|
|
1525
|
-
if (synced_wals.IsWalAddition()) {
|
|
1526
|
-
// not empty, write to MANIFEST.
|
|
1527
|
-
s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_);
|
|
1528
|
-
if (!s.ok() && versions_->io_status().IsIOError()) {
|
|
1529
|
-
s = error_handler_.SetBGError(versions_->io_status(),
|
|
1530
|
-
BackgroundErrorReason::kManifestWrite);
|
|
1531
|
-
}
|
|
1532
|
-
}
|
|
1533
1549
|
log_sync_cv_.SignalAll();
|
|
1534
|
-
return s;
|
|
1535
1550
|
}
|
|
1536
1551
|
|
|
1537
1552
|
void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
|
|
1538
|
-
|
|
1553
|
+
log_write_mutex_.AssertHeld();
|
|
1539
1554
|
for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
|
|
1540
1555
|
++it) {
|
|
1541
1556
|
auto& wal = *it;
|
|
@@ -998,6 +998,7 @@ class DBImpl : public DB {
|
|
|
998
998
|
}
|
|
999
999
|
|
|
1000
1000
|
void AddToLogsToFreeQueue(log::Writer* log_writer) {
|
|
1001
|
+
mutex_.AssertHeld();
|
|
1001
1002
|
logs_to_free_queue_.push_back(log_writer);
|
|
1002
1003
|
}
|
|
1003
1004
|
|
|
@@ -1298,7 +1299,7 @@ class DBImpl : public DB {
|
|
|
1298
1299
|
|
|
1299
1300
|
// only used for dynamically adjusting max_total_wal_size. it is a sum of
|
|
1300
1301
|
// [write_buffer_size * max_write_buffer_number] over all column families
|
|
1301
|
-
uint64_t max_total_in_memory_state_;
|
|
1302
|
+
std::atomic<uint64_t> max_total_in_memory_state_;
|
|
1302
1303
|
|
|
1303
1304
|
// The options to access storage files
|
|
1304
1305
|
const FileOptions file_options_;
|
|
@@ -1648,6 +1649,15 @@ class DBImpl : public DB {
|
|
|
1648
1649
|
uint64_t pre_sync_size = 0;
|
|
1649
1650
|
};
|
|
1650
1651
|
|
|
1652
|
+
struct LogContext {
|
|
1653
|
+
explicit LogContext(bool need_sync = false)
|
|
1654
|
+
: need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
|
|
1655
|
+
bool need_log_sync = false;
|
|
1656
|
+
bool need_log_dir_sync = false;
|
|
1657
|
+
log::Writer* writer = nullptr;
|
|
1658
|
+
LogFileNumberSize* log_file_number_size = nullptr;
|
|
1659
|
+
};
|
|
1660
|
+
|
|
1651
1661
|
// PurgeFileInfo is a structure to hold information of files to be deleted in
|
|
1652
1662
|
// purge_files_
|
|
1653
1663
|
struct PurgeFileInfo {
|
|
@@ -1801,7 +1811,7 @@ class DBImpl : public DB {
|
|
|
1801
1811
|
void ReleaseFileNumberFromPendingOutputs(
|
|
1802
1812
|
std::unique_ptr<std::list<uint64_t>::iterator>& v);
|
|
1803
1813
|
|
|
1804
|
-
IOStatus SyncClosedLogs(JobContext* job_context);
|
|
1814
|
+
IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
|
|
1805
1815
|
|
|
1806
1816
|
// Flush the in-memory write buffer to storage. Switches to a new
|
|
1807
1817
|
// log-file/memtable and writes a new descriptor iff successful. Then
|
|
@@ -1961,8 +1971,8 @@ class DBImpl : public DB {
|
|
|
1961
1971
|
Status HandleWriteBufferManagerFlush(WriteContext* write_context);
|
|
1962
1972
|
|
|
1963
1973
|
// REQUIRES: mutex locked
|
|
1964
|
-
Status PreprocessWrite(const WriteOptions& write_options,
|
|
1965
|
-
WriteContext* write_context);
|
|
1974
|
+
Status PreprocessWrite(const WriteOptions& write_options,
|
|
1975
|
+
LogContext* log_context, WriteContext* write_context);
|
|
1966
1976
|
|
|
1967
1977
|
// Merge write batches in the write group into merged_batch.
|
|
1968
1978
|
// Returns OK if merge is successful.
|
|
@@ -2101,7 +2111,8 @@ class DBImpl : public DB {
|
|
|
2101
2111
|
std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
|
|
2102
2112
|
|
|
2103
2113
|
// helper function to call after some of the logs_ were synced
|
|
2104
|
-
|
|
2114
|
+
void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
|
|
2115
|
+
Status ApplyWALToManifest(VersionEdit* edit);
|
|
2105
2116
|
// WALs with log number up to up_to are not synced successfully.
|
|
2106
2117
|
void MarkLogsNotSynced(uint64_t up_to);
|
|
2107
2118
|
|
|
@@ -2307,8 +2318,9 @@ class DBImpl : public DB {
|
|
|
2307
2318
|
// logfile_number_ is currently updated only in write_thread_, it can be read
|
|
2308
2319
|
// from the same write_thread_ without any locks.
|
|
2309
2320
|
uint64_t logfile_number_;
|
|
2310
|
-
|
|
2311
|
-
|
|
2321
|
+
// Log files that we can recycle. Must be protected by db mutex_.
|
|
2322
|
+
std::deque<uint64_t> log_recycle_files_;
|
|
2323
|
+
// Protected by log_write_mutex_.
|
|
2312
2324
|
bool log_dir_synced_;
|
|
2313
2325
|
// Without two_write_queues, read and writes to log_empty_ are protected by
|
|
2314
2326
|
// mutex_. Since it is currently updated/read only in write_thread_, it can be
|
|
@@ -2322,26 +2334,93 @@ class DBImpl : public DB {
|
|
|
2322
2334
|
|
|
2323
2335
|
bool persistent_stats_cfd_exists_ = true;
|
|
2324
2336
|
|
|
2325
|
-
//
|
|
2326
|
-
//
|
|
2327
|
-
//
|
|
2328
|
-
//
|
|
2337
|
+
// alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
|
|
2338
|
+
// as follows:
|
|
2339
|
+
// 1. read by FindObsoleteFiles() which can be called in either application
|
|
2340
|
+
// thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
|
|
2341
|
+
// held.
|
|
2342
|
+
// 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
|
|
2343
|
+
// are held.
|
|
2344
|
+
// 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
|
|
2345
|
+
// (actually called by Open()), only mutex_ is held because at this point,
|
|
2346
|
+
// the DB::Open() call has not returned success to application, and the
|
|
2347
|
+
// only other thread(s) that can conflict are bg threads calling
|
|
2348
|
+
// FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
|
|
2349
|
+
// are held when accessing alive_log_files_.
|
|
2350
|
+
// 4. read by DBImpl::Open() is protected by mutex_.
|
|
2351
|
+
// 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
|
|
2352
|
+
// held. This is done by the write group leader. Note that in the case of
|
|
2353
|
+
// two-write-queues, another WAL-only write thread can be writing to the
|
|
2354
|
+
// WAL concurrently. See 9.
|
|
2355
|
+
// 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
|
|
2356
|
+
// done by write group leader.
|
|
2357
|
+
// 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
|
|
2358
|
+
// two-write-queues. Only log_write_mutex_ is held to protect concurrent
|
|
2359
|
+
// pop_front() by FindObsoleteFiles().
|
|
2360
|
+
// 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
|
|
2361
|
+
// is held to protect the data structure from concurrent pop_front() by
|
|
2362
|
+
// FindObsoleteFiles().
|
|
2363
|
+
// 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
|
|
2364
|
+
// of two-write-queues. Only log_write_mutex_ is held. This suffices to
|
|
2365
|
+
// protect the data structure from concurrent push_back() by current
|
|
2366
|
+
// write group leader as well as pop_front() by FindObsoleteFiles().
|
|
2329
2367
|
std::deque<LogFileNumberSize> alive_log_files_;
|
|
2330
2368
|
|
|
2331
2369
|
// Log files that aren't fully synced, and the current log file.
|
|
2332
2370
|
// Synchronization:
|
|
2333
|
-
//
|
|
2334
|
-
//
|
|
2335
|
-
//
|
|
2336
|
-
//
|
|
2337
|
-
//
|
|
2371
|
+
// 1. read by FindObsoleteFiles() which can be called either in application
|
|
2372
|
+
// thread or RocksDB bg threads. log_write_mutex_ is always held, while
|
|
2373
|
+
// some reads are performed without mutex_.
|
|
2374
|
+
// 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
|
|
2375
|
+
// 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
|
|
2376
|
+
// 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex_.
|
|
2377
|
+
// Note that at this point, DB::Open() has not returned success to
|
|
2378
|
+
// application, thus the only other thread(s) that can conflict are bg
|
|
2379
|
+
// threads calling FindObsoleteFiles(). See 1.
|
|
2380
|
+
// 5. iteration and clear() from CloseHelper() always hold log_write_mutex_
|
|
2381
|
+
// and mutex_.
|
|
2382
|
+
// 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
|
|
2383
|
+
// log_write_mutex_. These two can be called by application threads after
|
|
2384
|
+
// DB::Open() returns success to applications.
|
|
2385
|
+
// 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
|
|
2386
|
+
// 8. reads by MarkLogsNotSynced() and MarkLogsSynced() are protected by
|
|
2387
|
+
// log_write_mutex_.
|
|
2388
|
+
// 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
|
|
2389
|
+
// 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
|
|
2390
|
+
// happen in bg flush threads after DB::Open() returns success to
|
|
2391
|
+
// applications.
|
|
2392
|
+
// 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
|
|
2393
|
+
// hold only the log_write_mutex_. This is done by the write group
|
|
2394
|
+
// leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
|
|
2395
|
+
// can happen concurrently. This is fine because log_write_mutex_ is used
|
|
2396
|
+
// by all parties. See 2, 5, 9.
|
|
2397
|
+
// 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
|
|
2398
|
+
// log_write_mutex_. This happens in the write group leader.
|
|
2399
|
+
// 13. emplace_back() by SwitchMemtable() holds both mutex_ and
|
|
2400
|
+
// log_write_mutex_. This happens in the write group leader. Can conflict
|
|
2401
|
+
// with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
|
|
2402
|
+
// SyncClosedLogs(), etc. as well as application threads calling
|
|
2403
|
+
// FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
|
|
2404
|
+
// require at least log_write_mutex_.
|
|
2405
|
+
// 14. iteration called in WriteToWAL(write_group) protected by
|
|
2406
|
+
// log_write_mutex_. This is done by write group leader when
|
|
2407
|
+
// two-write-queues is disabled and write needs to sync logs.
|
|
2408
|
+
// 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
|
|
2409
|
+
// This can be done by the write group leader if two-write-queues is
|
|
2410
|
+
// enabled. It can also be done by another WAL-only write thread.
|
|
2411
|
+
//
|
|
2412
|
+
// Other observations:
|
|
2338
2413
|
// - back() and items with getting_synced=true are not popped,
|
|
2339
2414
|
// - The same thread that sets getting_synced=true will reset it.
|
|
2340
2415
|
// - it follows that the object referred to by back() can be safely read from
|
|
2341
|
-
// the write_thread_ without using mutex
|
|
2416
|
+
// the write_thread_ without using mutex. Note that calling back() without
|
|
2417
|
+
// mutex may be unsafe because different implementations of deque::back() may
|
|
2418
|
+
// access other member variables of deque, causing undefined behavior.
|
|
2419
|
+
// Generally, do not access STL containers without proper synchronization.
|
|
2342
2420
|
// - it follows that the items with getting_synced=true can be safely read
|
|
2343
2421
|
// from the same thread that has set getting_synced=true
|
|
2344
2422
|
std::deque<LogWriterNumber> logs_;
|
|
2423
|
+
|
|
2345
2424
|
// Signaled when getting_synced becomes false for some of the logs_.
|
|
2346
2425
|
InstrumentedCondVar log_sync_cv_;
|
|
2347
2426
|
// This is the app-level state that is written to the WAL but will be used
|
|
@@ -2356,7 +2435,7 @@ class DBImpl : public DB {
|
|
|
2356
2435
|
std::atomic<uint64_t> total_log_size_;
|
|
2357
2436
|
|
|
2358
2437
|
// If this is non-empty, we need to delete these log files in background
|
|
2359
|
-
// threads. Protected by
|
|
2438
|
+
// threads. Protected by log_write_mutex_.
|
|
2360
2439
|
autovector<log::Writer*> logs_to_free_;
|
|
2361
2440
|
|
|
2362
2441
|
bool is_snapshot_supported_;
|
|
@@ -2436,10 +2515,13 @@ class DBImpl : public DB {
|
|
|
2436
2515
|
// JobContext. Current implementation tracks table and blob files only.
|
|
2437
2516
|
std::unordered_set<uint64_t> files_grabbed_for_purge_;
|
|
2438
2517
|
|
|
2439
|
-
// A queue to store log writers to close
|
|
2518
|
+
// A queue to store log writers to close. Protected by db mutex_.
|
|
2440
2519
|
std::deque<log::Writer*> logs_to_free_queue_;
|
|
2520
|
+
|
|
2441
2521
|
std::deque<SuperVersion*> superversions_to_free_queue_;
|
|
2522
|
+
|
|
2442
2523
|
int unscheduled_flushes_;
|
|
2524
|
+
|
|
2443
2525
|
int unscheduled_compactions_;
|
|
2444
2526
|
|
|
2445
2527
|
// count how many background compactions are running or have been scheduled in
|
|
@@ -2592,6 +2674,7 @@ class DBImpl : public DB {
|
|
|
2592
2674
|
InstrumentedCondVar atomic_flush_install_cv_;
|
|
2593
2675
|
|
|
2594
2676
|
bool wal_in_db_path_;
|
|
2677
|
+
std::atomic<uint64_t> max_total_wal_size_;
|
|
2595
2678
|
|
|
2596
2679
|
BlobFileCompletionCallback blob_callback_;
|
|
2597
2680
|
|