@nxtedition/rocksdb 7.0.37 → 7.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +17 -26
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
- package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
- package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
- package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
- package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
- package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
- package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
- package/index.js +2 -2
- package/iterator.js +1 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -82,9 +82,10 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
|
|
|
82
82
|
return false;
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
-
IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
|
|
85
|
+
IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
|
|
86
|
+
VersionEdit* synced_wals) {
|
|
86
87
|
TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
|
|
87
|
-
|
|
88
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
88
89
|
autovector<log::Writer*, 1> logs_to_sync;
|
|
89
90
|
uint64_t current_log_number = logfile_number_;
|
|
90
91
|
while (logs_.front().number < current_log_number &&
|
|
@@ -100,7 +101,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
|
|
|
100
101
|
|
|
101
102
|
IOStatus io_s;
|
|
102
103
|
if (!logs_to_sync.empty()) {
|
|
103
|
-
|
|
104
|
+
log_write_mutex_.Unlock();
|
|
104
105
|
|
|
105
106
|
assert(job_context);
|
|
106
107
|
|
|
@@ -128,12 +129,12 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) {
|
|
|
128
129
|
|
|
129
130
|
TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
|
|
130
131
|
/*arg=*/nullptr);
|
|
131
|
-
|
|
132
|
+
log_write_mutex_.Lock();
|
|
132
133
|
|
|
133
134
|
// "number <= current_log_number - 1" is equivalent to
|
|
134
135
|
// "number < current_log_number".
|
|
135
136
|
if (io_s.ok()) {
|
|
136
|
-
|
|
137
|
+
MarkLogsSynced(current_log_number - 1, true, synced_wals);
|
|
137
138
|
} else {
|
|
138
139
|
MarkLogsNotSynced(current_log_number - 1);
|
|
139
140
|
}
|
|
@@ -220,8 +221,16 @@ Status DBImpl::FlushMemTableToOutputFile(
|
|
|
220
221
|
bool need_cancel = false;
|
|
221
222
|
IOStatus log_io_s = IOStatus::OK();
|
|
222
223
|
if (needs_to_sync_closed_wals) {
|
|
223
|
-
// SyncClosedLogs() may unlock and re-lock the
|
|
224
|
-
|
|
224
|
+
// SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
|
|
225
|
+
// times.
|
|
226
|
+
VersionEdit synced_wals;
|
|
227
|
+
mutex_.Unlock();
|
|
228
|
+
log_io_s = SyncClosedLogs(job_context, &synced_wals);
|
|
229
|
+
mutex_.Lock();
|
|
230
|
+
if (log_io_s.ok() && synced_wals.IsWalAddition()) {
|
|
231
|
+
log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
|
|
232
|
+
}
|
|
233
|
+
|
|
225
234
|
if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
|
|
226
235
|
!log_io_s.IsColumnFamilyDropped()) {
|
|
227
236
|
error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
|
|
@@ -474,7 +483,14 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
|
|
|
474
483
|
if (logfile_number_ > 0) {
|
|
475
484
|
// TODO (yanqin) investigate whether we should sync the closed logs for
|
|
476
485
|
// single column family case.
|
|
477
|
-
|
|
486
|
+
VersionEdit synced_wals;
|
|
487
|
+
mutex_.Unlock();
|
|
488
|
+
log_io_s = SyncClosedLogs(job_context, &synced_wals);
|
|
489
|
+
mutex_.Lock();
|
|
490
|
+
if (log_io_s.ok() && synced_wals.IsWalAddition()) {
|
|
491
|
+
log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
|
|
492
|
+
}
|
|
493
|
+
|
|
478
494
|
if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
|
|
479
495
|
!log_io_s.IsColumnFamilyDropped()) {
|
|
480
496
|
if (total_log_size_ > 0) {
|
|
@@ -1392,7 +1408,8 @@ Status DBImpl::CompactFilesImpl(
|
|
|
1392
1408
|
&compaction_job_stats, Env::Priority::USER, io_tracer_,
|
|
1393
1409
|
kManualCompactionCanceledFalse_, db_id_, db_session_id_,
|
|
1394
1410
|
c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
|
|
1395
|
-
&blob_callback_);
|
|
1411
|
+
&blob_callback_, &bg_compaction_scheduled_,
|
|
1412
|
+
&bg_bottom_compaction_scheduled_);
|
|
1396
1413
|
|
|
1397
1414
|
// Creating a compaction influences the compaction score because the score
|
|
1398
1415
|
// takes running compactions into account (by skipping files that are already
|
|
@@ -3314,7 +3331,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
|
|
|
3314
3331
|
if (start_level > 0) {
|
|
3315
3332
|
auto vstorage = c->input_version()->storage_info();
|
|
3316
3333
|
c->edit()->AddCompactCursor(
|
|
3317
|
-
start_level,
|
|
3334
|
+
start_level,
|
|
3335
|
+
vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
|
|
3318
3336
|
}
|
|
3319
3337
|
}
|
|
3320
3338
|
status = versions_->LogAndApply(c->column_family_data(),
|
|
@@ -3399,7 +3417,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
|
|
|
3399
3417
|
is_manual ? manual_compaction->canceled
|
|
3400
3418
|
: kManualCompactionCanceledFalse_,
|
|
3401
3419
|
db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
|
|
3402
|
-
c->trim_ts(), &blob_callback_);
|
|
3420
|
+
c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
|
|
3421
|
+
&bg_bottom_compaction_scheduled_);
|
|
3403
3422
|
compaction_job.Prepare();
|
|
3404
3423
|
|
|
3405
3424
|
NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
|
|
@@ -271,6 +271,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|
|
271
271
|
|
|
272
272
|
// logs_ is empty when called during recovery, in which case there can't yet
|
|
273
273
|
// be any tracked obsolete logs
|
|
274
|
+
log_write_mutex_.Lock();
|
|
275
|
+
|
|
276
|
+
if (alive_log_files_.empty() || logs_.empty()) {
|
|
277
|
+
mutex_.AssertHeld();
|
|
278
|
+
// We may reach here if the db is DBImplSecondary
|
|
279
|
+
log_write_mutex_.Unlock();
|
|
280
|
+
return;
|
|
281
|
+
}
|
|
282
|
+
|
|
274
283
|
if (!alive_log_files_.empty() && !logs_.empty()) {
|
|
275
284
|
uint64_t min_log_number = job_context->log_number;
|
|
276
285
|
size_t num_alive_log_files = alive_log_files_.size();
|
|
@@ -292,17 +301,15 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|
|
292
301
|
}
|
|
293
302
|
job_context->size_log_to_delete += earliest.size;
|
|
294
303
|
total_log_size_ -= earliest.size;
|
|
295
|
-
if (two_write_queues_) {
|
|
296
|
-
log_write_mutex_.Lock();
|
|
297
|
-
}
|
|
298
304
|
alive_log_files_.pop_front();
|
|
299
|
-
|
|
300
|
-
log_write_mutex_.Unlock();
|
|
301
|
-
}
|
|
305
|
+
|
|
302
306
|
// Current log should always stay alive since it can't have
|
|
303
307
|
// number < MinLogNumber().
|
|
304
308
|
assert(alive_log_files_.size());
|
|
305
309
|
}
|
|
310
|
+
log_write_mutex_.Unlock();
|
|
311
|
+
mutex_.Unlock();
|
|
312
|
+
log_write_mutex_.Lock();
|
|
306
313
|
while (!logs_.empty() && logs_.front().number < min_log_number) {
|
|
307
314
|
auto& log = logs_.front();
|
|
308
315
|
if (log.IsSyncing()) {
|
|
@@ -311,10 +318,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|
|
311
318
|
continue;
|
|
312
319
|
}
|
|
313
320
|
logs_to_free_.push_back(log.ReleaseWriter());
|
|
314
|
-
|
|
315
|
-
InstrumentedMutexLock wl(&log_write_mutex_);
|
|
316
|
-
logs_.pop_front();
|
|
317
|
-
}
|
|
321
|
+
logs_.pop_front();
|
|
318
322
|
}
|
|
319
323
|
// Current log cannot be obsolete.
|
|
320
324
|
assert(!logs_.empty());
|
|
@@ -323,23 +327,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|
|
323
327
|
// We're just cleaning up for DB::Write().
|
|
324
328
|
assert(job_context->logs_to_free.empty());
|
|
325
329
|
job_context->logs_to_free = logs_to_free_;
|
|
330
|
+
|
|
331
|
+
logs_to_free_.clear();
|
|
332
|
+
log_write_mutex_.Unlock();
|
|
333
|
+
mutex_.Lock();
|
|
326
334
|
job_context->log_recycle_files.assign(log_recycle_files_.begin(),
|
|
327
335
|
log_recycle_files_.end());
|
|
328
|
-
logs_to_free_.clear();
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
namespace {
|
|
332
|
-
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
|
|
333
|
-
const JobContext::CandidateFileInfo& second) {
|
|
334
|
-
if (first.file_name > second.file_name) {
|
|
335
|
-
return true;
|
|
336
|
-
} else if (first.file_name < second.file_name) {
|
|
337
|
-
return false;
|
|
338
|
-
} else {
|
|
339
|
-
return (first.file_path > second.file_path);
|
|
340
|
-
}
|
|
341
336
|
}
|
|
342
|
-
} // namespace
|
|
343
337
|
|
|
344
338
|
// Delete obsolete files and log status and information of file deletion
|
|
345
339
|
void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
|
|
@@ -445,7 +439,16 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
|
|
|
445
439
|
// dedup state.candidate_files so we don't try to delete the same
|
|
446
440
|
// file twice
|
|
447
441
|
std::sort(candidate_files.begin(), candidate_files.end(),
|
|
448
|
-
|
|
442
|
+
[](const JobContext::CandidateFileInfo& lhs,
|
|
443
|
+
const JobContext::CandidateFileInfo& rhs) {
|
|
444
|
+
if (lhs.file_name > rhs.file_name) {
|
|
445
|
+
return true;
|
|
446
|
+
} else if (lhs.file_name < rhs.file_name) {
|
|
447
|
+
return false;
|
|
448
|
+
} else {
|
|
449
|
+
return (lhs.file_path > rhs.file_path);
|
|
450
|
+
}
|
|
451
|
+
});
|
|
449
452
|
candidate_files.erase(
|
|
450
453
|
std::unique(candidate_files.begin(), candidate_files.end()),
|
|
451
454
|
candidate_files.end());
|
|
@@ -1459,9 +1459,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
|
|
|
1459
1459
|
Status s;
|
|
1460
1460
|
mutex_.AssertHeld();
|
|
1461
1461
|
assert(immutable_db_options_.avoid_flush_during_recovery);
|
|
1462
|
-
if (two_write_queues_) {
|
|
1463
|
-
log_write_mutex_.Lock();
|
|
1464
|
-
}
|
|
1465
1462
|
// Mark these as alive so they'll be considered for deletion later by
|
|
1466
1463
|
// FindObsoleteFiles()
|
|
1467
1464
|
total_log_size_ = 0;
|
|
@@ -1486,9 +1483,6 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
|
|
|
1486
1483
|
total_log_size_ += log.size;
|
|
1487
1484
|
alive_log_files_.push_back(log);
|
|
1488
1485
|
}
|
|
1489
|
-
if (two_write_queues_) {
|
|
1490
|
-
log_write_mutex_.Unlock();
|
|
1491
|
-
}
|
|
1492
1486
|
return s;
|
|
1493
1487
|
}
|
|
1494
1488
|
|
|
@@ -1871,16 +1865,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
1871
1865
|
}
|
|
1872
1866
|
|
|
1873
1867
|
if (s.ok()) {
|
|
1874
|
-
if (impl->two_write_queues_) {
|
|
1875
|
-
impl->log_write_mutex_.Lock();
|
|
1876
|
-
}
|
|
1877
1868
|
impl->alive_log_files_.push_back(
|
|
1878
1869
|
DBImpl::LogFileNumberSize(impl->logfile_number_));
|
|
1879
|
-
if (impl->two_write_queues_) {
|
|
1880
|
-
impl->log_write_mutex_.Unlock();
|
|
1881
|
-
}
|
|
1882
|
-
}
|
|
1883
|
-
if (s.ok()) {
|
|
1884
1870
|
// In WritePrepared there could be gap in sequence numbers. This breaks
|
|
1885
1871
|
// the trick we use in kPointInTimeRecovery which assumes the first seq in
|
|
1886
1872
|
// the log right after the corrupted log is one larger than the last seq
|
|
@@ -349,14 +349,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
349
349
|
// when it finds suitable, and finish them in the same write batch.
|
|
350
350
|
// This is how a write job could be done by the other writer.
|
|
351
351
|
WriteContext write_context;
|
|
352
|
+
LogContext log_context(write_options.sync);
|
|
352
353
|
WriteThread::WriteGroup write_group;
|
|
353
354
|
bool in_parallel_group = false;
|
|
354
355
|
uint64_t last_sequence = kMaxSequenceNumber;
|
|
355
356
|
|
|
356
|
-
mutex_.Lock();
|
|
357
|
-
|
|
358
|
-
bool need_log_sync = write_options.sync;
|
|
359
|
-
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
|
|
360
357
|
assert(!two_write_queues_ || !disable_memtable);
|
|
361
358
|
{
|
|
362
359
|
// With concurrent writes we do preprocess only in the write thread that
|
|
@@ -366,7 +363,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
366
363
|
// PreprocessWrite does its own perf timing.
|
|
367
364
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
368
365
|
|
|
369
|
-
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
|
|
366
|
+
status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
370
367
|
if (!two_write_queues_) {
|
|
371
368
|
// Assign it after ::PreprocessWrite since the sequence might advance
|
|
372
369
|
// inside it by WriteRecoverableState
|
|
@@ -376,13 +373,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
376
373
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
377
374
|
}
|
|
378
375
|
|
|
379
|
-
log::Writer* log_writer = logs_.back().writer;
|
|
380
|
-
LogFileNumberSize& log_file_number_size = alive_log_files_.back();
|
|
381
|
-
|
|
382
|
-
assert(log_writer->get_log_number() == log_file_number_size.number);
|
|
383
|
-
|
|
384
|
-
mutex_.Unlock();
|
|
385
|
-
|
|
386
376
|
// Add to log and apply to memtable. We can release the lock
|
|
387
377
|
// during this phase since &w is currently responsible for logging
|
|
388
378
|
// and protects against concurrent loggers and concurrent writes
|
|
@@ -477,10 +467,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
477
467
|
|
|
478
468
|
if (!two_write_queues_) {
|
|
479
469
|
if (status.ok() && !write_options.disableWAL) {
|
|
470
|
+
assert(log_context.log_file_number_size);
|
|
471
|
+
LogFileNumberSize& log_file_number_size =
|
|
472
|
+
*(log_context.log_file_number_size);
|
|
480
473
|
PERF_TIMER_GUARD(write_wal_time);
|
|
481
|
-
io_s =
|
|
482
|
-
|
|
483
|
-
|
|
474
|
+
io_s =
|
|
475
|
+
WriteToWAL(write_group, log_context.writer, log_used,
|
|
476
|
+
log_context.need_log_sync, log_context.need_log_dir_sync,
|
|
477
|
+
last_sequence + 1, log_file_number_size);
|
|
484
478
|
}
|
|
485
479
|
} else {
|
|
486
480
|
if (status.ok() && !write_options.disableWAL) {
|
|
@@ -582,14 +576,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|
|
582
576
|
assert(pre_release_cb_status.ok());
|
|
583
577
|
}
|
|
584
578
|
|
|
585
|
-
if (need_log_sync) {
|
|
586
|
-
|
|
579
|
+
if (log_context.need_log_sync) {
|
|
580
|
+
VersionEdit synced_wals;
|
|
581
|
+
log_write_mutex_.Lock();
|
|
587
582
|
if (status.ok()) {
|
|
588
|
-
|
|
583
|
+
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
|
|
584
|
+
&synced_wals);
|
|
589
585
|
} else {
|
|
590
586
|
MarkLogsNotSynced(logfile_number_);
|
|
591
587
|
}
|
|
592
|
-
|
|
588
|
+
log_write_mutex_.Unlock();
|
|
589
|
+
if (status.ok() && synced_wals.IsWalAddition()) {
|
|
590
|
+
InstrumentedMutexLock l(&mutex_);
|
|
591
|
+
status = ApplyWALToManifest(&synced_wals);
|
|
592
|
+
}
|
|
593
|
+
|
|
593
594
|
// Requesting sync with two_write_queues_ is expected to be very rare. We
|
|
594
595
|
// hence provide a simple implementation that is not necessarily efficient.
|
|
595
596
|
if (two_write_queues_) {
|
|
@@ -652,19 +653,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
652
653
|
if (w.callback && !w.callback->AllowWriteBatching()) {
|
|
653
654
|
write_thread_.WaitForMemTableWriters();
|
|
654
655
|
}
|
|
655
|
-
|
|
656
|
-
bool need_log_sync = !write_options.disableWAL && write_options.sync;
|
|
657
|
-
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
|
|
656
|
+
LogContext log_context(!write_options.disableWAL && write_options.sync);
|
|
658
657
|
// PreprocessWrite does its own perf timing.
|
|
659
658
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
660
|
-
w.status = PreprocessWrite(write_options, &need_log_sync, &write_context);
|
|
659
|
+
w.status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
661
660
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
662
|
-
log::Writer* log_writer = logs_.back().writer;
|
|
663
|
-
LogFileNumberSize& log_file_number_size = alive_log_files_.back();
|
|
664
|
-
|
|
665
|
-
assert(log_writer->get_log_number() == log_file_number_size.number);
|
|
666
|
-
|
|
667
|
-
mutex_.Unlock();
|
|
668
661
|
|
|
669
662
|
// This can set non-OK status if callback fail.
|
|
670
663
|
last_batch_group_size_ =
|
|
@@ -727,9 +720,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
727
720
|
wal_write_group.size - 1);
|
|
728
721
|
RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
|
|
729
722
|
}
|
|
723
|
+
assert(log_context.log_file_number_size);
|
|
724
|
+
LogFileNumberSize& log_file_number_size =
|
|
725
|
+
*(log_context.log_file_number_size);
|
|
730
726
|
io_s =
|
|
731
|
-
WriteToWAL(wal_write_group,
|
|
732
|
-
|
|
727
|
+
WriteToWAL(wal_write_group, log_context.writer, log_used,
|
|
728
|
+
log_context.need_log_sync, log_context.need_log_dir_sync,
|
|
729
|
+
current_sequence, log_file_number_size);
|
|
733
730
|
w.status = io_s;
|
|
734
731
|
}
|
|
735
732
|
|
|
@@ -740,16 +737,20 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
|
|
|
740
737
|
WriteStatusCheck(w.status);
|
|
741
738
|
}
|
|
742
739
|
|
|
743
|
-
|
|
744
|
-
|
|
740
|
+
VersionEdit synced_wals;
|
|
741
|
+
if (log_context.need_log_sync) {
|
|
742
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
745
743
|
if (w.status.ok()) {
|
|
746
|
-
|
|
744
|
+
MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
|
|
745
|
+
&synced_wals);
|
|
747
746
|
} else {
|
|
748
747
|
MarkLogsNotSynced(logfile_number_);
|
|
749
748
|
}
|
|
750
|
-
mutex_.Unlock();
|
|
751
749
|
}
|
|
752
|
-
|
|
750
|
+
if (w.status.ok() && synced_wals.IsWalAddition()) {
|
|
751
|
+
InstrumentedMutexLock l(&mutex_);
|
|
752
|
+
w.status = ApplyWALToManifest(&synced_wals);
|
|
753
|
+
}
|
|
753
754
|
write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
|
|
754
755
|
}
|
|
755
756
|
|
|
@@ -893,9 +894,8 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
893
894
|
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
|
|
894
895
|
// without paying the cost of obtaining the mutex.
|
|
895
896
|
if (status.ok()) {
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
status = PreprocessWrite(write_options, &need_log_sync, &write_context);
|
|
897
|
+
LogContext log_context;
|
|
898
|
+
status = PreprocessWrite(write_options, &log_context, &write_context);
|
|
899
899
|
WriteStatusCheckOnLocked(status);
|
|
900
900
|
}
|
|
901
901
|
if (!status.ok()) {
|
|
@@ -1057,9 +1057,8 @@ Status DBImpl::WriteImplWALOnly(
|
|
|
1057
1057
|
void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
|
|
1058
1058
|
// Is setting bg_error_ enough here? This will at least stop
|
|
1059
1059
|
// compaction and fail any further writes.
|
|
1060
|
-
|
|
1060
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1061
1061
|
assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
|
|
1062
|
-
mutex_.AssertHeld();
|
|
1063
1062
|
if (immutable_db_options_.paranoid_checks && !status.ok() &&
|
|
1064
1063
|
!status.IsBusy() && !status.IsIncomplete()) {
|
|
1065
1064
|
// Maybe change the return status to void?
|
|
@@ -1110,13 +1109,13 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) {
|
|
|
1110
1109
|
}
|
|
1111
1110
|
|
|
1112
1111
|
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
1113
|
-
|
|
1112
|
+
LogContext* log_context,
|
|
1114
1113
|
WriteContext* write_context) {
|
|
1115
|
-
|
|
1116
|
-
assert(write_context != nullptr && need_log_sync != nullptr);
|
|
1114
|
+
assert(write_context != nullptr && log_context != nullptr);
|
|
1117
1115
|
Status status;
|
|
1118
1116
|
|
|
1119
1117
|
if (error_handler_.IsDBStopped()) {
|
|
1118
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1120
1119
|
status = error_handler_.GetBGError();
|
|
1121
1120
|
}
|
|
1122
1121
|
|
|
@@ -1124,11 +1123,11 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1124
1123
|
|
|
1125
1124
|
if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
|
|
1126
1125
|
assert(versions_);
|
|
1126
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1127
1127
|
const ColumnFamilySet* const column_families =
|
|
1128
1128
|
versions_->GetColumnFamilySet();
|
|
1129
1129
|
assert(column_families);
|
|
1130
1130
|
size_t num_cfs = column_families->NumberOfColumnFamilies();
|
|
1131
|
-
|
|
1132
1131
|
assert(num_cfs >= 1);
|
|
1133
1132
|
if (num_cfs > 1) {
|
|
1134
1133
|
WaitForPendingWrites();
|
|
@@ -1142,15 +1141,18 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1142
1141
|
// thread is writing to another DB with the same write buffer, they may also
|
|
1143
1142
|
// be flushed. We may end up with flushing much more DBs than needed. It's
|
|
1144
1143
|
// suboptimal but still correct.
|
|
1144
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1145
1145
|
WaitForPendingWrites();
|
|
1146
1146
|
status = HandleWriteBufferManagerFlush(write_context);
|
|
1147
1147
|
}
|
|
1148
1148
|
|
|
1149
1149
|
if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
|
|
1150
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1150
1151
|
status = TrimMemtableHistory(write_context);
|
|
1151
1152
|
}
|
|
1152
1153
|
|
|
1153
1154
|
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
|
|
1155
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1154
1156
|
WaitForPendingWrites();
|
|
1155
1157
|
status = ScheduleFlushes(write_context);
|
|
1156
1158
|
}
|
|
@@ -1166,6 +1168,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1166
1168
|
// for previous one. It might create a fairness issue that expiration
|
|
1167
1169
|
// might happen for smaller writes but larger writes can go through.
|
|
1168
1170
|
// Can optimize it if it is an issue.
|
|
1171
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1169
1172
|
status = DelayWrite(last_batch_group_size_, write_options);
|
|
1170
1173
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
|
1171
1174
|
}
|
|
@@ -1180,11 +1183,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1180
1183
|
if (write_options.no_slowdown) {
|
|
1181
1184
|
status = Status::Incomplete("Write stall");
|
|
1182
1185
|
} else {
|
|
1186
|
+
InstrumentedMutexLock l(&mutex_);
|
|
1183
1187
|
WriteBufferManagerStallWrites();
|
|
1184
1188
|
}
|
|
1185
1189
|
}
|
|
1186
|
-
|
|
1187
|
-
if (status.ok() &&
|
|
1190
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
1191
|
+
if (status.ok() && log_context->need_log_sync) {
|
|
1188
1192
|
// Wait until the parallel syncs are finished. Any sync process has to sync
|
|
1189
1193
|
// the front log too so it is enough to check the status of front()
|
|
1190
1194
|
// We do a while loop since log_sync_cv_ is signalled when any sync is
|
|
@@ -1204,8 +1208,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
|
|
|
1204
1208
|
log.PrepareForSync();
|
|
1205
1209
|
}
|
|
1206
1210
|
} else {
|
|
1207
|
-
|
|
1211
|
+
log_context->need_log_sync = false;
|
|
1208
1212
|
}
|
|
1213
|
+
log_context->writer = logs_.back().writer;
|
|
1214
|
+
log_context->need_log_dir_sync =
|
|
1215
|
+
log_context->need_log_dir_sync && !log_dir_synced_;
|
|
1216
|
+
log_context->log_file_number_size = std::addressof(alive_log_files_.back());
|
|
1209
1217
|
|
|
1210
1218
|
return status;
|
|
1211
1219
|
}
|
|
@@ -1714,10 +1722,12 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
|
|
|
1714
1722
|
}
|
|
1715
1723
|
|
|
1716
1724
|
uint64_t DBImpl::GetMaxTotalWalSize() const {
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1725
|
+
uint64_t max_total_wal_size =
|
|
1726
|
+
max_total_wal_size_.load(std::memory_order_acquire);
|
|
1727
|
+
if (max_total_wal_size > 0) {
|
|
1728
|
+
return max_total_wal_size;
|
|
1729
|
+
}
|
|
1730
|
+
return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
|
|
1721
1731
|
}
|
|
1722
1732
|
|
|
1723
1733
|
// REQUIRES: mutex_ is held
|
|
@@ -2065,7 +2075,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2065
2075
|
log_recycle_files_.pop_front();
|
|
2066
2076
|
}
|
|
2067
2077
|
if (s.ok() && creating_new_log) {
|
|
2068
|
-
log_write_mutex_.Lock();
|
|
2078
|
+
InstrumentedMutexLock l(&log_write_mutex_);
|
|
2069
2079
|
assert(new_log != nullptr);
|
|
2070
2080
|
if (!logs_.empty()) {
|
|
2071
2081
|
// Always flush the buffer of the last log before switching to a new one
|
|
@@ -2089,7 +2099,6 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
|
|
|
2089
2099
|
logs_.emplace_back(logfile_number_, new_log);
|
|
2090
2100
|
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
|
2091
2101
|
}
|
|
2092
|
-
log_write_mutex_.Unlock();
|
|
2093
2102
|
}
|
|
2094
2103
|
|
|
2095
2104
|
if (!s.ok()) {
|
|
@@ -492,7 +492,7 @@ TEST_F(DBTest, PutSingleDeleteGet) {
|
|
|
492
492
|
ASSERT_EQ("v2", Get(1, "foo2"));
|
|
493
493
|
ASSERT_OK(SingleDelete(1, "foo"));
|
|
494
494
|
ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
|
|
495
|
-
// Skip FIFO and universal compaction
|
|
495
|
+
// Skip FIFO and universal compaction because they do not apply to the test
|
|
496
496
|
// case. Skip MergePut because single delete does not get removed when it
|
|
497
497
|
// encounters a merge.
|
|
498
498
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
@@ -645,7 +645,7 @@ TEST_F(DBTest, SingleDeleteFlush) {
|
|
|
645
645
|
|
|
646
646
|
ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
|
|
647
647
|
ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
|
|
648
|
-
// Skip FIFO and universal compaction
|
|
648
|
+
// Skip FIFO and universal compaction because they do not apply to the test
|
|
649
649
|
// case. Skip MergePut because single delete does not get removed when it
|
|
650
650
|
// encounters a merge.
|
|
651
651
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
@@ -668,7 +668,7 @@ TEST_F(DBTest, SingleDeletePutFlush) {
|
|
|
668
668
|
ASSERT_OK(Flush(1));
|
|
669
669
|
|
|
670
670
|
ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
|
|
671
|
-
// Skip FIFO and universal compaction
|
|
671
|
+
// Skip FIFO and universal compaction because they do not apply to the test
|
|
672
672
|
// case. Skip MergePut because single delete does not get removed when it
|
|
673
673
|
// encounters a merge.
|
|
674
674
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
@@ -1993,7 +1993,7 @@ TEST_F(DBTest, UnremovableSingleDelete) {
|
|
|
1993
1993
|
ASSERT_EQ("first", Get(1, "foo", snapshot));
|
|
1994
1994
|
ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
|
|
1995
1995
|
db_->ReleaseSnapshot(snapshot);
|
|
1996
|
-
// Skip FIFO and universal compaction
|
|
1996
|
+
// Skip FIFO and universal compaction because they do not apply to the test
|
|
1997
1997
|
// case. Skip MergePut because single delete does not get removed when it
|
|
1998
1998
|
// encounters a merge.
|
|
1999
1999
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
@@ -2773,7 +2773,7 @@ INSTANTIATE_TEST_CASE_P(
|
|
|
2773
2773
|
#endif // ROCKSDB_LITE
|
|
2774
2774
|
|
|
2775
2775
|
// Group commit test:
|
|
2776
|
-
#if !defined(
|
|
2776
|
+
#if !defined(OS_WIN)
|
|
2777
2777
|
// Disable this test temporarily on Travis and appveyor as it fails
|
|
2778
2778
|
// intermittently. Github issue: #4151
|
|
2779
2779
|
namespace {
|
|
@@ -2850,7 +2850,7 @@ TEST_F(DBTest, GroupCommitTest) {
|
|
|
2850
2850
|
ASSERT_GT(hist_data.average, 0.0);
|
|
2851
2851
|
} while (ChangeOptions(kSkipNoSeekToLast));
|
|
2852
2852
|
}
|
|
2853
|
-
#endif //
|
|
2853
|
+
#endif // OS_WIN
|
|
2854
2854
|
|
|
2855
2855
|
namespace {
|
|
2856
2856
|
using KVMap = std::map<std::string, std::string>;
|
|
@@ -358,6 +358,9 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
|
|
|
358
358
|
RecoverFromNoSpace();
|
|
359
359
|
}
|
|
360
360
|
}
|
|
361
|
+
if (bg_error_.severity() >= Status::Severity::kHardError) {
|
|
362
|
+
is_db_stopped_.store(true, std::memory_order_release);
|
|
363
|
+
}
|
|
361
364
|
return bg_error_;
|
|
362
365
|
}
|
|
363
366
|
|
|
@@ -736,6 +739,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() {
|
|
|
736
739
|
// the bg_error and notify user.
|
|
737
740
|
TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
|
|
738
741
|
Status old_bg_error = bg_error_;
|
|
742
|
+
is_db_stopped_.store(false, std::memory_order_release);
|
|
739
743
|
bg_error_ = Status::OK();
|
|
740
744
|
bg_error_.PermitUncheckedError();
|
|
741
745
|
EventHelpers::NotifyOnErrorRecoveryEnd(
|
|
@@ -792,6 +796,9 @@ void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
|
|
|
792
796
|
if (bg_err.severity() > bg_error_.severity()) {
|
|
793
797
|
bg_error_ = bg_err;
|
|
794
798
|
}
|
|
799
|
+
if (bg_error_.severity() >= Status::Severity::kHardError) {
|
|
800
|
+
is_db_stopped_.store(true, std::memory_order_release);
|
|
801
|
+
}
|
|
795
802
|
return;
|
|
796
803
|
}
|
|
797
804
|
|
|
@@ -38,6 +38,7 @@ class ErrorHandler {
|
|
|
38
38
|
auto_recovery_(false),
|
|
39
39
|
recovery_in_prog_(false),
|
|
40
40
|
soft_error_no_bg_work_(false),
|
|
41
|
+
is_db_stopped_(false),
|
|
41
42
|
bg_error_stats_(db_options.statistics) {
|
|
42
43
|
// Clear the checked flag for uninitialized errors
|
|
43
44
|
bg_error_.PermitUncheckedError();
|
|
@@ -59,16 +60,15 @@ class ErrorHandler {
|
|
|
59
60
|
|
|
60
61
|
Status ClearBGError();
|
|
61
62
|
|
|
62
|
-
bool IsDBStopped() {
|
|
63
|
-
return !bg_error_.ok() &&
|
|
64
|
-
bg_error_.severity() >= Status::Severity::kHardError;
|
|
65
|
-
}
|
|
63
|
+
bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
|
|
66
64
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
65
|
+
bool IsBGWorkStopped() {
|
|
66
|
+
assert(db_mutex_);
|
|
67
|
+
db_mutex_->AssertHeld();
|
|
68
|
+
return !bg_error_.ok() &&
|
|
69
|
+
(bg_error_.severity() >= Status::Severity::kHardError ||
|
|
70
|
+
!auto_recovery_ || soft_error_no_bg_work_);
|
|
71
|
+
}
|
|
72
72
|
|
|
73
73
|
bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
|
|
74
74
|
|
|
@@ -105,6 +105,7 @@ class ErrorHandler {
|
|
|
105
105
|
|
|
106
106
|
// Used to store the context for recover, such as flush reason.
|
|
107
107
|
DBRecoverContext recover_context_;
|
|
108
|
+
std::atomic<bool> is_db_stopped_;
|
|
108
109
|
|
|
109
110
|
// The pointer of DB statistics.
|
|
110
111
|
std::shared_ptr<Statistics> bg_error_stats_;
|
|
@@ -960,12 +960,19 @@ TEST_P(CompressionLogTest, Fragmentation) {
|
|
|
960
960
|
return;
|
|
961
961
|
}
|
|
962
962
|
ASSERT_OK(SetupTestEnv());
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
963
|
+
Random rnd(301);
|
|
964
|
+
const std::vector<std::string> wal_entries = {
|
|
965
|
+
"small",
|
|
966
|
+
rnd.RandomBinaryString(3 * kBlockSize / 2), // Spans into block 2
|
|
967
|
+
rnd.RandomBinaryString(3 * kBlockSize), // Spans into block 5
|
|
968
|
+
};
|
|
969
|
+
for (const std::string& wal_entry : wal_entries) {
|
|
970
|
+
Write(wal_entry);
|
|
971
|
+
}
|
|
972
|
+
|
|
973
|
+
for (const std::string& wal_entry : wal_entries) {
|
|
974
|
+
ASSERT_EQ(wal_entry, Read());
|
|
975
|
+
}
|
|
969
976
|
ASSERT_EQ("EOF", Read());
|
|
970
977
|
}
|
|
971
978
|
|