@nxtedition/rocksdb 7.0.39 → 7.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +2 -12
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
- package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
- package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
- package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
- package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
- package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
- package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
#include <algorithm>
|
|
13
13
|
#include <cinttypes>
|
|
14
14
|
#include <memory>
|
|
15
|
+
#include <optional>
|
|
15
16
|
#include <set>
|
|
16
17
|
#include <utility>
|
|
17
18
|
#include <vector>
|
|
@@ -30,6 +31,7 @@
|
|
|
30
31
|
#include "db/log_writer.h"
|
|
31
32
|
#include "db/merge_helper.h"
|
|
32
33
|
#include "db/range_del_aggregator.h"
|
|
34
|
+
#include "db/version_edit.h"
|
|
33
35
|
#include "db/version_set.h"
|
|
34
36
|
#include "file/filename.h"
|
|
35
37
|
#include "file/read_write_util.h"
|
|
@@ -44,6 +46,7 @@
|
|
|
44
46
|
#include "port/port.h"
|
|
45
47
|
#include "rocksdb/db.h"
|
|
46
48
|
#include "rocksdb/env.h"
|
|
49
|
+
#include "rocksdb/options.h"
|
|
47
50
|
#include "rocksdb/statistics.h"
|
|
48
51
|
#include "rocksdb/status.h"
|
|
49
52
|
#include "rocksdb/table.h"
|
|
@@ -120,7 +123,8 @@ CompactionJob::CompactionJob(
|
|
|
120
123
|
const std::atomic<bool>& manual_compaction_canceled,
|
|
121
124
|
const std::string& db_id, const std::string& db_session_id,
|
|
122
125
|
std::string full_history_ts_low, std::string trim_ts,
|
|
123
|
-
BlobFileCompletionCallback* blob_callback
|
|
126
|
+
BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
|
|
127
|
+
int* bg_bottom_compaction_scheduled)
|
|
124
128
|
: compact_(new CompactionState(compaction)),
|
|
125
129
|
compaction_stats_(compaction->compaction_reason(), 1),
|
|
126
130
|
db_options_(db_options),
|
|
@@ -159,9 +163,13 @@ CompactionJob::CompactionJob(
|
|
|
159
163
|
thread_pri_(thread_pri),
|
|
160
164
|
full_history_ts_low_(std::move(full_history_ts_low)),
|
|
161
165
|
trim_ts_(std::move(trim_ts)),
|
|
162
|
-
blob_callback_(blob_callback)
|
|
166
|
+
blob_callback_(blob_callback),
|
|
167
|
+
extra_num_subcompaction_threads_reserved_(0),
|
|
168
|
+
bg_compaction_scheduled_(bg_compaction_scheduled),
|
|
169
|
+
bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
|
|
163
170
|
assert(compaction_job_stats_ != nullptr);
|
|
164
171
|
assert(log_buffer_ != nullptr);
|
|
172
|
+
|
|
165
173
|
const auto* cfd = compact_->compaction->column_family_data();
|
|
166
174
|
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
|
|
167
175
|
db_options_.enable_thread_tracking);
|
|
@@ -232,24 +240,22 @@ void CompactionJob::Prepare() {
|
|
|
232
240
|
bottommost_level_ = c->bottommost_level();
|
|
233
241
|
|
|
234
242
|
if (c->ShouldFormSubcompactions()) {
|
|
235
|
-
{
|
|
236
243
|
StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
|
|
237
244
|
GenSubcompactionBoundaries();
|
|
238
|
-
|
|
239
|
-
|
|
245
|
+
}
|
|
246
|
+
if (boundaries_.size() > 1) {
|
|
240
247
|
for (size_t i = 0; i <= boundaries_.size(); i++) {
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
248
|
+
compact_->sub_compact_states.emplace_back(
|
|
249
|
+
c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
|
|
250
|
+
(i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
|
|
251
|
+
: std::nullopt,
|
|
252
|
+
static_cast<uint32_t>(i));
|
|
245
253
|
}
|
|
246
254
|
RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
|
|
247
255
|
compact_->sub_compact_states.size());
|
|
248
256
|
} else {
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
compact_->sub_compact_states.emplace_back(c, start, end, /*sub_job_id*/ 0);
|
|
257
|
+
compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
|
|
258
|
+
/*sub_job_id*/ 0);
|
|
253
259
|
}
|
|
254
260
|
|
|
255
261
|
if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
|
|
@@ -290,6 +296,99 @@ void CompactionJob::Prepare() {
|
|
|
290
296
|
}
|
|
291
297
|
}
|
|
292
298
|
|
|
299
|
+
uint64_t CompactionJob::GetSubcompactionsLimit() {
|
|
300
|
+
return extra_num_subcompaction_threads_reserved_ +
|
|
301
|
+
std::max(
|
|
302
|
+
std::uint64_t(1),
|
|
303
|
+
static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
void CompactionJob::AcquireSubcompactionResources(
|
|
307
|
+
int num_extra_required_subcompactions) {
|
|
308
|
+
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
|
|
309
|
+
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
|
|
310
|
+
int max_db_compactions =
|
|
311
|
+
DBImpl::GetBGJobLimits(
|
|
312
|
+
mutable_db_options_copy_.max_background_flushes,
|
|
313
|
+
mutable_db_options_copy_.max_background_compactions,
|
|
314
|
+
mutable_db_options_copy_.max_background_jobs,
|
|
315
|
+
versions_->GetColumnFamilySet()
|
|
316
|
+
->write_controller()
|
|
317
|
+
->NeedSpeedupCompaction())
|
|
318
|
+
.max_compactions;
|
|
319
|
+
// Apply min function first since we need to compute the extra subcompaction
|
|
320
|
+
// against compaction limits. And then try to reserve threads for extra
|
|
321
|
+
// subcompactions. The actual number of reserved threads could be less than
|
|
322
|
+
// the desired number.
|
|
323
|
+
int available_bg_compactions_against_db_limit =
|
|
324
|
+
std::max(max_db_compactions - *bg_compaction_scheduled_ -
|
|
325
|
+
*bg_bottom_compaction_scheduled_,
|
|
326
|
+
0);
|
|
327
|
+
db_mutex_->Lock();
|
|
328
|
+
// Reservation only supports background threads of which the priority is
|
|
329
|
+
// between BOTTOM and HIGH. Need to degrade the priority to HIGH if the
|
|
330
|
+
// origin thread_pri_ is higher than that. Similar to ReleaseThreads().
|
|
331
|
+
extra_num_subcompaction_threads_reserved_ =
|
|
332
|
+
env_->ReserveThreads(std::min(num_extra_required_subcompactions,
|
|
333
|
+
available_bg_compactions_against_db_limit),
|
|
334
|
+
std::min(thread_pri_, Env::Priority::HIGH));
|
|
335
|
+
|
|
336
|
+
// Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
|
|
337
|
+
// depending on if this compaction has the bottommost priority
|
|
338
|
+
if (thread_pri_ == Env::Priority::BOTTOM) {
|
|
339
|
+
*bg_bottom_compaction_scheduled_ +=
|
|
340
|
+
extra_num_subcompaction_threads_reserved_;
|
|
341
|
+
} else {
|
|
342
|
+
*bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
|
|
343
|
+
}
|
|
344
|
+
db_mutex_->Unlock();
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
|
|
348
|
+
// Do nothing when we have zero resources to shrink
|
|
349
|
+
if (num_extra_resources == 0) return;
|
|
350
|
+
db_mutex_->Lock();
|
|
351
|
+
// We cannot release threads more than what we reserved before
|
|
352
|
+
int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
|
|
353
|
+
(int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
|
|
354
|
+
// Update the number of reserved threads and the number of background
|
|
355
|
+
// scheduled compactions for this compaction job
|
|
356
|
+
extra_num_subcompaction_threads_reserved_ -=
|
|
357
|
+
extra_num_subcompaction_threads_released;
|
|
358
|
+
// TODO (zichen): design a test case with new subcompaction partitioning
|
|
359
|
+
// when the number of actual partitions is less than the number of planned
|
|
360
|
+
// partitions
|
|
361
|
+
assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
|
|
362
|
+
// Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
|
|
363
|
+
// depending on if this compaction has the bottommost priority
|
|
364
|
+
if (thread_pri_ == Env::Priority::BOTTOM) {
|
|
365
|
+
*bg_bottom_compaction_scheduled_ -=
|
|
366
|
+
extra_num_subcompaction_threads_released;
|
|
367
|
+
} else {
|
|
368
|
+
*bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
|
|
369
|
+
}
|
|
370
|
+
db_mutex_->Unlock();
|
|
371
|
+
TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
void CompactionJob::ReleaseSubcompactionResources() {
|
|
375
|
+
if (extra_num_subcompaction_threads_reserved_ == 0) {
|
|
376
|
+
return;
|
|
377
|
+
}
|
|
378
|
+
// The number of reserved threads becomes larger than 0 only if the
|
|
379
|
+
// compaction priority is round robin and there are insufficient
|
|
380
|
+
// sub-compactions available
|
|
381
|
+
|
|
382
|
+
// The scheduled compaction must be no less than 1 + extra number
|
|
383
|
+
// subcompactions using acquired resources since this compaction job has not
|
|
384
|
+
// finished yet
|
|
385
|
+
assert(*bg_bottom_compaction_scheduled_ >=
|
|
386
|
+
1 + extra_num_subcompaction_threads_reserved_ ||
|
|
387
|
+
*bg_compaction_scheduled_ >=
|
|
388
|
+
1 + extra_num_subcompaction_threads_reserved_);
|
|
389
|
+
ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
|
|
390
|
+
}
|
|
391
|
+
|
|
293
392
|
struct RangeWithSize {
|
|
294
393
|
Range range;
|
|
295
394
|
uint64_t size;
|
|
@@ -299,15 +398,51 @@ struct RangeWithSize {
|
|
|
299
398
|
};
|
|
300
399
|
|
|
301
400
|
void CompactionJob::GenSubcompactionBoundaries() {
|
|
401
|
+
// The goal is to find some boundary keys so that we can evenly partition
|
|
402
|
+
// the compaction input data into max_subcompactions ranges.
|
|
403
|
+
// For every input file, we ask TableReader to estimate 128 anchor points
|
|
404
|
+
// that evenly partition the input file into 128 ranges and the range
|
|
405
|
+
// sizes. This can be calculated by scanning index blocks of the file.
|
|
406
|
+
// Once we have the anchor points for all the input files, we merge them
|
|
407
|
+
// together and try to find keys dividing ranges evenly.
|
|
408
|
+
// For example, if we have two input files, and each returns following
|
|
409
|
+
// ranges:
|
|
410
|
+
// File1: (a1, 1000), (b1, 1200), (c1, 1100)
|
|
411
|
+
// File2: (a2, 1100), (b2, 1000), (c2, 1000)
|
|
412
|
+
// We total sort the keys to following:
|
|
413
|
+
// (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
|
|
414
|
+
// We calculate the total size by adding up all ranges' size, which is 6400.
|
|
415
|
+
// If we would like to partition into 2 subcompactions, the target of the
|
|
416
|
+
// range size is 3200. Based on the size, we take "b1" as the partition key
|
|
417
|
+
// since the first three ranges would hit 3200.
|
|
418
|
+
//
|
|
419
|
+
// Note that the ranges are actually overlapping. For example, in the example
|
|
420
|
+
// above, the range ending with "b1" is overlapping with the range ending with
|
|
421
|
+
// "b2". So the size 1000+1100+1200 is an underestimation of data size up to
|
|
422
|
+
// "b1". In extreme cases where we only compact N L0 files, a range can
|
|
423
|
+
// overlap with N-1 other ranges. Since we requested a relatively large number
|
|
424
|
+
// (128) of ranges from each input files, even N range overlapping would
|
|
425
|
+
// cause relatively small inaccuracy.
|
|
426
|
+
|
|
302
427
|
auto* c = compact_->compaction;
|
|
428
|
+
if (c->max_subcompactions() <= 1 &&
|
|
429
|
+
!(c->immutable_options()->compaction_pri == kRoundRobin &&
|
|
430
|
+
c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
|
|
431
|
+
return;
|
|
432
|
+
}
|
|
303
433
|
auto* cfd = c->column_family_data();
|
|
304
434
|
const Comparator* cfd_comparator = cfd->user_comparator();
|
|
305
|
-
|
|
435
|
+
const InternalKeyComparator& icomp = cfd->internal_comparator();
|
|
436
|
+
|
|
437
|
+
auto* v = compact_->compaction->input_version();
|
|
438
|
+
int base_level = v->storage_info()->base_level();
|
|
439
|
+
InstrumentedMutexUnlock unlock_guard(db_mutex_);
|
|
440
|
+
|
|
441
|
+
uint64_t total_size = 0;
|
|
442
|
+
std::vector<TableReader::Anchor> all_anchors;
|
|
306
443
|
int start_lvl = c->start_level();
|
|
307
444
|
int out_lvl = c->output_level();
|
|
308
445
|
|
|
309
|
-
// Add the starting and/or ending key of certain input files as a potential
|
|
310
|
-
// boundary
|
|
311
446
|
for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
|
|
312
447
|
int lvl = c->level(lvl_idx);
|
|
313
448
|
if (lvl >= start_lvl && lvl <= out_lvl) {
|
|
@@ -318,108 +453,102 @@ void CompactionJob::GenSubcompactionBoundaries() {
|
|
|
318
453
|
continue;
|
|
319
454
|
}
|
|
320
455
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
456
|
+
for (size_t i = 0; i < num_files; i++) {
|
|
457
|
+
FileMetaData* f = flevel->files[i].file_metadata;
|
|
458
|
+
std::vector<TableReader::Anchor> my_anchors;
|
|
459
|
+
Status s = cfd->table_cache()->ApproximateKeyAnchors(
|
|
460
|
+
ReadOptions(), icomp, f->fd, my_anchors);
|
|
461
|
+
if (!s.ok() || my_anchors.empty()) {
|
|
462
|
+
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
|
|
327
463
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
bounds.emplace_back(flevel->files[0].smallest_key);
|
|
332
|
-
bounds.emplace_back(flevel->files[num_files - 1].largest_key);
|
|
333
|
-
if (lvl == out_lvl) {
|
|
334
|
-
// For the last level include the starting keys of all files since
|
|
335
|
-
// the last level is the largest and probably has the widest key
|
|
336
|
-
// range. Since it's range partitioned, the ending key of one file
|
|
337
|
-
// and the starting key of the next are very close (or identical).
|
|
338
|
-
for (size_t i = 1; i < num_files; i++) {
|
|
339
|
-
bounds.emplace_back(flevel->files[i].smallest_key);
|
|
340
|
-
}
|
|
464
|
+
for (auto& ac : my_anchors) {
|
|
465
|
+
// Can be optimized to avoid this loop.
|
|
466
|
+
total_size += ac.range_size;
|
|
341
467
|
}
|
|
468
|
+
|
|
469
|
+
all_anchors.insert(all_anchors.end(), my_anchors.begin(),
|
|
470
|
+
my_anchors.end());
|
|
342
471
|
}
|
|
343
472
|
}
|
|
344
473
|
}
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
474
|
+
// Here we total sort all the anchor points across all files and go through
|
|
475
|
+
// them in the sorted order to find partitioning boundaries.
|
|
476
|
+
// Not the most efficient implementation. A much more efficient algorithm
|
|
477
|
+
// probably exists. But they are more complex. If performance turns out to
|
|
478
|
+
// be a problem, we can optimize.
|
|
479
|
+
std::sort(
|
|
480
|
+
all_anchors.begin(), all_anchors.end(),
|
|
481
|
+
[cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
|
|
482
|
+
return cfd_comparator->Compare(a.user_key, b.user_key) < 0;
|
|
483
|
+
});
|
|
484
|
+
|
|
485
|
+
// Get the number of planned subcompactions, may update reserve threads
|
|
486
|
+
// and update extra_num_subcompaction_threads_reserved_ for round-robin
|
|
487
|
+
uint64_t num_planned_subcompactions;
|
|
488
|
+
if (c->immutable_options()->compaction_pri == kRoundRobin &&
|
|
489
|
+
c->immutable_options()->compaction_style == kCompactionStyleLevel) {
|
|
490
|
+
// For round-robin compaction priority, we need to employ more
|
|
491
|
+
// subcompactions (may exceed the max_subcompaction limit). The extra
|
|
492
|
+
// subcompactions will be executed using reserved threads and taken into
|
|
493
|
+
// account bg_compaction_scheduled or bg_bottom_compaction_scheduled.
|
|
494
|
+
|
|
495
|
+
// Initialized by the number of input files
|
|
496
|
+
num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
|
|
497
|
+
uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
|
|
498
|
+
if (max_subcompactions_limit < num_planned_subcompactions) {
|
|
499
|
+
// Assert two pointers are not empty so that we can use extra
|
|
500
|
+
// subcompactions against db compaction limits
|
|
501
|
+
assert(bg_bottom_compaction_scheduled_ != nullptr);
|
|
502
|
+
assert(bg_compaction_scheduled_ != nullptr);
|
|
503
|
+
// Reserve resources when max_subcompaction is not sufficient
|
|
504
|
+
AcquireSubcompactionResources(
|
|
505
|
+
(int)(num_planned_subcompactions - max_subcompactions_limit));
|
|
506
|
+
// Subcompactions limit changes after acquiring additional resources.
|
|
507
|
+
// Need to call GetSubcompactionsLimit() again to update the number
|
|
508
|
+
// of planned subcompactions
|
|
509
|
+
num_planned_subcompactions =
|
|
510
|
+
std::min(num_planned_subcompactions, GetSubcompactionsLimit());
|
|
374
511
|
}
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
// ApproximateSize could potentially create table reader iterator to seek
|
|
379
|
-
// to the index block and may incur I/O cost in the process. Unlock db
|
|
380
|
-
// mutex to reduce contention
|
|
381
|
-
db_mutex_->Unlock();
|
|
382
|
-
uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
|
|
383
|
-
b, start_lvl, out_lvl + 1,
|
|
384
|
-
TableReaderCaller::kCompaction);
|
|
385
|
-
db_mutex_->Lock();
|
|
386
|
-
ranges.emplace_back(a, b, size);
|
|
387
|
-
sum += size;
|
|
512
|
+
} else {
|
|
513
|
+
num_planned_subcompactions = GetSubcompactionsLimit();
|
|
388
514
|
}
|
|
389
515
|
|
|
516
|
+
TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
|
|
517
|
+
&num_planned_subcompactions);
|
|
518
|
+
if (num_planned_subcompactions == 1) return;
|
|
519
|
+
|
|
390
520
|
// Group the ranges into subcompactions
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
|
|
394
|
-
sum / min_file_fill_percent /
|
|
521
|
+
uint64_t target_range_size = std::max(
|
|
522
|
+
total_size / num_planned_subcompactions,
|
|
395
523
|
MaxFileSizeForLevel(
|
|
396
524
|
*(c->mutable_cf_options()), out_lvl,
|
|
397
525
|
c->immutable_options()->compaction_style, base_level,
|
|
398
|
-
c->immutable_options()->level_compaction_dynamic_level_bytes))
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
if (sum >= mean) {
|
|
417
|
-
boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
|
|
418
|
-
subcompactions--;
|
|
419
|
-
sum = 0;
|
|
420
|
-
}
|
|
526
|
+
c->immutable_options()->level_compaction_dynamic_level_bytes));
|
|
527
|
+
|
|
528
|
+
if (target_range_size >= total_size) {
|
|
529
|
+
return;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
uint64_t next_threshold = target_range_size;
|
|
533
|
+
uint64_t cumulative_size = 0;
|
|
534
|
+
uint64_t num_actual_subcompactions = 1U;
|
|
535
|
+
for (TableReader::Anchor& anchor : all_anchors) {
|
|
536
|
+
cumulative_size += anchor.range_size;
|
|
537
|
+
if (cumulative_size > next_threshold) {
|
|
538
|
+
next_threshold += target_range_size;
|
|
539
|
+
num_actual_subcompactions++;
|
|
540
|
+
boundaries_.push_back(anchor.user_key);
|
|
541
|
+
}
|
|
542
|
+
if (num_actual_subcompactions == num_planned_subcompactions) {
|
|
543
|
+
break;
|
|
421
544
|
}
|
|
422
545
|
}
|
|
546
|
+
TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
|
|
547
|
+
&num_actual_subcompactions);
|
|
548
|
+
// Shrink extra subcompaction resources when extra resources are acquired
|
|
549
|
+
ShrinkSubcompactionResources(
|
|
550
|
+
std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
|
|
551
|
+
extra_num_subcompaction_threads_reserved_));
|
|
423
552
|
}
|
|
424
553
|
|
|
425
554
|
Status CompactionJob::Run() {
|
|
@@ -582,6 +711,7 @@ Status CompactionJob::Run() {
|
|
|
582
711
|
for (auto& thread : thread_pool) {
|
|
583
712
|
thread.join();
|
|
584
713
|
}
|
|
714
|
+
|
|
585
715
|
for (const auto& state : compact_->sub_compact_states) {
|
|
586
716
|
if (!state.status.ok()) {
|
|
587
717
|
status = state.status;
|
|
@@ -590,6 +720,10 @@ Status CompactionJob::Run() {
|
|
|
590
720
|
}
|
|
591
721
|
}
|
|
592
722
|
|
|
723
|
+
ReleaseSubcompactionResources();
|
|
724
|
+
TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
|
|
725
|
+
TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
|
|
726
|
+
|
|
593
727
|
TablePropertiesCollection tp;
|
|
594
728
|
for (const auto& state : compact_->sub_compact_states) {
|
|
595
729
|
for (const auto& output : state.GetOutputs()) {
|
|
@@ -885,8 +1019,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
885
1019
|
|
|
886
1020
|
// TODO: since we already use C++17, should use
|
|
887
1021
|
// std::optional<const Slice> instead.
|
|
888
|
-
const Slice
|
|
889
|
-
const Slice
|
|
1022
|
+
const std::optional<Slice> start = sub_compact->start;
|
|
1023
|
+
const std::optional<Slice> end = sub_compact->end;
|
|
890
1024
|
|
|
891
1025
|
ReadOptions read_options;
|
|
892
1026
|
read_options.verify_checksums = true;
|
|
@@ -900,19 +1034,20 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
900
1034
|
|
|
901
1035
|
// Note: if we're going to support subcompactions for user-defined timestamps,
|
|
902
1036
|
// the timestamp part will have to be stripped from the bounds here.
|
|
903
|
-
assert((!start && !end
|
|
904
|
-
|
|
905
|
-
|
|
1037
|
+
assert((!start.has_value() && !end.has_value()) ||
|
|
1038
|
+
cfd->user_comparator()->timestamp_size() == 0);
|
|
1039
|
+
if (start.has_value()) {
|
|
1040
|
+
read_options.iterate_lower_bound = &start.value();
|
|
1041
|
+
}
|
|
1042
|
+
if (end.has_value()) {
|
|
1043
|
+
read_options.iterate_upper_bound = &end.value();
|
|
1044
|
+
}
|
|
906
1045
|
|
|
907
1046
|
// Although the v2 aggregator is what the level iterator(s) know about,
|
|
908
1047
|
// the AddTombstones calls will be propagated down to the v1 aggregator.
|
|
909
1048
|
std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
|
|
910
1049
|
read_options, sub_compact->compaction, range_del_agg.get(),
|
|
911
|
-
file_options_for_read_,
|
|
912
|
-
(start == nullptr) ? std::optional<const Slice>{}
|
|
913
|
-
: std::optional<const Slice>{*start},
|
|
914
|
-
(end == nullptr) ? std::optional<const Slice>{}
|
|
915
|
-
: std::optional<const Slice>{*end}));
|
|
1050
|
+
file_options_for_read_, start, end));
|
|
916
1051
|
InternalIterator* input = raw_input.get();
|
|
917
1052
|
|
|
918
1053
|
IterKey start_ikey;
|
|
@@ -920,20 +1055,21 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
920
1055
|
Slice start_slice;
|
|
921
1056
|
Slice end_slice;
|
|
922
1057
|
|
|
923
|
-
if (start) {
|
|
924
|
-
start_ikey.SetInternalKey(
|
|
1058
|
+
if (start.has_value()) {
|
|
1059
|
+
start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
|
|
1060
|
+
kValueTypeForSeek);
|
|
925
1061
|
start_slice = start_ikey.GetInternalKey();
|
|
926
1062
|
}
|
|
927
|
-
if (end) {
|
|
928
|
-
end_ikey.SetInternalKey(
|
|
1063
|
+
if (end.has_value()) {
|
|
1064
|
+
end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
|
|
929
1065
|
end_slice = end_ikey.GetInternalKey();
|
|
930
1066
|
}
|
|
931
1067
|
|
|
932
1068
|
std::unique_ptr<InternalIterator> clip;
|
|
933
|
-
if (start || end) {
|
|
1069
|
+
if (start.has_value() || end.has_value()) {
|
|
934
1070
|
clip = std::make_unique<ClippingIterator>(
|
|
935
|
-
raw_input.get(), start ? &start_slice : nullptr,
|
|
936
|
-
end ? &end_slice : nullptr, &cfd->internal_comparator());
|
|
1071
|
+
raw_input.get(), start.has_value() ? &start_slice : nullptr,
|
|
1072
|
+
end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
|
|
937
1073
|
input = clip.get();
|
|
938
1074
|
}
|
|
939
1075
|
|
|
@@ -1061,8 +1197,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
1061
1197
|
// Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
|
|
1062
1198
|
// returns true.
|
|
1063
1199
|
|
|
1064
|
-
assert(!end ||
|
|
1065
|
-
|
|
1200
|
+
assert(!end.has_value() || cfd->user_comparator()->Compare(
|
|
1201
|
+
c_iter->user_key(), end.value()) < 0);
|
|
1066
1202
|
|
|
1067
1203
|
if (c_iter_stats.num_input_records % kRecordStatsEvery ==
|
|
1068
1204
|
kRecordStatsEvery - 1) {
|
|
@@ -1280,10 +1416,12 @@ Status CompactionJob::FinishCompactionOutputFile(
|
|
|
1280
1416
|
// output_to_penultimate_level compaction here, as it's only used to decide
|
|
1281
1417
|
// if range dels could be dropped.
|
|
1282
1418
|
if (outputs.HasRangeDel()) {
|
|
1283
|
-
s = outputs.AddRangeDels(
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1419
|
+
s = outputs.AddRangeDels(
|
|
1420
|
+
sub_compact->start.has_value() ? &(sub_compact->start.value())
|
|
1421
|
+
: nullptr,
|
|
1422
|
+
sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
|
|
1423
|
+
range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
|
|
1424
|
+
earliest_snapshot, next_table_min_key);
|
|
1287
1425
|
}
|
|
1288
1426
|
RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
|
|
1289
1427
|
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
|
|
@@ -1495,7 +1633,8 @@ Status CompactionJob::InstallCompactionResults(
|
|
|
1495
1633
|
if (start_level > 0) {
|
|
1496
1634
|
auto vstorage = compaction->input_version()->storage_info();
|
|
1497
1635
|
edit->AddCompactCursor(start_level,
|
|
1498
|
-
vstorage->GetNextCompactCursor(
|
|
1636
|
+
vstorage->GetNextCompactCursor(
|
|
1637
|
+
start_level, compaction->num_input_files(0)));
|
|
1499
1638
|
}
|
|
1500
1639
|
}
|
|
1501
1640
|
|
|
@@ -1595,16 +1734,16 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
|
|
|
1595
1734
|
}
|
|
1596
1735
|
uint64_t current_time = static_cast<uint64_t>(temp_current_time);
|
|
1597
1736
|
InternalKey tmp_start, tmp_end;
|
|
1598
|
-
if (sub_compact->start
|
|
1599
|
-
tmp_start.SetMinPossibleForUserKey(
|
|
1737
|
+
if (sub_compact->start.has_value()) {
|
|
1738
|
+
tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
|
|
1600
1739
|
}
|
|
1601
|
-
if (sub_compact->end
|
|
1602
|
-
tmp_end.SetMinPossibleForUserKey(
|
|
1740
|
+
if (sub_compact->end.has_value()) {
|
|
1741
|
+
tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
|
|
1603
1742
|
}
|
|
1604
1743
|
uint64_t oldest_ancester_time =
|
|
1605
1744
|
sub_compact->compaction->MinInputFileOldestAncesterTime(
|
|
1606
|
-
|
|
1607
|
-
|
|
1745
|
+
sub_compact->start.has_value() ? &tmp_start : nullptr,
|
|
1746
|
+
sub_compact->end.has_value() ? &tmp_end : nullptr);
|
|
1608
1747
|
if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
|
|
1609
1748
|
oldest_ancester_time = current_time;
|
|
1610
1749
|
}
|
|
@@ -164,7 +164,9 @@ class CompactionJob {
|
|
|
164
164
|
const std::atomic<bool>& manual_compaction_canceled,
|
|
165
165
|
const std::string& db_id = "", const std::string& db_session_id = "",
|
|
166
166
|
std::string full_history_ts_low = "", std::string trim_ts = "",
|
|
167
|
-
BlobFileCompletionCallback* blob_callback = nullptr
|
|
167
|
+
BlobFileCompletionCallback* blob_callback = nullptr,
|
|
168
|
+
int* bg_compaction_scheduled = nullptr,
|
|
169
|
+
int* bg_bottom_compaction_scheduled = nullptr);
|
|
168
170
|
|
|
169
171
|
virtual ~CompactionJob();
|
|
170
172
|
|
|
@@ -225,6 +227,26 @@ class CompactionJob {
|
|
|
225
227
|
// consecutive groups such that each group has a similar size.
|
|
226
228
|
void GenSubcompactionBoundaries();
|
|
227
229
|
|
|
230
|
+
// Get the number of planned subcompactions based on max_subcompactions and
|
|
231
|
+
// extra reserved resources
|
|
232
|
+
uint64_t GetSubcompactionsLimit();
|
|
233
|
+
|
|
234
|
+
// Additional reserved threads are reserved and the number is stored in
|
|
235
|
+
// extra_num_subcompaction_threads_reserved_. For now, this happens only if
|
|
236
|
+
// the compaction priority is round-robin and max_subcompactions is not
|
|
237
|
+
// sufficient (extra resources may be needed)
|
|
238
|
+
void AcquireSubcompactionResources(int num_extra_required_subcompactions);
|
|
239
|
+
|
|
240
|
+
// Additional threads may be reserved during IncreaseSubcompactionResources()
|
|
241
|
+
// if num_actual_subcompactions is less than num_planned_subcompactions.
|
|
242
|
+
// Additional threads will be released and the bg_compaction_scheduled_ or
|
|
243
|
+
// bg_bottom_compaction_scheduled_ will be updated if they are used.
|
|
244
|
+
// DB Mutex lock is required.
|
|
245
|
+
void ShrinkSubcompactionResources(uint64_t num_extra_resources);
|
|
246
|
+
|
|
247
|
+
// Release all reserved threads and update the compaction limits.
|
|
248
|
+
void ReleaseSubcompactionResources();
|
|
249
|
+
|
|
228
250
|
CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
|
|
229
251
|
SubcompactionState* sub_compact);
|
|
230
252
|
|
|
@@ -292,13 +314,22 @@ class CompactionJob {
|
|
|
292
314
|
bool paranoid_file_checks_;
|
|
293
315
|
bool measure_io_stats_;
|
|
294
316
|
// Stores the Slices that designate the boundaries for each subcompaction
|
|
295
|
-
std::vector<
|
|
317
|
+
std::vector<std::string> boundaries_;
|
|
296
318
|
Env::Priority thread_pri_;
|
|
297
319
|
std::string full_history_ts_low_;
|
|
298
320
|
std::string trim_ts_;
|
|
299
321
|
BlobFileCompletionCallback* blob_callback_;
|
|
300
322
|
|
|
301
323
|
uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
|
|
324
|
+
// Stores the number of reserved threads in shared env_ for the number of
|
|
325
|
+
// extra subcompaction in kRoundRobin compaction priority
|
|
326
|
+
int extra_num_subcompaction_threads_reserved_;
|
|
327
|
+
|
|
328
|
+
// Stores the pointer to bg_compaction_scheduled_,
|
|
329
|
+
// bg_bottom_compaction_scheduled_ in DBImpl. Mutex is required when accessing
|
|
330
|
+
// or updating it.
|
|
331
|
+
int* bg_compaction_scheduled_;
|
|
332
|
+
int* bg_bottom_compaction_scheduled_;
|
|
302
333
|
|
|
303
334
|
// Stores the sequence number to time mapping gathered from all input files
|
|
304
335
|
// it also collects the smallest_seqno -> oldest_ancester_time from the SST.
|
|
@@ -462,7 +462,7 @@ bool CompactionPicker::SetupOtherInputs(
|
|
|
462
462
|
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
|
|
463
463
|
VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
|
|
464
464
|
CompactionInputFiles* output_level_inputs, int* parent_index,
|
|
465
|
-
int base_index) {
|
|
465
|
+
int base_index, bool only_expand_towards_right) {
|
|
466
466
|
assert(!inputs->empty());
|
|
467
467
|
assert(output_level_inputs->empty());
|
|
468
468
|
const int input_level = inputs->level;
|
|
@@ -515,8 +515,16 @@ bool CompactionPicker::SetupOtherInputs(
|
|
|
515
515
|
InternalKey all_start, all_limit;
|
|
516
516
|
GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
|
|
517
517
|
bool try_overlapping_inputs = true;
|
|
518
|
-
|
|
519
|
-
|
|
518
|
+
if (only_expand_towards_right) {
|
|
519
|
+
// Round-robin compaction only allows expansion towards the larger side.
|
|
520
|
+
vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
|
|
521
|
+
&expanded_inputs.files, base_index,
|
|
522
|
+
nullptr);
|
|
523
|
+
} else {
|
|
524
|
+
vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
|
|
525
|
+
&expanded_inputs.files, base_index,
|
|
526
|
+
nullptr);
|
|
527
|
+
}
|
|
520
528
|
uint64_t expanded_inputs_size =
|
|
521
529
|
TotalCompensatedFileSize(expanded_inputs.files);
|
|
522
530
|
if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
|