@nxtedition/rocksdb 7.0.38 → 7.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +62 -33
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +27 -11
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +310 -337
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +394 -352
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
- package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
- package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
- package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
- package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
- package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +100 -78
- package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
- package/index.js +2 -2
- package/iterator.js +1 -1
- package/max_rev_operator.h +114 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -414,7 +414,7 @@ void BlobFileReader::MultiGetBlob(const ReadOptions& read_options,
|
|
|
414
414
|
assert(blob_reqs[i]->offset >= adjustment);
|
|
415
415
|
adjustments.push_back(adjustment);
|
|
416
416
|
|
|
417
|
-
FSReadRequest read_req;
|
|
417
|
+
FSReadRequest read_req = {};
|
|
418
418
|
read_req.offset = blob_reqs[i]->offset - adjustment;
|
|
419
419
|
read_req.len = blob_reqs[i]->len + adjustment;
|
|
420
420
|
read_reqs.emplace_back(read_req);
|
|
@@ -786,13 +786,13 @@ namespace {
|
|
|
786
786
|
std::unique_ptr<WriteControllerToken> SetupDelay(
|
|
787
787
|
WriteController* write_controller, uint64_t compaction_needed_bytes,
|
|
788
788
|
uint64_t prev_compaction_need_bytes, bool penalize_stop,
|
|
789
|
-
bool
|
|
789
|
+
bool auto_compactions_disabled) {
|
|
790
790
|
const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
|
|
791
791
|
|
|
792
792
|
uint64_t max_write_rate = write_controller->max_delayed_write_rate();
|
|
793
793
|
uint64_t write_rate = write_controller->delayed_write_rate();
|
|
794
794
|
|
|
795
|
-
if (
|
|
795
|
+
if (auto_compactions_disabled) {
|
|
796
796
|
// When auto compaction is disabled, always use the value user gave.
|
|
797
797
|
write_rate = max_write_rate;
|
|
798
798
|
} else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
|
|
@@ -1116,7 +1116,7 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
|
|
|
1116
1116
|
CreateColumnFamilies({"one", "two", "three"});
|
|
1117
1117
|
ColumnFamilyOptions default_cf, one, two, three;
|
|
1118
1118
|
// setup options. all column families have max_write_buffer_number setup to 10
|
|
1119
|
-
// "default" -> 100KB memtable, start flushing
|
|
1119
|
+
// "default" -> 100KB memtable, start flushing immediately
|
|
1120
1120
|
// "one" -> 200KB memtable, start flushing with two immutable memtables
|
|
1121
1121
|
// "two" -> 1MB memtable, start flushing with three immutable memtables
|
|
1122
1122
|
// "three" -> 90KB memtable, start flushing with four immutable memtables
|
|
@@ -660,7 +660,7 @@ bool Compaction::IsOutputLevelEmpty() const {
|
|
|
660
660
|
}
|
|
661
661
|
|
|
662
662
|
bool Compaction::ShouldFormSubcompactions() const {
|
|
663
|
-
if (
|
|
663
|
+
if (cfd_ == nullptr) {
|
|
664
664
|
return false;
|
|
665
665
|
}
|
|
666
666
|
|
|
@@ -671,9 +671,19 @@ bool Compaction::ShouldFormSubcompactions() const {
|
|
|
671
671
|
return false;
|
|
672
672
|
}
|
|
673
673
|
|
|
674
|
+
// Round-Robin pri under leveled compaction allows subcompactions by default
|
|
675
|
+
// and the number of subcompactions can be larger than max_subcompactions_
|
|
676
|
+
if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
|
|
677
|
+
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
678
|
+
return output_level_ > 0;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
if (max_subcompactions_ <= 1) {
|
|
682
|
+
return false;
|
|
683
|
+
}
|
|
684
|
+
|
|
674
685
|
if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
675
|
-
return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0
|
|
676
|
-
!IsOutputLevelEmpty();
|
|
686
|
+
return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
|
|
677
687
|
} else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
|
|
678
688
|
return number_levels_ > 1 && output_level_ > 0;
|
|
679
689
|
} else {
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
#include <algorithm>
|
|
13
13
|
#include <cinttypes>
|
|
14
14
|
#include <memory>
|
|
15
|
+
#include <optional>
|
|
15
16
|
#include <set>
|
|
16
17
|
#include <utility>
|
|
17
18
|
#include <vector>
|
|
@@ -30,6 +31,7 @@
|
|
|
30
31
|
#include "db/log_writer.h"
|
|
31
32
|
#include "db/merge_helper.h"
|
|
32
33
|
#include "db/range_del_aggregator.h"
|
|
34
|
+
#include "db/version_edit.h"
|
|
33
35
|
#include "db/version_set.h"
|
|
34
36
|
#include "file/filename.h"
|
|
35
37
|
#include "file/read_write_util.h"
|
|
@@ -44,6 +46,7 @@
|
|
|
44
46
|
#include "port/port.h"
|
|
45
47
|
#include "rocksdb/db.h"
|
|
46
48
|
#include "rocksdb/env.h"
|
|
49
|
+
#include "rocksdb/options.h"
|
|
47
50
|
#include "rocksdb/statistics.h"
|
|
48
51
|
#include "rocksdb/status.h"
|
|
49
52
|
#include "rocksdb/table.h"
|
|
@@ -120,7 +123,8 @@ CompactionJob::CompactionJob(
|
|
|
120
123
|
const std::atomic<bool>& manual_compaction_canceled,
|
|
121
124
|
const std::string& db_id, const std::string& db_session_id,
|
|
122
125
|
std::string full_history_ts_low, std::string trim_ts,
|
|
123
|
-
BlobFileCompletionCallback* blob_callback
|
|
126
|
+
BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
|
|
127
|
+
int* bg_bottom_compaction_scheduled)
|
|
124
128
|
: compact_(new CompactionState(compaction)),
|
|
125
129
|
compaction_stats_(compaction->compaction_reason(), 1),
|
|
126
130
|
db_options_(db_options),
|
|
@@ -159,9 +163,13 @@ CompactionJob::CompactionJob(
|
|
|
159
163
|
thread_pri_(thread_pri),
|
|
160
164
|
full_history_ts_low_(std::move(full_history_ts_low)),
|
|
161
165
|
trim_ts_(std::move(trim_ts)),
|
|
162
|
-
blob_callback_(blob_callback)
|
|
166
|
+
blob_callback_(blob_callback),
|
|
167
|
+
extra_num_subcompaction_threads_reserved_(0),
|
|
168
|
+
bg_compaction_scheduled_(bg_compaction_scheduled),
|
|
169
|
+
bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
|
|
163
170
|
assert(compaction_job_stats_ != nullptr);
|
|
164
171
|
assert(log_buffer_ != nullptr);
|
|
172
|
+
|
|
165
173
|
const auto* cfd = compact_->compaction->column_family_data();
|
|
166
174
|
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
|
|
167
175
|
db_options_.enable_thread_tracking);
|
|
@@ -232,24 +240,22 @@ void CompactionJob::Prepare() {
|
|
|
232
240
|
bottommost_level_ = c->bottommost_level();
|
|
233
241
|
|
|
234
242
|
if (c->ShouldFormSubcompactions()) {
|
|
235
|
-
{
|
|
236
243
|
StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
|
|
237
244
|
GenSubcompactionBoundaries();
|
|
238
|
-
|
|
239
|
-
|
|
245
|
+
}
|
|
246
|
+
if (boundaries_.size() > 1) {
|
|
240
247
|
for (size_t i = 0; i <= boundaries_.size(); i++) {
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
248
|
+
compact_->sub_compact_states.emplace_back(
|
|
249
|
+
c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
|
|
250
|
+
(i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
|
|
251
|
+
: std::nullopt,
|
|
252
|
+
static_cast<uint32_t>(i));
|
|
245
253
|
}
|
|
246
254
|
RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
|
|
247
255
|
compact_->sub_compact_states.size());
|
|
248
256
|
} else {
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
compact_->sub_compact_states.emplace_back(c, start, end, /*sub_job_id*/ 0);
|
|
257
|
+
compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
|
|
258
|
+
/*sub_job_id*/ 0);
|
|
253
259
|
}
|
|
254
260
|
|
|
255
261
|
if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
|
|
@@ -290,6 +296,99 @@ void CompactionJob::Prepare() {
|
|
|
290
296
|
}
|
|
291
297
|
}
|
|
292
298
|
|
|
299
|
+
uint64_t CompactionJob::GetSubcompactionsLimit() {
|
|
300
|
+
return extra_num_subcompaction_threads_reserved_ +
|
|
301
|
+
std::max(
|
|
302
|
+
std::uint64_t(1),
|
|
303
|
+
static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
void CompactionJob::AcquireSubcompactionResources(
|
|
307
|
+
int num_extra_required_subcompactions) {
|
|
308
|
+
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
|
|
309
|
+
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
|
|
310
|
+
int max_db_compactions =
|
|
311
|
+
DBImpl::GetBGJobLimits(
|
|
312
|
+
mutable_db_options_copy_.max_background_flushes,
|
|
313
|
+
mutable_db_options_copy_.max_background_compactions,
|
|
314
|
+
mutable_db_options_copy_.max_background_jobs,
|
|
315
|
+
versions_->GetColumnFamilySet()
|
|
316
|
+
->write_controller()
|
|
317
|
+
->NeedSpeedupCompaction())
|
|
318
|
+
.max_compactions;
|
|
319
|
+
// Apply min function first since we need to compute the extra subcompaction
|
|
320
|
+
// against compaction limits. And then try to reserve threads for extra
|
|
321
|
+
// subcompactions. The actual number of reserved threads could be less than
|
|
322
|
+
// the desired number.
|
|
323
|
+
int available_bg_compactions_against_db_limit =
|
|
324
|
+
std::max(max_db_compactions - *bg_compaction_scheduled_ -
|
|
325
|
+
*bg_bottom_compaction_scheduled_,
|
|
326
|
+
0);
|
|
327
|
+
db_mutex_->Lock();
|
|
328
|
+
// Reservation only supports background threads of which the priority is
|
|
329
|
+
// between BOTTOM and HIGH. Need to degrade the priority to HIGH if the
|
|
330
|
+
// origin thread_pri_ is higher than that. Similar to ReleaseThreads().
|
|
331
|
+
extra_num_subcompaction_threads_reserved_ =
|
|
332
|
+
env_->ReserveThreads(std::min(num_extra_required_subcompactions,
|
|
333
|
+
available_bg_compactions_against_db_limit),
|
|
334
|
+
std::min(thread_pri_, Env::Priority::HIGH));
|
|
335
|
+
|
|
336
|
+
// Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
|
|
337
|
+
// depending on if this compaction has the bottommost priority
|
|
338
|
+
if (thread_pri_ == Env::Priority::BOTTOM) {
|
|
339
|
+
*bg_bottom_compaction_scheduled_ +=
|
|
340
|
+
extra_num_subcompaction_threads_reserved_;
|
|
341
|
+
} else {
|
|
342
|
+
*bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
|
|
343
|
+
}
|
|
344
|
+
db_mutex_->Unlock();
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
|
|
348
|
+
// Do nothing when we have zero resources to shrink
|
|
349
|
+
if (num_extra_resources == 0) return;
|
|
350
|
+
db_mutex_->Lock();
|
|
351
|
+
// We cannot release threads more than what we reserved before
|
|
352
|
+
int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
|
|
353
|
+
(int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
|
|
354
|
+
// Update the number of reserved threads and the number of background
|
|
355
|
+
// scheduled compactions for this compaction job
|
|
356
|
+
extra_num_subcompaction_threads_reserved_ -=
|
|
357
|
+
extra_num_subcompaction_threads_released;
|
|
358
|
+
// TODO (zichen): design a test case with new subcompaction partitioning
|
|
359
|
+
// when the number of actual partitions is less than the number of planned
|
|
360
|
+
// partitions
|
|
361
|
+
assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
|
|
362
|
+
// Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
|
|
363
|
+
// depending on if this compaction has the bottommost priority
|
|
364
|
+
if (thread_pri_ == Env::Priority::BOTTOM) {
|
|
365
|
+
*bg_bottom_compaction_scheduled_ -=
|
|
366
|
+
extra_num_subcompaction_threads_released;
|
|
367
|
+
} else {
|
|
368
|
+
*bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
|
|
369
|
+
}
|
|
370
|
+
db_mutex_->Unlock();
|
|
371
|
+
TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
void CompactionJob::ReleaseSubcompactionResources() {
|
|
375
|
+
if (extra_num_subcompaction_threads_reserved_ == 0) {
|
|
376
|
+
return;
|
|
377
|
+
}
|
|
378
|
+
// The number of reserved threads becomes larger than 0 only if the
|
|
379
|
+
// compaction priority is round robin and there is no sufficient
|
|
380
|
+
// sub-compactions available
|
|
381
|
+
|
|
382
|
+
// The scheduled compaction must be no less than 1 + extra number
|
|
383
|
+
// subcompactions using acquired resources since this compaction job has not
|
|
384
|
+
// finished yet
|
|
385
|
+
assert(*bg_bottom_compaction_scheduled_ >=
|
|
386
|
+
1 + extra_num_subcompaction_threads_reserved_ ||
|
|
387
|
+
*bg_compaction_scheduled_ >=
|
|
388
|
+
1 + extra_num_subcompaction_threads_reserved_);
|
|
389
|
+
ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
|
|
390
|
+
}
|
|
391
|
+
|
|
293
392
|
struct RangeWithSize {
|
|
294
393
|
Range range;
|
|
295
394
|
uint64_t size;
|
|
@@ -299,15 +398,51 @@ struct RangeWithSize {
|
|
|
299
398
|
};
|
|
300
399
|
|
|
301
400
|
void CompactionJob::GenSubcompactionBoundaries() {
|
|
401
|
+
// The goal is to find some boundary keys so that we can evenly partition
|
|
402
|
+
// the compaction input data into max_subcompactions ranges.
|
|
403
|
+
// For every input file, we ask TableReader to estimate 128 anchor points
|
|
404
|
+
// that evenly partition the input file into 128 ranges and the range
|
|
405
|
+
// sizes. This can be calculated by scanning index blocks of the file.
|
|
406
|
+
// Once we have the anchor points for all the input files, we merge them
|
|
407
|
+
// together and try to find keys dividing ranges evenly.
|
|
408
|
+
// For example, if we have two input files, and each returns following
|
|
409
|
+
// ranges:
|
|
410
|
+
// File1: (a1, 1000), (b1, 1200), (c1, 1100)
|
|
411
|
+
// File2: (a2, 1100), (b2, 1000), (c2, 1000)
|
|
412
|
+
// We total sort the keys to following:
|
|
413
|
+
// (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
|
|
414
|
+
// We calculate the total size by adding up all ranges' size, which is 6400.
|
|
415
|
+
// If we would like to partition into 2 subcompactions, the target of the
|
|
416
|
+
// range size is 3200. Based on the size, we take "b1" as the partition key
|
|
417
|
+
// since the first three ranges would hit 3200.
|
|
418
|
+
//
|
|
419
|
+
// Note that the ranges are actually overlapping. For example, in the example
|
|
420
|
+
// above, the range ending with "b1" is overlapping with the range ending with
|
|
421
|
+
// "b2". So the size 1000+1100+1200 is an underestimation of data size up to
|
|
422
|
+
// "b1". In extreme cases where we only compact N L0 files, a range can
|
|
423
|
+
// overlap with N-1 other ranges. Since we requested a relatively large number
|
|
424
|
+
// (128) of ranges from each input file, even N range overlapping would
|
|
425
|
+
// cause relatively small inaccuracy.
|
|
426
|
+
|
|
302
427
|
auto* c = compact_->compaction;
|
|
428
|
+
if (c->max_subcompactions() <= 1 &&
|
|
429
|
+
!(c->immutable_options()->compaction_pri == kRoundRobin &&
|
|
430
|
+
c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
|
|
431
|
+
return;
|
|
432
|
+
}
|
|
303
433
|
auto* cfd = c->column_family_data();
|
|
304
434
|
const Comparator* cfd_comparator = cfd->user_comparator();
|
|
305
|
-
|
|
435
|
+
const InternalKeyComparator& icomp = cfd->internal_comparator();
|
|
436
|
+
|
|
437
|
+
auto* v = compact_->compaction->input_version();
|
|
438
|
+
int base_level = v->storage_info()->base_level();
|
|
439
|
+
InstrumentedMutexUnlock unlock_guard(db_mutex_);
|
|
440
|
+
|
|
441
|
+
uint64_t total_size = 0;
|
|
442
|
+
std::vector<TableReader::Anchor> all_anchors;
|
|
306
443
|
int start_lvl = c->start_level();
|
|
307
444
|
int out_lvl = c->output_level();
|
|
308
445
|
|
|
309
|
-
// Add the starting and/or ending key of certain input files as a potential
|
|
310
|
-
// boundary
|
|
311
446
|
for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
|
|
312
447
|
int lvl = c->level(lvl_idx);
|
|
313
448
|
if (lvl >= start_lvl && lvl <= out_lvl) {
|
|
@@ -318,108 +453,102 @@ void CompactionJob::GenSubcompactionBoundaries() {
|
|
|
318
453
|
continue;
|
|
319
454
|
}
|
|
320
455
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
456
|
+
for (size_t i = 0; i < num_files; i++) {
|
|
457
|
+
FileMetaData* f = flevel->files[i].file_metadata;
|
|
458
|
+
std::vector<TableReader::Anchor> my_anchors;
|
|
459
|
+
Status s = cfd->table_cache()->ApproximateKeyAnchors(
|
|
460
|
+
ReadOptions(), icomp, f->fd, my_anchors);
|
|
461
|
+
if (!s.ok() || my_anchors.empty()) {
|
|
462
|
+
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
|
|
327
463
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
bounds.emplace_back(flevel->files[0].smallest_key);
|
|
332
|
-
bounds.emplace_back(flevel->files[num_files - 1].largest_key);
|
|
333
|
-
if (lvl == out_lvl) {
|
|
334
|
-
// For the last level include the starting keys of all files since
|
|
335
|
-
// the last level is the largest and probably has the widest key
|
|
336
|
-
// range. Since it's range partitioned, the ending key of one file
|
|
337
|
-
// and the starting key of the next are very close (or identical).
|
|
338
|
-
for (size_t i = 1; i < num_files; i++) {
|
|
339
|
-
bounds.emplace_back(flevel->files[i].smallest_key);
|
|
340
|
-
}
|
|
464
|
+
for (auto& ac : my_anchors) {
|
|
465
|
+
// Can be optimized to avoid this loop.
|
|
466
|
+
total_size += ac.range_size;
|
|
341
467
|
}
|
|
468
|
+
|
|
469
|
+
all_anchors.insert(all_anchors.end(), my_anchors.begin(),
|
|
470
|
+
my_anchors.end());
|
|
342
471
|
}
|
|
343
472
|
}
|
|
344
473
|
}
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
474
|
+
// Here we total sort all the anchor points across all files and go through
|
|
475
|
+
// them in the sorted order to find partitioning boundaries.
|
|
476
|
+
// Not the most efficient implementation. A much more efficient algorithm
|
|
477
|
+
// probably exists. But they are more complex. If performance turns out to
|
|
478
|
+
// be a problem, we can optimize.
|
|
479
|
+
std::sort(
|
|
480
|
+
all_anchors.begin(), all_anchors.end(),
|
|
481
|
+
[cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
|
|
482
|
+
return cfd_comparator->Compare(a.user_key, b.user_key) < 0;
|
|
483
|
+
});
|
|
484
|
+
|
|
485
|
+
// Get the number of planned subcompactions, may update reserve threads
|
|
486
|
+
// and update extra_num_subcompaction_threads_reserved_ for round-robin
|
|
487
|
+
uint64_t num_planned_subcompactions;
|
|
488
|
+
if (c->immutable_options()->compaction_pri == kRoundRobin &&
|
|
489
|
+
c->immutable_options()->compaction_style == kCompactionStyleLevel) {
|
|
490
|
+
// For round-robin compaction priority, we need to employ more
|
|
491
|
+
// subcompactions (may exceed the max_subcompaction limit). The extra
|
|
492
|
+
// subcompactions will be executed using reserved threads and taken into
|
|
493
|
+
// account bg_compaction_scheduled or bg_bottom_compaction_scheduled.
|
|
494
|
+
|
|
495
|
+
// Initialized by the number of input files
|
|
496
|
+
num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
|
|
497
|
+
uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
|
|
498
|
+
if (max_subcompactions_limit < num_planned_subcompactions) {
|
|
499
|
+
// Assert two pointers are not empty so that we can use extra
|
|
500
|
+
// subcompactions against db compaction limits
|
|
501
|
+
assert(bg_bottom_compaction_scheduled_ != nullptr);
|
|
502
|
+
assert(bg_compaction_scheduled_ != nullptr);
|
|
503
|
+
// Reserve resources when max_subcompaction is not sufficient
|
|
504
|
+
AcquireSubcompactionResources(
|
|
505
|
+
(int)(num_planned_subcompactions - max_subcompactions_limit));
|
|
506
|
+
// Subcompactions limit changes after acquiring additional resources.
|
|
507
|
+
// Need to call GetSubcompactionsLimit() again to update the number
|
|
508
|
+
// of planned subcompactions
|
|
509
|
+
num_planned_subcompactions =
|
|
510
|
+
std::min(num_planned_subcompactions, GetSubcompactionsLimit());
|
|
374
511
|
}
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
// ApproximateSize could potentially create table reader iterator to seek
|
|
379
|
-
// to the index block and may incur I/O cost in the process. Unlock db
|
|
380
|
-
// mutex to reduce contention
|
|
381
|
-
db_mutex_->Unlock();
|
|
382
|
-
uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
|
|
383
|
-
b, start_lvl, out_lvl + 1,
|
|
384
|
-
TableReaderCaller::kCompaction);
|
|
385
|
-
db_mutex_->Lock();
|
|
386
|
-
ranges.emplace_back(a, b, size);
|
|
387
|
-
sum += size;
|
|
512
|
+
} else {
|
|
513
|
+
num_planned_subcompactions = GetSubcompactionsLimit();
|
|
388
514
|
}
|
|
389
515
|
|
|
516
|
+
TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
|
|
517
|
+
&num_planned_subcompactions);
|
|
518
|
+
if (num_planned_subcompactions == 1) return;
|
|
519
|
+
|
|
390
520
|
// Group the ranges into subcompactions
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
|
|
394
|
-
sum / min_file_fill_percent /
|
|
521
|
+
uint64_t target_range_size = std::max(
|
|
522
|
+
total_size / num_planned_subcompactions,
|
|
395
523
|
MaxFileSizeForLevel(
|
|
396
524
|
*(c->mutable_cf_options()), out_lvl,
|
|
397
525
|
c->immutable_options()->compaction_style, base_level,
|
|
398
|
-
c->immutable_options()->level_compaction_dynamic_level_bytes))
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
if (sum >= mean) {
|
|
417
|
-
boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
|
|
418
|
-
subcompactions--;
|
|
419
|
-
sum = 0;
|
|
420
|
-
}
|
|
526
|
+
c->immutable_options()->level_compaction_dynamic_level_bytes));
|
|
527
|
+
|
|
528
|
+
if (target_range_size >= total_size) {
|
|
529
|
+
return;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
uint64_t next_threshold = target_range_size;
|
|
533
|
+
uint64_t cumulative_size = 0;
|
|
534
|
+
uint64_t num_actual_subcompactions = 1U;
|
|
535
|
+
for (TableReader::Anchor& anchor : all_anchors) {
|
|
536
|
+
cumulative_size += anchor.range_size;
|
|
537
|
+
if (cumulative_size > next_threshold) {
|
|
538
|
+
next_threshold += target_range_size;
|
|
539
|
+
num_actual_subcompactions++;
|
|
540
|
+
boundaries_.push_back(anchor.user_key);
|
|
541
|
+
}
|
|
542
|
+
if (num_actual_subcompactions == num_planned_subcompactions) {
|
|
543
|
+
break;
|
|
421
544
|
}
|
|
422
545
|
}
|
|
546
|
+
TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
|
|
547
|
+
&num_actual_subcompactions);
|
|
548
|
+
// Shrink extra subcompactions resources when extra resources are acquired
|
|
549
|
+
ShrinkSubcompactionResources(
|
|
550
|
+
std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
|
|
551
|
+
extra_num_subcompaction_threads_reserved_));
|
|
423
552
|
}
|
|
424
553
|
|
|
425
554
|
Status CompactionJob::Run() {
|
|
@@ -582,6 +711,7 @@ Status CompactionJob::Run() {
|
|
|
582
711
|
for (auto& thread : thread_pool) {
|
|
583
712
|
thread.join();
|
|
584
713
|
}
|
|
714
|
+
|
|
585
715
|
for (const auto& state : compact_->sub_compact_states) {
|
|
586
716
|
if (!state.status.ok()) {
|
|
587
717
|
status = state.status;
|
|
@@ -590,6 +720,10 @@ Status CompactionJob::Run() {
|
|
|
590
720
|
}
|
|
591
721
|
}
|
|
592
722
|
|
|
723
|
+
ReleaseSubcompactionResources();
|
|
724
|
+
TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
|
|
725
|
+
TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
|
|
726
|
+
|
|
593
727
|
TablePropertiesCollection tp;
|
|
594
728
|
for (const auto& state : compact_->sub_compact_states) {
|
|
595
729
|
for (const auto& output : state.GetOutputs()) {
|
|
@@ -885,8 +1019,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
885
1019
|
|
|
886
1020
|
// TODO: since we already use C++17, should use
|
|
887
1021
|
// std::optional<const Slice> instead.
|
|
888
|
-
const Slice
|
|
889
|
-
const Slice
|
|
1022
|
+
const std::optional<Slice> start = sub_compact->start;
|
|
1023
|
+
const std::optional<Slice> end = sub_compact->end;
|
|
890
1024
|
|
|
891
1025
|
ReadOptions read_options;
|
|
892
1026
|
read_options.verify_checksums = true;
|
|
@@ -900,19 +1034,20 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
900
1034
|
|
|
901
1035
|
// Note: if we're going to support subcompactions for user-defined timestamps,
|
|
902
1036
|
// the timestamp part will have to be stripped from the bounds here.
|
|
903
|
-
assert((!start && !end
|
|
904
|
-
|
|
905
|
-
|
|
1037
|
+
assert((!start.has_value() && !end.has_value()) ||
|
|
1038
|
+
cfd->user_comparator()->timestamp_size() == 0);
|
|
1039
|
+
if (start.has_value()) {
|
|
1040
|
+
read_options.iterate_lower_bound = &start.value();
|
|
1041
|
+
}
|
|
1042
|
+
if (end.has_value()) {
|
|
1043
|
+
read_options.iterate_upper_bound = &end.value();
|
|
1044
|
+
}
|
|
906
1045
|
|
|
907
1046
|
// Although the v2 aggregator is what the level iterator(s) know about,
|
|
908
1047
|
// the AddTombstones calls will be propagated down to the v1 aggregator.
|
|
909
1048
|
std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
|
|
910
1049
|
read_options, sub_compact->compaction, range_del_agg.get(),
|
|
911
|
-
file_options_for_read_,
|
|
912
|
-
(start == nullptr) ? std::optional<const Slice>{}
|
|
913
|
-
: std::optional<const Slice>{*start},
|
|
914
|
-
(end == nullptr) ? std::optional<const Slice>{}
|
|
915
|
-
: std::optional<const Slice>{*end}));
|
|
1050
|
+
file_options_for_read_, start, end));
|
|
916
1051
|
InternalIterator* input = raw_input.get();
|
|
917
1052
|
|
|
918
1053
|
IterKey start_ikey;
|
|
@@ -920,20 +1055,21 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
920
1055
|
Slice start_slice;
|
|
921
1056
|
Slice end_slice;
|
|
922
1057
|
|
|
923
|
-
if (start) {
|
|
924
|
-
start_ikey.SetInternalKey(
|
|
1058
|
+
if (start.has_value()) {
|
|
1059
|
+
start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
|
|
1060
|
+
kValueTypeForSeek);
|
|
925
1061
|
start_slice = start_ikey.GetInternalKey();
|
|
926
1062
|
}
|
|
927
|
-
if (end) {
|
|
928
|
-
end_ikey.SetInternalKey(
|
|
1063
|
+
if (end.has_value()) {
|
|
1064
|
+
end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
|
|
929
1065
|
end_slice = end_ikey.GetInternalKey();
|
|
930
1066
|
}
|
|
931
1067
|
|
|
932
1068
|
std::unique_ptr<InternalIterator> clip;
|
|
933
|
-
if (start || end) {
|
|
1069
|
+
if (start.has_value() || end.has_value()) {
|
|
934
1070
|
clip = std::make_unique<ClippingIterator>(
|
|
935
|
-
raw_input.get(), start ? &start_slice : nullptr,
|
|
936
|
-
end ? &end_slice : nullptr, &cfd->internal_comparator());
|
|
1071
|
+
raw_input.get(), start.has_value() ? &start_slice : nullptr,
|
|
1072
|
+
end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
|
|
937
1073
|
input = clip.get();
|
|
938
1074
|
}
|
|
939
1075
|
|
|
@@ -1061,8 +1197,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
|
|
|
1061
1197
|
// Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
|
|
1062
1198
|
// returns true.
|
|
1063
1199
|
|
|
1064
|
-
assert(!end ||
|
|
1065
|
-
|
|
1200
|
+
assert(!end.has_value() || cfd->user_comparator()->Compare(
|
|
1201
|
+
c_iter->user_key(), end.value()) < 0);
|
|
1066
1202
|
|
|
1067
1203
|
if (c_iter_stats.num_input_records % kRecordStatsEvery ==
|
|
1068
1204
|
kRecordStatsEvery - 1) {
|
|
@@ -1280,10 +1416,12 @@ Status CompactionJob::FinishCompactionOutputFile(
|
|
|
1280
1416
|
// output_to_penultimate_level compaction here, as it's only used to decide
|
|
1281
1417
|
// if range dels could be dropped.
|
|
1282
1418
|
if (outputs.HasRangeDel()) {
|
|
1283
|
-
s = outputs.AddRangeDels(
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1419
|
+
s = outputs.AddRangeDels(
|
|
1420
|
+
sub_compact->start.has_value() ? &(sub_compact->start.value())
|
|
1421
|
+
: nullptr,
|
|
1422
|
+
sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
|
|
1423
|
+
range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
|
|
1424
|
+
earliest_snapshot, next_table_min_key);
|
|
1287
1425
|
}
|
|
1288
1426
|
RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
|
|
1289
1427
|
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
|
|
@@ -1495,7 +1633,8 @@ Status CompactionJob::InstallCompactionResults(
|
|
|
1495
1633
|
if (start_level > 0) {
|
|
1496
1634
|
auto vstorage = compaction->input_version()->storage_info();
|
|
1497
1635
|
edit->AddCompactCursor(start_level,
|
|
1498
|
-
vstorage->GetNextCompactCursor(
|
|
1636
|
+
vstorage->GetNextCompactCursor(
|
|
1637
|
+
start_level, compaction->num_input_files(0)));
|
|
1499
1638
|
}
|
|
1500
1639
|
}
|
|
1501
1640
|
|
|
@@ -1595,16 +1734,16 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
|
|
|
1595
1734
|
}
|
|
1596
1735
|
uint64_t current_time = static_cast<uint64_t>(temp_current_time);
|
|
1597
1736
|
InternalKey tmp_start, tmp_end;
|
|
1598
|
-
if (sub_compact->start
|
|
1599
|
-
tmp_start.SetMinPossibleForUserKey(
|
|
1737
|
+
if (sub_compact->start.has_value()) {
|
|
1738
|
+
tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
|
|
1600
1739
|
}
|
|
1601
|
-
if (sub_compact->end
|
|
1602
|
-
tmp_end.SetMinPossibleForUserKey(
|
|
1740
|
+
if (sub_compact->end.has_value()) {
|
|
1741
|
+
tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
|
|
1603
1742
|
}
|
|
1604
1743
|
uint64_t oldest_ancester_time =
|
|
1605
1744
|
sub_compact->compaction->MinInputFileOldestAncesterTime(
|
|
1606
|
-
|
|
1607
|
-
|
|
1745
|
+
sub_compact->start.has_value() ? &tmp_start : nullptr,
|
|
1746
|
+
sub_compact->end.has_value() ? &tmp_end : nullptr);
|
|
1608
1747
|
if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
|
|
1609
1748
|
oldest_ancester_time = current_time;
|
|
1610
1749
|
}
|