@nxtedition/rocksdb 7.0.39 → 7.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/binding.cc +2 -12
  2. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
  3. package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
  4. package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
  5. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
  6. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
  7. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
  14. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
  15. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
  16. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
  17. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
  18. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
  23. package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
  24. package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
  25. package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
  26. package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
  27. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
  28. package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
  29. package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
  30. package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
  31. package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
  32. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
  33. package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
  34. package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
  35. package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
  36. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
  37. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
  38. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
  40. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
  41. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
  42. package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
  43. package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
  44. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
  45. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
  46. package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
  47. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
  48. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
  49. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
  50. package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
  51. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
  52. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
  53. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
  54. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
  55. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
  56. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
  57. package/package.json +1 -1
  58. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  59. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -12,6 +12,7 @@
12
12
  #include <algorithm>
13
13
  #include <cinttypes>
14
14
  #include <memory>
15
+ #include <optional>
15
16
  #include <set>
16
17
  #include <utility>
17
18
  #include <vector>
@@ -30,6 +31,7 @@
30
31
  #include "db/log_writer.h"
31
32
  #include "db/merge_helper.h"
32
33
  #include "db/range_del_aggregator.h"
34
+ #include "db/version_edit.h"
33
35
  #include "db/version_set.h"
34
36
  #include "file/filename.h"
35
37
  #include "file/read_write_util.h"
@@ -44,6 +46,7 @@
44
46
  #include "port/port.h"
45
47
  #include "rocksdb/db.h"
46
48
  #include "rocksdb/env.h"
49
+ #include "rocksdb/options.h"
47
50
  #include "rocksdb/statistics.h"
48
51
  #include "rocksdb/status.h"
49
52
  #include "rocksdb/table.h"
@@ -120,7 +123,8 @@ CompactionJob::CompactionJob(
120
123
  const std::atomic<bool>& manual_compaction_canceled,
121
124
  const std::string& db_id, const std::string& db_session_id,
122
125
  std::string full_history_ts_low, std::string trim_ts,
123
- BlobFileCompletionCallback* blob_callback)
126
+ BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
127
+ int* bg_bottom_compaction_scheduled)
124
128
  : compact_(new CompactionState(compaction)),
125
129
  compaction_stats_(compaction->compaction_reason(), 1),
126
130
  db_options_(db_options),
@@ -159,9 +163,13 @@ CompactionJob::CompactionJob(
159
163
  thread_pri_(thread_pri),
160
164
  full_history_ts_low_(std::move(full_history_ts_low)),
161
165
  trim_ts_(std::move(trim_ts)),
162
- blob_callback_(blob_callback) {
166
+ blob_callback_(blob_callback),
167
+ extra_num_subcompaction_threads_reserved_(0),
168
+ bg_compaction_scheduled_(bg_compaction_scheduled),
169
+ bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
163
170
  assert(compaction_job_stats_ != nullptr);
164
171
  assert(log_buffer_ != nullptr);
172
+
165
173
  const auto* cfd = compact_->compaction->column_family_data();
166
174
  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
167
175
  db_options_.enable_thread_tracking);
@@ -232,24 +240,22 @@ void CompactionJob::Prepare() {
232
240
  bottommost_level_ = c->bottommost_level();
233
241
 
234
242
  if (c->ShouldFormSubcompactions()) {
235
- {
236
243
  StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
237
244
  GenSubcompactionBoundaries();
238
- }
239
-
245
+ }
246
+ if (boundaries_.size() > 1) {
240
247
  for (size_t i = 0; i <= boundaries_.size(); i++) {
241
- Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
242
- Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
243
- compact_->sub_compact_states.emplace_back(c, start, end,
244
- static_cast<uint32_t>(i));
248
+ compact_->sub_compact_states.emplace_back(
249
+ c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
250
+ (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
251
+ : std::nullopt,
252
+ static_cast<uint32_t>(i));
245
253
  }
246
254
  RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
247
255
  compact_->sub_compact_states.size());
248
256
  } else {
249
- constexpr Slice* start = nullptr;
250
- constexpr Slice* end = nullptr;
251
-
252
- compact_->sub_compact_states.emplace_back(c, start, end, /*sub_job_id*/ 0);
257
+ compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
258
+ /*sub_job_id*/ 0);
253
259
  }
254
260
 
255
261
  if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
@@ -290,6 +296,99 @@ void CompactionJob::Prepare() {
290
296
  }
291
297
  }
292
298
 
299
+ uint64_t CompactionJob::GetSubcompactionsLimit() {
300
+ return extra_num_subcompaction_threads_reserved_ +
301
+ std::max(
302
+ std::uint64_t(1),
303
+ static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
304
+ }
305
+
306
+ void CompactionJob::AcquireSubcompactionResources(
307
+ int num_extra_required_subcompactions) {
308
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
309
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
310
+ int max_db_compactions =
311
+ DBImpl::GetBGJobLimits(
312
+ mutable_db_options_copy_.max_background_flushes,
313
+ mutable_db_options_copy_.max_background_compactions,
314
+ mutable_db_options_copy_.max_background_jobs,
315
+ versions_->GetColumnFamilySet()
316
+ ->write_controller()
317
+ ->NeedSpeedupCompaction())
318
+ .max_compactions;
319
+ // Apply min function first since We need to compute the extra subcompaction
320
+ // against compaction limits. And then try to reserve threads for extra
321
+ // subcompactions. The actual number of reserved threads could be less than
322
+ // the desired number.
323
+ int available_bg_compactions_against_db_limit =
324
+ std::max(max_db_compactions - *bg_compaction_scheduled_ -
325
+ *bg_bottom_compaction_scheduled_,
326
+ 0);
327
+ db_mutex_->Lock();
328
+ // Reservation only supports backgrdoun threads of which the priority is
329
+ // between BOTTOM and HIGH. Need to degrade the priority to HIGH if the
330
+ // origin thread_pri_ is higher than that. Similar to ReleaseThreads().
331
+ extra_num_subcompaction_threads_reserved_ =
332
+ env_->ReserveThreads(std::min(num_extra_required_subcompactions,
333
+ available_bg_compactions_against_db_limit),
334
+ std::min(thread_pri_, Env::Priority::HIGH));
335
+
336
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
337
+ // depending on if this compaction has the bottommost priority
338
+ if (thread_pri_ == Env::Priority::BOTTOM) {
339
+ *bg_bottom_compaction_scheduled_ +=
340
+ extra_num_subcompaction_threads_reserved_;
341
+ } else {
342
+ *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
343
+ }
344
+ db_mutex_->Unlock();
345
+ }
346
+
347
+ void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
348
+ // Do nothing when we have zero resources to shrink
349
+ if (num_extra_resources == 0) return;
350
+ db_mutex_->Lock();
351
+ // We cannot release threads more than what we reserved before
352
+ int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
353
+ (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
354
+ // Update the number of reserved threads and the number of background
355
+ // scheduled compactions for this compaction job
356
+ extra_num_subcompaction_threads_reserved_ -=
357
+ extra_num_subcompaction_threads_released;
358
+ // TODO (zichen): design a test case with new subcompaction partitioning
359
+ // when the number of actual partitions is less than the number of planned
360
+ // partitions
361
+ assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
362
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
363
+ // depending on if this compaction has the bottommost priority
364
+ if (thread_pri_ == Env::Priority::BOTTOM) {
365
+ *bg_bottom_compaction_scheduled_ -=
366
+ extra_num_subcompaction_threads_released;
367
+ } else {
368
+ *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
369
+ }
370
+ db_mutex_->Unlock();
371
+ TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
372
+ }
373
+
374
+ void CompactionJob::ReleaseSubcompactionResources() {
375
+ if (extra_num_subcompaction_threads_reserved_ == 0) {
376
+ return;
377
+ }
378
+ // The number of reserved threads becomes larger than 0 only if the
379
+ // compaction prioity is round robin and there is no sufficient
380
+ // sub-compactions available
381
+
382
+ // The scheduled compaction must be no less than 1 + extra number
383
+ // subcompactions using acquired resources since this compaction job has not
384
+ // finished yet
385
+ assert(*bg_bottom_compaction_scheduled_ >=
386
+ 1 + extra_num_subcompaction_threads_reserved_ ||
387
+ *bg_compaction_scheduled_ >=
388
+ 1 + extra_num_subcompaction_threads_reserved_);
389
+ ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
390
+ }
391
+
293
392
  struct RangeWithSize {
294
393
  Range range;
295
394
  uint64_t size;
@@ -299,15 +398,51 @@ struct RangeWithSize {
299
398
  };
300
399
 
301
400
  void CompactionJob::GenSubcompactionBoundaries() {
401
+ // The goal is to find some boundary keys so that we can evenly partition
402
+ // the compaction input data into max_subcompactions ranges.
403
+ // For every input file, we ask TableReader to estimate 128 anchor points
404
+ // that evenly partition the input file into 128 ranges and the range
405
+ // sizes. This can be calculated by scanning index blocks of the file.
406
+ // Once we have the anchor points for all the input files, we merge them
407
+ // together and try to find keys dividing ranges evenly.
408
+ // For example, if we have two input files, and each returns following
409
+ // ranges:
410
+ // File1: (a1, 1000), (b1, 1200), (c1, 1100)
411
+ // File2: (a2, 1100), (b2, 1000), (c2, 1000)
412
+ // We total sort the keys to following:
413
+ // (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
414
+ // We calculate the total size by adding up all ranges' size, which is 6400.
415
+ // If we would like to partition into 2 subcompactions, the target of the
416
+ // range size is 3200. Based on the size, we take "b1" as the partition key
417
+ // since the first three ranges would hit 3200.
418
+ //
419
+ // Note that the ranges are actually overlapping. For example, in the example
420
+ // above, the range ending with "b1" is overlapping with the range ending with
421
+ // "b2". So the size 1000+1100+1200 is an underestimation of data size up to
422
+ // "b1". In extreme cases where we only compact N L0 files, a range can
423
+ // overlap with N-1 other ranges. Since we requested a relatively large number
424
+ // (128) of ranges from each input files, even N range overlapping would
425
+ // cause relatively small inaccuracy.
426
+
302
427
  auto* c = compact_->compaction;
428
+ if (c->max_subcompactions() <= 1 &&
429
+ !(c->immutable_options()->compaction_pri == kRoundRobin &&
430
+ c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
431
+ return;
432
+ }
303
433
  auto* cfd = c->column_family_data();
304
434
  const Comparator* cfd_comparator = cfd->user_comparator();
305
- std::vector<Slice> bounds;
435
+ const InternalKeyComparator& icomp = cfd->internal_comparator();
436
+
437
+ auto* v = compact_->compaction->input_version();
438
+ int base_level = v->storage_info()->base_level();
439
+ InstrumentedMutexUnlock unlock_guard(db_mutex_);
440
+
441
+ uint64_t total_size = 0;
442
+ std::vector<TableReader::Anchor> all_anchors;
306
443
  int start_lvl = c->start_level();
307
444
  int out_lvl = c->output_level();
308
445
 
309
- // Add the starting and/or ending key of certain input files as a potential
310
- // boundary
311
446
  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
312
447
  int lvl = c->level(lvl_idx);
313
448
  if (lvl >= start_lvl && lvl <= out_lvl) {
@@ -318,108 +453,102 @@ void CompactionJob::GenSubcompactionBoundaries() {
318
453
  continue;
319
454
  }
320
455
 
321
- if (lvl == 0) {
322
- // For level 0 add the starting and ending key of each file since the
323
- // files may have greatly differing key ranges (not range-partitioned)
324
- for (size_t i = 0; i < num_files; i++) {
325
- bounds.emplace_back(flevel->files[i].smallest_key);
326
- bounds.emplace_back(flevel->files[i].largest_key);
456
+ for (size_t i = 0; i < num_files; i++) {
457
+ FileMetaData* f = flevel->files[i].file_metadata;
458
+ std::vector<TableReader::Anchor> my_anchors;
459
+ Status s = cfd->table_cache()->ApproximateKeyAnchors(
460
+ ReadOptions(), icomp, f->fd, my_anchors);
461
+ if (!s.ok() || my_anchors.empty()) {
462
+ my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
327
463
  }
328
- } else {
329
- // For all other levels add the smallest/largest key in the level to
330
- // encompass the range covered by that level
331
- bounds.emplace_back(flevel->files[0].smallest_key);
332
- bounds.emplace_back(flevel->files[num_files - 1].largest_key);
333
- if (lvl == out_lvl) {
334
- // For the last level include the starting keys of all files since
335
- // the last level is the largest and probably has the widest key
336
- // range. Since it's range partitioned, the ending key of one file
337
- // and the starting key of the next are very close (or identical).
338
- for (size_t i = 1; i < num_files; i++) {
339
- bounds.emplace_back(flevel->files[i].smallest_key);
340
- }
464
+ for (auto& ac : my_anchors) {
465
+ // Can be optimize to avoid this loop.
466
+ total_size += ac.range_size;
341
467
  }
468
+
469
+ all_anchors.insert(all_anchors.end(), my_anchors.begin(),
470
+ my_anchors.end());
342
471
  }
343
472
  }
344
473
  }
345
-
346
- std::sort(bounds.begin(), bounds.end(),
347
- [cfd_comparator](const Slice& a, const Slice& b) -> bool {
348
- return cfd_comparator->Compare(ExtractUserKey(a),
349
- ExtractUserKey(b)) < 0;
350
- });
351
- // Remove duplicated entries from bounds
352
- bounds.erase(
353
- std::unique(bounds.begin(), bounds.end(),
354
- [cfd_comparator](const Slice& a, const Slice& b) -> bool {
355
- return cfd_comparator->Compare(ExtractUserKey(a),
356
- ExtractUserKey(b)) == 0;
357
- }),
358
- bounds.end());
359
-
360
- // Combine consecutive pairs of boundaries into ranges with an approximate
361
- // size of data covered by keys in that range
362
- uint64_t sum = 0;
363
- std::vector<RangeWithSize> ranges;
364
- // Get input version from CompactionState since it's already referenced
365
- // earlier in SetInputVersioCompaction::SetInputVersion and will not change
366
- // when db_mutex_ is released below
367
- auto* v = compact_->compaction->input_version();
368
- for (auto it = bounds.begin();;) {
369
- const Slice a = *it;
370
- ++it;
371
-
372
- if (it == bounds.end()) {
373
- break;
474
+ // Here we total sort all the anchor points across all files and go through
475
+ // them in the sorted order to find partitioning boundaries.
476
+ // Not the most efficient implementation. A much more efficient algorithm
477
+ // probably exists. But they are more complex. If performance turns out to
478
+ // be a problem, we can optimize.
479
+ std::sort(
480
+ all_anchors.begin(), all_anchors.end(),
481
+ [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
482
+ return cfd_comparator->Compare(a.user_key, b.user_key) < 0;
483
+ });
484
+
485
+ // Get the number of planned subcompactions, may update reserve threads
486
+ // and update extra_num_subcompaction_threads_reserved_ for round-robin
487
+ uint64_t num_planned_subcompactions;
488
+ if (c->immutable_options()->compaction_pri == kRoundRobin &&
489
+ c->immutable_options()->compaction_style == kCompactionStyleLevel) {
490
+ // For round-robin compaction prioity, we need to employ more
491
+ // subcompactions (may exceed the max_subcompaction limit). The extra
492
+ // subcompactions will be executed using reserved threads and taken into
493
+ // account bg_compaction_scheduled or bg_bottom_compaction_scheduled.
494
+
495
+ // Initialized by the number of input files
496
+ num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
497
+ uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
498
+ if (max_subcompactions_limit < num_planned_subcompactions) {
499
+ // Assert two pointers are not empty so that we can use extra
500
+ // subcompactions against db compaction limits
501
+ assert(bg_bottom_compaction_scheduled_ != nullptr);
502
+ assert(bg_compaction_scheduled_ != nullptr);
503
+ // Reserve resources when max_subcompaction is not sufficient
504
+ AcquireSubcompactionResources(
505
+ (int)(num_planned_subcompactions - max_subcompactions_limit));
506
+ // Subcompactions limit changes after acquiring additional resources.
507
+ // Need to call GetSubcompactionsLimit() again to update the number
508
+ // of planned subcompactions
509
+ num_planned_subcompactions =
510
+ std::min(num_planned_subcompactions, GetSubcompactionsLimit());
374
511
  }
375
-
376
- const Slice b = *it;
377
-
378
- // ApproximateSize could potentially create table reader iterator to seek
379
- // to the index block and may incur I/O cost in the process. Unlock db
380
- // mutex to reduce contention
381
- db_mutex_->Unlock();
382
- uint64_t size = versions_->ApproximateSize(SizeApproximationOptions(), v, a,
383
- b, start_lvl, out_lvl + 1,
384
- TableReaderCaller::kCompaction);
385
- db_mutex_->Lock();
386
- ranges.emplace_back(a, b, size);
387
- sum += size;
512
+ } else {
513
+ num_planned_subcompactions = GetSubcompactionsLimit();
388
514
  }
389
515
 
516
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
517
+ &num_planned_subcompactions);
518
+ if (num_planned_subcompactions == 1) return;
519
+
390
520
  // Group the ranges into subcompactions
391
- const double min_file_fill_percent = 4.0 / 5;
392
- int base_level = v->storage_info()->base_level();
393
- uint64_t max_output_files = static_cast<uint64_t>(std::ceil(
394
- sum / min_file_fill_percent /
521
+ uint64_t target_range_size = std::max(
522
+ total_size / num_planned_subcompactions,
395
523
  MaxFileSizeForLevel(
396
524
  *(c->mutable_cf_options()), out_lvl,
397
525
  c->immutable_options()->compaction_style, base_level,
398
- c->immutable_options()->level_compaction_dynamic_level_bytes)));
399
- uint64_t subcompactions =
400
- std::min({static_cast<uint64_t>(ranges.size()),
401
- static_cast<uint64_t>(c->max_subcompactions()),
402
- max_output_files});
403
-
404
- if (subcompactions > 1) {
405
- double mean = sum * 1.0 / subcompactions;
406
- // Greedily add ranges to the subcompaction until the sum of the ranges'
407
- // sizes becomes >= the expected mean size of a subcompaction
408
- sum = 0;
409
- for (size_t i = 0; i + 1 < ranges.size(); i++) {
410
- sum += ranges[i].size;
411
- if (subcompactions == 1) {
412
- // If there's only one left to schedule then it goes to the end so no
413
- // need to put an end boundary
414
- continue;
415
- }
416
- if (sum >= mean) {
417
- boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
418
- subcompactions--;
419
- sum = 0;
420
- }
526
+ c->immutable_options()->level_compaction_dynamic_level_bytes));
527
+
528
+ if (target_range_size >= total_size) {
529
+ return;
530
+ }
531
+
532
+ uint64_t next_threshold = target_range_size;
533
+ uint64_t cumulative_size = 0;
534
+ uint64_t num_actual_subcompactions = 1U;
535
+ for (TableReader::Anchor& anchor : all_anchors) {
536
+ cumulative_size += anchor.range_size;
537
+ if (cumulative_size > next_threshold) {
538
+ next_threshold += target_range_size;
539
+ num_actual_subcompactions++;
540
+ boundaries_.push_back(anchor.user_key);
541
+ }
542
+ if (num_actual_subcompactions == num_planned_subcompactions) {
543
+ break;
421
544
  }
422
545
  }
546
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
547
+ &num_actual_subcompactions);
548
+ // Shrink extra subcompactions resources when extra resrouces are acquired
549
+ ShrinkSubcompactionResources(
550
+ std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
551
+ extra_num_subcompaction_threads_reserved_));
423
552
  }
424
553
 
425
554
  Status CompactionJob::Run() {
@@ -582,6 +711,7 @@ Status CompactionJob::Run() {
582
711
  for (auto& thread : thread_pool) {
583
712
  thread.join();
584
713
  }
714
+
585
715
  for (const auto& state : compact_->sub_compact_states) {
586
716
  if (!state.status.ok()) {
587
717
  status = state.status;
@@ -590,6 +720,10 @@ Status CompactionJob::Run() {
590
720
  }
591
721
  }
592
722
 
723
+ ReleaseSubcompactionResources();
724
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
725
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
726
+
593
727
  TablePropertiesCollection tp;
594
728
  for (const auto& state : compact_->sub_compact_states) {
595
729
  for (const auto& output : state.GetOutputs()) {
@@ -885,8 +1019,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
885
1019
 
886
1020
  // TODO: since we already use C++17, should use
887
1021
  // std::optional<const Slice> instead.
888
- const Slice* const start = sub_compact->start;
889
- const Slice* const end = sub_compact->end;
1022
+ const std::optional<Slice> start = sub_compact->start;
1023
+ const std::optional<Slice> end = sub_compact->end;
890
1024
 
891
1025
  ReadOptions read_options;
892
1026
  read_options.verify_checksums = true;
@@ -900,19 +1034,20 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
900
1034
 
901
1035
  // Note: if we're going to support subcompactions for user-defined timestamps,
902
1036
  // the timestamp part will have to be stripped from the bounds here.
903
- assert((!start && !end) || cfd->user_comparator()->timestamp_size() == 0);
904
- read_options.iterate_lower_bound = start;
905
- read_options.iterate_upper_bound = end;
1037
+ assert((!start.has_value() && !end.has_value()) ||
1038
+ cfd->user_comparator()->timestamp_size() == 0);
1039
+ if (start.has_value()) {
1040
+ read_options.iterate_lower_bound = &start.value();
1041
+ }
1042
+ if (end.has_value()) {
1043
+ read_options.iterate_upper_bound = &end.value();
1044
+ }
906
1045
 
907
1046
  // Although the v2 aggregator is what the level iterator(s) know about,
908
1047
  // the AddTombstones calls will be propagated down to the v1 aggregator.
909
1048
  std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
910
1049
  read_options, sub_compact->compaction, range_del_agg.get(),
911
- file_options_for_read_,
912
- (start == nullptr) ? std::optional<const Slice>{}
913
- : std::optional<const Slice>{*start},
914
- (end == nullptr) ? std::optional<const Slice>{}
915
- : std::optional<const Slice>{*end}));
1050
+ file_options_for_read_, start, end));
916
1051
  InternalIterator* input = raw_input.get();
917
1052
 
918
1053
  IterKey start_ikey;
@@ -920,20 +1055,21 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
920
1055
  Slice start_slice;
921
1056
  Slice end_slice;
922
1057
 
923
- if (start) {
924
- start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
1058
+ if (start.has_value()) {
1059
+ start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
1060
+ kValueTypeForSeek);
925
1061
  start_slice = start_ikey.GetInternalKey();
926
1062
  }
927
- if (end) {
928
- end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
1063
+ if (end.has_value()) {
1064
+ end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
929
1065
  end_slice = end_ikey.GetInternalKey();
930
1066
  }
931
1067
 
932
1068
  std::unique_ptr<InternalIterator> clip;
933
- if (start || end) {
1069
+ if (start.has_value() || end.has_value()) {
934
1070
  clip = std::make_unique<ClippingIterator>(
935
- raw_input.get(), start ? &start_slice : nullptr,
936
- end ? &end_slice : nullptr, &cfd->internal_comparator());
1071
+ raw_input.get(), start.has_value() ? &start_slice : nullptr,
1072
+ end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
937
1073
  input = clip.get();
938
1074
  }
939
1075
 
@@ -1061,8 +1197,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1061
1197
  // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
1062
1198
  // returns true.
1063
1199
 
1064
- assert(!end ||
1065
- cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
1200
+ assert(!end.has_value() || cfd->user_comparator()->Compare(
1201
+ c_iter->user_key(), end.value()) < 0);
1066
1202
 
1067
1203
  if (c_iter_stats.num_input_records % kRecordStatsEvery ==
1068
1204
  kRecordStatsEvery - 1) {
@@ -1280,10 +1416,12 @@ Status CompactionJob::FinishCompactionOutputFile(
1280
1416
  // output_to_penultimate_level compaction here, as it's only used to decide
1281
1417
  // if range dels could be dropped.
1282
1418
  if (outputs.HasRangeDel()) {
1283
- s = outputs.AddRangeDels(sub_compact->start, sub_compact->end,
1284
- range_del_out_stats, bottommost_level_,
1285
- cfd->internal_comparator(), earliest_snapshot,
1286
- next_table_min_key);
1419
+ s = outputs.AddRangeDels(
1420
+ sub_compact->start.has_value() ? &(sub_compact->start.value())
1421
+ : nullptr,
1422
+ sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
1423
+ range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
1424
+ earliest_snapshot, next_table_min_key);
1287
1425
  }
1288
1426
  RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
1289
1427
  TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
@@ -1495,7 +1633,8 @@ Status CompactionJob::InstallCompactionResults(
1495
1633
  if (start_level > 0) {
1496
1634
  auto vstorage = compaction->input_version()->storage_info();
1497
1635
  edit->AddCompactCursor(start_level,
1498
- vstorage->GetNextCompactCursor(start_level));
1636
+ vstorage->GetNextCompactCursor(
1637
+ start_level, compaction->num_input_files(0)));
1499
1638
  }
1500
1639
  }
1501
1640
 
@@ -1595,16 +1734,16 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
1595
1734
  }
1596
1735
  uint64_t current_time = static_cast<uint64_t>(temp_current_time);
1597
1736
  InternalKey tmp_start, tmp_end;
1598
- if (sub_compact->start != nullptr) {
1599
- tmp_start.SetMinPossibleForUserKey(*(sub_compact->start));
1737
+ if (sub_compact->start.has_value()) {
1738
+ tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
1600
1739
  }
1601
- if (sub_compact->end != nullptr) {
1602
- tmp_end.SetMinPossibleForUserKey(*(sub_compact->end));
1740
+ if (sub_compact->end.has_value()) {
1741
+ tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
1603
1742
  }
1604
1743
  uint64_t oldest_ancester_time =
1605
1744
  sub_compact->compaction->MinInputFileOldestAncesterTime(
1606
- (sub_compact->start != nullptr) ? &tmp_start : nullptr,
1607
- (sub_compact->end != nullptr) ? &tmp_end : nullptr);
1745
+ sub_compact->start.has_value() ? &tmp_start : nullptr,
1746
+ sub_compact->end.has_value() ? &tmp_end : nullptr);
1608
1747
  if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
1609
1748
  oldest_ancester_time = current_time;
1610
1749
  }
@@ -164,7 +164,9 @@ class CompactionJob {
164
164
  const std::atomic<bool>& manual_compaction_canceled,
165
165
  const std::string& db_id = "", const std::string& db_session_id = "",
166
166
  std::string full_history_ts_low = "", std::string trim_ts = "",
167
- BlobFileCompletionCallback* blob_callback = nullptr);
167
+ BlobFileCompletionCallback* blob_callback = nullptr,
168
+ int* bg_compaction_scheduled = nullptr,
169
+ int* bg_bottom_compaction_scheduled = nullptr);
168
170
 
169
171
  virtual ~CompactionJob();
170
172
 
@@ -225,6 +227,26 @@ class CompactionJob {
225
227
  // consecutive groups such that each group has a similar size.
226
228
  void GenSubcompactionBoundaries();
227
229
 
230
+ // Get the number of planned subcompactions based on max_subcompactions and
231
+ // extra reserved resources
232
+ uint64_t GetSubcompactionsLimit();
233
+
234
+ // Additional reserved threads are reserved and the number is stored in
235
+ // extra_num_subcompaction_threads_reserved__. For now, this happens only if
236
+ // the compaction priority is round-robin and max_subcompactions is not
237
+ // sufficient (extra resources may be needed)
238
+ void AcquireSubcompactionResources(int num_extra_required_subcompactions);
239
+
240
+ // Additional threads may be reserved during IncreaseSubcompactionResources()
241
+ // if num_actual_subcompactions is less than num_planned_subcompactions.
242
+ // Additional threads will be released and the bg_compaction_scheduled_ or
243
+ // bg_bottom_compaction_scheduled_ will be updated if they are used.
244
+ // DB Mutex lock is required.
245
+ void ShrinkSubcompactionResources(uint64_t num_extra_resources);
246
+
247
+ // Release all reserved threads and update the compaction limits.
248
+ void ReleaseSubcompactionResources();
249
+
228
250
  CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
229
251
  SubcompactionState* sub_compact);
230
252
 
@@ -292,13 +314,22 @@ class CompactionJob {
292
314
  bool paranoid_file_checks_;
293
315
  bool measure_io_stats_;
294
316
  // Stores the Slices that designate the boundaries for each subcompaction
295
- std::vector<Slice> boundaries_;
317
+ std::vector<std::string> boundaries_;
296
318
  Env::Priority thread_pri_;
297
319
  std::string full_history_ts_low_;
298
320
  std::string trim_ts_;
299
321
  BlobFileCompletionCallback* blob_callback_;
300
322
 
301
323
  uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
324
+ // Stores the number of reserved threads in shared env_ for the number of
325
+ // extra subcompaction in kRoundRobin compaction priority
326
+ int extra_num_subcompaction_threads_reserved_;
327
+
328
+ // Stores the pointer to bg_compaction_scheduled_,
329
+ // bg_bottom_compaction_scheduled_ in DBImpl. Mutex is required when accessing
330
+ // or updating it.
331
+ int* bg_compaction_scheduled_;
332
+ int* bg_bottom_compaction_scheduled_;
302
333
 
303
334
  // Stores the sequence number to time mapping gathered from all input files
304
335
  // it also collects the smallest_seqno -> oldest_ancester_time from the SST.
@@ -462,7 +462,7 @@ bool CompactionPicker::SetupOtherInputs(
462
462
  const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
463
463
  VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
464
464
  CompactionInputFiles* output_level_inputs, int* parent_index,
465
- int base_index) {
465
+ int base_index, bool only_expand_towards_right) {
466
466
  assert(!inputs->empty());
467
467
  assert(output_level_inputs->empty());
468
468
  const int input_level = inputs->level;
@@ -515,8 +515,16 @@ bool CompactionPicker::SetupOtherInputs(
515
515
  InternalKey all_start, all_limit;
516
516
  GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
517
517
  bool try_overlapping_inputs = true;
518
- vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
519
- &expanded_inputs.files, base_index, nullptr);
518
+ if (only_expand_towards_right) {
519
+ // Round-robin compaction only allows expansion towards the larger side.
520
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
521
+ &expanded_inputs.files, base_index,
522
+ nullptr);
523
+ } else {
524
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
525
+ &expanded_inputs.files, base_index,
526
+ nullptr);
527
+ }
520
528
  uint64_t expanded_inputs_size =
521
529
  TotalCompensatedFileSize(expanded_inputs.files);
522
530
  if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {