@nxtedition/rocksdb 8.0.1 → 8.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  2. package/deps/rocksdb/rocksdb/Makefile +2 -2
  3. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
  8. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +20 -146
  9. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
  10. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
  11. package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
  12. package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
  33. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
  35. package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
  36. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
  37. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
  38. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
  47. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
  48. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
  49. package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
  50. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
  51. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
  52. package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
  53. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
  54. package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
  56. package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
  57. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  58. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
  59. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
  60. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
  62. package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
  63. package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
  65. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
  68. package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
  69. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
  74. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
  75. package/deps/rocksdb/rocksdb/db/version_set.cc +212 -35
  76. package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
  77. package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
  78. package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
  82. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
  83. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
  84. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
  85. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
  86. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
  87. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
  88. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
  89. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
  90. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
  91. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  94. package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
  95. package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
  96. package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
  97. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  98. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  99. package/deps/rocksdb/rocksdb/src.mk +2 -1
  100. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
  103. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
  105. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
  106. package/deps/rocksdb/rocksdb/table/format.cc +24 -20
  107. package/deps/rocksdb/rocksdb/table/format.h +5 -2
  108. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
  109. package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
  110. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
  111. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  112. package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
  113. package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
  114. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
  115. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
  116. package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
  117. package/deps/rocksdb/rocksdb/util/status.cc +7 -0
  118. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
  119. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
  120. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
  121. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
  122. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
  123. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
  124. package/deps/rocksdb/rocksdb.gyp +2 -1
  125. package/package.json +1 -1
  126. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  127. package/prebuilds/linux-x64/node.napi.node +0 -0
  128. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
  129. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
@@ -565,7 +565,8 @@ ColumnFamilyData::ColumnFamilyData(
565
565
  allow_2pc_(db_options.allow_2pc),
566
566
  last_memtable_id_(0),
567
567
  db_paths_registered_(false),
568
- mempurge_used_(false) {
568
+ mempurge_used_(false),
569
+ next_epoch_number_(1) {
569
570
  if (id_ != kDummyColumnFamilyDataId) {
570
571
  // TODO(cc): RegisterDbPaths can be expensive, considering moving it
571
572
  // outside of this constructor which might be called with db mutex held.
@@ -1128,12 +1129,9 @@ bool ColumnFamilyData::NeedsCompaction() const {
1128
1129
  Compaction* ColumnFamilyData::PickCompaction(
1129
1130
  const MutableCFOptions& mutable_options,
1130
1131
  const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
1131
- SequenceNumber earliest_mem_seqno =
1132
- std::min(mem_->GetEarliestSequenceNumber(),
1133
- imm_.current()->GetEarliestSequenceNumber(false));
1134
1132
  auto* result = compaction_picker_->PickCompaction(
1135
1133
  GetName(), mutable_options, mutable_db_options, current_->storage_info(),
1136
- log_buffer, earliest_mem_seqno);
1134
+ log_buffer);
1137
1135
  if (result != nullptr) {
1138
1136
  result->SetInputVersion(current_);
1139
1137
  }
@@ -1212,14 +1210,11 @@ Compaction* ColumnFamilyData::CompactRange(
1212
1210
  const InternalKey* begin, const InternalKey* end,
1213
1211
  InternalKey** compaction_end, bool* conflict,
1214
1212
  uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
1215
- SequenceNumber earliest_mem_seqno =
1216
- std::min(mem_->GetEarliestSequenceNumber(),
1217
- imm_.current()->GetEarliestSequenceNumber(false));
1218
1213
  auto* result = compaction_picker_->CompactRange(
1219
1214
  GetName(), mutable_cf_options, mutable_db_options,
1220
1215
  current_->storage_info(), input_level, output_level,
1221
1216
  compact_range_options, begin, end, compaction_end, conflict,
1222
- max_file_num_to_ignore, trim_ts, earliest_mem_seqno);
1217
+ max_file_num_to_ignore, trim_ts);
1223
1218
  if (result != nullptr) {
1224
1219
  result->SetInputVersion(current_);
1225
1220
  }
@@ -1523,6 +1518,13 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
1523
1518
  return data_dirs_[path_id].get();
1524
1519
  }
1525
1520
 
1521
+ void ColumnFamilyData::RecoverEpochNumbers() {
1522
+ assert(current_);
1523
+ auto* vstorage = current_->storage_info();
1524
+ assert(vstorage);
1525
+ vstorage->RecoverEpochNumbers(this);
1526
+ }
1527
+
1526
1528
  ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
1527
1529
  const ImmutableDBOptions* db_options,
1528
1530
  const FileOptions& file_options,
@@ -533,6 +533,24 @@ class ColumnFamilyData {
533
533
  void SetMempurgeUsed() { mempurge_used_ = true; }
534
534
  bool GetMempurgeUsed() { return mempurge_used_; }
535
535
 
536
+ // Allocate and return a new epoch number
537
+ uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }
538
+
539
+ // Get the next epoch number to be assigned
540
+ uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); }
541
+
542
+ // Set the next epoch number to be assigned
543
+ void SetNextEpochNumber(uint64_t next_epoch_number) {
544
+ next_epoch_number_.store(next_epoch_number);
545
+ }
546
+
547
+ // Reset the next epoch number to be assigned
548
+ void ResetNextEpochNumber() { next_epoch_number_.store(1); }
549
+
550
+ // Recover the next epoch number of this CF and epoch number
551
+ // of its files (if missing)
552
+ void RecoverEpochNumbers();
553
+
536
554
  private:
537
555
  friend class ColumnFamilySet;
538
556
  ColumnFamilyData(uint32_t id, const std::string& name,
@@ -634,6 +652,8 @@ class ColumnFamilyData {
634
652
  // a Version associated with this CFD
635
653
  std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
636
654
  bool mempurge_used_;
655
+
656
+ std::atomic<uint64_t> next_epoch_number_;
637
657
  };
638
658
 
639
659
  // ColumnFamilySet has interesting thread-safety requirements
@@ -188,6 +188,11 @@ class ClippingIterator : public InternalIterator {
188
188
  return iter_->GetProperty(prop_name, prop);
189
189
  }
190
190
 
191
+ bool IsDeleteRangeSentinelKey() const override {
192
+ assert(valid_);
193
+ return iter_->IsDeleteRangeSentinelKey();
194
+ }
195
+
191
196
  private:
192
197
  void UpdateValid() {
193
198
  assert(!iter_->Valid() || iter_->status().ok());
@@ -20,9 +20,6 @@
20
20
 
21
21
  namespace ROCKSDB_NAMESPACE {
22
22
 
23
- const uint64_t kRangeTombstoneSentinel =
24
- PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
25
-
26
23
  int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
27
24
  const InternalKey& b) {
28
25
  auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
@@ -332,6 +329,7 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
332
329
  // the case that the penultimate level is empty).
333
330
  if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
334
331
  exclude_level = kInvalidLevel;
332
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
335
333
  std::set<uint64_t> penultimate_inputs;
336
334
  for (const auto& input_lvl : inputs_) {
337
335
  if (input_lvl.level == penultimate_level_) {
@@ -345,7 +343,8 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
345
343
  if (penultimate_inputs.find(file->fd.GetNumber()) ==
346
344
  penultimate_inputs.end()) {
347
345
  exclude_level = number_levels_ - 1;
348
- penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
346
+ penultimate_output_range_type_ =
347
+ PenultimateOutputRangeType::kNonLastRange;
349
348
  break;
350
349
  }
351
350
  }
@@ -354,35 +353,6 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
354
353
  GetBoundaryKeys(input_vstorage_, inputs_,
355
354
  &penultimate_level_smallest_user_key_,
356
355
  &penultimate_level_largest_user_key_, exclude_level);
357
-
358
- // If there's a case that the penultimate level output range is overlapping
359
- // with the existing files, disable the penultimate level output by setting
360
- // the range to empty. One example is the range delete could have overlap
361
- // boundary with the next file. (which is actually a false overlap)
362
- // TODO: Exclude such false overlap, so it won't disable the penultimate
363
- // output.
364
- std::set<uint64_t> penultimate_inputs;
365
- for (const auto& input_lvl : inputs_) {
366
- if (input_lvl.level == penultimate_level_) {
367
- for (const auto& file : input_lvl.files) {
368
- penultimate_inputs.emplace(file->fd.GetNumber());
369
- }
370
- }
371
- }
372
-
373
- auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
374
- for (const auto& file : penultimate_files) {
375
- if (penultimate_inputs.find(file->fd.GetNumber()) ==
376
- penultimate_inputs.end() &&
377
- OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
378
- file->largest.user_key())) {
379
- // basically disable the penultimate range output. which should be rare
380
- // or a false overlap caused by range del
381
- penultimate_level_smallest_user_key_ = "";
382
- penultimate_level_largest_user_key_ = "";
383
- penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
384
- }
385
- }
386
356
  }
387
357
 
388
358
  Compaction::~Compaction() {
@@ -807,6 +777,16 @@ uint64_t Compaction::MinInputFileOldestAncesterTime(
807
777
  return min_oldest_ancester_time;
808
778
  }
809
779
 
780
+ uint64_t Compaction::MinInputFileEpochNumber() const {
781
+ uint64_t min_epoch_number = std::numeric_limits<uint64_t>::max();
782
+ for (const auto& inputs_per_level : inputs_) {
783
+ for (const auto& file : inputs_per_level.files) {
784
+ min_epoch_number = std::min(min_epoch_number, file->epoch_number);
785
+ }
786
+ }
787
+ return min_epoch_number;
788
+ }
789
+
810
790
  int Compaction::EvaluatePenultimateLevel(
811
791
  const VersionStorageInfo* vstorage,
812
792
  const ImmutableOptions& immutable_options, const int start_level,
@@ -18,6 +18,8 @@ namespace ROCKSDB_NAMESPACE {
18
18
  // The file contains class Compaction, as well as some helper functions
19
19
  // and data structures used by the class.
20
20
 
21
+ const uint64_t kRangeTombstoneSentinel =
22
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
21
23
  // Utility for comparing sstable boundary keys. Returns -1 if either a or b is
22
24
  // null which provides the property that a==null indicates a key that is less
23
25
  // than any key and b==null indicates a key that is greater than any key. Note
@@ -378,6 +380,9 @@ class Compaction {
378
380
  // This is used to filter out some input files' ancester's time range.
379
381
  uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
380
382
  const InternalKey* end) const;
383
+ // Return the minimum epoch number among
384
+ // input files' associated with this compaction
385
+ uint64_t MinInputFileEpochNumber() const;
381
386
 
382
387
  // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
383
388
  // compaction begin and compaction completion callbacks match.
@@ -377,6 +377,7 @@ void CompactionIterator::NextFromInput() {
377
377
  value_ = input_.value();
378
378
  blob_value_.Reset();
379
379
  iter_stats_.num_input_records++;
380
+ is_range_del_ = input_.IsDeleteRangeSentinelKey();
380
381
 
381
382
  Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
382
383
  if (!pik_status.ok()) {
@@ -396,7 +397,10 @@ void CompactionIterator::NextFromInput() {
396
397
  break;
397
398
  }
398
399
  TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
399
-
400
+ if (is_range_del_) {
401
+ validity_info_.SetValid(kRangeDeletion);
402
+ break;
403
+ }
400
404
  // Update input statistics
401
405
  if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
402
406
  ikey_.type == kTypeDeletionWithTimestamp) {
@@ -618,6 +622,14 @@ void CompactionIterator::NextFromInput() {
618
622
 
619
623
  ParsedInternalKey next_ikey;
620
624
  AdvanceInputIter();
625
+ while (input_.Valid() && input_.IsDeleteRangeSentinelKey() &&
626
+ ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
627
+ .ok() &&
628
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
629
+ // skip range tombstone start keys with the same user key
630
+ // since they are not "real" point keys.
631
+ AdvanceInputIter();
632
+ }
621
633
 
622
634
  // Check whether the next key exists, is not corrupt, and is the same key
623
635
  // as the single delete.
@@ -625,6 +637,7 @@ void CompactionIterator::NextFromInput() {
625
637
  ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
626
638
  .ok() &&
627
639
  cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
640
+ assert(!input_.IsDeleteRangeSentinelKey());
628
641
  #ifndef NDEBUG
629
642
  const Compaction* c =
630
643
  compaction_ ? compaction_->real_compaction() : nullptr;
@@ -849,12 +862,14 @@ void CompactionIterator::NextFromInput() {
849
862
  // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
850
863
  // considered to have a different user key unless the timestamp is older
851
864
  // than *full_history_ts_low_.
865
+ //
866
+ // Range tombstone start keys are skipped as they are not "real" keys.
852
867
  while (!IsPausingManualCompaction() && !IsShuttingDown() &&
853
868
  input_.Valid() &&
854
869
  (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
855
870
  .ok()) &&
856
871
  cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
857
- (prev_snapshot == 0 ||
872
+ (prev_snapshot == 0 || input_.IsDeleteRangeSentinelKey() ||
858
873
  DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
859
874
  AdvanceInputIter();
860
875
  }
@@ -1105,7 +1120,9 @@ void CompactionIterator::DecideOutputLevel() {
1105
1120
  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
1106
1121
  &context);
1107
1122
  output_to_penultimate_level_ = context.output_to_penultimate_level;
1108
- #endif /* !NDEBUG */
1123
+ #else
1124
+ output_to_penultimate_level_ = false;
1125
+ #endif // NDEBUG
1109
1126
 
1110
1127
  // if the key is newer than the cutoff sequence or within the earliest
1111
1128
  // snapshot, it should output to the penultimate level.
@@ -1145,10 +1162,12 @@ void CompactionIterator::DecideOutputLevel() {
1145
1162
 
1146
1163
  void CompactionIterator::PrepareOutput() {
1147
1164
  if (Valid()) {
1148
- if (ikey_.type == kTypeValue) {
1149
- ExtractLargeValueIfNeeded();
1150
- } else if (ikey_.type == kTypeBlobIndex) {
1151
- GarbageCollectBlobIfNeeded();
1165
+ if (LIKELY(!is_range_del_)) {
1166
+ if (ikey_.type == kTypeValue) {
1167
+ ExtractLargeValueIfNeeded();
1168
+ } else if (ikey_.type == kTypeBlobIndex) {
1169
+ GarbageCollectBlobIfNeeded();
1170
+ }
1152
1171
  }
1153
1172
 
1154
1173
  if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
@@ -1171,7 +1190,7 @@ void CompactionIterator::PrepareOutput() {
1171
1190
  DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
1172
1191
  ikey_.type != kTypeMerge && current_key_committed_ &&
1173
1192
  !output_to_penultimate_level_ &&
1174
- ikey_.sequence < preserve_time_min_seqno_) {
1193
+ ikey_.sequence < preserve_time_min_seqno_ && !is_range_del_) {
1175
1194
  if (ikey_.type == kTypeDeletion ||
1176
1195
  (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
1177
1196
  ROCKS_LOG_FATAL(
@@ -63,6 +63,10 @@ class SequenceIterWrapper : public InternalIterator {
63
63
  void SeekToLast() override { assert(false); }
64
64
 
65
65
  uint64_t num_itered() const { return num_itered_; }
66
+ bool IsDeleteRangeSentinelKey() const override {
67
+ assert(Valid());
68
+ return inner_iter_->IsDeleteRangeSentinelKey();
69
+ }
66
70
 
67
71
  private:
68
72
  InternalKeyComparator icmp_;
@@ -242,7 +246,12 @@ class CompactionIterator {
242
246
  const Status& status() const { return status_; }
243
247
  const ParsedInternalKey& ikey() const { return ikey_; }
244
248
  inline bool Valid() const { return validity_info_.IsValid(); }
245
- const Slice& user_key() const { return current_user_key_; }
249
+ const Slice& user_key() const {
250
+ if (UNLIKELY(is_range_del_)) {
251
+ return ikey_.user_key;
252
+ }
253
+ return current_user_key_;
254
+ }
246
255
  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
247
256
  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
248
257
  // If the current key should be placed on penultimate level, only valid if
@@ -252,6 +261,8 @@ class CompactionIterator {
252
261
  }
253
262
  Status InputStatus() const { return input_.status(); }
254
263
 
264
+ bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
265
+
255
266
  private:
256
267
  // Processes the input stream to find the next output
257
268
  void NextFromInput();
@@ -385,6 +396,7 @@ class CompactionIterator {
385
396
  kKeepSD = 8,
386
397
  kKeepDel = 9,
387
398
  kNewUserKey = 10,
399
+ kRangeDeletion = 11,
388
400
  };
389
401
 
390
402
  struct ValidityInfo {
@@ -492,6 +504,10 @@ class CompactionIterator {
492
504
  // This is a best-effort facility, so memory_order_relaxed is sufficient.
493
505
  return manual_compaction_canceled_.load(std::memory_order_relaxed);
494
506
  }
507
+
508
+ // Stores whether the current compaction iterator output
509
+ // is a range tombstone start key.
510
+ bool is_range_del_{false};
495
511
  };
496
512
 
497
513
  inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
@@ -1286,7 +1286,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
1286
1286
  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
1287
1287
  // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
1288
1288
  // returns true.
1289
-
1290
1289
  assert(!end.has_value() || cfd->user_comparator()->Compare(
1291
1290
  c_iter->user_key(), end.value()) < 0);
1292
1291
 
@@ -1834,12 +1833,14 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
1834
1833
  }
1835
1834
 
1836
1835
  // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
1836
+ uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber();
1837
1837
  {
1838
1838
  FileMetaData meta;
1839
1839
  meta.fd = FileDescriptor(file_number,
1840
1840
  sub_compact->compaction->output_path_id(), 0);
1841
1841
  meta.oldest_ancester_time = oldest_ancester_time;
1842
1842
  meta.file_creation_time = current_time;
1843
+ meta.epoch_number = epoch_number;
1843
1844
  meta.temperature = temperature;
1844
1845
  assert(!db_id_.empty());
1845
1846
  assert(!db_session_id_.empty());
@@ -402,6 +402,7 @@ struct CompactionServiceOutputFile {
402
402
  std::string largest_internal_key;
403
403
  uint64_t oldest_ancester_time;
404
404
  uint64_t file_creation_time;
405
+ uint64_t epoch_number;
405
406
  uint64_t paranoid_hash;
406
407
  bool marked_for_compaction;
407
408
  UniqueId64x2 unique_id;
@@ -411,8 +412,8 @@ struct CompactionServiceOutputFile {
411
412
  const std::string& name, SequenceNumber smallest, SequenceNumber largest,
412
413
  std::string _smallest_internal_key, std::string _largest_internal_key,
413
414
  uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
414
- uint64_t _paranoid_hash, bool _marked_for_compaction,
415
- UniqueId64x2 _unique_id)
415
+ uint64_t _epoch_number, uint64_t _paranoid_hash,
416
+ bool _marked_for_compaction, UniqueId64x2 _unique_id)
416
417
  : file_name(name),
417
418
  smallest_seqno(smallest),
418
419
  largest_seqno(largest),
@@ -420,6 +421,7 @@ struct CompactionServiceOutputFile {
420
421
  largest_internal_key(std::move(_largest_internal_key)),
421
422
  oldest_ancester_time(_oldest_ancester_time),
422
423
  file_creation_time(_file_creation_time),
424
+ epoch_number(_epoch_number),
423
425
  paranoid_hash(_paranoid_hash),
424
426
  marked_for_compaction(_marked_for_compaction),
425
427
  unique_id(std::move(_unique_id)) {}
@@ -380,11 +380,13 @@ class CompactionJobTestBase : public testing::Test {
380
380
  }
381
381
 
382
382
  VersionEdit edit;
383
- edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
384
- smallest_seqno, largest_seqno, false, Temperature::kUnknown,
385
- oldest_blob_file_number, kUnknownOldestAncesterTime,
386
- kUnknownFileCreationTime, kUnknownFileChecksum,
387
- kUnknownFileChecksumFuncName, kNullUniqueId64x2);
383
+ edit.AddFile(
384
+ level, file_number, 0, file_size, smallest_key, largest_key,
385
+ smallest_seqno, largest_seqno, false, Temperature::kUnknown,
386
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
387
+ kUnknownFileCreationTime,
388
+ versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
389
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
388
390
 
389
391
  mutex_.Lock();
390
392
  EXPECT_OK(
@@ -1655,7 +1657,7 @@ TEST_F(CompactionJobTest, ResultSerialization) {
1655
1657
  rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
1656
1658
  rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
1657
1659
  rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
1658
- rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
1660
+ rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
1659
1661
  }
1660
1662
  result.output_level = rnd.Uniform(10);
1661
1663
  result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
@@ -333,8 +333,14 @@ Status CompactionOutputs::AddToOutput(
333
333
  const CompactionFileOpenFunc& open_file_func,
334
334
  const CompactionFileCloseFunc& close_file_func) {
335
335
  Status s;
336
+ bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
337
+ if (is_range_del && compaction_->bottommost_level()) {
338
+ // We don't consider range tombstone for bottommost level since:
339
+ // 1. there is no grandparent and hence no overlap to consider
340
+ // 2. range tombstone may be dropped at bottommost level.
341
+ return s;
342
+ }
336
343
  const Slice& key = c_iter.key();
337
-
338
344
  if (ShouldStopBefore(c_iter) && HasBuilder()) {
339
345
  s = close_file_func(*this, c_iter.InputStatus(), key);
340
346
  if (!s.ok()) {
@@ -344,6 +350,13 @@ Status CompactionOutputs::AddToOutput(
344
350
  grandparent_boundary_switched_num_ = 0;
345
351
  grandparent_overlapped_bytes_ =
346
352
  GetCurrentKeyGrandparentOverlappedBytes(key);
353
+ if (UNLIKELY(is_range_del)) {
354
+ // lower bound for this new output file, this is needed as the lower bound
355
+ // does not come from the smallest point key in this case.
356
+ range_tombstone_lower_bound_.DecodeFrom(key);
357
+ } else {
358
+ range_tombstone_lower_bound_.Clear();
359
+ }
347
360
  }
348
361
 
349
362
  // Open output file if necessary
@@ -354,6 +367,17 @@ Status CompactionOutputs::AddToOutput(
354
367
  }
355
368
  }
356
369
 
370
+ // c_iter may emit range deletion keys, so update `last_key_for_partitioner_`
371
+ // here before returning below when `is_range_del` is true
372
+ if (partitioner_) {
373
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
374
+ c_iter.user_key().size_);
375
+ }
376
+
377
+ if (UNLIKELY(is_range_del)) {
378
+ return s;
379
+ }
380
+
357
381
  assert(builder_ != nullptr);
358
382
  const Slice& value = c_iter.value();
359
383
  s = current_output().validator.Add(key, value);
@@ -377,11 +401,6 @@ Status CompactionOutputs::AddToOutput(
377
401
  s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
378
402
  ikey.type);
379
403
 
380
- if (partitioner_) {
381
- last_key_for_partitioner_.assign(c_iter.user_key().data_,
382
- c_iter.user_key().size_);
383
- }
384
-
385
404
  return s;
386
405
  }
387
406
 
@@ -398,13 +417,19 @@ Status CompactionOutputs::AddRangeDels(
398
417
  std::string smallest_user_key;
399
418
  const Slice *lower_bound, *upper_bound;
400
419
  bool lower_bound_from_sub_compact = false;
401
-
420
+ bool lower_bound_from_range_tombstone = false;
402
421
  size_t output_size = outputs_.size();
403
422
  if (output_size == 1) {
404
423
  // For the first output table, include range tombstones before the min
405
424
  // key but after the subcompaction boundary.
406
425
  lower_bound = comp_start_user_key;
407
426
  lower_bound_from_sub_compact = true;
427
+ } else if (range_tombstone_lower_bound_.size() > 0) {
428
+ assert(meta.smallest.size() == 0 ||
429
+ icmp.Compare(range_tombstone_lower_bound_, meta.smallest) <= 0);
430
+ lower_bound_guard = range_tombstone_lower_bound_.user_key();
431
+ lower_bound = &lower_bound_guard;
432
+ lower_bound_from_range_tombstone = true;
408
433
  } else if (meta.smallest.size() > 0) {
409
434
  // For subsequent output tables, only include range tombstones from min
410
435
  // key onwards since the previous file was extended to contain range
@@ -532,6 +557,39 @@ Status CompactionOutputs::AddRangeDels(
532
557
  smallest_candidate =
533
558
  InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
534
559
  }
560
+ } else if (lower_bound_from_range_tombstone) {
561
+ // Range tombstone keys can be truncated at file boundaries of the files
562
+ // that contain them.
563
+ //
564
+ // If this lower bound is from a range tombstone key that is not
565
+ // truncated, i.e., it was not truncated when reading from the input
566
+ // files, then its sequence number and `op_type` will be
567
+ // kMaxSequenceNumber and kTypeRangeDeletion (see
568
+ // TruncatedRangeDelIterator::start_key()). In this case, when this key
569
+ // was used as the upper bound to cut the previous compaction output
570
+ // file, the previous file's largest key could have the same value as
571
+ // this key (see the upperbound logic below). To guarantee
572
+ // non-overlapping ranges between output files, we use the range
573
+ // tombstone's actual sequence number (tombstone.seq_) for the lower
574
+ // bound of this file. If this range tombstone key is truncated, then
575
+ // the previous file's largest key will be smaller than this range
576
+ // tombstone key, so we can use it as the lower bound directly.
577
+ if (ExtractInternalKeyFooter(range_tombstone_lower_bound_.Encode()) ==
578
+ kRangeTombstoneSentinel) {
579
+ if (ts_sz) {
580
+ smallest_candidate =
581
+ InternalKey(range_tombstone_lower_bound_.user_key(),
582
+ tombstone.seq_, kTypeRangeDeletion, tombstone.ts_);
583
+ } else {
584
+ smallest_candidate =
585
+ InternalKey(range_tombstone_lower_bound_.user_key(),
586
+ tombstone.seq_, kTypeRangeDeletion);
587
+ }
588
+ } else {
589
+ assert(GetInternalKeySeqno(range_tombstone_lower_bound_.Encode()) <
590
+ kMaxSequenceNumber);
591
+ smallest_candidate = range_tombstone_lower_bound_;
592
+ }
535
593
  } else {
536
594
  smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
537
595
  }
@@ -307,6 +307,7 @@ class CompactionOutputs {
307
307
  std::unique_ptr<SstPartitioner> partitioner_;
308
308
 
309
309
  // A flag determines if this subcompaction has been split by the cursor
310
+ // for RoundRobin compaction
310
311
  bool is_split_ = false;
311
312
 
312
313
  // We also maintain the output split key for each subcompaction to avoid
@@ -338,6 +339,10 @@ class CompactionOutputs {
338
339
  // for the current output file, how many file boundaries has it crossed,
339
340
  // basically number of files overlapped * 2
340
341
  size_t grandparent_boundary_switched_num_ = 0;
342
+
343
+ // The smallest key of the current output file, this is set when current
344
+ // output file's smallest key is a range tombstone start key.
345
+ InternalKey range_tombstone_lower_bound_;
341
346
  };
342
347
 
343
348
  // helper struct to concatenate the last level and penultimate level outputs
@@ -31,27 +31,15 @@ bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
31
31
  size_t min_files_to_compact,
32
32
  uint64_t max_compact_bytes_per_del_file,
33
33
  uint64_t max_compaction_bytes,
34
- CompactionInputFiles* comp_inputs,
35
- const SequenceNumber earliest_mem_seqno) {
36
- // Do not pick ingested file when there is at least one memtable not flushed
37
- // which of seqno is overlap with the sst.
34
+ CompactionInputFiles* comp_inputs) {
38
35
  TEST_SYNC_POINT("FindIntraL0Compaction");
36
+
39
37
  size_t start = 0;
40
- for (; start < level_files.size(); start++) {
41
- if (level_files[start]->being_compacted) {
42
- return false;
43
- }
44
- // If there is no data in memtable, the earliest sequence number would the
45
- // largest sequence number in last memtable.
46
- // Because all files are sorted in descending order by largest_seqno, so we
47
- // only need to check the first one.
48
- if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
49
- break;
50
- }
51
- }
52
- if (start >= level_files.size()) {
38
+
39
+ if (level_files.size() == 0 || level_files[start]->being_compacted) {
53
40
  return false;
54
41
  }
42
+
55
43
  size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
56
44
  size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
57
45
  // Compaction range will be [start, limit).
@@ -613,8 +601,7 @@ Compaction* CompactionPicker::CompactRange(
613
601
  int input_level, int output_level,
614
602
  const CompactRangeOptions& compact_range_options, const InternalKey* begin,
615
603
  const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
616
- uint64_t max_file_num_to_ignore, const std::string& trim_ts,
617
- const SequenceNumber /*earliest_mem_seqno*/) {
604
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
618
605
  // CompactionPickerFIFO has its own implementation of compact range
619
606
  assert(ioptions_.compaction_style != kCompactionStyleFIFO);
620
607
 
@@ -919,8 +906,7 @@ bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
919
906
 
920
907
  Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
921
908
  std::unordered_set<uint64_t>* input_files,
922
- const ColumnFamilyMetaData& cf_meta, const int output_level,
923
- const SequenceNumber earliest_mem_seqno) const {
909
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
924
910
  auto& levels = cf_meta.levels;
925
911
  auto comparator = icmp_->user_comparator();
926
912
 
@@ -997,12 +983,6 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
997
983
  current_files[f].name +
998
984
  " is currently being compacted.");
999
985
  }
1000
- if (output_level == 0 &&
1001
- current_files[f].largest_seqno > earliest_mem_seqno) {
1002
- return Status::Aborted(
1003
- "Necessary compaction input file " + current_files[f].name +
1004
- " has overlapping seqnos with earliest memtable seqnos.");
1005
- }
1006
986
 
1007
987
  input_files->insert(TableFileNameToNumber(current_files[f].name));
1008
988
  }
@@ -1060,14 +1040,12 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
1060
1040
  "A running compaction is writing to the same output level in an "
1061
1041
  "overlapping key range");
1062
1042
  }
1063
-
1064
1043
  return Status::OK();
1065
1044
  }
1066
1045
 
1067
1046
  Status CompactionPicker::SanitizeCompactionInputFiles(
1068
1047
  std::unordered_set<uint64_t>* input_files,
1069
- const ColumnFamilyMetaData& cf_meta, const int output_level,
1070
- const SequenceNumber earliest_mem_seqno) const {
1048
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
1071
1049
  assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
1072
1050
  cf_meta.levels[cf_meta.levels.size() - 1].level);
1073
1051
  if (output_level >= static_cast<int>(cf_meta.levels.size())) {
@@ -1093,8 +1071,8 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
1093
1071
  "A compaction must contain at least one file.");
1094
1072
  }
1095
1073
 
1096
- Status s = SanitizeCompactionInputFilesForAllLevels(
1097
- input_files, cf_meta, output_level, earliest_mem_seqno);
1074
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
1075
+ output_level);
1098
1076
 
1099
1077
  if (!s.ok()) {
1100
1078
  return s;