@nxtedition/rocksdb 7.0.23 → 7.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/binding.cc +3 -1
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
  3. package/deps/rocksdb/rocksdb/Makefile +6 -2
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
  6. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
  21. package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
  25. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
  26. package/deps/rocksdb/rocksdb/db/c.cc +68 -0
  27. package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
  28. package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
  49. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
  51. package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
  52. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
  55. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
  61. package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
  62. package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
  63. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
  64. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
  65. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  67. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
  68. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
  69. package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
  70. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
  71. package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
  72. package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
  73. package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
  74. package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
  75. package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
  78. package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
  79. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  80. package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
  82. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
  93. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
  94. package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
  95. package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
  96. package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
  97. package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
  98. package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
  99. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
  100. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
  101. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
  102. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
  103. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
  104. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
  107. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
  108. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
  111. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
  113. package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
  115. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
  116. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
  117. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
  118. package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
  119. package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
  120. package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
  121. package/deps/rocksdb/rocksdb/src.mk +5 -0
  122. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
  127. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  131. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
  133. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
  134. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
  135. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  136. package/deps/rocksdb/rocksdb/util/compression.h +2 -0
  137. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
  138. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
  139. package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
  140. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
  143. package/deps/rocksdb/rocksdb.gyp +5 -1
  144. package/package.json +1 -1
  145. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  146. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -81,6 +81,7 @@ class Compaction {
81
81
  std::vector<FileMetaData*> grandparents,
82
82
  bool manual_compaction = false, const std::string& trim_ts = "",
83
83
  double score = -1, bool deletion_compaction = false,
84
+ bool l0_files_might_overlap = true,
84
85
  CompactionReason compaction_reason = CompactionReason::kUnknown,
85
86
  BlobGarbageCollectionPolicy blob_garbage_collection_policy =
86
87
  BlobGarbageCollectionPolicy::kUseDefault,
@@ -181,7 +182,7 @@ class Compaction {
181
182
  // split the output files according to the existing cursor in the output
182
183
  // level under round-robin compaction policy. Empty indicates no required
183
184
  // splitting key
184
- const InternalKey GetOutputSplitKey() const { return output_split_key_; }
185
+ const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
185
186
 
186
187
  // If true, then the compaction can be done by simply deleting input files.
187
188
  bool deletion_compaction() const { return deletion_compaction_; }
@@ -301,7 +302,25 @@ class Compaction {
301
302
 
302
303
  Slice GetLargestUserKey() const { return largest_user_key_; }
303
304
 
304
- int GetInputBaseLevel() const;
305
+ // Return true if the compaction supports per_key_placement
306
+ bool SupportsPerKeyPlacement() const;
307
+
308
+ // Get per_key_placement penultimate output level, which is `last_level - 1`
309
+ // if per_key_placement feature is supported. Otherwise, return -1.
310
+ int GetPenultimateLevel() const;
311
+
312
+ // Return true if the given range overlaps with the penultimate level output
313
+ // range.
314
+ bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
315
+ const Slice& largest_key) const;
316
+
317
+ // Return true if the key is within penultimate level output range for
318
+ // per_key_placement feature, which is safe to place the key to the
319
+ // penultimate level. Different compaction strategies have different rules.
320
+ // If per_key_placement is not supported, always return false.
321
+ // TODO: currently it doesn't support moving data from the last level to the
322
+ // penultimate level
323
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const;
305
324
 
306
325
  CompactionReason compaction_reason() const { return compaction_reason_; }
307
326
 
@@ -338,6 +357,15 @@ class Compaction {
338
357
  return notify_on_compaction_completion_;
339
358
  }
340
359
 
360
+ static constexpr int kInvalidLevel = -1;
361
+ // Evaluate penultimate output level. If the compaction supports
362
+ // per_key_placement feature, it returns the penultimate level number.
363
+ // Otherwise, it's set to kInvalidLevel (-1), which means
364
+ // output_to_penultimate_level is not supported.
365
+ static int EvaluatePenultimateLevel(const ImmutableOptions& immutable_options,
366
+ const int start_level,
367
+ const int output_level);
368
+
341
369
  private:
342
370
  // mark (or clear) all files that are being compacted
343
371
  void MarkFilesBeingCompacted(bool mark_as_compacted);
@@ -345,7 +373,18 @@ class Compaction {
345
373
  // get the smallest and largest key present in files to be compacted
346
374
  static void GetBoundaryKeys(VersionStorageInfo* vstorage,
347
375
  const std::vector<CompactionInputFiles>& inputs,
348
- Slice* smallest_key, Slice* largest_key);
376
+ Slice* smallest_key, Slice* largest_key,
377
+ int exclude_level = -1);
378
+
379
+ // populate penultimate level output range, which will be used to determine if
380
+ // a key is safe to output to the penultimate level (details see
381
+ // `Compaction::WithinPenultimateLevelOutputRange()`).
382
+ // TODO: Currently the penultimate level output range is the min/max keys of
383
+ // non-last-level input files. Which is only good if there's no key moved
384
+ // from the last level to the penultimate level. For a more complicated per
385
+ // key placement which may move data from the last level to the penultimate
386
+ // level, it needs extra check.
387
+ void PopulatePenultimateLevelOutputRange();
349
388
 
350
389
  // Get the atomic file boundaries for all files in the compaction. Necessary
351
390
  // in order to avoid the scenario described in
@@ -386,7 +425,12 @@ class Compaction {
386
425
  // If true, then the compaction can be done by simply deleting input files.
387
426
  const bool deletion_compaction_;
388
427
  // should it split the output file using the compact cursor?
389
- InternalKey output_split_key_;
428
+ const InternalKey* output_split_key_;
429
+
430
+ // L0 files in LSM-tree might be overlapping. But the compaction picking
431
+ // logic might pick a subset of the files that aren't overlapping. If
432
+ // that is the case, set the value to false. Otherwise, set it true.
433
+ bool l0_files_might_overlap_;
390
434
 
391
435
  // Compaction input files organized by level. Constant after construction
392
436
  const std::vector<CompactionInputFiles> inputs_;
@@ -438,7 +482,35 @@ class Compaction {
438
482
 
439
483
  // Blob garbage collection age cutoff.
440
484
  double blob_garbage_collection_age_cutoff_;
485
+
486
+ // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
487
+ // means not supported.
488
+ const int penultimate_level_;
489
+
490
+ // Key range for penultimate level output
491
+ Slice penultimate_level_smallest_user_key_;
492
+ Slice penultimate_level_largest_user_key_;
493
+ };
494
+
495
+ #ifndef NDEBUG
496
+ // Helper struct only for tests, which contains the data to decide if a key
497
+ // should be output to the penultimate level.
498
+ // TODO: remove this when the public feature knob is available
499
+ struct PerKeyPlacementContext {
500
+ const int level;
501
+ const Slice key;
502
+ const Slice value;
503
+ const SequenceNumber seq_num;
504
+
505
+ bool output_to_penultimate_level;
506
+
507
+ PerKeyPlacementContext(int _level, Slice _key, Slice _value,
508
+ SequenceNumber _seq_num)
509
+ : level(_level), key(_key), value(_value), seq_num(_seq_num) {
510
+ output_to_penultimate_level = false;
511
+ }
441
512
  };
513
+ #endif /* !NDEBUG */
442
514
 
443
515
  // Return sum of sizes of all files in `files`.
444
516
  extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
@@ -1075,6 +1075,52 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
1075
1075
  }
1076
1076
  }
1077
1077
 
1078
+ void CompactionIterator::DecideOutputLevel() {
1079
+ #ifndef NDEBUG
1080
+ // TODO: will be set by sequence number or key range, for now, it will only be
1081
+ // set by unittest
1082
+ PerKeyPlacementContext context(level_, ikey_.user_key, value_,
1083
+ ikey_.sequence);
1084
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
1085
+ &context);
1086
+ output_to_penultimate_level_ = context.output_to_penultimate_level;
1087
+ #endif /* !NDEBUG */
1088
+
1089
+ // if the key is within the earliest snapshot, it has to output to the
1090
+ // penultimate level.
1091
+ if (ikey_.sequence > earliest_snapshot_) {
1092
+ output_to_penultimate_level_ = true;
1093
+ }
1094
+
1095
+ if (output_to_penultimate_level_) {
1096
+ // If it's decided to output to the penultimate level, but unsafe to do so,
1097
+ // still output to the last level. For example, moving the data from a lower
1098
+ // level to a higher level outside of the higher-level input key range is
1099
+ // considered unsafe, because the key may conflict with higher-level SSTs
1100
+ // not from this compaction.
1101
+ // TODO: add statistic for declined output_to_penultimate_level
1102
+ bool safe_to_penultimate_level =
1103
+ compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
1104
+ if (!safe_to_penultimate_level) {
1105
+ output_to_penultimate_level_ = false;
1106
+ // It could happen when disable/enable `bottommost_temperature` while
1107
+ // holding a snapshot. When `bottommost_temperature` is not set
1108
+ // (==kUnknown), the data newer than any snapshot is pushed to the last
1109
+ // level, but when the per_key_placement feature is enabled on the fly,
1110
+ // the data later than the snapshot has to be moved to the penultimate
1111
+ // level, which may or may not be safe. So the user needs to make sure all
1112
+ // snapshots are released before enabling the `bottommost_temperature` feature.
1113
+ // We will migrate the feature to `last_level_temperature` and maybe make
1114
+ // it not dynamically changeable.
1115
+ if (ikey_.sequence > earliest_snapshot_) {
1116
+ status_ = Status::Corruption(
1117
+ "Unsafe to store Seq later than snapshot in the last level if "
1118
+ "per_key_placement is enabled");
1119
+ }
1120
+ }
1121
+ }
1122
+ }
1123
+
1078
1124
  void CompactionIterator::PrepareOutput() {
1079
1125
  if (valid_) {
1080
1126
  if (ikey_.type == kTypeValue) {
@@ -1083,6 +1129,10 @@ void CompactionIterator::PrepareOutput() {
1083
1129
  GarbageCollectBlobIfNeeded();
1084
1130
  }
1085
1131
 
1132
+ if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
1133
+ DecideOutputLevel();
1134
+ }
1135
+
1086
1136
  // Zeroing out the sequence number leads to better compression.
1087
1137
  // If this is the bottommost level (no files in lower levels)
1088
1138
  // and the earliest snapshot is larger than this seqno
@@ -1097,7 +1147,8 @@ void CompactionIterator::PrepareOutput() {
1097
1147
  if (valid_ && compaction_ != nullptr &&
1098
1148
  !compaction_->allow_ingest_behind() && bottommost_level_ &&
1099
1149
  DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
1100
- ikey_.type != kTypeMerge && current_key_committed_) {
1150
+ ikey_.type != kTypeMerge && current_key_committed_ &&
1151
+ !output_to_penultimate_level_) {
1101
1152
  if (ikey_.type == kTypeDeletion ||
1102
1153
  (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
1103
1154
  ROCKS_LOG_FATAL(
@@ -32,7 +32,7 @@ class SequenceIterWrapper : public InternalIterator {
32
32
  public:
33
33
  SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
34
34
  bool need_count_entries)
35
- : icmp_(cmp, /*named=*/false),
35
+ : icmp_(cmp),
36
36
  inner_iter_(iter),
37
37
  need_count_entries_(need_count_entries) {}
38
38
  bool Valid() const override { return inner_iter_->Valid(); }
@@ -105,6 +105,10 @@ class CompactionIterator {
105
105
  virtual bool DoesInputReferenceBlobFiles() const = 0;
106
106
 
107
107
  virtual const Compaction* real_compaction() const = 0;
108
+
109
+ virtual bool SupportsPerKeyPlacement() const = 0;
110
+
111
+ virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
108
112
  };
109
113
 
110
114
  class RealCompaction : public CompactionProxy {
@@ -163,6 +167,16 @@ class CompactionIterator {
163
167
 
164
168
  const Compaction* real_compaction() const override { return compaction_; }
165
169
 
170
+ bool SupportsPerKeyPlacement() const override {
171
+ return compaction_->SupportsPerKeyPlacement();
172
+ }
173
+
174
+ // Check if key is within penultimate level output range, to see if it's
175
+ // safe to output to the penultimate level for per_key_placement feature.
176
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
177
+ return compaction_->WithinPenultimateLevelOutputRange(key);
178
+ }
179
+
166
180
  private:
167
181
  const Compaction* compaction_;
168
182
  };
@@ -227,6 +241,12 @@ class CompactionIterator {
227
241
  const Slice& user_key() const { return current_user_key_; }
228
242
  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
229
243
  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
244
+ // If the current key should be placed on penultimate level, only valid if
245
+ // per_key_placement is supported
246
+ bool output_to_penultimate_level() const {
247
+ return output_to_penultimate_level_;
248
+ }
249
+ Status InputStatus() const { return input_.status(); }
230
250
 
231
251
  private:
232
252
  // Processes the input stream to find the next output
@@ -235,6 +255,10 @@ class CompactionIterator {
235
255
  // Do final preparations before presenting the output to the callee.
236
256
  void PrepareOutput();
237
257
 
258
+ // Decide whether the current key should be output to the last level or penultimate
259
+ // level. Only called for compactions that support per-key placement.
260
+ void DecideOutputLevel();
261
+
238
262
  // Passes the output value to the blob file builder (if any), and replaces it
239
263
  // with the corresponding blob reference if it has been actually written to a
240
264
  // blob file (i.e. if it passed the value size check). Returns true if the
@@ -417,6 +441,11 @@ class CompactionIterator {
417
441
  // just been zeroed out during bottommost compaction.
418
442
  bool last_key_seq_zeroed_{false};
419
443
 
444
+ // True if the current key should be output to the penultimate level if
445
+ // possible, compaction logic makes the final decision on which level to
446
+ // output to.
447
+ bool output_to_penultimate_level_{false};
448
+
420
449
  void AdvanceInputIter() { input_.Next(); }
421
450
 
422
451
  void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
@@ -180,11 +180,21 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
180
180
 
181
181
  const Compaction* real_compaction() const override { return nullptr; }
182
182
 
183
+ bool SupportsPerKeyPlacement() const override {
184
+ return supports_per_key_placement;
185
+ }
186
+
187
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
188
+ return (!key.starts_with("unsafe_pb"));
189
+ }
190
+
183
191
  bool key_not_exists_beyond_output_level = false;
184
192
 
185
193
  bool is_bottommost_level = false;
186
194
 
187
195
  bool is_allow_ingest_behind = false;
196
+
197
+ bool supports_per_key_placement = false;
188
198
  };
189
199
 
190
200
  // A simplified snapshot checker which assumes each snapshot has a global
@@ -254,6 +264,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
254
264
  compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
255
265
  compaction_proxy_->key_not_exists_beyond_output_level =
256
266
  key_not_exists_beyond_output_level;
267
+ compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
257
268
  compaction.reset(compaction_proxy_);
258
269
  }
259
270
  bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
@@ -295,6 +306,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
295
306
 
296
307
  virtual bool AllowIngestBehind() const { return false; }
297
308
 
309
+ virtual bool SupportsPerKeyPlacement() const { return false; }
310
+
298
311
  void RunTest(
299
312
  const std::vector<std::string>& input_keys,
300
313
  const std::vector<std::string>& input_values,
@@ -756,6 +769,119 @@ TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
756
769
  INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
757
770
  testing::Values(true, false));
758
771
 
772
+ class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
773
+ public:
774
+ bool SupportsPerKeyPlacement() const override { return true; }
775
+ };
776
+
777
+ TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
778
+ std::atomic_uint64_t latest_cold_seq = 0;
779
+
780
+ SyncPoint::GetInstance()->SetCallBack(
781
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
782
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
783
+ context->output_to_penultimate_level =
784
+ context->seq_num > latest_cold_seq;
785
+ });
786
+ SyncPoint::GetInstance()->EnableProcessing();
787
+
788
+ latest_cold_seq = 5;
789
+
790
+ InitIterators(
791
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
792
+ test::KeyStr("c", 5, kTypeValue)},
793
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
794
+ nullptr, nullptr, true);
795
+ c_iter_->SeekToFirst();
796
+ ASSERT_TRUE(c_iter_->Valid());
797
+
798
+ // the first 2 keys are hot, which should have
799
+ // `output_to_penultimate_level()==true` and seq num not zeroed out
800
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
801
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
802
+ c_iter_->Next();
803
+ ASSERT_TRUE(c_iter_->Valid());
804
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
805
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
806
+ c_iter_->Next();
807
+ ASSERT_TRUE(c_iter_->Valid());
808
+ // `c` is cold data, which should be output to bottommost
809
+ ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
810
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
811
+ c_iter_->Next();
812
+ ASSERT_OK(c_iter_->status());
813
+ ASSERT_FALSE(c_iter_->Valid());
814
+ }
815
+
816
+ TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
817
+ AddSnapshot(5);
818
+
819
+ InitIterators(
820
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
821
+ test::KeyStr("b", 5, kTypeValue)},
822
+ {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
823
+ nullptr, nullptr, true);
824
+ c_iter_->SeekToFirst();
825
+ ASSERT_TRUE(c_iter_->Valid());
826
+
827
+ // The first key and the tombstone are within snapshot, which should output
828
+ // to the penultimate level (and seq num cannot be zeroed out).
829
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
830
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
831
+ c_iter_->Next();
832
+ ASSERT_TRUE(c_iter_->Valid());
833
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
834
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
835
+ c_iter_->Next();
836
+ ASSERT_TRUE(c_iter_->Valid());
837
+ // `b` is not protected by the snapshot; its sequence number is zeroed out and
838
+ // should output bottommost
839
+ ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
840
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
841
+ c_iter_->Next();
842
+ ASSERT_OK(c_iter_->status());
843
+ ASSERT_FALSE(c_iter_->Valid());
844
+ }
845
+
846
+ TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
847
+ std::atomic_uint64_t latest_cold_seq = 0;
848
+
849
+ SyncPoint::GetInstance()->SetCallBack(
850
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
851
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
852
+ context->output_to_penultimate_level =
853
+ context->seq_num > latest_cold_seq;
854
+ });
855
+ SyncPoint::GetInstance()->EnableProcessing();
856
+
857
+ latest_cold_seq = 6;
858
+
859
+ AddSnapshot(5);
860
+
861
+ InitIterators({test::KeyStr("a", 7, kTypeValue),
862
+ test::KeyStr("unsafe_pb", 6, kTypeValue),
863
+ test::KeyStr("c", 5, kTypeValue)},
864
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
865
+ kMaxSequenceNumber, nullptr, nullptr, true);
866
+ c_iter_->SeekToFirst();
867
+ ASSERT_TRUE(c_iter_->Valid());
868
+
869
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
870
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
871
+ // the 2nd key is unsafe to output_to_penultimate_level, but it's within
872
+ // snapshot so for per_key_placement feature it has to be outputted to the
873
+ // penultimate level, which is a corruption. We should never see
874
+ // such case as the data with seq num (within snapshot) should always come
875
+ // from higher compaction input level, which makes it safe to
876
+ // output_to_penultimate_level.
877
+ c_iter_->Next();
878
+ ASSERT_TRUE(c_iter_->status().IsCorruption());
879
+ }
880
+
881
+ INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
882
+ PerKeyPlacementCompIteratorTest,
883
+ testing::Values(true, false));
884
+
759
885
  // Tests how CompactionIterator work together with SnapshotChecker.
760
886
  class CompactionIteratorWithSnapshotCheckerTest
761
887
  : public CompactionIteratorTest {