@nxtedition/rocksdb 7.0.24 → 7.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. package/binding.cc +12 -3
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
  3. package/deps/rocksdb/rocksdb/Makefile +6 -2
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
  6. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
  21. package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
  25. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
  26. package/deps/rocksdb/rocksdb/db/c.cc +68 -0
  27. package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
  28. package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
  49. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
  51. package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
  52. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
  55. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
  61. package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
  62. package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
  63. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
  64. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
  65. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  67. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
  68. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
  69. package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
  70. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
  71. package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
  72. package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
  73. package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
  74. package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
  75. package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
  78. package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
  79. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  80. package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
  82. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
  93. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
  94. package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
  95. package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
  96. package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
  97. package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
  98. package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
  99. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
  100. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
  101. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
  102. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
  103. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
  104. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
  107. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
  108. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
  111. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
  113. package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
  115. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
  116. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
  117. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
  118. package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
  119. package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
  120. package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
  121. package/deps/rocksdb/rocksdb/src.mk +5 -0
  122. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
  127. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  131. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
  133. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
  134. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
  135. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  136. package/deps/rocksdb/rocksdb/util/compression.h +2 -0
  137. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
  138. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
  139. package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
  140. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
  143. package/deps/rocksdb/rocksdb.gyp +5 -1
  144. package/package.json +1 -1
  145. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  146. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -20,6 +20,7 @@
20
20
  #include "db/blob/blob_file_completion_callback.h"
21
21
  #include "db/column_family.h"
22
22
  #include "db/compaction/compaction_iterator.h"
23
+ #include "db/compaction/compaction_outputs.h"
23
24
  #include "db/flush_scheduler.h"
24
25
  #include "db/internal_stats.h"
25
26
  #include "db/job_context.h"
@@ -47,6 +48,7 @@
47
48
  namespace ROCKSDB_NAMESPACE {
48
49
 
49
50
  class Arena;
51
+ class CompactionState;
50
52
  class ErrorHandler;
51
53
  class MemTable;
52
54
  class SnapshotChecker;
@@ -56,11 +58,91 @@ class Version;
56
58
  class VersionEdit;
57
59
  class VersionSet;
58
60
 
61
+ class SubcompactionState;
62
+
59
63
  // CompactionJob is responsible for executing the compaction. Each (manual or
60
64
  // automated) compaction corresponds to a CompactionJob object, and usually
61
65
  // goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
62
66
  // will divide the compaction into subcompactions and execute them in parallel
63
67
  // if needed.
68
+ //
69
+ // CompactionJob has 2 main stats:
70
+ // 1. CompactionJobStats compaction_job_stats_
71
+ // CompactionJobStats is a public data structure which is part of Compaction
72
+ // event listener, through which rocksdb shares the job stats with the user.
73
+ // Internally it's an aggregation of all the compaction_job_stats from each
74
+ // `SubcompactionState`:
75
+ // +------------------------+
76
+ // | SubcompactionState |
77
+ // | |
78
+ // +--------->| compaction_job_stats |
79
+ // | | |
80
+ // | +------------------------+
81
+ // +------------------------+ |
82
+ // | CompactionJob | | +------------------------+
83
+ // | | | | SubcompactionState |
84
+ // | compaction_job_stats +-----+ | |
85
+ // | | +--------->| compaction_job_stats |
86
+ // | | | | |
87
+ // +------------------------+ | +------------------------+
88
+ // |
89
+ // | +------------------------+
90
+ // | | SubcompactionState |
91
+ // | | |
92
+ // +--------->+ compaction_job_stats |
93
+ // | | |
94
+ // | +------------------------+
95
+ // |
96
+ // | +------------------------+
97
+ // | | ... |
98
+ // +--------->+ |
99
+ // +------------------------+
100
+ //
101
+ // 2. CompactionStatsFull compaction_stats_
102
+ // `CompactionStatsFull` is an internal stats structure for the compaction, which
103
+ // is eventually sent to `ColumnFamilyData::internal_stats_` and used for
104
+ // logging and public metrics.
105
+ // Internally, it's an aggregation of stats_ from each `SubcompactionState`.
106
+ // It has 2 parts, normal stats about the main compaction information and
107
+ // the penultimate level output stats.
108
+ // `SubcompactionState` maintains the CompactionOutputs for normal output and
109
+ // the penultimate level output if it exists; the per_level stats are
110
+ // stored with the outputs.
111
+ // +---------------------------+
112
+ // | SubcompactionState |
113
+ // | |
114
+ // | +----------------------+ |
115
+ // | | CompactionOutputs | |
116
+ // | | (normal output) | |
117
+ // +---->| stats_ | |
118
+ // | | +----------------------+ |
119
+ // | | |
120
+ // | | +----------------------+ |
121
+ // +--------------------------------+ | | | CompactionOutputs | |
122
+ // | CompactionJob | | | | (penultimate_level) | |
123
+ // | | +--------->| stats_ | |
124
+ // | compaction_stats_ | | | | +----------------------+ |
125
+ // | +-------------------------+ | | | | |
126
+ // | |stats (normal) |------|----+ +---------------------------+
127
+ // | +-------------------------+ | | |
128
+ // | | | |
129
+ // | +-------------------------+ | | | +---------------------------+
130
+ // | |penultimate_level_stats +------+ | | SubcompactionState |
131
+ // | +-------------------------+ | | | | |
132
+ // | | | | | +----------------------+ |
133
+ // | | | | | | CompactionOutputs | |
134
+ // +--------------------------------+ | | | | (normal output) | |
135
+ // | +---->| stats_ | |
136
+ // | | +----------------------+ |
137
+ // | | |
138
+ // | | +----------------------+ |
139
+ // | | | CompactionOutputs | |
140
+ // | | | (penultimate_level) | |
141
+ // +--------->| stats_ | |
142
+ // | +----------------------+ |
143
+ // | |
144
+ // +---------------------------+
145
+
64
146
  class CompactionJob {
65
147
  public:
66
148
  CompactionJob(
@@ -107,11 +189,6 @@ class CompactionJob {
107
189
  IOStatus io_status() const { return io_status_; }
108
190
 
109
191
  protected:
110
- struct SubcompactionState;
111
- // CompactionJob state
112
- struct CompactionState;
113
-
114
- void AggregateStatistics();
115
192
  void UpdateCompactionStats();
116
193
  void LogCompaction();
117
194
  virtual void RecordCompactionIOStats();
@@ -122,7 +199,7 @@ class CompactionJob {
122
199
  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
123
200
 
124
201
  CompactionState* compact_;
125
- InternalStats::CompactionStats compaction_stats_;
202
+ InternalStats::CompactionStatsFull compaction_stats_;
126
203
  const ImmutableDBOptions& db_options_;
127
204
  const MutableDBOptions mutable_db_options_copy_;
128
205
  LogBuffer* log_buffer_;
@@ -135,6 +212,8 @@ class CompactionJob {
135
212
 
136
213
  IOStatus io_status_;
137
214
 
215
+ CompactionJobStats* compaction_job_stats_;
216
+
138
217
  private:
139
218
  friend class CompactionJobTestBase;
140
219
 
@@ -150,15 +229,14 @@ class CompactionJob {
150
229
 
151
230
  // update the thread status for starting a compaction.
152
231
  void ReportStartedCompaction(Compaction* compaction);
153
- void AllocateCompactionOutputFileNumbers();
154
232
 
155
- Status FinishCompactionOutputFile(
156
- const Status& input_status, SubcompactionState* sub_compact,
157
- CompactionRangeDelAggregator* range_del_agg,
158
- CompactionIterationStats* range_del_out_stats,
159
- const Slice* next_table_min_key = nullptr);
233
+ Status FinishCompactionOutputFile(const Status& input_status,
234
+ SubcompactionState* sub_compact,
235
+ CompactionOutputs& outputs,
236
+ const Slice& next_table_min_key);
160
237
  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
161
- Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
238
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
239
+ CompactionOutputs& outputs);
162
240
  void UpdateCompactionJobStats(
163
241
  const InternalStats::CompactionStats& stats) const;
164
242
  void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
@@ -167,20 +245,12 @@ class CompactionJob {
167
245
  void UpdateCompactionInputStatsHelper(
168
246
  int* num_files, uint64_t* bytes_read, int input_level);
169
247
 
170
- #ifndef ROCKSDB_LITE
171
- void BuildSubcompactionJobInfo(
172
- SubcompactionState* sub_compact,
173
- SubcompactionJobInfo* subcompaction_job_info) const;
174
- #endif // ROCKSDB_LITE
175
-
176
248
  void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
177
249
 
178
250
  void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
179
251
 
180
252
  uint32_t job_id_;
181
253
 
182
- CompactionJobStats* compaction_job_stats_;
183
-
184
254
  // DBImpl state
185
255
  const std::string& dbname_;
186
256
  const std::string db_id_;
@@ -222,14 +292,12 @@ class CompactionJob {
222
292
  bool measure_io_stats_;
223
293
  // Stores the Slices that designate the boundaries for each subcompaction
224
294
  std::vector<Slice> boundaries_;
225
- // Stores the approx size of keys covered in the range of each subcompaction
226
- std::vector<uint64_t> sizes_;
227
295
  Env::Priority thread_pri_;
228
296
  std::string full_history_ts_low_;
229
297
  std::string trim_ts_;
230
298
  BlobFileCompletionCallback* blob_callback_;
231
299
 
232
- uint64_t GetCompactionId(SubcompactionState* sub_compact);
300
+ uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
233
301
 
234
302
  // Get the name of the table file being written to, which should also be in
235
303
  // `output_directory_`.
@@ -265,7 +333,6 @@ struct CompactionServiceInput {
265
333
  std::string begin;
266
334
  bool has_end = false;
267
335
  std::string end;
268
- uint64_t approx_size = 0;
269
336
 
270
337
  // serialization interface to read and write the object
271
338
  static Status Read(const std::string& data_str, CompactionServiceInput* obj);
@@ -357,7 +424,7 @@ class CompactionServiceCompactionJob : private CompactionJob {
357
424
  const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
358
425
  const std::atomic<bool>& manual_compaction_canceled,
359
426
  const std::string& db_id, const std::string& db_session_id,
360
- const std::string& output_path,
427
+ std::string output_path,
361
428
  const CompactionServiceInput& compaction_service_input,
362
429
  CompactionServiceResult* compaction_service_result);
363
430
 
@@ -482,6 +482,17 @@ class CompactionJobTestBase : public testing::Test {
482
482
  cfd_ = versions_->GetColumnFamilySet()->GetDefault();
483
483
  }
484
484
 
485
// Test helper: runs a compaction of `input_files` whose output level is the
// last level (`cf_options_.num_levels - 1`) and registers `verify_func` as
// the per-key-placement verification hook.
//
// - input_files: files grouped per input level, forwarded to RunCompaction.
// - verify_func: moved into `verify_per_key_placement_`.
//   NOTE(review): presumably invoked by RunCompaction once the job finishes
//   (a hunk elsewhere in this class calls `verify_per_key_placement_` when it
//   is set) — confirm against the full RunCompaction body.
// - snapshots: forwarded unchanged; defaults to no snapshots.
+ void RunLastLevelCompaction(
486
+ const std::vector<std::vector<FileMetaData*>>& input_files,
487
+ std::function<void(Compaction& comp)>&& verify_func,
488
+ const std::vector<SequenceNumber>& snapshots = {}) {
489
+ const int kLastLevel = cf_options_.num_levels - 1;
// Install the hook before running; it stays set on the fixture afterwards.
490
+ verify_per_key_placement_ = std::move(verify_func);
// No expected key/value results are supplied: an empty map is passed.
491
+ mock::KVVector empty_map;
// NOTE(review): the meaning of the `kMaxSequenceNumber` and trailing `false`
// arguments depends on RunCompaction's full signature, which is not visible
// in this hunk — confirm.
492
+ RunCompaction(input_files, empty_map, snapshots, kMaxSequenceNumber,
493
+ kLastLevel, false);
494
+ }
495
+
485
496
  void RunCompaction(
486
497
  const std::vector<std::vector<FileMetaData*>>& input_files,
487
498
  const mock::KVVector& expected_results,
@@ -571,6 +582,12 @@ class CompactionJobTestBase : public testing::Test {
571
582
  if (check_get_priority) {
572
583
  CheckGetRateLimiterPriority(compaction_job);
573
584
  }
585
+
586
+ if (verify_per_key_placement_) {
587
+ // Verify per_key_placement compaction
588
+ assert(compaction.SupportsPerKeyPlacement());
589
+ verify_per_key_placement_(compaction);
590
+ }
574
591
  }
575
592
 
576
593
  void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
@@ -620,6 +637,7 @@ class CompactionJobTestBase : public testing::Test {
620
637
  std::string full_history_ts_low_;
621
638
  const std::function<std::string(uint64_t)> encode_u64_ts_;
622
639
  bool test_io_priority_;
640
+ std::function<void(Compaction& comp)> verify_per_key_placement_;
623
641
  };
624
642
 
625
643
  // TODO(icanadi) Make it simpler once we mock out VersionSet
@@ -1311,6 +1329,75 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) {
1311
1329
  /* expected_oldest_blob_file_number */ 19);
1312
1330
  }
1313
1331
 
1332
// Checks `Compaction::WithinPenultimateLevelOutputRange()` for a last-level
// compaction over files on levels 0-3: the single-letter key "a" must be
// outside the range, while every key "b".."z" must be inside it.
// (The smallest user key added below is "b" and the largest is "z".)
//
// NOTE(review): the two SyncPoint callbacks capture locals by reference
// (`latest_cold_seq`) and this body never calls DisableProcessing() /
// ClearAllCallBacks(); if the fixture teardown does not do so either, the
// callbacks could outlive this test — confirm.
+ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
1333
+ cf_options_.bottommost_temperature = Temperature::kCold;
// Force the value passed to this sync point to `true`, i.e. report that
// per-key placement is supported.
1334
+ SyncPoint::GetInstance()->SetCallBack(
1335
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
1336
+ auto supports_per_key_placement = static_cast<bool*>(arg);
1337
+ *supports_per_key_placement = true;
1338
+ });
1339
+
// Stays 0 for the whole test; it is only read by the callback below.
1340
+ std::atomic_uint64_t latest_cold_seq = 0;
1341
+
// Mark a key for penultimate-level output when its sequence number is
// greater than `latest_cold_seq` (i.e. greater than 0 here).
1342
+ SyncPoint::GetInstance()->SetCallBack(
1343
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
1344
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
1345
+ context->output_to_penultimate_level =
1346
+ context->seq_num > latest_cold_seq;
1347
+ });
1348
+ SyncPoint::GetInstance()->EnableProcessing();
1349
+
1350
+ NewDB();
1351
+
1352
+ // Add files on different levels that may overlap
// NOTE(review): AddMockFile is called without a level here — presumably it
// defaults to level 0; confirm against the helper's declaration.
1353
+ auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
1354
+ AddMockFile(file0_1);
1355
+
1356
+ auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
1357
+ {KeyStr("f", 11U, kTypeValue), "val"}});
1358
+ AddMockFile(file1_1, 1);
1359
+ auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
1360
+ {KeyStr("k", 13U, kTypeValue), "val"}});
1361
+ AddMockFile(file1_2, 1);
1362
+ auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
1363
+ {KeyStr("u", 15U, kTypeValue), "val"}});
1364
+ AddMockFile(file1_3, 1);
1365
+
1366
+ auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
1367
+ {KeyStr("h", 9U, kTypeValue), "val"}});
1368
+ AddMockFile(file2_1, 2);
1369
+ auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
1370
+ {KeyStr("p", 7U, kTypeValue), "val"}});
1371
+ AddMockFile(file2_2, 2);
1372
+
1373
+ auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
1374
+ {KeyStr("k", 3U, kTypeValue), "val"}});
1375
+ AddMockFile(file3_1, 3);
1376
+ auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
1377
+ {KeyStr("x", 5U, kTypeValue), "val"}});
1378
+ AddMockFile(file3_2, 3);
1379
+
// Collect the files of levels 0..3 from the default column family's current
// version to use as compaction inputs.
1380
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
1381
+ auto files0 = cfd->current()->storage_info()->LevelFiles(0);
1382
+ auto files1 = cfd->current()->storage_info()->LevelFiles(1);
1383
+ auto files2 = cfd->current()->storage_info()->LevelFiles(2);
1384
+ auto files3 = cfd->current()->storage_info()->LevelFiles(3);
1385
+
// The lambda probes every single-letter key 'a'..'z'. ASSERT_* inside the
// lambda returns from the lambda only, not from the test body.
1386
+ RunLastLevelCompaction(
1387
+ {files0, files1, files2, files3}, /*verify_func=*/[&](Compaction& comp) {
1388
+ for (char c = 'a'; c <= 'z'; c++) {
1389
+ std::string c_str;
1390
+ c_str = c;
1391
+ const Slice key(c_str);
1392
+ if (c == 'a') {
1393
+ ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
1394
+ } else {
1395
+ ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
1396
+ }
1397
+ }
1398
+ });
1399
+ }
1400
+
1314
1401
  TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
1315
1402
  db_options_.enforce_single_del_contracts = false;
1316
1403
  NewDB();
@@ -1360,7 +1447,6 @@ TEST_F(CompactionJobTest, InputSerialization) {
1360
1447
  if (input.has_end) {
1361
1448
  input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
1362
1449
  }
1363
- input.approx_size = rnd64.Uniform(UINT64_MAX);
1364
1450
 
1365
1451
  std::string output;
1366
1452
  ASSERT_OK(input.Write(&output));
@@ -0,0 +1,314 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ //
3
+ // This source code is licensed under both the GPLv2 (found in the
4
+ // COPYING file in the root directory) and Apache 2.0 License
5
+ // (found in the LICENSE.Apache file in the root directory).
6
+ //
7
+ // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
8
+ // Use of this source code is governed by a BSD-style license that can be
9
+ // found in the LICENSE file. See the AUTHORS file for names of contributors.
10
+
11
+ #include "db/compaction/compaction_outputs.h"
12
+
13
+ #include "db/builder.h"
14
+
15
+ namespace ROCKSDB_NAMESPACE {
16
+
17
// Replaces `builder_` with the table builder that NewTableBuilder() returns
// for `tboptions`, writing through the current `file_writer_`.
+ void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
18
+ builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
19
+ }
20
+
21
// Finalizes the table builder of the current output file.
//
// - input_status: status of the work that produced this output. When it is
//   OK the builder is finished; otherwise the builder is abandoned.
//   (Parameter renamed from the misspelled `intput_status`; the name now
//   matches `WriterSyncClose`. The rename is local to this definition.)
//
// Returns the first non-OK status among `input_status`, `builder_->Finish()`
// and the builder's IO status, or OK.
//
// Regardless of the outcome, the current output is marked finished and
// `stats_` is updated with the builder's file size and the number of outputs.
+ Status CompactionOutputs::Finish(const Status& input_status) {
22
+ FileMetaData* meta = GetMetaData();
23
+ assert(meta != nullptr);
24
+ Status s = input_status;
25
+ if (s.ok()) {
26
+ s = builder_->Finish();
27
+ } else {
28
+ builder_->Abandon();
29
+ }
// Surface the builder's IO status only if nothing failed earlier; otherwise
// explicitly mark it as intentionally unchecked.
30
+ Status io_s = builder_->io_status();
31
+ if (s.ok()) {
32
+ s = io_s;
33
+ } else {
34
+ io_s.PermitUncheckedError();
35
+ }
36
+ const uint64_t current_bytes = builder_->FileSize();
// File metadata is only updated on success.
37
+ if (s.ok()) {
38
+ meta->fd.file_size = current_bytes;
39
+ meta->marked_for_compaction = builder_->NeedCompact();
40
+ }
// Bookkeeping below happens on both the success and the failure path.
41
+ current_output().finished = true;
42
+ stats_.bytes_written += current_bytes;
43
+ stats_.num_output_files = outputs_.size();
44
+
45
+ return s;
46
+ }
47
+
48
// Syncs and closes the current output file writer, then releases it.
//
// - input_status: when not OK, neither Sync() nor Close() is attempted.
// - clock, statistics: passed to the StopWatch that times the Sync() call
//   under COMPACTION_OUTFILE_SYNC_MICROS.
// - use_fsync: forwarded to `file_writer_->Sync()`.
//
// Returns the IOStatus of Sync()/Close(); if `input_status` is not OK the
// default-constructed IOStatus is returned (presumably OK — confirm).
// `file_writer_` is reset on every path.
+ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
49
+ SystemClock* clock,
50
+ Statistics* statistics,
51
+ bool use_fsync) {
52
+ IOStatus io_s;
53
+ if (input_status.ok()) {
// `sw` lives only for this scope, so it covers just the Sync() call.
54
+ StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
55
+ io_s = file_writer_->Sync(use_fsync);
56
+ }
57
+ if (input_status.ok() && io_s.ok()) {
58
+ io_s = file_writer_->Close();
59
+ }
60
+
// Record the checksum in the file metadata only after a successful
// sync + close.
61
+ if (input_status.ok() && io_s.ok()) {
62
+ FileMetaData* meta = GetMetaData();
63
+ meta->file_checksum = file_writer_->GetFileChecksum();
64
+ meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
65
+ }
66
+
67
+ file_writer_.reset();
68
+
69
+ return io_s;
70
+ }
71
+
72
// Appends the entry `c_iter` currently points at to this output, closing the
// previous output file and/or opening a new one first when needed.
//
// - c_iter: compaction iterator; its key(), value(), ikey(), user_key() and
//   InputStatus() are read.
// - open_file_func: called with `*this` when no builder exists yet.
// - close_file_func: called with `*this`, the iterator's input status and the
//   current key when a close is pending.
//
// Returns the first non-OK status from the close/open callbacks, the output
// validator, the blob garbage meter, or the boundary update; OK otherwise.
//
// `pending_close_` carries a "close before the next key" decision from one
// call to the next: it is set at the end of a call when the file has reached
// the size limit, and consumed at the start of the following call.
+ Status CompactionOutputs::AddToOutput(
73
+ const CompactionIterator& c_iter,
74
+ const CompactionFileOpenFunc& open_file_func,
75
+ const CompactionFileCloseFunc& close_file_func) {
76
+ Status s;
77
+ const Slice& key = c_iter.key();
78
+
// Ask the partitioner (if any) whether a file boundary is required between
// the previously added user key and the current one.
79
+ if (!pending_close_ && c_iter.Valid() && partitioner_ && HasBuilder() &&
80
+ partitioner_->ShouldPartition(
81
+ PartitionerRequest(last_key_for_partitioner_, c_iter.user_key(),
82
+ current_output_file_size_)) == kRequired) {
83
+ pending_close_ = true;
84
+ }
85
+
// The current key is handed to the close callback as the key that follows
// the file being closed.
86
+ if (pending_close_) {
87
+ s = close_file_func(*this, c_iter.InputStatus(), key);
88
+ pending_close_ = false;
89
+ }
90
+ if (!s.ok()) {
91
+ return s;
92
+ }
93
+
94
+ // Open output file if necessary
95
+ if (!HasBuilder()) {
96
+ s = open_file_func(*this);
97
+ }
98
+ if (!s.ok()) {
99
+ return s;
100
+ }
101
+
102
+ Output& curr = current_output();
103
+ assert(builder_ != nullptr);
104
+ const Slice& value = c_iter.value();
// Feed the validator first; the entry is only added to the builder if the
// validator accepts it.
105
+ s = curr.validator.Add(key, value);
106
+ if (!s.ok()) {
107
+ return s;
108
+ }
109
+ builder_->Add(key, value);
110
+
111
+ stats_.num_output_records++;
112
+ current_output_file_size_ = builder_->EstimatedFileSize();
113
+
114
+ if (blob_garbage_meter_) {
115
+ s = blob_garbage_meter_->ProcessOutFlow(key, value);
116
+ }
117
+
118
+ if (!s.ok()) {
119
+ return s;
120
+ }
121
+
122
+ const ParsedInternalKey& ikey = c_iter.ikey();
123
+ s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
124
+ ikey.type);
125
+
126
+ // Close output file if it is big enough. Two possibilities determine it's
127
+ // time to close it: (1) the current key should be this file's last key, (2)
128
+ // the next key should not be in this file.
129
+ //
130
+ // TODO(aekmekji): determine if file should be closed earlier than this
131
+ // during subcompactions (i.e. if output size, estimated by input size, is
132
+ // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
133
+ // and 0.6MB instead of 1MB and 0.2MB)
// The size limit is not applied when the output level is 0.
134
+ if (compaction_->output_level() != 0 &&
135
+ current_output_file_size_ >= compaction_->max_output_file_size()) {
136
+ pending_close_ = true;
137
+ }
138
+
// Remember this user key so the next call can pass it to the partitioner as
// the previous key.
139
+ if (partitioner_) {
140
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
141
+ c_iter.user_key().size_);
142
+ }
143
+
144
+ return s;
145
+ }
146
+
147
+ Status CompactionOutputs::AddRangeDels(
148
+ const Slice* comp_start, const Slice* comp_end,
149
+ CompactionIterationStats& range_del_out_stats, bool bottommost_level,
150
+ const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
151
+ const Slice& next_table_min_key) {
152
+ assert(HasRangeDel());
153
+ FileMetaData& meta = current_output().meta;
154
+ const Comparator* ucmp = icmp.user_comparator();
155
+
156
+ Slice lower_bound_guard, upper_bound_guard;
157
+ std::string smallest_user_key;
158
+ const Slice *lower_bound, *upper_bound;
159
+ bool lower_bound_from_sub_compact = false;
160
+
161
+ size_t output_size = outputs_.size();
162
+ if (output_size == 1) {
163
+ // For the first output table, include range tombstones before the min
164
+ // key but after the subcompaction boundary.
165
+ lower_bound = comp_start;
166
+ lower_bound_from_sub_compact = true;
167
+ } else if (meta.smallest.size() > 0) {
168
+ // For subsequent output tables, only include range tombstones from min
169
+ // key onwards since the previous file was extended to contain range
170
+ // tombstones falling before min key.
171
+ smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
172
+ lower_bound_guard = Slice(smallest_user_key);
173
+ lower_bound = &lower_bound_guard;
174
+ } else {
175
+ lower_bound = nullptr;
176
+ }
177
+ if (!next_table_min_key.empty()) {
178
+ // This may be the last file in the subcompaction in some cases, so we
179
+ // need to compare the end key of subcompaction with the next file start
180
+ // key. When the end key is chosen by the subcompaction, we know that
181
+ // it must be the biggest key in output file. Therefore, it is safe to
182
+ // use the smaller key as the upper bound of the output file, to ensure
183
+ // that there is no overlapping between different output files.
184
+ upper_bound_guard = ExtractUserKey(next_table_min_key);
185
+ if (comp_end != nullptr &&
186
+ ucmp->Compare(upper_bound_guard, *comp_end) >= 0) {
187
+ upper_bound = comp_end;
188
+ } else {
189
+ upper_bound = &upper_bound_guard;
190
+ }
191
+ } else {
192
+ // This is the last file in the subcompaction, so extend until the
193
+ // subcompaction ends.
194
+ upper_bound = comp_end;
195
+ }
196
+ bool has_overlapping_endpoints;
197
+ if (upper_bound != nullptr && meta.largest.size() > 0) {
198
+ has_overlapping_endpoints =
199
+ ucmp->Compare(meta.largest.user_key(), *upper_bound) == 0;
200
+ } else {
201
+ has_overlapping_endpoints = false;
202
+ }
203
+
204
+ // The end key of the subcompaction must be bigger or equal to the upper
205
+ // bound. If the end of subcompaction is null or the upper bound is null,
206
+ // it means that this file is the last file in the compaction. So there
207
+ // will be no overlapping between this file and others.
208
+ assert(comp_end == nullptr || upper_bound == nullptr ||
209
+ ucmp->Compare(*upper_bound, *comp_end) <= 0);
210
+ auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
211
+ has_overlapping_endpoints);
212
+ // Position the range tombstone output iterator. There may be tombstone
213
+ // fragments that are entirely out of range, so make sure that we do not
214
+ // include those.
215
+ if (lower_bound != nullptr) {
216
+ it->Seek(*lower_bound);
217
+ } else {
218
+ it->SeekToFirst();
219
+ }
220
+ for (; it->Valid(); it->Next()) {
221
+ auto tombstone = it->Tombstone();
222
+ if (upper_bound != nullptr) {
223
+ int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_);
224
+ if ((has_overlapping_endpoints && cmp < 0) ||
225
+ (!has_overlapping_endpoints && cmp <= 0)) {
226
+ // Tombstones starting after upper_bound only need to be included in
227
+ // the next table. If the current SST ends before upper_bound, i.e.,
228
+ // `has_overlapping_endpoints == false`, we can also skip over range
229
+ // tombstones that start exactly at upper_bound. Such range
230
+ // tombstones will be included in the next file and are not relevant
231
+ // to the point keys or endpoints of the current file.
232
+ break;
233
+ }
234
+ }
235
+
236
+ if (bottommost_level && tombstone.seq_ <= earliest_snapshot) {
237
+ // TODO(andrewkr): tombstones that span multiple output files are
238
+ // counted for each compaction output file, so lots of double
239
+ // counting.
240
+ range_del_out_stats.num_range_del_drop_obsolete++;
241
+ range_del_out_stats.num_record_drop_obsolete++;
242
+ continue;
243
+ }
244
+
245
+ auto kv = tombstone.Serialize();
246
+ assert(lower_bound == nullptr ||
247
+ ucmp->Compare(*lower_bound, kv.second) < 0);
248
+ // Range tombstone is not supported by output validator yet.
249
+ builder_->Add(kv.first.Encode(), kv.second);
250
+ InternalKey smallest_candidate = std::move(kv.first);
251
+ if (lower_bound != nullptr &&
252
+ ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
253
+ // Pretend the smallest key has the same user key as lower_bound
254
+ // (the max key in the previous table or subcompaction) in order for
255
+ // files to appear key-space partitioned.
256
+ //
257
+ // When lower_bound is chosen by a subcompaction, we know that
258
+ // subcompactions over smaller keys cannot contain any keys at
259
+ // lower_bound. We also know that smaller subcompactions exist,
260
+ // because otherwise the subcompaction would be unbounded on the left.
261
+ // As a result, we know that no other files on the output level will
262
+ // contain actual keys at lower_bound (an output file may have a
263
+ // largest key of lower_bound@kMaxSequenceNumber, but this only
264
+ // indicates a large range tombstone was truncated). Therefore, it is
265
+ // safe to use the tombstone's sequence number, to ensure that keys at
266
+ // lower_bound at lower levels are covered by truncated tombstones.
267
+ //
268
+ // If lower_bound was chosen by the smallest data key in the file,
269
+ // choose lowest seqnum so this file's smallest internal key comes
270
+ // after the previous file's largest. The fake seqnum is OK because
271
+ // the read path's file-picking code only considers user key.
272
+ smallest_candidate = InternalKey(
273
+ *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0,
274
+ kTypeRangeDeletion);
275
+ }
276
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
277
+ if (upper_bound != nullptr &&
278
+ ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
279
+ // Pretend the largest key has the same user key as upper_bound (the
280
+ // min key in the following table or subcompaction) in order for files
281
+ // to appear key-space partitioned.
282
+ //
283
+ // Choose highest seqnum so this file's largest internal key comes
284
+ // before the next file's/subcompaction's smallest. The fake seqnum is
285
+ // OK because the read path's file-picking code only considers the
286
+ // user key portion.
287
+ //
288
+ // Note Seek() also creates InternalKey with (user_key,
289
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
290
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
291
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
292
+ // next file for the user key.
293
+ largest_candidate =
294
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
295
+ }
296
+ #ifndef NDEBUG
297
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
298
+ if (meta.smallest.size() > 0) {
299
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
300
+ }
301
+ #endif
302
+ meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
303
+ tombstone.seq_, icmp);
304
+ // The smallest key in a file is used for range tombstone truncation, so
305
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
306
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
307
+ // deleted keys at lower levels.
308
+ assert(smallest_ikey_seqnum == 0 ||
309
+ ExtractInternalKeyFooter(meta.smallest.Encode()) !=
310
+ PackSequenceAndType(0, kTypeRangeDeletion));
311
+ }
312
+ return Status::OK();
313
+ }
314
+ } // namespace ROCKSDB_NAMESPACE