@nxtedition/rocksdb 7.0.24 → 7.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +12 -3
- package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
- package/deps/rocksdb/rocksdb/Makefile +6 -2
- package/deps/rocksdb/rocksdb/TARGETS +14 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
- package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
- package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
- package/deps/rocksdb/rocksdb/db/c.cc +68 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
- package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
- package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
- package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
- package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
- package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
- package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
- package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
- package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
- package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
- package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
- package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
- package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
- package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
- package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
- package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
- package/deps/rocksdb/rocksdb/src.mk +5 -0
- package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
- package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/util/compression.h +2 -0
- package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
- package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
- package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb.gyp +5 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include "db/blob/blob_file_completion_callback.h"
|
|
21
21
|
#include "db/column_family.h"
|
|
22
22
|
#include "db/compaction/compaction_iterator.h"
|
|
23
|
+
#include "db/compaction/compaction_outputs.h"
|
|
23
24
|
#include "db/flush_scheduler.h"
|
|
24
25
|
#include "db/internal_stats.h"
|
|
25
26
|
#include "db/job_context.h"
|
|
@@ -47,6 +48,7 @@
|
|
|
47
48
|
namespace ROCKSDB_NAMESPACE {
|
|
48
49
|
|
|
49
50
|
class Arena;
|
|
51
|
+
class CompactionState;
|
|
50
52
|
class ErrorHandler;
|
|
51
53
|
class MemTable;
|
|
52
54
|
class SnapshotChecker;
|
|
@@ -56,11 +58,91 @@ class Version;
|
|
|
56
58
|
class VersionEdit;
|
|
57
59
|
class VersionSet;
|
|
58
60
|
|
|
61
|
+
class SubcompactionState;
|
|
62
|
+
|
|
59
63
|
// CompactionJob is responsible for executing the compaction. Each (manual or
|
|
60
64
|
// automated) compaction corresponds to a CompactionJob object, and usually
|
|
61
65
|
// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
|
|
62
66
|
// will divide the compaction into subcompactions and execute them in parallel
|
|
63
67
|
// if needed.
|
|
68
|
+
//
|
|
69
|
+
// CompactionJob has 2 main stats:
|
|
70
|
+
// 1. CompactionJobStats compaction_job_stats_
|
|
71
|
+
// CompactionJobStats is a public data structure which is part of Compaction
|
|
72
|
+
// event listener that RocksDB uses to share the job stats with the user.
|
|
73
|
+
// Internally it's an aggregation of all the compaction_job_stats from each
|
|
74
|
+
// `SubcompactionState`:
|
|
75
|
+
// +------------------------+
|
|
76
|
+
// | SubcompactionState |
|
|
77
|
+
// | |
|
|
78
|
+
// +--------->| compaction_job_stats |
|
|
79
|
+
// | | |
|
|
80
|
+
// | +------------------------+
|
|
81
|
+
// +------------------------+ |
|
|
82
|
+
// | CompactionJob | | +------------------------+
|
|
83
|
+
// | | | | SubcompactionState |
|
|
84
|
+
// | compaction_job_stats +-----+ | |
|
|
85
|
+
// | | +--------->| compaction_job_stats |
|
|
86
|
+
// | | | | |
|
|
87
|
+
// +------------------------+ | +------------------------+
|
|
88
|
+
// |
|
|
89
|
+
// | +------------------------+
|
|
90
|
+
// | | SubcompactionState |
|
|
91
|
+
// | | |
|
|
92
|
+
// +--------->+ compaction_job_stats |
|
|
93
|
+
// | | |
|
|
94
|
+
// | +------------------------+
|
|
95
|
+
// |
|
|
96
|
+
// | +------------------------+
|
|
97
|
+
// | | ... |
|
|
98
|
+
// +--------->+ |
|
|
99
|
+
// +------------------------+
|
|
100
|
+
//
|
|
101
|
+
// 2. CompactionStatsFull compaction_stats_
|
|
102
|
+
// `CompactionStatsFull` is an internal stats structure for the compaction, which
|
|
103
|
+
// is eventually sent to `ColumnFamilyData::internal_stats_` and used for
|
|
104
|
+
// logging and public metrics.
|
|
105
|
+
// Internally, it's an aggregation of stats_ from each `SubcompactionState`.
|
|
106
|
+
// It has 2 parts: normal stats about the main compaction information and
|
|
107
|
+
// the penultimate level output stats.
|
|
108
|
+
// `SubcompactionState` maintains the CompactionOutputs for normal output and
|
|
109
|
+
// the penultimate level output if it exists; the per-level stats are
|
|
110
|
+
// stored with the outputs.
|
|
111
|
+
// +---------------------------+
|
|
112
|
+
// | SubcompactionState |
|
|
113
|
+
// | |
|
|
114
|
+
// | +----------------------+ |
|
|
115
|
+
// | | CompactionOutputs | |
|
|
116
|
+
// | | (normal output) | |
|
|
117
|
+
// +---->| stats_ | |
|
|
118
|
+
// | | +----------------------+ |
|
|
119
|
+
// | | |
|
|
120
|
+
// | | +----------------------+ |
|
|
121
|
+
// +--------------------------------+ | | | CompactionOutputs | |
|
|
122
|
+
// | CompactionJob | | | | (penultimate_level) | |
|
|
123
|
+
// | | +--------->| stats_ | |
|
|
124
|
+
// | compaction_stats_ | | | | +----------------------+ |
|
|
125
|
+
// | +-------------------------+ | | | | |
|
|
126
|
+
// | |stats (normal) |------|----+ +---------------------------+
|
|
127
|
+
// | +-------------------------+ | | |
|
|
128
|
+
// | | | |
|
|
129
|
+
// | +-------------------------+ | | | +---------------------------+
|
|
130
|
+
// | |penultimate_level_stats +------+ | | SubcompactionState |
|
|
131
|
+
// | +-------------------------+ | | | | |
|
|
132
|
+
// | | | | | +----------------------+ |
|
|
133
|
+
// | | | | | | CompactionOutputs | |
|
|
134
|
+
// +--------------------------------+ | | | | (normal output) | |
|
|
135
|
+
// | +---->| stats_ | |
|
|
136
|
+
// | | +----------------------+ |
|
|
137
|
+
// | | |
|
|
138
|
+
// | | +----------------------+ |
|
|
139
|
+
// | | | CompactionOutputs | |
|
|
140
|
+
// | | | (penultimate_level) | |
|
|
141
|
+
// +--------->| stats_ | |
|
|
142
|
+
// | +----------------------+ |
|
|
143
|
+
// | |
|
|
144
|
+
// +---------------------------+
|
|
145
|
+
|
|
64
146
|
class CompactionJob {
|
|
65
147
|
public:
|
|
66
148
|
CompactionJob(
|
|
@@ -107,11 +189,6 @@ class CompactionJob {
|
|
|
107
189
|
IOStatus io_status() const { return io_status_; }
|
|
108
190
|
|
|
109
191
|
protected:
|
|
110
|
-
struct SubcompactionState;
|
|
111
|
-
// CompactionJob state
|
|
112
|
-
struct CompactionState;
|
|
113
|
-
|
|
114
|
-
void AggregateStatistics();
|
|
115
192
|
void UpdateCompactionStats();
|
|
116
193
|
void LogCompaction();
|
|
117
194
|
virtual void RecordCompactionIOStats();
|
|
@@ -122,7 +199,7 @@ class CompactionJob {
|
|
|
122
199
|
void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
|
|
123
200
|
|
|
124
201
|
CompactionState* compact_;
|
|
125
|
-
InternalStats::
|
|
202
|
+
InternalStats::CompactionStatsFull compaction_stats_;
|
|
126
203
|
const ImmutableDBOptions& db_options_;
|
|
127
204
|
const MutableDBOptions mutable_db_options_copy_;
|
|
128
205
|
LogBuffer* log_buffer_;
|
|
@@ -135,6 +212,8 @@ class CompactionJob {
|
|
|
135
212
|
|
|
136
213
|
IOStatus io_status_;
|
|
137
214
|
|
|
215
|
+
CompactionJobStats* compaction_job_stats_;
|
|
216
|
+
|
|
138
217
|
private:
|
|
139
218
|
friend class CompactionJobTestBase;
|
|
140
219
|
|
|
@@ -150,15 +229,14 @@ class CompactionJob {
|
|
|
150
229
|
|
|
151
230
|
// update the thread status for starting a compaction.
|
|
152
231
|
void ReportStartedCompaction(Compaction* compaction);
|
|
153
|
-
void AllocateCompactionOutputFileNumbers();
|
|
154
232
|
|
|
155
|
-
Status FinishCompactionOutputFile(
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
const Slice* next_table_min_key = nullptr);
|
|
233
|
+
Status FinishCompactionOutputFile(const Status& input_status,
|
|
234
|
+
SubcompactionState* sub_compact,
|
|
235
|
+
CompactionOutputs& outputs,
|
|
236
|
+
const Slice& next_table_min_key);
|
|
160
237
|
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
|
|
161
|
-
Status OpenCompactionOutputFile(SubcompactionState* sub_compact
|
|
238
|
+
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
|
|
239
|
+
CompactionOutputs& outputs);
|
|
162
240
|
void UpdateCompactionJobStats(
|
|
163
241
|
const InternalStats::CompactionStats& stats) const;
|
|
164
242
|
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
|
|
@@ -167,20 +245,12 @@ class CompactionJob {
|
|
|
167
245
|
void UpdateCompactionInputStatsHelper(
|
|
168
246
|
int* num_files, uint64_t* bytes_read, int input_level);
|
|
169
247
|
|
|
170
|
-
#ifndef ROCKSDB_LITE
|
|
171
|
-
void BuildSubcompactionJobInfo(
|
|
172
|
-
SubcompactionState* sub_compact,
|
|
173
|
-
SubcompactionJobInfo* subcompaction_job_info) const;
|
|
174
|
-
#endif // ROCKSDB_LITE
|
|
175
|
-
|
|
176
248
|
void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
|
|
177
249
|
|
|
178
250
|
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
|
|
179
251
|
|
|
180
252
|
uint32_t job_id_;
|
|
181
253
|
|
|
182
|
-
CompactionJobStats* compaction_job_stats_;
|
|
183
|
-
|
|
184
254
|
// DBImpl state
|
|
185
255
|
const std::string& dbname_;
|
|
186
256
|
const std::string db_id_;
|
|
@@ -222,14 +292,12 @@ class CompactionJob {
|
|
|
222
292
|
bool measure_io_stats_;
|
|
223
293
|
// Stores the Slices that designate the boundaries for each subcompaction
|
|
224
294
|
std::vector<Slice> boundaries_;
|
|
225
|
-
// Stores the approx size of keys covered in the range of each subcompaction
|
|
226
|
-
std::vector<uint64_t> sizes_;
|
|
227
295
|
Env::Priority thread_pri_;
|
|
228
296
|
std::string full_history_ts_low_;
|
|
229
297
|
std::string trim_ts_;
|
|
230
298
|
BlobFileCompletionCallback* blob_callback_;
|
|
231
299
|
|
|
232
|
-
uint64_t GetCompactionId(SubcompactionState* sub_compact);
|
|
300
|
+
uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
|
|
233
301
|
|
|
234
302
|
// Get the name of the table file being written to, which should also be in
|
|
235
303
|
// `output_directory_`.
|
|
@@ -265,7 +333,6 @@ struct CompactionServiceInput {
|
|
|
265
333
|
std::string begin;
|
|
266
334
|
bool has_end = false;
|
|
267
335
|
std::string end;
|
|
268
|
-
uint64_t approx_size = 0;
|
|
269
336
|
|
|
270
337
|
// serialization interface to read and write the object
|
|
271
338
|
static Status Read(const std::string& data_str, CompactionServiceInput* obj);
|
|
@@ -357,7 +424,7 @@ class CompactionServiceCompactionJob : private CompactionJob {
|
|
|
357
424
|
const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
|
|
358
425
|
const std::atomic<bool>& manual_compaction_canceled,
|
|
359
426
|
const std::string& db_id, const std::string& db_session_id,
|
|
360
|
-
|
|
427
|
+
std::string output_path,
|
|
361
428
|
const CompactionServiceInput& compaction_service_input,
|
|
362
429
|
CompactionServiceResult* compaction_service_result);
|
|
363
430
|
|
|
@@ -482,6 +482,17 @@ class CompactionJobTestBase : public testing::Test {
|
|
|
482
482
|
cfd_ = versions_->GetColumnFamilySet()->GetDefault();
|
|
483
483
|
}
|
|
484
484
|
|
|
485
|
+
void RunLastLevelCompaction(
|
|
486
|
+
const std::vector<std::vector<FileMetaData*>>& input_files,
|
|
487
|
+
std::function<void(Compaction& comp)>&& verify_func,
|
|
488
|
+
const std::vector<SequenceNumber>& snapshots = {}) {
|
|
489
|
+
const int kLastLevel = cf_options_.num_levels - 1;
|
|
490
|
+
verify_per_key_placement_ = std::move(verify_func);
|
|
491
|
+
mock::KVVector empty_map;
|
|
492
|
+
RunCompaction(input_files, empty_map, snapshots, kMaxSequenceNumber,
|
|
493
|
+
kLastLevel, false);
|
|
494
|
+
}
|
|
495
|
+
|
|
485
496
|
void RunCompaction(
|
|
486
497
|
const std::vector<std::vector<FileMetaData*>>& input_files,
|
|
487
498
|
const mock::KVVector& expected_results,
|
|
@@ -571,6 +582,12 @@ class CompactionJobTestBase : public testing::Test {
|
|
|
571
582
|
if (check_get_priority) {
|
|
572
583
|
CheckGetRateLimiterPriority(compaction_job);
|
|
573
584
|
}
|
|
585
|
+
|
|
586
|
+
if (verify_per_key_placement_) {
|
|
587
|
+
// Verify per_key_placement compaction
|
|
588
|
+
assert(compaction.SupportsPerKeyPlacement());
|
|
589
|
+
verify_per_key_placement_(compaction);
|
|
590
|
+
}
|
|
574
591
|
}
|
|
575
592
|
|
|
576
593
|
void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
|
|
@@ -620,6 +637,7 @@ class CompactionJobTestBase : public testing::Test {
|
|
|
620
637
|
std::string full_history_ts_low_;
|
|
621
638
|
const std::function<std::string(uint64_t)> encode_u64_ts_;
|
|
622
639
|
bool test_io_priority_;
|
|
640
|
+
std::function<void(Compaction& comp)> verify_per_key_placement_;
|
|
623
641
|
};
|
|
624
642
|
|
|
625
643
|
// TODO(icanadi) Make it simpler once we mock out VersionSet
|
|
@@ -1311,6 +1329,75 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) {
|
|
|
1311
1329
|
/* expected_oldest_blob_file_number */ 19);
|
|
1312
1330
|
}
|
|
1313
1331
|
|
|
1332
|
+
TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
|
|
1333
|
+
cf_options_.bottommost_temperature = Temperature::kCold;
|
|
1334
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
1335
|
+
"Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
|
|
1336
|
+
auto supports_per_key_placement = static_cast<bool*>(arg);
|
|
1337
|
+
*supports_per_key_placement = true;
|
|
1338
|
+
});
|
|
1339
|
+
|
|
1340
|
+
std::atomic_uint64_t latest_cold_seq = 0;
|
|
1341
|
+
|
|
1342
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
1343
|
+
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
|
1344
|
+
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
|
1345
|
+
context->output_to_penultimate_level =
|
|
1346
|
+
context->seq_num > latest_cold_seq;
|
|
1347
|
+
});
|
|
1348
|
+
SyncPoint::GetInstance()->EnableProcessing();
|
|
1349
|
+
|
|
1350
|
+
NewDB();
|
|
1351
|
+
|
|
1352
|
+
// Add files on different levels that may overlap
|
|
1353
|
+
auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
|
|
1354
|
+
AddMockFile(file0_1);
|
|
1355
|
+
|
|
1356
|
+
auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
|
|
1357
|
+
{KeyStr("f", 11U, kTypeValue), "val"}});
|
|
1358
|
+
AddMockFile(file1_1, 1);
|
|
1359
|
+
auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
|
|
1360
|
+
{KeyStr("k", 13U, kTypeValue), "val"}});
|
|
1361
|
+
AddMockFile(file1_2, 1);
|
|
1362
|
+
auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
|
|
1363
|
+
{KeyStr("u", 15U, kTypeValue), "val"}});
|
|
1364
|
+
AddMockFile(file1_3, 1);
|
|
1365
|
+
|
|
1366
|
+
auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
|
|
1367
|
+
{KeyStr("h", 9U, kTypeValue), "val"}});
|
|
1368
|
+
AddMockFile(file2_1, 2);
|
|
1369
|
+
auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
|
|
1370
|
+
{KeyStr("p", 7U, kTypeValue), "val"}});
|
|
1371
|
+
AddMockFile(file2_2, 2);
|
|
1372
|
+
|
|
1373
|
+
auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
|
|
1374
|
+
{KeyStr("k", 3U, kTypeValue), "val"}});
|
|
1375
|
+
AddMockFile(file3_1, 3);
|
|
1376
|
+
auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
|
|
1377
|
+
{KeyStr("x", 5U, kTypeValue), "val"}});
|
|
1378
|
+
AddMockFile(file3_2, 3);
|
|
1379
|
+
|
|
1380
|
+
auto cfd = versions_->GetColumnFamilySet()->GetDefault();
|
|
1381
|
+
auto files0 = cfd->current()->storage_info()->LevelFiles(0);
|
|
1382
|
+
auto files1 = cfd->current()->storage_info()->LevelFiles(1);
|
|
1383
|
+
auto files2 = cfd->current()->storage_info()->LevelFiles(2);
|
|
1384
|
+
auto files3 = cfd->current()->storage_info()->LevelFiles(3);
|
|
1385
|
+
|
|
1386
|
+
RunLastLevelCompaction(
|
|
1387
|
+
{files0, files1, files2, files3}, /*verify_func=*/[&](Compaction& comp) {
|
|
1388
|
+
for (char c = 'a'; c <= 'z'; c++) {
|
|
1389
|
+
std::string c_str;
|
|
1390
|
+
c_str = c;
|
|
1391
|
+
const Slice key(c_str);
|
|
1392
|
+
if (c == 'a') {
|
|
1393
|
+
ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
|
|
1394
|
+
} else {
|
|
1395
|
+
ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
|
|
1396
|
+
}
|
|
1397
|
+
}
|
|
1398
|
+
});
|
|
1399
|
+
}
|
|
1400
|
+
|
|
1314
1401
|
TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
|
|
1315
1402
|
db_options_.enforce_single_del_contracts = false;
|
|
1316
1403
|
NewDB();
|
|
@@ -1360,7 +1447,6 @@ TEST_F(CompactionJobTest, InputSerialization) {
|
|
|
1360
1447
|
if (input.has_end) {
|
|
1361
1448
|
input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
|
|
1362
1449
|
}
|
|
1363
|
-
input.approx_size = rnd64.Uniform(UINT64_MAX);
|
|
1364
1450
|
|
|
1365
1451
|
std::string output;
|
|
1366
1452
|
ASSERT_OK(input.Write(&output));
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
//
|
|
3
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
4
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
5
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
6
|
+
//
|
|
7
|
+
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
8
|
+
// Use of this source code is governed by a BSD-style license that can be
|
|
9
|
+
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
10
|
+
|
|
11
|
+
#include "db/compaction/compaction_outputs.h"
|
|
12
|
+
|
|
13
|
+
#include "db/builder.h"
|
|
14
|
+
|
|
15
|
+
namespace ROCKSDB_NAMESPACE {
|
|
16
|
+
|
|
17
|
+
void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
|
|
18
|
+
builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
Status CompactionOutputs::Finish(const Status& intput_status) {
|
|
22
|
+
FileMetaData* meta = GetMetaData();
|
|
23
|
+
assert(meta != nullptr);
|
|
24
|
+
Status s = intput_status;
|
|
25
|
+
if (s.ok()) {
|
|
26
|
+
s = builder_->Finish();
|
|
27
|
+
} else {
|
|
28
|
+
builder_->Abandon();
|
|
29
|
+
}
|
|
30
|
+
Status io_s = builder_->io_status();
|
|
31
|
+
if (s.ok()) {
|
|
32
|
+
s = io_s;
|
|
33
|
+
} else {
|
|
34
|
+
io_s.PermitUncheckedError();
|
|
35
|
+
}
|
|
36
|
+
const uint64_t current_bytes = builder_->FileSize();
|
|
37
|
+
if (s.ok()) {
|
|
38
|
+
meta->fd.file_size = current_bytes;
|
|
39
|
+
meta->marked_for_compaction = builder_->NeedCompact();
|
|
40
|
+
}
|
|
41
|
+
current_output().finished = true;
|
|
42
|
+
stats_.bytes_written += current_bytes;
|
|
43
|
+
stats_.num_output_files = outputs_.size();
|
|
44
|
+
|
|
45
|
+
return s;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
|
|
49
|
+
SystemClock* clock,
|
|
50
|
+
Statistics* statistics,
|
|
51
|
+
bool use_fsync) {
|
|
52
|
+
IOStatus io_s;
|
|
53
|
+
if (input_status.ok()) {
|
|
54
|
+
StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
|
|
55
|
+
io_s = file_writer_->Sync(use_fsync);
|
|
56
|
+
}
|
|
57
|
+
if (input_status.ok() && io_s.ok()) {
|
|
58
|
+
io_s = file_writer_->Close();
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
if (input_status.ok() && io_s.ok()) {
|
|
62
|
+
FileMetaData* meta = GetMetaData();
|
|
63
|
+
meta->file_checksum = file_writer_->GetFileChecksum();
|
|
64
|
+
meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
file_writer_.reset();
|
|
68
|
+
|
|
69
|
+
return io_s;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
Status CompactionOutputs::AddToOutput(
|
|
73
|
+
const CompactionIterator& c_iter,
|
|
74
|
+
const CompactionFileOpenFunc& open_file_func,
|
|
75
|
+
const CompactionFileCloseFunc& close_file_func) {
|
|
76
|
+
Status s;
|
|
77
|
+
const Slice& key = c_iter.key();
|
|
78
|
+
|
|
79
|
+
if (!pending_close_ && c_iter.Valid() && partitioner_ && HasBuilder() &&
|
|
80
|
+
partitioner_->ShouldPartition(
|
|
81
|
+
PartitionerRequest(last_key_for_partitioner_, c_iter.user_key(),
|
|
82
|
+
current_output_file_size_)) == kRequired) {
|
|
83
|
+
pending_close_ = true;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
if (pending_close_) {
|
|
87
|
+
s = close_file_func(*this, c_iter.InputStatus(), key);
|
|
88
|
+
pending_close_ = false;
|
|
89
|
+
}
|
|
90
|
+
if (!s.ok()) {
|
|
91
|
+
return s;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// Open output file if necessary
|
|
95
|
+
if (!HasBuilder()) {
|
|
96
|
+
s = open_file_func(*this);
|
|
97
|
+
}
|
|
98
|
+
if (!s.ok()) {
|
|
99
|
+
return s;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
Output& curr = current_output();
|
|
103
|
+
assert(builder_ != nullptr);
|
|
104
|
+
const Slice& value = c_iter.value();
|
|
105
|
+
s = curr.validator.Add(key, value);
|
|
106
|
+
if (!s.ok()) {
|
|
107
|
+
return s;
|
|
108
|
+
}
|
|
109
|
+
builder_->Add(key, value);
|
|
110
|
+
|
|
111
|
+
stats_.num_output_records++;
|
|
112
|
+
current_output_file_size_ = builder_->EstimatedFileSize();
|
|
113
|
+
|
|
114
|
+
if (blob_garbage_meter_) {
|
|
115
|
+
s = blob_garbage_meter_->ProcessOutFlow(key, value);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
if (!s.ok()) {
|
|
119
|
+
return s;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
const ParsedInternalKey& ikey = c_iter.ikey();
|
|
123
|
+
s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
|
|
124
|
+
ikey.type);
|
|
125
|
+
|
|
126
|
+
// Close output file if it is big enough. Two possibilities determine it's
|
|
127
|
+
// time to close it: (1) the current key should be this file's last key, (2)
|
|
128
|
+
// the next key should not be in this file.
|
|
129
|
+
//
|
|
130
|
+
// TODO(aekmekji): determine if file should be closed earlier than this
|
|
131
|
+
// during subcompactions (i.e. if output size, estimated by input size, is
|
|
132
|
+
// going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
|
|
133
|
+
// and 0.6MB instead of 1MB and 0.2MB)
|
|
134
|
+
if (compaction_->output_level() != 0 &&
|
|
135
|
+
current_output_file_size_ >= compaction_->max_output_file_size()) {
|
|
136
|
+
pending_close_ = true;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (partitioner_) {
|
|
140
|
+
last_key_for_partitioner_.assign(c_iter.user_key().data_,
|
|
141
|
+
c_iter.user_key().size_);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
return s;
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// Writes the range tombstones that belong to the current output file into
// `builder_` and widens the file's recorded key boundaries to cover them.
//
// comp_start / comp_end: user-key boundaries of the subcompaction; either may
//     be nullptr, in which case that side is treated as unbounded.
// range_del_out_stats: receives drop counters for tombstones that are
//     obsolete at the bottommost level.
// bottommost_level / earliest_snapshot: a tombstone whose seqnum is at or
//     below `earliest_snapshot` is dropped when `bottommost_level` is set.
// icmp: internal key comparator; its user comparator orders the bounds.
// next_table_min_key: internal key that starts the next output file, or an
//     empty slice when there is no next file.
//
// Always returns Status::OK().
Status CompactionOutputs::AddRangeDels(
    const Slice* comp_start, const Slice* comp_end,
    CompactionIterationStats& range_del_out_stats, bool bottommost_level,
    const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
    const Slice& next_table_min_key) {
  assert(HasRangeDel());
  FileMetaData& file_meta = current_output().meta;
  const Comparator* user_cmp = icmp.user_comparator();

  // ---- Lower bound -------------------------------------------------------
  // The first output table takes tombstones that precede its min key but lie
  // after the subcompaction boundary. Later tables start from their own min
  // key, because the previous file was already extended to hold the
  // tombstones that fall before it.
  std::string smallest_user_key;
  Slice lower_bound_guard;
  const Slice* lower_bound = nullptr;
  const bool lower_bound_from_sub_compact = (outputs_.size() == 1);
  if (lower_bound_from_sub_compact) {
    lower_bound = comp_start;
  } else if (file_meta.smallest.size() > 0) {
    smallest_user_key = file_meta.smallest.user_key().ToString(false /*hex*/);
    lower_bound_guard = Slice(smallest_user_key);
    lower_bound = &lower_bound_guard;
  }

  // ---- Upper bound -------------------------------------------------------
  // Without a next table this is the last file of the subcompaction, so the
  // range extends to the subcompaction end. Otherwise the next file's start
  // key is compared with the subcompaction end key and the smaller of the two
  // is used, which keeps output files from overlapping. (When the end key is
  // chosen by the subcompaction it must be the biggest key in the output
  // file, so picking the smaller one is safe.)
  Slice upper_bound_guard;
  const Slice* upper_bound = comp_end;
  if (!next_table_min_key.empty()) {
    upper_bound_guard = ExtractUserKey(next_table_min_key);
    const bool next_key_reaches_comp_end =
        comp_end != nullptr &&
        user_cmp->Compare(upper_bound_guard, *comp_end) >= 0;
    if (!next_key_reaches_comp_end) {
      upper_bound = &upper_bound_guard;
    }
  }

  // True only when this file's largest user key equals the upper bound.
  const bool has_overlapping_endpoints =
      upper_bound != nullptr && file_meta.largest.size() > 0 &&
      user_cmp->Compare(file_meta.largest.user_key(), *upper_bound) == 0;

  // The subcompaction end key can never be smaller than the upper bound. A
  // null end key or a null upper bound means this is the last file in the
  // compaction, so nothing can overlap with it.
  assert(comp_end == nullptr || upper_bound == nullptr ||
         user_cmp->Compare(*upper_bound, *comp_end) <= 0);

  auto tombstone_iter = range_del_agg_->NewIterator(
      lower_bound, upper_bound, has_overlapping_endpoints);
  // Skip tombstone fragments that lie entirely before the range of interest.
  if (lower_bound == nullptr) {
    tombstone_iter->SeekToFirst();
  } else {
    tombstone_iter->Seek(*lower_bound);
  }

  for (; tombstone_iter->Valid(); tombstone_iter->Next()) {
    auto tombstone = tombstone_iter->Tombstone();
    if (upper_bound != nullptr) {
      const int cmp = user_cmp->Compare(*upper_bound, tombstone.start_key_);
      // A tombstone that starts after upper_bound belongs to the next table
      // only. One that starts exactly at upper_bound can also be skipped when
      // the current SST ends before upper_bound (no overlapping endpoints):
      // it will be written to the next file and is irrelevant to this file's
      // point keys and endpoints.
      if (cmp < 0 || (cmp == 0 && !has_overlapping_endpoints)) {
        break;
      }
    }

    if (bottommost_level && tombstone.seq_ <= earliest_snapshot) {
      // TODO(andrewkr): a tombstone spanning several output files is counted
      // once per file, so these counters double count heavily.
      range_del_out_stats.num_range_del_drop_obsolete++;
      range_del_out_stats.num_record_drop_obsolete++;
      continue;
    }

    auto serialized = tombstone.Serialize();
    assert(lower_bound == nullptr ||
           user_cmp->Compare(*lower_bound, serialized.second) < 0);
    // The output validator does not support range tombstones yet, so the
    // entry goes straight to the table builder.
    builder_->Add(serialized.first.Encode(), serialized.second);

    InternalKey smallest_candidate = std::move(serialized.first);
    if (lower_bound != nullptr &&
        user_cmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {
      // Report the smallest key as if it had lower_bound's user key (the max
      // key of the previous table or subcompaction) so that files appear
      // key-space partitioned.
      //
      // If lower_bound came from a subcompaction, subcompactions over smaller
      // keys cannot contain any keys at lower_bound, and such smaller
      // subcompactions must exist, because otherwise this subcompaction would
      // be unbounded on the left. Hence no other output-level file holds
      // actual keys at lower_bound (a file may have a largest key of
      // lower_bound@kMaxSequenceNumber, but that only marks a truncated large
      // range tombstone). Using the tombstone's own seqnum is therefore safe
      // and keeps keys at lower_bound in lower levels covered by the
      // truncated tombstone.
      //
      // If lower_bound came from the file's smallest data key, use the lowest
      // seqnum so this file's smallest internal key sorts after the previous
      // file's largest. The fake seqnum is harmless: the read path's
      // file-picking code only looks at the user key.
      const auto truncated_seq =
          lower_bound_from_sub_compact ? tombstone.seq_ : 0;
      smallest_candidate =
          InternalKey(*lower_bound, truncated_seq, kTypeRangeDeletion);
    }

    InternalKey largest_candidate = tombstone.SerializeEndKey();
    if (upper_bound != nullptr &&
        user_cmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) {
      // Report the largest key as if it had upper_bound's user key (the min
      // key of the following table or subcompaction) so that files appear
      // key-space partitioned.
      //
      // The highest seqnum makes this file's largest internal key sort before
      // the next file's/subcompaction's smallest. The fake seqnum is harmless
      // because the read path's file-picking code only considers the user key
      // portion.
      //
      // Seek() also builds an InternalKey of (user_key, kMaxSequenceNumber),
      // but with kTypeDeletion (0x7) rather than kTypeRangeDeletion (0xF), so
      // the range tombstone sorts before the Seek() key in InternalKey order
      // and Seek() goes on to the next file for that user key.
      largest_candidate =
          InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
    }

#ifndef NDEBUG
    // Remember the seqnum of the smallest key before the boundaries change;
    // it is only consulted by the assertion below.
    SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
    if (file_meta.smallest.size() > 0) {
      smallest_ikey_seqnum = GetInternalKeySeqno(file_meta.smallest.Encode());
    }
#endif
    file_meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
                                       tombstone.seq_, icmp);
    // The file's smallest key drives range tombstone truncation, so it must
    // not carry seqnum 0 unless the smallest data key in the file already had
    // seqnum 0. Otherwise the truncated tombstone could expose deleted keys
    // at lower levels.
    assert(smallest_ikey_seqnum == 0 ||
           ExtractInternalKeyFooter(file_meta.smallest.Encode()) !=
               PackSequenceAndType(0, kTypeRangeDeletion));
  }
  return Status::OK();
}
|
|
314
|
+
} // namespace ROCKSDB_NAMESPACE
|