@nxtedition/rocksdb 7.0.23 → 7.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +3 -1
- package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
- package/deps/rocksdb/rocksdb/Makefile +6 -2
- package/deps/rocksdb/rocksdb/TARGETS +14 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
- package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
- package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
- package/deps/rocksdb/rocksdb/db/c.cc +68 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
- package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
- package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
- package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
- package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
- package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
- package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
- package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
- package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
- package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
- package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
- package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
- package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
- package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
- package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
- package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
- package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
- package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
- package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
- package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
- package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
- package/deps/rocksdb/rocksdb/src.mk +5 -0
- package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
- package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
- package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
- package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/util/compression.h +2 -0
- package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
- package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
- package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
- package/deps/rocksdb/rocksdb.gyp +5 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -81,6 +81,7 @@ class Compaction {
|
|
|
81
81
|
std::vector<FileMetaData*> grandparents,
|
|
82
82
|
bool manual_compaction = false, const std::string& trim_ts = "",
|
|
83
83
|
double score = -1, bool deletion_compaction = false,
|
|
84
|
+
bool l0_files_might_overlap = true,
|
|
84
85
|
CompactionReason compaction_reason = CompactionReason::kUnknown,
|
|
85
86
|
BlobGarbageCollectionPolicy blob_garbage_collection_policy =
|
|
86
87
|
BlobGarbageCollectionPolicy::kUseDefault,
|
|
@@ -181,7 +182,7 @@ class Compaction {
|
|
|
181
182
|
// split the output files according to the existing cursor in the output
|
|
182
183
|
// level under round-robin compaction policy. Empty indicates no required
|
|
183
184
|
// splitting key
|
|
184
|
-
const InternalKey GetOutputSplitKey() const { return output_split_key_; }
|
|
185
|
+
const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
|
|
185
186
|
|
|
186
187
|
// If true, then the compaction can be done by simply deleting input files.
|
|
187
188
|
bool deletion_compaction() const { return deletion_compaction_; }
|
|
@@ -301,7 +302,25 @@ class Compaction {
|
|
|
301
302
|
|
|
302
303
|
Slice GetLargestUserKey() const { return largest_user_key_; }
|
|
303
304
|
|
|
304
|
-
|
|
305
|
+
// Return true if the compaction supports per_key_placement
|
|
306
|
+
bool SupportsPerKeyPlacement() const;
|
|
307
|
+
|
|
308
|
+
// Get per_key_placement penultimate output level, which is `last_level - 1`
|
|
309
|
+
// if per_key_placement feature is supported. Otherwise, return -1.
|
|
310
|
+
int GetPenultimateLevel() const;
|
|
311
|
+
|
|
312
|
+
// Return true if the given range overlaps with the penultimate level output
|
|
313
|
+
// range.
|
|
314
|
+
bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
|
|
315
|
+
const Slice& largest_key) const;
|
|
316
|
+
|
|
317
|
+
// Return true if the key is within penultimate level output range for
|
|
318
|
+
// per_key_placement feature, which is safe to place the key to the
|
|
319
|
+
// penultimate level. Different compaction strategies have different rules.
|
|
320
|
+
// If per_key_placement is not supported, always return false.
|
|
321
|
+
// TODO: currently it doesn't support moving data from the last level to the
|
|
322
|
+
// penultimate level
|
|
323
|
+
bool WithinPenultimateLevelOutputRange(const Slice& key) const;
|
|
305
324
|
|
|
306
325
|
CompactionReason compaction_reason() const { return compaction_reason_; }
|
|
307
326
|
|
|
@@ -338,6 +357,15 @@ class Compaction {
|
|
|
338
357
|
return notify_on_compaction_completion_;
|
|
339
358
|
}
|
|
340
359
|
|
|
360
|
+
static constexpr int kInvalidLevel = -1;
|
|
361
|
+
// Evaluate penultimate output level. If the compaction supports
|
|
362
|
+
// per_key_placement feature, it returns the penultimate level number.
|
|
363
|
+
// Otherwise, it's set to kInvalidLevel (-1), which means
|
|
364
|
+
// output_to_penultimate_level is not supported.
|
|
365
|
+
static int EvaluatePenultimateLevel(const ImmutableOptions& immutable_options,
|
|
366
|
+
const int start_level,
|
|
367
|
+
const int output_level);
|
|
368
|
+
|
|
341
369
|
private:
|
|
342
370
|
// mark (or clear) all files that are being compacted
|
|
343
371
|
void MarkFilesBeingCompacted(bool mark_as_compacted);
|
|
@@ -345,7 +373,18 @@ class Compaction {
|
|
|
345
373
|
// get the smallest and largest key present in files to be compacted
|
|
346
374
|
static void GetBoundaryKeys(VersionStorageInfo* vstorage,
|
|
347
375
|
const std::vector<CompactionInputFiles>& inputs,
|
|
348
|
-
Slice* smallest_key, Slice* largest_key
|
|
376
|
+
Slice* smallest_key, Slice* largest_key,
|
|
377
|
+
int exclude_level = -1);
|
|
378
|
+
|
|
379
|
+
// populate penultimate level output range, which will be used to determine if
|
|
380
|
+
// a key is safe to output to the penultimate level (details see
|
|
381
|
+
// `Compaction::WithinPenultimateLevelOutputRange()`).
|
|
382
|
+
// TODO: Currently the penultimate level output range is the min/max keys of
|
|
383
|
+
// non-last-level input files, which is only correct if no key is moved
|
|
384
|
+
// from the last level to the penultimate level. For a more complicated per
|
|
385
|
+
// key placement which may move data from the last level to the penultimate
|
|
386
|
+
// level, it needs extra check.
|
|
387
|
+
void PopulatePenultimateLevelOutputRange();
|
|
349
388
|
|
|
350
389
|
// Get the atomic file boundaries for all files in the compaction. Necessary
|
|
351
390
|
// in order to avoid the scenario described in
|
|
@@ -386,7 +425,12 @@ class Compaction {
|
|
|
386
425
|
// If true, then the compaction can be done by simply deleting input files.
|
|
387
426
|
const bool deletion_compaction_;
|
|
388
427
|
// should it split the output file using the compact cursor?
|
|
389
|
-
InternalKey output_split_key_;
|
|
428
|
+
const InternalKey* output_split_key_;
|
|
429
|
+
|
|
430
|
+
// L0 files in LSM-tree might be overlapping. But the compaction picking
|
|
431
|
+
// logic might pick a subset of the files that aren't overlapping. If
|
|
432
|
+
// that is the case, set the value to false. Otherwise, set it true.
|
|
433
|
+
bool l0_files_might_overlap_;
|
|
390
434
|
|
|
391
435
|
// Compaction input files organized by level. Constant after construction
|
|
392
436
|
const std::vector<CompactionInputFiles> inputs_;
|
|
@@ -438,7 +482,35 @@ class Compaction {
|
|
|
438
482
|
|
|
439
483
|
// Blob garbage collection age cutoff.
|
|
440
484
|
double blob_garbage_collection_age_cutoff_;
|
|
485
|
+
|
|
486
|
+
// only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
|
|
487
|
+
// means not supported.
|
|
488
|
+
const int penultimate_level_;
|
|
489
|
+
|
|
490
|
+
// Key range for penultimate level output
|
|
491
|
+
Slice penultimate_level_smallest_user_key_;
|
|
492
|
+
Slice penultimate_level_largest_user_key_;
|
|
493
|
+
};
|
|
494
|
+
|
|
495
|
+
#ifndef NDEBUG
|
|
496
|
+
// Helper struct only for tests, which contains the data to decide if a key
|
|
497
|
+
// should be output to the penultimate level.
|
|
498
|
+
// TODO: remove this when the public feature knob is available
|
|
499
|
+
struct PerKeyPlacementContext {
|
|
500
|
+
const int level;
|
|
501
|
+
const Slice key;
|
|
502
|
+
const Slice value;
|
|
503
|
+
const SequenceNumber seq_num;
|
|
504
|
+
|
|
505
|
+
bool output_to_penultimate_level;
|
|
506
|
+
|
|
507
|
+
PerKeyPlacementContext(int _level, Slice _key, Slice _value,
|
|
508
|
+
SequenceNumber _seq_num)
|
|
509
|
+
: level(_level), key(_key), value(_value), seq_num(_seq_num) {
|
|
510
|
+
output_to_penultimate_level = false;
|
|
511
|
+
}
|
|
441
512
|
};
|
|
513
|
+
#endif /* !NDEBUG */
|
|
442
514
|
|
|
443
515
|
// Return sum of sizes of all files in `files`.
|
|
444
516
|
extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
|
|
@@ -1075,6 +1075,52 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
|
|
|
1075
1075
|
}
|
|
1076
1076
|
}
|
|
1077
1077
|
|
|
1078
|
+
void CompactionIterator::DecideOutputLevel() {
|
|
1079
|
+
#ifndef NDEBUG
|
|
1080
|
+
// TODO: will be set by sequence number or key range, for now, it will only be
|
|
1081
|
+
// set by unittest
|
|
1082
|
+
PerKeyPlacementContext context(level_, ikey_.user_key, value_,
|
|
1083
|
+
ikey_.sequence);
|
|
1084
|
+
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
|
|
1085
|
+
&context);
|
|
1086
|
+
output_to_penultimate_level_ = context.output_to_penultimate_level;
|
|
1087
|
+
#endif /* !NDEBUG */
|
|
1088
|
+
|
|
1089
|
+
// if the key is newer than the earliest snapshot, it has to be output to the
|
|
1090
|
+
// penultimate level.
|
|
1091
|
+
if (ikey_.sequence > earliest_snapshot_) {
|
|
1092
|
+
output_to_penultimate_level_ = true;
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1095
|
+
if (output_to_penultimate_level_) {
|
|
1096
|
+
// If it's decided to output to the penultimate level, but unsafe to do so,
|
|
1097
|
+
// still output to the last level. For example, moving the data from a lower
|
|
1098
|
+
// level to a higher level outside of the higher-level input key range is
|
|
1099
|
+
// considered unsafe, because the key may conflict with higher-level SSTs
|
|
1100
|
+
// not from this compaction.
|
|
1101
|
+
// TODO: add statistic for declined output_to_penultimate_level
|
|
1102
|
+
bool safe_to_penultimate_level =
|
|
1103
|
+
compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
|
|
1104
|
+
if (!safe_to_penultimate_level) {
|
|
1105
|
+
output_to_penultimate_level_ = false;
|
|
1106
|
+
// It could happen when disable/enable `bottommost_temperature` while
|
|
1107
|
+
// holding a snapshot. When `bottommost_temperature` is not set
|
|
1108
|
+
// (==kUnknown), the data newer than any snapshot is pushed to the last
|
|
1109
|
+
// level, but when the per_key_placement feature is enabled on the fly,
|
|
1110
|
+
// the data later than the snapshot has to be moved to the penultimate
|
|
1111
|
+
// level, which may or may not be safe. So the user needs to make sure all
|
|
1112
|
+
// snapshots are released before enabling the `bottommost_temperature` feature.
|
|
1113
|
+
// We will migrate the feature to `last_level_temperature` and maybe make
|
|
1114
|
+
// it not dynamically changeable.
|
|
1115
|
+
if (ikey_.sequence > earliest_snapshot_) {
|
|
1116
|
+
status_ = Status::Corruption(
|
|
1117
|
+
"Unsafe to store Seq later than snapshot in the last level if "
|
|
1118
|
+
"per_key_placement is enabled");
|
|
1119
|
+
}
|
|
1120
|
+
}
|
|
1121
|
+
}
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1078
1124
|
void CompactionIterator::PrepareOutput() {
|
|
1079
1125
|
if (valid_) {
|
|
1080
1126
|
if (ikey_.type == kTypeValue) {
|
|
@@ -1083,6 +1129,10 @@ void CompactionIterator::PrepareOutput() {
|
|
|
1083
1129
|
GarbageCollectBlobIfNeeded();
|
|
1084
1130
|
}
|
|
1085
1131
|
|
|
1132
|
+
if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
|
|
1133
|
+
DecideOutputLevel();
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1086
1136
|
// Zeroing out the sequence number leads to better compression.
|
|
1087
1137
|
// If this is the bottommost level (no files in lower levels)
|
|
1088
1138
|
// and the earliest snapshot is larger than this seqno
|
|
@@ -1097,7 +1147,8 @@ void CompactionIterator::PrepareOutput() {
|
|
|
1097
1147
|
if (valid_ && compaction_ != nullptr &&
|
|
1098
1148
|
!compaction_->allow_ingest_behind() && bottommost_level_ &&
|
|
1099
1149
|
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
|
|
1100
|
-
ikey_.type != kTypeMerge && current_key_committed_
|
|
1150
|
+
ikey_.type != kTypeMerge && current_key_committed_ &&
|
|
1151
|
+
!output_to_penultimate_level_) {
|
|
1101
1152
|
if (ikey_.type == kTypeDeletion ||
|
|
1102
1153
|
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
|
|
1103
1154
|
ROCKS_LOG_FATAL(
|
|
@@ -32,7 +32,7 @@ class SequenceIterWrapper : public InternalIterator {
|
|
|
32
32
|
public:
|
|
33
33
|
SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
|
|
34
34
|
bool need_count_entries)
|
|
35
|
-
: icmp_(cmp
|
|
35
|
+
: icmp_(cmp),
|
|
36
36
|
inner_iter_(iter),
|
|
37
37
|
need_count_entries_(need_count_entries) {}
|
|
38
38
|
bool Valid() const override { return inner_iter_->Valid(); }
|
|
@@ -105,6 +105,10 @@ class CompactionIterator {
|
|
|
105
105
|
virtual bool DoesInputReferenceBlobFiles() const = 0;
|
|
106
106
|
|
|
107
107
|
virtual const Compaction* real_compaction() const = 0;
|
|
108
|
+
|
|
109
|
+
virtual bool SupportsPerKeyPlacement() const = 0;
|
|
110
|
+
|
|
111
|
+
virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
|
|
108
112
|
};
|
|
109
113
|
|
|
110
114
|
class RealCompaction : public CompactionProxy {
|
|
@@ -163,6 +167,16 @@ class CompactionIterator {
|
|
|
163
167
|
|
|
164
168
|
const Compaction* real_compaction() const override { return compaction_; }
|
|
165
169
|
|
|
170
|
+
bool SupportsPerKeyPlacement() const override {
|
|
171
|
+
return compaction_->SupportsPerKeyPlacement();
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Check if key is within penultimate level output range, to see if it's
|
|
175
|
+
// safe to output to the penultimate level for per_key_placement feature.
|
|
176
|
+
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
|
|
177
|
+
return compaction_->WithinPenultimateLevelOutputRange(key);
|
|
178
|
+
}
|
|
179
|
+
|
|
166
180
|
private:
|
|
167
181
|
const Compaction* compaction_;
|
|
168
182
|
};
|
|
@@ -227,6 +241,12 @@ class CompactionIterator {
|
|
|
227
241
|
const Slice& user_key() const { return current_user_key_; }
|
|
228
242
|
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
|
|
229
243
|
uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
|
|
244
|
+
// Whether the current key should be placed on the penultimate level; only valid if
|
|
245
|
+
// per_key_placement is supported
|
|
246
|
+
bool output_to_penultimate_level() const {
|
|
247
|
+
return output_to_penultimate_level_;
|
|
248
|
+
}
|
|
249
|
+
Status InputStatus() const { return input_.status(); }
|
|
230
250
|
|
|
231
251
|
private:
|
|
232
252
|
// Processes the input stream to find the next output
|
|
@@ -235,6 +255,10 @@ class CompactionIterator {
|
|
|
235
255
|
// Do final preparations before presenting the output to the callee.
|
|
236
256
|
void PrepareOutput();
|
|
237
257
|
|
|
258
|
+
// Decide whether the current key should be output to the last level or the penultimate
|
|
259
|
+
// level. Only called for compactions that support per-key placement.
|
|
260
|
+
void DecideOutputLevel();
|
|
261
|
+
|
|
238
262
|
// Passes the output value to the blob file builder (if any), and replaces it
|
|
239
263
|
// with the corresponding blob reference if it has been actually written to a
|
|
240
264
|
// blob file (i.e. if it passed the value size check). Returns true if the
|
|
@@ -417,6 +441,11 @@ class CompactionIterator {
|
|
|
417
441
|
// just been zeroed out during bottommost compaction.
|
|
418
442
|
bool last_key_seq_zeroed_{false};
|
|
419
443
|
|
|
444
|
+
// True if the current key should be output to the penultimate level if
|
|
445
|
+
// possible, compaction logic makes the final decision on which level to
|
|
446
|
+
// output to.
|
|
447
|
+
bool output_to_penultimate_level_{false};
|
|
448
|
+
|
|
420
449
|
void AdvanceInputIter() { input_.Next(); }
|
|
421
450
|
|
|
422
451
|
void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
|
|
@@ -180,11 +180,21 @@ class FakeCompaction : public CompactionIterator::CompactionProxy {
|
|
|
180
180
|
|
|
181
181
|
const Compaction* real_compaction() const override { return nullptr; }
|
|
182
182
|
|
|
183
|
+
bool SupportsPerKeyPlacement() const override {
|
|
184
|
+
return supports_per_key_placement;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
|
|
188
|
+
return (!key.starts_with("unsafe_pb"));
|
|
189
|
+
}
|
|
190
|
+
|
|
183
191
|
bool key_not_exists_beyond_output_level = false;
|
|
184
192
|
|
|
185
193
|
bool is_bottommost_level = false;
|
|
186
194
|
|
|
187
195
|
bool is_allow_ingest_behind = false;
|
|
196
|
+
|
|
197
|
+
bool supports_per_key_placement = false;
|
|
188
198
|
};
|
|
189
199
|
|
|
190
200
|
// A simplified snapshot checker which assumes each snapshot has a global
|
|
@@ -254,6 +264,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
|
|
|
254
264
|
compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
|
|
255
265
|
compaction_proxy_->key_not_exists_beyond_output_level =
|
|
256
266
|
key_not_exists_beyond_output_level;
|
|
267
|
+
compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
|
|
257
268
|
compaction.reset(compaction_proxy_);
|
|
258
269
|
}
|
|
259
270
|
bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
|
|
@@ -295,6 +306,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
|
|
|
295
306
|
|
|
296
307
|
virtual bool AllowIngestBehind() const { return false; }
|
|
297
308
|
|
|
309
|
+
virtual bool SupportsPerKeyPlacement() const { return false; }
|
|
310
|
+
|
|
298
311
|
void RunTest(
|
|
299
312
|
const std::vector<std::string>& input_keys,
|
|
300
313
|
const std::vector<std::string>& input_values,
|
|
@@ -756,6 +769,119 @@ TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
|
|
|
756
769
|
INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
|
|
757
770
|
testing::Values(true, false));
|
|
758
771
|
|
|
772
|
+
class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
|
|
773
|
+
public:
|
|
774
|
+
bool SupportsPerKeyPlacement() const override { return true; }
|
|
775
|
+
};
|
|
776
|
+
|
|
777
|
+
TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
|
|
778
|
+
std::atomic_uint64_t latest_cold_seq = 0;
|
|
779
|
+
|
|
780
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
781
|
+
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
|
782
|
+
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
|
783
|
+
context->output_to_penultimate_level =
|
|
784
|
+
context->seq_num > latest_cold_seq;
|
|
785
|
+
});
|
|
786
|
+
SyncPoint::GetInstance()->EnableProcessing();
|
|
787
|
+
|
|
788
|
+
latest_cold_seq = 5;
|
|
789
|
+
|
|
790
|
+
InitIterators(
|
|
791
|
+
{test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
|
|
792
|
+
test::KeyStr("c", 5, kTypeValue)},
|
|
793
|
+
{"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
|
|
794
|
+
nullptr, nullptr, true);
|
|
795
|
+
c_iter_->SeekToFirst();
|
|
796
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
797
|
+
|
|
798
|
+
// the first 2 keys are hot, which should have
|
|
799
|
+
// `output_to_penultimate_level()==true` and seq num not zeroed out
|
|
800
|
+
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
|
801
|
+
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
|
802
|
+
c_iter_->Next();
|
|
803
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
804
|
+
ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
|
|
805
|
+
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
|
806
|
+
c_iter_->Next();
|
|
807
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
808
|
+
// `c` is cold data, which should be output to bottommost
|
|
809
|
+
ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
|
|
810
|
+
ASSERT_FALSE(c_iter_->output_to_penultimate_level());
|
|
811
|
+
c_iter_->Next();
|
|
812
|
+
ASSERT_OK(c_iter_->status());
|
|
813
|
+
ASSERT_FALSE(c_iter_->Valid());
|
|
814
|
+
}
|
|
815
|
+
|
|
816
|
+
TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
|
|
817
|
+
AddSnapshot(5);
|
|
818
|
+
|
|
819
|
+
InitIterators(
|
|
820
|
+
{test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
|
|
821
|
+
test::KeyStr("b", 5, kTypeValue)},
|
|
822
|
+
{"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
|
|
823
|
+
nullptr, nullptr, true);
|
|
824
|
+
c_iter_->SeekToFirst();
|
|
825
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
826
|
+
|
|
827
|
+
// The first key and the tombstone are within snapshot, which should output
|
|
828
|
+
// to the penultimate level (and seq num cannot be zeroed out).
|
|
829
|
+
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
|
830
|
+
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
|
831
|
+
c_iter_->Next();
|
|
832
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
833
|
+
ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
|
|
834
|
+
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
|
835
|
+
c_iter_->Next();
|
|
836
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
837
|
+
// `b` is not protected by the snapshot, the sequence number is zeroed out and
|
|
838
|
+
// should output bottommost
|
|
839
|
+
ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
|
|
840
|
+
ASSERT_FALSE(c_iter_->output_to_penultimate_level());
|
|
841
|
+
c_iter_->Next();
|
|
842
|
+
ASSERT_OK(c_iter_->status());
|
|
843
|
+
ASSERT_FALSE(c_iter_->Valid());
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
|
|
847
|
+
std::atomic_uint64_t latest_cold_seq = 0;
|
|
848
|
+
|
|
849
|
+
SyncPoint::GetInstance()->SetCallBack(
|
|
850
|
+
"CompactionIterator::PrepareOutput.context", [&](void* arg) {
|
|
851
|
+
auto context = static_cast<PerKeyPlacementContext*>(arg);
|
|
852
|
+
context->output_to_penultimate_level =
|
|
853
|
+
context->seq_num > latest_cold_seq;
|
|
854
|
+
});
|
|
855
|
+
SyncPoint::GetInstance()->EnableProcessing();
|
|
856
|
+
|
|
857
|
+
latest_cold_seq = 6;
|
|
858
|
+
|
|
859
|
+
AddSnapshot(5);
|
|
860
|
+
|
|
861
|
+
InitIterators({test::KeyStr("a", 7, kTypeValue),
|
|
862
|
+
test::KeyStr("unsafe_pb", 6, kTypeValue),
|
|
863
|
+
test::KeyStr("c", 5, kTypeValue)},
|
|
864
|
+
{"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
|
|
865
|
+
kMaxSequenceNumber, nullptr, nullptr, true);
|
|
866
|
+
c_iter_->SeekToFirst();
|
|
867
|
+
ASSERT_TRUE(c_iter_->Valid());
|
|
868
|
+
|
|
869
|
+
ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
|
|
870
|
+
ASSERT_TRUE(c_iter_->output_to_penultimate_level());
|
|
871
|
+
// the 2nd key is unsafe to output_to_penultimate_level, but it's within
|
|
872
|
+
// snapshot so for per_key_placement feature it has to be outputted to the
|
|
873
|
+
// penultimate level, which is a corruption. We should never see
|
|
874
|
+
// such case as the data with seq num (within snapshot) should always come
|
|
875
|
+
// from higher compaction input level, which makes it safe to
|
|
876
|
+
// output_to_penultimate_level.
|
|
877
|
+
c_iter_->Next();
|
|
878
|
+
ASSERT_TRUE(c_iter_->status().IsCorruption());
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
|
|
882
|
+
PerKeyPlacementCompIteratorTest,
|
|
883
|
+
testing::Values(true, false));
|
|
884
|
+
|
|
759
885
|
// Tests how CompactionIterator work together with SnapshotChecker.
|
|
760
886
|
class CompactionIteratorWithSnapshotCheckerTest
|
|
761
887
|
: public CompactionIteratorTest {
|