@nxtedition/rocksdb 8.2.0-alpha.1 → 8.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +11 -74
- package/binding.gyp +7 -5
- package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
- package/deps/rocksdb/rocksdb/TARGETS +7 -0
- package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
- package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
- package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
- package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
- package/deps/rocksdb/rocksdb/db/c.cc +90 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
- package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
- package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
- package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
- package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
- package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
- package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
- package/deps/rocksdb/rocksdb/src.mk +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
- package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
- package/deps/rocksdb/rocksdb.gyp +6 -7
- package/index.js +0 -6
- package/package.json +1 -1
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/deps/liburing/liburing.gyp +0 -20
- package/tmp/test.js +0 -7
The hunks below are from package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc:

@@ -226,6 +226,15 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
 bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
   assert(c_iter.Valid());
   const Slice& internal_key = c_iter.key();
+#ifndef NDEBUG
+  bool should_stop = false;
+  std::pair<bool*, const Slice> p{&should_stop, internal_key};
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
+  if (should_stop) {
+    return true;
+  }
+#endif  // NDEBUG
   const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
   const InternalKeyComparator* icmp =
       &compaction_->column_family_data()->internal_comparator();
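The hunk above adds a debug-only sync point to `ShouldStopBefore()` so tests can force an output-file cut at an arbitrary key. As a rough, hedged sketch (not part of this package's diff; `ForceFileCutAtKey` and `target_user_key` are made-up names), a RocksDB-internal test built without `NDEBUG` could hook it roughly like this, assuming the `SyncPoint` utility from `test_util/sync_point.h` and the `std::pair` payload shown above:

```cpp
// Hypothetical test helper: ask ShouldStopBefore() to cut a new output file
// right before a chosen user key, via the sync point added in the hunk above.
#include <string>
#include <utility>
#include "db/dbformat.h"
#include "test_util/sync_point.h"

namespace {
void ForceFileCutAtKey(std::string target_user_key) {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sp->SetCallBack(
      "CompactionOutputs::ShouldStopBefore::manual_decision",
      [target_user_key](void* arg) {
        // Payload is {&should_stop, current internal key}; see the diff above.
        auto* p =
            static_cast<std::pair<bool*, const ROCKSDB_NAMESPACE::Slice>*>(arg);
        if (ROCKSDB_NAMESPACE::ExtractUserKey(p->second) ==
            ROCKSDB_NAMESPACE::Slice(target_user_key)) {
          *p->first = true;  // request a file cut before this key
        }
      });
  sp->EnableProcessing();
}
}  // namespace
```

A test would then run a compaction and afterwards call `DisableProcessing()` / `ClearAllCallBacks()` on the sync point to reset it.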
@@ -347,8 +356,14 @@ Status CompactionOutputs::AddToOutput(
     const CompactionFileOpenFunc& open_file_func,
     const CompactionFileCloseFunc& close_file_func) {
   Status s;
+  bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
+  if (is_range_del && compaction_->bottommost_level()) {
+    // We don't consider range tombstone for bottommost level since:
+    // 1. there is no grandparent and hence no overlap to consider
+    // 2. range tombstone may be dropped at bottommost level.
+    return s;
+  }
   const Slice& key = c_iter.key();
-
   if (ShouldStopBefore(c_iter) && HasBuilder()) {
     s = close_file_func(*this, c_iter.InputStatus(), key);
     if (!s.ok()) {
@@ -358,6 +373,13 @@ Status CompactionOutputs::AddToOutput(
     grandparent_boundary_switched_num_ = 0;
     grandparent_overlapped_bytes_ =
         GetCurrentKeyGrandparentOverlappedBytes(key);
+    if (UNLIKELY(is_range_del)) {
+      // lower bound for this new output file, this is needed as the lower bound
+      // does not come from the smallest point key in this case.
+      range_tombstone_lower_bound_.DecodeFrom(key);
+    } else {
+      range_tombstone_lower_bound_.Clear();
+    }
   }

   // Open output file if necessary
@@ -368,6 +390,17 @@ Status CompactionOutputs::AddToOutput(
     }
   }

+  // c_iter may emit range deletion keys, so update `last_key_for_partitioner_`
+  // here before returning below when `is_range_del` is true
+  if (partitioner_) {
+    last_key_for_partitioner_.assign(c_iter.user_key().data_,
+                                     c_iter.user_key().size_);
+  }
+
+  if (UNLIKELY(is_range_del)) {
+    return s;
+  }
+
   assert(builder_ != nullptr);
   const Slice& value = c_iter.value();
   s = current_output().validator.Add(key, value);
@@ -391,28 +424,33 @@ Status CompactionOutputs::AddToOutput(
   s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
                                              ikey.type);

-  if (partitioner_) {
-    last_key_for_partitioner_.assign(c_iter.user_key().data_,
-                                     c_iter.user_key().size_);
-  }
-
   return s;
 }

+namespace {
+void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
+                    const size_t ts_sz) {
+  if (ts_sz) {
+    static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+    if (ts_sz <= strlen(kTsMax)) {
+      internal_key = InternalKey(user_key, kMaxSequenceNumber,
+                                 kTypeRangeDeletion, Slice(kTsMax, ts_sz));
+    } else {
+      internal_key =
+          InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
+                      std::string(ts_sz, '\xff'));
+    }
+  } else {
+    internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
+  }
+}
+}  // namespace
+
 Status CompactionOutputs::AddRangeDels(
     const Slice* comp_start_user_key, const Slice* comp_end_user_key,
     CompactionIterationStats& range_del_out_stats, bool bottommost_level,
     const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
     const Slice& next_table_min_key, const std::string& full_history_ts_low) {
-  assert(HasRangeDel());
-  FileMetaData& meta = current_output().meta;
-  const Comparator* ucmp = icmp.user_comparator();
-
-  Slice lower_bound_guard, upper_bound_guard;
-  std::string smallest_user_key;
-  const Slice *lower_bound, *upper_bound;
-  bool lower_bound_from_sub_compact = false;
-
   // The following example does not happen since
   // CompactionOutput::ShouldStopBefore() always return false for the first
   // point key. But we should consider removing this dependency. Suppose for the
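The `SetMaxSeqAndTs()` helper added above builds bounds of the form `user_key@kMaxSequenceNumber, kTypeRangeDeletion` (plus a maximal timestamp when user-defined timestamps are enabled). A minimal sketch of the ordering property these bounds rely on, assuming RocksDB's internal `db/dbformat.h` types (`BoundOrderingSketch` is our name, not from the diff):

```cpp
// Sketch only: a bound at kMaxSequenceNumber/kTypeRangeDeletion sorts before
// every real entry with the same user key, so an earlier output file can be
// extended up to such a bound without overlapping the next file in
// internal-key order.
#include <cassert>
#include "db/dbformat.h"

void BoundOrderingSketch() {
  using namespace ROCKSDB_NAMESPACE;
  const InternalKeyComparator icmp(BytewiseComparator());
  InternalKey bound("k", kMaxSequenceNumber, kTypeRangeDeletion);
  InternalKey newest_real_entry("k", /*sequence=*/42, kTypeValue);
  // Internal keys sort by ascending user key, then descending sequence
  // number, so the kMaxSequenceNumber bound compares smallest for "k".
  assert(icmp.Compare(bound.Encode(), newest_real_entry.Encode()) < 0);
}
```

Because of this ordering, the later file can still begin at the boundary user key at whatever real version it contains without violating the LSM non-overlap invariant.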
@@ -424,98 +462,134 @@ Status CompactionOutputs::AddRangeDels(
   // Then meta.smallest will be set to comp_start_user_key@seqno
   // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
   // which violates the assumption that meta.smallest should be <= meta.largest.
+  assert(HasRangeDel());
+  FileMetaData& meta = current_output().meta;
+  const Comparator* ucmp = icmp.user_comparator();
+  InternalKey lower_bound_buf, upper_bound_buf;
+  Slice lower_bound_guard, upper_bound_guard;
+  std::string smallest_user_key;
+  const Slice *lower_bound, *upper_bound;
+
+  // We first determine the internal key lower_bound and upper_bound for
+  // this output file. All and only range tombstones that overlap with
+  // [lower_bound, upper_bound] should be added to this file. File
+  // boundaries (meta.smallest/largest) should be updated accordingly when
+  // extended by range tombstones.
   size_t output_size = outputs_.size();
   if (output_size == 1) {
-    //
-    //
-
-
-
+    // This is the first file in the subcompaction.
+    //
+    // When outputting a range tombstone that spans a subcompaction boundary,
+    // the files on either side of that boundary need to include that
+    // boundary's user key. Otherwise, the spanning range tombstone would lose
+    // coverage.
+    //
+    // To achieve this while preventing files from overlapping in internal key
+    // (an LSM invariant violation), we allow the earlier file to include the
+    // boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
+    // later file can begin at the boundary user key at the newest key version
+    // it contains. At this point that version number is unknown since we have
+    // not processed the range tombstones yet, so permit any version. Same story
+    // applies to timestamp, and a non-nullptr `comp_start_user_key` should have
+    // `kMaxTs` here, which similarly permits any timestamp.
+    if (comp_start_user_key) {
+      lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      lower_bound_guard = lower_bound_buf.Encode();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+  } else {
     // For subsequent output tables, only include range tombstones from min
     // key onwards since the previous file was extended to contain range
     // tombstones falling before min key.
-
-
-
-
-    lower_bound = nullptr;
-  }
-  if (!next_table_min_key.empty()) {
-    // This may be the last file in the subcompaction in some cases, so we
-    // need to compare the end key of subcompaction with the next file start
-    // key. When the end key is chosen by the subcompaction, we know that
-    // it must be the biggest key in output file. Therefore, it is safe to
-    // use the smaller key as the upper bound of the output file, to ensure
-    // that there is no overlapping between different output files.
-    upper_bound_guard = ExtractUserKey(next_table_min_key);
-    if (comp_end_user_key != nullptr &&
-        ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
-            0) {
-      upper_bound = comp_end_user_key;
+    if (range_tombstone_lower_bound_.size() > 0) {
+      assert(meta.smallest.size() == 0 ||
+             icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
+      lower_bound_guard = range_tombstone_lower_bound_.Encode();
     } else {
+      assert(meta.smallest.size() > 0);
+      lower_bound_guard = meta.smallest.Encode();
+    }
+    lower_bound = &lower_bound_guard;
+  }
+
+  const size_t ts_sz = ucmp->timestamp_size();
+  if (next_table_min_key.empty()) {
+    // Last file of the subcompaction.
+    if (comp_end_user_key) {
+      upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      upper_bound_guard = upper_bound_buf.Encode();
       upper_bound = &upper_bound_guard;
+    } else {
+      upper_bound = nullptr;
     }
   } else {
-    //
-    //
-
-
-
-
-
-
-
-
+    // There is another file coming whose coverage will begin at
+    // `next_table_min_key`. The current file needs to extend range tombstone
+    // coverage through its own keys (through `meta.largest`) and through user
+    // keys preceding `next_table_min_key`'s user key.
+    ParsedInternalKey next_table_min_key_parsed;
+    ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
+                     false /* log_err_key */)
+        .PermitUncheckedError();
+    assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
+    assert(meta.largest.size() == 0 ||
+           icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
+    assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
+    if (meta.largest.size() > 0 &&
+        ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
+                                    next_table_min_key_parsed.user_key)) {
+      // Caution: this assumes meta.largest.Encode() lives longer than
+      // upper_bound, which is only true if meta.largest is never updated.
+      // This just happens to be the case here since meta.largest serves
+      // as the upper_bound.
+      upper_bound_guard = meta.largest.Encode();
+    } else {
+      SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
+                     ts_sz);
+      upper_bound_guard = upper_bound_buf.Encode();
+    }
+    upper_bound = &upper_bound_guard;
+  }
+  if (lower_bound && upper_bound &&
+      icmp.Compare(*lower_bound, *upper_bound) > 0) {
+    assert(meta.smallest.size() == 0 &&
+           ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                       ExtractUserKey(*upper_bound)));
+    // This can only happen when lower_bound have the same user key as
+    // next_table_min_key and that there is no point key in the current
+    // compaction output file.
+    return Status::OK();
   }
-
   // The end key of the subcompaction must be bigger or equal to the upper
   // bound. If the end of subcompaction is null or the upper bound is null,
   // it means that this file is the last file in the compaction. So there
   // will be no overlapping between this file and others.
   assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
-         ucmp->CompareWithoutTimestamp(*upper_bound,
-
-
-  // Position the range tombstone output iterator. There may be tombstone
-  // fragments that are entirely out of range, so make sure that we do not
-  // include those.
-  if (lower_bound != nullptr) {
-    it->Seek(*lower_bound);
-  } else {
-    it->SeekToFirst();
-  }
+         ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
+                                       *comp_end_user_key) <= 0);
+  auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
   Slice last_tombstone_start_user_key{};
-
+  bool reached_lower_bound = false;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
     auto tombstone = it->Tombstone();
-
-
-
-
-
-
-
-
-
-    // to the point keys or endpoints of the current file.
-    // If the current SST ends at the same user key at upper_bound,
-    // i.e., `has_overlapping_endpoints == true`, AND the tombstone has
-    // the same start key as upper_bound, i.e., cmp == 0, then
-    // the tombstone is relevant only if the tombstone's sequence number
-    // is no larger than this file's largest key's sequence number. This
-    // is because the upper bound to truncate this file's range tombstone
-    // will be meta.largest in this case, and any tombstone that starts after
-    // it will not be relevant.
-    if (cmp < 0) {
-      break;
-    } else if (cmp == 0) {
-      if (!has_overlapping_endpoints ||
-          tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) {
-        break;
-      }
-    }
+    auto kv = tombstone.Serialize();
+    InternalKey tombstone_end = tombstone.SerializeEndKey();
+    // TODO: the underlying iterator should support clamping the bounds.
+    // tombstone_end.Encode is of form user_key@kMaxSeqno
+    // if it is equal to lower_bound, there is no need to include
+    // such range tombstone.
+    if (!reached_lower_bound && lower_bound &&
+        icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
+      continue;
     }
+    assert(!lower_bound ||
+           icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
+    reached_lower_bound = true;

-    const size_t ts_sz = ucmp->timestamp_size();
     // Garbage collection for range tombstones.
     // If user-defined timestamp is enabled, range tombstones are dropped if
     // they are at bottommost_level, below full_history_ts_low and not visible
@@ -534,83 +608,93 @@ Status CompactionOutputs::AddRangeDels(
       continue;
     }

-    auto kv = tombstone.Serialize();
     assert(lower_bound == nullptr ||
-           ucmp->CompareWithoutTimestamp(*lower_bound,
+           ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                         kv.second) < 0);
+    InternalKey tombstone_start = kv.first;
+    if (lower_bound &&
+        ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                      ExtractUserKey(*lower_bound)) < 0) {
+      // This just updates the non-timestamp portion of `tombstone_start`'s user
+      // key. Ideally there would be a simpler API usage
+      ParsedInternalKey tombstone_start_parsed;
+      ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
+                       false /* log_err_key */)
+          .PermitUncheckedError();
+      // timestamp should be from where sequence number is from, which is from
+      // tombstone in this case
+      std::string ts =
+          tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
+              .ToString();
+      tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
+      tombstone_start.SetFrom(tombstone_start_parsed, ts);
+    }
+    if (upper_bound != nullptr &&
+        icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
+      break;
+    }
+    // Here we show that *only* range tombstones that overlap with
+    // [lower_bound, upper_bound] are added to the current file, and
+    // sanity checking invariants that should hold:
+    // - [tombstone_start, tombstone_end] overlaps with [lower_bound,
+    // upper_bound]
+    // - meta.smallest <= meta.largest
+    // Corresponding assertions are made, the proof is broken is any of them
+    // fails.
+    // TODO: show that *all* range tombstones that overlap with
+    // [lower_bound, upper_bound] are added.
+    // TODO: some invariant about boundaries are correctly updated.
+    //
+    // Note that `tombstone_start` is updated in the if condition above, we use
+    // tombstone_start to refer to its initial value, i.e.,
+    // it->Tombstone().first, and use tombstone_start* to refer to its value
+    // after the update.
+    //
+    // To show [lower_bound, upper_bound] overlaps with [tombstone_start,
+    // tombstone_end]:
+    // lower_bound <= upper_bound from the if condition right after all
+    // bounds are initialized. We assume each tombstone fragment has
+    // start_key.user_key < end_key.user_key, so
+    // tombstone_start < tombstone_end by
+    // FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
+    // non-emtpy. The flag `reached_lower_bound` and the if logic before it
+    // ensures lower_bound <= tombstone_end. tombstone_start is only updated
+    // if it has a smaller user_key than lower_bound user_key, so
+    // tombstone_start <= tombstone_start*. The above if condition implies
+    // tombstone_start* <= upper_bound. So we have
+    // tombstone_start <= upper_bound and lower_bound <= tombstone_end
+    // and the two ranges overlap.
+    //
+    // To show meta.smallest <= meta.largest:
+    // From the implementation of UpdateBoundariesForRange(), it suffices to
+    // prove that when it is first called in this function, its parameters
+    // satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
+    // and end = min(tombstone_end, upper_bound). From the above proof we have
+    // lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
+    // to show that tombstone_start* <= min(tombstone_end, upper_bound).
+    // Note that tombstone_start*.user_key = max(tombstone_start.user_key,
+    // lower_bound.user_key). Assuming tombstone_end always has
+    // kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
+    // Since lower_bound <= tombstone_end and lower_bound.seqno <
+    // tombstone_end.seqno (in absolute number order, not internal key order),
+    // lower_bound.user_key < tombstone_end.user_key.
+    // Since lower_bound.user_key < tombstone_end.user_key and
+    // tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
+    // tombstone_end. Since tombstone_start* <= upper_bound from the above proof
+    // and tombstone_start* < tombstone_end, tombstone_start* <=
+    // min(tombstone_end, upper_bound), so the two ranges overlap.
+
     // Range tombstone is not supported by output validator yet.
     builder_->Add(kv.first.Encode(), kv.second);
-
-
-
-        ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
-                                      *lower_bound) <= 0) {
-      // Pretend the smallest key has the same user key as lower_bound
-      // (the max key in the previous table or subcompaction) in order for
-      // files to appear key-space partitioned.
-      if (lower_bound_from_sub_compact) {
-        // When lower_bound is chosen by a subcompaction
-        // (lower_bound_from_sub_compact), we know that subcompactions over
-        // smaller keys cannot contain any keys at lower_bound. We also know
-        // that smaller subcompactions exist, because otherwise the
-        // subcompaction woud be unbounded on the left. As a result, we know
-        // that no other files on the output level will contain actual keys at
-        // lower_bound (an output file may have a largest key of
-        // lower_bound@kMaxSequenceNumber, but this only indicates a large range
-        // tombstone was truncated). Therefore, it is safe to use the
-        // tombstone's sequence number, to ensure that keys at lower_bound at
-        // lower levels are covered by truncated tombstones.
-        if (ts_sz) {
-          assert(tombstone.ts_.size() == ts_sz);
-          smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
-                                           kTypeRangeDeletion, tombstone.ts_);
-        } else {
-          smallest_candidate =
-              InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
-        }
-      } else {
-        // If lower_bound was chosen by the smallest data key in the file,
-        // choose lowest seqnum so this file's smallest internal key comes
-        // after the previous file's largest. The fake seqnum is OK because
-        // the read path's file-picking code only considers user key.
-        smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
-      }
+    if (lower_bound &&
+        icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
+      tombstone_start.DecodeFrom(*lower_bound);
     }
-
-
-    if (upper_bound != nullptr &&
-        ucmp->CompareWithoutTimestamp(*upper_bound,
-                                      largest_candidate.user_key()) <= 0) {
-      // Pretend the largest key has the same user key as upper_bound (the
-      // min key in the following table or subcompaction) in order for files
-      // to appear key-space partitioned.
-      //
-      // Choose highest seqnum so this file's largest internal key comes
-      // before the next file's/subcompaction's smallest. The fake seqnum is
-      // OK because the read path's file-picking code only considers the
-      // user key portion.
-      //
-      // Note Seek() also creates InternalKey with (user_key,
-      // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
-      // kTypeRangeDeletion (0xF), so the range tombstone comes before the
-      // Seek() key in InternalKey's ordering. So Seek() will look in the
-      // next file for the user key
-      if (ts_sz) {
-        static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
-        if (ts_sz <= strlen(kTsMax)) {
-          largest_candidate =
-              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
-                          Slice(kTsMax, ts_sz));
-        } else {
-          largest_candidate =
-              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
-                          std::string(ts_sz, '\xff'));
-        }
-      } else {
-        largest_candidate =
-            InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
-      }
+    if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
+      tombstone_end.DecodeFrom(*upper_bound);
     }
-
+    assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
+    meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
                                   tombstone.seq_, icmp);
     if (!bottommost_level) {
       bool start_user_key_changed =
@@ -618,17 +702,8 @@ Status CompactionOutputs::AddRangeDels(
           ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
                                         it->start_key()) < 0;
       last_tombstone_start_user_key = it->start_key();
-      // Range tombstones are truncated at file boundaries
-      if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
-        tombstone_start = meta.smallest;
-      }
-      if (icmp.Compare(tombstone_end, meta.largest) > 0) {
-        tombstone_end = meta.largest;
-      }
-      // this assertion validates invariant (2) in the comment below.
-      assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
      if (start_user_key_changed) {
-        //
+        // If tombstone_start >= tombstone_end, then either no key range is
        // covered, or that they have the same user key. If they have the same
        // user key, then the internal key range should only be within this
        // level, and no keys from older levels is covered.
@@ -646,138 +721,6 @@ Status CompactionOutputs::AddRangeDels(
        }
      }
    }
-    // TODO: show invariants that ensure all necessary range tombstones are
-    // added
-    // and that file boundaries ensure no coverage is lost.
-    // Each range tombstone with internal key range [tombstone_start,
-    // tombstone_end] is being added to the current compaction output file here.
-    // The range tombstone is going to be truncated at range [meta.smallest,
-    // meta.largest] during reading/scanning. We should maintain invariants
-    // (1) meta.smallest <= meta.largest and,
-    // (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest]
-    // overlaps, as there is no point adding range tombstone with a range
-    // outside the file's range.
-    // Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to
-    // use either open or closed range. Using closed range here to make
-    // reasoning easier, and it is more consistent with an ongoing work that
-    // tries to simplify this method.
-    //
-    // There are two cases:
-    // Case 1. Output file has no point key:
-    // First we show this case only happens when the entire compaction output
-    // is range tombstone only. This is true if CompactionIterator does not
-    // emit any point key. Suppose CompactionIterator emits some point key.
-    // Based on the assumption that CompactionOutputs::ShouldStopBefore()
-    // always return false for the first point key, the first compaction
-    // output file always contains a point key. Each new compaction output
-    // file is created if there is a point key for which ShouldStopBefore()
-    // returns true, and the point key would be added to the new compaction
-    // output file. So each new compaction file always contains a point key.
-    // So Case 1 only happens when CompactionIterator does not emit any
-    // point key.
-    //
-    // To show (1) meta.smallest <= meta.largest:
-    // Since the compaction output is range tombstone only, `lower_bound` and
-    // `upper_bound` are either null or comp_start/end_user_key respectively.
-    // According to how UpdateBoundariesForRange() is implemented, it blindly
-    // updates meta.smallest and meta.largest to smallest_candidate and
-    // largest_candidate the first time it is called. Subsequently, it
-    // compares input parameter with meta.smallest and meta.largest and only
-    // updates them when input is smaller/larger. So we only need to show
-    // smallest_candidate <= largest_candidate the first time
-    // UpdateBoundariesForRange() is called. Here we show something stronger
-    // that smallest_candidate.user_key < largest_candidate.user_key always
-    // hold for Case 1.
-    // We assume comp_start_user_key < comp_end_user_key, if provided. We
-    // assume that tombstone_start < tombstone_end. This assumption is based
-    // on that each fragment in FragmentedTombstoneList has
-    // start_key < end_key (user_key) and that
-    // FragmentedTombstoneIterator::Tombstone() returns the pair
-    // (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key).
-    // The logic in this loop sets smallest_candidate to
-    // max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with
-    // op_type kTypeRangeDeletion, largest_candidate to
-    // min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with
-    // op_type kTypeRangeDeletion. When a bound is null, there is no
-    // truncation on that end. To show that smallest_candidate.user_key <
-    // largest_candidate.user_key, it suffices to show
-    // tombstone_start.user_key < comp_end_user_key (if not null) AND
-    // comp_start_user_key (if not null) < tombstone_end.user_key.
-    // Since the file has no point key, `has_overlapping_endpoints` is false.
-    // In the first sanity check of this for-loop, we compare
-    // tombstone_start.user_key against upper_bound = comp_end_user_key,
-    // and only proceed if tombstone_start.user_key < comp_end_user_key.
-    // We assume FragmentedTombstoneIterator::Seek(k) lands
-    // on a tombstone with end_key > k. So the call it->Seek(*lower_bound)
-    // above implies compact_start_user_key < tombstone_end.user_key.
-    //
-    // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
-    // meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
-    // In the proof for (1) we have shown that
-    // smallest_candidate <= largest_candidate. Since tombstone_start <=
-    // smallest_candidate <= largest_candidate <= tombstone_end, for (2) to
-    // hold, it suffices to show that [smallest_candidate, largest_candidate]
-    // overlaps with [meta.smallest, meta.largest]. too.
-    // Given meta.smallest <= meta.largest shown above, we need to show
-    // that it is impossible to have largest_candidate < meta.smallest or
-    // meta.largest < smallest_candidate. If the above
-    // meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate)
-    // updates meta.largest or meta.smallest, then the two ranges overlap.
-    // So we assume meta.UpdateBoundariesForRange(smallest_candidate,
-    // largest_candidate) did not update meta.smallest nor meta.largest, which
-    // means meta.smallest < smallest_candidate and largest_candidate <
-    // meta.largest.
-    //
-    // Case 2. Output file has >= 1 point key. This means meta.smallest and
-    // meta.largest are not empty when AddRangeDels() is called.
-    // To show (1) meta.smallest <= meta.largest:
-    // Assume meta.smallest <= meta.largest when AddRangeDels() is called,
-    // this follow from how UpdateBoundariesForRange() is implemented where it
-    // takes min or max to update meta.smallest or meta.largest.
-    //
-    // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
-    // meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
-    // When smallest_candidate <= largest_candidate, the proof in Case 1
-    // applies, so we only need to show (2) holds when smallest_candidate >
-    // largest_candidate. When both bounds are either null or from
-    // subcompaction boundary, the proof in Case 1 applies, so we only need to
-    // show (2) holds when at least one bound is from a point key (either
-    // meta.smallest for lower bound or next_table_min_key for upper bound).
-    //
-    // Suppose lower bound is meta.smallest.user_key. The call
-    // it->Seek(*lower_bound) implies tombstone_end.user_key >
-    // meta.smallest.user_key. We have smallest_candidate.user_key =
-    // max(tombstone_start.user_key, meta.smallest.user_key). For
-    // smallest_candidate to be > largest_candidate, we need
-    // largest_candidate.user_key = upper_bound = smallest_candidate.user_key,
-    // where tombstone_end is truncated to largest_candidate.
-    // Subcase 1:
-    // Suppose largest_candidate.user_key = comp_end_user_key (there is no
-    // next point key). Subcompaction ensures any point key from this
-    // subcompaction has a user_key < comp_end_user_key, so 1)
-    // meta.smallest.user_key < comp_end_user_key, 2)
-    // `has_overlapping_endpoints` is false, and the first if condition in
-    // this for-loop ensures tombstone_start.user_key < comp_end_user_key. So
-    // smallest_candidate.user_key < largest_candidate.user_key. This case
-    // cannot happen when smallest > largest_candidate.
-    // Subcase 2:
-    // Suppose largest_candidate.user_key = next_table_min_key.user_key.
-    // The first if condition in this for-loop together with
-    // smallest_candidate.user_key = next_table_min_key.user_key =
-    // upper_bound implies `has_overlapping_endpoints` is true (so meta
-    // largest.user_key = upper_bound) and
-    // tombstone.seq_ < meta.largest.seqno. So
-    // tombstone_start < meta.largest < tombstone_end.
-    //
-    // Suppose lower bound is comp_start_user_key and upper_bound is
-    // next_table_min_key. The call it->Seek(*lower_bound) implies we have
-    // tombstone_end_key.user_key > comp_start_user_key. So
-    // tombstone_end_key.user_key > smallest_candidate.user_key. For
-    // smallest_candidate to be > largest_candidate, we need
-    // tombstone_start.user_key = largest_candidate.user_key = upper_bound =
-    // next_table_min_key.user_key. This means `has_overlapping_endpoints` is
-    // true (so meta.largest.user_key = upper_bound) and tombstone.seq_ <
-    // meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end.
  }
  return Status::OK();
}