@nxtedition/rocksdb 8.2.0-alpha.1 → 8.2.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (149)
  1. package/binding.cc +11 -74
  2. package/binding.gyp +7 -5
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  4. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  8. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  10. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  14. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  17. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  19. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  20. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  23. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  24. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  25. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  29. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  30. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  31. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  32. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  33. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  40. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  41. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  42. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  43. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  44. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  51. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  52. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  53. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  54. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  55. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  56. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  57. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  58. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  59. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  60. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  62. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  63. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  65. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  66. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  67. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  68. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  69. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  70. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  71. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  74. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  77. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  78. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  80. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  81. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  88. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  89. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  92. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  93. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  95. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  96. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  97. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  99. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  100. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  101. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  102. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  103. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  104. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  105. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  106. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  107. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  108. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  109. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  112. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  113. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  114. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  115. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  116. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  117. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  118. package/deps/rocksdb/rocksdb/src.mk +4 -0
  119. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  123. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  124. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  133. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  135. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  136. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  138. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  139. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  141. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  142. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  143. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  144. package/deps/rocksdb/rocksdb.gyp +6 -7
  145. package/index.js +0 -6
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
  148. package/deps/liburing/liburing.gyp +0 -20
  149. package/tmp/test.js +0 -7
package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc
@@ -226,6 +226,15 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
  bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
  assert(c_iter.Valid());
  const Slice& internal_key = c_iter.key();
+ #ifndef NDEBUG
+ bool should_stop = false;
+ std::pair<bool*, const Slice> p{&should_stop, internal_key};
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
+ if (should_stop) {
+ return true;
+ }
+ #endif // NDEBUG
  const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
  const InternalKeyComparator* icmp =
  &compaction_->column_family_data()->internal_comparator();
@@ -347,8 +356,14 @@ Status CompactionOutputs::AddToOutput(
  const CompactionFileOpenFunc& open_file_func,
  const CompactionFileCloseFunc& close_file_func) {
  Status s;
+ bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
+ if (is_range_del && compaction_->bottommost_level()) {
+ // We don't consider range tombstone for bottommost level since:
+ // 1. there is no grandparent and hence no overlap to consider
+ // 2. range tombstone may be dropped at bottommost level.
+ return s;
+ }
  const Slice& key = c_iter.key();
-
  if (ShouldStopBefore(c_iter) && HasBuilder()) {
  s = close_file_func(*this, c_iter.InputStatus(), key);
  if (!s.ok()) {
@@ -358,6 +373,13 @@ Status CompactionOutputs::AddToOutput(
  grandparent_boundary_switched_num_ = 0;
  grandparent_overlapped_bytes_ =
  GetCurrentKeyGrandparentOverlappedBytes(key);
+ if (UNLIKELY(is_range_del)) {
+ // lower bound for this new output file, this is needed as the lower bound
+ // does not come from the smallest point key in this case.
+ range_tombstone_lower_bound_.DecodeFrom(key);
+ } else {
+ range_tombstone_lower_bound_.Clear();
+ }
  }

  // Open output file if necessary
@@ -368,6 +390,17 @@ Status CompactionOutputs::AddToOutput(
  }
  }

+ // c_iter may emit range deletion keys, so update `last_key_for_partitioner_`
+ // here before returning below when `is_range_del` is true
+ if (partitioner_) {
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
+ c_iter.user_key().size_);
+ }
+
+ if (UNLIKELY(is_range_del)) {
+ return s;
+ }
+
  assert(builder_ != nullptr);
  const Slice& value = c_iter.value();
  s = current_output().validator.Add(key, value);
@@ -391,28 +424,33 @@ Status CompactionOutputs::AddToOutput(
  s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
  ikey.type);

- if (partitioner_) {
- last_key_for_partitioner_.assign(c_iter.user_key().data_,
- c_iter.user_key().size_);
- }
-
  return s;
  }

+ namespace {
+ void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
+ const size_t ts_sz) {
+ if (ts_sz) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz <= strlen(kTsMax)) {
+ internal_key = InternalKey(user_key, kMaxSequenceNumber,
+ kTypeRangeDeletion, Slice(kTsMax, ts_sz));
+ } else {
+ internal_key =
+ InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_sz, '\xff'));
+ }
+ } else {
+ internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+ }
+ } // namespace
+
  Status CompactionOutputs::AddRangeDels(
  const Slice* comp_start_user_key, const Slice* comp_end_user_key,
  CompactionIterationStats& range_del_out_stats, bool bottommost_level,
  const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
  const Slice& next_table_min_key, const std::string& full_history_ts_low) {
- assert(HasRangeDel());
- FileMetaData& meta = current_output().meta;
- const Comparator* ucmp = icmp.user_comparator();
-
- Slice lower_bound_guard, upper_bound_guard;
- std::string smallest_user_key;
- const Slice *lower_bound, *upper_bound;
- bool lower_bound_from_sub_compact = false;
-
  // The following example does not happen since
  // CompactionOutput::ShouldStopBefore() always return false for the first
  // point key. But we should consider removing this dependency. Suppose for the
@@ -424,98 +462,134 @@ Status CompactionOutputs::AddRangeDels(
  // Then meta.smallest will be set to comp_start_user_key@seqno
  // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
  // which violates the assumption that meta.smallest should be <= meta.largest.
+ assert(HasRangeDel());
+ FileMetaData& meta = current_output().meta;
+ const Comparator* ucmp = icmp.user_comparator();
+ InternalKey lower_bound_buf, upper_bound_buf;
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+
+ // We first determine the internal key lower_bound and upper_bound for
+ // this output file. All and only range tombstones that overlap with
+ // [lower_bound, upper_bound] should be added to this file. File
+ // boundaries (meta.smallest/largest) should be updated accordingly when
+ // extended by range tombstones.
  size_t output_size = outputs_.size();
  if (output_size == 1) {
- // For the first output table, include range tombstones before the min
- // key but after the subcompaction boundary.
- lower_bound = comp_start_user_key;
- lower_bound_from_sub_compact = true;
- } else if (meta.smallest.size() > 0) {
+ // This is the first file in the subcompaction.
+ //
+ // When outputting a range tombstone that spans a subcompaction boundary,
+ // the files on either side of that boundary need to include that
+ // boundary's user key. Otherwise, the spanning range tombstone would lose
+ // coverage.
+ //
+ // To achieve this while preventing files from overlapping in internal key
+ // (an LSM invariant violation), we allow the earlier file to include the
+ // boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
+ // later file can begin at the boundary user key at the newest key version
+ // it contains. At this point that version number is unknown since we have
+ // not processed the range tombstones yet, so permit any version. Same story
+ // applies to timestamp, and a non-nullptr `comp_start_user_key` should have
+ // `kMaxTs` here, which similarly permits any timestamp.
+ if (comp_start_user_key) {
+ lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ lower_bound_guard = lower_bound_buf.Encode();
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ } else {
  // For subsequent output tables, only include range tombstones from min
  // key onwards since the previous file was extended to contain range
  // tombstones falling before min key.
- smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
- lower_bound_guard = Slice(smallest_user_key);
- lower_bound = &lower_bound_guard;
- } else {
- lower_bound = nullptr;
- }
- if (!next_table_min_key.empty()) {
- // This may be the last file in the subcompaction in some cases, so we
- // need to compare the end key of subcompaction with the next file start
- // key. When the end key is chosen by the subcompaction, we know that
- // it must be the biggest key in output file. Therefore, it is safe to
- // use the smaller key as the upper bound of the output file, to ensure
- // that there is no overlapping between different output files.
- upper_bound_guard = ExtractUserKey(next_table_min_key);
- if (comp_end_user_key != nullptr &&
- ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
- 0) {
- upper_bound = comp_end_user_key;
+ if (range_tombstone_lower_bound_.size() > 0) {
+ assert(meta.smallest.size() == 0 ||
+ icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
+ lower_bound_guard = range_tombstone_lower_bound_.Encode();
  } else {
+ assert(meta.smallest.size() > 0);
+ lower_bound_guard = meta.smallest.Encode();
+ }
+ lower_bound = &lower_bound_guard;
+ }
+
+ const size_t ts_sz = ucmp->timestamp_size();
+ if (next_table_min_key.empty()) {
+ // Last file of the subcompaction.
+ if (comp_end_user_key) {
+ upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ upper_bound_guard = upper_bound_buf.Encode();
  upper_bound = &upper_bound_guard;
+ } else {
+ upper_bound = nullptr;
  }
  } else {
- // This is the last file in the subcompaction, so extend until the
- // subcompaction ends.
- upper_bound = comp_end_user_key;
- }
- bool has_overlapping_endpoints;
- if (upper_bound != nullptr && meta.largest.size() > 0) {
- has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
- meta.largest.user_key(), *upper_bound) == 0;
- } else {
- has_overlapping_endpoints = false;
+ // There is another file coming whose coverage will begin at
+ // `next_table_min_key`. The current file needs to extend range tombstone
+ // coverage through its own keys (through `meta.largest`) and through user
+ // keys preceding `next_table_min_key`'s user key.
+ ParsedInternalKey next_table_min_key_parsed;
+ ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
+ false /* log_err_key */)
+ .PermitUncheckedError();
+ assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
+ assert(meta.largest.size() == 0 ||
+ icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
+ assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
+ if (meta.largest.size() > 0 &&
+ ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
+ next_table_min_key_parsed.user_key)) {
+ // Caution: this assumes meta.largest.Encode() lives longer than
+ // upper_bound, which is only true if meta.largest is never updated.
+ // This just happens to be the case here since meta.largest serves
+ // as the upper_bound.
+ upper_bound_guard = meta.largest.Encode();
+ } else {
+ SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
+ ts_sz);
+ upper_bound_guard = upper_bound_buf.Encode();
+ }
+ upper_bound = &upper_bound_guard;
+ }
+ if (lower_bound && upper_bound &&
+ icmp.Compare(*lower_bound, *upper_bound) > 0) {
+ assert(meta.smallest.size() == 0 &&
+ ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
+ ExtractUserKey(*upper_bound)));
+ // This can only happen when lower_bound have the same user key as
+ // next_table_min_key and that there is no point key in the current
+ // compaction output file.
+ return Status::OK();
  }
-
  // The end key of the subcompaction must be bigger or equal to the upper
  // bound. If the end of subcompaction is null or the upper bound is null,
  // it means that this file is the last file in the compaction. So there
  // will be no overlapping between this file and others.
  assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
- ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
- auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
- has_overlapping_endpoints);
- // Position the range tombstone output iterator. There may be tombstone
- // fragments that are entirely out of range, so make sure that we do not
- // include those.
- if (lower_bound != nullptr) {
- it->Seek(*lower_bound);
- } else {
- it->SeekToFirst();
- }
+ ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
+ *comp_end_user_key) <= 0);
+ auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
  Slice last_tombstone_start_user_key{};
- for (; it->Valid(); it->Next()) {
+ bool reached_lower_bound = false;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
  auto tombstone = it->Tombstone();
- if (upper_bound != nullptr) {
- int cmp =
- ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
- // Tombstones starting after upper_bound only need to be included in
- // the next table.
- // If the current SST ends before upper_bound, i.e.,
- // `has_overlapping_endpoints == false`, we can also skip over range
- // tombstones that start exactly at upper_bound. Such range
- // tombstones will be included in the next file and are not relevant
- // to the point keys or endpoints of the current file.
- // If the current SST ends at the same user key at upper_bound,
- // i.e., `has_overlapping_endpoints == true`, AND the tombstone has
- // the same start key as upper_bound, i.e., cmp == 0, then
- // the tombstone is relevant only if the tombstone's sequence number
- // is no larger than this file's largest key's sequence number. This
- // is because the upper bound to truncate this file's range tombstone
- // will be meta.largest in this case, and any tombstone that starts after
- // it will not be relevant.
- if (cmp < 0) {
- break;
- } else if (cmp == 0) {
- if (!has_overlapping_endpoints ||
- tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) {
- break;
- }
- }
+ auto kv = tombstone.Serialize();
+ InternalKey tombstone_end = tombstone.SerializeEndKey();
+ // TODO: the underlying iterator should support clamping the bounds.
+ // tombstone_end.Encode is of form user_key@kMaxSeqno
+ // if it is equal to lower_bound, there is no need to include
+ // such range tombstone.
+ if (!reached_lower_bound && lower_bound &&
+ icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
+ continue;
  }
+ assert(!lower_bound ||
+ icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
+ reached_lower_bound = true;

- const size_t ts_sz = ucmp->timestamp_size();
  // Garbage collection for range tombstones.
  // If user-defined timestamp is enabled, range tombstones are dropped if
  // they are at bottommost_level, below full_history_ts_low and not visible
@@ -534,83 +608,93 @@ Status CompactionOutputs::AddRangeDels(
  continue;
  }

- auto kv = tombstone.Serialize();
  assert(lower_bound == nullptr ||
- ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+ ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
+ kv.second) < 0);
+ InternalKey tombstone_start = kv.first;
+ if (lower_bound &&
+ ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+ ExtractUserKey(*lower_bound)) < 0) {
+ // This just updates the non-timestamp portion of `tombstone_start`'s user
+ // key. Ideally there would be a simpler API usage
+ ParsedInternalKey tombstone_start_parsed;
+ ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
+ false /* log_err_key */)
+ .PermitUncheckedError();
+ // timestamp should be from where sequence number is from, which is from
+ // tombstone in this case
+ std::string ts =
+ tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
+ .ToString();
+ tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
+ tombstone_start.SetFrom(tombstone_start_parsed, ts);
+ }
+ if (upper_bound != nullptr &&
+ icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
+ break;
+ }
+ // Here we show that *only* range tombstones that overlap with
+ // [lower_bound, upper_bound] are added to the current file, and
+ // sanity checking invariants that should hold:
+ // - [tombstone_start, tombstone_end] overlaps with [lower_bound,
+ // upper_bound]
+ // - meta.smallest <= meta.largest
+ // Corresponding assertions are made, the proof is broken is any of them
+ // fails.
+ // TODO: show that *all* range tombstones that overlap with
+ // [lower_bound, upper_bound] are added.
+ // TODO: some invariant about boundaries are correctly updated.
+ //
+ // Note that `tombstone_start` is updated in the if condition above, we use
+ // tombstone_start to refer to its initial value, i.e.,
+ // it->Tombstone().first, and use tombstone_start* to refer to its value
+ // after the update.
+ //
+ // To show [lower_bound, upper_bound] overlaps with [tombstone_start,
+ // tombstone_end]:
+ // lower_bound <= upper_bound from the if condition right after all
+ // bounds are initialized. We assume each tombstone fragment has
+ // start_key.user_key < end_key.user_key, so
+ // tombstone_start < tombstone_end by
+ // FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
+ // non-emtpy. The flag `reached_lower_bound` and the if logic before it
+ // ensures lower_bound <= tombstone_end. tombstone_start is only updated
+ // if it has a smaller user_key than lower_bound user_key, so
+ // tombstone_start <= tombstone_start*. The above if condition implies
+ // tombstone_start* <= upper_bound. So we have
+ // tombstone_start <= upper_bound and lower_bound <= tombstone_end
+ // and the two ranges overlap.
+ //
+ // To show meta.smallest <= meta.largest:
+ // From the implementation of UpdateBoundariesForRange(), it suffices to
+ // prove that when it is first called in this function, its parameters
+ // satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
+ // and end = min(tombstone_end, upper_bound). From the above proof we have
+ // lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
+ // to show that tombstone_start* <= min(tombstone_end, upper_bound).
+ // Note that tombstone_start*.user_key = max(tombstone_start.user_key,
+ // lower_bound.user_key). Assuming tombstone_end always has
+ // kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
+ // Since lower_bound <= tombstone_end and lower_bound.seqno <
+ // tombstone_end.seqno (in absolute number order, not internal key order),
+ // lower_bound.user_key < tombstone_end.user_key.
+ // Since lower_bound.user_key < tombstone_end.user_key and
+ // tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
+ // tombstone_end. Since tombstone_start* <= upper_bound from the above proof
+ // and tombstone_start* < tombstone_end, tombstone_start* <=
+ // min(tombstone_end, upper_bound), so the two ranges overlap.
+
  // Range tombstone is not supported by output validator yet.
  builder_->Add(kv.first.Encode(), kv.second);
- InternalKey tombstone_start = std::move(kv.first);
- InternalKey smallest_candidate{tombstone_start};
- if (lower_bound != nullptr &&
- ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
- *lower_bound) <= 0) {
- // Pretend the smallest key has the same user key as lower_bound
- // (the max key in the previous table or subcompaction) in order for
- // files to appear key-space partitioned.
- if (lower_bound_from_sub_compact) {
- // When lower_bound is chosen by a subcompaction
- // (lower_bound_from_sub_compact), we know that subcompactions over
- // smaller keys cannot contain any keys at lower_bound. We also know
- // that smaller subcompactions exist, because otherwise the
- // subcompaction woud be unbounded on the left. As a result, we know
- // that no other files on the output level will contain actual keys at
- // lower_bound (an output file may have a largest key of
- // lower_bound@kMaxSequenceNumber, but this only indicates a large range
- // tombstone was truncated). Therefore, it is safe to use the
- // tombstone's sequence number, to ensure that keys at lower_bound at
- // lower levels are covered by truncated tombstones.
- if (ts_sz) {
- assert(tombstone.ts_.size() == ts_sz);
- smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
- kTypeRangeDeletion, tombstone.ts_);
- } else {
- smallest_candidate =
- InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
- }
- } else {
- // If lower_bound was chosen by the smallest data key in the file,
- // choose lowest seqnum so this file's smallest internal key comes
- // after the previous file's largest. The fake seqnum is OK because
- // the read path's file-picking code only considers user key.
- smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
- }
+ if (lower_bound &&
+ icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
+ tombstone_start.DecodeFrom(*lower_bound);
  }
- InternalKey tombstone_end = tombstone.SerializeEndKey();
- InternalKey largest_candidate{tombstone_end};
- if (upper_bound != nullptr &&
- ucmp->CompareWithoutTimestamp(*upper_bound,
- largest_candidate.user_key()) <= 0) {
- // Pretend the largest key has the same user key as upper_bound (the
- // min key in the following table or subcompaction) in order for files
- // to appear key-space partitioned.
- //
- // Choose highest seqnum so this file's largest internal key comes
- // before the next file's/subcompaction's smallest. The fake seqnum is
- // OK because the read path's file-picking code only considers the
- // user key portion.
- //
- // Note Seek() also creates InternalKey with (user_key,
- // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
- // kTypeRangeDeletion (0xF), so the range tombstone comes before the
- // Seek() key in InternalKey's ordering. So Seek() will look in the
- // next file for the user key
- if (ts_sz) {
- static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
- if (ts_sz <= strlen(kTsMax)) {
- largest_candidate =
- InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
- Slice(kTsMax, ts_sz));
- } else {
- largest_candidate =
- InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
- std::string(ts_sz, '\xff'));
- }
- } else {
- largest_candidate =
- InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
- }
+ if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
+ tombstone_end.DecodeFrom(*upper_bound);
  }
- meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
+ meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
  tombstone.seq_, icmp);
  if (!bottommost_level) {
  bool start_user_key_changed =
@@ -618,17 +702,8 @@ Status CompactionOutputs::AddRangeDels(
  ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
  it->start_key()) < 0;
  last_tombstone_start_user_key = it->start_key();
- // Range tombstones are truncated at file boundaries
- if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
- tombstone_start = meta.smallest;
- }
- if (icmp.Compare(tombstone_end, meta.largest) > 0) {
- tombstone_end = meta.largest;
- }
- // this assertion validates invariant (2) in the comment below.
- assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
  if (start_user_key_changed) {
- // if tombstone_start >= tombstone_end, then either no key range is
+ // If tombstone_start >= tombstone_end, then either no key range is
  // covered, or that they have the same user key. If they have the same
  // user key, then the internal key range should only be within this
  // level, and no keys from older levels is covered.
@@ -646,138 +721,6 @@ Status CompactionOutputs::AddRangeDels(
  }
  }
  }
- // TODO: show invariants that ensure all necessary range tombstones are
- // added
- // and that file boundaries ensure no coverage is lost.
- // Each range tombstone with internal key range [tombstone_start,
- // tombstone_end] is being added to the current compaction output file here.
- // The range tombstone is going to be truncated at range [meta.smallest,
- // meta.largest] during reading/scanning. We should maintain invariants
- // (1) meta.smallest <= meta.largest and,
- // (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest]
- // overlaps, as there is no point adding range tombstone with a range
- // outside the file's range.
- // Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to
- // use either open or closed range. Using closed range here to make
- // reasoning easier, and it is more consistent with an ongoing work that
- // tries to simplify this method.
- //
- // There are two cases:
- // Case 1. Output file has no point key:
- // First we show this case only happens when the entire compaction output
- // is range tombstone only. This is true if CompactionIterator does not
- // emit any point key. Suppose CompactionIterator emits some point key.
- // Based on the assumption that CompactionOutputs::ShouldStopBefore()
- // always return false for the first point key, the first compaction
- // output file always contains a point key. Each new compaction output
- // file is created if there is a point key for which ShouldStopBefore()
- // returns true, and the point key would be added to the new compaction
- // output file. So each new compaction file always contains a point key.
- // So Case 1 only happens when CompactionIterator does not emit any
- // point key.
- //
- // To show (1) meta.smallest <= meta.largest:
- // Since the compaction output is range tombstone only, `lower_bound` and
- // `upper_bound` are either null or comp_start/end_user_key respectively.
- // According to how UpdateBoundariesForRange() is implemented, it blindly
- // updates meta.smallest and meta.largest to smallest_candidate and
- // largest_candidate the first time it is called. Subsequently, it
- // compares input parameter with meta.smallest and meta.largest and only
- // updates them when input is smaller/larger. So we only need to show
- // smallest_candidate <= largest_candidate the first time
- // UpdateBoundariesForRange() is called. Here we show something stronger
- // that smallest_candidate.user_key < largest_candidate.user_key always
- // hold for Case 1.
- // We assume comp_start_user_key < comp_end_user_key, if provided. We
- // assume that tombstone_start < tombstone_end. This assumption is based
- // on that each fragment in FragmentedTombstoneList has
- // start_key < end_key (user_key) and that
- // FragmentedTombstoneIterator::Tombstone() returns the pair
- // (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key).
- // The logic in this loop sets smallest_candidate to
- // max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with
- // op_type kTypeRangeDeletion, largest_candidate to
- // min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with
- // op_type kTypeRangeDeletion. When a bound is null, there is no
- // truncation on that end. To show that smallest_candidate.user_key <
- // largest_candidate.user_key, it suffices to show
- // tombstone_start.user_key < comp_end_user_key (if not null) AND
- // comp_start_user_key (if not null) < tombstone_end.user_key.
- // Since the file has no point key, `has_overlapping_endpoints` is false.
- // In the first sanity check of this for-loop, we compare
- // tombstone_start.user_key against upper_bound = comp_end_user_key,
- // and only proceed if tombstone_start.user_key < comp_end_user_key.
- // We assume FragmentedTombstoneIterator::Seek(k) lands
- // on a tombstone with end_key > k. So the call it->Seek(*lower_bound)
- // above implies compact_start_user_key < tombstone_end.user_key.
- //
- // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
- // meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
- // In the proof for (1) we have shown that
- // smallest_candidate <= largest_candidate. Since tombstone_start <=
- // smallest_candidate <= largest_candidate <= tombstone_end, for (2) to
- // hold, it suffices to show that [smallest_candidate, largest_candidate]
- // overlaps with [meta.smallest, meta.largest]. too.
- // Given meta.smallest <= meta.largest shown above, we need to show
- // that it is impossible to have largest_candidate < meta.smallest or
- // meta.largest < smallest_candidate. If the above
- // meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate)
- // updates meta.largest or meta.smallest, then the two ranges overlap.
- // So we assume meta.UpdateBoundariesForRange(smallest_candidate,
- // largest_candidate) did not update meta.smallest nor meta.largest, which
- // means meta.smallest < smallest_candidate and largest_candidate <
- // meta.largest.
- //
- // Case 2. Output file has >= 1 point key. This means meta.smallest and
- // meta.largest are not empty when AddRangeDels() is called.
- // To show (1) meta.smallest <= meta.largest:
- // Assume meta.smallest <= meta.largest when AddRangeDels() is called,
- // this follow from how UpdateBoundariesForRange() is implemented where it
- // takes min or max to update meta.smallest or meta.largest.
- //
- // To show (2) [tombstone_start, tombstone_end] and [meta.smallest,
- // meta.largest] overlaps (after the call to UpdateBoundariesForRange()):
- // When smallest_candidate <= largest_candidate, the proof in Case 1
- // applies, so we only need to show (2) holds when smallest_candidate >
- // largest_candidate. When both bounds are either null or from
- // subcompaction boundary, the proof in Case 1 applies, so we only need to
- // show (2) holds when at least one bound is from a point key (either
- // meta.smallest for lower bound or next_table_min_key for upper bound).
- //
- // Suppose lower bound is meta.smallest.user_key. The call
- // it->Seek(*lower_bound) implies tombstone_end.user_key >
- // meta.smallest.user_key. We have smallest_candidate.user_key =
- // max(tombstone_start.user_key, meta.smallest.user_key). For
- // smallest_candidate to be > largest_candidate, we need
- // largest_candidate.user_key = upper_bound = smallest_candidate.user_key,
- // where tombstone_end is truncated to largest_candidate.
- // Subcase 1:
- // Suppose largest_candidate.user_key = comp_end_user_key (there is no
- // next point key). Subcompaction ensures any point key from this
- // subcompaction has a user_key < comp_end_user_key, so 1)
- // meta.smallest.user_key < comp_end_user_key, 2)
- // `has_overlapping_endpoints` is false, and the first if condition in
- // this for-loop ensures tombstone_start.user_key < comp_end_user_key. So
- // smallest_candidate.user_key < largest_candidate.user_key. This case
- // cannot happen when smallest > largest_candidate.
- // Subcase 2:
- // Suppose largest_candidate.user_key = next_table_min_key.user_key.
- // The first if condition in this for-loop together with
- // smallest_candidate.user_key = next_table_min_key.user_key =
- // upper_bound implies `has_overlapping_endpoints` is true (so meta
- // largest.user_key = upper_bound) and
- // tombstone.seq_ < meta.largest.seqno. So
- // tombstone_start < meta.largest < tombstone_end.
- //
- // Suppose lower bound is comp_start_user_key and upper_bound is
- // next_table_min_key. The call it->Seek(*lower_bound) implies we have
- // tombstone_end_key.user_key > comp_start_user_key. So
- // tombstone_end_key.user_key > smallest_candidate.user_key. For
- // smallest_candidate to be > largest_candidate, we need
- // tombstone_start.user_key = largest_candidate.user_key = upper_bound =
- // next_table_min_key.user_key. This means `has_overlapping_endpoints` is
- // true (so meta.largest.user_key = upper_bound) and tombstone.seq_ <
- // meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end.
  }
  return Status::OK();
  }
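The first hunk above introduces a debug-only sync point, `CompactionOutputs::ShouldStopBefore::manual_decision`, whose callback argument points at a `std::pair<bool*, const Slice>` holding the decision flag and the current internal key. As an illustration only, not part of this package's diff, the sketch below shows how a RocksDB unit test built without NDEBUG might hook that sync point through the internal test_util/sync_point.h utility to force an output-file cut at a chosen user key; the key value "cut_here" and the helper name are hypothetical.

// Sketch for a RocksDB unit test (debug build only). Assumes the internal
// headers db/dbformat.h and test_util/sync_point.h are available, as they are
// inside the RocksDB source tree but not in the public include/ directory.
#include <utility>
#include "db/dbformat.h"
#include "test_util/sync_point.h"

using ROCKSDB_NAMESPACE::ExtractUserKey;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::SyncPoint;

// Make CompactionOutputs::ShouldStopBefore() return true whenever the
// compaction iterator reaches the (hypothetical) user key "cut_here", so the
// current output file is closed and a new one starts at that key.
void InstallManualCutCallback() {
  SyncPoint::GetInstance()->SetCallBack(
      "CompactionOutputs::ShouldStopBefore::manual_decision", [](void* arg) {
        auto* p = static_cast<std::pair<bool*, const Slice>*>(arg);
        if (ExtractUserKey(p->second) == Slice("cut_here")) {
          *(p->first) = true;
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
}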