@nxtedition/rocksdb 8.0.1 → 8.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  2. package/deps/rocksdb/rocksdb/Makefile +2 -2
  3. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
  8. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +20 -146
  9. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
  10. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
  11. package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
  12. package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
  33. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
  35. package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
  36. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
  37. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
  38. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
  47. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
  48. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
  49. package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
  50. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
  51. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
  52. package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
  53. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
  54. package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
  56. package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
  57. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  58. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
  59. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
  60. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
  62. package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
  63. package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
  65. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
  68. package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
  69. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
  74. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
  75. package/deps/rocksdb/rocksdb/db/version_set.cc +212 -35
  76. package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
  77. package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
  78. package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
  82. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
  83. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
  84. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
  85. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
  86. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
  87. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
  88. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
  89. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
  90. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
  91. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  94. package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
  95. package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
  96. package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
  97. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  98. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  99. package/deps/rocksdb/rocksdb/src.mk +2 -1
  100. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
  103. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
  105. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
  106. package/deps/rocksdb/rocksdb/table/format.cc +24 -20
  107. package/deps/rocksdb/rocksdb/table/format.h +5 -2
  108. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
  109. package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
  110. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
  111. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  112. package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
  113. package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
  114. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
  115. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
  116. package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
  117. package/deps/rocksdb/rocksdb/util/status.cc +7 -0
  118. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
  119. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
  120. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
  121. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
  122. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
  123. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
  124. package/deps/rocksdb/rocksdb.gyp +2 -1
  125. package/package.json +1 -1
  126. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  127. package/prebuilds/linux-x64/node.napi.node +0 -0
  128. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
  129. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
@@ -1059,16 +1059,31 @@ void DBImpl::DumpStats() {
1059
1059
  return;
1060
1060
  }
1061
1061
 
1062
+ // Also probe block cache(s) for problems, dump to info log
1063
+ UnorderedSet<Cache*> probed_caches;
1062
1064
  TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
1063
1065
  {
1064
1066
  InstrumentedMutexLock l(&mutex_);
1065
1067
  for (auto cfd : versions_->GetRefedColumnFamilySet()) {
1066
- if (cfd->initialized()) {
1067
- // Release DB mutex for gathering cache entry stats. Pass over all
1068
- // column families for this first so that other stats are dumped
1069
- // near-atomically.
1070
- InstrumentedMutexUnlock u(&mutex_);
1071
- cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
1068
+ if (!cfd->initialized()) {
1069
+ continue;
1070
+ }
1071
+
1072
+ // Release DB mutex for gathering cache entry stats. Pass over all
1073
+ // column families for this first so that other stats are dumped
1074
+ // near-atomically.
1075
+ InstrumentedMutexUnlock u(&mutex_);
1076
+ cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
1077
+
1078
+ // Probe block cache for problems (if not already via another CF)
1079
+ if (immutable_db_options_.info_log) {
1080
+ auto* table_factory = cfd->ioptions()->table_factory.get();
1081
+ assert(table_factory != nullptr);
1082
+ Cache* cache =
1083
+ table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
1084
+ if (cache && probed_caches.insert(cache).second) {
1085
+ cache->ReportProblems(immutable_db_options_.info_log);
1086
+ }
1072
1087
  }
1073
1088
  }
1074
1089
 
@@ -1555,21 +1570,31 @@ Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
1555
1570
  }
1556
1571
 
1557
1572
  Status DBImpl::LockWAL() {
1558
- log_write_mutex_.Lock();
1559
- auto cur_log_writer = logs_.back().writer;
1560
- IOStatus status = cur_log_writer->WriteBuffer();
1561
- if (!status.ok()) {
1562
- ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
1563
- status.ToString().c_str());
1564
- // In case there is a fs error we should set it globally to prevent the
1565
- // future writes
1566
- WriteStatusCheck(status);
1573
+ {
1574
+ InstrumentedMutexLock lock(&mutex_);
1575
+ WriteThread::Writer w;
1576
+ write_thread_.EnterUnbatched(&w, &mutex_);
1577
+ WriteThread::Writer nonmem_w;
1578
+ if (two_write_queues_) {
1579
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
1580
+ }
1581
+
1582
+ lock_wal_write_token_ = write_controller_.GetStopToken();
1583
+
1584
+ if (two_write_queues_) {
1585
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
1586
+ }
1587
+ write_thread_.ExitUnbatched(&w);
1567
1588
  }
1568
- return static_cast<Status>(status);
1589
+ return FlushWAL(/*sync=*/false);
1569
1590
  }
1570
1591
 
1571
1592
  Status DBImpl::UnlockWAL() {
1572
- log_write_mutex_.Unlock();
1593
+ {
1594
+ InstrumentedMutexLock lock(&mutex_);
1595
+ lock_wal_write_token_.reset();
1596
+ }
1597
+ bg_cv_.SignalAll();
1573
1598
  return Status::OK();
1574
1599
  }
1575
1600
 
@@ -1808,7 +1833,8 @@ InternalIterator* DBImpl::NewInternalIterator(
1808
1833
  MergeIteratorBuilder merge_iter_builder(
1809
1834
  &cfd->internal_comparator(), arena,
1810
1835
  !read_options.total_order_seek &&
1811
- super_version->mutable_cf_options.prefix_extractor != nullptr);
1836
+ super_version->mutable_cf_options.prefix_extractor != nullptr,
1837
+ read_options.iterate_upper_bound);
1812
1838
  // Collect iterator for mutable memtable
1813
1839
  auto mem_iter = super_version->mem->NewIterator(read_options, arena);
1814
1840
  Status s;
@@ -5302,6 +5328,7 @@ Status DBImpl::IngestExternalFiles(
5302
5328
  // Run ingestion jobs.
5303
5329
  if (status.ok()) {
5304
5330
  for (size_t i = 0; i != num_cfs; ++i) {
5331
+ mutex_.AssertHeld();
5305
5332
  status = ingestion_jobs[i].Run();
5306
5333
  if (!status.ok()) {
5307
5334
  break;
@@ -5506,6 +5533,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
5506
5533
 
5507
5534
  num_running_ingest_file_++;
5508
5535
  assert(!cfd->IsDropped());
5536
+ mutex_.AssertHeld();
5509
5537
  status = import_job.Run();
5510
5538
 
5511
5539
  // Install job edit [Mutex will be unlocked here]
@@ -1161,7 +1161,7 @@ class DBImpl : public DB {
1161
1161
  int TEST_BGCompactionsAllowed() const;
1162
1162
  int TEST_BGFlushesAllowed() const;
1163
1163
  size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
1164
- void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
1164
+ void TEST_WaitForPeriodicTaskRun(std::function<void()> callback) const;
1165
1165
  SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
1166
1166
  size_t TEST_EstimateInMemoryStatsHistorySize() const;
1167
1167
 
@@ -2680,6 +2680,10 @@ class DBImpl : public DB {
2680
2680
  // seqno_time_mapping_ stores the sequence number to time mapping, it's not
2681
2681
  // thread safe, both read and write need db mutex hold.
2682
2682
  SeqnoToTimeMapping seqno_time_mapping_;
2683
+
2684
+ // stop write token that is acquired when LockWAL() is called. Destructed
2685
+ // when UnlockWAL() is called.
2686
+ std::unique_ptr<WriteControllerToken> lock_wal_write_token_;
2683
2687
  };
2684
2688
 
2685
2689
  class GetWithTimestampReadCallback : public ReadCallback {
@@ -1344,18 +1344,8 @@ Status DBImpl::CompactFilesImpl(
1344
1344
  }
1345
1345
  }
1346
1346
 
1347
- SequenceNumber earliest_mem_seqno = kMaxSequenceNumber;
1348
- if (cfd->mem() != nullptr) {
1349
- earliest_mem_seqno =
1350
- std::min(cfd->mem()->GetEarliestSequenceNumber(), earliest_mem_seqno);
1351
- }
1352
- if (cfd->imm() != nullptr && cfd->imm()->current() != nullptr) {
1353
- earliest_mem_seqno =
1354
- std::min(cfd->imm()->current()->GetEarliestSequenceNumber(false),
1355
- earliest_mem_seqno);
1356
- }
1357
1347
  Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
1358
- &input_set, cf_meta, output_level, earliest_mem_seqno);
1348
+ &input_set, cf_meta, output_level);
1359
1349
  if (!s.ok()) {
1360
1350
  return s;
1361
1351
  }
@@ -1697,14 +1687,15 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
1697
1687
 
1698
1688
  VersionEdit edit;
1699
1689
  edit.SetColumnFamily(cfd->GetID());
1690
+
1700
1691
  for (const auto& f : vstorage->LevelFiles(level)) {
1701
1692
  edit.DeleteFile(level, f->fd.GetNumber());
1702
1693
  edit.AddFile(
1703
1694
  to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
1704
1695
  f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
1705
1696
  f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
1706
- f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
1707
- f->file_checksum_func_name, f->unique_id);
1697
+ f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
1698
+ f->file_checksum, f->file_checksum_func_name, f->unique_id);
1708
1699
  }
1709
1700
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
1710
1701
  "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -3344,8 +3335,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
3344
3335
  f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
3345
3336
  f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
3346
3337
  f->oldest_blob_file_number, f->oldest_ancester_time,
3347
- f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
3348
- f->unique_id);
3338
+ f->file_creation_time, f->epoch_number, f->file_checksum,
3339
+ f->file_checksum_func_name, f->unique_id);
3349
3340
 
3350
3341
  ROCKS_LOG_BUFFER(
3351
3342
  log_buffer,
@@ -290,7 +290,7 @@ size_t DBImpl::TEST_GetWalPreallocateBlockSize(
290
290
  }
291
291
 
292
292
  #ifndef ROCKSDB_LITE
293
- void DBImpl::TEST_WaitForPeridicTaskRun(std::function<void()> callback) const {
293
+ void DBImpl::TEST_WaitForPeriodicTaskRun(std::function<void()> callback) const {
294
294
  periodic_task_scheduler_.TEST_WaitForRun(callback);
295
295
  }
296
296
 
@@ -136,7 +136,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
136
136
  f->fd.smallest_seqno, f->fd.largest_seqno,
137
137
  f->marked_for_compaction, f->temperature,
138
138
  f->oldest_blob_file_number, f->oldest_ancester_time,
139
- f->file_creation_time, f->file_checksum,
139
+ f->file_creation_time, f->epoch_number, f->file_checksum,
140
140
  f->file_checksum_func_name, f->unique_id);
141
141
  }
142
142
 
@@ -315,6 +315,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
315
315
  }
316
316
  log_write_mutex_.Unlock();
317
317
  mutex_.Unlock();
318
+ TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr);
318
319
  log_write_mutex_.Lock();
319
320
  while (!logs_.empty() && logs_.front().number < min_log_number) {
320
321
  auto& log = logs_.front();
@@ -360,6 +361,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
360
361
  }
361
362
  TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
362
363
  &file_deletion_status);
364
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion2",
365
+ const_cast<std::string*>(&fname));
363
366
  if (file_deletion_status.ok()) {
364
367
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
365
368
  "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
@@ -1515,7 +1515,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1515
1515
  .PermitUncheckedError(); // ignore error
1516
1516
  const uint64_t current_time = static_cast<uint64_t>(_current_time);
1517
1517
  meta.oldest_ancester_time = current_time;
1518
-
1518
+ meta.epoch_number = cfd->NewEpochNumber();
1519
1519
  {
1520
1520
  auto write_hint = cfd->CalculateSSTWriteHint(0);
1521
1521
  mutex_.Unlock();
@@ -1583,13 +1583,13 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1583
1583
  constexpr int level = 0;
1584
1584
 
1585
1585
  if (s.ok() && has_output) {
1586
- edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
1587
- meta.fd.GetFileSize(), meta.smallest, meta.largest,
1588
- meta.fd.smallest_seqno, meta.fd.largest_seqno,
1589
- meta.marked_for_compaction, meta.temperature,
1590
- meta.oldest_blob_file_number, meta.oldest_ancester_time,
1591
- meta.file_creation_time, meta.file_checksum,
1592
- meta.file_checksum_func_name, meta.unique_id);
1586
+ edit->AddFile(
1587
+ level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
1588
+ meta.smallest, meta.largest, meta.fd.smallest_seqno,
1589
+ meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
1590
+ meta.oldest_blob_file_number, meta.oldest_ancester_time,
1591
+ meta.file_creation_time, meta.epoch_number, meta.file_checksum,
1592
+ meta.file_checksum_func_name, meta.unique_id);
1593
1593
 
1594
1594
  for (const auto& blob : blob_file_additions) {
1595
1595
  edit->AddBlobFile(blob);
@@ -924,6 +924,15 @@ Status DBImpl::WriteImplWALOnly(
924
924
  write_thread->ExitAsBatchGroupLeader(write_group, status);
925
925
  return status;
926
926
  }
927
+ } else {
928
+ InstrumentedMutexLock lock(&mutex_);
929
+ Status status = DelayWrite(/*num_bytes=*/0ull, write_options);
930
+ if (!status.ok()) {
931
+ WriteThread::WriteGroup write_group;
932
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
933
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
934
+ return status;
935
+ }
927
936
  }
928
937
 
929
938
  WriteThread::WriteGroup write_group;
@@ -1762,6 +1771,7 @@ uint64_t DBImpl::GetMaxTotalWalSize() const {
1762
1771
  // REQUIRES: this thread is currently at the front of the writer queue
1763
1772
  Status DBImpl::DelayWrite(uint64_t num_bytes,
1764
1773
  const WriteOptions& write_options) {
1774
+ mutex_.AssertHeld();
1765
1775
  uint64_t time_delayed = 0;
1766
1776
  bool delayed = false;
1767
1777
  {
@@ -1661,6 +1661,213 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
1661
1661
  ASSERT_EQ(1, num_range_deletions);
1662
1662
  }
1663
1663
 
1664
+ // Test SST partitioner cut after every single key
1665
+ class SingleKeySstPartitioner : public SstPartitioner {
1666
+ public:
1667
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
1668
+
1669
+ PartitionerResult ShouldPartition(
1670
+ const PartitionerRequest& /*request*/) override {
1671
+ return kRequired;
1672
+ }
1673
+
1674
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
1675
+ const Slice& /*largest_user_key*/) override {
1676
+ return false;
1677
+ }
1678
+ };
1679
+
1680
+ class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
1681
+ public:
1682
+ static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
1683
+ const char* Name() const override { return kClassName(); }
1684
+
1685
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
1686
+ const SstPartitioner::Context& /* context */) const override {
1687
+ return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
1688
+ }
1689
+ };
1690
+
1691
+ TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) {
1692
+ Options options = CurrentOptions();
1693
+ options.compression = kNoCompression;
1694
+ options.compaction_pri = kMinOverlappingRatio;
1695
+ options.disable_auto_compactions = true;
1696
+ options.ttl = 24 * 60 * 60; // 24 hours
1697
+ options.target_file_size_base = 8 << 10;
1698
+ env_->SetMockSleep();
1699
+ options.env = env_;
1700
+ DestroyAndReopen(options);
1701
+
1702
+ Random rnd(301);
1703
+ // Fill some data so that future compactions are not bottommost level
1704
+ // compaction, and hence they would try cut around files for ttl
1705
+ for (int i = 5; i < 10; ++i) {
1706
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10)));
1707
+ }
1708
+ ASSERT_OK(Flush());
1709
+ MoveFilesToLevel(3);
1710
+ ASSERT_EQ("0,0,0,1", FilesPerLevel());
1711
+
1712
+ for (int i = 5; i < 10; ++i) {
1713
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10)));
1714
+ }
1715
+ ASSERT_OK(Flush());
1716
+ MoveFilesToLevel(1);
1717
+ ASSERT_EQ("0,1,0,1", FilesPerLevel());
1718
+
1719
+ env_->MockSleepForSeconds(20 * 60 * 60);
1720
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
1721
+ Key(11), Key(12)));
1722
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
1723
+ ASSERT_OK(Flush());
1724
+ ASSERT_EQ("1,1,0,1", FilesPerLevel());
1725
+ // L0 file is new, L1 and L3 files are old and qualified for TTL
1726
+ env_->MockSleepForSeconds(10 * 60 * 60);
1727
+ MoveFilesToLevel(1);
1728
+ // L1 output should be cut into 3 files:
1729
+ // File 0: Key(0)
1730
+ // File 1: (qualified for TTL): Key(5) - Key(10)
1731
+ // File 2: DeleteRange [11, 12)
1732
+ ASSERT_EQ("0,3,0,1", FilesPerLevel());
1733
+ }
1734
+
1735
+ TEST_F(DBRangeDelTest, CompactionEmitRangeTombstoneToSSTPartitioner) {
1736
+ Options options = CurrentOptions();
1737
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
1738
+ options.sst_partitioner_factory = factory;
1739
+ options.disable_auto_compactions = true;
1740
+ DestroyAndReopen(options);
1741
+
1742
+ Random rnd(301);
1743
+ // range deletion keys are not processed when compacting to bottommost level,
1744
+ // so creating a file at older level to make the next compaction not
1745
+ // bottommost level
1746
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), rnd.RandomString(10)));
1747
+ ASSERT_OK(Flush());
1748
+ MoveFilesToLevel(5);
1749
+
1750
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(10)));
1751
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
1752
+ Key(5)));
1753
+ ASSERT_OK(Flush());
1754
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1755
+ MoveFilesToLevel(1);
1756
+ // SSTPartitioner decides to cut when range tombstone start key is passed to
1757
+ // it. Note that the range tombstone [2, 5) itself spans multiple keys but we
1758
+ // are not able to partition in between yet.
1759
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
1760
+ }
1761
+
1762
+ TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenPointKeyAndTombstone) {
1763
+ // L2 has two files
1764
+ // L2_0: 0, 1, 2, 3, 4. L2_1: 5, 6, 7
1765
+ // L0 has 0, [5, 6), 8
1766
+ // max_compaction_bytes is less than the size of L2_0 and L2_1.
1767
+ // When compacting L0 into L1, it should split into 3 files.
1768
+ const int kNumPerFile = 4, kNumFiles = 2;
1769
+ Options options = CurrentOptions();
1770
+ options.disable_auto_compactions = true;
1771
+ options.target_file_size_base = 9 * 1024;
1772
+ options.max_compaction_bytes = 9 * 1024;
1773
+ DestroyAndReopen(options);
1774
+ Random rnd(301);
1775
+ for (int i = 0; i < kNumFiles; ++i) {
1776
+ std::vector<std::string> values;
1777
+ for (int j = 0; j < kNumPerFile; j++) {
1778
+ values.push_back(rnd.RandomString(3 << 10));
1779
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
1780
+ }
1781
+ }
1782
+ ASSERT_OK(db_->Flush(FlushOptions()));
1783
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1784
+ MoveFilesToLevel(2);
1785
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
1786
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
1787
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(5),
1788
+ Key(6)));
1789
+ ASSERT_OK(Put(Key(8), rnd.RandomString(1 << 10)));
1790
+ ASSERT_OK(db_->Flush(FlushOptions()));
1791
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1792
+
1793
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
1794
+ true /* disallow_trivial_move */));
1795
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
1796
+ }
1797
+
1798
+ TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenTombstone) {
1799
+ // L2 has two files
1800
+ // L2_0: 0, 1, 2, 3, 4. L2_1: 5, 6, 7
1801
+ // L0 has two range tombstones [0, 1), [7, 8).
1802
+ // max_compaction_bytes is less than the size of L2_0.
1803
+ // When compacting L0 into L1, the two range tombstones should be
1804
+ // split into two files.
1805
+ const int kNumPerFile = 4, kNumFiles = 2;
1806
+ Options options = CurrentOptions();
1807
+ options.disable_auto_compactions = true;
1808
+ options.target_file_size_base = 9 * 1024;
1809
+ options.max_compaction_bytes = 9 * 1024;
1810
+ DestroyAndReopen(options);
1811
+ Random rnd(301);
1812
+ for (int i = 0; i < kNumFiles; ++i) {
1813
+ std::vector<std::string> values;
1814
+ // Write 12K (4 values, each 3K)
1815
+ for (int j = 0; j < kNumPerFile; j++) {
1816
+ values.push_back(rnd.RandomString(3 << 10));
1817
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
1818
+ }
1819
+ }
1820
+ ASSERT_OK(db_->Flush(FlushOptions()));
1821
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1822
+ MoveFilesToLevel(2);
1823
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
1824
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1825
+ Key(1)));
1826
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
1827
+ Key(8)));
1828
+ ASSERT_OK(db_->Flush(FlushOptions()));
1829
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1830
+
1831
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
1832
+ true /* disallow_trivial_move */));
1833
+ // This is L0 -> L1 compaction
1834
+ // The two range tombstones are broken up into two output files
1835
+ // to limit compaction size.
1836
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
1837
+ }
1838
+
1839
+ TEST_F(DBRangeDelTest, OversizeCompactionPointKeyWithinRangetombstone) {
1840
+ // L2 has two files
1841
+ // L2_0: 0, 1, 2, 3, 4. L2_1: 6, 7, 8
1842
+ // L0 has [0, 9) and point key 5
1843
+ // max_compaction_bytes is less than the size of L2_0.
1844
+ // When compacting L0 into L1, the compaction should cut at point key 5.
1845
+ Options options = CurrentOptions();
1846
+ options.disable_auto_compactions = true;
1847
+ options.target_file_size_base = 9 * 1024;
1848
+ options.max_compaction_bytes = 9 * 1024;
1849
+ DestroyAndReopen(options);
1850
+ Random rnd(301);
1851
+ for (int i = 0; i < 9; ++i) {
1852
+ if (i == 5) {
1853
+ ++i;
1854
+ }
1855
+ ASSERT_OK(Put(Key(i), rnd.RandomString(3 << 10)));
1856
+ }
1857
+ ASSERT_OK(db_->Flush(FlushOptions()));
1858
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1859
+ MoveFilesToLevel(2);
1860
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
1861
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
1862
+ Key(9)));
1863
+ ASSERT_OK(Put(Key(5), rnd.RandomString(1 << 10)));
1864
+ ASSERT_OK(db_->Flush(FlushOptions()));
1865
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
1866
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
1867
+ true /* disallow_trivial_move */));
1868
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
1869
+ }
1870
+
1664
1871
  TEST_F(DBRangeDelTest, OverlappedTombstones) {
1665
1872
  const int kNumPerFile = 4, kNumFiles = 2;
1666
1873
  Options options = CurrentOptions();
@@ -2093,6 +2300,7 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) {
2093
2300
  options.compression = kNoCompression;
2094
2301
  options.disable_auto_compactions = true;
2095
2302
  options.target_file_size_base = 2 * 1024;
2303
+ options.level_compaction_dynamic_file_size = false;
2096
2304
  DestroyAndReopen(options);
2097
2305
 
2098
2306
  Random rnd(301);
@@ -2508,7 +2716,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
2508
2716
  options.compression = kNoCompression;
2509
2717
  options.disable_auto_compactions = true;
2510
2718
  options.target_file_size_base = 3 * 1024;
2511
- options.max_compaction_bytes = 1024;
2719
+ options.max_compaction_bytes = 2048;
2512
2720
 
2513
2721
  DestroyAndReopen(options);
2514
2722
  // L2
@@ -2554,7 +2762,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
2554
2762
  options.compression = kNoCompression;
2555
2763
  options.disable_auto_compactions = true;
2556
2764
  options.target_file_size_base = 3 * 1024;
2557
- options.max_compaction_bytes = 1024;
2765
+ options.max_compaction_bytes = 3 * 1024;
2558
2766
 
2559
2767
  DestroyAndReopen(options);
2560
2768
  // L2
@@ -2756,6 +2964,46 @@ TEST_F(DBRangeDelTest, RefreshMemtableIter) {
2756
2964
  ASSERT_OK(iter->Refresh());
2757
2965
  }
2758
2966
 
2967
+ TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
2968
+ // Memtable: a, [b, bz)
2969
+ // Do a Seek on `a` with iterate_upper_bound being az
2970
+ // range tombstone [b, bz) should not be processed (added to and
2971
+ // popped from the min_heap in MergingIterator).
2972
+ Options options = CurrentOptions();
2973
+ options.disable_auto_compactions = true;
2974
+ DestroyAndReopen(options);
2975
+
2976
+ ASSERT_OK(Put("a", "bar"));
2977
+ ASSERT_OK(
2978
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
2979
+
2980
+ // I could not find a cleaner way to test this without relying on
2981
+ // implementation detail. Tried to test the value of
2982
+ // `internal_range_del_reseek_count` but that did not work
2983
+ // since BlockBasedTable iterator becomes !Valid() when point key
2984
+ // is out of bound and that reseek only happens when a point key
2985
+ // is covered by some range tombstone.
2986
+ SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
2987
+ [](void*) {
2988
+ // there should not be any range
2989
+ // tombstone in the heap.
2990
+ FAIL();
2991
+ });
2992
+ SyncPoint::GetInstance()->EnableProcessing();
2993
+
2994
+ ReadOptions read_opts;
2995
+ std::string upper_bound = "az";
2996
+ Slice upper_bound_slice = upper_bound;
2997
+ read_opts.iterate_upper_bound = &upper_bound_slice;
2998
+ std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
2999
+ iter->Seek("a");
3000
+ ASSERT_TRUE(iter->Valid());
3001
+ ASSERT_EQ(iter->key(), "a");
3002
+ iter->Next();
3003
+ ASSERT_FALSE(iter->Valid());
3004
+ ASSERT_OK(iter->status());
3005
+ }
3006
+
2759
3007
  #endif // ROCKSDB_LITE
2760
3008
 
2761
3009
  } // namespace ROCKSDB_NAMESPACE
@@ -1203,6 +1203,8 @@ void CheckColumnFamilyMeta(
1203
1203
  file_meta_from_files.file_creation_time);
1204
1204
  ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
1205
1205
  ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
1206
+ ASSERT_EQ(file_meta_from_cf.epoch_number,
1207
+ file_meta_from_files.epoch_number);
1206
1208
  ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
1207
1209
  ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
1208
1210
  // More from FileStorageInfo
@@ -1253,6 +1255,7 @@ void CheckLiveFilesMeta(
1253
1255
  ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
1254
1256
  ASSERT_EQ(meta.oldest_blob_file_number,
1255
1257
  expected_meta.oldest_blob_file_number);
1258
+ ASSERT_EQ(meta.epoch_number, expected_meta.epoch_number);
1256
1259
 
1257
1260
  // More from FileStorageInfo
1258
1261
  ASSERT_EQ(meta.file_type, kTableFile);