@nxtedition/rocksdb 12.1.3 → 12.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. package/binding.cc +12 -13
  2. package/binding.gyp +0 -4
  3. package/deps/rocksdb/rocksdb/Makefile +10 -5
  4. package/deps/rocksdb/rocksdb/TARGETS +9 -7
  5. package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
  6. package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
  13. package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
  14. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
  15. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
  16. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
  19. package/deps/rocksdb/rocksdb/db/c.cc +9 -0
  20. package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
  21. package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
  22. package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
  40. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
  41. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
  42. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
  52. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
  53. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
  54. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
  56. package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
  57. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
  58. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
  59. package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
  60. package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
  61. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  62. package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
  63. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
  64. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
  65. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
  66. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
  67. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
  68. package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
  70. package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
  71. package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
  72. package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
  73. package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
  74. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  75. package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
  78. package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
  79. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
  80. package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
  81. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
  85. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
  86. package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
  87. package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
  88. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
  89. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
  90. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
  91. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
  92. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
  94. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
  95. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
  96. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
  97. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
  98. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  100. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  101. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
  102. package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
  103. package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
  104. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  105. package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
  106. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  107. package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
  108. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
  109. package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
  110. package/deps/rocksdb/rocksdb/port/port.h +16 -0
  111. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
  112. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
  113. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
  114. package/deps/rocksdb/rocksdb/table/format.cc +32 -4
  115. package/deps/rocksdb/rocksdb/table/format.h +12 -1
  116. package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
  117. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
  118. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
  119. package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
  120. package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
  121. package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
  122. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
  123. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
  124. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
  125. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
  126. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
  127. package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
  128. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
  129. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
  130. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
  131. package/package.json +1 -1
  132. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  133. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -65,9 +65,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
65
65
  valid_(false),
66
66
  current_entry_is_merged_(false),
67
67
  is_key_seqnum_zero_(false),
68
- prefix_same_as_start_(mutable_cf_options.prefix_extractor
69
- ? read_options.prefix_same_as_start
70
- : false),
68
+ prefix_same_as_start_(
69
+ prefix_extractor_ ? read_options.prefix_same_as_start : false),
71
70
  pin_thru_lifetime_(read_options.pin_data),
72
71
  expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
73
72
  read_options.total_order_seek ||
@@ -93,6 +92,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
93
92
  status_.PermitUncheckedError();
94
93
  assert(timestamp_size_ ==
95
94
  user_comparator_.user_comparator()->timestamp_size());
95
+ // prefix_seek_opt_in_only should force total_order_seek wherever the caller
96
+ // is duplicating the original ReadOptions
97
+ assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
96
98
  }
97
99
 
98
100
  Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
@@ -538,6 +540,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
538
540
  } else {
539
541
  iter_.Next();
540
542
  }
543
+ // This could be a long-running operation due to tombstones, etc.
544
+ ROCKSDB_THREAD_YIELD_HOOK();
541
545
  } while (iter_.Valid());
542
546
 
543
547
  valid_ = false;
@@ -244,7 +244,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
244
244
  ASSERT_EQ(largest.user_key().ToString(), "foo");
245
245
  ASSERT_EQ(result.output_level, 1);
246
246
  ASSERT_EQ(result.output_path, this->secondary_path_);
247
- ASSERT_EQ(result.num_output_records, 2);
247
+ ASSERT_EQ(result.stats.num_output_records, 2);
248
248
  ASSERT_GT(result.bytes_written, 0);
249
249
  ASSERT_OK(result.status);
250
250
  }
@@ -383,12 +383,16 @@ TEST_F(DBSSTTest, DBWithSstFileManager) {
383
383
  ASSERT_EQ(files_moved, 0);
384
384
 
385
385
  Close();
386
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
387
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
386
388
  Reopen(options);
387
389
  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
388
390
  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
389
391
 
390
392
  // Verify that we track all the files again after the DB is closed and opened
391
393
  Close();
394
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
395
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
392
396
  sst_file_manager.reset(NewSstFileManager(env_));
393
397
  options.sst_file_manager = sst_file_manager;
394
398
  sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
@@ -439,6 +443,11 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
439
443
 
440
444
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
441
445
  "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
446
+
447
+ int64_t untracked_files = 0;
448
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
449
+ "SstFileManagerImpl::OnUntrackFile",
450
+ [&](void* /*arg*/) { ++untracked_files; });
442
451
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
443
452
 
444
453
  Options options = CurrentOptions();
@@ -485,6 +494,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
485
494
  }
486
495
  ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
487
496
  Close();
497
+ ASSERT_EQ(untracked_files, files_in_db.size());
498
+ untracked_files = 0;
499
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
500
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
488
501
 
489
502
  Reopen(options);
490
503
  ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
@@ -492,6 +505,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
492
505
 
493
506
  // Verify that we track all the files again after the DB is closed and opened.
494
507
  Close();
508
+ ASSERT_EQ(untracked_files, files_in_db.size());
509
+ untracked_files = 0;
510
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
511
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
495
512
 
496
513
  sst_file_manager.reset(NewSstFileManager(env_));
497
514
  options.sst_file_manager = sst_file_manager;
@@ -507,6 +524,10 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
507
524
  ASSERT_EQ(files_deleted, 0);
508
525
  ASSERT_EQ(files_scheduled_to_delete, 0);
509
526
  Close();
527
+ ASSERT_EQ(untracked_files, files_in_db.size());
528
+ untracked_files = 0;
529
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
530
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
510
531
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
511
532
  "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
512
533
  assert(arg);
@@ -666,6 +687,9 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
666
687
  }
667
688
 
668
689
  Close();
690
+ ASSERT_EQ(sfm->GetTrackedFiles().size(), 0) << "sfm should be empty";
691
+ ASSERT_EQ(sfm->GetTotalSize(), 0) << "sfm should be empty";
692
+
669
693
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
670
694
  "SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
671
695
  assert(arg);
@@ -5597,32 +5597,45 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) {
5597
5597
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
5598
5598
  bbto.whole_key_filtering = false;
5599
5599
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
5600
- DestroyAndReopen(options);
5601
5600
 
5602
- // Construct two L1 files with keys:
5603
- // f1:[aaa1 ccc1] f2:[ddd0]
5604
- ASSERT_OK(Put("aaa1", ""));
5605
- ASSERT_OK(Put("ccc1", ""));
5606
- ASSERT_OK(Flush());
5607
- ASSERT_OK(Put("ddd0", ""));
5608
- ASSERT_OK(Flush());
5609
- CompactRangeOptions cro;
5610
- cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
5611
- ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
5601
+ // This test is also the primary test for prefix_seek_opt_in_only
5602
+ for (bool opt_in : {false, true}) {
5603
+ options.prefix_seek_opt_in_only = opt_in;
5604
+ DestroyAndReopen(options);
5612
5605
 
5613
- Iterator* iter = db_->NewIterator(ReadOptions());
5614
- ASSERT_OK(iter->status());
5606
+ // Construct two L1 files with keys:
5607
+ // f1:[aaa1 ccc1] f2:[ddd0]
5608
+ ASSERT_OK(Put("aaa1", ""));
5609
+ ASSERT_OK(Put("ccc1", ""));
5610
+ ASSERT_OK(Flush());
5611
+ ASSERT_OK(Put("ddd0", ""));
5612
+ ASSERT_OK(Flush());
5613
+ CompactRangeOptions cro;
5614
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
5615
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
5615
5616
 
5616
- // Bloom filter is filterd out by f1.
5617
- // This is just one of several valid position following the contract.
5618
- // Postioning to ccc1 or ddd0 is also valid. This is just to validate
5619
- // the behavior of the current implementation. If underlying implementation
5620
- // changes, the test might fail here.
5621
- iter->Seek("bbb1");
5622
- ASSERT_OK(iter->status());
5623
- ASSERT_FALSE(iter->Valid());
5617
+ ReadOptions ropts;
5618
+ for (bool same : {false, true}) {
5619
+ ropts.prefix_same_as_start = same;
5620
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ropts));
5621
+ ASSERT_OK(iter->status());
5624
5622
 
5625
- delete iter;
5623
+ iter->Seek("bbb1");
5624
+ ASSERT_OK(iter->status());
5625
+ if (opt_in && !same) {
5626
+ // Unbounded total order seek
5627
+ ASSERT_TRUE(iter->Valid());
5628
+ ASSERT_EQ(iter->key(), "ccc1");
5629
+ } else {
5630
+ // Bloom filter is filtered out by f1. When same == false, this is just
5631
+ // one valid position following the contract. Positioning to ccc1 or ddd0
5632
+ // is also valid. This is just to validate the behavior of the current
5633
+ // implementation. If underlying implementation changes, the test might
5634
+ // fail here.
5635
+ ASSERT_FALSE(iter->Valid());
5636
+ }
5637
+ }
5638
+ }
5626
5639
  }
5627
5640
 
5628
5641
  TEST_F(DBTest2, RowCacheSnapshot) {
@@ -5987,6 +6000,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) {
5987
6000
  // create a DB with block prefix index
5988
6001
  BlockBasedTableOptions table_options;
5989
6002
  Options options = CurrentOptions();
6003
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
5990
6004
 
5991
6005
  // Sometimes filter is checked based on upper bound. Assert counters
5992
6006
  // for that case. Otherwise, only check data correctness.
@@ -2931,6 +2931,29 @@ TEST_F(DBWALTest, RecoveryFlushSwitchWALOnEmptyMemtable) {
2931
2931
  ASSERT_EQ("new_v", Get("k"));
2932
2932
  Destroy(options);
2933
2933
  }
2934
+
2935
+ TEST_F(DBWALTest, WALWriteErrorNoRecovery) {
2936
+ Options options = CurrentOptions();
2937
+ auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
2938
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
2939
+ options.env = fault_fs_env.get();
2940
+ options.manual_wal_flush = true;
2941
+ DestroyAndReopen(options);
2942
+ fault_fs->SetThreadLocalErrorContext(
2943
+ FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
2944
+ true /* retryable */, false /* has_data_loss*/);
2945
+ fault_fs->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
2946
+
2947
+ ASSERT_OK(Put("k", "v"));
2948
+ Status s;
2949
+ s = db_->FlushWAL(false);
2950
+ ASSERT_TRUE(s.IsIOError());
2951
+ s = dbfull()->TEST_GetBGError();
2952
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
2953
+ ASSERT_FALSE(dbfull()->TEST_IsRecoveryInProgress());
2954
+ fault_fs->DisableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
2955
+ Destroy(options);
2956
+ }
2934
2957
  } // namespace ROCKSDB_NAMESPACE
2935
2958
 
2936
2959
  int main(int argc, char** argv) {
@@ -832,6 +832,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
832
832
 
833
833
  TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
834
834
  Options options = CurrentOptions();
835
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
835
836
  options.env = env_;
836
837
  options.create_if_missing = true;
837
838
  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
@@ -1009,6 +1010,7 @@ TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
1009
1010
  TestComparator test_cmp(kTimestampSize);
1010
1011
  options.comparator = &test_cmp;
1011
1012
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
1013
+ options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
1012
1014
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
1013
1015
  DestroyAndReopen(options);
1014
1016
  const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
@@ -381,7 +381,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
381
381
  // BackgroundErrorReason reason) will be called to handle other error cases
382
382
  // such as delegating to SstFileManager to handle no space error.
383
383
  void ErrorHandler::SetBGError(const Status& bg_status,
384
- BackgroundErrorReason reason) {
384
+ BackgroundErrorReason reason, bool wal_related) {
385
385
  db_mutex_->AssertHeld();
386
386
  Status tmp_status = bg_status;
387
387
  IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
@@ -389,8 +389,8 @@ void ErrorHandler::SetBGError(const Status& bg_status,
389
389
  if (bg_io_err.ok()) {
390
390
  return;
391
391
  }
392
- ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
393
- bg_io_err.ToString().c_str());
392
+ ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s, reason %d",
393
+ bg_io_err.ToString().c_str(), static_cast<int>(reason));
394
394
 
395
395
  RecordStats({ERROR_HANDLER_BG_ERROR_COUNT, ERROR_HANDLER_BG_IO_ERROR_COUNT},
396
396
  {} /* int_histograms */);
@@ -412,6 +412,31 @@ void ErrorHandler::SetBGError(const Status& bg_status,
412
412
  recover_context_ = context;
413
413
  return;
414
414
  }
415
+ if (wal_related) {
416
+ assert(reason == BackgroundErrorReason::kWriteCallback ||
417
+ reason == BackgroundErrorReason::kMemTable ||
418
+ reason == BackgroundErrorReason::kFlush);
419
+ }
420
+ if (db_options_.manual_wal_flush && wal_related && bg_io_err.IsIOError()) {
421
+ // With manual_wal_flush, a WAL write failure can drop buffered WAL writes.
422
+ // Memtables and WAL then become inconsistent. A successful memtable flush
423
+ // on one CF can cause CFs to be inconsistent upon restart. Before we fix
424
+ // the bug in auto recovery from WAL write failures that can flush one CF
425
+ // at a time, we set the error severity to fatal to disallow auto recovery.
426
+ // TODO: remove parameter `wal_related` once we can automatically recover
427
+ // from WAL write failures.
428
+ bool auto_recovery = false;
429
+ Status bg_err(new_bg_io_err, Status::Severity::kFatalError);
430
+ CheckAndSetRecoveryAndBGError(bg_err);
431
+ ROCKS_LOG_WARN(db_options_.info_log,
432
+ "ErrorHandler: A potentially WAL error happened, set "
433
+ "background IO error as fatal error\n");
434
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
435
+ &bg_err, db_mutex_, &auto_recovery);
436
+ recover_context_ = context;
437
+ return;
438
+ }
439
+
415
440
  if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
416
441
  (bg_io_err.GetScope() == IOStatus::IOErrorScope::kIOErrorScopeFile ||
417
442
  bg_io_err.GetRetryable())) {
@@ -56,7 +56,8 @@ class ErrorHandler {
56
56
  Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
57
57
  Status::Code code, Status::SubCode subcode);
58
58
 
59
- void SetBGError(const Status& bg_err, BackgroundErrorReason reason);
59
+ void SetBGError(const Status& bg_err, BackgroundErrorReason reason,
60
+ bool wal_related = false);
60
61
 
61
62
  Status GetBGError() const { return bg_error_; }
62
63
 
@@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
124
124
  << "comparator" << table_properties.comparator_name
125
125
  << "user_defined_timestamps_persisted"
126
126
  << table_properties.user_defined_timestamps_persisted
127
+ << "key_largest_seqno" << table_properties.key_largest_seqno
127
128
  << "merge_operator" << table_properties.merge_operator_name
128
129
  << "prefix_extractor_name"
129
130
  << table_properties.prefix_extractor_name << "property_collectors"
@@ -152,6 +152,85 @@ Status UpdateManifestForFilesState(
152
152
  // EXPERIMENTAL new filtering features
153
153
 
154
154
  namespace {
155
+ template <size_t N>
156
+ class SemiStaticCappedKeySegmentsExtractor : public KeySegmentsExtractor {
157
+ public:
158
+ SemiStaticCappedKeySegmentsExtractor(const uint32_t* byte_widths) {
159
+ id_ = kName();
160
+ uint32_t prev_end = 0;
161
+ if constexpr (N > 0) { // Suppress a compiler warning
162
+ for (size_t i = 0; i < N; ++i) {
163
+ prev_end = prev_end + byte_widths[i];
164
+ ideal_ends_[i] = prev_end;
165
+ id_ += std::to_string(byte_widths[i]) + "b";
166
+ }
167
+ }
168
+ }
169
+
170
+ static const char* kName() { return "CappedKeySegmentsExtractor"; }
171
+
172
+ const char* Name() const override { return kName(); }
173
+
174
+ std::string GetId() const override { return id_; }
175
+
176
+ void Extract(const Slice& key_or_bound, KeyKind /*kind*/,
177
+ Result* result) const override {
178
+ // Optimistic assignment
179
+ result->segment_ends.assign(ideal_ends_.begin(), ideal_ends_.end());
180
+ if constexpr (N > 0) { // Suppress a compiler warning
181
+ uint32_t key_size = static_cast<uint32_t>(key_or_bound.size());
182
+ if (key_size < ideal_ends_.back()) {
183
+ // Need to fix up (should be rare)
184
+ for (size_t i = 0; i < N; ++i) {
185
+ result->segment_ends[i] = std::min(key_size, result->segment_ends[i]);
186
+ }
187
+ }
188
+ }
189
+ }
190
+
191
+ private:
192
+ std::array<uint32_t, N> ideal_ends_;
193
+ std::string id_;
194
+ };
195
+
196
+ class DynamicCappedKeySegmentsExtractor : public KeySegmentsExtractor {
197
+ public:
198
+ DynamicCappedKeySegmentsExtractor(const std::vector<uint32_t>& byte_widths) {
199
+ id_ = kName();
200
+ uint32_t prev_end = 0;
201
+ for (size_t i = 0; i < byte_widths.size(); ++i) {
202
+ prev_end = prev_end + byte_widths[i];
203
+ ideal_ends_[i] = prev_end;
204
+ id_ += std::to_string(byte_widths[i]) + "b";
205
+ }
206
+ final_ideal_end_ = prev_end;
207
+ }
208
+
209
+ static const char* kName() { return "CappedKeySegmentsExtractor"; }
210
+
211
+ const char* Name() const override { return kName(); }
212
+
213
+ std::string GetId() const override { return id_; }
214
+
215
+ void Extract(const Slice& key_or_bound, KeyKind /*kind*/,
216
+ Result* result) const override {
217
+ // Optimistic assignment
218
+ result->segment_ends = ideal_ends_;
219
+ uint32_t key_size = static_cast<uint32_t>(key_or_bound.size());
220
+ if (key_size < final_ideal_end_) {
221
+ // Need to fix up (should be rare)
222
+ for (size_t i = 0; i < ideal_ends_.size(); ++i) {
223
+ result->segment_ends[i] = std::min(key_size, result->segment_ends[i]);
224
+ }
225
+ }
226
+ }
227
+
228
+ private:
229
+ std::vector<uint32_t> ideal_ends_;
230
+ uint32_t final_ideal_end_;
231
+ std::string id_;
232
+ };
233
+
155
234
  void GetFilterInput(FilterInput select, const Slice& key,
156
235
  const KeySegmentsExtractor::Result& extracted,
157
236
  Slice* out_input, Slice* out_leadup) {
@@ -211,12 +290,6 @@ void GetFilterInput(FilterInput select, const Slice& key,
211
290
  assert(false);
212
291
  return Slice();
213
292
  }
214
-
215
- Slice operator()(SelectValue) {
216
- // TODO
217
- assert(false);
218
- return Slice();
219
- }
220
293
  };
221
294
 
222
295
  Slice input = std::visit(FilterInputGetter(key, extracted), select);
@@ -256,9 +329,6 @@ const char* DeserializeFilterInput(const char* p, const char* limit,
256
329
  case 3:
257
330
  *out = SelectColumnName{};
258
331
  return p;
259
- case 4:
260
- *out = SelectValue{};
261
- return p;
262
332
  default:
263
333
  // Reserved for future use
264
334
  return nullptr;
@@ -315,7 +385,6 @@ void SerializeFilterInput(std::string* out, const FilterInput& select) {
315
385
  void operator()(SelectLegacyKeyPrefix) { out->push_back(1); }
316
386
  void operator()(SelectUserTimestamp) { out->push_back(2); }
317
387
  void operator()(SelectColumnName) { out->push_back(3); }
318
- void operator()(SelectValue) { out->push_back(4); }
319
388
  void operator()(SelectKeySegment select) {
320
389
  // TODO: expand supported cases
321
390
  assert(select.segment_index < 16);
@@ -372,6 +441,7 @@ enum BuiltinSstQueryFilters : char {
372
441
  // and filtered independently because it might be a special case that is
373
442
  // not representative of the minimum in a spread of values.
374
443
  kBytewiseMinMaxFilter = 0x10,
444
+ kRevBytewiseMinMaxFilter = 0x11,
375
445
  };
376
446
 
377
447
  class SstQueryFilterBuilder {
@@ -459,7 +529,10 @@ class CategoryScopeFilterWrapperBuilder : public SstQueryFilterBuilder {
459
529
 
460
530
  class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
461
531
  public:
462
- using SstQueryFilterConfigImpl::SstQueryFilterConfigImpl;
532
+ explicit BytewiseMinMaxSstQueryFilterConfig(
533
+ const FilterInput& input,
534
+ const KeySegmentsExtractor::KeyCategorySet& categories, bool reverse)
535
+ : SstQueryFilterConfigImpl(input, categories), reverse_(reverse) {}
463
536
 
464
537
  std::unique_ptr<SstQueryFilterBuilder> NewBuilder(
465
538
  bool sanity_checks) const override {
@@ -477,11 +550,13 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
477
550
  const KeySegmentsExtractor::Result& lower_bound_extracted,
478
551
  const Slice& upper_bound_excl,
479
552
  const KeySegmentsExtractor::Result& upper_bound_extracted) {
480
- assert(!filter.empty() && filter[0] == kBytewiseMinMaxFilter);
553
+ assert(!filter.empty() && (filter[0] == kBytewiseMinMaxFilter ||
554
+ filter[0] == kRevBytewiseMinMaxFilter));
481
555
  if (filter.size() <= 4) {
482
556
  // Missing some data
483
557
  return true;
484
558
  }
559
+ bool reverse = (filter[0] == kRevBytewiseMinMaxFilter);
485
560
  bool empty_included = (filter[1] & kEmptySeenFlag) != 0;
486
561
  const char* p = filter.data() + 2;
487
562
  const char* limit = filter.data() + filter.size();
@@ -528,8 +603,13 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
528
603
 
529
604
  // May match if both the upper bound and lower bound indicate there could
530
605
  // be overlap
531
- return upper_bound_input.compare(smallest) >= 0 &&
532
- lower_bound_input.compare(largest) <= 0;
606
+ if (reverse) {
607
+ return upper_bound_input.compare(smallest) <= 0 &&
608
+ lower_bound_input.compare(largest) >= 0;
609
+ } else {
610
+ return upper_bound_input.compare(smallest) >= 0 &&
611
+ lower_bound_input.compare(largest) <= 0;
612
+ }
533
613
  }
534
614
 
535
615
  protected:
@@ -551,19 +631,11 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
551
631
  &prev_leadup);
552
632
 
553
633
  int compare = prev_leadup.compare(leadup);
554
- if (compare > 0) {
555
- status = Status::Corruption(
556
- "Ordering invariant violated from 0x" +
557
- prev_key->ToString(/*hex=*/true) + " with prefix 0x" +
558
- prev_leadup.ToString(/*hex=*/true) + " to 0x" +
559
- key.ToString(/*hex=*/true) + " with prefix 0x" +
560
- leadup.ToString(/*hex=*/true));
561
- return;
562
- } else if (compare == 0) {
634
+ if (compare == 0) {
563
635
  // On the same prefix leading up to the segment, the segments must
564
636
  // not be out of order.
565
637
  compare = prev_input.compare(input);
566
- if (compare > 0) {
638
+ if (parent.reverse_ ? compare < 0 : compare > 0) {
567
639
  status = Status::Corruption(
568
640
  "Ordering invariant violated from 0x" +
569
641
  prev_key->ToString(/*hex=*/true) + " with segment 0x" +
@@ -573,6 +645,9 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
573
645
  return;
574
646
  }
575
647
  }
648
+ // NOTE: it is not strictly required that the leadup be ordered, just
649
+ // satisfy the "common segment prefix property" which would be
650
+ // expensive to check
576
651
  }
577
652
 
578
653
  // Now actually update state for the filter inputs
@@ -598,7 +673,8 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
598
673
  return 0;
599
674
  }
600
675
  return 2 + GetFilterInputSerializedLength(parent.input_) +
601
- VarintLength(smallest.size()) + smallest.size() + largest.size();
676
+ VarintLength(parent.reverse_ ? largest.size() : smallest.size()) +
677
+ smallest.size() + largest.size();
602
678
  }
603
679
 
604
680
  void Finish(std::string& append_to) override {
@@ -610,23 +686,27 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
610
686
  }
611
687
  size_t old_append_to_size = append_to.size();
612
688
  append_to.reserve(old_append_to_size + encoded_length);
613
- append_to.push_back(kBytewiseMinMaxFilter);
689
+ append_to.push_back(parent.reverse_ ? kRevBytewiseMinMaxFilter
690
+ : kBytewiseMinMaxFilter);
614
691
 
615
692
  append_to.push_back(empty_seen ? kEmptySeenFlag : 0);
616
693
 
617
694
  SerializeFilterInput(&append_to, parent.input_);
618
695
 
619
- PutVarint32(&append_to, static_cast<uint32_t>(smallest.size()));
620
- append_to.append(smallest);
621
- // The end of `largest` is given by the end of the filter
622
- append_to.append(largest);
696
+ auto& minv = parent.reverse_ ? largest : smallest;
697
+ auto& maxv = parent.reverse_ ? smallest : largest;
698
+ PutVarint32(&append_to, static_cast<uint32_t>(minv.size()));
699
+ append_to.append(minv);
700
+ // The end of `maxv` is given by the end of the filter
701
+ append_to.append(maxv);
623
702
  assert(append_to.size() == old_append_to_size + encoded_length);
624
703
  }
625
704
 
626
705
  const BytewiseMinMaxSstQueryFilterConfig& parent;
627
706
  const bool sanity_checks;
628
707
  // Smallest and largest segment seen, excluding the empty segment which
629
- // is tracked separately
708
+ // is tracked separately. "Reverse" from parent is only applied at
709
+ // serialization time, for efficiency.
630
710
  std::string smallest;
631
711
  std::string largest;
632
712
  bool empty_seen = false;
@@ -635,6 +715,8 @@ class BytewiseMinMaxSstQueryFilterConfig : public SstQueryFilterConfigImpl {
635
715
  Status status;
636
716
  };
637
717
 
718
+ bool reverse_;
719
+
638
720
  private:
639
721
  static constexpr char kEmptySeenFlag = 0x1;
640
722
  };
@@ -1036,6 +1118,7 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
1036
1118
  may_match = MayMatch_CategoryScopeFilterWrapper(filter, *state);
1037
1119
  break;
1038
1120
  case kBytewiseMinMaxFilter:
1121
+ case kRevBytewiseMinMaxFilter:
1039
1122
  if (state == nullptr) {
1040
1123
  // TODO? Report problem
1041
1124
  // No filtering
@@ -1189,14 +1272,63 @@ const std::string SstQueryFilterConfigsManagerImpl::kTablePropertyName =
1189
1272
  "rocksdb.sqfc";
1190
1273
  } // namespace
1191
1274
 
1275
+ std::shared_ptr<const KeySegmentsExtractor>
1276
+ MakeSharedCappedKeySegmentsExtractor(const std::vector<size_t>& byte_widths) {
1277
+ std::vector<uint32_t> byte_widths_checked;
1278
+ byte_widths_checked.resize(byte_widths.size());
1279
+ size_t final_end = 0;
1280
+ for (size_t i = 0; i < byte_widths.size(); ++i) {
1281
+ final_end += byte_widths[i];
1282
+ if (byte_widths[i] > UINT32_MAX / 2 || final_end > UINT32_MAX) {
1283
+ // Better to crash than to proceed unsafely
1284
+ return nullptr;
1285
+ }
1286
+ byte_widths_checked[i] = static_cast<uint32_t>(byte_widths[i]);
1287
+ }
1288
+
1289
+ switch (byte_widths_checked.size()) {
1290
+ case 0:
1291
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<0>>(
1292
+ byte_widths_checked.data());
1293
+ case 1:
1294
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<1>>(
1295
+ byte_widths_checked.data());
1296
+ case 2:
1297
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<2>>(
1298
+ byte_widths_checked.data());
1299
+ case 3:
1300
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<3>>(
1301
+ byte_widths_checked.data());
1302
+ case 4:
1303
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<4>>(
1304
+ byte_widths_checked.data());
1305
+ case 5:
1306
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<5>>(
1307
+ byte_widths_checked.data());
1308
+ case 6:
1309
+ return std::make_shared<SemiStaticCappedKeySegmentsExtractor<6>>(
1310
+ byte_widths_checked.data());
1311
+ default:
1312
+ return std::make_shared<DynamicCappedKeySegmentsExtractor>(
1313
+ byte_widths_checked);
1314
+ }
1315
+ }
1316
+
1192
1317
  bool SstQueryFilterConfigs::IsEmptyNotFound() const {
1193
1318
  return this == &kEmptyNotFoundSQFC;
1194
1319
  }
1195
1320
 
1196
1321
  std::shared_ptr<SstQueryFilterConfig> MakeSharedBytewiseMinMaxSQFC(
1197
1322
  FilterInput input, KeySegmentsExtractor::KeyCategorySet categories) {
1198
- return std::make_shared<BytewiseMinMaxSstQueryFilterConfig>(input,
1199
- categories);
1323
+ return std::make_shared<BytewiseMinMaxSstQueryFilterConfig>(
1324
+ input, categories,
1325
+ /*reverse=*/false);
1326
+ }
1327
+
1328
+ std::shared_ptr<SstQueryFilterConfig> MakeSharedReverseBytewiseMinMaxSQFC(
1329
+ FilterInput input, KeySegmentsExtractor::KeyCategorySet categories) {
1330
+ return std::make_shared<BytewiseMinMaxSstQueryFilterConfig>(input, categories,
1331
+ /*reverse=*/true);
1200
1332
  }
1201
1333
 
1202
1334
  Status SstQueryFilterConfigsManager::MakeShared(
@@ -113,8 +113,7 @@ Status ExternalSstFileIngestionJob::Prepare(
113
113
  const std::string path_outside_db = f.external_file_path;
114
114
  const std::string path_inside_db = TableFileName(
115
115
  cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
116
- if (ingestion_options_.move_files) {
117
- assert(!ingestion_options_.allow_db_generated_files);
116
+ if (ingestion_options_.move_files || ingestion_options_.link_files) {
118
117
  status =
119
118
  fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
120
119
  if (status.ok()) {
@@ -914,9 +913,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
914
913
  } else if (!iter->status().ok()) {
915
914
  return iter->status();
916
915
  }
917
- if (ingestion_options_.allow_db_generated_files) {
918
- // Verify that all keys have seqno zero.
919
- // TODO: store largest seqno in table property and validate it instead.
916
+ SequenceNumber largest_seqno =
917
+ table_reader.get()->GetTableProperties()->key_largest_seqno;
918
+ // UINT64_MAX means unknown and the file is generated before table property
919
+ // `key_largest_seqno` is introduced.
920
+ if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
921
+ return Status::Corruption(
922
+ "External file has non zero largest sequence number " +
923
+ std::to_string(largest_seqno));
924
+ }
925
+ if (ingestion_options_.allow_db_generated_files &&
926
+ largest_seqno == UINT64_MAX) {
927
+ // Need to verify that all keys have seqno zero.
920
928
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
921
929
  Status pik_status =
922
930
  ParseInternalKey(iter->key(), &key, allow_data_in_errors);