@nxtedition/rocksdb 7.0.27 → 7.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/binding.cc +170 -30
  2. package/chained-batch.js +1 -1
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -0
  4. package/deps/rocksdb/rocksdb/Makefile +3 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +10 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +17 -7
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  9. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +117 -0
  10. package/deps/rocksdb/rocksdb/cache/charged_cache.h +121 -0
  11. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +270 -180
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.h +412 -124
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +1 -0
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +1 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +2 -2
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +71 -9
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +11 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +21 -14
  21. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +68 -7
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +16 -0
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +519 -12
  24. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +120 -0
  25. package/deps/rocksdb/rocksdb/db/builder.cc +15 -5
  26. package/deps/rocksdb/rocksdb/db/builder.h +3 -0
  27. package/deps/rocksdb/rocksdb/db/c.cc +18 -0
  28. package/deps/rocksdb/rocksdb/db/c_test.c +18 -0
  29. package/deps/rocksdb/rocksdb/db/column_family.h +2 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +3 -2
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +9 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +15 -10
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +36 -34
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +50 -13
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +12 -0
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +13 -17
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +26 -9
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +0 -11
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +93 -0
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +3 -8
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +17 -5
  46. package/deps/rocksdb/rocksdb/db/db_test.cc +0 -3
  47. package/deps/rocksdb/rocksdb/db/db_test2.cc +39 -12
  48. package/deps/rocksdb/rocksdb/db/db_test_util.cc +9 -0
  49. package/deps/rocksdb/rocksdb/db/db_test_util.h +2 -0
  50. package/deps/rocksdb/rocksdb/db/dbformat.cc +0 -38
  51. package/deps/rocksdb/rocksdb/db/dbformat.h +14 -13
  52. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +5 -2
  53. package/deps/rocksdb/rocksdb/db/event_helpers.cc +13 -1
  54. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +0 -10
  55. package/deps/rocksdb/rocksdb/db/flush_job.cc +19 -15
  56. package/deps/rocksdb/rocksdb/db/flush_job.h +7 -0
  57. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +21 -15
  58. package/deps/rocksdb/rocksdb/db/forward_iterator.h +4 -3
  59. package/deps/rocksdb/rocksdb/db/memtable_list.cc +9 -0
  60. package/deps/rocksdb/rocksdb/db/memtable_list.h +5 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +53 -12
  62. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +14 -2
  63. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +10 -10
  64. package/deps/rocksdb/rocksdb/db/repair.cc +8 -6
  65. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +890 -0
  66. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +324 -0
  67. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +186 -0
  68. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -0
  69. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -4
  70. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -2
  71. package/deps/rocksdb/rocksdb/env/env_test.cc +74 -1
  72. package/deps/rocksdb/rocksdb/env/io_posix.cc +11 -8
  73. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +28 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +14 -1
  75. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +4 -4
  76. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +30 -23
  77. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +1 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +3 -13
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  80. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +1 -2
  81. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  83. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  84. package/deps/rocksdb/rocksdb/options/cf_options.cc +14 -1
  85. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  86. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -56
  87. package/deps/rocksdb/rocksdb/options/db_options.cc +4 -5
  88. package/deps/rocksdb/rocksdb/options/options.cc +11 -1
  89. package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -0
  90. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  91. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -0
  92. package/deps/rocksdb/rocksdb/options/options_test.cc +4 -0
  93. package/deps/rocksdb/rocksdb/src.mk +3 -0
  94. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +6 -1
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +4 -0
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +36 -3
  97. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +36 -1
  98. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +14 -3
  99. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  100. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +6 -0
  101. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +5 -0
  102. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +3 -0
  103. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -7
  104. package/deps/rocksdb/rocksdb/table/table_builder.h +7 -3
  105. package/deps/rocksdb/rocksdb/table/table_properties.cc +9 -0
  106. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +3 -2
  107. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +58 -30
  108. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +1 -0
  109. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +20 -0
  110. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +29 -154
  111. package/deps/rocksdb/rocksdb/util/rate_limiter.h +16 -34
  112. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +0 -92
  113. package/deps/rocksdb/rocksdb/util/timer.h +6 -0
  114. package/deps/rocksdb/rocksdb/util/vector_iterator.h +4 -3
  115. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -45
  116. package/deps/rocksdb/rocksdb/utilities/debug.cc +40 -0
  117. package/deps/rocksdb/rocksdb.gyp +2 -0
  118. package/index.js +19 -2
  119. package/package.json +1 -1
  120. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  121. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -53,26 +53,17 @@ class TieredCompactionTest : public DBTestBase {
53
53
  InternalStats::CompactionOutputsStats kBasicPerLevelStats;
54
54
  InternalStats::CompactionStats kBasicFlushStats;
55
55
 
56
+ std::atomic_bool enable_per_key_placement = true;
57
+
56
58
  void SetUp() override {
57
59
  SyncPoint::GetInstance()->SetCallBack(
58
60
  "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
59
61
  auto supports_per_key_placement = static_cast<bool*>(arg);
60
- *supports_per_key_placement = true;
62
+ *supports_per_key_placement = enable_per_key_placement;
61
63
  });
62
64
  SyncPoint::GetInstance()->EnableProcessing();
63
65
  }
64
66
 
65
- #ifndef ROCKSDB_LITE
66
- uint64_t GetSstSizeHelper(Temperature temperature) {
67
- std::string prop;
68
- EXPECT_TRUE(dbfull()->GetProperty(
69
- DB::Properties::kLiveSstFilesSizeAtTemperature +
70
- std::to_string(static_cast<uint8_t>(temperature)),
71
- &prop));
72
- return static_cast<uint64_t>(std::atoi(prop.c_str()));
73
- }
74
- #endif // ROCKSDB_LITE
75
-
76
67
  const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
77
68
  VersionSet* const versions = dbfull()->GetVersionSet();
78
69
  assert(versions);
@@ -1054,12 +1045,14 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
1054
1045
  ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
1055
1046
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
1056
1047
 
1048
+ latest_cold_seq = seq_history[2];
1049
+
1057
1050
  MoveFilesToLevel(kLastLevel);
1058
1051
 
1059
1052
  // move forward the cold_seq again with range delete, take a snapshot to keep
1060
1053
  // the range dels in bottommost
1061
1054
  auto snap = db_->GetSnapshot();
1062
- latest_cold_seq = seq_history[2];
1055
+
1063
1056
  std::string start = Key(25), end = Key(35);
1064
1057
  ASSERT_OK(
1065
1058
  db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
@@ -1104,9 +1097,12 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
1104
1097
 
1105
1098
  db_->ReleaseSnapshot(snap);
1106
1099
 
1100
+ // TODO: it should push the data to last level, but penultimate level file is
1101
+ // already bottommost, it's a conflict between bottommost_temperature and
1102
+ // tiered compaction which only applies to last level compaction.
1107
1103
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
1108
- ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
1109
- ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
1104
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
1105
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
1110
1106
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
1111
1107
 
1112
1108
  // 3 range dels dropped, the first one is double counted as expected, which is
@@ -1123,8 +1119,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
1123
1119
  // input range
1124
1120
  latest_cold_seq = seq_history[1];
1125
1121
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
1126
- ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
1127
- ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
1122
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
1123
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
1128
1124
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
1129
1125
  }
1130
1126
 
@@ -2618,13 +2618,20 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
2618
2618
  ASSERT_OK(Delete(std::to_string(i)));
2619
2619
  }
2620
2620
  std::vector<KeyVersion> key_versions;
2621
- ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
2622
- db_, Slice(), Slice(), std::numeric_limits<size_t>::max(),
2623
- &key_versions));
2621
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
2622
+ std::numeric_limits<size_t>::max(),
2623
+ &key_versions));
2624
2624
  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
2625
- ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
2626
- db_, handles_[0], Slice(), Slice(), std::numeric_limits<size_t>::max(),
2627
- &key_versions));
2625
+ for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
2626
+ if (i % 3 == 0) {
2627
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
2628
+ } else {
2629
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
2630
+ }
2631
+ }
2632
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
2633
+ std::numeric_limits<size_t>::max(),
2634
+ &key_versions));
2628
2635
  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
2629
2636
 
2630
2637
  // Check non-default column family
@@ -2637,11 +2644,21 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
2637
2644
  for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
2638
2645
  ASSERT_OK(Delete(1, std::to_string(i)));
2639
2646
  }
2640
- ASSERT_OK(ROCKSDB_NAMESPACE::GetAllKeyVersions(
2641
- db_, handles_[1], Slice(), Slice(), std::numeric_limits<size_t>::max(),
2642
- &key_versions));
2647
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
2648
+ std::numeric_limits<size_t>::max(),
2649
+ &key_versions));
2643
2650
  ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
2644
2651
  }
2652
+
2653
+ TEST_F(DBBasicTest, ValueTypeString) {
2654
+ KeyVersion key_version;
2655
+ // when adding a new type, please also update `value_type_string_map`
2656
+ for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid;
2657
+ i++) {
2658
+ key_version.type = i;
2659
+ ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
2660
+ }
2661
+ }
2645
2662
  #endif // !ROCKSDB_LITE
2646
2663
 
2647
2664
  TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
@@ -78,17 +78,6 @@ class DBCompactionTest : public DBTestBase {
78
78
  : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
79
79
 
80
80
  protected:
81
- #ifndef ROCKSDB_LITE
82
- uint64_t GetSstSizeHelper(Temperature temperature) {
83
- std::string prop;
84
- EXPECT_TRUE(dbfull()->GetProperty(
85
- DB::Properties::kLiveSstFilesSizeAtTemperature +
86
- std::to_string(static_cast<uint8_t>(temperature)),
87
- &prop));
88
- return static_cast<uint64_t>(std::atoi(prop.c_str()));
89
- }
90
- #endif // ROCKSDB_LITE
91
-
92
81
  /*
93
82
  * Verifies compaction stats of cfd are valid.
94
83
  *
@@ -480,6 +480,7 @@ void DBImpl::CancelAllBackgroundWork(bool wait) {
480
480
  #ifndef ROCKSDB_LITE
481
481
  if (periodic_work_scheduler_ != nullptr) {
482
482
  periodic_work_scheduler_->Unregister(this);
483
+ periodic_work_scheduler_->UnregisterRecordSeqnoTimeWorker(this);
483
484
  }
484
485
  #endif // !ROCKSDB_LITE
485
486
 
@@ -791,6 +792,66 @@ Status DBImpl::StartPeriodicWorkScheduler() {
791
792
  #endif // !ROCKSDB_LITE
792
793
  }
793
794
 
795
+ Status DBImpl::RegisterRecordSeqnoTimeWorker() {
796
+ #ifndef ROCKSDB_LITE
797
+ if (!periodic_work_scheduler_) {
798
+ return Status::OK();
799
+ }
800
+ uint64_t min_time_duration = std::numeric_limits<uint64_t>::max();
801
+ uint64_t max_time_duration = std::numeric_limits<uint64_t>::min();
802
+ {
803
+ InstrumentedMutexLock l(&mutex_);
804
+
805
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
806
+ uint64_t preclude_last_option =
807
+ cfd->ioptions()->preclude_last_level_data_seconds;
808
+ if (!cfd->IsDropped() && preclude_last_option > 0) {
809
+ min_time_duration = std::min(preclude_last_option, min_time_duration);
810
+ max_time_duration = std::max(preclude_last_option, max_time_duration);
811
+ }
812
+ }
813
+ if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
814
+ seqno_time_mapping_.Resize(0, 0);
815
+ } else {
816
+ seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
817
+ }
818
+ }
819
+
820
+ uint64_t seqno_time_cadence = 0;
821
+ if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
822
+ seqno_time_cadence =
823
+ min_time_duration / SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
824
+ }
825
+
826
+ Status s;
827
+ if (seqno_time_cadence != record_seqno_time_cadence_) {
828
+ if (seqno_time_cadence == 0) {
829
+ periodic_work_scheduler_->UnregisterRecordSeqnoTimeWorker(this);
830
+ } else {
831
+ s = periodic_work_scheduler_->RegisterRecordSeqnoTimeWorker(
832
+ this, seqno_time_cadence);
833
+ }
834
+
835
+ if (s.ok()) {
836
+ record_seqno_time_cadence_ = seqno_time_cadence;
837
+ }
838
+
839
+ if (s.IsNotSupported()) {
840
+ // TODO: Fix the timer, which cannot cancel and re-add the same task
841
+ ROCKS_LOG_WARN(
842
+ immutable_db_options_.info_log,
843
+ "Updating seqno to time worker cadence is not supported yet, to make "
844
+ "the change effective, please reopen the DB instance.");
845
+ s = Status::OK();
846
+ }
847
+ }
848
+
849
+ return s;
850
+ #else
851
+ return Status::OK();
852
+ #endif // !ROCKSDB_LITE
853
+ }
854
+
794
855
  // estimate the total size of stats_history_
795
856
  size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
796
857
  size_t size_total =
@@ -2805,6 +2866,14 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
2805
2866
  }
2806
2867
  } // InstrumentedMutexLock l(&mutex_)
2807
2868
 
2869
+ if (cf_options.preclude_last_level_data_seconds > 0) {
2870
+ // TODO(zjay): Fix the timer issue and re-enable this.
2871
+ ROCKS_LOG_ERROR(
2872
+ immutable_db_options_.info_log,
2873
+ "Creating column family with `preclude_last_level_data_seconds` needs "
2874
+ "to restart DB to take effect");
2875
+ // s = RegisterRecordSeqnoTimeWorker();
2876
+ }
2808
2877
  sv_context.Clean();
2809
2878
  // this is outside the mutex
2810
2879
  if (s.ok()) {
@@ -2893,6 +2962,10 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
2893
2962
  bg_cv_.SignalAll();
2894
2963
  }
2895
2964
 
2965
+ if (cfd->ioptions()->preclude_last_level_data_seconds > 0) {
2966
+ s = RegisterRecordSeqnoTimeWorker();
2967
+ }
2968
+
2896
2969
  if (s.ok()) {
2897
2970
  // Note that here we erase the associated cf_info of the to-be-dropped
2898
2971
  // cfd before its ref-count goes to zero to avoid having to erase cf_info
@@ -5536,6 +5609,26 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
5536
5609
  return Status::NotSupported("This API only works if max_open_files = -1");
5537
5610
  }
5538
5611
  }
5612
+
5613
+ void DBImpl::RecordSeqnoToTimeMapping() {
5614
+ // Get time first then sequence number, so the actual time of seqno is <=
5615
+ // unix_time recorded
5616
+ int64_t unix_time = 0;
5617
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
5618
+ .PermitUncheckedError(); // Ignore error
5619
+ SequenceNumber seqno = GetLatestSequenceNumber();
5620
+ bool appended = false;
5621
+ {
5622
+ InstrumentedMutexLock l(&mutex_);
5623
+ appended = seqno_time_mapping_.Append(seqno, unix_time);
5624
+ }
5625
+ if (!appended) {
5626
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
5627
+ "Failed to insert sequence number to time entry: %" PRIu64
5628
+ " -> %" PRIu64,
5629
+ seqno, unix_time);
5630
+ }
5631
+ }
5539
5632
  #endif // ROCKSDB_LITE
5540
5633
 
5541
5634
  } // namespace ROCKSDB_NAMESPACE
@@ -36,6 +36,7 @@
36
36
  #include "db/pre_release_callback.h"
37
37
  #include "db/range_del_aggregator.h"
38
38
  #include "db/read_callback.h"
39
+ #include "db/seqno_to_time_mapping.h"
39
40
  #include "db/snapshot_checker.h"
40
41
  #include "db/snapshot_impl.h"
41
42
  #include "db/trim_history_scheduler.h"
@@ -1158,7 +1159,8 @@ class DBImpl : public DB {
1158
1159
  int TEST_BGCompactionsAllowed() const;
1159
1160
  int TEST_BGFlushesAllowed() const;
1160
1161
  size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
1161
- void TEST_WaitForStatsDumpRun(std::function<void()> callback) const;
1162
+ void TEST_WaitForPeridicWorkerRun(std::function<void()> callback) const;
1163
+ SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
1162
1164
  size_t TEST_EstimateInMemoryStatsHistorySize() const;
1163
1165
 
1164
1166
  uint64_t TEST_GetCurrentLogNumber() const {
@@ -1186,6 +1188,9 @@ class DBImpl : public DB {
1186
1188
  // flush LOG out of application buffer
1187
1189
  void FlushInfoLog();
1188
1190
 
1191
+ // record current sequence number to time mapping
1192
+ void RecordSeqnoToTimeMapping();
1193
+
1189
1194
  // Interface to block and signal the DB in case of stalling writes by
1190
1195
  // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
1191
1196
  // When DB needs to be blocked or signalled by WriteBufferManager,
@@ -2069,6 +2074,8 @@ class DBImpl : public DB {
2069
2074
  // Schedule background tasks
2070
2075
  Status StartPeriodicWorkScheduler();
2071
2076
 
2077
+ Status RegisterRecordSeqnoTimeWorker();
2078
+
2072
2079
  void PrintStatistics();
2073
2080
 
2074
2081
  size_t EstimateInMemoryStatsHistorySize() const;
@@ -2537,6 +2544,10 @@ class DBImpl : public DB {
2537
2544
  // PeriodicWorkScheduler::Default(). Only in unittest, it can be overridden by
2538
2545
  // PeriodicWorkTestScheduler.
2539
2546
  PeriodicWorkScheduler* periodic_work_scheduler_;
2547
+
2548
+ // Current cadence of the periodic worker for recording sequence number to
2549
+ // time.
2550
+ uint64_t record_seqno_time_cadence_ = 0;
2540
2551
  #endif
2541
2552
 
2542
2553
  // When set, we use a separate queue for writes that don't write to memtable.
@@ -2586,6 +2597,10 @@ class DBImpl : public DB {
2586
2597
 
2587
2598
  // Pointer to WriteBufferManager stalling interface.
2588
2599
  std::unique_ptr<StallInterface> wbm_stall_;
2600
+
2601
+ // seqno_time_mapping_ stores the sequence number to time mapping, it's not
2602
+ // thread safe, both reads and writes require the DB mutex to be held.
2603
+ SeqnoToTimeMapping seqno_time_mapping_;
2589
2604
  };
2590
2605
 
2591
2606
  class GetWithTimestampReadCallback : public ReadCallback {
@@ -212,8 +212,8 @@ Status DBImpl::FlushMemTableToOutputFile(
212
212
  GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
213
213
  &event_logger_, mutable_cf_options.report_bg_io_stats,
214
214
  true /* sync_output_directory */, true /* write_manifest */, thread_pri,
215
- io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(),
216
- &blob_callback_);
215
+ io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
216
+ cfd->GetFullHistoryTsLow(), &blob_callback_);
217
217
  FileMetaData file_meta;
218
218
 
219
219
  Status s;
@@ -450,7 +450,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
450
450
  data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
451
451
  stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
452
452
  false /* sync_output_directory */, false /* write_manifest */,
453
- thread_pri, io_tracer_, db_id_, db_session_id_,
453
+ thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
454
454
  cfd->GetFullHistoryTsLow(), &blob_callback_));
455
455
  }
456
456
 
@@ -3802,11 +3802,6 @@ void DBImpl::GetSnapshotContext(
3802
3802
 
3803
3803
  Status DBImpl::WaitForCompact(bool wait_unscheduled) {
3804
3804
  // Wait until the compaction completes
3805
-
3806
- // TODO: a bug here. This function actually does not necessarily
3807
- // wait for compact. It actually waits for scheduled compaction
3808
- // OR flush to finish.
3809
-
3810
3805
  InstrumentedMutexLock l(&mutex_);
3811
3806
  while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
3812
3807
  bg_flush_scheduled_ ||
@@ -302,7 +302,8 @@ size_t DBImpl::TEST_GetWalPreallocateBlockSize(
302
302
  }
303
303
 
304
304
  #ifndef ROCKSDB_LITE
305
- void DBImpl::TEST_WaitForStatsDumpRun(std::function<void()> callback) const {
305
+ void DBImpl::TEST_WaitForPeridicWorkerRun(
306
+ std::function<void()> callback) const {
306
307
  if (periodic_work_scheduler_ != nullptr) {
307
308
  static_cast<PeriodicWorkTestScheduler*>(periodic_work_scheduler_)
308
309
  ->TEST_WaitForRun(callback);
@@ -312,6 +313,12 @@ void DBImpl::TEST_WaitForStatsDumpRun(std::function<void()> callback) const {
312
313
  PeriodicWorkTestScheduler* DBImpl::TEST_GetPeriodicWorkScheduler() const {
313
314
  return static_cast<PeriodicWorkTestScheduler*>(periodic_work_scheduler_);
314
315
  }
316
+
317
+ SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
318
+ InstrumentedMutexLock l(&mutex_);
319
+ return seqno_time_mapping_;
320
+ }
321
+
315
322
  #endif // !ROCKSDB_LITE
316
323
 
317
324
  size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
@@ -718,6 +718,12 @@ Status DBImpl::VerifySstUniqueIdInManifest() {
718
718
  status = version->VerifySstUniqueIds();
719
719
  mutex_.Lock();
720
720
  version->Unref();
721
+ if (!status.ok()) {
722
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
723
+ "SST unique id mismatch in column family \"%s\": %s",
724
+ cfd->GetName().c_str(), status.ToString().c_str());
725
+ return status;
726
+ }
721
727
  }
722
728
  }
723
729
  return status;
@@ -1554,17 +1560,19 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
1554
1560
  GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
1555
1561
  mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
1556
1562
  0 /* level */, false /* is_bottommost */,
1557
- TableFileCreationReason::kRecovery, current_time,
1558
- 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_,
1559
- db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber());
1563
+ TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
1564
+ 0 /* file_creation_time */, db_id_, db_session_id_,
1565
+ 0 /* target_file_size */, meta.fd.GetNumber());
1566
+ SeqnoToTimeMapping empty_seqno_time_mapping;
1560
1567
  s = BuildTable(
1561
1568
  dbname_, versions_.get(), immutable_db_options_, tboptions,
1562
1569
  file_options_for_compaction_, cfd->table_cache(), iter.get(),
1563
1570
  std::move(range_del_iters), &meta, &blob_file_additions,
1564
1571
  snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber,
1565
1572
  snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s,
1566
- io_tracer_, BlobFileCreationReason::kRecovery, &event_logger_, job_id,
1567
- Env::IO_HIGH, nullptr /* table_properties */, write_hint,
1573
+ io_tracer_, BlobFileCreationReason::kRecovery,
1574
+ empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
1575
+ nullptr /* table_properties */, write_hint,
1568
1576
  nullptr /*full_history_ts_low*/, &blob_callback_);
1569
1577
  LogFlush(immutable_db_options_.info_log);
1570
1578
  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
@@ -2106,6 +2114,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
2106
2114
  if (s.ok()) {
2107
2115
  s = impl->StartPeriodicWorkScheduler();
2108
2116
  }
2117
+
2118
+ if (s.ok()) {
2119
+ s = impl->RegisterRecordSeqnoTimeWorker();
2120
+ }
2109
2121
  if (!s.ok()) {
2110
2122
  for (auto* h : *handles) {
2111
2123
  delete h;
@@ -4101,9 +4101,6 @@ class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
4101
4101
 
4102
4102
  ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
4103
4103
 
4104
- const char* Name() const override {
4105
- return "MockedRateLimiterWithNoOptionalAPI";
4106
- }
4107
4104
  void SetBytesPerSecond(int64_t bytes_per_second) override {
4108
4105
  (void)bytes_per_second;
4109
4106
  }
@@ -33,18 +33,6 @@ namespace ROCKSDB_NAMESPACE {
33
33
  class DBTest2 : public DBTestBase {
34
34
  public:
35
35
  DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
36
-
37
- protected:
38
- #ifndef ROCKSDB_LITE
39
- uint64_t GetSstSizeHelper(Temperature temperature) {
40
- std::string prop;
41
- EXPECT_TRUE(dbfull()->GetProperty(
42
- DB::Properties::kLiveSstFilesSizeAtTemperature +
43
- std::to_string(static_cast<uint8_t>(temperature)),
44
- &prop));
45
- return static_cast<uint64_t>(std::atoi(prop.c_str()));
46
- }
47
- #endif // ROCKSDB_LITE
48
36
  };
49
37
 
50
38
  #ifndef ROCKSDB_LITE
@@ -7436,6 +7424,45 @@ TEST_F(DBTest2, SstUniqueIdVerify) {
7436
7424
  ASSERT_TRUE(s.IsCorruption());
7437
7425
  }
7438
7426
 
7427
+ TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
7428
+ const int kNumSst = 3;
7429
+ const int kLevel0Trigger = 4;
7430
+ auto options = CurrentOptions();
7431
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
7432
+
7433
+ CreateAndReopenWithCF({"one", "two"}, options);
7434
+
7435
+ // generate good SSTs
7436
+ for (int cf_num : {0, 2}) {
7437
+ for (int i = 0; i < kNumSst; i++) {
7438
+ for (int j = 0; j < 100; j++) {
7439
+ ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
7440
+ }
7441
+ ASSERT_OK(Flush(cf_num));
7442
+ }
7443
+ }
7444
+
7445
+ // generate SSTs with bad unique id
7446
+ SyncPoint::GetInstance()->SetCallBack(
7447
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
7448
+ auto props = static_cast<TableProperties*>(props_vs);
7449
+ // update table property session_id to a different one
7450
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
7451
+ });
7452
+ SyncPoint::GetInstance()->EnableProcessing();
7453
+ for (int i = 0; i < kNumSst; i++) {
7454
+ for (int j = 0; j < 100; j++) {
7455
+ ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
7456
+ }
7457
+ ASSERT_OK(Flush(1));
7458
+ }
7459
+
7460
+ // Reopen with verification should report corruption
7461
+ options.verify_sst_unique_id_in_manifest = true;
7462
+ auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
7463
+ ASSERT_TRUE(s.IsCorruption());
7464
+ }
7465
+
7439
7466
  #ifndef ROCKSDB_LITE
7440
7467
  TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
7441
7468
  Destroy(last_options_);
@@ -1676,6 +1676,15 @@ uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
1676
1676
  }
1677
1677
  return result;
1678
1678
  }
1679
+
1680
+ uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
1681
+ std::string prop;
1682
+ EXPECT_TRUE(dbfull()->GetProperty(
1683
+ DB::Properties::kLiveSstFilesSizeAtTemperature +
1684
+ std::to_string(static_cast<uint8_t>(temperature)),
1685
+ &prop));
1686
+ return static_cast<uint64_t>(std::atoi(prop.c_str()));
1687
+ }
1679
1688
  #endif // ROCKSDB_LITE
1680
1689
 
1681
1690
  void VerifySstUniqueIds(const TablePropertiesCollection& props) {
@@ -1345,6 +1345,8 @@ class DBTestBase : public testing::Test {
1345
1345
  #ifndef ROCKSDB_LITE
1346
1346
  uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
1347
1347
  std::string column_family_name);
1348
+
1349
+ uint64_t GetSstSizeHelper(Temperature temperature);
1348
1350
  #endif // ROCKSDB_LITE
1349
1351
 
1350
1352
  uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
@@ -116,10 +116,6 @@ std::string InternalKey::DebugString(bool hex) const {
116
116
  return result;
117
117
  }
118
118
 
119
- const char* InternalKeyComparator::Name() const {
120
- return "rocksdb.anonymous.InternalKeyComparator";
121
- }
122
-
123
119
  int InternalKeyComparator::Compare(const ParsedInternalKey& a,
124
120
  const ParsedInternalKey& b) const {
125
121
  // Order by:
@@ -141,40 +137,6 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
141
137
  return r;
142
138
  }
143
139
 
144
- void InternalKeyComparator::FindShortestSeparator(std::string* start,
145
- const Slice& limit) const {
146
- // Attempt to shorten the user portion of the key
147
- Slice user_start = ExtractUserKey(*start);
148
- Slice user_limit = ExtractUserKey(limit);
149
- std::string tmp(user_start.data(), user_start.size());
150
- user_comparator_.FindShortestSeparator(&tmp, user_limit);
151
- if (tmp.size() <= user_start.size() &&
152
- user_comparator_.Compare(user_start, tmp) < 0) {
153
- // User key has become shorter physically, but larger logically.
154
- // Tack on the earliest possible number to the shortened user key.
155
- PutFixed64(&tmp,
156
- PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
157
- assert(this->Compare(*start, tmp) < 0);
158
- assert(this->Compare(tmp, limit) < 0);
159
- start->swap(tmp);
160
- }
161
- }
162
-
163
- void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
164
- Slice user_key = ExtractUserKey(*key);
165
- std::string tmp(user_key.data(), user_key.size());
166
- user_comparator_.FindShortSuccessor(&tmp);
167
- if (tmp.size() <= user_key.size() &&
168
- user_comparator_.Compare(user_key, tmp) < 0) {
169
- // User key has become shorter physically, but larger logically.
170
- // Tack on the earliest possible number to the shortened user key.
171
- PutFixed64(&tmp,
172
- PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
173
- assert(this->Compare(*key, tmp) < 0);
174
- key->swap(tmp);
175
- }
176
- }
177
-
178
140
  LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
179
141
  const Slice* ts) {
180
142
  size_t usize = _user_key.size();
@@ -68,7 +68,9 @@ enum ValueType : unsigned char {
68
68
  kTypeCommitXIDAndTimestamp = 0x15, // WAL only
69
69
  kTypeWideColumnEntity = 0x16,
70
70
  kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only
71
- kMaxValue = 0x7F // Not used for storing records.
71
+ kTypeMaxValid, // Should be after the last valid type, only used for
72
+ // validation
73
+ kMaxValue = 0x7F // Not used for storing records.
72
74
  };
73
75
 
74
76
  // Defined in dbformat.cc
@@ -235,7 +237,7 @@ class InternalKeyComparator
235
237
  #ifdef NDEBUG
236
238
  final
237
239
  #endif
238
- : public Comparator {
240
+ : public CompareInterface {
239
241
  private:
240
242
  UserComparatorWrapper user_comparator_;
241
243
 
@@ -249,17 +251,19 @@ class InternalKeyComparator
249
251
  // this constructor to precompute the result of `Name()`. To avoid this
250
252
  // overhead, set `named` to false. In that case, `Name()` will return a
251
253
  // generic name that is non-specific to the underlying comparator.
252
- explicit InternalKeyComparator(const Comparator* c)
253
- : Comparator(c->timestamp_size()), user_comparator_(c) {}
254
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
254
255
  virtual ~InternalKeyComparator() {}
255
256
 
256
- virtual const char* Name() const override;
257
- virtual int Compare(const Slice& a, const Slice& b) const override;
257
+ int Compare(const Slice& a, const Slice& b) const override;
258
+
259
+ bool Equal(const Slice& a, const Slice& b) const {  // True exactly when Compare(a, b) returns 0.
260
+ // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
261
+ // comparing the user key too.
262
+ return Compare(a, b) == 0;
263
+ }
264
+
258
265
  // Same as Compare except that it excludes the value type from comparison
259
- virtual int CompareKeySeq(const Slice& a, const Slice& b) const;
260
- virtual void FindShortestSeparator(std::string* start,
261
- const Slice& limit) const override;
262
- virtual void FindShortSuccessor(std::string* key) const override;
266
+ int CompareKeySeq(const Slice& a, const Slice& b) const;
263
267
 
264
268
  const Comparator* user_comparator() const {
265
269
  return user_comparator_.user_comparator();
@@ -273,9 +277,6 @@ class InternalKeyComparator
273
277
  // value `kDisableGlobalSequenceNumber`.
274
278
  int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
275
279
  SequenceNumber b_global_seqno) const;
276
- virtual const Comparator* GetRootComparator() const override {
277
- return user_comparator_.GetRootComparator();
278
- }
279
280
  };
280
281
 
281
282
  // The class represent the internal key in encoded form.
@@ -9,6 +9,7 @@
9
9
 
10
10
  #include "db/dbformat.h"
11
11
 
12
+ #include "table/block_based/index_builder.h"
12
13
  #include "test_util/testharness.h"
13
14
  #include "test_util/testutil.h"
14
15
 
@@ -24,13 +25,15 @@ static std::string IKey(const std::string& user_key,
24
25
 
25
26
  static std::string Shorten(const std::string& s, const std::string& l) {  // Test helper: copies `s`, lets the separator routine rewrite the copy in place against limit `l`, and returns the copy.
26
27
  std::string result = s;
27
- InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
28
+ ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(),
29
+ &result, l);
28
30
  return result;
29
31
  }
30
32
 
31
33
  static std::string ShortSuccessor(const std::string& s) {  // Test helper: copies `s`, lets the successor routine rewrite the copy in place, and returns the copy.
32
34
  std::string result = s;
33
- InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
35
+ ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(),
36
+ &result);
34
37
  return result;
35
38
  }
36
39