@nxtedition/rocksdb 7.0.23 → 7.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146) hide show
  1. package/binding.cc +3 -1
  2. package/deps/rocksdb/rocksdb/CMakeLists.txt +5 -0
  3. package/deps/rocksdb/rocksdb/Makefile +6 -2
  4. package/deps/rocksdb/rocksdb/TARGETS +14 -0
  5. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +4 -1
  6. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +20 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager_test.cc +2 -2
  8. package/deps/rocksdb/rocksdb/cache/cache_test.cc +44 -31
  9. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +491 -722
  10. package/deps/rocksdb/rocksdb/cache/clock_cache.h +468 -2
  11. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +1 -1
  12. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +51 -52
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +28 -16
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +12 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +1 -0
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +170 -36
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache_test.cc +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +63 -36
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.h +4 -6
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader_test.cc +57 -38
  21. package/deps/rocksdb/rocksdb/db/blob/blob_read_request.h +58 -0
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +164 -74
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +42 -29
  24. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +419 -62
  25. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +208 -8
  26. package/deps/rocksdb/rocksdb/db/c.cc +68 -0
  27. package/deps/rocksdb/rocksdb/db/c_test.c +95 -2
  28. package/deps/rocksdb/rocksdb/db/column_family.cc +12 -3
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +92 -15
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +76 -4
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +52 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +30 -1
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator_test.cc +126 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +203 -1584
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +93 -26
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +87 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +314 -0
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +328 -0
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +32 -6
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +4 -1
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +7 -3
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +174 -33
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +474 -7
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -2
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +825 -0
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +46 -0
  47. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.h +42 -0
  48. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +223 -0
  49. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +255 -0
  50. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +1253 -0
  51. package/deps/rocksdb/rocksdb/db/corruption_test.cc +32 -8
  52. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +3 -1
  53. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +13 -8
  54. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +376 -0
  55. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +103 -78
  56. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +4 -6
  57. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +0 -8
  58. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +10 -3
  59. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +21 -6
  60. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.h +19 -1
  61. package/deps/rocksdb/rocksdb/db/db_iter.cc +91 -14
  62. package/deps/rocksdb/rocksdb/db/db_iter.h +5 -0
  63. package/deps/rocksdb/rocksdb/db/db_kv_checksum_test.cc +33 -0
  64. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +79 -0
  65. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/db_test2.cc +1 -1
  67. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +5 -2
  68. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +185 -0
  69. package/deps/rocksdb/rocksdb/db/dbformat.cc +1 -4
  70. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -8
  71. package/deps/rocksdb/rocksdb/db/internal_stats.cc +71 -29
  72. package/deps/rocksdb/rocksdb/db/internal_stats.h +160 -5
  73. package/deps/rocksdb/rocksdb/db/log_reader.cc +29 -3
  74. package/deps/rocksdb/rocksdb/db/log_reader.h +12 -3
  75. package/deps/rocksdb/rocksdb/db/repair_test.cc +1 -3
  76. package/deps/rocksdb/rocksdb/db/version_edit.cc +6 -0
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +93 -129
  78. package/deps/rocksdb/rocksdb/db/version_set.h +4 -4
  79. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
  80. package/deps/rocksdb/rocksdb/db/version_set_test.cc +42 -35
  81. package/deps/rocksdb/rocksdb/db/write_batch.cc +10 -2
  82. package/deps/rocksdb/rocksdb/db/write_batch_internal.h +4 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +10 -4
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +3 -3
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +3 -2
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -0
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_shared_state.h +5 -1
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +140 -8
  89. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +12 -0
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +46 -7
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +7 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +27 -7
  93. package/deps/rocksdb/rocksdb/env/composite_env_wrapper.h +8 -0
  94. package/deps/rocksdb/rocksdb/env/env_posix.cc +14 -0
  95. package/deps/rocksdb/rocksdb/env/env_test.cc +130 -1
  96. package/deps/rocksdb/rocksdb/env/fs_posix.cc +7 -1
  97. package/deps/rocksdb/rocksdb/env/io_posix.cc +18 -50
  98. package/deps/rocksdb/rocksdb/env/io_posix.h +53 -6
  99. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +8 -10
  100. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +3 -7
  101. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +239 -259
  102. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +84 -19
  103. package/deps/rocksdb/rocksdb/file/random_access_file_reader.h +24 -4
  104. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +1 -1
  105. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +31 -1
  106. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +11 -7
  107. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +2 -0
  108. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +14 -0
  109. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +20 -0
  110. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +37 -13
  111. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +7 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +14 -0
  113. package/deps/rocksdb/rocksdb/include/rocksdb/threadpool.h +9 -0
  114. package/deps/rocksdb/rocksdb/include/rocksdb/write_batch.h +13 -13
  115. package/deps/rocksdb/rocksdb/logging/auto_roll_logger.cc +12 -2
  116. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +38 -0
  117. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +7 -1
  118. package/deps/rocksdb/rocksdb/port/win/env_win.cc +17 -0
  119. package/deps/rocksdb/rocksdb/port/win/env_win.h +8 -0
  120. package/deps/rocksdb/rocksdb/port/win/io_win.cc +6 -3
  121. package/deps/rocksdb/rocksdb/src.mk +5 -0
  122. package/deps/rocksdb/rocksdb/table/block_based/block.h +1 -2
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +1 -1
  124. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +5 -2
  125. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +1 -1
  126. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +15 -12
  127. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.cc +5 -4
  128. package/deps/rocksdb/rocksdb/table/block_based/block_prefetcher.h +2 -1
  129. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_iterator.cc +4 -4
  131. package/deps/rocksdb/rocksdb/table/block_fetcher.cc +1 -2
  132. package/deps/rocksdb/rocksdb/table/get_context.cc +1 -0
  133. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -2
  134. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +24 -4
  135. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +1 -1
  136. package/deps/rocksdb/rocksdb/util/compression.h +2 -0
  137. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +18 -1
  138. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +67 -4
  139. package/deps/rocksdb/rocksdb/util/threadpool_imp.h +8 -0
  140. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +15 -12
  141. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -2
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache_test.cc +1 -1
  143. package/deps/rocksdb/rocksdb.gyp +5 -1
  144. package/package.json +1 -1
  145. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  146. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -30,10 +30,100 @@ namespace ROCKSDB_NAMESPACE {
30
30
  // SYNC_POINT is not supported in released Windows mode.
31
31
  #if !defined(ROCKSDB_LITE)
32
32
 
33
+ class CompactionStatsCollector : public EventListener {
34
+ public:
35
+ CompactionStatsCollector()
36
+ : compaction_completed_(
37
+ static_cast<int>(CompactionReason::kNumOfReasons)) {
38
+ for (auto& v : compaction_completed_) {
39
+ v.store(0);
40
+ }
41
+ }
42
+
43
+ ~CompactionStatsCollector() override {}
44
+
45
+ void OnCompactionCompleted(DB* /* db */,
46
+ const CompactionJobInfo& info) override {
47
+ int k = static_cast<int>(info.compaction_reason);
48
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
49
+ assert(k >= 0 && k < num_of_reasons);
50
+ compaction_completed_[k]++;
51
+ }
52
+
53
+ void OnExternalFileIngested(
54
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
55
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
56
+ compaction_completed_[k]++;
57
+ }
58
+
59
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
60
+ int k = static_cast<int>(CompactionReason::kFlush);
61
+ compaction_completed_[k]++;
62
+ }
63
+
64
+ int NumberOfCompactions(CompactionReason reason) const {
65
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
66
+ int k = static_cast<int>(reason);
67
+ assert(k >= 0 && k < num_of_reasons);
68
+ return compaction_completed_.at(k).load();
69
+ }
70
+
71
+ private:
72
+ std::vector<std::atomic<int>> compaction_completed_;
73
+ };
74
+
33
75
  class DBCompactionTest : public DBTestBase {
34
76
  public:
35
77
  DBCompactionTest()
36
78
  : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
79
+
80
+ protected:
81
+ #ifndef ROCKSDB_LITE
82
+ uint64_t GetSstSizeHelper(Temperature temperature) {
83
+ std::string prop;
84
+ EXPECT_TRUE(dbfull()->GetProperty(
85
+ DB::Properties::kLiveSstFilesSizeAtTemperature +
86
+ std::to_string(static_cast<uint8_t>(temperature)),
87
+ &prop));
88
+ return static_cast<uint64_t>(std::atoi(prop.c_str()));
89
+ }
90
+ #endif // ROCKSDB_LITE
91
+
92
+ /*
93
+ * Verifies compaction stats of cfd are valid.
94
+ *
95
+ * For each level of cfd, its compaction stats are valid if
96
+ * 1) sum(stat.counts) == stat.count, and
97
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
98
+ */
99
+ void VerifyCompactionStats(ColumnFamilyData& cfd,
100
+ const CompactionStatsCollector& collector) {
101
+ #ifndef NDEBUG
102
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
103
+ ASSERT_NE(internal_stats_ptr, nullptr);
104
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
105
+ internal_stats_ptr->TEST_GetCompactionStats();
106
+ const int num_of_reasons =
107
+ static_cast<int>(CompactionReason::kNumOfReasons);
108
+ std::vector<int> counts(num_of_reasons, 0);
109
+ // Count the number of compactions caused by each CompactionReason across
110
+ // all levels.
111
+ for (const auto& stat : comp_stats) {
112
+ int sum = 0;
113
+ for (int i = 0; i < num_of_reasons; i++) {
114
+ counts[i] += stat.counts[i];
115
+ sum += stat.counts[i];
116
+ }
117
+ ASSERT_EQ(sum, stat.count);
118
+ }
119
+ // Verify InternalStats bookkeeping matches that of
120
+ // CompactionStatsCollector, assuming that all compactions complete.
121
+ for (int i = 0; i < num_of_reasons; i++) {
122
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
123
+ counts[i]);
124
+ }
125
+ #endif /* NDEBUG */
126
+ }
37
127
  };
38
128
 
39
129
  class DBCompactionTestWithParam
@@ -110,47 +200,6 @@ class FlushedFileCollector : public EventListener {
110
200
  std::mutex mutex_;
111
201
  };
112
202
 
113
- class CompactionStatsCollector : public EventListener {
114
- public:
115
- CompactionStatsCollector()
116
- : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) {
117
- for (auto& v : compaction_completed_) {
118
- v.store(0);
119
- }
120
- }
121
-
122
- ~CompactionStatsCollector() override {}
123
-
124
- void OnCompactionCompleted(DB* /* db */,
125
- const CompactionJobInfo& info) override {
126
- int k = static_cast<int>(info.compaction_reason);
127
- int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
128
- assert(k >= 0 && k < num_of_reasons);
129
- compaction_completed_[k]++;
130
- }
131
-
132
- void OnExternalFileIngested(
133
- DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
134
- int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
135
- compaction_completed_[k]++;
136
- }
137
-
138
- void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
139
- int k = static_cast<int>(CompactionReason::kFlush);
140
- compaction_completed_[k]++;
141
- }
142
-
143
- int NumberOfCompactions(CompactionReason reason) const {
144
- int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
145
- int k = static_cast<int>(reason);
146
- assert(k >= 0 && k < num_of_reasons);
147
- return compaction_completed_.at(k).load();
148
- }
149
-
150
- private:
151
- std::vector<std::atomic<int>> compaction_completed_;
152
- };
153
-
154
203
  class SstStatsCollector : public EventListener {
155
204
  public:
156
205
  SstStatsCollector() : num_ssts_creation_started_(0) {}
@@ -247,40 +296,6 @@ void VerifyCompactionResult(
247
296
  #endif
248
297
  }
249
298
 
250
- /*
251
- * Verifies compaction stats of cfd are valid.
252
- *
253
- * For each level of cfd, its compaction stats are valid if
254
- * 1) sum(stat.counts) == stat.count, and
255
- * 2) stat.counts[i] == collector.NumberOfCompactions(i)
256
- */
257
- void VerifyCompactionStats(ColumnFamilyData& cfd,
258
- const CompactionStatsCollector& collector) {
259
- #ifndef NDEBUG
260
- InternalStats* internal_stats_ptr = cfd.internal_stats();
261
- ASSERT_NE(internal_stats_ptr, nullptr);
262
- const std::vector<InternalStats::CompactionStats>& comp_stats =
263
- internal_stats_ptr->TEST_GetCompactionStats();
264
- const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
265
- std::vector<int> counts(num_of_reasons, 0);
266
- // Count the number of compactions caused by each CompactionReason across
267
- // all levels.
268
- for (const auto& stat : comp_stats) {
269
- int sum = 0;
270
- for (int i = 0; i < num_of_reasons; i++) {
271
- counts[i] += stat.counts[i];
272
- sum += stat.counts[i];
273
- }
274
- ASSERT_EQ(sum, stat.count);
275
- }
276
- // Verify InternalStats bookkeeping matches that of CompactionStatsCollector,
277
- // assuming that all compactions complete.
278
- for (int i = 0; i < num_of_reasons; i++) {
279
- ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]);
280
- }
281
- #endif /* NDEBUG */
282
- }
283
-
284
299
  const SstFileMetaData* PickFileRandomly(
285
300
  const ColumnFamilyMetaData& cf_meta,
286
301
  Random* rand,
@@ -1093,16 +1108,20 @@ TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
1093
1108
  // create two files in l1 that we can compact
1094
1109
  for (int i = 0; i < 2; ++i) {
1095
1110
  for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
1096
- // make l0 files' ranges overlap to avoid trivial move
1097
1111
  ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
1098
1112
  ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
1099
1113
  ASSERT_OK(Flush());
1100
1114
  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
1101
1115
  }
1102
1116
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
1103
- ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
1104
- ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1);
1105
1117
  }
1118
+ ASSERT_OK(
1119
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
1120
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
1121
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
1122
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
1123
+ ASSERT_OK(
1124
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
1106
1125
 
1107
1126
  ColumnFamilyMetaData cf_meta;
1108
1127
  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
@@ -4366,7 +4385,13 @@ TEST_F(DBCompactionTest, LevelTtlBooster) {
4366
4385
  ASSERT_OK(Flush());
4367
4386
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
4368
4387
  }
4388
+ // Force files to be compacted to L1
4389
+ ASSERT_OK(
4390
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
4391
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
4369
4392
  ASSERT_EQ("0,1,2", FilesPerLevel());
4393
+ ASSERT_OK(
4394
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
4370
4395
 
4371
4396
  ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
4372
4397
  }
@@ -246,8 +246,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
246
246
  // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
247
247
  // WriteUnprepared, which should use seq_per_batch_.
248
248
  assert(batch_per_txn_ || seq_per_batch_);
249
- // TODO: Check for an error here
250
- env_->GetAbsolutePath(dbname, &db_absolute_path_).PermitUncheckedError();
251
249
 
252
250
  // Reserve ten files or so for other uses and give the rest to TableCache.
253
251
  // Give a large number for setting of "infinite" open files.
@@ -1444,12 +1442,12 @@ Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) {
1444
1442
  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
1445
1443
  auto& wal = *it;
1446
1444
  assert(wal.IsSyncing());
1447
- if (immutable_db_options_.track_and_verify_wals_in_manifest &&
1448
- wal.GetPreSyncSize() > 0) {
1449
- synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1450
- }
1451
1445
 
1452
1446
  if (logs_.size() > 1) {
1447
+ if (immutable_db_options_.track_and_verify_wals_in_manifest &&
1448
+ wal.GetPreSyncSize() > 0) {
1449
+ synced_wals.AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
1450
+ }
1453
1451
  logs_to_free_.push_back(wal.ReleaseWriter());
1454
1452
  // To modify logs_ both mutex_ and log_write_mutex_ must be held
1455
1453
  InstrumentedMutexLock l(&log_write_mutex_);
@@ -1999,7 +1999,6 @@ class DBImpl : public DB {
1999
1999
  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
2000
2000
 
2001
2001
  #ifndef ROCKSDB_LITE
2002
-
2003
2002
  Status CompactFilesImpl(const CompactionOptions& compact_options,
2004
2003
  ColumnFamilyData* cfd, Version* version,
2005
2004
  const std::vector<std::string>& input_file_names,
@@ -2011,7 +2010,6 @@ class DBImpl : public DB {
2011
2010
  // Wait for current IngestExternalFile() calls to finish.
2012
2011
  // REQUIRES: mutex_ held
2013
2012
  void WaitForIngestFile();
2014
-
2015
2013
  #else
2016
2014
  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
2017
2015
  // will be no-op
@@ -2498,12 +2496,6 @@ class DBImpl : public DB {
2498
2496
  // log is fully committed.
2499
2497
  bool unable_to_release_oldest_log_;
2500
2498
 
2501
- static const int KEEP_LOG_FILE_NUM = 1000;
2502
- // MSVC version 1800 still does not have constexpr for ::max()
2503
- static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
2504
-
2505
- std::string db_absolute_path_;
2506
-
2507
2499
  // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
2508
2500
  // calls.
2509
2501
  // REQUIRES: mutex held
@@ -1108,9 +1108,11 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1108
1108
 
1109
1109
  TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
1110
1110
  /*arg=*/nullptr);
1111
+ uint64_t record_checksum;
1111
1112
  while (!stop_replay_by_wal_filter &&
1112
1113
  reader.ReadRecord(&record, &scratch,
1113
- immutable_db_options_.wal_recovery_mode) &&
1114
+ immutable_db_options_.wal_recovery_mode,
1115
+ &record_checksum) &&
1114
1116
  status.ok()) {
1115
1117
  if (record.size() < WriteBatchInternal::kHeader) {
1116
1118
  reporter.Corruption(record.size(),
@@ -1126,8 +1128,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
1126
1128
  if (!status.ok()) {
1127
1129
  return status;
1128
1130
  }
1129
- status = WriteBatchInternal::UpdateProtectionInfo(&batch,
1130
- 8 /* bytes_per_key */);
1131
+ TEST_SYNC_POINT_CALLBACK(
1132
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
1133
+ TEST_SYNC_POINT_CALLBACK(
1134
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
1135
+ &record_checksum);
1136
+ status = WriteBatchInternal::UpdateProtectionInfo(
1137
+ &batch, 8 /* bytes_per_key */, &record_checksum);
1131
1138
  if (!status.ok()) {
1132
1139
  return status;
1133
1140
  }
@@ -682,12 +682,6 @@ Status DB::OpenAsSecondary(
682
682
  const std::vector<ColumnFamilyDescriptor>& column_families,
683
683
  std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
684
684
  *dbptr = nullptr;
685
- if (db_options.max_open_files != -1) {
686
- // TODO (yanqin) maybe support max_open_files != -1 by creating hard links
687
- // on SST files so that db secondary can still have access to old SSTs
688
- // while primary instance may delete original.
689
- return Status::InvalidArgument("require max_open_files to be -1");
690
- }
691
685
 
692
686
  DBOptions tmp_opts(db_options);
693
687
  Status s;
@@ -699,6 +693,27 @@ Status DB::OpenAsSecondary(
699
693
  }
700
694
  }
701
695
 
696
+ assert(tmp_opts.info_log != nullptr);
697
+ if (db_options.max_open_files != -1) {
698
+ std::ostringstream oss;
699
+ oss << "The primary instance may delete all types of files after they "
700
+ "become obsolete. The application can coordinate the primary and "
701
+ "secondary so that primary does not delete/rename files that are "
702
+ "currently being used by the secondary. Alternatively, a custom "
703
+ "Env/FS can be provided such that files become inaccessible only "
704
+ "after all primary and secondaries indicate that they are obsolete "
705
+ "and deleted. If the above two are not possible, you can open the "
706
+ "secondary instance with `max_open_files==-1` so that secondary "
707
+ "will eagerly keep all table files open. Even if a file is deleted, "
708
+ "its content can still be accessed via a prior open file "
709
+ "descriptor. This is a hacky workaround for only table files. If "
710
+ "none of the above is done, then point lookup or "
711
+ "range scan via the secondary instance can result in IOError: file "
712
+ "not found. This can be resolved by retrying "
713
+ "TryCatchUpWithPrimary().";
714
+ ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str());
715
+ }
716
+
702
717
  handles->clear();
703
718
  DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
704
719
  impl->versions_.reset(new ReactiveVersionSet(
@@ -84,8 +84,17 @@ class DBImplSecondary : public DBImpl {
84
84
  bool error_if_data_exists_in_wals, uint64_t* = nullptr,
85
85
  RecoveryContext* recovery_ctx = nullptr) override;
86
86
 
87
- // Implementations of the DB interface
87
+ // Implementations of the DB interface.
88
88
  using DB::Get;
89
+ // Can return IOError due to files being deleted by the primary. To avoid
90
+ // IOError in this case, application can coordinate between primary and
91
+ // secondaries so that primary will not delete files that are currently being
92
+ // used by the secondaries. The application can also provide a custom FS/Env
93
+ // implementation so that files will remain present until all primary and
94
+ // secondaries indicate that they can be deleted. As a partial hacky
95
+ // workaround, the secondaries can be opened with `max_open_files=-1` so that
96
+ // it eagerly keeps all table files open and is able to access the contents of
97
+ // deleted files via prior open fd.
89
98
  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
90
99
  const Slice& key, PinnableSlice* value) override;
91
100
 
@@ -98,6 +107,15 @@ class DBImplSecondary : public DBImpl {
98
107
  std::string* timestamp);
99
108
 
100
109
  using DBImpl::NewIterator;
110
+ // Operations on the created iterators can return IOError due to files being
111
+ // deleted by the primary. To avoid IOError in this case, application can
112
+ // coordinate between primary and secondaries so that primary will not delete
113
+ // files that are currently being used by the secondaries. The application can
114
+ // also provide a custom FS/Env implementation so that files will remain
115
+ // present until all primary and secondaries indicate that they can be
116
+ // deleted. As a partial hacky workaround, the secondaries can be opened with
117
+ // `max_open_files=-1` so that it eagerly keeps all table files open and is
118
+ // able to access the contents of deleted files via prior open fd.
101
119
  Iterator* NewIterator(const ReadOptions&,
102
120
  ColumnFamilyHandle* column_family) override;
103
121
 
@@ -812,6 +812,10 @@ bool DBIter::FindValueForCurrentKey() {
812
812
  ValueType last_not_merge_type = kTypeDeletion;
813
813
  ValueType last_key_entry_type = kTypeDeletion;
814
814
 
815
+ // If false, it indicates that we have not seen any valid entry, even though
816
+ // last_key_entry_type is initialized to kTypeDeletion.
817
+ bool valid_entry_seen = false;
818
+
815
819
  // Temporarily pin blocks that hold (merge operands / the value)
816
820
  ReleaseTempPinnedData();
817
821
  TempPinData();
@@ -822,20 +826,33 @@ bool DBIter::FindValueForCurrentKey() {
822
826
  return false;
823
827
  }
824
828
 
829
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
830
+ saved_key_.GetUserKey())) {
831
+ // Found a smaller user key, thus we are done with current user key.
832
+ break;
833
+ }
834
+
825
835
  assert(ikey.user_key.size() >= timestamp_size_);
826
836
  Slice ts;
827
837
  if (timestamp_size_ > 0) {
828
838
  ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
829
839
  timestamp_size_);
830
840
  }
831
- if (!IsVisible(ikey.sequence, ts) ||
832
- !user_comparator_.EqualWithoutTimestamp(ikey.user_key,
833
- saved_key_.GetUserKey())) {
841
+
842
+ bool visible = IsVisible(ikey.sequence, ts);
843
+ if (!visible &&
844
+ (timestamp_lb_ == nullptr ||
845
+ user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
846
+ // Found an invisible version of the current user key, and it must have
847
+ // a higher sequence number or timestamp. Therefore, we are done with the
848
+ // current user key.
834
849
  break;
835
850
  }
851
+
836
852
  if (!ts.empty()) {
837
853
  saved_timestamp_.assign(ts.data(), ts.size());
838
854
  }
855
+
839
856
  if (TooManyInternalKeysSkipped()) {
840
857
  return false;
841
858
  }
@@ -852,6 +869,15 @@ bool DBIter::FindValueForCurrentKey() {
852
869
  return false;
853
870
  }
854
871
 
872
+ if (timestamp_lb_ != nullptr) {
873
+ // Only needed when timestamp_lb_ is not null
874
+ [[maybe_unused]] const bool ret = ParseKey(&ikey_);
875
+ saved_ikey_.assign(iter_.key().data(), iter_.key().size());
876
+ // Since the preceding ParseKey(&ikey) succeeds, so must this.
877
+ assert(ret);
878
+ }
879
+
880
+ valid_entry_seen = true;
855
881
  last_key_entry_type = ikey.type;
856
882
  switch (last_key_entry_type) {
857
883
  case kTypeValue:
@@ -908,6 +934,14 @@ bool DBIter::FindValueForCurrentKey() {
908
934
  PERF_COUNTER_ADD(internal_key_skipped_count, 1);
909
935
  iter_.Prev();
910
936
  ++num_skipped;
937
+
938
+ if (visible && timestamp_lb_ != nullptr) {
939
+ // If timestamp_lb_ is not nullptr, we do not have to look further for
940
+ // another internal key. We can return this current internal key. Yet we
941
+ // still keep the invariant that iter_ is positioned before the returned
942
+ // key.
943
+ break;
944
+ }
911
945
  }
912
946
 
913
947
  if (!iter_.status().ok()) {
@@ -915,6 +949,20 @@ bool DBIter::FindValueForCurrentKey() {
915
949
  return false;
916
950
  }
917
951
 
952
+ if (!valid_entry_seen) {
953
+ // Since we haven't seen any valid entry, last_key_entry_type remains
954
+ // unchanged and the same as its initial value.
955
+ assert(last_key_entry_type == kTypeDeletion);
956
+ assert(last_not_merge_type == kTypeDeletion);
957
+ valid_ = false;
958
+ return true;
959
+ }
960
+
961
+ if (timestamp_lb_ != nullptr) {
962
+ assert(last_key_entry_type == ikey_.type ||
963
+ last_key_entry_type == kTypeRangeDeletion);
964
+ }
965
+
918
966
  Status s;
919
967
  s.PermitUncheckedError();
920
968
  is_blob_ = false;
@@ -923,7 +971,12 @@ bool DBIter::FindValueForCurrentKey() {
923
971
  case kTypeDeletionWithTimestamp:
924
972
  case kTypeSingleDeletion:
925
973
  case kTypeRangeDeletion:
926
- valid_ = false;
974
+ if (timestamp_lb_ == nullptr) {
975
+ valid_ = false;
976
+ } else {
977
+ saved_key_.SetInternalKey(saved_ikey_);
978
+ valid_ = true;
979
+ }
927
980
  return true;
928
981
  case kTypeMerge:
929
982
  current_entry_is_merged_ = true;
@@ -970,6 +1023,9 @@ bool DBIter::FindValueForCurrentKey() {
970
1023
  break;
971
1024
  case kTypeValue:
972
1025
  // do nothing - we already have the value in pinned_value_
1026
+ if (timestamp_lb_ != nullptr) {
1027
+ saved_key_.SetInternalKey(saved_ikey_);
1028
+ }
973
1029
  break;
974
1030
  case kTypeBlobIndex:
975
1031
  if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
@@ -1015,7 +1071,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
1015
1071
  &last_key,
1016
1072
  ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
1017
1073
  kValueTypeForSeek),
1018
- *timestamp_ub_);
1074
+ timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
1019
1075
  }
1020
1076
  iter_.Seek(last_key);
1021
1077
  RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
@@ -1060,7 +1116,12 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
1060
1116
  range_del_agg_.ShouldDelete(
1061
1117
  ikey, RangeDelPositioningMode::kBackwardTraversal) ||
1062
1118
  kTypeDeletionWithTimestamp == ikey.type) {
1063
- valid_ = false;
1119
+ if (timestamp_lb_ == nullptr) {
1120
+ valid_ = false;
1121
+ } else {
1122
+ valid_ = true;
1123
+ saved_key_.SetInternalKey(ikey);
1124
+ }
1064
1125
  return true;
1065
1126
  }
1066
1127
  if (!iter_.PrepareValue()) {
@@ -1085,6 +1146,10 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
1085
1146
  }
1086
1147
  }
1087
1148
 
1149
+ if (timestamp_lb_ != nullptr) {
1150
+ saved_key_.SetInternalKey(ikey);
1151
+ }
1152
+
1088
1153
  valid_ = true;
1089
1154
  return true;
1090
1155
  }
@@ -1214,8 +1279,7 @@ bool DBIter::FindUserKeyBeforeSavedKey() {
1214
1279
  return false;
1215
1280
  }
1216
1281
 
1217
- if (user_comparator_.CompareWithoutTimestamp(ikey.user_key,
1218
- saved_key_.GetUserKey()) < 0) {
1282
+ if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
1219
1283
  return true;
1220
1284
  }
1221
1285
 
@@ -1328,7 +1392,9 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
1328
1392
  if (timestamp_size_ > 0) {
1329
1393
  const std::string kTsMin(timestamp_size_, '\0');
1330
1394
  Slice ts = kTsMin;
1331
- saved_key_.UpdateInternalKey(/*seq=*/0, kValueTypeForSeekForPrev, &ts);
1395
+ saved_key_.UpdateInternalKey(
1396
+ /*seq=*/0, kValueTypeForSeekForPrev,
1397
+ timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
1332
1398
  }
1333
1399
 
1334
1400
  if (iterate_upper_bound_ != nullptr &&
@@ -1341,8 +1407,9 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
1341
1407
  if (timestamp_size_ > 0) {
1342
1408
  const std::string kTsMax(timestamp_size_, '\xff');
1343
1409
  Slice ts = kTsMax;
1344
- saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev,
1345
- &ts);
1410
+ saved_key_.UpdateInternalKey(
1411
+ kMaxSequenceNumber, kValueTypeForSeekForPrev,
1412
+ timestamp_lb_ != nullptr ? timestamp_lb_ : &ts);
1346
1413
  }
1347
1414
  }
1348
1415
  }
@@ -1543,11 +1610,21 @@ void DBIter::SeekToLast() {
1543
1610
  if (iterate_upper_bound_ != nullptr) {
1544
1611
  // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
1545
1612
  SeekForPrev(*iterate_upper_bound_);
1546
- if (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
1547
- *iterate_upper_bound_, /*a_has_ts=*/false, key(),
1548
- /*b_has_ts=*/false)) {
1613
+ const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr);
1614
+ Slice k = Valid() ? key() : Slice();
1615
+ if (is_ikey && Valid()) {
1616
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
1617
+ }
1618
+ while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
1619
+ *iterate_upper_bound_, /*a_has_ts=*/false, k,
1620
+ /*b_has_ts=*/false)) {
1549
1621
  ReleaseTempPinnedData();
1550
1622
  PrevInternal(nullptr);
1623
+
1624
+ k = key();
1625
+ if (is_ikey) {
1626
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
1627
+ }
1551
1628
  }
1552
1629
  return;
1553
1630
  }
@@ -224,9 +224,11 @@ class DBIter final : public Iterator {
224
224
  bool ReverseToBackward();
225
225
  // Set saved_key_ to the seek key to target, with proper sequence number set.
226
226
  // It might get adjusted if the seek key is smaller than iterator lower bound.
227
+ // target does not have timestamp.
227
228
  void SetSavedKeyToSeekTarget(const Slice& target);
228
229
  // Set saved_key_ to the seek key to target, with proper sequence number set.
229
230
  // It might get adjusted if the seek key is larger than iterator upper bound.
231
+ // target does not have timestamp.
230
232
  void SetSavedKeyToSeekForPrevTarget(const Slice& target);
231
233
  bool FindValueForCurrentKey();
232
234
  bool FindValueForCurrentKeyUsingSeek();
@@ -377,6 +379,9 @@ class DBIter final : public Iterator {
377
379
  const Slice* const timestamp_lb_;
378
380
  const size_t timestamp_size_;
379
381
  std::string saved_timestamp_;
382
+
383
+ // Used only if timestamp_lb_ is not nullptr.
384
+ std::string saved_ikey_;
380
385
  };
381
386
 
382
387
  // Return a new iterator that converts internal keys (yielded by
@@ -627,6 +627,39 @@ INSTANTIATE_TEST_CASE_P(
627
627
 
628
628
  // TODO: add test for transactions
629
629
  // TODO: add test for corrupted write batch with WAL disabled
630
+
631
+ class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
632
+ public:
633
+ DbKVChecksumWALToWriteBatchTest()
634
+ : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
635
+ };
636
+
637
+ TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
638
+ Options options = CurrentOptions();
639
+ Reopen(options);
640
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
641
+ std::string content = "";
642
+ SyncPoint::GetInstance()->SetCallBack(
643
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
644
+ [&](void* batch_ptr) {
645
+ WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
646
+ content.assign(batch->Data().data(), batch->GetDataSize());
647
+ Slice batch_content = batch->Data();
648
+ // Corrupt first bit
649
+ CorruptWriteBatch(&batch_content, 0, 1);
650
+ });
651
+ SyncPoint::GetInstance()->SetCallBack(
652
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
653
+ [&](void* checksum_ptr) {
654
+ // Verify that checksum is produced on the batch content
655
+ uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
656
+ ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
657
+ });
658
+ SyncPoint::GetInstance()->EnableProcessing();
659
+ ASSERT_TRUE(TryReopen(options).IsCorruption());
660
+ SyncPoint::GetInstance()->DisableProcessing();
661
+ };
662
+
630
663
  } // namespace ROCKSDB_NAMESPACE
631
664
 
632
665
  int main(int argc, char** argv) {