@nxtedition/rocksdb 15.1.2 → 15.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/binding.cc +79 -38
  2. package/build.sh +1 -2
  3. package/deps/rocksdb/rocksdb/BUCK +10 -8
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +27 -2
  5. package/deps/rocksdb/rocksdb/Makefile +27 -116
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -1
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +101 -124
  8. package/deps/rocksdb/rocksdb/cache/clock_cache.h +47 -30
  9. package/deps/rocksdb/rocksdb/db/c.cc +793 -131
  10. package/deps/rocksdb/rocksdb/db/c_test.c +571 -0
  11. package/deps/rocksdb/rocksdb/db/compact_files_test.cc +226 -0
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +95 -59
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +2 -2
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +45 -35
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -4
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -6
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +8 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +47 -0
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +82 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
  24. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  25. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +69 -24
  26. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +9 -1
  27. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +65 -0
  28. package/deps/rocksdb/rocksdb/db/db_etc3_test.cc +161 -0
  29. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -0
  30. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +20 -7
  31. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +13 -0
  32. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +114 -39
  33. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  34. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +3 -3
  35. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  36. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +39 -25
  37. package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +361 -0
  38. package/deps/rocksdb/rocksdb/db/db_options_test.cc +35 -0
  39. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +83 -0
  40. package/deps/rocksdb/rocksdb/db/db_test.cc +249 -4
  41. package/deps/rocksdb/rocksdb/db/db_test2.cc +3 -0
  42. package/deps/rocksdb/rocksdb/db/db_test_util.cc +2 -1
  43. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +3 -2
  44. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -7
  45. package/deps/rocksdb/rocksdb/db/listener_test.cc +7 -17
  46. package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +4 -2
  47. package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +41 -0
  48. package/deps/rocksdb/rocksdb/db/repair.cc +2 -2
  49. package/deps/rocksdb/rocksdb/db/version_edit.h +7 -4
  50. package/deps/rocksdb/rocksdb/db/version_set.cc +299 -90
  51. package/deps/rocksdb/rocksdb/db/version_set.h +56 -9
  52. package/deps/rocksdb/rocksdb/db/version_set_test.cc +41 -39
  53. package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
  54. package/deps/rocksdb/rocksdb/db/wal_manager.cc +7 -1
  55. package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +48 -10
  56. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  57. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  58. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +16 -5
  59. package/deps/rocksdb/rocksdb/env/env_test.cc +126 -41
  60. package/deps/rocksdb/rocksdb/env/fs_posix.cc +14 -7
  61. package/deps/rocksdb/rocksdb/env/io_posix.cc +304 -112
  62. package/deps/rocksdb/rocksdb/env/io_posix.h +16 -4
  63. package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
  64. package/deps/rocksdb/rocksdb/folly.mk +148 -0
  65. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +29 -3
  66. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +73 -0
  67. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +246 -0
  68. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +0 -2
  69. package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +15 -9
  70. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -9
  71. package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -1
  72. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -4
  73. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +14 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +67 -6
  75. package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +1 -7
  76. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
  77. package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +6 -14
  78. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +8 -1
  79. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +2 -2
  80. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +0 -4
  81. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +33 -5
  82. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +6 -0
  83. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  84. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +2 -0
  85. package/deps/rocksdb/rocksdb/monitoring/thread_status_impl.cc +5 -2
  86. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +2 -2
  87. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +6 -6
  88. package/deps/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc +2 -2
  89. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +10 -5
  90. package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +2 -2
  91. package/deps/rocksdb/rocksdb/options/cf_options.cc +15 -3
  92. package/deps/rocksdb/rocksdb/options/cf_options.h +7 -0
  93. package/deps/rocksdb/rocksdb/options/db_options.cc +27 -36
  94. package/deps/rocksdb/rocksdb/options/db_options.h +3 -2
  95. package/deps/rocksdb/rocksdb/options/options.cc +4 -0
  96. package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -2
  97. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -1
  98. package/deps/rocksdb/rocksdb/options/options_test.cc +19 -3
  99. package/deps/rocksdb/rocksdb/src.mk +1 -1
  100. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +155 -32
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -3
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +169 -125
  103. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +22 -7
  104. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +43 -24
  105. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +9 -5
  106. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +9 -8
  107. package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +17 -0
  108. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -5
  109. package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +13 -18
  110. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +29 -0
  111. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +6 -0
  112. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +15 -0
  113. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +79 -19
  114. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +48 -20
  115. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +51 -0
  116. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +19 -0
  117. package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +1 -1
  118. package/deps/rocksdb/rocksdb/table/external_table.cc +2 -2
  119. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +3 -2
  120. package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +3 -1
  121. package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
  122. package/deps/rocksdb/rocksdb/table/table_reader.h +4 -2
  123. package/deps/rocksdb/rocksdb/table/table_test.cc +48 -39
  124. package/deps/rocksdb/rocksdb/test_util/sync_point.cc +4 -0
  125. package/deps/rocksdb/rocksdb/test_util/sync_point.h +32 -0
  126. package/deps/rocksdb/rocksdb/test_util/testutil.h +6 -2
  127. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +14 -4
  128. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
  129. package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +3 -2
  130. package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +63 -12
  131. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +16 -1
  132. package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +5 -1
  133. package/deps/rocksdb/rocksdb/util/bit_fields.h +133 -23
  134. package/deps/rocksdb/rocksdb/util/bloom_test.cc +2 -5
  135. package/deps/rocksdb/rocksdb/util/compression.cc +51 -23
  136. package/deps/rocksdb/rocksdb/util/compression_test.cc +525 -270
  137. package/deps/rocksdb/rocksdb/util/filter_bench.cc +3 -4
  138. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +11 -2
  139. package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -1
  140. package/deps/rocksdb/rocksdb/util/slice_test.cc +92 -0
  141. package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -2
  142. package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -2
  143. package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +2 -2
  144. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +19 -2
  145. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +75 -0
  146. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +1 -0
  147. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +303 -111
  148. package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +379 -0
  149. package/deps/rocksdb/rocksdb.gyp +6 -4
  150. package/iterator.js +66 -70
  151. package/package.json +6 -6
  152. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  153. package/deps/rocksdb/rocksdb/table/block_based/index_builder_test.cc +0 -183
@@ -16,6 +16,7 @@
16
16
  #include <list>
17
17
  #include <map>
18
18
  #include <set>
19
+ #include <stdexcept>
19
20
  #include <string>
20
21
  #include <unordered_map>
21
22
  #include <vector>
@@ -980,7 +981,8 @@ class LevelIterator final : public InternalIterator {
980
981
  nullptr,
981
982
  bool allow_unprepared_value = false,
982
983
  std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ =
983
- nullptr)
984
+ nullptr,
985
+ Statistics* db_statistics = nullptr, SystemClock* clock = nullptr)
984
986
  : table_cache_(table_cache),
985
987
  read_options_(read_options),
986
988
  file_options_(file_options),
@@ -1005,7 +1007,9 @@ class LevelIterator final : public InternalIterator {
1005
1007
  allow_unprepared_value_(allow_unprepared_value),
1006
1008
  is_next_read_sequential_(false),
1007
1009
  to_return_sentinel_(false),
1008
- scan_opts_(nullptr) {
1010
+ scan_opts_(nullptr),
1011
+ db_statistics_(db_statistics),
1012
+ clock_(clock) {
1009
1013
  // Empty level is not supported.
1010
1014
  assert(flevel_ != nullptr && flevel_->num_files > 0);
1011
1015
  if (range_tombstone_iter_ptr_) {
@@ -1013,7 +1017,15 @@ class LevelIterator final : public InternalIterator {
1013
1017
  }
1014
1018
  }
1015
1019
 
1016
- ~LevelIterator() override { delete file_iter_.Set(nullptr); }
1020
+ ~LevelIterator() override {
1021
+ delete file_iter_.Set(nullptr);
1022
+ // Clean up any prepared iterators that weren't used
1023
+ for (auto& entry : prepared_iters_) {
1024
+ delete entry.second;
1025
+ }
1026
+ prepared_iters_.clear();
1027
+ assert(prepared_iters_.size() == 0);
1028
+ }
1017
1029
 
1018
1030
  // Seek to the first file with a key >= target.
1019
1031
  // If range_tombstone_iter_ is not nullptr, then we pretend that file
@@ -1124,10 +1136,12 @@ class LevelIterator final : public InternalIterator {
1124
1136
 
1125
1137
  void Prepare(const MultiScanArgs* so) override {
1126
1138
  // We assume here that scan_opts is sorted such that
1127
- // scan_opts[0].range.start < scan_opts[1].range.start, and non overlapping
1139
+ // scan_opts[0].range.start < scan_opts[1].range.start, and non
1140
+ // overlapping
1128
1141
  if (so == nullptr) {
1129
1142
  return;
1130
1143
  }
1144
+
1131
1145
  scan_opts_ = so;
1132
1146
 
1133
1147
  // Verify comparator is consistent
@@ -1188,8 +1202,8 @@ class LevelIterator final : public InternalIterator {
1188
1202
  continue;
1189
1203
  }
1190
1204
  auto const metadata = flevel_->files[i].file_metadata;
1191
- if (metadata->num_entries == metadata->num_range_deletions) {
1192
- // Skip range deletion only files.
1205
+ if (metadata->FileIsStandAloneRangeTombstone()) {
1206
+ // Skip stand alone range deletion files.
1193
1207
  continue;
1194
1208
  }
1195
1209
  auto& args = GetMultiScanArgForFile(i);
@@ -1197,9 +1211,35 @@ class LevelIterator final : public InternalIterator {
1197
1211
  }
1198
1212
  }
1199
1213
  }
1214
+
1215
+ StopWatch timer(clock_, db_statistics_, MULTISCAN_PREPARE_ITERATORS);
1216
+
1200
1217
  // Propagate multiscan configs
1201
1218
  for (auto& file_to_arg : *file_to_scan_opts_) {
1202
1219
  file_to_arg.second.CopyConfigFrom(*so);
1220
+ assert(OverlapRange(*file_to_arg.second.GetScanRanges().begin(),
1221
+ file_to_arg.first) &&
1222
+ OverlapRange(*file_to_arg.second.GetScanRanges().rbegin(),
1223
+ file_to_arg.first));
1224
+ }
1225
+
1226
+ if (so->use_async_io) {
1227
+ auto before = file_index_;
1228
+ // Pre-create and prepare only relevant file iterators
1229
+ for (auto& file_to_arg : *file_to_scan_opts_) {
1230
+ size_t file_index = file_to_arg.first;
1231
+
1232
+ file_index_ = file_index;
1233
+ // Create iterator for this file
1234
+ auto iter = NewFileIterator();
1235
+ if (iter != nullptr) {
1236
+ // If we have async enabled, lets prepare all our iterators.
1237
+ iter->Prepare(&file_to_arg.second);
1238
+ // Store the prepared iterator
1239
+ prepared_iters_[file_index] = iter;
1240
+ }
1241
+ }
1242
+ file_index_ = before;
1203
1243
  }
1204
1244
  }
1205
1245
 
@@ -1276,7 +1316,7 @@ class LevelIterator final : public InternalIterator {
1276
1316
  }
1277
1317
 
1278
1318
  #ifndef NDEBUG
1279
- bool OverlapRange(const ScanOptions& opts);
1319
+ bool OverlapRange(const ScanOptions& opts, size_t file_index);
1280
1320
  #endif
1281
1321
 
1282
1322
  TableCache* table_cache_;
@@ -1334,9 +1374,15 @@ class LevelIterator final : public InternalIterator {
1334
1374
  bool to_return_sentinel_ = false;
1335
1375
  const MultiScanArgs* scan_opts_ = nullptr;
1336
1376
 
1377
+ Statistics* db_statistics_ = nullptr;
1378
+ SystemClock* clock_ = nullptr;
1379
+
1337
1380
  // Our stored scan_opts for each prefix
1338
1381
  std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
1339
1382
 
1383
+ // Map to store pre-created iterators by file index
1384
+ std::unordered_map<size_t, InternalIterator*> prepared_iters_;
1385
+
1340
1386
  // Sets flags for if we should return the sentinel key next.
1341
1387
  // The condition for returning sentinel is reaching the end of current
1342
1388
  // file_iter_: !Valid() && status.().ok().
@@ -1613,8 +1659,19 @@ bool LevelIterator::SkipEmptyFileForward() {
1613
1659
  const ScanOptions& opts =
1614
1660
  GetMultiScanArgForFile(file_index_).GetScanRanges().front();
1615
1661
  if (opts.range.start.has_value()) {
1616
- InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
1617
- kValueTypeForSeek);
1662
+ InternalKey target;
1663
+ const size_t ts_size =
1664
+ user_comparator_.user_comparator()->timestamp_size();
1665
+ if (ts_size == 0) {
1666
+ target = InternalKey(opts.range.start.value(), kMaxSequenceNumber,
1667
+ kValueTypeForSeek);
1668
+ } else {
1669
+ std::string seek_key;
1670
+ AppendKeyWithMaxTimestamp(&seek_key, opts.range.start.value(),
1671
+ ts_size);
1672
+ target =
1673
+ InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
1674
+ }
1618
1675
  file_iter_.Seek(target.Encode());
1619
1676
  }
1620
1677
  } else {
@@ -1662,14 +1719,14 @@ void LevelIterator::SkipEmptyFileBackward() {
1662
1719
  }
1663
1720
 
1664
1721
  #ifndef NDEBUG
1665
- bool LevelIterator::OverlapRange(const ScanOptions& opts) {
1722
+ bool LevelIterator::OverlapRange(const ScanOptions& opts, size_t file_index) {
1666
1723
  return (user_comparator_.CompareWithoutTimestamp(
1667
1724
  opts.range.start.value(), /*a_has_ts=*/false,
1668
- ExtractUserKey(flevel_->files[file_index_].largest_key),
1725
+ ExtractUserKey(flevel_->files[file_index].largest_key),
1669
1726
  /*b_has_ts=*/true) <= 0 &&
1670
1727
  user_comparator_.CompareWithoutTimestamp(
1671
1728
  opts.range.limit.value(), /*a_has_ts=*/false,
1672
- ExtractUserKey(flevel_->files[file_index_].smallest_key),
1729
+ ExtractUserKey(flevel_->files[file_index].smallest_key),
1673
1730
  /*b_has_ts=*/true) > 0);
1674
1731
  }
1675
1732
  #endif
@@ -1680,15 +1737,6 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
1680
1737
  }
1681
1738
 
1682
1739
  InternalIterator* old_iter = file_iter_.Set(iter);
1683
- if (iter && scan_opts_) {
1684
- if (FileHasMultiScanArg(file_index_)) {
1685
- const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
1686
- assert(OverlapRange(*new_opts.GetScanRanges().begin()) &&
1687
- OverlapRange(*new_opts.GetScanRanges().rbegin()));
1688
- file_iter_.Prepare(&new_opts);
1689
- }
1690
- }
1691
-
1692
1740
  // Update the read pattern for PrefetchBuffer.
1693
1741
  if (is_next_read_sequential_) {
1694
1742
  file_iter_.UpdateReadaheadState(old_iter);
@@ -1718,7 +1766,24 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
1718
1766
  // no need to change anything
1719
1767
  } else {
1720
1768
  file_index_ = new_file_index;
1769
+ if (!prepared_iters_.empty()) {
1770
+ auto prepared_it = prepared_iters_.find(file_index_);
1771
+ if (prepared_it != prepared_iters_.end()) {
1772
+ InternalIterator* iter = prepared_it->second;
1773
+ prepared_iters_.erase(prepared_it);
1774
+ SetFileIterator(iter);
1775
+ return;
1776
+ }
1777
+ }
1778
+
1721
1779
  InternalIterator* iter = NewFileIterator();
1780
+ if (FileHasMultiScanArg(file_index_)) {
1781
+ auto& args = GetMultiScanArgForFile(file_index_);
1782
+ assert(OverlapRange(*args.GetScanRanges().begin(), file_index_) &&
1783
+ OverlapRange(*args.GetScanRanges().rbegin(), file_index_));
1784
+ iter->Prepare(&args);
1785
+ }
1786
+
1722
1787
  SetFileIterator(iter);
1723
1788
  }
1724
1789
  }
@@ -2024,6 +2089,79 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
2024
2089
  }
2025
2090
  }
2026
2091
 
2092
+ void Version::GetColumnFamilyMetaData(
2093
+ const GetColumnFamilyMetaDataOptions& options,
2094
+ ColumnFamilyMetaData* cf_meta) {
2095
+ assert(cf_meta);
2096
+ assert(cfd_);
2097
+
2098
+ cf_meta->name = cfd_->GetName();
2099
+ cf_meta->size = 0;
2100
+ cf_meta->file_count = 0;
2101
+ cf_meta->levels.clear();
2102
+ cf_meta->blob_file_size = 0;
2103
+ cf_meta->blob_file_count = 0;
2104
+ cf_meta->blob_files.clear();
2105
+
2106
+ const auto& ioptions = cfd_->ioptions();
2107
+ auto* vstorage = storage_info();
2108
+
2109
+ int first_level = (options.level >= 0) ? options.level : 0;
2110
+ int last_level =
2111
+ (options.level >= 0) ? options.level + 1 : cfd_->NumberLevels();
2112
+
2113
+ InternalKey ikey_start, ikey_end;
2114
+ const InternalKey* begin = nullptr;
2115
+ const InternalKey* end = nullptr;
2116
+
2117
+ if (options.range.start.has_value()) {
2118
+ ikey_start = InternalKey(options.range.start.value(), kMaxSequenceNumber,
2119
+ kValueTypeForSeek);
2120
+ begin = &ikey_start;
2121
+ }
2122
+
2123
+ if (options.range.limit.has_value()) {
2124
+ ikey_end = InternalKey(options.range.limit.value(), kMaxSequenceNumber,
2125
+ kValueTypeForSeek);
2126
+ end = &ikey_end;
2127
+ }
2128
+
2129
+ for (int l = first_level; l < last_level; ++l) {
2130
+ uint64_t level_size = 0;
2131
+ std::vector<SstFileMetaData> files;
2132
+ std::vector<FileMetaData*> overlapping_files;
2133
+ vstorage->GetOverlappingInputs(l, begin, end, &overlapping_files);
2134
+
2135
+ for (const auto& file : overlapping_files) {
2136
+ uint32_t path_id = file->fd.GetPathId();
2137
+ const auto& file_path = (path_id < ioptions.cf_paths.size())
2138
+ ? ioptions.cf_paths[path_id].path
2139
+ : ioptions.cf_paths.back().path;
2140
+ const uint64_t file_number = file->fd.GetNumber();
2141
+ files.emplace_back(
2142
+ MakeTableFileName("", file_number), file_number, file_path,
2143
+ file->fd.GetFileSize(), file->fd.smallest_seqno,
2144
+ file->fd.largest_seqno, file->smallest.user_key().ToString(),
2145
+ file->largest.user_key().ToString(),
2146
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed),
2147
+ file->being_compacted, file->temperature,
2148
+ file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
2149
+ file->TryGetFileCreationTime(), file->epoch_number,
2150
+ file->file_checksum, file->file_checksum_func_name);
2151
+ files.back().num_entries = file->num_entries;
2152
+ files.back().num_deletions = file->num_deletions;
2153
+ files.back().smallest = file->smallest.Encode().ToString();
2154
+ files.back().largest = file->largest.Encode().ToString();
2155
+ level_size += file->fd.GetFileSize();
2156
+ cf_meta->file_count++;
2157
+ }
2158
+ if (!files.empty()) {
2159
+ cf_meta->levels.emplace_back(l, level_size, std::move(files));
2160
+ cf_meta->size += level_size;
2161
+ }
2162
+ }
2163
+ }
2164
+
2027
2165
  uint64_t Version::GetSstFilesSize() {
2028
2166
  uint64_t sst_files_size = 0;
2029
2167
  for (int level = 0; level < storage_info_.num_levels_; level++) {
@@ -2108,7 +2246,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
2108
2246
  cfd_->internal_stats()->GetFileReadHist(level),
2109
2247
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
2110
2248
  nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
2111
- allow_unprepared_value, &tombstone_iter_ptr);
2249
+ allow_unprepared_value, &tombstone_iter_ptr, db_statistics_, clock_);
2112
2250
  if (read_options.ignore_range_deletions) {
2113
2251
  merge_iter_builder->AddIterator(level_iter);
2114
2252
  } else {
@@ -2248,7 +2386,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
2248
2386
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
2249
2387
  /*range_del_agg=*/nullptr,
2250
2388
  /*compaction_boundaries=*/nullptr, allow_unprepared_value,
2251
- &tombstone_iter_ptr);
2389
+ &tombstone_iter_ptr, db_statistics_, clock_);
2252
2390
  if (read_options.ignore_range_deletions) {
2253
2391
  merge_iter_builder->AddIterator(level_iter);
2254
2392
  } else {
@@ -2305,7 +2443,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
2305
2443
  mutable_cf_options_, should_sample_file_read(),
2306
2444
  cfd_->internal_stats()->GetFileReadHist(level),
2307
2445
  TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
2308
- &range_del_agg, nullptr, false));
2446
+ &range_del_agg, nullptr, false, nullptr, db_statistics_, clock_));
2309
2447
  status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
2310
2448
  iter.get(), overlap);
2311
2449
  }
@@ -4487,7 +4625,8 @@ bool VersionStorageInfo::OverlapInLevel(int level,
4487
4625
  void VersionStorageInfo::GetOverlappingInputs(
4488
4626
  int level, const InternalKey* begin, const InternalKey* end,
4489
4627
  std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
4490
- bool expand_range, InternalKey** next_smallest) const {
4628
+ bool expand_range, const FileMetaData* starting_l0_file,
4629
+ InternalKey** next_smallest) const {
4491
4630
  if (level >= num_non_empty_levels_) {
4492
4631
  // this level is empty, no overlapping inputs
4493
4632
  return;
@@ -4520,7 +4659,19 @@ void VersionStorageInfo::GetOverlappingInputs(
4520
4659
 
4521
4660
  // index stores the file index need to check.
4522
4661
  std::list<size_t> index;
4523
- for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
4662
+ size_t start_index = 0;
4663
+ if (starting_l0_file != nullptr) {
4664
+ uint64_t starting_file_number = starting_l0_file->fd.GetNumber();
4665
+ for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
4666
+ if (level_files_brief_[level].files[i].fd.GetNumber() ==
4667
+ starting_file_number) {
4668
+ start_index = i;
4669
+ break;
4670
+ }
4671
+ }
4672
+ assert(start_index < level_files_brief_[level].num_files);
4673
+ }
4674
+ for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) {
4524
4675
  index.emplace_back(i);
4525
4676
  }
4526
4677
 
@@ -5307,6 +5458,7 @@ void AtomicGroupReadBuffer::Clear() {
5307
5458
 
5308
5459
  VersionSet::VersionSet(
5309
5460
  const std::string& dbname, const ImmutableDBOptions* _db_options,
5461
+ const MutableDBOptions& mutable_db_options,
5310
5462
  const FileOptions& storage_options, Cache* table_cache,
5311
5463
  WriteBufferManager* write_buffer_manager, WriteController* write_controller,
5312
5464
  BlockCacheTracer* const block_cache_tracer,
@@ -5335,6 +5487,7 @@ VersionSet::VersionSet(
5335
5487
  prev_log_number_(0),
5336
5488
  current_version_number_(0),
5337
5489
  manifest_file_size_(0),
5490
+ last_compacted_manifest_file_size_(0),
5338
5491
  file_options_(storage_options),
5339
5492
  block_cache_tracer_(block_cache_tracer),
5340
5493
  io_tracer_(io_tracer),
@@ -5342,7 +5495,9 @@ VersionSet::VersionSet(
5342
5495
  offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
5343
5496
  error_handler_(error_handler),
5344
5497
  unchanging_(unchanging),
5345
- closed_(false) {}
5498
+ closed_(false) {
5499
+ UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr);
5500
+ }
5346
5501
 
5347
5502
  Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
5348
5503
  Status s;
@@ -5416,6 +5571,15 @@ void VersionSet::Reset() {
5416
5571
  if (column_family_set_) {
5417
5572
  WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
5418
5573
  WriteController* wc = column_family_set_->write_controller();
5574
+
5575
+ // Clear TableCache to prevent use-after-free: Reset() deletes old
5576
+ // ColumnFamilySet but reuses table_cache_, which may contain
5577
+ // BlockBasedTable entries with dangling references to deleted CFD's
5578
+ // ioptions.
5579
+ if (table_cache_) {
5580
+ table_cache_->EraseUnRefEntries();
5581
+ }
5582
+
5419
5583
  // db_id becomes the source of truth after DBImpl::Recover():
5420
5584
  // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
5421
5585
  // Note: we may not be able to recover db_id from MANIFEST if
@@ -5438,11 +5602,38 @@ void VersionSet::Reset() {
5438
5602
  current_version_number_ = 0;
5439
5603
  manifest_writers_.clear();
5440
5604
  manifest_file_size_ = 0;
5605
+ last_compacted_manifest_file_size_ = 0;
5606
+ TuneMaxManifestFileSize();
5441
5607
  obsolete_files_.clear();
5442
5608
  obsolete_manifests_.clear();
5443
5609
  wals_.Reset();
5444
5610
  }
5445
5611
 
5612
+ void VersionSet::UpdatedMutableDbOptions(
5613
+ const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
5614
+ // Must be holding mutex if not called during initialization
5615
+ if (mu) {
5616
+ mu->AssertHeld();
5617
+ } else {
5618
+ // manifest_file_size_ must be 0 if called from the constructor
5619
+ assert(manifest_file_size_ == 0);
5620
+ }
5621
+ file_options_.writable_file_max_buffer_size =
5622
+ updated_options.writable_file_max_buffer_size;
5623
+ min_max_manifest_file_size_ = updated_options.max_manifest_file_size;
5624
+ max_manifest_space_amp_pct_ = static_cast<unsigned>(
5625
+ std::max(updated_options.max_manifest_space_amp_pct, 0));
5626
+ manifest_preallocation_size_ = updated_options.manifest_preallocation_size;
5627
+ TuneMaxManifestFileSize();
5628
+ }
5629
+
5630
+ void VersionSet::TuneMaxManifestFileSize() {
5631
+ tuned_max_manifest_file_size_ =
5632
+ std::max(min_max_manifest_file_size_,
5633
+ last_compacted_manifest_file_size_ *
5634
+ (100U + max_manifest_space_amp_pct_) / 100U);
5635
+ }
5636
+
5446
5637
  void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
5447
5638
  Version* v) {
5448
5639
  // compute new compaction score
@@ -5525,8 +5716,8 @@ Status VersionSet::ProcessManifestWrites(
5525
5716
  // the preceding version edits in the same atomic group, and update
5526
5717
  // their `remaining_entries_` member variable because we are NOT going
5527
5718
  // to write the version edits' of dropped CF to the MANIFEST. If we
5528
- // don't update, then Recover can report corrupted atomic group because
5529
- // the `remaining_entries_` do not match.
5719
+ // don't update, then Recover can report corrupted atomic group
5720
+ // because the `remaining_entries_` do not match.
5530
5721
  if (!batch_edits.empty()) {
5531
5722
  if (batch_edits.back()->IsInAtomicGroup() &&
5532
5723
  batch_edits.back()->GetRemainingEntries() > 0) {
@@ -5686,10 +5877,11 @@ Status VersionSet::ProcessManifestWrites(
5686
5877
  }
5687
5878
  #endif // NDEBUG
5688
5879
 
5880
+ uint64_t prev_manifest_file_size = manifest_file_size_;
5689
5881
  assert(pending_manifest_file_number_ == 0);
5690
5882
  if (!skip_manifest_write &&
5691
5883
  (!descriptor_log_ ||
5692
- manifest_file_size_ > db_options_->max_manifest_file_size)) {
5884
+ prev_manifest_file_size >= tuned_max_manifest_file_size_)) {
5693
5885
  TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
5694
5886
  new_descriptor_log = true;
5695
5887
  } else {
@@ -5729,6 +5921,8 @@ Status VersionSet::ProcessManifestWrites(
5729
5921
  IOStatus manifest_io_status;
5730
5922
  manifest_io_status.PermitUncheckedError();
5731
5923
  std::unique_ptr<log::Writer> new_desc_log_ptr;
5924
+ // Save before releasing mu
5925
+ uint64_t manifest_preallocation_size = manifest_preallocation_size_;
5732
5926
  if (skip_manifest_write) {
5733
5927
  if (s.ok()) {
5734
5928
  constexpr bool update_stats = true;
@@ -5772,16 +5966,13 @@ Status VersionSet::ProcessManifestWrites(
5772
5966
  // This is fine because everything inside of this block is serialized --
5773
5967
  // only one thread can be here at the same time
5774
5968
  // create new manifest file
5775
- ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
5776
- pending_manifest_file_number_);
5777
5969
  std::string descriptor_fname =
5778
5970
  DescriptorFileName(dbname_, pending_manifest_file_number_);
5779
5971
  std::unique_ptr<FSWritableFile> descriptor_file;
5780
5972
  io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
5781
5973
  opt_file_opts);
5782
5974
  if (io_s.ok()) {
5783
- descriptor_file->SetPreallocationBlockSize(
5784
- db_options_->manifest_preallocation_size);
5975
+ descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size);
5785
5976
  FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
5786
5977
  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
5787
5978
  std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
@@ -5831,10 +6022,12 @@ Status VersionSet::ProcessManifestWrites(
5831
6022
  #ifndef NDEBUG
5832
6023
  if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
5833
6024
  TEST_SYNC_POINT_CALLBACK(
5834
- "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
6025
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
6026
+ "0",
5835
6027
  nullptr);
5836
6028
  TEST_SYNC_POINT(
5837
- "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
6029
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
6030
+ "1");
5838
6031
  }
5839
6032
  ++idx;
5840
6033
  #endif /* !NDEBUG */
@@ -5871,8 +6064,8 @@ Status VersionSet::ProcessManifestWrites(
5871
6064
  file_options_.temperature, dir_contains_current_file);
5872
6065
  if (!io_s.ok()) {
5873
6066
  s = io_s;
5874
- // Quarantine old manifest file in case new manifest file's CURRENT file
5875
- // wasn't created successfully and the old manifest is needed.
6067
+ // Quarantine old manifest file in case new manifest file's CURRENT
6068
+ // file wasn't created successfully and the old manifest is needed.
5876
6069
  limbo_descriptor_log_file_number.push_back(manifest_file_number_);
5877
6070
  files_to_quarantine_if_commit_fail.push_back(
5878
6071
  &limbo_descriptor_log_file_number);
@@ -5882,6 +6075,13 @@ Status VersionSet::ProcessManifestWrites(
5882
6075
  if (s.ok()) {
5883
6076
  // find offset in manifest file where this version is stored.
5884
6077
  new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
6078
+ if (new_descriptor_log) {
6079
+ ROCKS_LOG_INFO(db_options_->info_log,
6080
+ "Created manifest %" PRIu64
6081
+ ", compacted+appended from %" PRIu64 " to %" PRIu64 "\n",
6082
+ pending_manifest_file_number_, prev_manifest_file_size,
6083
+ new_manifest_file_size);
6084
+ }
5885
6085
  }
5886
6086
 
5887
6087
  if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
@@ -5930,6 +6130,8 @@ Status VersionSet::ProcessManifestWrites(
5930
6130
  descriptor_log_ = std::move(new_desc_log_ptr);
5931
6131
  obsolete_manifests_.emplace_back(
5932
6132
  DescriptorFileName("", manifest_file_number_));
6133
+ last_compacted_manifest_file_size_ = new_manifest_file_size;
6134
+ TuneMaxManifestFileSize();
5933
6135
  }
5934
6136
 
5935
6137
  // Install the new versions
@@ -6012,21 +6214,21 @@ Status VersionSet::ProcessManifestWrites(
6012
6214
  // that renaming tmp file to CURRENT failed.
6013
6215
  //
6014
6216
  // On local POSIX-compliant FS, the CURRENT must point to the original
6015
- // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
6016
- // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
6017
- // process not to crash and continue using the db. Any future LogAndApply()
6018
- // call will switch to a new MANIFEST and update CURRENT, still ignoring
6019
- // this one.
6217
+ // MANIFEST. We can delete the new MANIFEST for simplicity, but we can
6218
+ // also keep it. Future recovery will ignore this MANIFEST. It's also ok
6219
+ // for the process not to crash and continue using the db. Any future
6220
+ // LogAndApply() call will switch to a new MANIFEST and update CURRENT,
6221
+ // still ignoring this one.
6020
6222
  //
6021
6223
  // On non-local FS, it is
6022
6224
  // possible that the rename operation succeeded on the server (remote)
6023
6225
  // side, but the client somehow returns a non-ok status to RocksDB. Note
6024
6226
  // that this does not violate atomicity. Should we delete the new MANIFEST
6025
6227
  // successfully, a subsequent recovery attempt will likely see the CURRENT
6026
- // pointing to the new MANIFEST, thus fail. We will not be able to open the
6027
- // DB again. Therefore, if manifest operations succeed, we should keep the
6028
- // the new MANIFEST. If the process proceeds, any future LogAndApply() call
6029
- // will switch to a new MANIFEST and update CURRENT. If user tries to
6228
+ // pointing to the new MANIFEST, thus fail. We will not be able to open
6229
+ // the DB again. Therefore, if manifest operations succeed, we should keep
6230
+ // the new MANIFEST. If the process proceeds, any future LogAndApply()
6231
+ // call will switch to a new MANIFEST and update CURRENT. If user tries to
6030
6232
  // re-open the DB,
6031
6233
  // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
6032
6234
  // b) CURRENT points to the original MANIFEST, and the original MANIFEST
@@ -6155,9 +6357,9 @@ Status VersionSet::LogAndApply(
6155
6357
  first_writer.cv.Wait();
6156
6358
  }
6157
6359
  if (first_writer.done) {
6158
- // All non-CF-manipulation operations can be grouped together and committed
6159
- // to MANIFEST. They should all have finished. The status code is stored in
6160
- // the first manifest writer.
6360
+ // All non-CF-manipulation operations can be grouped together and
6361
+ // committed to MANIFEST. They should all have finished. The status code
6362
+ // is stored in the first manifest writer.
6161
6363
  #ifndef NDEBUG
6162
6364
  for (const auto& writer : writers) {
6163
6365
  assert(writer.done);
@@ -6211,8 +6413,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
6211
6413
  assert(!edit->HasLastSequence());
6212
6414
  edit->SetLastSequence(*max_last_sequence);
6213
6415
  if (edit->IsColumnFamilyDrop()) {
6214
- // if we drop column family, we have to make sure to save max column family,
6215
- // so that we don't reuse existing ID
6416
+ // if we drop column family, we have to make sure to save max column
6417
+ // family, so that we don't reuse existing ID
6216
6418
  edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
6217
6419
  }
6218
6420
  }
@@ -6501,7 +6703,8 @@ void VersionSet::RecoverEpochNumbers() {
6501
6703
  Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
6502
6704
  const std::string& dbname,
6503
6705
  FileSystem* fs) {
6504
- // Read "CURRENT" file, which contains a pointer to the current manifest file
6706
+ // Read "CURRENT" file, which contains a pointer to the current manifest
6707
+ // file
6505
6708
  std::string manifest_path;
6506
6709
  uint64_t manifest_file_number;
6507
6710
  Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false,
@@ -6563,14 +6766,16 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
6563
6766
  const ReadOptions read_options;
6564
6767
  const WriteOptions write_options;
6565
6768
 
6566
- ImmutableDBOptions db_options(*options);
6769
+ ImmutableDBOptions imm_db_options(*options);
6770
+ MutableDBOptions mutable_db_options(*options);
6567
6771
  ColumnFamilyOptions cf_options(*options);
6568
6772
  std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
6569
6773
  options->table_cache_numshardbits));
6570
6774
  WriteController wc(options->delayed_write_rate);
6571
6775
  WriteBufferManager wb(options->db_write_buffer_size);
6572
- VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
6573
- nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
6776
+ VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options,
6777
+ tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/,
6778
+ nullptr /*IOTracer*/,
6574
6779
  /*db_id*/ "",
6575
6780
  /*db_session_id*/ "", options->daily_offpeak_time_utc,
6576
6781
  /*error_handler_*/ nullptr, /*unchanging=*/false);
@@ -6656,9 +6861,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
6656
6861
 
6657
6862
  // Get the checksum information including the checksum and checksum function
6658
6863
  // name of all SST and blob files in VersionSet. Store the information in
6659
- // FileChecksumList which contains a map from file number to its checksum info.
6660
- // If DB is not running, make sure call VersionSet::Recover() to load the file
6661
- // metadata from Manifest to VersionSet before calling this function.
6864
+ // FileChecksumList which contains a map from file number to its checksum
6865
+ // info. If DB is not running, make sure to call VersionSet::Recover() to load
6866
+ // the file metadata from Manifest to VersionSet before calling this function.
6662
6867
  Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
6663
6868
  // Clean the previously stored checksum information if any.
6664
6869
  Status s;
@@ -6800,8 +7005,8 @@ Status VersionSet::WriteCurrentStateToManifest(
6800
7005
  // WARNING: This method doesn't hold a mutex!!
6801
7006
 
6802
7007
  // This is done without DB mutex lock held, but only within single-threaded
6803
- // LogAndApply. Column family manipulations can only happen within LogAndApply
6804
- // (the same single thread), so we're safe to iterate.
7008
+ // LogAndApply. Column family manipulations can only happen within
7009
+ // LogAndApply (the same single thread), so we're safe to iterate.
6805
7010
 
6806
7011
  assert(io_s.ok());
6807
7012
  if (db_options_->write_dbid_to_manifest) {
@@ -6835,9 +7040,9 @@ Status VersionSet::WriteCurrentStateToManifest(
6835
7040
  }
6836
7041
 
6837
7042
  // New manifest should rollover the WAL deletion record from previous
6838
- // manifest. Otherwise, when an addition record of a deleted WAL gets added to
6839
- // this new manifest later (which can happens in e.g, SyncWAL()), this new
6840
- // manifest creates an illusion that such WAL hasn't been deleted.
7043
+ // manifest. Otherwise, when an addition record of a deleted WAL gets added
7044
+ // to this new manifest later (which can happen in, e.g., SyncWAL()), this
7045
+ // new manifest creates an illusion that such WAL hasn't been deleted.
6841
7046
  VersionEdit wal_deletions;
6842
7047
  wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
6843
7048
  std::string wal_deletions_record;
@@ -6969,9 +7174,9 @@ Status VersionSet::WriteCurrentStateToManifest(
6969
7174
  // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
6970
7175
  // function is called repeatedly with consecutive pairs of slices. For example
6971
7176
  // if the slice list is [a, b, c, d] this function is called with arguments
6972
- // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
6973
- // we avoid doing binary search for the keys b and c twice and instead somehow
6974
- // maintain state of where they first appear in the files.
7177
+ // (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible
7178
+ // where we avoid doing binary search for the keys b and c twice and instead
7179
+ // somehow maintain state of where they first appear in the files.
6975
7180
  uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
6976
7181
  const ReadOptions& read_options,
6977
7182
  Version* v, const Slice& start,
@@ -6992,19 +7197,20 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
6992
7197
  }
6993
7198
 
6994
7199
  // Outline of the optimization that uses options.files_size_error_margin.
6995
- // When approximating the files total size that is used to store a keys range,
6996
- // we first sum up the sizes of the files that fully fall into the range.
6997
- // Then we sum up the sizes of all the files that may intersect with the range
6998
- // (this includes all files in L0 as well). Then, if total_intersecting_size
6999
- // is smaller than total_full_size * options.files_size_error_margin - we can
7000
- // infer that the intersecting files have a sufficiently negligible
7001
- // contribution to the total size, and we can approximate the storage required
7002
- // for the keys in range as just half of the intersecting_files_size.
7003
- // E.g., if the value of files_size_error_margin is 0.1, then the error of the
7004
- // approximation is limited to only ~10% of the total size of files that fully
7005
- // fall into the keys range. In such case, this helps to avoid a costly
7006
- // process of binary searching the intersecting files that is required only
7007
- // for a more precise calculation of the total size.
7200
+ // When approximating the files total size that is used to store a keys
7201
+ // range, we first sum up the sizes of the files that fully fall into the
7202
+ // range. Then we sum up the sizes of all the files that may intersect with
7203
+ // the range (this includes all files in L0 as well). Then, if
7204
+ // total_intersecting_size is smaller than total_full_size *
7205
+ // options.files_size_error_margin - we can infer that the intersecting
7206
+ // files have a sufficiently negligible contribution to the total size, and
7207
+ // we can approximate the storage required for the keys in range as just
7208
+ // half of the intersecting_files_size. E.g., if the value of
7209
+ // files_size_error_margin is 0.1, then the error of the approximation is
7210
+ // limited to only ~10% of the total size of files that fully fall into the
7211
+ // keys range. In such case, this helps to avoid a costly process of binary
7212
+ // searching the intersecting files that is required only for a more precise
7213
+ // calculation of the total size.
7008
7214
 
7009
7215
  autovector<FdWithKeyRange*, 32> first_files;
7010
7216
  autovector<FdWithKeyRange*, 16> last_files;
@@ -7076,10 +7282,11 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
7076
7282
  total_intersecting_size += file_ptr->fd.GetFileSize();
7077
7283
  }
7078
7284
 
7079
- // Now scan all the first & last files at each level, and estimate their size.
7080
- // If the total_intersecting_size is less than X% of the total_full_size - we
7081
- // want to approximate the result in order to avoid the costly binary search
7082
- // inside ApproximateSize. We use half of file size as an approximation below.
7285
+ // Now scan all the first & last files at each level, and estimate their
7286
+ // size. If the total_intersecting_size is less than X% of the
7287
+ // total_full_size - we want to approximate the result in order to avoid the
7288
+ // costly binary search inside ApproximateSize. We use half of file size as
7289
+ // an approximation below.
7083
7290
 
7084
7291
  const double margin = options.files_size_error_margin;
7085
7292
  if (margin > 0 && total_intersecting_size <
@@ -7347,7 +7554,8 @@ InternalIterator* VersionSet::MakeInputIterator(
7347
7554
  /*no per level latency histogram=*/nullptr,
7348
7555
  TableReaderCaller::kCompaction, /*skip_filters=*/false,
7349
7556
  /*level=*/static_cast<int>(c->level(which)), range_del_agg,
7350
- c->boundaries(which), false, &tombstone_iter_ptr);
7557
+ c->boundaries(which), false, &tombstone_iter_ptr,
7558
+ db_options_->statistics.get(), clock_);
7351
7559
  range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
7352
7560
  }
7353
7561
  }
@@ -7622,12 +7830,13 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
7622
7830
  }
7623
7831
 
7624
7832
  ReactiveVersionSet::ReactiveVersionSet(
7625
- const std::string& dbname, const ImmutableDBOptions* _db_options,
7833
+ const std::string& dbname, const ImmutableDBOptions* imm_db_options,
7834
+ const MutableDBOptions& mutable_db_options,
7626
7835
  const FileOptions& _file_options, Cache* table_cache,
7627
7836
  WriteBufferManager* write_buffer_manager, WriteController* write_controller,
7628
7837
  const std::shared_ptr<IOTracer>& io_tracer)
7629
- : VersionSet(dbname, _db_options, _file_options, table_cache,
7630
- write_buffer_manager, write_controller,
7838
+ : VersionSet(dbname, imm_db_options, mutable_db_options, _file_options,
7839
+ table_cache, write_buffer_manager, write_controller,
7631
7840
  /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
7632
7841
  /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
7633
7842
  /*error_handler=*/nullptr, /*unchanging=*/false) {}
@@ -7751,8 +7960,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
7751
7960
  }
7752
7961
  } else if (s.IsPathNotFound()) {
7753
7962
  // This can happen if the primary switches to a new MANIFEST after the
7754
- // secondary reads the CURRENT file but before the secondary actually tries
7755
- // to open the MANIFEST.
7963
+ // secondary reads the CURRENT file but before the secondary actually
7964
+ // tries to open the MANIFEST.
7756
7965
  s = Status::TryAgain(
7757
7966
  "The primary may have switched to a new MANIFEST and deleted the old "
7758
7967
  "one.");