@nxtedition/rocksdb 15.1.2 → 15.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +15 -0
- package/binding.cc +79 -38
- package/build.sh +1 -2
- package/deps/rocksdb/rocksdb/BUCK +10 -8
- package/deps/rocksdb/rocksdb/CMakeLists.txt +27 -2
- package/deps/rocksdb/rocksdb/Makefile +27 -116
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +101 -124
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +47 -30
- package/deps/rocksdb/rocksdb/db/c.cc +793 -131
- package/deps/rocksdb/rocksdb/db/c_test.c +571 -0
- package/deps/rocksdb/rocksdb/db/compact_files_test.cc +226 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +95 -59
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +45 -35
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +47 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +12 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +82 -0
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +69 -24
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +9 -1
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +65 -0
- package/deps/rocksdb/rocksdb/db/db_etc3_test.cc +161 -0
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +20 -7
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +13 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +114 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_follower.cc +3 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +39 -25
- package/deps/rocksdb/rocksdb/db/db_iterator_test.cc +361 -0
- package/deps/rocksdb/rocksdb/db/db_options_test.cc +35 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +83 -0
- package/deps/rocksdb/rocksdb/db/db_test.cc +249 -4
- package/deps/rocksdb/rocksdb/db/db_test2.cc +3 -0
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +3 -2
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -7
- package/deps/rocksdb/rocksdb/db/listener_test.cc +7 -17
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +4 -2
- package/deps/rocksdb/rocksdb/db/obsolete_files_test.cc +41 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +2 -2
- package/deps/rocksdb/rocksdb/db/version_edit.h +7 -4
- package/deps/rocksdb/rocksdb/db/version_set.cc +299 -90
- package/deps/rocksdb/rocksdb/db/version_set.h +56 -9
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +41 -39
- package/deps/rocksdb/rocksdb/db/version_util.h +3 -2
- package/deps/rocksdb/rocksdb/db/wal_manager.cc +7 -1
- package/deps/rocksdb/rocksdb/db/wal_manager_test.cc +48 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +16 -5
- package/deps/rocksdb/rocksdb/env/env_test.cc +126 -41
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +14 -7
- package/deps/rocksdb/rocksdb/env/io_posix.cc +304 -112
- package/deps/rocksdb/rocksdb/env/io_posix.h +16 -4
- package/deps/rocksdb/rocksdb/env/io_posix_test.cc +43 -0
- package/deps/rocksdb/rocksdb/folly.mk +148 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_compression.h +29 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +73 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +246 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +0 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/data_structure.h +15 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +19 -9
- package/deps/rocksdb/rocksdb/include/rocksdb/env.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +14 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +67 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/sst_file_writer.h +1 -7
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/thread_status.h +6 -14
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/backup_engine.h +8 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/env_mirror.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +0 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/option_change_migration.h +33 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +2 -0
- package/deps/rocksdb/rocksdb/monitoring/thread_status_impl.cc +5 -2
- package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.cc +2 -2
- package/deps/rocksdb/rocksdb/monitoring/thread_status_updater.h +6 -6
- package/deps/rocksdb/rocksdb/monitoring/thread_status_updater_debug.cc +2 -2
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util.cc +10 -5
- package/deps/rocksdb/rocksdb/monitoring/thread_status_util.h +2 -2
- package/deps/rocksdb/rocksdb/options/cf_options.cc +15 -3
- package/deps/rocksdb/rocksdb/options/cf_options.h +7 -0
- package/deps/rocksdb/rocksdb/options/db_options.cc +27 -36
- package/deps/rocksdb/rocksdb/options/db_options.h +3 -2
- package/deps/rocksdb/rocksdb/options/options.cc +4 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -2
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -1
- package/deps/rocksdb/rocksdb/options/options_test.cc +19 -3
- package/deps/rocksdb/rocksdb/src.mk +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +155 -32
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +7 -3
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.cc +169 -125
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_iterator.h +22 -7
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +43 -24
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +9 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +9 -8
- package/deps/rocksdb/rocksdb/table/block_based/filter_block.h +17 -0
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -5
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy_internal.h +13 -18
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +29 -0
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.h +6 -0
- package/deps/rocksdb/rocksdb/table/block_based/full_filter_block_test.cc +15 -0
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +79 -19
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +48 -20
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +19 -0
- package/deps/rocksdb/rocksdb/table/block_based/user_defined_index_wrapper.h +1 -1
- package/deps/rocksdb/rocksdb/table/external_table.cc +2 -2
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +3 -2
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.h +3 -1
- package/deps/rocksdb/rocksdb/table/table_builder.h +5 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +4 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +48 -39
- package/deps/rocksdb/rocksdb/test_util/sync_point.cc +4 -0
- package/deps/rocksdb/rocksdb/test_util/sync_point.h +32 -0
- package/deps/rocksdb/rocksdb/test_util/testutil.h +6 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +14 -4
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +8 -5
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_test.cc +3 -2
- package/deps/rocksdb/rocksdb/tools/sst_dump_tool.cc +63 -12
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.cc +16 -1
- package/deps/rocksdb/rocksdb/util/auto_tune_compressor.h +5 -1
- package/deps/rocksdb/rocksdb/util/bit_fields.h +133 -23
- package/deps/rocksdb/rocksdb/util/bloom_test.cc +2 -5
- package/deps/rocksdb/rocksdb/util/compression.cc +51 -23
- package/deps/rocksdb/rocksdb/util/compression_test.cc +525 -270
- package/deps/rocksdb/rocksdb/util/filter_bench.cc +3 -4
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.cc +11 -2
- package/deps/rocksdb/rocksdb/util/simple_mixed_compressor.h +4 -1
- package/deps/rocksdb/rocksdb/util/slice_test.cc +92 -0
- package/deps/rocksdb/rocksdb/util/thread_list_test.cc +2 -2
- package/deps/rocksdb/rocksdb/util/thread_operation.h +2 -2
- package/deps/rocksdb/rocksdb/util/threadpool_imp.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +19 -2
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +75 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_test.cc +1 -0
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration.cc +303 -111
- package/deps/rocksdb/rocksdb/utilities/option_change_migration/option_change_migration_test.cc +379 -0
- package/deps/rocksdb/rocksdb.gyp +1 -0
- package/iterator.js +66 -70
- package/package.json +6 -6
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/deps/rocksdb/rocksdb/table/block_based/index_builder_test.cc +0 -183
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
#include <list>
|
|
17
17
|
#include <map>
|
|
18
18
|
#include <set>
|
|
19
|
+
#include <stdexcept>
|
|
19
20
|
#include <string>
|
|
20
21
|
#include <unordered_map>
|
|
21
22
|
#include <vector>
|
|
@@ -980,7 +981,8 @@ class LevelIterator final : public InternalIterator {
|
|
|
980
981
|
nullptr,
|
|
981
982
|
bool allow_unprepared_value = false,
|
|
982
983
|
std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ =
|
|
983
|
-
nullptr
|
|
984
|
+
nullptr,
|
|
985
|
+
Statistics* db_statistics = nullptr, SystemClock* clock = nullptr)
|
|
984
986
|
: table_cache_(table_cache),
|
|
985
987
|
read_options_(read_options),
|
|
986
988
|
file_options_(file_options),
|
|
@@ -1005,7 +1007,9 @@ class LevelIterator final : public InternalIterator {
|
|
|
1005
1007
|
allow_unprepared_value_(allow_unprepared_value),
|
|
1006
1008
|
is_next_read_sequential_(false),
|
|
1007
1009
|
to_return_sentinel_(false),
|
|
1008
|
-
scan_opts_(nullptr)
|
|
1010
|
+
scan_opts_(nullptr),
|
|
1011
|
+
db_statistics_(db_statistics),
|
|
1012
|
+
clock_(clock) {
|
|
1009
1013
|
// Empty level is not supported.
|
|
1010
1014
|
assert(flevel_ != nullptr && flevel_->num_files > 0);
|
|
1011
1015
|
if (range_tombstone_iter_ptr_) {
|
|
@@ -1013,7 +1017,15 @@ class LevelIterator final : public InternalIterator {
|
|
|
1013
1017
|
}
|
|
1014
1018
|
}
|
|
1015
1019
|
|
|
1016
|
-
~LevelIterator() override {
|
|
1020
|
+
~LevelIterator() override {
|
|
1021
|
+
delete file_iter_.Set(nullptr);
|
|
1022
|
+
// Clean up any prepared iterators that weren't used
|
|
1023
|
+
for (auto& entry : prepared_iters_) {
|
|
1024
|
+
delete entry.second;
|
|
1025
|
+
}
|
|
1026
|
+
prepared_iters_.clear();
|
|
1027
|
+
assert(prepared_iters_.size() == 0);
|
|
1028
|
+
}
|
|
1017
1029
|
|
|
1018
1030
|
// Seek to the first file with a key >= target.
|
|
1019
1031
|
// If range_tombstone_iter_ is not nullptr, then we pretend that file
|
|
@@ -1124,10 +1136,12 @@ class LevelIterator final : public InternalIterator {
|
|
|
1124
1136
|
|
|
1125
1137
|
void Prepare(const MultiScanArgs* so) override {
|
|
1126
1138
|
// We assume here that scan_opts is sorted such that
|
|
1127
|
-
// scan_opts[0].range.start < scan_opts[1].range.start, and non
|
|
1139
|
+
// scan_opts[0].range.start < scan_opts[1].range.start, and non
|
|
1140
|
+
// overlapping
|
|
1128
1141
|
if (so == nullptr) {
|
|
1129
1142
|
return;
|
|
1130
1143
|
}
|
|
1144
|
+
|
|
1131
1145
|
scan_opts_ = so;
|
|
1132
1146
|
|
|
1133
1147
|
// Verify comparator is consistent
|
|
@@ -1188,8 +1202,8 @@ class LevelIterator final : public InternalIterator {
|
|
|
1188
1202
|
continue;
|
|
1189
1203
|
}
|
|
1190
1204
|
auto const metadata = flevel_->files[i].file_metadata;
|
|
1191
|
-
if (metadata->
|
|
1192
|
-
// Skip range deletion
|
|
1205
|
+
if (metadata->FileIsStandAloneRangeTombstone()) {
|
|
1206
|
+
// Skip stand alone range deletion files.
|
|
1193
1207
|
continue;
|
|
1194
1208
|
}
|
|
1195
1209
|
auto& args = GetMultiScanArgForFile(i);
|
|
@@ -1197,9 +1211,35 @@ class LevelIterator final : public InternalIterator {
|
|
|
1197
1211
|
}
|
|
1198
1212
|
}
|
|
1199
1213
|
}
|
|
1214
|
+
|
|
1215
|
+
StopWatch timer(clock_, db_statistics_, MULTISCAN_PREPARE_ITERATORS);
|
|
1216
|
+
|
|
1200
1217
|
// Propagate multiscan configs
|
|
1201
1218
|
for (auto& file_to_arg : *file_to_scan_opts_) {
|
|
1202
1219
|
file_to_arg.second.CopyConfigFrom(*so);
|
|
1220
|
+
assert(OverlapRange(*file_to_arg.second.GetScanRanges().begin(),
|
|
1221
|
+
file_to_arg.first) &&
|
|
1222
|
+
OverlapRange(*file_to_arg.second.GetScanRanges().rbegin(),
|
|
1223
|
+
file_to_arg.first));
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
if (so->use_async_io) {
|
|
1227
|
+
auto before = file_index_;
|
|
1228
|
+
// Pre-create and prepare only relevant file iterators
|
|
1229
|
+
for (auto& file_to_arg : *file_to_scan_opts_) {
|
|
1230
|
+
size_t file_index = file_to_arg.first;
|
|
1231
|
+
|
|
1232
|
+
file_index_ = file_index;
|
|
1233
|
+
// Create iterator for this file
|
|
1234
|
+
auto iter = NewFileIterator();
|
|
1235
|
+
if (iter != nullptr) {
|
|
1236
|
+
// If we have async enabled, lets prepare all our iterators.
|
|
1237
|
+
iter->Prepare(&file_to_arg.second);
|
|
1238
|
+
// Store the prepared iterator
|
|
1239
|
+
prepared_iters_[file_index] = iter;
|
|
1240
|
+
}
|
|
1241
|
+
}
|
|
1242
|
+
file_index_ = before;
|
|
1203
1243
|
}
|
|
1204
1244
|
}
|
|
1205
1245
|
|
|
@@ -1276,7 +1316,7 @@ class LevelIterator final : public InternalIterator {
|
|
|
1276
1316
|
}
|
|
1277
1317
|
|
|
1278
1318
|
#ifndef NDEBUG
|
|
1279
|
-
bool OverlapRange(const ScanOptions& opts);
|
|
1319
|
+
bool OverlapRange(const ScanOptions& opts, size_t file_index);
|
|
1280
1320
|
#endif
|
|
1281
1321
|
|
|
1282
1322
|
TableCache* table_cache_;
|
|
@@ -1334,9 +1374,15 @@ class LevelIterator final : public InternalIterator {
|
|
|
1334
1374
|
bool to_return_sentinel_ = false;
|
|
1335
1375
|
const MultiScanArgs* scan_opts_ = nullptr;
|
|
1336
1376
|
|
|
1377
|
+
Statistics* db_statistics_ = nullptr;
|
|
1378
|
+
SystemClock* clock_ = nullptr;
|
|
1379
|
+
|
|
1337
1380
|
// Our stored scan_opts for each prefix
|
|
1338
1381
|
std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
|
|
1339
1382
|
|
|
1383
|
+
// Map to store pre-created iterators by file index
|
|
1384
|
+
std::unordered_map<size_t, InternalIterator*> prepared_iters_;
|
|
1385
|
+
|
|
1340
1386
|
// Sets flags for if we should return the sentinel key next.
|
|
1341
1387
|
// The condition for returning sentinel is reaching the end of current
|
|
1342
1388
|
// file_iter_: !Valid() && status.().ok().
|
|
@@ -1613,8 +1659,19 @@ bool LevelIterator::SkipEmptyFileForward() {
|
|
|
1613
1659
|
const ScanOptions& opts =
|
|
1614
1660
|
GetMultiScanArgForFile(file_index_).GetScanRanges().front();
|
|
1615
1661
|
if (opts.range.start.has_value()) {
|
|
1616
|
-
InternalKey target
|
|
1617
|
-
|
|
1662
|
+
InternalKey target;
|
|
1663
|
+
const size_t ts_size =
|
|
1664
|
+
user_comparator_.user_comparator()->timestamp_size();
|
|
1665
|
+
if (ts_size == 0) {
|
|
1666
|
+
target = InternalKey(opts.range.start.value(), kMaxSequenceNumber,
|
|
1667
|
+
kValueTypeForSeek);
|
|
1668
|
+
} else {
|
|
1669
|
+
std::string seek_key;
|
|
1670
|
+
AppendKeyWithMaxTimestamp(&seek_key, opts.range.start.value(),
|
|
1671
|
+
ts_size);
|
|
1672
|
+
target =
|
|
1673
|
+
InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
|
|
1674
|
+
}
|
|
1618
1675
|
file_iter_.Seek(target.Encode());
|
|
1619
1676
|
}
|
|
1620
1677
|
} else {
|
|
@@ -1662,14 +1719,14 @@ void LevelIterator::SkipEmptyFileBackward() {
|
|
|
1662
1719
|
}
|
|
1663
1720
|
|
|
1664
1721
|
#ifndef NDEBUG
|
|
1665
|
-
bool LevelIterator::OverlapRange(const ScanOptions& opts) {
|
|
1722
|
+
bool LevelIterator::OverlapRange(const ScanOptions& opts, size_t file_index) {
|
|
1666
1723
|
return (user_comparator_.CompareWithoutTimestamp(
|
|
1667
1724
|
opts.range.start.value(), /*a_has_ts=*/false,
|
|
1668
|
-
ExtractUserKey(flevel_->files[
|
|
1725
|
+
ExtractUserKey(flevel_->files[file_index].largest_key),
|
|
1669
1726
|
/*b_has_ts=*/true) <= 0 &&
|
|
1670
1727
|
user_comparator_.CompareWithoutTimestamp(
|
|
1671
1728
|
opts.range.limit.value(), /*a_has_ts=*/false,
|
|
1672
|
-
ExtractUserKey(flevel_->files[
|
|
1729
|
+
ExtractUserKey(flevel_->files[file_index].smallest_key),
|
|
1673
1730
|
/*b_has_ts=*/true) > 0);
|
|
1674
1731
|
}
|
|
1675
1732
|
#endif
|
|
@@ -1680,15 +1737,6 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
|
|
|
1680
1737
|
}
|
|
1681
1738
|
|
|
1682
1739
|
InternalIterator* old_iter = file_iter_.Set(iter);
|
|
1683
|
-
if (iter && scan_opts_) {
|
|
1684
|
-
if (FileHasMultiScanArg(file_index_)) {
|
|
1685
|
-
const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
|
|
1686
|
-
assert(OverlapRange(*new_opts.GetScanRanges().begin()) &&
|
|
1687
|
-
OverlapRange(*new_opts.GetScanRanges().rbegin()));
|
|
1688
|
-
file_iter_.Prepare(&new_opts);
|
|
1689
|
-
}
|
|
1690
|
-
}
|
|
1691
|
-
|
|
1692
1740
|
// Update the read pattern for PrefetchBuffer.
|
|
1693
1741
|
if (is_next_read_sequential_) {
|
|
1694
1742
|
file_iter_.UpdateReadaheadState(old_iter);
|
|
@@ -1718,7 +1766,24 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
|
|
|
1718
1766
|
// no need to change anything
|
|
1719
1767
|
} else {
|
|
1720
1768
|
file_index_ = new_file_index;
|
|
1769
|
+
if (!prepared_iters_.empty()) {
|
|
1770
|
+
auto prepared_it = prepared_iters_.find(file_index_);
|
|
1771
|
+
if (prepared_it != prepared_iters_.end()) {
|
|
1772
|
+
InternalIterator* iter = prepared_it->second;
|
|
1773
|
+
prepared_iters_.erase(prepared_it);
|
|
1774
|
+
SetFileIterator(iter);
|
|
1775
|
+
return;
|
|
1776
|
+
}
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1721
1779
|
InternalIterator* iter = NewFileIterator();
|
|
1780
|
+
if (FileHasMultiScanArg(file_index_)) {
|
|
1781
|
+
auto& args = GetMultiScanArgForFile(file_index_);
|
|
1782
|
+
assert(OverlapRange(*args.GetScanRanges().begin(), file_index_) &&
|
|
1783
|
+
OverlapRange(*args.GetScanRanges().rbegin(), file_index_));
|
|
1784
|
+
iter->Prepare(&args);
|
|
1785
|
+
}
|
|
1786
|
+
|
|
1722
1787
|
SetFileIterator(iter);
|
|
1723
1788
|
}
|
|
1724
1789
|
}
|
|
@@ -2024,6 +2089,79 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
|
|
|
2024
2089
|
}
|
|
2025
2090
|
}
|
|
2026
2091
|
|
|
2092
|
+
void Version::GetColumnFamilyMetaData(
|
|
2093
|
+
const GetColumnFamilyMetaDataOptions& options,
|
|
2094
|
+
ColumnFamilyMetaData* cf_meta) {
|
|
2095
|
+
assert(cf_meta);
|
|
2096
|
+
assert(cfd_);
|
|
2097
|
+
|
|
2098
|
+
cf_meta->name = cfd_->GetName();
|
|
2099
|
+
cf_meta->size = 0;
|
|
2100
|
+
cf_meta->file_count = 0;
|
|
2101
|
+
cf_meta->levels.clear();
|
|
2102
|
+
cf_meta->blob_file_size = 0;
|
|
2103
|
+
cf_meta->blob_file_count = 0;
|
|
2104
|
+
cf_meta->blob_files.clear();
|
|
2105
|
+
|
|
2106
|
+
const auto& ioptions = cfd_->ioptions();
|
|
2107
|
+
auto* vstorage = storage_info();
|
|
2108
|
+
|
|
2109
|
+
int first_level = (options.level >= 0) ? options.level : 0;
|
|
2110
|
+
int last_level =
|
|
2111
|
+
(options.level >= 0) ? options.level + 1 : cfd_->NumberLevels();
|
|
2112
|
+
|
|
2113
|
+
InternalKey ikey_start, ikey_end;
|
|
2114
|
+
const InternalKey* begin = nullptr;
|
|
2115
|
+
const InternalKey* end = nullptr;
|
|
2116
|
+
|
|
2117
|
+
if (options.range.start.has_value()) {
|
|
2118
|
+
ikey_start = InternalKey(options.range.start.value(), kMaxSequenceNumber,
|
|
2119
|
+
kValueTypeForSeek);
|
|
2120
|
+
begin = &ikey_start;
|
|
2121
|
+
}
|
|
2122
|
+
|
|
2123
|
+
if (options.range.limit.has_value()) {
|
|
2124
|
+
ikey_end = InternalKey(options.range.limit.value(), kMaxSequenceNumber,
|
|
2125
|
+
kValueTypeForSeek);
|
|
2126
|
+
end = &ikey_end;
|
|
2127
|
+
}
|
|
2128
|
+
|
|
2129
|
+
for (int l = first_level; l < last_level; ++l) {
|
|
2130
|
+
uint64_t level_size = 0;
|
|
2131
|
+
std::vector<SstFileMetaData> files;
|
|
2132
|
+
std::vector<FileMetaData*> overlapping_files;
|
|
2133
|
+
vstorage->GetOverlappingInputs(l, begin, end, &overlapping_files);
|
|
2134
|
+
|
|
2135
|
+
for (const auto& file : overlapping_files) {
|
|
2136
|
+
uint32_t path_id = file->fd.GetPathId();
|
|
2137
|
+
const auto& file_path = (path_id < ioptions.cf_paths.size())
|
|
2138
|
+
? ioptions.cf_paths[path_id].path
|
|
2139
|
+
: ioptions.cf_paths.back().path;
|
|
2140
|
+
const uint64_t file_number = file->fd.GetNumber();
|
|
2141
|
+
files.emplace_back(
|
|
2142
|
+
MakeTableFileName("", file_number), file_number, file_path,
|
|
2143
|
+
file->fd.GetFileSize(), file->fd.smallest_seqno,
|
|
2144
|
+
file->fd.largest_seqno, file->smallest.user_key().ToString(),
|
|
2145
|
+
file->largest.user_key().ToString(),
|
|
2146
|
+
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
|
|
2147
|
+
file->being_compacted, file->temperature,
|
|
2148
|
+
file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
|
|
2149
|
+
file->TryGetFileCreationTime(), file->epoch_number,
|
|
2150
|
+
file->file_checksum, file->file_checksum_func_name);
|
|
2151
|
+
files.back().num_entries = file->num_entries;
|
|
2152
|
+
files.back().num_deletions = file->num_deletions;
|
|
2153
|
+
files.back().smallest = file->smallest.Encode().ToString();
|
|
2154
|
+
files.back().largest = file->largest.Encode().ToString();
|
|
2155
|
+
level_size += file->fd.GetFileSize();
|
|
2156
|
+
cf_meta->file_count++;
|
|
2157
|
+
}
|
|
2158
|
+
if (!files.empty()) {
|
|
2159
|
+
cf_meta->levels.emplace_back(l, level_size, std::move(files));
|
|
2160
|
+
cf_meta->size += level_size;
|
|
2161
|
+
}
|
|
2162
|
+
}
|
|
2163
|
+
}
|
|
2164
|
+
|
|
2027
2165
|
uint64_t Version::GetSstFilesSize() {
|
|
2028
2166
|
uint64_t sst_files_size = 0;
|
|
2029
2167
|
for (int level = 0; level < storage_info_.num_levels_; level++) {
|
|
@@ -2108,7 +2246,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
|
|
|
2108
2246
|
cfd_->internal_stats()->GetFileReadHist(level),
|
|
2109
2247
|
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
|
|
2110
2248
|
nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
|
|
2111
|
-
allow_unprepared_value, &tombstone_iter_ptr);
|
|
2249
|
+
allow_unprepared_value, &tombstone_iter_ptr, db_statistics_, clock_);
|
|
2112
2250
|
if (read_options.ignore_range_deletions) {
|
|
2113
2251
|
merge_iter_builder->AddIterator(level_iter);
|
|
2114
2252
|
} else {
|
|
@@ -2248,7 +2386,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
|
|
|
2248
2386
|
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
|
|
2249
2387
|
/*range_del_agg=*/nullptr,
|
|
2250
2388
|
/*compaction_boundaries=*/nullptr, allow_unprepared_value,
|
|
2251
|
-
&tombstone_iter_ptr);
|
|
2389
|
+
&tombstone_iter_ptr, db_statistics_, clock_);
|
|
2252
2390
|
if (read_options.ignore_range_deletions) {
|
|
2253
2391
|
merge_iter_builder->AddIterator(level_iter);
|
|
2254
2392
|
} else {
|
|
@@ -2305,7 +2443,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
|
|
|
2305
2443
|
mutable_cf_options_, should_sample_file_read(),
|
|
2306
2444
|
cfd_->internal_stats()->GetFileReadHist(level),
|
|
2307
2445
|
TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
|
|
2308
|
-
&range_del_agg, nullptr, false));
|
|
2446
|
+
&range_del_agg, nullptr, false, nullptr, db_statistics_, clock_));
|
|
2309
2447
|
status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
|
|
2310
2448
|
iter.get(), overlap);
|
|
2311
2449
|
}
|
|
@@ -4487,7 +4625,8 @@ bool VersionStorageInfo::OverlapInLevel(int level,
|
|
|
4487
4625
|
void VersionStorageInfo::GetOverlappingInputs(
|
|
4488
4626
|
int level, const InternalKey* begin, const InternalKey* end,
|
|
4489
4627
|
std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
|
|
4490
|
-
bool expand_range,
|
|
4628
|
+
bool expand_range, const FileMetaData* starting_l0_file,
|
|
4629
|
+
InternalKey** next_smallest) const {
|
|
4491
4630
|
if (level >= num_non_empty_levels_) {
|
|
4492
4631
|
// this level is empty, no overlapping inputs
|
|
4493
4632
|
return;
|
|
@@ -4520,7 +4659,19 @@ void VersionStorageInfo::GetOverlappingInputs(
|
|
|
4520
4659
|
|
|
4521
4660
|
// index stores the file index need to check.
|
|
4522
4661
|
std::list<size_t> index;
|
|
4523
|
-
|
|
4662
|
+
size_t start_index = 0;
|
|
4663
|
+
if (starting_l0_file != nullptr) {
|
|
4664
|
+
uint64_t starting_file_number = starting_l0_file->fd.GetNumber();
|
|
4665
|
+
for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
|
|
4666
|
+
if (level_files_brief_[level].files[i].fd.GetNumber() ==
|
|
4667
|
+
starting_file_number) {
|
|
4668
|
+
start_index = i;
|
|
4669
|
+
break;
|
|
4670
|
+
}
|
|
4671
|
+
}
|
|
4672
|
+
assert(start_index < level_files_brief_[level].num_files);
|
|
4673
|
+
}
|
|
4674
|
+
for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) {
|
|
4524
4675
|
index.emplace_back(i);
|
|
4525
4676
|
}
|
|
4526
4677
|
|
|
@@ -5307,6 +5458,7 @@ void AtomicGroupReadBuffer::Clear() {
|
|
|
5307
5458
|
|
|
5308
5459
|
VersionSet::VersionSet(
|
|
5309
5460
|
const std::string& dbname, const ImmutableDBOptions* _db_options,
|
|
5461
|
+
const MutableDBOptions& mutable_db_options,
|
|
5310
5462
|
const FileOptions& storage_options, Cache* table_cache,
|
|
5311
5463
|
WriteBufferManager* write_buffer_manager, WriteController* write_controller,
|
|
5312
5464
|
BlockCacheTracer* const block_cache_tracer,
|
|
@@ -5335,6 +5487,7 @@ VersionSet::VersionSet(
|
|
|
5335
5487
|
prev_log_number_(0),
|
|
5336
5488
|
current_version_number_(0),
|
|
5337
5489
|
manifest_file_size_(0),
|
|
5490
|
+
last_compacted_manifest_file_size_(0),
|
|
5338
5491
|
file_options_(storage_options),
|
|
5339
5492
|
block_cache_tracer_(block_cache_tracer),
|
|
5340
5493
|
io_tracer_(io_tracer),
|
|
@@ -5342,7 +5495,9 @@ VersionSet::VersionSet(
|
|
|
5342
5495
|
offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
|
|
5343
5496
|
error_handler_(error_handler),
|
|
5344
5497
|
unchanging_(unchanging),
|
|
5345
|
-
closed_(false) {
|
|
5498
|
+
closed_(false) {
|
|
5499
|
+
UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr);
|
|
5500
|
+
}
|
|
5346
5501
|
|
|
5347
5502
|
Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
|
|
5348
5503
|
Status s;
|
|
@@ -5416,6 +5571,15 @@ void VersionSet::Reset() {
|
|
|
5416
5571
|
if (column_family_set_) {
|
|
5417
5572
|
WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
|
|
5418
5573
|
WriteController* wc = column_family_set_->write_controller();
|
|
5574
|
+
|
|
5575
|
+
// Clear TableCache to prevent use-after-free: Reset() deletes old
|
|
5576
|
+
// ColumnFamilySet but reuses table_cache_, which may contain
|
|
5577
|
+
// BlockBasedTable entries with dangling references to deleted CFD's
|
|
5578
|
+
// ioptions.
|
|
5579
|
+
if (table_cache_) {
|
|
5580
|
+
table_cache_->EraseUnRefEntries();
|
|
5581
|
+
}
|
|
5582
|
+
|
|
5419
5583
|
// db_id becomes the source of truth after DBImpl::Recover():
|
|
5420
5584
|
// https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
|
|
5421
5585
|
// Note: we may not be able to recover db_id from MANIFEST if
|
|
@@ -5438,11 +5602,38 @@ void VersionSet::Reset() {
|
|
|
5438
5602
|
current_version_number_ = 0;
|
|
5439
5603
|
manifest_writers_.clear();
|
|
5440
5604
|
manifest_file_size_ = 0;
|
|
5605
|
+
last_compacted_manifest_file_size_ = 0;
|
|
5606
|
+
TuneMaxManifestFileSize();
|
|
5441
5607
|
obsolete_files_.clear();
|
|
5442
5608
|
obsolete_manifests_.clear();
|
|
5443
5609
|
wals_.Reset();
|
|
5444
5610
|
}
|
|
5445
5611
|
|
|
5612
|
+
void VersionSet::UpdatedMutableDbOptions(
|
|
5613
|
+
const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
|
|
5614
|
+
// Must be holding mutex if not called during initialization
|
|
5615
|
+
if (mu) {
|
|
5616
|
+
mu->AssertHeld();
|
|
5617
|
+
} else {
|
|
5618
|
+
// manifest_file_size_ must be 0 if called from the constructor
|
|
5619
|
+
assert(manifest_file_size_ == 0);
|
|
5620
|
+
}
|
|
5621
|
+
file_options_.writable_file_max_buffer_size =
|
|
5622
|
+
updated_options.writable_file_max_buffer_size;
|
|
5623
|
+
min_max_manifest_file_size_ = updated_options.max_manifest_file_size;
|
|
5624
|
+
max_manifest_space_amp_pct_ = static_cast<unsigned>(
|
|
5625
|
+
std::max(updated_options.max_manifest_space_amp_pct, 0));
|
|
5626
|
+
manifest_preallocation_size_ = updated_options.manifest_preallocation_size;
|
|
5627
|
+
TuneMaxManifestFileSize();
|
|
5628
|
+
}
|
|
5629
|
+
|
|
5630
|
+
void VersionSet::TuneMaxManifestFileSize() {
|
|
5631
|
+
tuned_max_manifest_file_size_ =
|
|
5632
|
+
std::max(min_max_manifest_file_size_,
|
|
5633
|
+
last_compacted_manifest_file_size_ *
|
|
5634
|
+
(100U + max_manifest_space_amp_pct_) / 100U);
|
|
5635
|
+
}
|
|
5636
|
+
|
|
5446
5637
|
void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
|
|
5447
5638
|
Version* v) {
|
|
5448
5639
|
// compute new compaction score
|
|
@@ -5525,8 +5716,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5525
5716
|
// the preceding version edits in the same atomic group, and update
|
|
5526
5717
|
// their `remaining_entries_` member variable because we are NOT going
|
|
5527
5718
|
// to write the version edits' of dropped CF to the MANIFEST. If we
|
|
5528
|
-
// don't update, then Recover can report corrupted atomic group
|
|
5529
|
-
// the `remaining_entries_` do not match.
|
|
5719
|
+
// don't update, then Recover can report corrupted atomic group
|
|
5720
|
+
// because the `remaining_entries_` do not match.
|
|
5530
5721
|
if (!batch_edits.empty()) {
|
|
5531
5722
|
if (batch_edits.back()->IsInAtomicGroup() &&
|
|
5532
5723
|
batch_edits.back()->GetRemainingEntries() > 0) {
|
|
@@ -5686,10 +5877,11 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5686
5877
|
}
|
|
5687
5878
|
#endif // NDEBUG
|
|
5688
5879
|
|
|
5880
|
+
uint64_t prev_manifest_file_size = manifest_file_size_;
|
|
5689
5881
|
assert(pending_manifest_file_number_ == 0);
|
|
5690
5882
|
if (!skip_manifest_write &&
|
|
5691
5883
|
(!descriptor_log_ ||
|
|
5692
|
-
|
|
5884
|
+
prev_manifest_file_size >= tuned_max_manifest_file_size_)) {
|
|
5693
5885
|
TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
|
|
5694
5886
|
new_descriptor_log = true;
|
|
5695
5887
|
} else {
|
|
@@ -5729,6 +5921,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5729
5921
|
IOStatus manifest_io_status;
|
|
5730
5922
|
manifest_io_status.PermitUncheckedError();
|
|
5731
5923
|
std::unique_ptr<log::Writer> new_desc_log_ptr;
|
|
5924
|
+
// Save before releasing mu
|
|
5925
|
+
uint64_t manifest_preallocation_size = manifest_preallocation_size_;
|
|
5732
5926
|
if (skip_manifest_write) {
|
|
5733
5927
|
if (s.ok()) {
|
|
5734
5928
|
constexpr bool update_stats = true;
|
|
@@ -5772,16 +5966,13 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5772
5966
|
// This is fine because everything inside of this block is serialized --
|
|
5773
5967
|
// only one thread can be here at the same time
|
|
5774
5968
|
// create new manifest file
|
|
5775
|
-
ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
|
|
5776
|
-
pending_manifest_file_number_);
|
|
5777
5969
|
std::string descriptor_fname =
|
|
5778
5970
|
DescriptorFileName(dbname_, pending_manifest_file_number_);
|
|
5779
5971
|
std::unique_ptr<FSWritableFile> descriptor_file;
|
|
5780
5972
|
io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
|
|
5781
5973
|
opt_file_opts);
|
|
5782
5974
|
if (io_s.ok()) {
|
|
5783
|
-
descriptor_file->SetPreallocationBlockSize(
|
|
5784
|
-
db_options_->manifest_preallocation_size);
|
|
5975
|
+
descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size);
|
|
5785
5976
|
FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
|
|
5786
5977
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
5787
5978
|
std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
|
|
@@ -5831,10 +6022,12 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5831
6022
|
#ifndef NDEBUG
|
|
5832
6023
|
if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
|
|
5833
6024
|
TEST_SYNC_POINT_CALLBACK(
|
|
5834
|
-
"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:
|
|
6025
|
+
"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
|
|
6026
|
+
"0",
|
|
5835
6027
|
nullptr);
|
|
5836
6028
|
TEST_SYNC_POINT(
|
|
5837
|
-
"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:
|
|
6029
|
+
"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
|
|
6030
|
+
"1");
|
|
5838
6031
|
}
|
|
5839
6032
|
++idx;
|
|
5840
6033
|
#endif /* !NDEBUG */
|
|
@@ -5871,8 +6064,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5871
6064
|
file_options_.temperature, dir_contains_current_file);
|
|
5872
6065
|
if (!io_s.ok()) {
|
|
5873
6066
|
s = io_s;
|
|
5874
|
-
// Quarantine old manifest file in case new manifest file's CURRENT
|
|
5875
|
-
// wasn't created successfully and the old manifest is needed.
|
|
6067
|
+
// Quarantine old manifest file in case new manifest file's CURRENT
|
|
6068
|
+
// file wasn't created successfully and the old manifest is needed.
|
|
5876
6069
|
limbo_descriptor_log_file_number.push_back(manifest_file_number_);
|
|
5877
6070
|
files_to_quarantine_if_commit_fail.push_back(
|
|
5878
6071
|
&limbo_descriptor_log_file_number);
|
|
@@ -5882,6 +6075,13 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5882
6075
|
if (s.ok()) {
|
|
5883
6076
|
// find offset in manifest file where this version is stored.
|
|
5884
6077
|
new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
|
|
6078
|
+
if (new_descriptor_log) {
|
|
6079
|
+
ROCKS_LOG_INFO(db_options_->info_log,
|
|
6080
|
+
"Created manifest %" PRIu64
|
|
6081
|
+
", compacted+appended from %" PRIu64 " to %" PRIu64 "\n",
|
|
6082
|
+
pending_manifest_file_number_, prev_manifest_file_size,
|
|
6083
|
+
new_manifest_file_size);
|
|
6084
|
+
}
|
|
5885
6085
|
}
|
|
5886
6086
|
|
|
5887
6087
|
if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
|
|
@@ -5930,6 +6130,8 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5930
6130
|
descriptor_log_ = std::move(new_desc_log_ptr);
|
|
5931
6131
|
obsolete_manifests_.emplace_back(
|
|
5932
6132
|
DescriptorFileName("", manifest_file_number_));
|
|
6133
|
+
last_compacted_manifest_file_size_ = new_manifest_file_size;
|
|
6134
|
+
TuneMaxManifestFileSize();
|
|
5933
6135
|
}
|
|
5934
6136
|
|
|
5935
6137
|
// Install the new versions
|
|
@@ -6012,21 +6214,21 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
6012
6214
|
// that renaming tmp file to CURRENT failed.
|
|
6013
6215
|
//
|
|
6014
6216
|
// On local POSIX-compliant FS, the CURRENT must point to the original
|
|
6015
|
-
// MANIFEST. We can delete the new MANIFEST for simplicity, but we can
|
|
6016
|
-
// keep it. Future recovery will ignore this MANIFEST. It's also ok
|
|
6017
|
-
// process not to crash and continue using the db. Any future
|
|
6018
|
-
// call will switch to a new MANIFEST and update CURRENT,
|
|
6019
|
-
// this one.
|
|
6217
|
+
// MANIFEST. We can delete the new MANIFEST for simplicity, but we can
|
|
6218
|
+
// also keep it. Future recovery will ignore this MANIFEST. It's also ok
|
|
6219
|
+
// for the process not to crash and continue using the db. Any future
|
|
6220
|
+
// LogAndApply() call will switch to a new MANIFEST and update CURRENT,
|
|
6221
|
+
// still ignoring this one.
|
|
6020
6222
|
//
|
|
6021
6223
|
// On non-local FS, it is
|
|
6022
6224
|
// possible that the rename operation succeeded on the server (remote)
|
|
6023
6225
|
// side, but the client somehow returns a non-ok status to RocksDB. Note
|
|
6024
6226
|
// that this does not violate atomicity. Should we delete the new MANIFEST
|
|
6025
6227
|
// successfully, a subsequent recovery attempt will likely see the CURRENT
|
|
6026
|
-
// pointing to the new MANIFEST, thus fail. We will not be able to open
|
|
6027
|
-
// DB again. Therefore, if manifest operations succeed, we should keep
|
|
6028
|
-
// the new MANIFEST. If the process proceeds, any future LogAndApply()
|
|
6029
|
-
// will switch to a new MANIFEST and update CURRENT. If user tries to
|
|
6228
|
+
// pointing to the new MANIFEST, thus fail. We will not be able to open
|
|
6229
|
+
// the DB again. Therefore, if manifest operations succeed, we should keep
|
|
6230
|
+
// the new MANIFEST. If the process proceeds, any future LogAndApply()
|
|
6231
|
+
// call will switch to a new MANIFEST and update CURRENT. If user tries to
|
|
6030
6232
|
// re-open the DB,
|
|
6031
6233
|
// a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
|
|
6032
6234
|
// b) CURRENT points to the original MANIFEST, and the original MANIFEST
|
|
@@ -6155,9 +6357,9 @@ Status VersionSet::LogAndApply(
|
|
|
6155
6357
|
first_writer.cv.Wait();
|
|
6156
6358
|
}
|
|
6157
6359
|
if (first_writer.done) {
|
|
6158
|
-
// All non-CF-manipulation operations can be grouped together and
|
|
6159
|
-
// to MANIFEST. They should all have finished. The status code
|
|
6160
|
-
// the first manifest writer.
|
|
6360
|
+
// All non-CF-manipulation operations can be grouped together and
|
|
6361
|
+
// committed to MANIFEST. They should all have finished. The status code
|
|
6362
|
+
// is stored in the first manifest writer.
|
|
6161
6363
|
#ifndef NDEBUG
|
|
6162
6364
|
for (const auto& writer : writers) {
|
|
6163
6365
|
assert(writer.done);
|
|
@@ -6211,8 +6413,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
|
|
|
6211
6413
|
assert(!edit->HasLastSequence());
|
|
6212
6414
|
edit->SetLastSequence(*max_last_sequence);
|
|
6213
6415
|
if (edit->IsColumnFamilyDrop()) {
|
|
6214
|
-
// if we drop column family, we have to make sure to save max column
|
|
6215
|
-
// so that we don't reuse existing ID
|
|
6416
|
+
// if we drop column family, we have to make sure to save max column
|
|
6417
|
+
// family, so that we don't reuse existing ID
|
|
6216
6418
|
edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
|
|
6217
6419
|
}
|
|
6218
6420
|
}
|
|
@@ -6501,7 +6703,8 @@ void VersionSet::RecoverEpochNumbers() {
|
|
|
6501
6703
|
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
|
|
6502
6704
|
const std::string& dbname,
|
|
6503
6705
|
FileSystem* fs) {
|
|
6504
|
-
// Read "CURRENT" file, which contains a pointer to the current manifest
|
|
6706
|
+
// Read "CURRENT" file, which contains a pointer to the current manifest
|
|
6707
|
+
// file
|
|
6505
6708
|
std::string manifest_path;
|
|
6506
6709
|
uint64_t manifest_file_number;
|
|
6507
6710
|
Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false,
|
|
@@ -6563,14 +6766,16 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
|
|
|
6563
6766
|
const ReadOptions read_options;
|
|
6564
6767
|
const WriteOptions write_options;
|
|
6565
6768
|
|
|
6566
|
-
ImmutableDBOptions
|
|
6769
|
+
ImmutableDBOptions imm_db_options(*options);
|
|
6770
|
+
MutableDBOptions mutable_db_options(*options);
|
|
6567
6771
|
ColumnFamilyOptions cf_options(*options);
|
|
6568
6772
|
std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
|
|
6569
6773
|
options->table_cache_numshardbits));
|
|
6570
6774
|
WriteController wc(options->delayed_write_rate);
|
|
6571
6775
|
WriteBufferManager wb(options->db_write_buffer_size);
|
|
6572
|
-
VersionSet versions(dbname, &
|
|
6573
|
-
|
|
6776
|
+
VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options,
|
|
6777
|
+
tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/,
|
|
6778
|
+
nullptr /*IOTracer*/,
|
|
6574
6779
|
/*db_id*/ "",
|
|
6575
6780
|
/*db_session_id*/ "", options->daily_offpeak_time_utc,
|
|
6576
6781
|
/*error_handler_*/ nullptr, /*unchanging=*/false);
|
|
@@ -6656,9 +6861,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
|
|
|
6656
6861
|
|
|
6657
6862
|
// Get the checksum information including the checksum and checksum function
|
|
6658
6863
|
// name of all SST and blob files in VersionSet. Store the information in
|
|
6659
|
-
// FileChecksumList which contains a map from file number to its checksum
|
|
6660
|
-
// If DB is not running, make sure call VersionSet::Recover() to load
|
|
6661
|
-
// metadata from Manifest to VersionSet before calling this function.
|
|
6864
|
+
// FileChecksumList which contains a map from file number to its checksum
|
|
6865
|
+
// info. If DB is not running, make sure to call VersionSet::Recover() to load
|
|
6866
|
+
// the file metadata from Manifest to VersionSet before calling this function.
|
|
6662
6867
|
Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
|
|
6663
6868
|
// Clean the previously stored checksum information if any.
|
|
6664
6869
|
Status s;
|
|
@@ -6800,8 +7005,8 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
6800
7005
|
// WARNING: This method doesn't hold a mutex!!
|
|
6801
7006
|
|
|
6802
7007
|
// This is done without DB mutex lock held, but only within single-threaded
|
|
6803
|
-
// LogAndApply. Column family manipulations can only happen within
|
|
6804
|
-
// (the same single thread), so we're safe to iterate.
|
|
7008
|
+
// LogAndApply. Column family manipulations can only happen within
|
|
7009
|
+
// LogAndApply (the same single thread), so we're safe to iterate.
|
|
6805
7010
|
|
|
6806
7011
|
assert(io_s.ok());
|
|
6807
7012
|
if (db_options_->write_dbid_to_manifest) {
|
|
@@ -6835,9 +7040,9 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
6835
7040
|
}
|
|
6836
7041
|
|
|
6837
7042
|
// New manifest should rollover the WAL deletion record from previous
|
|
6838
|
-
// manifest. Otherwise, when an addition record of a deleted WAL gets added
|
|
6839
|
-
// this new manifest later (which can happens in e.g, SyncWAL()), this
|
|
6840
|
-
// manifest creates an illusion that such WAL hasn't been deleted.
|
|
7043
|
+
// manifest. Otherwise, when an addition record of a deleted WAL gets added
|
|
7044
|
+
// to this new manifest later (which can happen in, e.g., SyncWAL()), this
|
|
7045
|
+
// new manifest creates an illusion that such WAL hasn't been deleted.
|
|
6841
7046
|
VersionEdit wal_deletions;
|
|
6842
7047
|
wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
|
|
6843
7048
|
std::string wal_deletions_record;
|
|
@@ -6969,9 +7174,9 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
6969
7174
|
// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
|
|
6970
7175
|
// function is called repeatedly with consecutive pairs of slices. For example
|
|
6971
7176
|
// if the slice list is [a, b, c, d] this function is called with arguments
|
|
6972
|
-
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible
|
|
6973
|
-
// we avoid doing binary search for the keys b and c twice and instead
|
|
6974
|
-
// maintain state of where they first appear in the files.
|
|
7177
|
+
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible
|
|
7178
|
+
// where we avoid doing binary search for the keys b and c twice and instead
|
|
7179
|
+
// somehow maintain state of where they first appear in the files.
|
|
6975
7180
|
uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
|
6976
7181
|
const ReadOptions& read_options,
|
|
6977
7182
|
Version* v, const Slice& start,
|
|
@@ -6992,19 +7197,20 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
|
|
6992
7197
|
}
|
|
6993
7198
|
|
|
6994
7199
|
// Outline of the optimization that uses options.files_size_error_margin.
|
|
6995
|
-
// When approximating the files total size that is used to store a keys
|
|
6996
|
-
// we first sum up the sizes of the files that fully fall into the
|
|
6997
|
-
// Then we sum up the sizes of all the files that may intersect with
|
|
6998
|
-
// (this includes all files in L0 as well). Then, if
|
|
6999
|
-
// is smaller than total_full_size *
|
|
7000
|
-
// infer that the intersecting
|
|
7001
|
-
// contribution to the total size, and
|
|
7002
|
-
// for the keys in range as just
|
|
7003
|
-
// E.g., if the value of
|
|
7004
|
-
//
|
|
7005
|
-
//
|
|
7006
|
-
//
|
|
7007
|
-
//
|
|
7200
|
+
// When approximating the files total size that is used to store a keys
|
|
7201
|
+
// range, we first sum up the sizes of the files that fully fall into the
|
|
7202
|
+
// range. Then we sum up the sizes of all the files that may intersect with
|
|
7203
|
+
// the range (this includes all files in L0 as well). Then, if
|
|
7204
|
+
// total_intersecting_size is smaller than total_full_size *
|
|
7205
|
+
// options.files_size_error_margin - we can infer that the intersecting
|
|
7206
|
+
// files have a sufficiently negligible contribution to the total size, and
|
|
7207
|
+
// we can approximate the storage required for the keys in range as just
|
|
7208
|
+
// half of the intersecting_files_size. E.g., if the value of
|
|
7209
|
+
// files_size_error_margin is 0.1, then the error of the approximation is
|
|
7210
|
+
// limited to only ~10% of the total size of files that fully fall into the
|
|
7211
|
+
// keys range. In such case, this helps to avoid a costly process of binary
|
|
7212
|
+
// searching the intersecting files that is required only for a more precise
|
|
7213
|
+
// calculation of the total size.
|
|
7008
7214
|
|
|
7009
7215
|
autovector<FdWithKeyRange*, 32> first_files;
|
|
7010
7216
|
autovector<FdWithKeyRange*, 16> last_files;
|
|
@@ -7076,10 +7282,11 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
|
|
7076
7282
|
total_intersecting_size += file_ptr->fd.GetFileSize();
|
|
7077
7283
|
}
|
|
7078
7284
|
|
|
7079
|
-
// Now scan all the first & last files at each level, and estimate their
|
|
7080
|
-
// If the total_intersecting_size is less than X% of the
|
|
7081
|
-
// want to approximate the result in order to avoid the
|
|
7082
|
-
// inside ApproximateSize. We use half of file size as
|
|
7285
|
+
// Now scan all the first & last files at each level, and estimate their
|
|
7286
|
+
// size. If the total_intersecting_size is less than X% of the
|
|
7287
|
+
// total_full_size - we want to approximate the result in order to avoid the
|
|
7288
|
+
// costly binary search inside ApproximateSize. We use half of file size as
|
|
7289
|
+
// an approximation below.
|
|
7083
7290
|
|
|
7084
7291
|
const double margin = options.files_size_error_margin;
|
|
7085
7292
|
if (margin > 0 && total_intersecting_size <
|
|
@@ -7347,7 +7554,8 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
7347
7554
|
/*no per level latency histogram=*/nullptr,
|
|
7348
7555
|
TableReaderCaller::kCompaction, /*skip_filters=*/false,
|
|
7349
7556
|
/*level=*/static_cast<int>(c->level(which)), range_del_agg,
|
|
7350
|
-
c->boundaries(which), false, &tombstone_iter_ptr
|
|
7557
|
+
c->boundaries(which), false, &tombstone_iter_ptr,
|
|
7558
|
+
db_options_->statistics.get(), clock_);
|
|
7351
7559
|
range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
|
|
7352
7560
|
}
|
|
7353
7561
|
}
|
|
@@ -7622,12 +7830,13 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
|
|
|
7622
7830
|
}
|
|
7623
7831
|
|
|
7624
7832
|
ReactiveVersionSet::ReactiveVersionSet(
|
|
7625
|
-
const std::string& dbname, const ImmutableDBOptions*
|
|
7833
|
+
const std::string& dbname, const ImmutableDBOptions* imm_db_options,
|
|
7834
|
+
const MutableDBOptions& mutable_db_options,
|
|
7626
7835
|
const FileOptions& _file_options, Cache* table_cache,
|
|
7627
7836
|
WriteBufferManager* write_buffer_manager, WriteController* write_controller,
|
|
7628
7837
|
const std::shared_ptr<IOTracer>& io_tracer)
|
|
7629
|
-
: VersionSet(dbname,
|
|
7630
|
-
write_buffer_manager, write_controller,
|
|
7838
|
+
: VersionSet(dbname, imm_db_options, mutable_db_options, _file_options,
|
|
7839
|
+
table_cache, write_buffer_manager, write_controller,
|
|
7631
7840
|
/*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
|
|
7632
7841
|
/*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
|
|
7633
7842
|
/*error_handler=*/nullptr, /*unchanging=*/false) {}
|
|
@@ -7751,8 +7960,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
|
|
|
7751
7960
|
}
|
|
7752
7961
|
} else if (s.IsPathNotFound()) {
|
|
7753
7962
|
// This can happen if the primary switches to a new MANIFEST after the
|
|
7754
|
-
// secondary reads the CURRENT file but before the secondary actually
|
|
7755
|
-
// to open the MANIFEST.
|
|
7963
|
+
// secondary reads the CURRENT file but before the secondary actually
|
|
7964
|
+
// tries to open the MANIFEST.
|
|
7756
7965
|
s = Status::TryAgain(
|
|
7757
7966
|
"The primary may have switched to a new MANIFEST and deleted the old "
|
|
7758
7967
|
"one.");
|