@nxtedition/rocksdb 8.0.1 → 8.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
- package/deps/rocksdb/rocksdb/Makefile +2 -2
- package/deps/rocksdb/rocksdb/TARGETS +4 -2
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +20 -146
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
- package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
- package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
- package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
- package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
- package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
- package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
- package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
- package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
- package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
- package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
- package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
- package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
- package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
- package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
- package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
- package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
- package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
- package/deps/rocksdb/rocksdb/db/version_set.cc +212 -35
- package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
- package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
- package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
- package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
- package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
- package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
- package/deps/rocksdb/rocksdb/src.mk +2 -1
- package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -2
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
- package/deps/rocksdb/rocksdb/table/format.cc +24 -20
- package/deps/rocksdb/rocksdb/table/format.h +5 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
- package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
- package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
- package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
- package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
- package/deps/rocksdb/rocksdb/util/status.cc +7 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
- package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
- package/deps/rocksdb/rocksdb.gyp +2 -1
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
|
@@ -39,6 +39,8 @@
|
|
|
39
39
|
#include "db/table_cache.h"
|
|
40
40
|
#include "db/version_builder.h"
|
|
41
41
|
#include "db/version_edit_handler.h"
|
|
42
|
+
#include "table/compaction_merging_iterator.h"
|
|
43
|
+
|
|
42
44
|
#if USE_COROUTINES
|
|
43
45
|
#include "folly/experimental/coro/BlockingWait.h"
|
|
44
46
|
#include "folly/experimental/coro/Collect.h"
|
|
@@ -1771,8 +1773,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
|
|
|
1771
1773
|
file->stats.num_reads_sampled.load(std::memory_order_relaxed),
|
|
1772
1774
|
file->being_compacted, file->temperature,
|
|
1773
1775
|
file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
|
|
1774
|
-
file->TryGetFileCreationTime(), file->file_checksum,
|
|
1775
|
-
file->file_checksum_func_name);
|
|
1776
|
+
file->TryGetFileCreationTime(), file->epoch_number,
|
|
1777
|
+
file->file_checksum, file->file_checksum_func_name);
|
|
1776
1778
|
files.back().num_entries = file->num_entries;
|
|
1777
1779
|
files.back().num_deletions = file->num_deletions;
|
|
1778
1780
|
level_size += file->fd.GetFileSize();
|
|
@@ -2036,7 +2038,8 @@ VersionStorageInfo::VersionStorageInfo(
|
|
|
2036
2038
|
const InternalKeyComparator* internal_comparator,
|
|
2037
2039
|
const Comparator* user_comparator, int levels,
|
|
2038
2040
|
CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
|
|
2039
|
-
bool _force_consistency_checks)
|
|
2041
|
+
bool _force_consistency_checks,
|
|
2042
|
+
EpochNumberRequirement epoch_number_requirement)
|
|
2040
2043
|
: internal_comparator_(internal_comparator),
|
|
2041
2044
|
user_comparator_(user_comparator),
|
|
2042
2045
|
// cfd is nullptr if Version is dummy
|
|
@@ -2064,7 +2067,8 @@ VersionStorageInfo::VersionStorageInfo(
|
|
|
2064
2067
|
current_num_samples_(0),
|
|
2065
2068
|
estimated_compaction_needed_bytes_(0),
|
|
2066
2069
|
finalized_(false),
|
|
2067
|
-
force_consistency_checks_(_force_consistency_checks) {
|
|
2070
|
+
force_consistency_checks_(_force_consistency_checks),
|
|
2071
|
+
epoch_number_requirement_(epoch_number_requirement) {
|
|
2068
2072
|
if (ref_vstorage != nullptr) {
|
|
2069
2073
|
accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
|
|
2070
2074
|
accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
|
|
@@ -2085,7 +2089,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
|
|
|
2085
2089
|
const FileOptions& file_opt,
|
|
2086
2090
|
const MutableCFOptions mutable_cf_options,
|
|
2087
2091
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
2088
|
-
uint64_t version_number)
|
|
2092
|
+
uint64_t version_number,
|
|
2093
|
+
EpochNumberRequirement epoch_number_requirement)
|
|
2089
2094
|
: env_(vset->env_),
|
|
2090
2095
|
clock_(vset->clock_),
|
|
2091
2096
|
cfd_(column_family_data),
|
|
@@ -2104,7 +2109,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
|
|
|
2104
2109
|
(cfd_ == nullptr || cfd_->current() == nullptr)
|
|
2105
2110
|
? nullptr
|
|
2106
2111
|
: cfd_->current()->storage_info(),
|
|
2107
|
-
cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
|
|
2112
|
+
cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks,
|
|
2113
|
+
epoch_number_requirement),
|
|
2108
2114
|
vset_(vset),
|
|
2109
2115
|
next_(this),
|
|
2110
2116
|
prev_(this),
|
|
@@ -2539,16 +2545,19 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|
|
2539
2545
|
}
|
|
2540
2546
|
f = fp.GetNextFileInLevel();
|
|
2541
2547
|
}
|
|
2542
|
-
if (
|
|
2548
|
+
if (mget_tasks.size() > 0) {
|
|
2543
2549
|
RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
|
|
2544
2550
|
mget_tasks.size());
|
|
2545
2551
|
// Collect all results so far
|
|
2546
2552
|
std::vector<Status> statuses = folly::coro::blockingWait(
|
|
2547
2553
|
folly::coro::collectAllRange(std::move(mget_tasks))
|
|
2548
2554
|
.scheduleOn(&range->context()->executor()));
|
|
2549
|
-
|
|
2550
|
-
|
|
2551
|
-
|
|
2555
|
+
if (s.ok()) {
|
|
2556
|
+
for (Status stat : statuses) {
|
|
2557
|
+
if (!stat.ok()) {
|
|
2558
|
+
s = std::move(stat);
|
|
2559
|
+
break;
|
|
2560
|
+
}
|
|
2552
2561
|
}
|
|
2553
2562
|
}
|
|
2554
2563
|
|
|
@@ -2794,6 +2803,9 @@ Status Version::MultiGetAsync(
|
|
|
2794
2803
|
unsigned int num_tasks_queued = 0;
|
|
2795
2804
|
to_process.pop_front();
|
|
2796
2805
|
if (batch->IsSearchEnded() || batch->GetRange().empty()) {
|
|
2806
|
+
// If to_process is empty, i.e no more batches to look at, then we need
|
|
2807
|
+
// schedule the enqueued coroutines and wait for them. Otherwise, we
|
|
2808
|
+
// skip this batch and move to the next one in to_process.
|
|
2797
2809
|
if (!to_process.empty()) {
|
|
2798
2810
|
continue;
|
|
2799
2811
|
}
|
|
@@ -2802,9 +2814,6 @@ Status Version::MultiGetAsync(
|
|
|
2802
2814
|
// to_process
|
|
2803
2815
|
s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
|
|
2804
2816
|
to_process, num_tasks_queued, mget_stats);
|
|
2805
|
-
if (!s.ok()) {
|
|
2806
|
-
break;
|
|
2807
|
-
}
|
|
2808
2817
|
// If ProcessBatch didn't enqueue any coroutine tasks, it means all
|
|
2809
2818
|
// keys were filtered out. So put the batch back in to_process to
|
|
2810
2819
|
// lookup in the next level
|
|
@@ -2815,8 +2824,10 @@ Status Version::MultiGetAsync(
|
|
|
2815
2824
|
waiting.emplace_back(idx);
|
|
2816
2825
|
}
|
|
2817
2826
|
}
|
|
2818
|
-
|
|
2819
|
-
|
|
2827
|
+
// If ProcessBatch() returned an error, then schedule the enqueued
|
|
2828
|
+
// coroutines and wait for them, then abort the MultiGet.
|
|
2829
|
+
if (to_process.empty() || !s.ok()) {
|
|
2830
|
+
if (mget_tasks.size() > 0) {
|
|
2820
2831
|
assert(waiting.size());
|
|
2821
2832
|
RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
|
|
2822
2833
|
// Collect all results so far
|
|
@@ -2824,10 +2835,12 @@ Status Version::MultiGetAsync(
|
|
|
2824
2835
|
folly::coro::collectAllRange(std::move(mget_tasks))
|
|
2825
2836
|
.scheduleOn(&range->context()->executor()));
|
|
2826
2837
|
mget_tasks.clear();
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
|
|
2830
|
-
|
|
2838
|
+
if (s.ok()) {
|
|
2839
|
+
for (Status stat : statuses) {
|
|
2840
|
+
if (!stat.ok()) {
|
|
2841
|
+
s = std::move(stat);
|
|
2842
|
+
break;
|
|
2843
|
+
}
|
|
2831
2844
|
}
|
|
2832
2845
|
}
|
|
2833
2846
|
|
|
@@ -2850,6 +2863,9 @@ Status Version::MultiGetAsync(
|
|
|
2850
2863
|
assert(!s.ok() || waiting.size() == 0);
|
|
2851
2864
|
}
|
|
2852
2865
|
}
|
|
2866
|
+
if (!s.ok()) {
|
|
2867
|
+
break;
|
|
2868
|
+
}
|
|
2853
2869
|
}
|
|
2854
2870
|
|
|
2855
2871
|
uint64_t num_levels = 0;
|
|
@@ -4270,6 +4286,74 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
|
|
|
4270
4286
|
return scratch->buffer;
|
|
4271
4287
|
}
|
|
4272
4288
|
|
|
4289
|
+
bool VersionStorageInfo::HasMissingEpochNumber() const {
|
|
4290
|
+
for (int level = 0; level < num_levels_; ++level) {
|
|
4291
|
+
for (const FileMetaData* f : files_[level]) {
|
|
4292
|
+
if (f->epoch_number == kUnknownEpochNumber) {
|
|
4293
|
+
return true;
|
|
4294
|
+
}
|
|
4295
|
+
}
|
|
4296
|
+
}
|
|
4297
|
+
return false;
|
|
4298
|
+
}
|
|
4299
|
+
|
|
4300
|
+
uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const {
|
|
4301
|
+
uint64_t max_epoch_number = kUnknownEpochNumber;
|
|
4302
|
+
for (int level = 0; level < num_levels_; ++level) {
|
|
4303
|
+
for (const FileMetaData* f : files_[level]) {
|
|
4304
|
+
max_epoch_number = std::max(max_epoch_number, f->epoch_number);
|
|
4305
|
+
}
|
|
4306
|
+
}
|
|
4307
|
+
return max_epoch_number;
|
|
4308
|
+
}
|
|
4309
|
+
|
|
4310
|
+
void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) {
|
|
4311
|
+
cfd->ResetNextEpochNumber();
|
|
4312
|
+
|
|
4313
|
+
bool reserve_epoch_num_for_file_ingested_behind =
|
|
4314
|
+
cfd->ioptions()->allow_ingest_behind;
|
|
4315
|
+
if (reserve_epoch_num_for_file_ingested_behind) {
|
|
4316
|
+
uint64_t reserved_epoch_number = cfd->NewEpochNumber();
|
|
4317
|
+
assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind);
|
|
4318
|
+
ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(),
|
|
4319
|
+
"[%s]CF has reserved epoch number %" PRIu64
|
|
4320
|
+
" for files ingested "
|
|
4321
|
+
"behind since `Options::allow_ingest_behind` is true",
|
|
4322
|
+
cfd->GetName().c_str(), reserved_epoch_number);
|
|
4323
|
+
}
|
|
4324
|
+
|
|
4325
|
+
if (HasMissingEpochNumber()) {
|
|
4326
|
+
assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing);
|
|
4327
|
+
assert(num_levels_ >= 1);
|
|
4328
|
+
|
|
4329
|
+
for (int level = num_levels_ - 1; level >= 1; --level) {
|
|
4330
|
+
auto& files_at_level = files_[level];
|
|
4331
|
+
if (files_at_level.empty()) {
|
|
4332
|
+
continue;
|
|
4333
|
+
}
|
|
4334
|
+
uint64_t next_epoch_number = cfd->NewEpochNumber();
|
|
4335
|
+
for (FileMetaData* f : files_at_level) {
|
|
4336
|
+
f->epoch_number = next_epoch_number;
|
|
4337
|
+
}
|
|
4338
|
+
}
|
|
4339
|
+
|
|
4340
|
+
for (auto file_meta_iter = files_[0].rbegin();
|
|
4341
|
+
file_meta_iter != files_[0].rend(); file_meta_iter++) {
|
|
4342
|
+
FileMetaData* f = *file_meta_iter;
|
|
4343
|
+
f->epoch_number = cfd->NewEpochNumber();
|
|
4344
|
+
}
|
|
4345
|
+
|
|
4346
|
+
ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(),
|
|
4347
|
+
"[%s]CF's epoch numbers are inferred based on seqno",
|
|
4348
|
+
cfd->GetName().c_str());
|
|
4349
|
+
epoch_number_requirement_ = EpochNumberRequirement::kMustPresent;
|
|
4350
|
+
} else {
|
|
4351
|
+
assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent);
|
|
4352
|
+
cfd->SetNextEpochNumber(
|
|
4353
|
+
std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber()));
|
|
4354
|
+
}
|
|
4355
|
+
}
|
|
4356
|
+
|
|
4273
4357
|
uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
|
|
4274
4358
|
uint64_t result = 0;
|
|
4275
4359
|
std::vector<FileMetaData*> overlaps;
|
|
@@ -4967,10 +5051,15 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
4967
5051
|
if (!descriptor_log_ ||
|
|
4968
5052
|
manifest_file_size_ > db_options_->max_manifest_file_size) {
|
|
4969
5053
|
TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
|
|
5054
|
+
TEST_SYNC_POINT_CALLBACK(
|
|
5055
|
+
"VersionSet::ProcessManifestWrites:BeforeNewManifest", nullptr);
|
|
4970
5056
|
new_descriptor_log = true;
|
|
4971
5057
|
} else {
|
|
4972
5058
|
pending_manifest_file_number_ = manifest_file_number_;
|
|
4973
5059
|
}
|
|
5060
|
+
TEST_SYNC_POINT_CALLBACK(
|
|
5061
|
+
"VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot",
|
|
5062
|
+
&new_descriptor_log);
|
|
4974
5063
|
|
|
4975
5064
|
// Local cached copy of state variable(s). WriteCurrentStateToManifest()
|
|
4976
5065
|
// reads its content after releasing db mutex to avoid race with
|
|
@@ -5099,6 +5188,7 @@ Status VersionSet::ProcessManifestWrites(
|
|
|
5099
5188
|
break;
|
|
5100
5189
|
}
|
|
5101
5190
|
}
|
|
5191
|
+
|
|
5102
5192
|
if (s.ok()) {
|
|
5103
5193
|
io_s = SyncManifest(db_options_, descriptor_log_->file());
|
|
5104
5194
|
manifest_io_status = io_s;
|
|
@@ -5506,7 +5596,8 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
|
|
|
5506
5596
|
Status VersionSet::Recover(
|
|
5507
5597
|
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
|
5508
5598
|
std::string* db_id, bool no_error_if_files_missing) {
|
|
5509
|
-
// Read "CURRENT" file, which contains a pointer to the current manifest
|
|
5599
|
+
// Read "CURRENT" file, which contains a pointer to the current manifest
|
|
5600
|
+
// file
|
|
5510
5601
|
std::string manifest_path;
|
|
5511
5602
|
Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
|
|
5512
5603
|
&manifest_file_number_);
|
|
@@ -5540,7 +5631,8 @@ Status VersionSet::Recover(
|
|
|
5540
5631
|
true /* checksum */, 0 /* log_number */);
|
|
5541
5632
|
VersionEditHandler handler(
|
|
5542
5633
|
read_only, column_families, const_cast<VersionSet*>(this),
|
|
5543
|
-
/*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
|
|
5634
|
+
/*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
|
|
5635
|
+
EpochNumberRequirement::kMightMissing);
|
|
5544
5636
|
handler.Iterate(reader, &log_read_status);
|
|
5545
5637
|
s = handler.status();
|
|
5546
5638
|
if (s.ok()) {
|
|
@@ -5549,6 +5641,9 @@ Status VersionSet::Recover(
|
|
|
5549
5641
|
assert(current_manifest_file_size != 0);
|
|
5550
5642
|
handler.GetDbId(db_id);
|
|
5551
5643
|
}
|
|
5644
|
+
if (s.ok()) {
|
|
5645
|
+
RecoverEpochNumbers();
|
|
5646
|
+
}
|
|
5552
5647
|
}
|
|
5553
5648
|
|
|
5554
5649
|
if (s.ok()) {
|
|
@@ -5708,7 +5803,8 @@ Status VersionSet::TryRecoverFromOneManifest(
|
|
|
5708
5803
|
log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
|
|
5709
5804
|
/*checksum=*/true, /*log_num=*/0);
|
|
5710
5805
|
VersionEditHandlerPointInTime handler_pit(
|
|
5711
|
-
read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
|
|
5806
|
+
read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
|
|
5807
|
+
EpochNumberRequirement::kMightMissing);
|
|
5712
5808
|
|
|
5713
5809
|
handler_pit.Iterate(reader, &s);
|
|
5714
5810
|
|
|
@@ -5717,7 +5813,21 @@ Status VersionSet::TryRecoverFromOneManifest(
|
|
|
5717
5813
|
assert(nullptr != has_missing_table_file);
|
|
5718
5814
|
*has_missing_table_file = handler_pit.HasMissingFiles();
|
|
5719
5815
|
|
|
5720
|
-
|
|
5816
|
+
s = handler_pit.status();
|
|
5817
|
+
if (s.ok()) {
|
|
5818
|
+
RecoverEpochNumbers();
|
|
5819
|
+
}
|
|
5820
|
+
return s;
|
|
5821
|
+
}
|
|
5822
|
+
|
|
5823
|
+
void VersionSet::RecoverEpochNumbers() {
|
|
5824
|
+
for (auto cfd : *column_family_set_) {
|
|
5825
|
+
if (cfd->IsDropped()) {
|
|
5826
|
+
continue;
|
|
5827
|
+
}
|
|
5828
|
+
assert(cfd->initialized());
|
|
5829
|
+
cfd->RecoverEpochNumbers();
|
|
5830
|
+
}
|
|
5721
5831
|
}
|
|
5722
5832
|
|
|
5723
5833
|
Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
|
|
@@ -6037,6 +6147,22 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
6037
6147
|
}
|
|
6038
6148
|
}
|
|
6039
6149
|
|
|
6150
|
+
// New manifest should rollover the WAL deletion record from previous
|
|
6151
|
+
// manifest. Otherwise, when an addition record of a deleted WAL gets added to
|
|
6152
|
+
// this new manifest later (which can happens in e.g, SyncWAL()), this new
|
|
6153
|
+
// manifest creates an illusion that such WAL hasn't been deleted.
|
|
6154
|
+
VersionEdit wal_deletions;
|
|
6155
|
+
wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
|
|
6156
|
+
std::string wal_deletions_record;
|
|
6157
|
+
if (!wal_deletions.EncodeTo(&wal_deletions_record)) {
|
|
6158
|
+
return Status::Corruption("Unable to Encode VersionEdit: " +
|
|
6159
|
+
wal_deletions.DebugString(true));
|
|
6160
|
+
}
|
|
6161
|
+
io_s = log->AddRecord(wal_deletions_record);
|
|
6162
|
+
if (!io_s.ok()) {
|
|
6163
|
+
return io_s;
|
|
6164
|
+
}
|
|
6165
|
+
|
|
6040
6166
|
for (auto cfd : *column_family_set_) {
|
|
6041
6167
|
assert(cfd);
|
|
6042
6168
|
|
|
@@ -6088,7 +6214,7 @@ Status VersionSet::WriteCurrentStateToManifest(
|
|
|
6088
6214
|
f->fd.smallest_seqno, f->fd.largest_seqno,
|
|
6089
6215
|
f->marked_for_compaction, f->temperature,
|
|
6090
6216
|
f->oldest_blob_file_number, f->oldest_ancester_time,
|
|
6091
|
-
f->file_creation_time, f->file_checksum,
|
|
6217
|
+
f->file_creation_time, f->epoch_number, f->file_checksum,
|
|
6092
6218
|
f->file_checksum_func_name, f->unique_id);
|
|
6093
6219
|
}
|
|
6094
6220
|
}
|
|
@@ -6460,6 +6586,14 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
6460
6586
|
c->num_input_levels() - 1
|
|
6461
6587
|
: c->num_input_levels());
|
|
6462
6588
|
InternalIterator** list = new InternalIterator*[space];
|
|
6589
|
+
// First item in the pair is a pointer to range tombstones.
|
|
6590
|
+
// Second item is a pointer to a member of a LevelIterator,
|
|
6591
|
+
// that will be initialized to where CompactionMergingIterator stores
|
|
6592
|
+
// pointer to its range tombstones. This is used by LevelIterator
|
|
6593
|
+
// to update pointer to range tombstones as it traverse different SST files.
|
|
6594
|
+
std::vector<
|
|
6595
|
+
std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
|
|
6596
|
+
range_tombstones;
|
|
6463
6597
|
size_t num = 0;
|
|
6464
6598
|
for (size_t which = 0; which < c->num_input_levels(); which++) {
|
|
6465
6599
|
if (c->input_levels(which)->num_files != 0) {
|
|
@@ -6480,7 +6614,7 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
6480
6614
|
end.value(), fmd.smallest.user_key()) < 0) {
|
|
6481
6615
|
continue;
|
|
6482
6616
|
}
|
|
6483
|
-
|
|
6617
|
+
TruncatedRangeDelIterator* range_tombstone_iter = nullptr;
|
|
6484
6618
|
list[num++] = cfd->table_cache()->NewIterator(
|
|
6485
6619
|
read_options, file_options_compactions,
|
|
6486
6620
|
cfd->internal_comparator(), fmd, range_del_agg,
|
|
@@ -6493,10 +6627,13 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
6493
6627
|
MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
|
|
6494
6628
|
/*smallest_compaction_key=*/nullptr,
|
|
6495
6629
|
/*largest_compaction_key=*/nullptr,
|
|
6496
|
-
/*allow_unprepared_value=*/false);
|
|
6630
|
+
/*allow_unprepared_value=*/false,
|
|
6631
|
+
/*range_del_iter=*/&range_tombstone_iter);
|
|
6632
|
+
range_tombstones.emplace_back(range_tombstone_iter, nullptr);
|
|
6497
6633
|
}
|
|
6498
6634
|
} else {
|
|
6499
6635
|
// Create concatenating iterator for the files from this level
|
|
6636
|
+
TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
|
|
6500
6637
|
list[num++] = new LevelIterator(
|
|
6501
6638
|
cfd->table_cache(), read_options, file_options_compactions,
|
|
6502
6639
|
cfd->internal_comparator(), c->input_levels(which),
|
|
@@ -6505,14 +6642,15 @@ InternalIterator* VersionSet::MakeInputIterator(
|
|
|
6505
6642
|
/*no per level latency histogram=*/nullptr,
|
|
6506
6643
|
TableReaderCaller::kCompaction, /*skip_filters=*/false,
|
|
6507
6644
|
/*level=*/static_cast<int>(c->level(which)), range_del_agg,
|
|
6508
|
-
c->boundaries(which));
|
|
6645
|
+
c->boundaries(which), false, &tombstone_iter_ptr);
|
|
6646
|
+
range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
|
|
6509
6647
|
}
|
|
6510
6648
|
}
|
|
6511
6649
|
}
|
|
6512
6650
|
assert(num <= space);
|
|
6513
|
-
InternalIterator* result =
|
|
6514
|
-
|
|
6515
|
-
|
|
6651
|
+
InternalIterator* result = NewCompactionMergingIterator(
|
|
6652
|
+
&c->column_family_data()->internal_comparator(), list,
|
|
6653
|
+
static_cast<int>(num), range_tombstones);
|
|
6516
6654
|
delete[] list;
|
|
6517
6655
|
return result;
|
|
6518
6656
|
}
|
|
@@ -6579,6 +6717,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
|
|
6579
6717
|
filemetadata.temperature = file->temperature;
|
|
6580
6718
|
filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
|
|
6581
6719
|
filemetadata.file_creation_time = file->TryGetFileCreationTime();
|
|
6720
|
+
filemetadata.epoch_number = file->epoch_number;
|
|
6582
6721
|
metadata->push_back(filemetadata);
|
|
6583
6722
|
}
|
|
6584
6723
|
}
|
|
@@ -6704,8 +6843,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
|
|
|
6704
6843
|
return all_versions_blob_file_size;
|
|
6705
6844
|
}
|
|
6706
6845
|
|
|
6707
|
-
Status VersionSet::VerifyFileMetadata(
|
|
6708
|
-
const
|
|
6846
|
+
Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
|
|
6847
|
+
const std::string& fpath, int level,
|
|
6848
|
+
const FileMetaData& meta) {
|
|
6709
6849
|
uint64_t fsize = 0;
|
|
6710
6850
|
Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
|
|
6711
6851
|
if (status.ok()) {
|
|
@@ -6713,6 +6853,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath,
|
|
|
6713
6853
|
status = Status::Corruption("File size mismatch: " + fpath);
|
|
6714
6854
|
}
|
|
6715
6855
|
}
|
|
6856
|
+
if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
|
|
6857
|
+
assert(cfd);
|
|
6858
|
+
TableCache* table_cache = cfd->table_cache();
|
|
6859
|
+
assert(table_cache);
|
|
6860
|
+
|
|
6861
|
+
const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
|
|
6862
|
+
assert(cf_opts);
|
|
6863
|
+
std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
|
|
6864
|
+
size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
|
|
6865
|
+
|
|
6866
|
+
const FileOptions& file_opts = file_options();
|
|
6867
|
+
|
|
6868
|
+
Version* version = cfd->current();
|
|
6869
|
+
assert(version);
|
|
6870
|
+
VersionStorageInfo& storage_info = version->storage_info_;
|
|
6871
|
+
const InternalKeyComparator* icmp = storage_info.InternalComparator();
|
|
6872
|
+
assert(icmp);
|
|
6873
|
+
|
|
6874
|
+
InternalStats* internal_stats = cfd->internal_stats();
|
|
6875
|
+
|
|
6876
|
+
FileMetaData meta_copy = meta;
|
|
6877
|
+
status = table_cache->FindTable(
|
|
6878
|
+
ReadOptions(), file_opts, *icmp, meta_copy,
|
|
6879
|
+
&(meta_copy.table_reader_handle), pe,
|
|
6880
|
+
/*no_io=*/false, /*record_read_stats=*/true,
|
|
6881
|
+
internal_stats->GetFileReadHist(level), false, level,
|
|
6882
|
+
/*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
|
|
6883
|
+
meta_copy.temperature);
|
|
6884
|
+
if (meta_copy.table_reader_handle) {
|
|
6885
|
+
table_cache->ReleaseHandle(meta_copy.table_reader_handle);
|
|
6886
|
+
}
|
|
6887
|
+
}
|
|
6716
6888
|
return status;
|
|
6717
6889
|
}
|
|
6718
6890
|
|
|
@@ -6748,12 +6920,17 @@ Status ReactiveVersionSet::Recover(
|
|
|
6748
6920
|
log::Reader* reader = manifest_reader->get();
|
|
6749
6921
|
assert(reader);
|
|
6750
6922
|
|
|
6751
|
-
manifest_tailer_.reset(new ManifestTailer(
|
|
6752
|
-
column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
|
|
6923
|
+
manifest_tailer_.reset(
|
|
6924
|
+
new ManifestTailer(column_families, const_cast<ReactiveVersionSet*>(this),
|
|
6925
|
+
io_tracer_, EpochNumberRequirement::kMightMissing));
|
|
6753
6926
|
|
|
6754
6927
|
manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
|
|
6755
6928
|
|
|
6756
|
-
|
|
6929
|
+
s = manifest_tailer_->status();
|
|
6930
|
+
if (s.ok()) {
|
|
6931
|
+
RecoverEpochNumbers();
|
|
6932
|
+
}
|
|
6933
|
+
return s;
|
|
6757
6934
|
}
|
|
6758
6935
|
|
|
6759
6936
|
Status ReactiveVersionSet::ReadAndApply(
|
|
@@ -116,6 +116,10 @@ extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
|
|
|
116
116
|
extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
|
|
117
117
|
const std::vector<FileMetaData*>& files,
|
|
118
118
|
Arena* arena);
|
|
119
|
+
enum EpochNumberRequirement {
|
|
120
|
+
kMightMissing,
|
|
121
|
+
kMustPresent,
|
|
122
|
+
};
|
|
119
123
|
|
|
120
124
|
// Information of the storage associated with each Version, including number of
|
|
121
125
|
// levels of LSM tree, files information at each level, files marked for
|
|
@@ -126,7 +130,9 @@ class VersionStorageInfo {
|
|
|
126
130
|
const Comparator* user_comparator, int num_levels,
|
|
127
131
|
CompactionStyle compaction_style,
|
|
128
132
|
VersionStorageInfo* src_vstorage,
|
|
129
|
-
bool _force_consistency_checks);
|
|
133
|
+
bool _force_consistency_checks,
|
|
134
|
+
EpochNumberRequirement epoch_number_requirement =
|
|
135
|
+
EpochNumberRequirement::kMustPresent);
|
|
130
136
|
// No copying allowed
|
|
131
137
|
VersionStorageInfo(const VersionStorageInfo&) = delete;
|
|
132
138
|
void operator=(const VersionStorageInfo&) = delete;
|
|
@@ -319,6 +325,17 @@ class VersionStorageInfo {
|
|
|
319
325
|
return files_[level];
|
|
320
326
|
}
|
|
321
327
|
|
|
328
|
+
bool HasMissingEpochNumber() const;
|
|
329
|
+
uint64_t GetMaxEpochNumberOfFiles() const;
|
|
330
|
+
EpochNumberRequirement GetEpochNumberRequirement() const {
|
|
331
|
+
return epoch_number_requirement_;
|
|
332
|
+
}
|
|
333
|
+
void SetEpochNumberRequirement(
|
|
334
|
+
EpochNumberRequirement epoch_number_requirement) {
|
|
335
|
+
epoch_number_requirement_ = epoch_number_requirement;
|
|
336
|
+
}
|
|
337
|
+
void RecoverEpochNumbers(ColumnFamilyData* cfd);
|
|
338
|
+
|
|
322
339
|
class FileLocation {
|
|
323
340
|
public:
|
|
324
341
|
FileLocation() = default;
|
|
@@ -440,6 +457,11 @@ class VersionStorageInfo {
|
|
|
440
457
|
return files_marked_for_compaction_;
|
|
441
458
|
}
|
|
442
459
|
|
|
460
|
+
void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) {
|
|
461
|
+
f->marked_for_compaction = true;
|
|
462
|
+
files_marked_for_compaction_.emplace_back(level, f);
|
|
463
|
+
}
|
|
464
|
+
|
|
443
465
|
// REQUIRES: ComputeCompactionScore has been called
|
|
444
466
|
// REQUIRES: DB mutex held during access
|
|
445
467
|
const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
|
|
@@ -723,6 +745,8 @@ class VersionStorageInfo {
|
|
|
723
745
|
// is compiled in release mode
|
|
724
746
|
bool force_consistency_checks_;
|
|
725
747
|
|
|
748
|
+
EpochNumberRequirement epoch_number_requirement_;
|
|
749
|
+
|
|
726
750
|
friend class Version;
|
|
727
751
|
friend class VersionSet;
|
|
728
752
|
};
|
|
@@ -1047,7 +1071,9 @@ class Version {
|
|
|
1047
1071
|
Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
|
|
1048
1072
|
MutableCFOptions mutable_cf_options,
|
|
1049
1073
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
1050
|
-
uint64_t version_number = 0);
|
|
1074
|
+
uint64_t version_number = 0,
|
|
1075
|
+
EpochNumberRequirement epoch_number_requirement =
|
|
1076
|
+
EpochNumberRequirement::kMustPresent);
|
|
1051
1077
|
|
|
1052
1078
|
~Version();
|
|
1053
1079
|
|
|
@@ -1188,6 +1214,10 @@ class VersionSet {
|
|
|
1188
1214
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
1189
1215
|
bool read_only, std::string* db_id, bool* has_missing_table_file);
|
|
1190
1216
|
|
|
1217
|
+
// Recover the next epoch number of each CFs and epoch number
|
|
1218
|
+
// of their files (if missing)
|
|
1219
|
+
void RecoverEpochNumbers();
|
|
1220
|
+
|
|
1191
1221
|
// Reads a manifest file and returns a list of column families in
|
|
1192
1222
|
// column_families.
|
|
1193
1223
|
static Status ListColumnFamilies(std::vector<std::string>* column_families,
|
|
@@ -1501,8 +1531,8 @@ class VersionSet {
|
|
|
1501
1531
|
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
|
1502
1532
|
const VersionEdit* edit);
|
|
1503
1533
|
|
|
1504
|
-
Status VerifyFileMetadata(const std::string& fpath,
|
|
1505
|
-
const FileMetaData& meta)
|
|
1534
|
+
Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
|
|
1535
|
+
int level, const FileMetaData& meta);
|
|
1506
1536
|
|
|
1507
1537
|
// Protected by DB mutex.
|
|
1508
1538
|
WalSet wals_;
|