@nxtedition/rocksdb 12.1.4 → 12.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +1 -1
- package/deps/rocksdb/rocksdb/Makefile +10 -5
- package/deps/rocksdb/rocksdb/TARGETS +9 -7
- package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
- package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
- package/deps/rocksdb/rocksdb/db/c.cc +9 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
- package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
- package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
- package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
- package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
- package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
- package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
- package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
- package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
- package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
- package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
- package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
- package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
- package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
- package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
- package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
- package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
- package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
- package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
- package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
- package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
- package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
- package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
- package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
- package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
- package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
- package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
- package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
- package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
- package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
- package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
- package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
- package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
- package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
- package/deps/rocksdb/rocksdb/port/port.h +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
- package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
- package/deps/rocksdb/rocksdb/table/format.cc +32 -4
- package/deps/rocksdb/rocksdb/table/format.h +12 -1
- package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
- package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
- package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
- package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
- package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
- package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
- package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
- package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
- package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
- package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
- package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
|
@@ -680,26 +680,12 @@ Status BlockBasedTable::Open(
|
|
|
680
680
|
if (s.ok()) {
|
|
681
681
|
s = ReadFooterFromFile(opts, file.get(), *ioptions.fs,
|
|
682
682
|
prefetch_buffer.get(), file_size, &footer,
|
|
683
|
-
kBlockBasedTableMagicNumber);
|
|
684
|
-
}
|
|
685
|
-
// If the footer is corrupted and the FS supports checksum verification and
|
|
686
|
-
// correction, try reading the footer again
|
|
687
|
-
if (s.IsCorruption()) {
|
|
688
|
-
RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
|
|
689
|
-
if (CheckFSFeatureSupport(ioptions.fs.get(),
|
|
690
|
-
FSSupportedOps::kVerifyAndReconstructRead)) {
|
|
691
|
-
IOOptions retry_opts = opts;
|
|
692
|
-
retry_opts.verify_and_reconstruct_read = true;
|
|
693
|
-
s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs,
|
|
694
|
-
prefetch_buffer.get(), file_size, &footer,
|
|
695
|
-
kBlockBasedTableMagicNumber);
|
|
696
|
-
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
|
697
|
-
if (s.ok()) {
|
|
698
|
-
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
|
699
|
-
}
|
|
700
|
-
}
|
|
683
|
+
kBlockBasedTableMagicNumber, ioptions.stats);
|
|
701
684
|
}
|
|
702
685
|
if (!s.ok()) {
|
|
686
|
+
if (s.IsCorruption()) {
|
|
687
|
+
RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
|
|
688
|
+
}
|
|
703
689
|
return s;
|
|
704
690
|
}
|
|
705
691
|
if (!IsSupportedFormatVersion(footer.format_version())) {
|
|
@@ -2077,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator(
|
|
|
2077
2063
|
if (arena == nullptr) {
|
|
2078
2064
|
return new BlockBasedTableIterator(
|
|
2079
2065
|
this, read_options, rep_->internal_comparator, std::move(index_iter),
|
|
2080
|
-
!skip_filters &&
|
|
2066
|
+
!skip_filters &&
|
|
2067
|
+
(!read_options.total_order_seek || read_options.auto_prefix_mode ||
|
|
2068
|
+
read_options.prefix_same_as_start) &&
|
|
2081
2069
|
prefix_extractor != nullptr,
|
|
2082
2070
|
need_upper_bound_check, prefix_extractor, caller,
|
|
2083
2071
|
compaction_readahead_size, allow_unprepared_value);
|
|
@@ -2085,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator(
|
|
|
2085
2073
|
auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
|
|
2086
2074
|
return new (mem) BlockBasedTableIterator(
|
|
2087
2075
|
this, read_options, rep_->internal_comparator, std::move(index_iter),
|
|
2088
|
-
!skip_filters &&
|
|
2076
|
+
!skip_filters &&
|
|
2077
|
+
(!read_options.total_order_seek || read_options.auto_prefix_mode ||
|
|
2078
|
+
read_options.prefix_same_as_start) &&
|
|
2089
2079
|
prefix_extractor != nullptr,
|
|
2090
2080
|
need_upper_bound_check, prefix_extractor, caller,
|
|
2091
2081
|
compaction_readahead_size, allow_unprepared_value);
|
|
@@ -91,9 +91,24 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
|
|
|
91
91
|
uint64_t alt_hash = GetSliceHash64(alt);
|
|
92
92
|
std::optional<uint64_t> prev_key_hash;
|
|
93
93
|
std::optional<uint64_t> prev_alt_hash = hash_entries_info_.prev_alt_hash;
|
|
94
|
+
|
|
94
95
|
if (!hash_entries_info_.entries.empty()) {
|
|
95
96
|
prev_key_hash = hash_entries_info_.entries.back();
|
|
96
97
|
}
|
|
98
|
+
|
|
99
|
+
#ifdef ROCKSDB_VALGRIND_RUN
|
|
100
|
+
// Valgrind can report uninitialized FPs on std::optional usage. See e.g.
|
|
101
|
+
// https://stackoverflow.com/q/51616179
|
|
102
|
+
if (!prev_key_hash.has_value()) {
|
|
103
|
+
std::memset((void*)&prev_key_hash, 0, sizeof(prev_key_hash));
|
|
104
|
+
prev_key_hash.reset();
|
|
105
|
+
}
|
|
106
|
+
if (!prev_alt_hash.has_value()) {
|
|
107
|
+
std::memset((void*)&prev_alt_hash, 0, sizeof(prev_key_hash));
|
|
108
|
+
prev_alt_hash.reset();
|
|
109
|
+
}
|
|
110
|
+
#endif
|
|
111
|
+
|
|
97
112
|
// Add alt first, so that entries.back() always contains previous key
|
|
98
113
|
// ASSUMING a change from one alt to the next implies a change to
|
|
99
114
|
// corresponding key
|
|
@@ -295,15 +310,6 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
|
|
|
295
310
|
bool detect_filter_construct_corruption_;
|
|
296
311
|
|
|
297
312
|
struct HashEntriesInfo {
|
|
298
|
-
#ifdef ROCKSDB_VALGRIND_RUN
|
|
299
|
-
HashEntriesInfo() {
|
|
300
|
-
// Valgrind can report uninitialized FPs on std::optional usage. See e.g.
|
|
301
|
-
// https://stackoverflow.com/q/51616179
|
|
302
|
-
std::memset((void*)&prev_alt_hash, 0, sizeof(prev_alt_hash));
|
|
303
|
-
prev_alt_hash = {};
|
|
304
|
-
}
|
|
305
|
-
#endif
|
|
306
|
-
|
|
307
313
|
// A deque avoids unnecessary copying of already-saved values
|
|
308
314
|
// and has near-minimal peak memory use.
|
|
309
315
|
std::deque<uint64_t> entries;
|
|
@@ -475,10 +475,12 @@ std::string Footer::ToString() const {
|
|
|
475
475
|
return result;
|
|
476
476
|
}
|
|
477
477
|
|
|
478
|
-
Status
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
478
|
+
static Status ReadFooterFromFileInternal(const IOOptions& opts,
|
|
479
|
+
RandomAccessFileReader* file,
|
|
480
|
+
FileSystem& fs,
|
|
481
|
+
FilePrefetchBuffer* prefetch_buffer,
|
|
482
|
+
uint64_t file_size, Footer* footer,
|
|
483
|
+
uint64_t enforce_table_magic_number) {
|
|
482
484
|
if (file_size < Footer::kMinEncodedLength) {
|
|
483
485
|
return Status::Corruption("file is too short (" +
|
|
484
486
|
std::to_string(file_size) +
|
|
@@ -516,6 +518,8 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
|
|
|
516
518
|
}
|
|
517
519
|
}
|
|
518
520
|
|
|
521
|
+
TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
|
|
522
|
+
|
|
519
523
|
// Check that we actually read the whole footer from the file. It may be
|
|
520
524
|
// that size isn't correct.
|
|
521
525
|
if (footer_input.size() < Footer::kMinEncodedLength) {
|
|
@@ -543,6 +547,30 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
|
|
|
543
547
|
return Status::OK();
|
|
544
548
|
}
|
|
545
549
|
|
|
550
|
+
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
|
|
551
|
+
FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
|
|
552
|
+
uint64_t file_size, Footer* footer,
|
|
553
|
+
uint64_t enforce_table_magic_number,
|
|
554
|
+
Statistics* stats) {
|
|
555
|
+
Status s =
|
|
556
|
+
ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size,
|
|
557
|
+
footer, enforce_table_magic_number);
|
|
558
|
+
if (s.IsCorruption() &&
|
|
559
|
+
CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
|
|
560
|
+
IOOptions new_opts = opts;
|
|
561
|
+
new_opts.verify_and_reconstruct_read = true;
|
|
562
|
+
footer->Reset();
|
|
563
|
+
s = ReadFooterFromFileInternal(new_opts, file, fs, prefetch_buffer,
|
|
564
|
+
file_size, footer,
|
|
565
|
+
enforce_table_magic_number);
|
|
566
|
+
RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
|
567
|
+
if (s.ok()) {
|
|
568
|
+
RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
|
569
|
+
}
|
|
570
|
+
}
|
|
571
|
+
return s;
|
|
572
|
+
}
|
|
573
|
+
|
|
546
574
|
namespace {
|
|
547
575
|
// Custom handling for the last byte of a block, to avoid invoking streaming
|
|
548
576
|
// API to get an effective block checksum. This function is its own inverse
|
|
@@ -186,6 +186,16 @@ class Footer {
|
|
|
186
186
|
// Create empty. Populate using DecodeFrom.
|
|
187
187
|
Footer() {}
|
|
188
188
|
|
|
189
|
+
void Reset() {
|
|
190
|
+
table_magic_number_ = kNullTableMagicNumber;
|
|
191
|
+
format_version_ = kInvalidFormatVersion;
|
|
192
|
+
base_context_checksum_ = 0;
|
|
193
|
+
metaindex_handle_ = BlockHandle::NullBlockHandle();
|
|
194
|
+
index_handle_ = BlockHandle::NullBlockHandle();
|
|
195
|
+
checksum_type_ = kInvalidChecksumType;
|
|
196
|
+
block_trailer_size_ = 0;
|
|
197
|
+
}
|
|
198
|
+
|
|
189
199
|
// Deserialize a footer (populate fields) from `input` and check for various
|
|
190
200
|
// corruptions. `input_offset` is the offset within the target file of
|
|
191
201
|
// `input` buffer, which is needed for verifying format_version >= 6 footer.
|
|
@@ -304,7 +314,8 @@ class FooterBuilder {
|
|
|
304
314
|
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
|
|
305
315
|
FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
|
|
306
316
|
uint64_t file_size, Footer* footer,
|
|
307
|
-
uint64_t enforce_table_magic_number = 0
|
|
317
|
+
uint64_t enforce_table_magic_number = 0,
|
|
318
|
+
Statistics* stats = nullptr);
|
|
308
319
|
|
|
309
320
|
// Computes a checksum using the given ChecksumType. Sometimes we need to
|
|
310
321
|
// include one more input byte logically at the end but not part of the main
|
|
@@ -74,6 +74,10 @@ class EmptyInternalIterator : public InternalIteratorBase<TValue> {
|
|
|
74
74
|
assert(false);
|
|
75
75
|
return TValue();
|
|
76
76
|
}
|
|
77
|
+
uint64_t write_unix_time() const override {
|
|
78
|
+
assert(false);
|
|
79
|
+
return std::numeric_limits<uint64_t>::max();
|
|
80
|
+
}
|
|
77
81
|
Status status() const override { return status_; }
|
|
78
82
|
|
|
79
83
|
private:
|
|
@@ -163,6 +163,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
|
|
|
163
163
|
Add(TablePropertiesNames::kSequenceNumberTimeMapping,
|
|
164
164
|
props.seqno_to_time_mapping);
|
|
165
165
|
}
|
|
166
|
+
if (props.key_largest_seqno != UINT64_MAX) {
|
|
167
|
+
Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
|
|
168
|
+
}
|
|
166
169
|
}
|
|
167
170
|
|
|
168
171
|
Slice PropertyBlockBuilder::Finish() {
|
|
@@ -259,180 +262,230 @@ Status ReadTablePropertiesHelper(
|
|
|
259
262
|
MemoryAllocator* memory_allocator) {
|
|
260
263
|
assert(table_properties);
|
|
261
264
|
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
{
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
{TablePropertiesNames::kFormatVersion,
|
|
320
|
-
&new_table_properties->format_version},
|
|
321
|
-
{TablePropertiesNames::kFixedKeyLen,
|
|
322
|
-
&new_table_properties->fixed_key_len},
|
|
323
|
-
{TablePropertiesNames::kColumnFamilyId,
|
|
324
|
-
&new_table_properties->column_family_id},
|
|
325
|
-
{TablePropertiesNames::kCreationTime,
|
|
326
|
-
&new_table_properties->creation_time},
|
|
327
|
-
{TablePropertiesNames::kOldestKeyTime,
|
|
328
|
-
&new_table_properties->oldest_key_time},
|
|
329
|
-
{TablePropertiesNames::kFileCreationTime,
|
|
330
|
-
&new_table_properties->file_creation_time},
|
|
331
|
-
{TablePropertiesNames::kSlowCompressionEstimatedDataSize,
|
|
332
|
-
&new_table_properties->slow_compression_estimated_data_size},
|
|
333
|
-
{TablePropertiesNames::kFastCompressionEstimatedDataSize,
|
|
334
|
-
&new_table_properties->fast_compression_estimated_data_size},
|
|
335
|
-
{TablePropertiesNames::kTailStartOffset,
|
|
336
|
-
&new_table_properties->tail_start_offset},
|
|
337
|
-
{TablePropertiesNames::kUserDefinedTimestampsPersisted,
|
|
338
|
-
&new_table_properties->user_defined_timestamps_persisted},
|
|
339
|
-
};
|
|
340
|
-
|
|
341
|
-
std::string last_key;
|
|
342
|
-
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
343
|
-
s = iter->status();
|
|
344
|
-
if (!s.ok()) {
|
|
345
|
-
break;
|
|
265
|
+
Status s;
|
|
266
|
+
bool retry = false;
|
|
267
|
+
while (true) {
|
|
268
|
+
BlockContents block_contents;
|
|
269
|
+
size_t len = handle.size() + footer.GetBlockTrailerSize();
|
|
270
|
+
// If this is an external SST file ingested with write_global_seqno set to
|
|
271
|
+
// true, then we expect the checksum mismatch because checksum was written
|
|
272
|
+
// by SstFileWriter, but its global seqno in the properties block may have
|
|
273
|
+
// been changed during ingestion. For this reason, we initially read
|
|
274
|
+
// and process without checksum verification, then later try checksum
|
|
275
|
+
// verification so that if it fails, we can copy to a temporary buffer with
|
|
276
|
+
// global seqno set to its original value, i.e. 0, and attempt checksum
|
|
277
|
+
// verification again.
|
|
278
|
+
if (!retry) {
|
|
279
|
+
ReadOptions modified_ro = ro;
|
|
280
|
+
modified_ro.verify_checksums = false;
|
|
281
|
+
BlockFetcher block_fetcher(
|
|
282
|
+
file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
|
|
283
|
+
ioptions, false /* decompress */, false /*maybe_compressed*/,
|
|
284
|
+
BlockType::kProperties, UncompressionDict::GetEmptyDict(),
|
|
285
|
+
PersistentCacheOptions::kEmpty, memory_allocator);
|
|
286
|
+
s = block_fetcher.ReadBlockContents();
|
|
287
|
+
if (!s.ok()) {
|
|
288
|
+
return s;
|
|
289
|
+
}
|
|
290
|
+
assert(block_fetcher.GetBlockSizeWithTrailer() == len);
|
|
291
|
+
TEST_SYNC_POINT_CALLBACK("ReadTablePropertiesHelper:0",
|
|
292
|
+
&block_contents.data);
|
|
293
|
+
} else {
|
|
294
|
+
assert(s.IsCorruption());
|
|
295
|
+
// If retrying, use a stronger file system read to check and correct
|
|
296
|
+
// data corruption
|
|
297
|
+
IOOptions opts;
|
|
298
|
+
if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) !=
|
|
299
|
+
IOStatus::OK()) {
|
|
300
|
+
return s;
|
|
301
|
+
}
|
|
302
|
+
opts.verify_and_reconstruct_read = true;
|
|
303
|
+
std::unique_ptr<char[]> data(new char[len]);
|
|
304
|
+
Slice result;
|
|
305
|
+
IOStatus io_s =
|
|
306
|
+
file->Read(opts, handle.offset(), len, &result, data.get(), nullptr);
|
|
307
|
+
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
|
308
|
+
if (!io_s.ok()) {
|
|
309
|
+
ROCKS_LOG_INFO(ioptions.info_log,
|
|
310
|
+
"Reading properties block failed - %s",
|
|
311
|
+
io_s.ToString().c_str());
|
|
312
|
+
// Return the original corruption error as that's more serious
|
|
313
|
+
return s;
|
|
314
|
+
}
|
|
315
|
+
if (result.size() < len) {
|
|
316
|
+
return Status::Corruption("Reading properties block failed - " +
|
|
317
|
+
std::to_string(result.size()) +
|
|
318
|
+
" bytes read");
|
|
319
|
+
}
|
|
320
|
+
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
|
321
|
+
block_contents = BlockContents(std::move(data), handle.size());
|
|
346
322
|
}
|
|
347
323
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
}
|
|
355
|
-
|
|
324
|
+
uint64_t block_size = block_contents.data.size();
|
|
325
|
+
Block properties_block(std::move(block_contents));
|
|
326
|
+
// Unfortunately, Block::size() might not equal block_contents.data.size(),
|
|
327
|
+
// and Block hides block_contents
|
|
328
|
+
std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
|
|
329
|
+
|
|
330
|
+
std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
|
|
331
|
+
// All pre-defined properties of type uint64_t
|
|
332
|
+
std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
|
|
333
|
+
{TablePropertiesNames::kOriginalFileNumber,
|
|
334
|
+
&new_table_properties->orig_file_number},
|
|
335
|
+
{TablePropertiesNames::kDataSize, &new_table_properties->data_size},
|
|
336
|
+
{TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
|
|
337
|
+
{TablePropertiesNames::kIndexPartitions,
|
|
338
|
+
&new_table_properties->index_partitions},
|
|
339
|
+
{TablePropertiesNames::kTopLevelIndexSize,
|
|
340
|
+
&new_table_properties->top_level_index_size},
|
|
341
|
+
{TablePropertiesNames::kIndexKeyIsUserKey,
|
|
342
|
+
&new_table_properties->index_key_is_user_key},
|
|
343
|
+
{TablePropertiesNames::kIndexValueIsDeltaEncoded,
|
|
344
|
+
&new_table_properties->index_value_is_delta_encoded},
|
|
345
|
+
{TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
|
|
346
|
+
{TablePropertiesNames::kRawKeySize,
|
|
347
|
+
&new_table_properties->raw_key_size},
|
|
348
|
+
{TablePropertiesNames::kRawValueSize,
|
|
349
|
+
&new_table_properties->raw_value_size},
|
|
350
|
+
{TablePropertiesNames::kNumDataBlocks,
|
|
351
|
+
&new_table_properties->num_data_blocks},
|
|
352
|
+
{TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
|
|
353
|
+
{TablePropertiesNames::kNumFilterEntries,
|
|
354
|
+
&new_table_properties->num_filter_entries},
|
|
355
|
+
{TablePropertiesNames::kDeletedKeys,
|
|
356
|
+
&new_table_properties->num_deletions},
|
|
357
|
+
{TablePropertiesNames::kMergeOperands,
|
|
358
|
+
&new_table_properties->num_merge_operands},
|
|
359
|
+
{TablePropertiesNames::kNumRangeDeletions,
|
|
360
|
+
&new_table_properties->num_range_deletions},
|
|
361
|
+
{TablePropertiesNames::kFormatVersion,
|
|
362
|
+
&new_table_properties->format_version},
|
|
363
|
+
{TablePropertiesNames::kFixedKeyLen,
|
|
364
|
+
&new_table_properties->fixed_key_len},
|
|
365
|
+
{TablePropertiesNames::kColumnFamilyId,
|
|
366
|
+
&new_table_properties->column_family_id},
|
|
367
|
+
{TablePropertiesNames::kCreationTime,
|
|
368
|
+
&new_table_properties->creation_time},
|
|
369
|
+
{TablePropertiesNames::kOldestKeyTime,
|
|
370
|
+
&new_table_properties->oldest_key_time},
|
|
371
|
+
{TablePropertiesNames::kFileCreationTime,
|
|
372
|
+
&new_table_properties->file_creation_time},
|
|
373
|
+
{TablePropertiesNames::kSlowCompressionEstimatedDataSize,
|
|
374
|
+
&new_table_properties->slow_compression_estimated_data_size},
|
|
375
|
+
{TablePropertiesNames::kFastCompressionEstimatedDataSize,
|
|
376
|
+
&new_table_properties->fast_compression_estimated_data_size},
|
|
377
|
+
{TablePropertiesNames::kTailStartOffset,
|
|
378
|
+
&new_table_properties->tail_start_offset},
|
|
379
|
+
{TablePropertiesNames::kUserDefinedTimestampsPersisted,
|
|
380
|
+
&new_table_properties->user_defined_timestamps_persisted},
|
|
381
|
+
{TablePropertiesNames::kKeyLargestSeqno,
|
|
382
|
+
&new_table_properties->key_largest_seqno},
|
|
383
|
+
};
|
|
384
|
+
|
|
385
|
+
std::string last_key;
|
|
386
|
+
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
387
|
+
s = iter->status();
|
|
388
|
+
if (!s.ok()) {
|
|
389
|
+
break;
|
|
390
|
+
}
|
|
356
391
|
|
|
357
|
-
|
|
358
|
-
|
|
392
|
+
auto key = iter->key().ToString();
|
|
393
|
+
// properties block should be strictly sorted with no duplicate key.
|
|
394
|
+
if (!last_key.empty() &&
|
|
395
|
+
BytewiseComparator()->Compare(key, last_key) <= 0) {
|
|
396
|
+
s = Status::Corruption("properties unsorted");
|
|
397
|
+
break;
|
|
398
|
+
}
|
|
399
|
+
last_key = key;
|
|
359
400
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
handle.offset() + iter->ValueOffset();
|
|
363
|
-
}
|
|
401
|
+
auto raw_val = iter->value();
|
|
402
|
+
auto pos = predefined_uint64_properties.find(key);
|
|
364
403
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
404
|
+
if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
|
|
405
|
+
new_table_properties->external_sst_file_global_seqno_offset =
|
|
406
|
+
handle.offset() + iter->ValueOffset();
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
if (pos != predefined_uint64_properties.end()) {
|
|
410
|
+
if (key == TablePropertiesNames::kDeletedKeys ||
|
|
411
|
+
key == TablePropertiesNames::kMergeOperands) {
|
|
412
|
+
// Insert in user-collected properties for API backwards compatibility
|
|
413
|
+
new_table_properties->user_collected_properties.insert(
|
|
414
|
+
{key, raw_val.ToString()});
|
|
415
|
+
}
|
|
416
|
+
// handle predefined rocksdb properties
|
|
417
|
+
uint64_t val;
|
|
418
|
+
if (!GetVarint64(&raw_val, &val)) {
|
|
419
|
+
// skip malformed value
|
|
420
|
+
auto error_msg =
|
|
421
|
+
"Detect malformed value in properties meta-block:"
|
|
422
|
+
"\tkey: " +
|
|
423
|
+
key + "\tval: " + raw_val.ToString();
|
|
424
|
+
ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
|
|
425
|
+
continue;
|
|
426
|
+
}
|
|
427
|
+
*(pos->second) = val;
|
|
428
|
+
} else if (key == TablePropertiesNames::kDbId) {
|
|
429
|
+
new_table_properties->db_id = raw_val.ToString();
|
|
430
|
+
} else if (key == TablePropertiesNames::kDbSessionId) {
|
|
431
|
+
new_table_properties->db_session_id = raw_val.ToString();
|
|
432
|
+
} else if (key == TablePropertiesNames::kDbHostId) {
|
|
433
|
+
new_table_properties->db_host_id = raw_val.ToString();
|
|
434
|
+
} else if (key == TablePropertiesNames::kFilterPolicy) {
|
|
435
|
+
new_table_properties->filter_policy_name = raw_val.ToString();
|
|
436
|
+
} else if (key == TablePropertiesNames::kColumnFamilyName) {
|
|
437
|
+
new_table_properties->column_family_name = raw_val.ToString();
|
|
438
|
+
} else if (key == TablePropertiesNames::kComparator) {
|
|
439
|
+
new_table_properties->comparator_name = raw_val.ToString();
|
|
440
|
+
} else if (key == TablePropertiesNames::kMergeOperator) {
|
|
441
|
+
new_table_properties->merge_operator_name = raw_val.ToString();
|
|
442
|
+
} else if (key == TablePropertiesNames::kPrefixExtractorName) {
|
|
443
|
+
new_table_properties->prefix_extractor_name = raw_val.ToString();
|
|
444
|
+
} else if (key == TablePropertiesNames::kPropertyCollectors) {
|
|
445
|
+
new_table_properties->property_collectors_names = raw_val.ToString();
|
|
446
|
+
} else if (key == TablePropertiesNames::kCompression) {
|
|
447
|
+
new_table_properties->compression_name = raw_val.ToString();
|
|
448
|
+
} else if (key == TablePropertiesNames::kCompressionOptions) {
|
|
449
|
+
new_table_properties->compression_options = raw_val.ToString();
|
|
450
|
+
} else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
|
|
451
|
+
new_table_properties->seqno_to_time_mapping = raw_val.ToString();
|
|
452
|
+
} else {
|
|
453
|
+
// handle user-collected properties
|
|
369
454
|
new_table_properties->user_collected_properties.insert(
|
|
370
455
|
{key, raw_val.ToString()});
|
|
371
456
|
}
|
|
372
|
-
// handle predefined rocksdb properties
|
|
373
|
-
uint64_t val;
|
|
374
|
-
if (!GetVarint64(&raw_val, &val)) {
|
|
375
|
-
// skip malformed value
|
|
376
|
-
auto error_msg =
|
|
377
|
-
"Detect malformed value in properties meta-block:"
|
|
378
|
-
"\tkey: " +
|
|
379
|
-
key + "\tval: " + raw_val.ToString();
|
|
380
|
-
ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
|
|
381
|
-
continue;
|
|
382
|
-
}
|
|
383
|
-
*(pos->second) = val;
|
|
384
|
-
} else if (key == TablePropertiesNames::kDbId) {
|
|
385
|
-
new_table_properties->db_id = raw_val.ToString();
|
|
386
|
-
} else if (key == TablePropertiesNames::kDbSessionId) {
|
|
387
|
-
new_table_properties->db_session_id = raw_val.ToString();
|
|
388
|
-
} else if (key == TablePropertiesNames::kDbHostId) {
|
|
389
|
-
new_table_properties->db_host_id = raw_val.ToString();
|
|
390
|
-
} else if (key == TablePropertiesNames::kFilterPolicy) {
|
|
391
|
-
new_table_properties->filter_policy_name = raw_val.ToString();
|
|
392
|
-
} else if (key == TablePropertiesNames::kColumnFamilyName) {
|
|
393
|
-
new_table_properties->column_family_name = raw_val.ToString();
|
|
394
|
-
} else if (key == TablePropertiesNames::kComparator) {
|
|
395
|
-
new_table_properties->comparator_name = raw_val.ToString();
|
|
396
|
-
} else if (key == TablePropertiesNames::kMergeOperator) {
|
|
397
|
-
new_table_properties->merge_operator_name = raw_val.ToString();
|
|
398
|
-
} else if (key == TablePropertiesNames::kPrefixExtractorName) {
|
|
399
|
-
new_table_properties->prefix_extractor_name = raw_val.ToString();
|
|
400
|
-
} else if (key == TablePropertiesNames::kPropertyCollectors) {
|
|
401
|
-
new_table_properties->property_collectors_names = raw_val.ToString();
|
|
402
|
-
} else if (key == TablePropertiesNames::kCompression) {
|
|
403
|
-
new_table_properties->compression_name = raw_val.ToString();
|
|
404
|
-
} else if (key == TablePropertiesNames::kCompressionOptions) {
|
|
405
|
-
new_table_properties->compression_options = raw_val.ToString();
|
|
406
|
-
} else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
|
|
407
|
-
new_table_properties->seqno_to_time_mapping = raw_val.ToString();
|
|
408
|
-
} else {
|
|
409
|
-
// handle user-collected properties
|
|
410
|
-
new_table_properties->user_collected_properties.insert(
|
|
411
|
-
{key, raw_val.ToString()});
|
|
412
457
|
}
|
|
413
|
-
}
|
|
414
458
|
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
459
|
+
// Modified version of BlockFetcher checksum verification
|
|
460
|
+
// (See write_global_seqno comment above)
|
|
461
|
+
if (s.ok() && footer.GetBlockTrailerSize() > 0) {
|
|
462
|
+
s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
|
|
463
|
+
file->file_name(), handle.offset());
|
|
464
|
+
if (s.IsCorruption()) {
|
|
465
|
+
if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
|
|
466
|
+
std::string tmp_buf(properties_block.data(), len);
|
|
467
|
+
uint64_t global_seqno_offset =
|
|
468
|
+
new_table_properties->external_sst_file_global_seqno_offset -
|
|
469
|
+
handle.offset();
|
|
470
|
+
EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
|
|
471
|
+
s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
|
|
472
|
+
file->file_name(), handle.offset());
|
|
473
|
+
}
|
|
430
474
|
}
|
|
431
475
|
}
|
|
432
|
-
}
|
|
433
476
|
|
|
434
|
-
|
|
435
|
-
|
|
477
|
+
// If we detected a corruption and the file system supports verification
|
|
478
|
+
// and reconstruction, retry the read
|
|
479
|
+
if (s.IsCorruption() && !retry &&
|
|
480
|
+
CheckFSFeatureSupport(ioptions.fs.get(),
|
|
481
|
+
FSSupportedOps::kVerifyAndReconstructRead)) {
|
|
482
|
+
retry = true;
|
|
483
|
+
} else {
|
|
484
|
+
if (s.ok()) {
|
|
485
|
+
*table_properties = std::move(new_table_properties);
|
|
486
|
+
}
|
|
487
|
+
break;
|
|
488
|
+
}
|
|
436
489
|
}
|
|
437
490
|
|
|
438
491
|
return s;
|
|
@@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator(
|
|
|
201
201
|
assert(table_properties_);
|
|
202
202
|
|
|
203
203
|
// Auto prefix mode is not implemented in PlainTable.
|
|
204
|
-
bool use_prefix_seek =
|
|
205
|
-
|
|
204
|
+
bool use_prefix_seek =
|
|
205
|
+
!IsTotalOrderMode() &&
|
|
206
|
+
(options.prefix_same_as_start ||
|
|
207
|
+
(!options.total_order_seek && !options.auto_prefix_mode));
|
|
206
208
|
if (arena == nullptr) {
|
|
207
209
|
return new PlainTableIterator(this, use_prefix_seek);
|
|
208
210
|
} else {
|
|
@@ -113,6 +113,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
|
|
|
113
113
|
user_defined_timestamps_persisted ? std::string("true")
|
|
114
114
|
: std::string("false"),
|
|
115
115
|
prop_delim, kv_delim);
|
|
116
|
+
AppendProperty(result, "largest sequence number in file", key_largest_seqno,
|
|
117
|
+
prop_delim, kv_delim);
|
|
116
118
|
|
|
117
119
|
AppendProperty(
|
|
118
120
|
result, "merge operator name",
|
|
@@ -311,6 +313,8 @@ const std::string TablePropertiesNames::kTailStartOffset =
|
|
|
311
313
|
"rocksdb.tail.start.offset";
|
|
312
314
|
const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted =
|
|
313
315
|
"rocksdb.user.defined.timestamps.persisted";
|
|
316
|
+
const std::string TablePropertiesNames::kKeyLargestSeqno =
|
|
317
|
+
"rocksdb.key.largest.seqno";
|
|
314
318
|
|
|
315
319
|
#ifndef NDEBUG
|
|
316
320
|
// WARNING: TEST_SetRandomTableProperties assumes the following layout of
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
|
|
13
13
|
#include "db/range_tombstone_fragmenter.h"
|
|
14
14
|
#if USE_COROUTINES
|
|
15
|
-
#include "folly/
|
|
16
|
-
#include "folly/
|
|
15
|
+
#include "folly/coro/Coroutine.h"
|
|
16
|
+
#include "folly/coro/Task.h"
|
|
17
17
|
#endif
|
|
18
18
|
#include "rocksdb/slice_transform.h"
|
|
19
19
|
#include "rocksdb/table_reader_caller.h"
|