@nxtedition/rocksdb 12.1.4 → 12.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. package/binding.cc +1 -1
  2. package/deps/rocksdb/rocksdb/Makefile +10 -5
  3. package/deps/rocksdb/rocksdb/TARGETS +9 -7
  4. package/deps/rocksdb/rocksdb/cache/cache.cc +15 -11
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +26 -0
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +16 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +6 -0
  8. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +38 -8
  9. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -0
  10. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +4 -0
  11. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +11 -0
  12. package/deps/rocksdb/rocksdb/cache/lru_cache.h +6 -0
  13. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +2 -1
  14. package/deps/rocksdb/rocksdb/cache/tiered_secondary_cache_test.cc +56 -0
  15. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +12 -9
  16. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +10 -0
  17. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +9 -0
  18. package/deps/rocksdb/rocksdb/db/c.cc +9 -0
  19. package/deps/rocksdb/rocksdb/db/c_test.c +12 -1
  20. package/deps/rocksdb/rocksdb/db/column_family.cc +6 -23
  21. package/deps/rocksdb/rocksdb/db/column_family.h +1 -2
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +4 -5
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +4 -4
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +14 -6
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +19 -16
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +34 -30
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +2 -1
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +1 -1
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +16 -31
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +2 -1
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +7 -50
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +95 -84
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_test.cc +616 -5
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_state.cc +1 -1
  36. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.cc +1 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +1 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +8 -2
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +93 -69
  40. package/deps/rocksdb/rocksdb/db/db_bloom_filter_test.cc +353 -89
  41. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +4 -3
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +116 -14
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +67 -8
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +42 -14
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +50 -0
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +79 -32
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +36 -59
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +72 -39
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -12
  51. package/deps/rocksdb/rocksdb/db/db_io_failure_test.cc +75 -0
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +7 -3
  53. package/deps/rocksdb/rocksdb/db/db_secondary_test.cc +1 -1
  54. package/deps/rocksdb/rocksdb/db/db_sst_test.cc +24 -0
  55. package/deps/rocksdb/rocksdb/db/db_test2.cc +36 -22
  56. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +23 -0
  57. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +2 -0
  58. package/deps/rocksdb/rocksdb/db/error_handler.cc +28 -3
  59. package/deps/rocksdb/rocksdb/db/error_handler.h +2 -1
  60. package/deps/rocksdb/rocksdb/db/event_helpers.cc +1 -0
  61. package/deps/rocksdb/rocksdb/db/experimental.cc +165 -33
  62. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +13 -5
  63. package/deps/rocksdb/rocksdb/db/external_sst_file_test.cc +37 -28
  64. package/deps/rocksdb/rocksdb/db/flush_job.cc +11 -6
  65. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +7 -6
  66. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +14 -6
  67. package/deps/rocksdb/rocksdb/db/job_context.h +4 -0
  68. package/deps/rocksdb/rocksdb/db/memtable.cc +24 -14
  69. package/deps/rocksdb/rocksdb/db/memtable.h +2 -1
  70. package/deps/rocksdb/rocksdb/db/memtable_list.cc +61 -33
  71. package/deps/rocksdb/rocksdb/db/memtable_list.h +8 -0
  72. package/deps/rocksdb/rocksdb/db/repair.cc +4 -2
  73. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  74. package/deps/rocksdb/rocksdb/db/version_builder.cc +14 -11
  75. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +20 -4
  76. package/deps/rocksdb/rocksdb/db/version_set.cc +40 -30
  77. package/deps/rocksdb/rocksdb/db/version_set.h +13 -3
  78. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -76
  79. package/deps/rocksdb/rocksdb/db/write_batch.cc +6 -2
  80. package/deps/rocksdb/rocksdb/db/write_batch_test.cc +1 -1
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +1 -0
  82. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +5 -1
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +2 -1
  84. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +25 -2
  85. package/deps/rocksdb/rocksdb/env/fs_remap.cc +11 -0
  86. package/deps/rocksdb/rocksdb/env/fs_remap.h +5 -0
  87. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.cc +11 -1
  88. package/deps/rocksdb/rocksdb/file/sst_file_manager_impl.h +3 -1
  89. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +20 -1
  90. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +10 -8
  91. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +4 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_job_stats.h +30 -28
  93. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +10 -5
  94. package/deps/rocksdb/rocksdb/include/rocksdb/convenience.h +3 -1
  95. package/deps/rocksdb/rocksdb/include/rocksdb/experimental.h +287 -83
  96. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +68 -36
  97. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +8 -0
  98. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  99. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  100. package/deps/rocksdb/rocksdb/memtable/inlineskiplist.h +4 -4
  101. package/deps/rocksdb/rocksdb/options/customizable_test.cc +31 -0
  102. package/deps/rocksdb/rocksdb/options/db_options.cc +14 -0
  103. package/deps/rocksdb/rocksdb/options/db_options.h +2 -0
  104. package/deps/rocksdb/rocksdb/options/options_helper.cc +15 -4
  105. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  106. package/deps/rocksdb/rocksdb/options/options_parser.cc +5 -4
  107. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +11 -1
  108. package/deps/rocksdb/rocksdb/options/options_test.cc +38 -45
  109. package/deps/rocksdb/rocksdb/port/port.h +16 -0
  110. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +8 -1
  111. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +10 -20
  112. package/deps/rocksdb/rocksdb/table/block_based/filter_policy.cc +15 -9
  113. package/deps/rocksdb/rocksdb/table/format.cc +32 -4
  114. package/deps/rocksdb/rocksdb/table/format.h +12 -1
  115. package/deps/rocksdb/rocksdb/table/iterator.cc +4 -0
  116. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +214 -161
  117. package/deps/rocksdb/rocksdb/table/plain/plain_table_reader.cc +4 -2
  118. package/deps/rocksdb/rocksdb/table/table_properties.cc +4 -0
  119. package/deps/rocksdb/rocksdb/table/table_reader.h +2 -2
  120. package/deps/rocksdb/rocksdb/table/table_test.cc +5 -4
  121. package/deps/rocksdb/rocksdb/test_util/testutil.cc +2 -0
  122. package/deps/rocksdb/rocksdb/test_util/testutil.h +2 -0
  123. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +11 -2
  124. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +213 -22
  125. package/deps/rocksdb/rocksdb/tools/ldb_cmd_impl.h +3 -0
  126. package/deps/rocksdb/rocksdb/util/async_file_reader.h +1 -1
  127. package/deps/rocksdb/rocksdb/util/compaction_job_stats_impl.cc +3 -0
  128. package/deps/rocksdb/rocksdb/util/coro_utils.h +2 -2
  129. package/deps/rocksdb/rocksdb/utilities/fault_injection_fs.cc +3 -3
  130. package/package.json +1 -1
  131. package/prebuilds/darwin-arm64/@nxtedition+rocksdb.node +0 -0
  132. package/prebuilds/linux-x64/@nxtedition+rocksdb.node +0 -0
@@ -680,26 +680,12 @@ Status BlockBasedTable::Open(
680
680
  if (s.ok()) {
681
681
  s = ReadFooterFromFile(opts, file.get(), *ioptions.fs,
682
682
  prefetch_buffer.get(), file_size, &footer,
683
- kBlockBasedTableMagicNumber);
684
- }
685
- // If the footer is corrupted and the FS supports checksum verification and
686
- // correction, try reading the footer again
687
- if (s.IsCorruption()) {
688
- RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
689
- if (CheckFSFeatureSupport(ioptions.fs.get(),
690
- FSSupportedOps::kVerifyAndReconstructRead)) {
691
- IOOptions retry_opts = opts;
692
- retry_opts.verify_and_reconstruct_read = true;
693
- s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs,
694
- prefetch_buffer.get(), file_size, &footer,
695
- kBlockBasedTableMagicNumber);
696
- RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
697
- if (s.ok()) {
698
- RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
699
- }
700
- }
683
+ kBlockBasedTableMagicNumber, ioptions.stats);
701
684
  }
702
685
  if (!s.ok()) {
686
+ if (s.IsCorruption()) {
687
+ RecordTick(ioptions.statistics.get(), SST_FOOTER_CORRUPTION_COUNT);
688
+ }
703
689
  return s;
704
690
  }
705
691
  if (!IsSupportedFormatVersion(footer.format_version())) {
@@ -2077,7 +2063,9 @@ InternalIterator* BlockBasedTable::NewIterator(
2077
2063
  if (arena == nullptr) {
2078
2064
  return new BlockBasedTableIterator(
2079
2065
  this, read_options, rep_->internal_comparator, std::move(index_iter),
2080
- !skip_filters && !read_options.total_order_seek &&
2066
+ !skip_filters &&
2067
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
2068
+ read_options.prefix_same_as_start) &&
2081
2069
  prefix_extractor != nullptr,
2082
2070
  need_upper_bound_check, prefix_extractor, caller,
2083
2071
  compaction_readahead_size, allow_unprepared_value);
@@ -2085,7 +2073,9 @@ InternalIterator* BlockBasedTable::NewIterator(
2085
2073
  auto* mem = arena->AllocateAligned(sizeof(BlockBasedTableIterator));
2086
2074
  return new (mem) BlockBasedTableIterator(
2087
2075
  this, read_options, rep_->internal_comparator, std::move(index_iter),
2088
- !skip_filters && !read_options.total_order_seek &&
2076
+ !skip_filters &&
2077
+ (!read_options.total_order_seek || read_options.auto_prefix_mode ||
2078
+ read_options.prefix_same_as_start) &&
2089
2079
  prefix_extractor != nullptr,
2090
2080
  need_upper_bound_check, prefix_extractor, caller,
2091
2081
  compaction_readahead_size, allow_unprepared_value);
@@ -91,9 +91,24 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
91
91
  uint64_t alt_hash = GetSliceHash64(alt);
92
92
  std::optional<uint64_t> prev_key_hash;
93
93
  std::optional<uint64_t> prev_alt_hash = hash_entries_info_.prev_alt_hash;
94
+
94
95
  if (!hash_entries_info_.entries.empty()) {
95
96
  prev_key_hash = hash_entries_info_.entries.back();
96
97
  }
98
+
99
+ #ifdef ROCKSDB_VALGRIND_RUN
100
+ // Valgrind can report uninitialized FPs on std::optional usage. See e.g.
101
+ // https://stackoverflow.com/q/51616179
102
+ if (!prev_key_hash.has_value()) {
103
+ std::memset((void*)&prev_key_hash, 0, sizeof(prev_key_hash));
104
+ prev_key_hash.reset();
105
+ }
106
+ if (!prev_alt_hash.has_value()) {
107
+ std::memset((void*)&prev_alt_hash, 0, sizeof(prev_key_hash));
108
+ prev_alt_hash.reset();
109
+ }
110
+ #endif
111
+
97
112
  // Add alt first, so that entries.back() always contains previous key
98
113
  // ASSUMING a change from one alt to the next implies a change to
99
114
  // corresponding key
@@ -295,15 +310,6 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
295
310
  bool detect_filter_construct_corruption_;
296
311
 
297
312
  struct HashEntriesInfo {
298
- #ifdef ROCKSDB_VALGRIND_RUN
299
- HashEntriesInfo() {
300
- // Valgrind can report uninitialized FPs on std::optional usage. See e.g.
301
- // https://stackoverflow.com/q/51616179
302
- std::memset((void*)&prev_alt_hash, 0, sizeof(prev_alt_hash));
303
- prev_alt_hash = {};
304
- }
305
- #endif
306
-
307
313
  // A deque avoids unnecessary copying of already-saved values
308
314
  // and has near-minimal peak memory use.
309
315
  std::deque<uint64_t> entries;
@@ -475,10 +475,12 @@ std::string Footer::ToString() const {
475
475
  return result;
476
476
  }
477
477
 
478
- Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
479
- FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
480
- uint64_t file_size, Footer* footer,
481
- uint64_t enforce_table_magic_number) {
478
+ static Status ReadFooterFromFileInternal(const IOOptions& opts,
479
+ RandomAccessFileReader* file,
480
+ FileSystem& fs,
481
+ FilePrefetchBuffer* prefetch_buffer,
482
+ uint64_t file_size, Footer* footer,
483
+ uint64_t enforce_table_magic_number) {
482
484
  if (file_size < Footer::kMinEncodedLength) {
483
485
  return Status::Corruption("file is too short (" +
484
486
  std::to_string(file_size) +
@@ -516,6 +518,8 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
516
518
  }
517
519
  }
518
520
 
521
+ TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
522
+
519
523
  // Check that we actually read the whole footer from the file. It may be
520
524
  // that size isn't correct.
521
525
  if (footer_input.size() < Footer::kMinEncodedLength) {
@@ -543,6 +547,30 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
543
547
  return Status::OK();
544
548
  }
545
549
 
550
+ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
551
+ FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
552
+ uint64_t file_size, Footer* footer,
553
+ uint64_t enforce_table_magic_number,
554
+ Statistics* stats) {
555
+ Status s =
556
+ ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size,
557
+ footer, enforce_table_magic_number);
558
+ if (s.IsCorruption() &&
559
+ CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
560
+ IOOptions new_opts = opts;
561
+ new_opts.verify_and_reconstruct_read = true;
562
+ footer->Reset();
563
+ s = ReadFooterFromFileInternal(new_opts, file, fs, prefetch_buffer,
564
+ file_size, footer,
565
+ enforce_table_magic_number);
566
+ RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
567
+ if (s.ok()) {
568
+ RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
569
+ }
570
+ }
571
+ return s;
572
+ }
573
+
546
574
  namespace {
547
575
  // Custom handling for the last byte of a block, to avoid invoking streaming
548
576
  // API to get an effective block checksum. This function is its own inverse
@@ -186,6 +186,16 @@ class Footer {
186
186
  // Create empty. Populate using DecodeFrom.
187
187
  Footer() {}
188
188
 
189
+ void Reset() {
190
+ table_magic_number_ = kNullTableMagicNumber;
191
+ format_version_ = kInvalidFormatVersion;
192
+ base_context_checksum_ = 0;
193
+ metaindex_handle_ = BlockHandle::NullBlockHandle();
194
+ index_handle_ = BlockHandle::NullBlockHandle();
195
+ checksum_type_ = kInvalidChecksumType;
196
+ block_trailer_size_ = 0;
197
+ }
198
+
189
199
  // Deserialize a footer (populate fields) from `input` and check for various
190
200
  // corruptions. `input_offset` is the offset within the target file of
191
201
  // `input` buffer, which is needed for verifying format_version >= 6 footer.
@@ -304,7 +314,8 @@ class FooterBuilder {
304
314
  Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
305
315
  FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
306
316
  uint64_t file_size, Footer* footer,
307
- uint64_t enforce_table_magic_number = 0);
317
+ uint64_t enforce_table_magic_number = 0,
318
+ Statistics* stats = nullptr);
308
319
 
309
320
  // Computes a checksum using the given ChecksumType. Sometimes we need to
310
321
  // include one more input byte logically at the end but not part of the main
@@ -74,6 +74,10 @@ class EmptyInternalIterator : public InternalIteratorBase<TValue> {
74
74
  assert(false);
75
75
  return TValue();
76
76
  }
77
+ uint64_t write_unix_time() const override {
78
+ assert(false);
79
+ return std::numeric_limits<uint64_t>::max();
80
+ }
77
81
  Status status() const override { return status_; }
78
82
 
79
83
  private:
@@ -163,6 +163,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
163
163
  Add(TablePropertiesNames::kSequenceNumberTimeMapping,
164
164
  props.seqno_to_time_mapping);
165
165
  }
166
+ if (props.key_largest_seqno != UINT64_MAX) {
167
+ Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
168
+ }
166
169
  }
167
170
 
168
171
  Slice PropertyBlockBuilder::Finish() {
@@ -259,180 +262,230 @@ Status ReadTablePropertiesHelper(
259
262
  MemoryAllocator* memory_allocator) {
260
263
  assert(table_properties);
261
264
 
262
- // If this is an external SST file ingested with write_global_seqno set to
263
- // true, then we expect the checksum mismatch because checksum was written
264
- // by SstFileWriter, but its global seqno in the properties block may have
265
- // been changed during ingestion. For this reason, we initially read
266
- // and process without checksum verification, then later try checksum
267
- // verification so that if it fails, we can copy to a temporary buffer with
268
- // global seqno set to its original value, i.e. 0, and attempt checksum
269
- // verification again.
270
- ReadOptions modified_ro = ro;
271
- modified_ro.verify_checksums = false;
272
- BlockContents block_contents;
273
- BlockFetcher block_fetcher(file, prefetch_buffer, footer, modified_ro, handle,
274
- &block_contents, ioptions, false /* decompress */,
275
- false /*maybe_compressed*/, BlockType::kProperties,
276
- UncompressionDict::GetEmptyDict(),
277
- PersistentCacheOptions::kEmpty, memory_allocator);
278
- Status s = block_fetcher.ReadBlockContents();
279
- if (!s.ok()) {
280
- return s;
281
- }
282
-
283
- // Unfortunately, Block::size() might not equal block_contents.data.size(),
284
- // and Block hides block_contents
285
- uint64_t block_size = block_contents.data.size();
286
- Block properties_block(std::move(block_contents));
287
- std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
288
-
289
- std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
290
- // All pre-defined properties of type uint64_t
291
- std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
292
- {TablePropertiesNames::kOriginalFileNumber,
293
- &new_table_properties->orig_file_number},
294
- {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
295
- {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
296
- {TablePropertiesNames::kIndexPartitions,
297
- &new_table_properties->index_partitions},
298
- {TablePropertiesNames::kTopLevelIndexSize,
299
- &new_table_properties->top_level_index_size},
300
- {TablePropertiesNames::kIndexKeyIsUserKey,
301
- &new_table_properties->index_key_is_user_key},
302
- {TablePropertiesNames::kIndexValueIsDeltaEncoded,
303
- &new_table_properties->index_value_is_delta_encoded},
304
- {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
305
- {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
306
- {TablePropertiesNames::kRawValueSize,
307
- &new_table_properties->raw_value_size},
308
- {TablePropertiesNames::kNumDataBlocks,
309
- &new_table_properties->num_data_blocks},
310
- {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
311
- {TablePropertiesNames::kNumFilterEntries,
312
- &new_table_properties->num_filter_entries},
313
- {TablePropertiesNames::kDeletedKeys,
314
- &new_table_properties->num_deletions},
315
- {TablePropertiesNames::kMergeOperands,
316
- &new_table_properties->num_merge_operands},
317
- {TablePropertiesNames::kNumRangeDeletions,
318
- &new_table_properties->num_range_deletions},
319
- {TablePropertiesNames::kFormatVersion,
320
- &new_table_properties->format_version},
321
- {TablePropertiesNames::kFixedKeyLen,
322
- &new_table_properties->fixed_key_len},
323
- {TablePropertiesNames::kColumnFamilyId,
324
- &new_table_properties->column_family_id},
325
- {TablePropertiesNames::kCreationTime,
326
- &new_table_properties->creation_time},
327
- {TablePropertiesNames::kOldestKeyTime,
328
- &new_table_properties->oldest_key_time},
329
- {TablePropertiesNames::kFileCreationTime,
330
- &new_table_properties->file_creation_time},
331
- {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
332
- &new_table_properties->slow_compression_estimated_data_size},
333
- {TablePropertiesNames::kFastCompressionEstimatedDataSize,
334
- &new_table_properties->fast_compression_estimated_data_size},
335
- {TablePropertiesNames::kTailStartOffset,
336
- &new_table_properties->tail_start_offset},
337
- {TablePropertiesNames::kUserDefinedTimestampsPersisted,
338
- &new_table_properties->user_defined_timestamps_persisted},
339
- };
340
-
341
- std::string last_key;
342
- for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
343
- s = iter->status();
344
- if (!s.ok()) {
345
- break;
265
+ Status s;
266
+ bool retry = false;
267
+ while (true) {
268
+ BlockContents block_contents;
269
+ size_t len = handle.size() + footer.GetBlockTrailerSize();
270
+ // If this is an external SST file ingested with write_global_seqno set to
271
+ // true, then we expect the checksum mismatch because checksum was written
272
+ // by SstFileWriter, but its global seqno in the properties block may have
273
+ // been changed during ingestion. For this reason, we initially read
274
+ // and process without checksum verification, then later try checksum
275
+ // verification so that if it fails, we can copy to a temporary buffer with
276
+ // global seqno set to its original value, i.e. 0, and attempt checksum
277
+ // verification again.
278
+ if (!retry) {
279
+ ReadOptions modified_ro = ro;
280
+ modified_ro.verify_checksums = false;
281
+ BlockFetcher block_fetcher(
282
+ file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
283
+ ioptions, false /* decompress */, false /*maybe_compressed*/,
284
+ BlockType::kProperties, UncompressionDict::GetEmptyDict(),
285
+ PersistentCacheOptions::kEmpty, memory_allocator);
286
+ s = block_fetcher.ReadBlockContents();
287
+ if (!s.ok()) {
288
+ return s;
289
+ }
290
+ assert(block_fetcher.GetBlockSizeWithTrailer() == len);
291
+ TEST_SYNC_POINT_CALLBACK("ReadTablePropertiesHelper:0",
292
+ &block_contents.data);
293
+ } else {
294
+ assert(s.IsCorruption());
295
+ // If retrying, use a stronger file system read to check and correct
296
+ // data corruption
297
+ IOOptions opts;
298
+ if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) !=
299
+ IOStatus::OK()) {
300
+ return s;
301
+ }
302
+ opts.verify_and_reconstruct_read = true;
303
+ std::unique_ptr<char[]> data(new char[len]);
304
+ Slice result;
305
+ IOStatus io_s =
306
+ file->Read(opts, handle.offset(), len, &result, data.get(), nullptr);
307
+ RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
308
+ if (!io_s.ok()) {
309
+ ROCKS_LOG_INFO(ioptions.info_log,
310
+ "Reading properties block failed - %s",
311
+ io_s.ToString().c_str());
312
+ // Return the original corruption error as that's more serious
313
+ return s;
314
+ }
315
+ if (result.size() < len) {
316
+ return Status::Corruption("Reading properties block failed - " +
317
+ std::to_string(result.size()) +
318
+ " bytes read");
319
+ }
320
+ RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
321
+ block_contents = BlockContents(std::move(data), handle.size());
346
322
  }
347
323
 
348
- auto key = iter->key().ToString();
349
- // properties block should be strictly sorted with no duplicate key.
350
- if (!last_key.empty() &&
351
- BytewiseComparator()->Compare(key, last_key) <= 0) {
352
- s = Status::Corruption("properties unsorted");
353
- break;
354
- }
355
- last_key = key;
324
+ uint64_t block_size = block_contents.data.size();
325
+ Block properties_block(std::move(block_contents));
326
+ // Unfortunately, Block::size() might not equal block_contents.data.size(),
327
+ // and Block hides block_contents
328
+ std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
329
+
330
+ std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
331
+ // All pre-defined properties of type uint64_t
332
+ std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
333
+ {TablePropertiesNames::kOriginalFileNumber,
334
+ &new_table_properties->orig_file_number},
335
+ {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
336
+ {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
337
+ {TablePropertiesNames::kIndexPartitions,
338
+ &new_table_properties->index_partitions},
339
+ {TablePropertiesNames::kTopLevelIndexSize,
340
+ &new_table_properties->top_level_index_size},
341
+ {TablePropertiesNames::kIndexKeyIsUserKey,
342
+ &new_table_properties->index_key_is_user_key},
343
+ {TablePropertiesNames::kIndexValueIsDeltaEncoded,
344
+ &new_table_properties->index_value_is_delta_encoded},
345
+ {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
346
+ {TablePropertiesNames::kRawKeySize,
347
+ &new_table_properties->raw_key_size},
348
+ {TablePropertiesNames::kRawValueSize,
349
+ &new_table_properties->raw_value_size},
350
+ {TablePropertiesNames::kNumDataBlocks,
351
+ &new_table_properties->num_data_blocks},
352
+ {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
353
+ {TablePropertiesNames::kNumFilterEntries,
354
+ &new_table_properties->num_filter_entries},
355
+ {TablePropertiesNames::kDeletedKeys,
356
+ &new_table_properties->num_deletions},
357
+ {TablePropertiesNames::kMergeOperands,
358
+ &new_table_properties->num_merge_operands},
359
+ {TablePropertiesNames::kNumRangeDeletions,
360
+ &new_table_properties->num_range_deletions},
361
+ {TablePropertiesNames::kFormatVersion,
362
+ &new_table_properties->format_version},
363
+ {TablePropertiesNames::kFixedKeyLen,
364
+ &new_table_properties->fixed_key_len},
365
+ {TablePropertiesNames::kColumnFamilyId,
366
+ &new_table_properties->column_family_id},
367
+ {TablePropertiesNames::kCreationTime,
368
+ &new_table_properties->creation_time},
369
+ {TablePropertiesNames::kOldestKeyTime,
370
+ &new_table_properties->oldest_key_time},
371
+ {TablePropertiesNames::kFileCreationTime,
372
+ &new_table_properties->file_creation_time},
373
+ {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
374
+ &new_table_properties->slow_compression_estimated_data_size},
375
+ {TablePropertiesNames::kFastCompressionEstimatedDataSize,
376
+ &new_table_properties->fast_compression_estimated_data_size},
377
+ {TablePropertiesNames::kTailStartOffset,
378
+ &new_table_properties->tail_start_offset},
379
+ {TablePropertiesNames::kUserDefinedTimestampsPersisted,
380
+ &new_table_properties->user_defined_timestamps_persisted},
381
+ {TablePropertiesNames::kKeyLargestSeqno,
382
+ &new_table_properties->key_largest_seqno},
383
+ };
384
+
385
+ std::string last_key;
386
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
387
+ s = iter->status();
388
+ if (!s.ok()) {
389
+ break;
390
+ }
356
391
 
357
- auto raw_val = iter->value();
358
- auto pos = predefined_uint64_properties.find(key);
392
+ auto key = iter->key().ToString();
393
+ // properties block should be strictly sorted with no duplicate key.
394
+ if (!last_key.empty() &&
395
+ BytewiseComparator()->Compare(key, last_key) <= 0) {
396
+ s = Status::Corruption("properties unsorted");
397
+ break;
398
+ }
399
+ last_key = key;
359
400
 
360
- if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
361
- new_table_properties->external_sst_file_global_seqno_offset =
362
- handle.offset() + iter->ValueOffset();
363
- }
401
+ auto raw_val = iter->value();
402
+ auto pos = predefined_uint64_properties.find(key);
364
403
 
365
- if (pos != predefined_uint64_properties.end()) {
366
- if (key == TablePropertiesNames::kDeletedKeys ||
367
- key == TablePropertiesNames::kMergeOperands) {
368
- // Insert in user-collected properties for API backwards compatibility
404
+ if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
405
+ new_table_properties->external_sst_file_global_seqno_offset =
406
+ handle.offset() + iter->ValueOffset();
407
+ }
408
+
409
+ if (pos != predefined_uint64_properties.end()) {
410
+ if (key == TablePropertiesNames::kDeletedKeys ||
411
+ key == TablePropertiesNames::kMergeOperands) {
412
+ // Insert in user-collected properties for API backwards compatibility
413
+ new_table_properties->user_collected_properties.insert(
414
+ {key, raw_val.ToString()});
415
+ }
416
+ // handle predefined rocksdb properties
417
+ uint64_t val;
418
+ if (!GetVarint64(&raw_val, &val)) {
419
+ // skip malformed value
420
+ auto error_msg =
421
+ "Detect malformed value in properties meta-block:"
422
+ "\tkey: " +
423
+ key + "\tval: " + raw_val.ToString();
424
+ ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
425
+ continue;
426
+ }
427
+ *(pos->second) = val;
428
+ } else if (key == TablePropertiesNames::kDbId) {
429
+ new_table_properties->db_id = raw_val.ToString();
430
+ } else if (key == TablePropertiesNames::kDbSessionId) {
431
+ new_table_properties->db_session_id = raw_val.ToString();
432
+ } else if (key == TablePropertiesNames::kDbHostId) {
433
+ new_table_properties->db_host_id = raw_val.ToString();
434
+ } else if (key == TablePropertiesNames::kFilterPolicy) {
435
+ new_table_properties->filter_policy_name = raw_val.ToString();
436
+ } else if (key == TablePropertiesNames::kColumnFamilyName) {
437
+ new_table_properties->column_family_name = raw_val.ToString();
438
+ } else if (key == TablePropertiesNames::kComparator) {
439
+ new_table_properties->comparator_name = raw_val.ToString();
440
+ } else if (key == TablePropertiesNames::kMergeOperator) {
441
+ new_table_properties->merge_operator_name = raw_val.ToString();
442
+ } else if (key == TablePropertiesNames::kPrefixExtractorName) {
443
+ new_table_properties->prefix_extractor_name = raw_val.ToString();
444
+ } else if (key == TablePropertiesNames::kPropertyCollectors) {
445
+ new_table_properties->property_collectors_names = raw_val.ToString();
446
+ } else if (key == TablePropertiesNames::kCompression) {
447
+ new_table_properties->compression_name = raw_val.ToString();
448
+ } else if (key == TablePropertiesNames::kCompressionOptions) {
449
+ new_table_properties->compression_options = raw_val.ToString();
450
+ } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
451
+ new_table_properties->seqno_to_time_mapping = raw_val.ToString();
452
+ } else {
453
+ // handle user-collected properties
369
454
  new_table_properties->user_collected_properties.insert(
370
455
  {key, raw_val.ToString()});
371
456
  }
372
- // handle predefined rocksdb properties
373
- uint64_t val;
374
- if (!GetVarint64(&raw_val, &val)) {
375
- // skip malformed value
376
- auto error_msg =
377
- "Detect malformed value in properties meta-block:"
378
- "\tkey: " +
379
- key + "\tval: " + raw_val.ToString();
380
- ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
381
- continue;
382
- }
383
- *(pos->second) = val;
384
- } else if (key == TablePropertiesNames::kDbId) {
385
- new_table_properties->db_id = raw_val.ToString();
386
- } else if (key == TablePropertiesNames::kDbSessionId) {
387
- new_table_properties->db_session_id = raw_val.ToString();
388
- } else if (key == TablePropertiesNames::kDbHostId) {
389
- new_table_properties->db_host_id = raw_val.ToString();
390
- } else if (key == TablePropertiesNames::kFilterPolicy) {
391
- new_table_properties->filter_policy_name = raw_val.ToString();
392
- } else if (key == TablePropertiesNames::kColumnFamilyName) {
393
- new_table_properties->column_family_name = raw_val.ToString();
394
- } else if (key == TablePropertiesNames::kComparator) {
395
- new_table_properties->comparator_name = raw_val.ToString();
396
- } else if (key == TablePropertiesNames::kMergeOperator) {
397
- new_table_properties->merge_operator_name = raw_val.ToString();
398
- } else if (key == TablePropertiesNames::kPrefixExtractorName) {
399
- new_table_properties->prefix_extractor_name = raw_val.ToString();
400
- } else if (key == TablePropertiesNames::kPropertyCollectors) {
401
- new_table_properties->property_collectors_names = raw_val.ToString();
402
- } else if (key == TablePropertiesNames::kCompression) {
403
- new_table_properties->compression_name = raw_val.ToString();
404
- } else if (key == TablePropertiesNames::kCompressionOptions) {
405
- new_table_properties->compression_options = raw_val.ToString();
406
- } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
407
- new_table_properties->seqno_to_time_mapping = raw_val.ToString();
408
- } else {
409
- // handle user-collected properties
410
- new_table_properties->user_collected_properties.insert(
411
- {key, raw_val.ToString()});
412
457
  }
413
- }
414
458
 
415
- // Modified version of BlockFetcher checksum verification
416
- // (See write_global_seqno comment above)
417
- if (s.ok() && footer.GetBlockTrailerSize() > 0) {
418
- s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
419
- file->file_name(), handle.offset());
420
- if (s.IsCorruption()) {
421
- if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
422
- std::string tmp_buf(properties_block.data(),
423
- block_fetcher.GetBlockSizeWithTrailer());
424
- uint64_t global_seqno_offset =
425
- new_table_properties->external_sst_file_global_seqno_offset -
426
- handle.offset();
427
- EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
428
- s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
429
- file->file_name(), handle.offset());
459
+ // Modified version of BlockFetcher checksum verification
460
+ // (See write_global_seqno comment above)
461
+ if (s.ok() && footer.GetBlockTrailerSize() > 0) {
462
+ s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
463
+ file->file_name(), handle.offset());
464
+ if (s.IsCorruption()) {
465
+ if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
466
+ std::string tmp_buf(properties_block.data(), len);
467
+ uint64_t global_seqno_offset =
468
+ new_table_properties->external_sst_file_global_seqno_offset -
469
+ handle.offset();
470
+ EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
471
+ s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
472
+ file->file_name(), handle.offset());
473
+ }
430
474
  }
431
475
  }
432
- }
433
476
 
434
- if (s.ok()) {
435
- *table_properties = std::move(new_table_properties);
477
+ // If we detected a corruption and the file system supports verification
478
+ // and reconstruction, retry the read
479
+ if (s.IsCorruption() && !retry &&
480
+ CheckFSFeatureSupport(ioptions.fs.get(),
481
+ FSSupportedOps::kVerifyAndReconstructRead)) {
482
+ retry = true;
483
+ } else {
484
+ if (s.ok()) {
485
+ *table_properties = std::move(new_table_properties);
486
+ }
487
+ break;
488
+ }
436
489
  }
437
490
 
438
491
  return s;
@@ -201,8 +201,10 @@ InternalIterator* PlainTableReader::NewIterator(
201
201
  assert(table_properties_);
202
202
 
203
203
  // Auto prefix mode is not implemented in PlainTable.
204
- bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek &&
205
- !options.auto_prefix_mode;
204
+ bool use_prefix_seek =
205
+ !IsTotalOrderMode() &&
206
+ (options.prefix_same_as_start ||
207
+ (!options.total_order_seek && !options.auto_prefix_mode));
206
208
  if (arena == nullptr) {
207
209
  return new PlainTableIterator(this, use_prefix_seek);
208
210
  } else {
@@ -113,6 +113,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
113
113
  user_defined_timestamps_persisted ? std::string("true")
114
114
  : std::string("false"),
115
115
  prop_delim, kv_delim);
116
+ AppendProperty(result, "largest sequence number in file", key_largest_seqno,
117
+ prop_delim, kv_delim);
116
118
 
117
119
  AppendProperty(
118
120
  result, "merge operator name",
@@ -311,6 +313,8 @@ const std::string TablePropertiesNames::kTailStartOffset =
311
313
  "rocksdb.tail.start.offset";
312
314
  const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted =
313
315
  "rocksdb.user.defined.timestamps.persisted";
316
+ const std::string TablePropertiesNames::kKeyLargestSeqno =
317
+ "rocksdb.key.largest.seqno";
314
318
 
315
319
  #ifndef NDEBUG
316
320
  // WARNING: TEST_SetRandomTableProperties assumes the following layout of
@@ -12,8 +12,8 @@
12
12
 
13
13
  #include "db/range_tombstone_fragmenter.h"
14
14
  #if USE_COROUTINES
15
- #include "folly/experimental/coro/Coroutine.h"
16
- #include "folly/experimental/coro/Task.h"
15
+ #include "folly/coro/Coroutine.h"
16
+ #include "folly/coro/Task.h"
17
17
  #endif
18
18
  #include "rocksdb/slice_transform.h"
19
19
  #include "rocksdb/table_reader_caller.h"