@nxtedition/rocksdb 8.0.1 → 8.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
  2. package/deps/rocksdb/rocksdb/Makefile +2 -2
  3. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +0 -5
  5. package/deps/rocksdb/rocksdb/cache/cache_test.cc +8 -29
  6. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +146 -0
  7. package/deps/rocksdb/rocksdb/cache/clock_cache.h +13 -1
  8. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +20 -146
  9. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +32 -0
  10. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +11 -0
  11. package/deps/rocksdb/rocksdb/db/column_family.cc +11 -9
  12. package/deps/rocksdb/rocksdb/db/column_family.h +20 -0
  13. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -33
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +5 -0
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +27 -8
  17. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  18. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +2 -1
  19. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +4 -2
  20. package/deps/rocksdb/rocksdb/db/compaction/compaction_job_test.cc +8 -6
  21. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +65 -7
  22. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +5 -0
  23. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +10 -32
  24. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +28 -47
  25. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +28 -22
  26. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.h +8 -14
  27. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +8 -8
  28. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.h +5 -4
  29. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +170 -140
  30. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.cc +5 -1
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_universal.h +5 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -2
  33. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +8 -0
  34. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +266 -138
  35. package/deps/rocksdb/rocksdb/db/corruption_test.cc +86 -1
  36. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +72 -5
  37. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +119 -10
  38. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +585 -264
  39. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +46 -18
  40. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +6 -15
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_experimental.cc +1 -1
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +3 -0
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +8 -8
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +10 -0
  47. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +250 -2
  48. package/deps/rocksdb/rocksdb/db/db_test.cc +3 -0
  49. package/deps/rocksdb/rocksdb/db/db_test2.cc +307 -8
  50. package/deps/rocksdb/rocksdb/db/db_wal_test.cc +129 -0
  51. package/deps/rocksdb/rocksdb/db/db_with_timestamp_compaction_test.cc +21 -0
  52. package/deps/rocksdb/rocksdb/db/dbformat.cc +25 -0
  53. package/deps/rocksdb/rocksdb/db/dbformat.h +2 -0
  54. package/deps/rocksdb/rocksdb/db/experimental.cc +1 -1
  55. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +5 -2
  56. package/deps/rocksdb/rocksdb/db/flush_job.cc +5 -2
  57. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  58. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +56 -53
  59. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +3 -4
  60. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_task_scheduler_test.cc +10 -10
  62. package/deps/rocksdb/rocksdb/db/repair.cc +64 -22
  63. package/deps/rocksdb/rocksdb/db/repair_test.cc +54 -0
  64. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +26 -26
  65. package/deps/rocksdb/rocksdb/db/table_cache.cc +2 -0
  66. package/deps/rocksdb/rocksdb/db/table_properties_collector.h +3 -1
  67. package/deps/rocksdb/rocksdb/db/version_builder.cc +90 -43
  68. package/deps/rocksdb/rocksdb/db/version_builder.h +20 -0
  69. package/deps/rocksdb/rocksdb/db/version_builder_test.cc +190 -67
  70. package/deps/rocksdb/rocksdb/db/version_edit.cc +15 -1
  71. package/deps/rocksdb/rocksdb/db/version_edit.h +16 -4
  72. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +41 -11
  73. package/deps/rocksdb/rocksdb/db/version_edit_handler.h +27 -12
  74. package/deps/rocksdb/rocksdb/db/version_edit_test.cc +18 -16
  75. package/deps/rocksdb/rocksdb/db/version_set.cc +212 -35
  76. package/deps/rocksdb/rocksdb/db/version_set.h +34 -4
  77. package/deps/rocksdb/rocksdb/db/version_set_test.cc +45 -25
  78. package/deps/rocksdb/rocksdb/db/write_thread.cc +5 -2
  79. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +0 -1
  80. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +0 -4
  81. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +12 -17
  82. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +6 -4
  83. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +1 -0
  84. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +0 -48
  85. package/deps/rocksdb/rocksdb/file/random_access_file_reader.cc +8 -0
  86. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +196 -171
  87. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +6 -0
  88. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +9 -3
  89. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +25 -18
  90. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +27 -5
  91. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +5 -0
  92. package/deps/rocksdb/rocksdb/include/rocksdb/status.h +3 -0
  93. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  94. package/deps/rocksdb/rocksdb/logging/logging.h +13 -19
  95. package/deps/rocksdb/rocksdb/memory/arena.cc +4 -3
  96. package/deps/rocksdb/rocksdb/memory/arena_test.cc +30 -0
  97. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +3 -1
  98. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  99. package/deps/rocksdb/rocksdb/src.mk +2 -1
  100. package/deps/rocksdb/rocksdb/table/adaptive/adaptive_table_factory.cc +3 -2
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -2
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +1 -1
  103. package/deps/rocksdb/rocksdb/table/block_fetcher_test.cc +3 -3
  104. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +142 -0
  105. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +241 -0
  106. package/deps/rocksdb/rocksdb/table/format.cc +24 -20
  107. package/deps/rocksdb/rocksdb/table/format.h +5 -2
  108. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +97 -115
  109. package/deps/rocksdb/rocksdb/table/merging_iterator.h +82 -1
  110. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +2 -2
  111. package/deps/rocksdb/rocksdb/table/sst_file_dumper.cc +1 -1
  112. package/deps/rocksdb/rocksdb/table/table_test.cc +7 -6
  113. package/deps/rocksdb/rocksdb/test_util/testutil.h +10 -0
  114. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +0 -6
  115. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +2 -2
  116. package/deps/rocksdb/rocksdb/util/bloom_test.cc +1 -1
  117. package/deps/rocksdb/rocksdb/util/status.cc +7 -0
  118. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +5 -0
  119. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +4 -0
  120. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +7 -67
  121. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +1 -3
  122. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +1 -0
  123. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +59 -0
  124. package/deps/rocksdb/rocksdb.gyp +2 -1
  125. package/package.json +1 -1
  126. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  127. package/prebuilds/linux-x64/node.napi.node +0 -0
  128. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +0 -580
  129. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.h +0 -476
@@ -0,0 +1,241 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ //
3
+ // This source code is licensed under both the GPLv2 (found in the
4
+ // COPYING file in the root directory) and Apache 2.0 License
5
+ // (found in the LICENSE.Apache file in the root directory).
6
+
7
+ #pragma once
8
+
9
+ #include "db/range_del_aggregator.h"
10
+ #include "rocksdb/slice.h"
11
+ #include "rocksdb/types.h"
12
+ #include "table/merging_iterator.h"
13
+
14
+ namespace ROCKSDB_NAMESPACE {
15
+
16
+ class CompactionHeapItemComparator {
17
+ public:
18
+ explicit CompactionHeapItemComparator(const InternalKeyComparator* comparator)
19
+ : comparator_(comparator) {}
20
+ bool operator()(HeapItem* a, HeapItem* b) const {
21
+ int r = comparator_->Compare(a->key(), b->key());
22
+ if (r > 0) {
23
+ return true;
24
+ } else if (r < 0) {
25
+ return false;
26
+ } else {
27
+ // When range tombstone and point key have the same internal key,
28
+ // range tombstone comes first. So that when range tombstone and
29
+ // file's largest key are the same, the file boundary sentinel key
30
+ // comes after.
31
+ return a->type == HeapItem::ITERATOR &&
32
+ b->type == HeapItem::DELETE_RANGE_START;
33
+ }
34
+ }
35
+
36
+ private:
37
+ const InternalKeyComparator* comparator_;
38
+ };
39
+
40
+ using CompactionMinHeap = BinaryHeap<HeapItem*, CompactionHeapItemComparator>;
41
+ /*
42
+ * This is a simplified version of MergingIterator and is specifically used for
43
+ * compaction. It merges the input `children` iterators into a sorted stream of
44
+ * keys. Range tombstone start keys are also emitted to prevent oversize
45
+ * compactions. For example, consider an L1 file with content [a, b), y, z,
46
+ * where [a, b) is a range tombstone and y and z are point keys. This could
47
+ * cause an oversize compaction as it can overlap with a wide range of key space
48
+ * in L2.
49
+ *
50
+ * CompactionMergingIterator emits range tombstone start keys from each LSM
51
+ * level's range tombstone iterator, and for each range tombstone
52
+ * [start,end)@seqno, the key will be start@kMaxSequenceNumber unless truncated
53
+ * at file boundary (see detail TruncatedRangeDelIterator::start_key()).
54
+ *
55
+ * Caller should use CompactionMergingIterator::IsDeleteRangeSentinelKey() to
56
+ * check if the current key is a range tombstone key.
57
+ * TODO(cbi): IsDeleteRangeSentinelKey() is used for two kinds of keys at
58
+ * different layers: file boundary and range tombstone keys. Separate them into
59
+ * two APIs for clarity.
60
+ */
61
+ class CompactionMergingIterator : public InternalIterator {
62
+ public:
63
+ CompactionMergingIterator(
64
+ const InternalKeyComparator* comparator, InternalIterator** children,
65
+ int n, bool is_arena_mode,
66
+ std::vector<
67
+ std::pair<TruncatedRangeDelIterator*, TruncatedRangeDelIterator***>>
68
+ range_tombstones)
69
+ : is_arena_mode_(is_arena_mode),
70
+ comparator_(comparator),
71
+ current_(nullptr),
72
+ minHeap_(CompactionHeapItemComparator(comparator_)),
73
+ pinned_iters_mgr_(nullptr) {
74
+ children_.resize(n);
75
+ for (int i = 0; i < n; i++) {
76
+ children_[i].level = i;
77
+ children_[i].iter.Set(children[i]);
78
+ assert(children_[i].type == HeapItem::ITERATOR);
79
+ }
80
+ assert(range_tombstones.size() == static_cast<size_t>(n));
81
+ for (auto& p : range_tombstones) {
82
+ range_tombstone_iters_.push_back(p.first);
83
+ }
84
+
85
+ pinned_heap_item_.resize(n);
86
+ for (int i = 0; i < n; ++i) {
87
+ if (range_tombstones[i].second) {
88
+ // for LevelIterator
89
+ *range_tombstones[i].second = &range_tombstone_iters_[i];
90
+ }
91
+ pinned_heap_item_[i].level = i;
92
+ pinned_heap_item_[i].type = HeapItem::DELETE_RANGE_START;
93
+ }
94
+ }
95
+
96
+ void considerStatus(const Status& s) {
97
+ if (!s.ok() && status_.ok()) {
98
+ status_ = s;
99
+ }
100
+ }
101
+
102
+ ~CompactionMergingIterator() override {
103
+ // TODO: use unique_ptr for range_tombstone_iters_
104
+ for (auto child : range_tombstone_iters_) {
105
+ delete child;
106
+ }
107
+
108
+ for (auto& child : children_) {
109
+ child.iter.DeleteIter(is_arena_mode_);
110
+ }
111
+ status_.PermitUncheckedError();
112
+ }
113
+
114
+ bool Valid() const override { return current_ != nullptr && status_.ok(); }
115
+
116
+ Status status() const override { return status_; }
117
+
118
+ void SeekToFirst() override;
119
+
120
+ void Seek(const Slice& target) override;
121
+
122
+ void Next() override;
123
+
124
+ Slice key() const override {
125
+ assert(Valid());
126
+ return current_->key();
127
+ }
128
+
129
+ Slice value() const override {
130
+ assert(Valid());
131
+ if (LIKELY(current_->type == HeapItem::ITERATOR)) {
132
+ return current_->iter.value();
133
+ } else {
134
+ return dummy_tombstone_val;
135
+ }
136
+ }
137
+
138
+ // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
139
+ // from current child iterator. Potentially as long as one of child iterator
140
+ // report out of bound is not possible, we know current key is within bound.
141
+ bool MayBeOutOfLowerBound() override {
142
+ assert(Valid());
143
+ return current_->type == HeapItem::DELETE_RANGE_START ||
144
+ current_->iter.MayBeOutOfLowerBound();
145
+ }
146
+
147
+ IterBoundCheck UpperBoundCheckResult() override {
148
+ assert(Valid());
149
+ return current_->type == HeapItem::DELETE_RANGE_START
150
+ ? IterBoundCheck::kUnknown
151
+ : current_->iter.UpperBoundCheckResult();
152
+ }
153
+
154
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
155
+ pinned_iters_mgr_ = pinned_iters_mgr;
156
+ for (auto& child : children_) {
157
+ child.iter.SetPinnedItersMgr(pinned_iters_mgr);
158
+ }
159
+ }
160
+
161
+ bool IsDeleteRangeSentinelKey() const override {
162
+ assert(Valid());
163
+ return current_->type == HeapItem::DELETE_RANGE_START;
164
+ }
165
+
166
+ // Compaction uses the above subset of InternalIterator interface.
167
+ void SeekToLast() override { assert(false); }
168
+
169
+ void SeekForPrev(const Slice&) override { assert(false); }
170
+
171
+ void Prev() override { assert(false); }
172
+
173
+ bool NextAndGetResult(IterateResult*) override {
174
+ assert(false);
175
+ return false;
176
+ }
177
+
178
+ bool IsKeyPinned() const override {
179
+ assert(false);
180
+ return false;
181
+ }
182
+
183
+ bool IsValuePinned() const override {
184
+ assert(false);
185
+ return false;
186
+ }
187
+
188
+ bool PrepareValue() override {
189
+ assert(false);
190
+ return false;
191
+ }
192
+
193
+ private:
194
+ bool is_arena_mode_;
195
+ const InternalKeyComparator* comparator_;
196
+ // HeapItem for all child point iterators.
197
+ std::vector<HeapItem> children_;
198
+ // HeapItem for range tombstones. pinned_heap_item_[i] corresponds to the
199
+ // current range tombstone from range_tombstone_iters_[i].
200
+ std::vector<HeapItem> pinned_heap_item_;
201
+ // range_tombstone_iters_[i] contains range tombstones in the sorted run that
202
+ // corresponds to children_[i]. range_tombstone_iters_[i] ==
203
+ // nullptr means the sorted run of children_[i] does not have range
204
+ // tombstones (or the current SSTable does not have range tombstones in the
205
+ // case of LevelIterator).
206
+ std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
207
+ // Used as value for range tombstone keys
208
+ std::string dummy_tombstone_val{};
209
+
210
+ // Skip file boundary sentinel keys.
211
+ void FindNextVisibleKey();
212
+
213
+ // top of minHeap_
214
+ HeapItem* current_;
215
+ // If any of the children have non-ok status, this is one of them.
216
+ Status status_;
217
+ CompactionMinHeap minHeap_;
218
+ PinnedIteratorsManager* pinned_iters_mgr_;
219
+ // Process a child that is not in the min heap.
220
+ // If valid, add to the min heap. Otherwise, check status.
221
+ void AddToMinHeapOrCheckStatus(HeapItem*);
222
+
223
+ HeapItem* CurrentForward() const {
224
+ return !minHeap_.empty() ? minHeap_.top() : nullptr;
225
+ }
226
+
227
+ void InsertRangeTombstoneAtLevel(size_t level) {
228
+ if (range_tombstone_iters_[level]->Valid()) {
229
+ pinned_heap_item_[level].SetTombstoneForCompaction(
230
+ range_tombstone_iters_[level]->start_key());
231
+ minHeap_.push(&pinned_heap_item_[level]);
232
+ }
233
+ }
234
+ };
235
+
236
+ InternalIterator* NewCompactionMergingIterator(
237
+ const InternalKeyComparator* comparator, InternalIterator** children, int n,
238
+ std::vector<std::pair<TruncatedRangeDelIterator*,
239
+ TruncatedRangeDelIterator***>>& range_tombstone_iters,
240
+ Arena* arena = nullptr);
241
+ } // namespace ROCKSDB_NAMESPACE
@@ -264,7 +264,8 @@ void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
264
264
  }
265
265
  }
266
266
 
267
- Status Footer::DecodeFrom(Slice input, uint64_t input_offset) {
267
+ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
268
+ uint64_t enforce_table_magic_number) {
268
269
  (void)input_offset; // Future use
269
270
 
270
271
  // Only decode to unused Footer
@@ -280,6 +281,11 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset) {
280
281
  if (legacy) {
281
282
  magic = UpconvertLegacyFooterFormat(magic);
282
283
  }
284
+ if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) {
285
+ return Status::Corruption("Bad table magic number: expected " +
286
+ std::to_string(enforce_table_magic_number) +
287
+ ", found " + std::to_string(magic));
288
+ }
283
289
  table_magic_number_ = magic;
284
290
  block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
285
291
 
@@ -346,7 +352,7 @@ std::string Footer::ToString() const {
346
352
  }
347
353
 
348
354
  Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
349
- FilePrefetchBuffer* prefetch_buffer,
355
+ FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
350
356
  uint64_t file_size, Footer* footer,
351
357
  uint64_t enforce_table_magic_number) {
352
358
  if (file_size < Footer::kMinEncodedLength) {
@@ -390,29 +396,27 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
390
396
  // Check that we actually read the whole footer from the file. It may be
391
397
  // that size isn't correct.
392
398
  if (footer_input.size() < Footer::kMinEncodedLength) {
393
- // FIXME: this error message is bad. We should be checking whether the
394
- // provided file_size matches what's on disk, at least in this case.
395
- // Unfortunately FileSystem/Env does not provide a way to get the size
396
- // of an open file, so getting file size requires a full path seek.
397
- return Status::Corruption("file is too short (" +
398
- std::to_string(file_size) +
399
- " bytes) to be an "
400
- "sstable" +
401
- file->file_name());
399
+ uint64_t size_on_disk = 0;
400
+ if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr)
401
+ .ok()) {
402
+ // Similar to CheckConsistency message, but not completely sure the
403
+ // expected size always came from manifest.
404
+ return Status::Corruption("Sst file size mismatch: " + file->file_name() +
405
+ ". Expected " + std::to_string(file_size) +
406
+ ", actual size " +
407
+ std::to_string(size_on_disk) + "\n");
408
+ } else {
409
+ return Status::Corruption(
410
+ "Missing SST footer data in file " + file->file_name() +
411
+ " File too short? Expected size: " + std::to_string(file_size));
412
+ }
402
413
  }
403
414
 
404
- s = footer->DecodeFrom(footer_input, read_offset);
415
+ s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number);
405
416
  if (!s.ok()) {
417
+ s = Status::CopyAppendMessage(s, " in ", file->file_name());
406
418
  return s;
407
419
  }
408
- if (enforce_table_magic_number != 0 &&
409
- enforce_table_magic_number != footer->table_magic_number()) {
410
- return Status::Corruption("Bad table magic number: expected " +
411
- std::to_string(enforce_table_magic_number) +
412
- ", found " +
413
- std::to_string(footer->table_magic_number()) +
414
- " in " + file->file_name());
415
- }
416
420
  return Status::OK();
417
421
  }
418
422
 
@@ -138,7 +138,10 @@ class Footer {
138
138
  // Deserialize a footer (populate fields) from `input` and check for various
139
139
  // corruptions. `input_offset` is the offset within the target file of
140
140
  // `input` buffer (future use).
141
- Status DecodeFrom(Slice input, uint64_t input_offset);
141
+ // If enforce_table_magic_number != 0, will return corruption if table magic
142
+ // number is not equal to enforce_table_magic_number.
143
+ Status DecodeFrom(Slice input, uint64_t input_offset,
144
+ uint64_t enforce_table_magic_number = 0);
142
145
 
143
146
  // Table magic number identifies file as RocksDB SST file and which kind of
144
147
  // SST format is use.
@@ -238,7 +241,7 @@ class FooterBuilder {
238
241
  // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
239
242
  // corruption if table_magic number is not equal to enforce_table_magic_number
240
243
  Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
241
- FilePrefetchBuffer* prefetch_buffer,
244
+ FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
242
245
  uint64_t file_size, Footer* footer,
243
246
  uint64_t enforce_table_magic_number = 0);
244
247
 
@@ -10,98 +10,26 @@
10
10
  #include "table/merging_iterator.h"
11
11
 
12
12
  #include "db/arena_wrapped_db_iter.h"
13
- #include "db/dbformat.h"
14
- #include "db/pinned_iterators_manager.h"
15
- #include "memory/arena.h"
16
- #include "monitoring/perf_context_imp.h"
17
- #include "rocksdb/comparator.h"
18
- #include "rocksdb/iterator.h"
19
- #include "rocksdb/options.h"
20
- #include "table/internal_iterator.h"
21
- #include "table/iter_heap.h"
22
- #include "table/iterator_wrapper.h"
23
- #include "test_util/sync_point.h"
24
- #include "util/autovector.h"
25
- #include "util/heap.h"
26
- #include "util/stop_watch.h"
27
13
 
28
14
  namespace ROCKSDB_NAMESPACE {
29
- // For merging iterator to process range tombstones, we treat the start and end
30
- // keys of a range tombstone as point keys and put them into the minHeap/maxHeap
31
- // used in merging iterator. Take minHeap for example, we are able to keep track
32
- // of currently "active" range tombstones (the ones whose start keys are popped
33
- // but end keys are still in the heap) in `active_`. This `active_` set of range
34
- // tombstones is then used to quickly determine whether the point key at heap
35
- // top is deleted (by heap property, the point key at heap top must be within
36
- // internal key range of active range tombstones).
37
- //
38
- // The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
39
- // point key and the start and end keys of a range tombstone.
40
- struct HeapItem {
41
- HeapItem() = default;
42
-
43
- enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
44
- IteratorWrapper iter;
45
- size_t level = 0;
46
- std::string pinned_key;
47
- // Will be overwritten before use, initialize here so compiler does not
48
- // complain.
49
- Type type = ITERATOR;
50
-
51
- explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
52
- : level(_level), type(Type::ITERATOR) {
53
- iter.Set(_iter);
54
- }
55
-
56
- void SetTombstoneKey(ParsedInternalKey&& pik) {
57
- pinned_key.clear();
58
- // Range tombstone end key is exclusive. If a point internal key has the
59
- // same user key and sequence number as the start or end key of a range
60
- // tombstone, the order will be start < end key < internal key with the
61
- // following op_type change. This is helpful to ensure keys popped from
62
- // heap are in expected order since range tombstone start/end keys will
63
- // be distinct from point internal keys. Strictly speaking, this is only
64
- // needed for tombstone end points that are truncated in
65
- // TruncatedRangeDelIterator since untruncated tombstone end points always
66
- // have kMaxSequenceNumber and kTypeRangeDeletion (see
67
- // TruncatedRangeDelIterator::start_key()/end_key()).
68
- ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid);
69
- AppendInternalKey(&pinned_key, p);
70
- }
71
-
72
- Slice key() const {
73
- if (type == Type::ITERATOR) {
74
- return iter.key();
75
- }
76
- return pinned_key;
77
- }
78
-
79
- bool IsDeleteRangeSentinelKey() const {
80
- if (type == Type::ITERATOR) {
81
- return iter.IsDeleteRangeSentinelKey();
82
- }
83
- return false;
84
- }
85
- };
86
-
87
- class MinHeapItemComparator {
88
- public:
89
- MinHeapItemComparator(const InternalKeyComparator* comparator)
90
- : comparator_(comparator) {}
91
- bool operator()(HeapItem* a, HeapItem* b) const {
92
- return comparator_->Compare(a->key(), b->key()) > 0;
93
- }
94
-
95
- private:
96
- const InternalKeyComparator* comparator_;
97
- };
98
-
99
15
  class MaxHeapItemComparator {
100
16
  public:
101
17
  MaxHeapItemComparator(const InternalKeyComparator* comparator)
102
18
  : comparator_(comparator) {}
103
19
  bool operator()(HeapItem* a, HeapItem* b) const {
104
- return comparator_->Compare(a->key(), b->key()) < 0;
20
+ if (LIKELY(a->type == HeapItem::ITERATOR)) {
21
+ if (LIKELY(b->type == HeapItem::ITERATOR)) {
22
+ return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
23
+ } else {
24
+ return comparator_->Compare(a->iter.key(), b->parsed_ikey) < 0;
25
+ }
26
+ } else {
27
+ if (LIKELY(b->type == HeapItem::ITERATOR)) {
28
+ return comparator_->Compare(a->parsed_ikey, b->iter.key()) < 0;
29
+ } else {
30
+ return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
31
+ }
32
+ }
105
33
  }
106
34
 
107
35
  private:
@@ -109,7 +37,6 @@ class MaxHeapItemComparator {
109
37
  };
110
38
  // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
111
39
  namespace {
112
- using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
113
40
  using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
114
41
  } // namespace
115
42
 
@@ -117,14 +44,16 @@ class MergingIterator : public InternalIterator {
117
44
  public:
118
45
  MergingIterator(const InternalKeyComparator* comparator,
119
46
  InternalIterator** children, int n, bool is_arena_mode,
120
- bool prefix_seek_mode)
47
+ bool prefix_seek_mode,
48
+ const Slice* iterate_upper_bound = nullptr)
121
49
  : is_arena_mode_(is_arena_mode),
122
50
  prefix_seek_mode_(prefix_seek_mode),
123
51
  direction_(kForward),
124
52
  comparator_(comparator),
125
53
  current_(nullptr),
126
- minHeap_(comparator_),
127
- pinned_iters_mgr_(nullptr) {
54
+ minHeap_(MinHeapItemComparator(comparator_)),
55
+ pinned_iters_mgr_(nullptr),
56
+ iterate_upper_bound_(iterate_upper_bound) {
128
57
  children_.resize(n);
129
58
  for (int i = 0; i < n; i++) {
130
59
  children_[i].level = i;
@@ -175,6 +104,17 @@ class MergingIterator : public InternalIterator {
175
104
  pinned_heap_item_.resize(range_tombstone_iters_.size());
176
105
  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
177
106
  pinned_heap_item_[i].level = i;
107
+ // Range tombstone end key is exclusive. If a point internal key has the
108
+ // same user key and sequence number as the start or end key of a range
109
+ // tombstone, the order will be start < end key < internal key with the
110
+ // following op_type change. This is helpful to ensure keys popped from
111
+ // heap are in expected order since range tombstone start/end keys will
112
+ // be distinct from point internal keys. Strictly speaking, this is only
113
+ // needed for tombstone end points that are truncated in
114
+ // TruncatedRangeDelIterator since untruncated tombstone end points
115
+ // always have kMaxSequenceNumber and kTypeRangeDeletion (see
116
+ // TruncatedRangeDelIterator::start_key()/end_key()).
117
+ pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid;
178
118
  }
179
119
  }
180
120
  }
@@ -202,11 +142,26 @@ class MergingIterator : public InternalIterator {
202
142
  assert(!range_tombstone_iters_.empty() &&
203
143
  range_tombstone_iters_[level]->Valid());
204
144
  if (start_key) {
205
- pinned_heap_item_[level].SetTombstoneKey(
206
- range_tombstone_iters_[level]->start_key());
145
+ ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
146
+ // iterate_upper_bound does not have timestamp
147
+ if (iterate_upper_bound_ &&
148
+ comparator_->user_comparator()->CompareWithoutTimestamp(
149
+ pik.user_key, true /* a_has_ts */, *iterate_upper_bound_,
150
+ false /* b_has_ts */) >= 0) {
151
+ if (replace_top) {
152
+ // replace_top implies this range tombstone iterator is still in
153
+ // minHeap_ and at the top.
154
+ minHeap_.pop();
155
+ }
156
+ return;
157
+ }
158
+ pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
207
159
  pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
208
160
  assert(active_.count(level) == 0);
209
161
  } else {
162
+ // allow end key to go over upper bound (if present) since start key is
163
+ // before upper bound and the range tombstone could still cover a
164
+ // range before upper bound.
210
165
  pinned_heap_item_[level].SetTombstoneKey(
211
166
  range_tombstone_iters_[level]->end_key());
212
167
  pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
@@ -251,6 +206,7 @@ class MergingIterator : public InternalIterator {
251
206
  void PopDeleteRangeStart() {
252
207
  while (!minHeap_.empty() &&
253
208
  minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
209
+ TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
254
210
  // insert end key of this range tombstone and updates active_
255
211
  InsertRangeTombstoneToMinHeap(
256
212
  minHeap_.top()->level, false /* start_key */, true /* replace_top */);
@@ -573,6 +529,10 @@ class MergingIterator : public InternalIterator {
573
529
  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
574
530
  PinnedIteratorsManager* pinned_iters_mgr_;
575
531
 
532
+ // Used to bound range tombstones. For point keys, DBIter and SSTable iterator
533
+ // take care of boundary checking.
534
+ const Slice* iterate_upper_bound_;
535
+
576
536
  // In forward direction, process a child that is not in the min heap.
577
537
  // If valid, add to the min heap. Otherwise, check status.
578
538
  void AddToMinHeapOrCheckStatus(HeapItem*);
@@ -634,9 +594,19 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
634
594
  for (size_t level = 0; level < starting_level; ++level) {
635
595
  if (range_tombstone_iters_[level] &&
636
596
  range_tombstone_iters_[level]->Valid()) {
637
- assert(static_cast<bool>(active_.count(level)) ==
638
- (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END));
639
- minHeap_.push(&pinned_heap_item_[level]);
597
+ // use an iterator on active_ if performance becomes an issue here
598
+ if (active_.count(level) > 0) {
599
+ assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
600
+ // if it was active, then start key must be within upper_bound,
601
+ // so we can add to minHeap_ directly.
602
+ minHeap_.push(&pinned_heap_item_[level]);
603
+ } else {
604
+ // this takes care of checking iterate_upper_bound, but with an extra
605
+ // key comparison if range_tombstone_iters_[level] was already out of
606
+ // bound. Consider using a new HeapItem type or some flag to remember
607
+ // boundary checking result.
608
+ InsertRangeTombstoneToMinHeap(level);
609
+ }
640
610
  } else {
641
611
  assert(!active_.count(level));
642
612
  }
@@ -792,14 +762,18 @@ bool MergingIterator::SkipNextDeleted() {
792
762
  // SetTombstoneKey()).
793
763
  assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
794
764
  active_.count(current->level) == 0);
795
- // LevelIterator enters a new SST file
796
- current->iter.Next();
797
- if (current->iter.Valid()) {
798
- assert(current->iter.status().ok());
799
- minHeap_.replace_top(current);
800
- } else {
801
- minHeap_.pop();
802
- }
765
+ // When entering a new file, old range tombstone iter is freed,
766
+ // but the last key from that range tombstone iter may still be in the heap.
767
+ // We need to ensure the data underlying its corresponding key Slice is
768
+ // still alive. We do so by popping the range tombstone key from heap before
769
+ // calling iter->Next(). Technically, this change is not needed: if there is
770
+ // a range tombstone end key that is after file boundary sentinel key in
771
+ // minHeap_, the range tombstone end key must have been truncated at file
772
+ // boundary. The underlying data of the range tombstone end key Slice is the
773
+ // SST file's largest internal key stored as file metadata in Version.
774
+ // However, since there are too many implicit assumptions made, it is safer
775
+ // to just ensure range tombstone iter is still alive.
776
+ minHeap_.pop();
803
777
  // Remove last SST file's range tombstone end key if there is one.
804
778
  // This means file boundary is before range tombstone end key,
805
779
  // which could happen when a range tombstone and a user key
@@ -810,6 +784,12 @@ bool MergingIterator::SkipNextDeleted() {
810
784
  minHeap_.pop();
811
785
  active_.erase(current->level);
812
786
  }
787
+ // LevelIterator enters a new SST file
788
+ current->iter.Next();
789
+ if (current->iter.Valid()) {
790
+ assert(current->iter.status().ok());
791
+ minHeap_.push(current);
792
+ }
813
793
  if (range_tombstone_iters_[current->level] &&
814
794
  range_tombstone_iters_[current->level]->Valid()) {
815
795
  InsertRangeTombstoneToMinHeap(current->level);
@@ -1006,18 +986,19 @@ bool MergingIterator::SkipPrevDeleted() {
1006
986
  }
1007
987
  if (current->iter.IsDeleteRangeSentinelKey()) {
1008
988
  // LevelIterator enters a new SST file
1009
- current->iter.Prev();
1010
- if (current->iter.Valid()) {
1011
- assert(current->iter.status().ok());
1012
- maxHeap_->replace_top(current);
1013
- } else {
1014
- maxHeap_->pop();
1015
- }
989
+ maxHeap_->pop();
990
+ // Remove last SST file's range tombstone key if there is one.
1016
991
  if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
1017
992
  maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
1018
993
  maxHeap_->pop();
1019
994
  active_.erase(current->level);
1020
995
  }
996
+ current->iter.Prev();
997
+ if (current->iter.Valid()) {
998
+ assert(current->iter.status().ok());
999
+ maxHeap_->push(current);
1000
+ }
1001
+
1021
1002
  if (range_tombstone_iters_[current->level] &&
1022
1003
  range_tombstone_iters_[current->level]->Valid()) {
1023
1004
  InsertRangeTombstoneToMaxHeap(current->level);
@@ -1111,7 +1092,7 @@ void MergingIterator::SwitchToForward() {
1111
1092
  if (child.iter.status() == Status::TryAgain()) {
1112
1093
  continue;
1113
1094
  }
1114
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1095
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1115
1096
  assert(child.iter.status().ok());
1116
1097
  child.iter.Next();
1117
1098
  }
@@ -1122,7 +1103,7 @@ void MergingIterator::SwitchToForward() {
1122
1103
  for (auto& child : children_) {
1123
1104
  if (child.iter.status() == Status::TryAgain()) {
1124
1105
  child.iter.Seek(target);
1125
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1106
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1126
1107
  assert(child.iter.status().ok());
1127
1108
  child.iter.Next();
1128
1109
  }
@@ -1173,7 +1154,7 @@ void MergingIterator::SwitchToBackward() {
1173
1154
  if (&child.iter != current_) {
1174
1155
  child.iter.SeekForPrev(target);
1175
1156
  TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
1176
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1157
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1177
1158
  assert(child.iter.status().ok());
1178
1159
  child.iter.Prev();
1179
1160
  }
@@ -1280,11 +1261,12 @@ InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp,
1280
1261
  }
1281
1262
 
1282
1263
  MergeIteratorBuilder::MergeIteratorBuilder(
1283
- const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode)
1264
+ const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode,
1265
+ const Slice* iterate_upper_bound)
1284
1266
  : first_iter(nullptr), use_merging_iter(false), arena(a) {
1285
1267
  auto mem = arena->AllocateAligned(sizeof(MergingIterator));
1286
- merge_iter =
1287
- new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode);
1268
+ merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true,
1269
+ prefix_seek_mode, iterate_upper_bound);
1288
1270
  }
1289
1271
 
1290
1272
  MergeIteratorBuilder::~MergeIteratorBuilder() {