@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/binding.cc +32 -2
  2. package/binding.gyp +8 -0
  3. package/deps/liburing/liburing.gyp +20 -0
  4. package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +7 -0
  6. package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
  7. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
  8. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
  9. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
  10. package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
  11. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
  12. package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
  13. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
  14. package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
  15. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
  16. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
  18. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
  19. package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
  20. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
  21. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
  22. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
  23. package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
  25. package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
  26. package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
  27. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
  28. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
  29. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
  30. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
  31. package/deps/rocksdb/rocksdb/db/c.cc +90 -1
  32. package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
  33. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  34. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
  38. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
  39. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
  41. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
  42. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
  43. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
  44. package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
  45. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
  46. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
  47. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
  48. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
  49. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
  50. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
  51. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
  52. package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
  53. package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
  54. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
  55. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
  56. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
  57. package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
  58. package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
  59. package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
  60. package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
  61. package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
  62. package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
  63. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
  64. package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
  65. package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
  66. package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
  67. package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
  68. package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
  69. package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
  70. package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
  71. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
  72. package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
  73. package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
  74. package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
  75. package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
  76. package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
  77. package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
  78. package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
  79. package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
  80. package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
  81. package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
  82. package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
  83. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
  84. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
  85. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
  86. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
  87. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
  88. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
  89. package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
  90. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
  91. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
  92. package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
  93. package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
  94. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
  95. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
  96. package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
  97. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
  98. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
  99. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
  100. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
  101. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
  102. package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
  103. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
  104. package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
  105. package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
  106. package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
  107. package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
  108. package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
  109. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
  110. package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
  111. package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
  112. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
  113. package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
  114. package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
  115. package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
  116. package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
  117. package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
  118. package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
  119. package/deps/rocksdb/rocksdb/src.mk +4 -0
  120. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
  121. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
  122. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
  123. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
  124. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
  125. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
  126. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
  127. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
  128. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
  129. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
  130. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
  131. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
  132. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
  133. package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
  134. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
  135. package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
  136. package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
  137. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
  138. package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
  139. package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
  140. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
  141. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
  142. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
  143. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
  144. package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
  145. package/deps/rocksdb/rocksdb.gyp +7 -1
  146. package/package.json +1 -1
  147. package/prebuilds/linux-x64/node.napi.node +0 -0
package/deps/rocksdb/rocksdb/table/merging_iterator.cc
@@ -10,121 +10,45 @@
  #include "table/merging_iterator.h"

  #include "db/arena_wrapped_db_iter.h"
- #include "db/dbformat.h"
- #include "db/pinned_iterators_manager.h"
- #include "memory/arena.h"
- #include "monitoring/perf_context_imp.h"
- #include "rocksdb/comparator.h"
- #include "rocksdb/iterator.h"
- #include "rocksdb/options.h"
- #include "table/internal_iterator.h"
- #include "table/iter_heap.h"
- #include "table/iterator_wrapper.h"
- #include "test_util/sync_point.h"
- #include "util/autovector.h"
- #include "util/heap.h"
- #include "util/stop_watch.h"

  namespace ROCKSDB_NAMESPACE {
- // For merging iterator to process range tombstones, we treat the start and end
- // keys of a range tombstone as point keys and put them into the minHeap/maxHeap
- // used in merging iterator. Take minHeap for example, we are able to keep track
- // of currently "active" range tombstones (the ones whose start keys are popped
- // but end keys are still in the heap) in `active_`. This `active_` set of range
- // tombstones is then used to quickly determine whether the point key at heap
- // top is deleted (by heap property, the point key at heap top must be within
- // internal key range of active range tombstones).
+ // MergingIterator uses a min/max heap to combine data from point iterators.
+ // Range tombstones can be added and keys covered by range tombstones will be
+ // skipped.
  //
- // The HeapItem struct represents 3 types of elements in the minHeap/maxHeap:
- // point key and the start and end keys of a range tombstone.
- struct HeapItem {
- HeapItem() = default;
-
- enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
- IteratorWrapper iter;
- size_t level = 0;
- ParsedInternalKey parsed_ikey;
- // Will be overwritten before use, initialize here so compiler does not
- // complain.
- Type type = ITERATOR;
-
- explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
- : level(_level), type(Type::ITERATOR) {
- iter.Set(_iter);
- }
-
- void SetTombstoneKey(ParsedInternalKey&& pik) {
- // op_type is already initialized in MergingIterator::Finish().
- parsed_ikey.user_key = pik.user_key;
- parsed_ikey.sequence = pik.sequence;
- }
-
- Slice key() const {
- assert(type == ITERATOR);
- return iter.key();
- }
-
- bool IsDeleteRangeSentinelKey() const {
- if (type == Type::ITERATOR) {
- return iter.IsDeleteRangeSentinelKey();
- }
- return false;
- }
- };
-
- class MinHeapItemComparator {
- public:
- MinHeapItemComparator(const InternalKeyComparator* comparator)
- : comparator_(comparator) {}
- bool operator()(HeapItem* a, HeapItem* b) const {
- if (LIKELY(a->type == HeapItem::ITERATOR)) {
- if (LIKELY(b->type == HeapItem::ITERATOR)) {
- return comparator_->Compare(a->key(), b->key()) > 0;
- } else {
- return comparator_->Compare(a->key(), b->parsed_ikey) > 0;
- }
- } else {
- if (LIKELY(b->type == HeapItem::ITERATOR)) {
- return comparator_->Compare(a->parsed_ikey, b->key()) > 0;
- } else {
- return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0;
- }
- }
- }
-
- private:
- const InternalKeyComparator* comparator_;
- };
-
- class MaxHeapItemComparator {
- public:
- MaxHeapItemComparator(const InternalKeyComparator* comparator)
- : comparator_(comparator) {}
- bool operator()(HeapItem* a, HeapItem* b) const {
- if (LIKELY(a->type == HeapItem::ITERATOR)) {
- if (LIKELY(b->type == HeapItem::ITERATOR)) {
- return comparator_->Compare(a->key(), b->key()) < 0;
- } else {
- return comparator_->Compare(a->key(), b->parsed_ikey) < 0;
- }
- } else {
- if (LIKELY(b->type == HeapItem::ITERATOR)) {
- return comparator_->Compare(a->parsed_ikey, b->key()) < 0;
- } else {
- return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
- }
- }
- }
-
- private:
- const InternalKeyComparator* comparator_;
- };
- // Without anonymous namespace here, we fail the warning -Wmissing-prototypes
- namespace {
- using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
- using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
- } // namespace
-
+ // The following are implementation details and can be ignored by user.
+ // For merging iterator to process range tombstones, it treats the start and end
+ // keys of a range tombstone as two keys and put them into minHeap_ or maxHeap_
+ // together with regular point keys. Each range tombstone is active only within
+ // its internal key range [start_key, end_key). An `active_` set is used to
+ // track levels that have an active range tombstone. Take forward scanning
+ // for example. Level j is in active_ if its current range tombstone has its
+ // start_key popped from minHeap_ and its end_key in minHeap_. If the top of
+ // minHeap_ is a point key from level L, we can determine if the point key is
+ // covered by any range tombstone by checking if there is an l <= L in active_.
+ // The case of l == L also involves checking range tombstone's sequence number.
+ //
+ // The following (non-exhaustive) list of invariants are maintained by
+ // MergingIterator during forward scanning. After each InternalIterator API,
+ // i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty:
+ // (1) minHeap_.top().type == ITERATOR
+ // (2) minHeap_.top()->key() is not covered by any range tombstone.
+ //
+ // After each call to SeekImpl() in addition to the functions mentioned above:
+ // (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() <
+ // children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before
+ // the first range tombstone from level j with end_key() >
+ // children_[i].iter.key().
+ // (4) For all level i and j <= i, if j in active_, then
+ // range_tombstone_iters_[j]->start_key() < children_[i].iter.key().
+ // - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be
+ // the last range tombstone from that range tombstone iterator.
+ // - When referring to range tombstone start/end keys, assume it is the value of
+ // HeapItem::tombstone_pik. This value has op_type = kMaxValid, which makes
+ // range tombstone keys have distinct values from point keys.
+ //
+ // Applicable class variables have their own (forward scanning) invariants
+ // listed in the comments above their definition.
  class MergingIterator : public InternalIterator {
  public:
  MergingIterator(const InternalKeyComparator* comparator,
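The new comment block above describes the central idea of this change: range tombstone start and end keys share the same min/max heap as point keys, and an `active_` set of levels answers "is the heap-top point key covered?" in O(1). A minimal, self-contained sketch of that idea follows. It is not code from this package; names such as ToyItem and ToyMinOrder are hypothetical, it uses plain user keys instead of internal keys, and it ignores sequence numbers and file-boundary sentinels.

#include <iostream>
#include <queue>
#include <set>
#include <string>
#include <vector>

enum class ToyType { kPoint, kTombstoneStart, kTombstoneEnd };

struct ToyItem {
  std::string key;  // user key only; the real code orders internal keys
  size_t level;     // smaller level == newer sorted run
  ToyType type;
};

struct ToyMinOrder {  // makes std::priority_queue behave as a min-heap on key
  bool operator()(const ToyItem& a, const ToyItem& b) const {
    return a.key > b.key;
  }
};

int main() {
  std::priority_queue<ToyItem, std::vector<ToyItem>, ToyMinOrder> min_heap;
  // Level 0 carries a range tombstone [b, d); level 1 carries point keys.
  min_heap.push({"b", 0, ToyType::kTombstoneStart});
  min_heap.push({"d", 0, ToyType::kTombstoneEnd});
  for (const char* k : {"a", "c", "e"}) {
    min_heap.push({k, 1, ToyType::kPoint});
  }
  std::set<size_t> active;  // levels whose tombstone start was popped
  while (!min_heap.empty()) {
    ToyItem top = min_heap.top();
    min_heap.pop();
    if (top.type == ToyType::kTombstoneStart) {
      active.insert(top.level);  // [start, end) now covers this key range
    } else if (top.type == ToyType::kTombstoneEnd) {
      active.erase(top.level);   // past end: tombstone no longer covers
    } else {
      // A point key is covered if some level l <= its level is active; the
      // real iterator additionally compares sequence numbers when l == level.
      bool covered = !active.empty() && *active.begin() <= top.level;
      std::cout << top.key << (covered ? " covered\n" : " visible\n");
    }
  }
  // Prints: "a visible", "c covered", "e visible".
}

The real HeapItem additionally pins the tombstone start/end keys in pinned_heap_item_ so that no per-item allocation is needed once Finish() has run.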
@@ -136,7 +60,7 @@ class MergingIterator : public InternalIterator {
  direction_(kForward),
  comparator_(comparator),
  current_(nullptr),
- minHeap_(comparator_),
+ minHeap_(MinHeapItemComparator(comparator_)),
  pinned_iters_mgr_(nullptr),
  iterate_upper_bound_(iterate_upper_bound) {
  children_.resize(n);
@@ -162,30 +86,26 @@ class MergingIterator : public InternalIterator {
  current_ = nullptr;
  }

- // Merging iterator can optionally process range tombstones: if a key is
- // covered by a range tombstone, the merging iterator will not output it but
- // skip it.
- //
- // Add the next range tombstone iterator to this merging iterator.
- // There must be either no range tombstone iterator, or same number of
- // range tombstone iterators as point iterators after all range tombstone
- // iters are added. The i-th added range tombstone iterator and the i-th point
- // iterator must point to the same sorted run.
- // Merging iterator takes ownership of the range tombstone iterator and
- // is responsible for freeing it. Note that during Iterator::Refresh()
- // and when a level iterator moves to a different SST file, the range
- // tombstone iterator could be updated. In that case, the merging iterator
- // is only responsible to freeing the new range tombstone iterator
- // that it has pointers to in range_tombstone_iters_.
+ // There must be either no range tombstone iterator or the same number of
+ // range tombstone iterators as point iterators after all iters are added.
+ // The i-th added range tombstone iterator and the i-th point iterator
+ // must point to the same LSM level.
+ // Merging iterator takes ownership of `iter` and is responsible for freeing
+ // it. One exception to this is when a LevelIterator moves to a different SST
+ // file or when Iterator::Refresh() is called, the range tombstone iterator
+ // could be updated. In that case, this merging iterator is only responsible
+ // for freeing the new range tombstone iterator that it has pointers to in
+ // range_tombstone_iters_.
  void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) {
  range_tombstone_iters_.emplace_back(iter);
  }

  // Called by MergingIteratorBuilder when all point iterators and range
  // tombstone iterators are added. Initializes HeapItems for range tombstone
- // iterators so that no further allocation is needed for HeapItem.
+ // iterators.
  void Finish() {
  if (!range_tombstone_iters_.empty()) {
+ assert(range_tombstone_iters_.size() == children_.size());
  pinned_heap_item_.resize(range_tombstone_iters_.size());
  for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
  pinned_heap_item_[i].level = i;
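The rewritten comment above tightens the AddRangeTombstoneIterator contract: either no range tombstone iterators are added, or exactly one per point iterator, with index i in both vectors referring to the same LSM level (nullptr marking a level with no tombstones). A tiny sketch of that pairing rule, using a hypothetical ToyMergingIter rather than the package's MergeIteratorBuilder API:

#include <cassert>
#include <cstddef>
#include <vector>

struct PointIter {};  // stand-in for a child point iterator
struct TombIter {};   // stand-in for TruncatedRangeDelIterator

struct ToyMergingIter {
  std::vector<PointIter*> children;
  std::vector<TombIter*> tombstones;  // empty, or one entry per child

  void AddIterator(PointIter* it) { children.push_back(it); }
  // nullptr is allowed: that level simply has no range tombstones.
  void AddRangeTombstoneIterator(TombIter* it) { tombstones.push_back(it); }

  // Mirrors the assert added to Finish() above: once tombstone iterators are
  // used at all, tombstones[i] must pair with children[i] for every i.
  void Finish() {
    assert(tombstones.empty() || tombstones.size() == children.size());
  }
};

int main() {
  PointIter p0, p1;
  TombIter t0;
  ToyMergingIter merger;
  merger.AddIterator(&p0);
  merger.AddIterator(&p1);
  merger.AddRangeTombstoneIterator(&t0);      // pairs with p0
  merger.AddRangeTombstoneIterator(nullptr);  // p1's level has no tombstones
  merger.Finish();                            // sizes match: contract holds
}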
@@ -199,7 +119,7 @@ class MergingIterator : public InternalIterator {
  // TruncatedRangeDelIterator since untruncated tombstone end points
  // always have kMaxSequenceNumber and kTypeRangeDeletion (see
  // TruncatedRangeDelIterator::start_key()/end_key()).
- pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid;
+ pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid;
  }
  }
  }
@@ -221,12 +141,18 @@ class MergingIterator : public InternalIterator {

  // Add range_tombstone_iters_[level] into min heap.
  // Updates active_ if the end key of a range tombstone is inserted.
+ // pinned_heap_items_[level].type is updated based on `start_key`.
+ //
+ // If range_tombstone_iters_[level] is after iterate_upper_bound_,
+ // it is removed from the heap.
  // @param start_key specifies which end point of the range tombstone to add.
  void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
  bool replace_top = false) {
  assert(!range_tombstone_iters_.empty() &&
  range_tombstone_iters_[level]->Valid());
+ // Maintains Invariant(phi)
  if (start_key) {
+ pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
  ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
  // iterate_upper_bound does not have timestamp
  if (iterate_upper_bound_ &&
@@ -241,15 +167,16 @@ class MergingIterator : public InternalIterator {
  return;
  }
  pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
- pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
+ // Checks Invariant(active_)
  assert(active_.count(level) == 0);
  } else {
  // allow end key to go over upper bound (if present) since start key is
  // before upper bound and the range tombstone could still cover a
  // range before upper bound.
+ // Maintains Invariant(active_)
  pinned_heap_item_[level].SetTombstoneKey(
  range_tombstone_iters_[level]->end_key());
- pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
+ pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
  active_.insert(level);
  }
  if (replace_top) {
@@ -269,12 +196,12 @@ class MergingIterator : public InternalIterator {
  if (end_key) {
  pinned_heap_item_[level].SetTombstoneKey(
  range_tombstone_iters_[level]->end_key());
- pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
+ pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
  assert(active_.count(level) == 0);
  } else {
  pinned_heap_item_[level].SetTombstoneKey(
  range_tombstone_iters_[level]->start_key());
- pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
+ pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
  active_.insert(level);
  }
  if (replace_top) {
@@ -290,9 +217,12 @@ class MergingIterator : public InternalIterator {
  // so `active_` is updated accordingly.
  void PopDeleteRangeStart() {
  while (!minHeap_.empty() &&
- minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
+ minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) {
  TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
- // insert end key of this range tombstone and updates active_
+ // Invariant(rti) holds since
+ // range_tombstone_iters_[minHeap_.top()->level] is still valid, and
+ // parameter `replace_top` is set to true here to ensure only one such
+ // HeapItem is in minHeap_.
  InsertRangeTombstoneToMinHeap(
  minHeap_.top()->level, false /* start_key */, true /* replace_top */);
  }
@@ -304,7 +234,7 @@ class MergingIterator : public InternalIterator {
  // so `active_` is updated accordingly.
  void PopDeleteRangeEnd() {
  while (!maxHeap_->empty() &&
- maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
+ maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) {
  // insert start key of this range tombstone and updates active_
  InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
  true /* replace_top */);
@@ -359,44 +289,25 @@ class MergingIterator : public InternalIterator {
  // Position this merging iterator at the first key >= target (internal key).
  // If range tombstones are present, keys covered by range tombstones are
  // skipped, and this merging iter points to the first non-range-deleted key >=
- // target after Seek(). If !Valid() and status().ok() then end of the iterator
- // is reached.
- //
- // Internally, this involves positioning all child iterators at the first key
- // >= target. If range tombstones are present, we apply a similar
- // optimization, cascading seek, as in Pebble
- // (https://github.com/cockroachdb/pebble). Specifically, if there is a range
- // tombstone [start, end) that covers the target user key at level L, then
- // this range tombstone must cover the range [target key, end) in all levels >
- // L. So for all levels > L, we can pretend the target key is `end`. This
- // optimization is applied at each level and hence the name "cascading seek".
- // After a round of (cascading) seeks, the top of the heap is checked to see
- // if it is covered by a range tombstone (see FindNextVisibleKey() for more
- // detail), and advanced if so. The process is repeated until a
- // non-range-deleted key is at the top of the heap, or heap becomes empty.
+ // target after Seek(). If !Valid() and status().ok() then this iterator
+ // reaches the end.
  //
- // As mentioned in comments above HeapItem, to make the checking of whether
- // top of the heap is covered by some range tombstone efficient, we treat each
- // range deletion [start, end) as two point keys and insert them into the same
- // min/maxHeap_ where point iterators are. The set `active_` tracks the levels
- // that have active range tombstones. If level L is in `active_`, and the
- // point key at top of the heap is from level >= L, then the point key is
- // within the internal key range of the range tombstone that
- // range_tombstone_iters_[L] currently points to. For correctness reasoning,
- // one invariant that Seek() (and every other public APIs Seek*(),
- // Next/Prev()) guarantees is as follows. After Seek(), suppose `k` is the
- // current key of level L's point iterator. Then for each range tombstone
- // iterator at level <= L, it is at or before the first range tombstone with
- // end key > `k`. This ensures that when level L's point iterator reaches top
- // of the heap, `active_` is calculated correctly (it contains the covering
- // range tombstone's level if there is one), since no range tombstone iterator
- // was skipped beyond that point iterator's current key during Seek().
- // Next()/Prev() maintains a stronger version of this invariant where all
- // range tombstone iterators from level <= L are *at* the first range
- // tombstone with end key > `k`.
+ // If range tombstones are present, cascading seeks may be called (an
+ // optimization adapted from Pebble https://github.com/cockroachdb/pebble).
+ // Roughly, if there is a range tombstone [start, end) that covers the
+ // target user key at level L, then this range tombstone must cover the range
+ // [target key, end) in all levels > L. So for all levels > L, we can pretend
+ // the target key is `end`. This optimization is applied at each level and
+ // hence the name "cascading seek".
  void Seek(const Slice& target) override {
- assert(range_tombstone_iters_.empty() ||
- range_tombstone_iters_.size() == children_.size());
+ // Define LevelNextVisible(i, k) to be the first key >= k in level i that is
+ // not covered by any range tombstone.
+ // After SeekImpl(target, 0), invariants (3) and (4) hold.
+ // For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i,
+ // target). By the contract of FindNextVisibleKey(), Invariants (1)-(4)
+ // holds after this call, and minHeap_.top().iter points to the
+ // first key >= target among children_ that is not covered by any range
+ // tombstone.
  SeekImpl(target);
  FindNextVisibleKey();

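The shortened Seek() comment above keeps the core of the cascading seek optimization: once a range tombstone [start, end) at level L covers the current search key, every older level can be sought to `end` instead of the original target. A schematic stand-alone version of that control flow follows; ToyLevel and CascadingSeek are hypothetical names, it works on plain user keys, and it omits the heap maintenance and perf_context accounting the real SeekImpl() does.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// One sorted run: sorted point keys plus at most one range tombstone
// [tomb_start, tomb_end).
struct ToyLevel {
  std::vector<std::string> keys;
  bool has_tombstone = false;
  std::string tomb_start, tomb_end;
};

// Position every level (newest first) at the first key >= target, advancing
// the search key to a covering tombstone's end for all older levels.
std::vector<std::string> CascadingSeek(const std::vector<ToyLevel>& levels,
                                       const std::string& target) {
  std::vector<std::string> positions;
  std::string search_key = target;  // only ever moves forward
  for (const ToyLevel& level : levels) {
    auto it =
        std::lower_bound(level.keys.begin(), level.keys.end(), search_key);
    positions.push_back(it == level.keys.end() ? "<end>" : *it);
    if (level.has_tombstone && level.tomb_start <= search_key &&
        search_key < level.tomb_end) {
      // search_key is deleted at this level, so no older level can contain a
      // visible key in [search_key, tomb_end): pretend the target is tomb_end.
      search_key = level.tomb_end;
    }
  }
  return positions;
}

int main() {
  std::vector<ToyLevel> levels = {
      {{"a", "z"}, true, "b", "p"},  // newest level, tombstone [b, p)
      {{"c", "q"}, false, "", ""},   // older level
  };
  for (const std::string& pos : CascadingSeek(levels, "c")) {
    std::cout << pos << "\n";  // "z", then "q": the older level skipped "c"
  }
}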
@@ -424,7 +335,7 @@ class MergingIterator : public InternalIterator {
  assert(Valid());
  // Ensure that all children are positioned after key().
  // If we are moving in the forward direction, it is already
- // true for all of the non-current children since current_ is
+ // true for all the non-current children since current_ is
  // the smallest child and key() == current_->key().
  if (direction_ != kForward) {
  // The loop advanced all non-current children to be > key() so current_
@@ -448,6 +359,12 @@ class MergingIterator : public InternalIterator {
  considerStatus(current_->status());
  minHeap_.pop();
  }
+ // Invariants (3) and (4) hold when after advancing current_.
+ // Let k be the smallest key among children_[i].iter.key().
+ // k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all
+ // level i. After FindNextVisible(), Invariants (1)-(4) hold and
+ // minHeap_.top()->key() is the first key >= k from any children_ that is
+ // not covered by any range tombstone.
  FindNextVisibleKey();
  current_ = CurrentForward();
  }
@@ -467,7 +384,7 @@ class MergingIterator : public InternalIterator {
  assert(Valid());
  // Ensure that all children are positioned before key().
  // If we are moving in the reverse direction, it is already
- // true for all of the non-current children since current_ is
+ // true for all the non-current children since current_ is
  // the largest child and key() == current_->key().
  if (direction_ != kReverse) {
  // Otherwise, retreat the non-current children. We retreat current_
@@ -518,7 +435,6 @@ class MergingIterator : public InternalIterator {
  // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
  // from current child iterator. Potentially as long as one of child iterator
  // report out of bound is not possible, we know current key is within bound.
-
  bool MayBeOutOfLowerBound() override {
  assert(Valid());
  return current_->MayBeOutOfLowerBound();
@@ -549,20 +465,108 @@ class MergingIterator : public InternalIterator {
  }

  private:
+ // Represents an element in the min/max heap. Each HeapItem corresponds to a
+ // point iterator or a range tombstone iterator, differentiated by
+ // HeapItem::type.
+ struct HeapItem {
+ HeapItem() = default;
+
+ // corresponding point iterator
+ IteratorWrapper iter;
+ size_t level = 0;
+ // corresponding range tombstone iterator's start or end key value
+ // depending on value of `type`.
+ ParsedInternalKey tombstone_pik;
+ // Will be overwritten before use, initialize here so compiler does not
+ // complain.
+ enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
+ Type type = Type::ITERATOR;
+
+ explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
+ : level(_level), type(Type::ITERATOR) {
+ iter.Set(_iter);
+ }
+
+ void SetTombstoneKey(ParsedInternalKey&& pik) {
+ // op_type is already initialized in MergingIterator::Finish().
+ tombstone_pik.user_key = pik.user_key;
+ tombstone_pik.sequence = pik.sequence;
+ }
+ };
+
+ class MinHeapItemComparator {
+ public:
+ explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(HeapItem* a, HeapItem* b) const {
+ if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
+ } else {
+ return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
+ }
+ } else {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
+ } else {
+ return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
+ }
+ }
+ }
+
+ private:
+ const InternalKeyComparator* comparator_;
+ };
+
+ class MaxHeapItemComparator {
+ public:
+ explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(HeapItem* a, HeapItem* b) const {
+ if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
+ } else {
+ return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
+ }
+ } else {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
+ } else {
+ return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
+ }
+ }
+ }
+
+ private:
+ const InternalKeyComparator* comparator_;
+ };
+
+ using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
+ using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
+
  friend class MergeIteratorBuilder;
  // Clears heaps for both directions, used when changing direction or seeking
  void ClearHeaps(bool clear_active = true);
  // Ensures that maxHeap_ is initialized when starting to go in the reverse
  // direction
  void InitMaxHeap();
-
- // Advance this merging iterator until the current key (top of min heap) is
- // not covered by any range tombstone or that there is no more keys (heap is
- // empty). After this call, if Valid(), current_ points to the next key that
- // is not covered by any range tombstone.
+ // Advance this merging iterator until the current key (minHeap_.top()) is
+ // from a point iterator and is not covered by any range tombstone,
+ // or that there is no more keys (heap is empty). SeekImpl() may be called
+ // to seek to the end of a range tombstone as an optimization.
  void FindNextVisibleKey();
  void FindPrevVisibleKey();

+ // Advance this merging iterators to the first key >= `target` for all
+ // components from levels >= starting_level. All iterators before
+ // starting_level are untouched.
+ //
+ // @param range_tombstone_reseek Whether target is some range tombstone
+ // end, i.e., whether this SeekImpl() call is a part of a "cascading seek".
+ // This is used only for recoding relevant perf_context.
  void SeekImpl(const Slice& target, size_t starting_level = 0,
  bool range_tombstone_reseek = false);

@@ -577,40 +581,59 @@ class MergingIterator : public InternalIterator {
  enum Direction : uint8_t { kForward, kReverse };
  Direction direction_;
  const InternalKeyComparator* comparator_;
- // We could also use an autovector with a larger reserved size.
  // HeapItem for all child point iterators.
+ // Invariant(children_): children_[i] is in minHeap_ iff
+ // children_[i].iter.Valid(), and at most one children_[i] is in minHeap_.
+ // TODO: We could use an autovector with a larger reserved size.
  std::vector<HeapItem> children_;
- // HeapItem for range tombstone start and end keys. Each range tombstone
- // iterator will have at most one side (start key or end key) in a heap
- // at the same time, so this vector will be of size children_.size();
- // pinned_heap_item_[i] corresponds to the start key and end key HeapItem
- // for range_tombstone_iters_[i].
+ // HeapItem for range tombstone start and end keys.
+ // pinned_heap_item_[i] corresponds to range_tombstone_iters_[i].
+ // Invariant(phi): If range_tombstone_iters_[i]->Valid(),
+ // pinned_heap_item_[i].tombstone_pik is equal to
+ // range_tombstone_iters_[i]->start_key() when
+ // pinned_heap_item_[i].type is DELETE_RANGE_START and
+ // range_tombstone_iters_[i]->end_key() when
+ // pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is
+ // kMaxValid for all pinned_heap_item_.tombstone_pik).
+ // pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END.
  std::vector<HeapItem> pinned_heap_item_;
  // range_tombstone_iters_[i] contains range tombstones in the sorted run that
  // corresponds to children_[i]. range_tombstone_iters_.empty() means not
  // handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
  // nullptr means the sorted run of children_[i] does not have range
  // tombstones.
+ // Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff
+ // range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is
+ // in minHeap_.
  std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;

  // Levels (indices into range_tombstone_iters_/children_ ) that currently have
- // "active" range tombstones. See comments above Seek() for meaning of
- // "active".
+ // "active" range tombstones. See comments above MergingIterator for meaning
+ // of "active".
+ // Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid()
+ // and pinned_heap_item_[i].type == DELETE_RANGE_END.
  std::set<size_t> active_;

  bool SkipNextDeleted();
+
  bool SkipPrevDeleted();

- // Cached pointer to child iterator with the current key, or nullptr if no
- // child iterators are valid. This is the top of minHeap_ or maxHeap_
- // depending on the direction.
+ // Invariant: at the end of each InternalIterator API,
+ // current_ points to minHeap_.top().iter (maxHeap_ if backward scanning)
+ // or nullptr if no child iterator is valid.
+ // This follows from that current_ = CurrentForward()/CurrentReverse() is
+ // called at the end of each InternalIterator API.
  IteratorWrapper* current_;
  // If any of the children have non-ok status, this is one of them.
  Status status_;
+ // Invariant: min heap property is maintained (parent is always <= child).
+ // This holds by using only BinaryHeap APIs to modify heap. One
+ // exception is to modify heap top item directly (by caller iter->Next()), and
+ // it should be followed by a call to replace_top() or pop().
  MergerMinIterHeap minHeap_;

  // Max heap is used for reverse iteration, which is way less common than
- // forward. Lazily initialize it to save memory.
+ // forward. Lazily initialize it to save memory.
  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
  PinnedIteratorsManager* pinned_iters_mgr_;

@@ -634,25 +657,93 @@ class MergingIterator : public InternalIterator {

  IteratorWrapper* CurrentForward() const {
  assert(direction_ == kForward);
- assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR);
+ assert(minHeap_.empty() ||
+ minHeap_.top()->type == HeapItem::Type::ITERATOR);
  return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
  }

  IteratorWrapper* CurrentReverse() const {
  assert(direction_ == kReverse);
  assert(maxHeap_);
- assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR);
+ assert(maxHeap_->empty() ||
+ maxHeap_->top()->type == HeapItem::Type::ITERATOR);
  return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
  }
  };

- // Seek to fist key >= target key (internal key) for children_[starting_level:].
- // Cascading seek optimizations are applied if range tombstones are present (see
- // comment above Seek() for more).
+ // Pre-condition:
+ // - Invariants (3) and (4) hold for i < starting_level
+ // - For i < starting_level, range_tombstone_iters_[i].prev.end_key() <
+ // `target`.
+ // - For i < starting_level, if i in active_, then
+ // range_tombstone_iters_[i]->start_key() < `target`.
+ //
+ // Post-condition:
+ // - Invariants (3) and (4) hold for all level i.
+ // - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target)
+ // for i >= starting_level
+ // - (**) target < pinned_heap_item_[i].tombstone_pik if
+ // range_tombstone_iters_[i].Valid() for i >= starting_level
+ //
+ // Proof sketch:
+ // Invariant (3) holds for all level i.
+ // For j <= i < starting_level, it follows from Pre-condition that (3) holds
+ // and that SeekImpl(-, starting_level) does not update children_[i] or
+ // range_tombstone_iters_[j].
+ // For j < starting_level and i >= starting_level, it follows from
+ // - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target`
+ // - range_tombstone_iters_[j] is not updated in SeekImpl(), and
+ // - children_[i].iter.Seek(current_search_key) is called with
+ // current_search_key >= target (shown below).
+ // When current_search_key is updated, it is updated to some
+ // range_tombstone_iter->end_key() after
+ // range_tombstone_iter->SeekInternalKey(current_search_key) was called. So
+ // current_search_key increases if updated and >= target.
+ // For starting_level <= j <= i:
+ // children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2)
+ // are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >=
+ // k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at
+ // the first range tombstone from level j with end_key() > k2. It suffices to
+ // show that k1 >= k2. Since k1 and k2 are values of current_search_key where
+ // k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2.
+ //
+ // Invariant (4) holds for all level >= 0.
+ // By Pre-condition Invariant (4) holds for i < starting_level.
+ // Since children_[i], range_tombstone_iters_[i] and contents of active_ for
+ // i < starting_level do not change (4) holds for j <= i < starting_level.
+ // By Pre-condition: for all j < starting_level, if j in active_, then
+ // range_tombstone_iters_[j]->start_key() < target. For i >= starting_level,
+ // children_[i].iter.Seek(k) is called for k >= target. So
+ // children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key()
+ // for j < starting_level and i >= starting_level. So invariant (4) holds for
+ // j < starting_level and i >= starting_level.
+ // For starting_level <= j <= i, j is added to active_ only if
+ // - range_tombstone_iters_[j]->SeekInternalKey(k1) was called
+ // - range_tombstone_iters_[j]->start_key() <= k1
+ // Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all
+ // starting_level <= j <= i, (4) also holds for all starting_level <= j <= i.
  //
- // @param range_tombstone_reseek Whether target is some range tombstone
- // end, i.e., whether this SeekImpl() call is a part of a "cascading seek". This
- // is used only for recoding relevant perf_context.
+ // Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i,
+ // target) for i >= starting_level.
+ // target <= children_[i].iter.key() follows from that Seek() is called on some
+ // current_search_key >= target for children_[i].iter. If current_search_key
+ // is updated from k1 to k2 when level = i, we show that the range [k1, k2) is
+ // not visible for children_[j] for any j > i. When current_search_key is
+ // updated from k1 to k2,
+ // - range_tombstone_iters_[i]->SeekInternalKey(k1) was called
+ // - range_tombstone_iters_[i]->Valid()
+ // - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key
+ // - k2 = range_tombstone_iters_[i]->end_key()
+ // We assume that range_tombstone_iters_[i]->start_key() has a higher sequence
+ // number compared to any key from levels > i that has the same user key. So no
+ // point key from levels > i in range [k1, k2) is visible. So
+ // children_[i].iter.key() <= LevelNextVisible(i, target).
+ //
+ // Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >=
+ // starting_level if range_tombstone_iters_[i].Valid(). This follows from that
+ // SeekInternalKey() being called for each range_tombstone_iters_ with some key
+ // >= `target` and that we pick start/end key that is > `target` to insert to
+ // minHeap_.
  void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
  bool range_tombstone_reseek) {
  // active range tombstones before `starting_level` remain active
@@ -665,6 +756,7 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
665
756
 
666
757
  // TODO: perhaps we could save some upheap cost by add all child iters first
667
758
  // and then do a single heapify.
759
+ // Invariant(children_) for level < starting_level
668
760
  for (size_t level = 0; level < starting_level; ++level) {
669
761
  PERF_TIMER_GUARD(seek_min_heap_time);
670
762
  AddToMinHeapOrCheckStatus(&children_[level]);
@@ -677,15 +769,20 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
677
769
  // - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
678
770
  // and pinned_heap_item_[level] is of type RANGE_DELETION_END.
679
771
  for (size_t level = 0; level < starting_level; ++level) {
772
+ // Restores Invariants(rti), (phi) and (active_) for level <
773
+ // starting_level
680
774
  if (range_tombstone_iters_[level] &&
681
775
  range_tombstone_iters_[level]->Valid()) {
682
776
  // use an iterator on active_ if performance becomes an issue here
683
777
  if (active_.count(level) > 0) {
684
- assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END);
778
+ assert(pinned_heap_item_[level].type ==
779
+ HeapItem::Type::DELETE_RANGE_END);
685
780
  // if it was active, then start key must be within upper_bound,
686
781
  // so we can add to minHeap_ directly.
687
782
  minHeap_.push(&pinned_heap_item_[level]);
688
783
  } else {
784
+ assert(pinned_heap_item_[level].type ==
785
+ HeapItem::Type::DELETE_RANGE_START);
689
786
  // this takes care of checking iterate_upper_bound, but with an extra
690
787
  // key comparison if range_tombstone_iters_[level] was already out of
691
788
  // bound. Consider using a new HeapItem type or some flag to remember
@@ -728,45 +825,37 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
728
825
  }
729
826
  auto range_tombstone_iter = range_tombstone_iters_[level];
730
827
  if (range_tombstone_iter) {
731
- range_tombstone_iter->Seek(current_search_key.GetUserKey());
828
+ range_tombstone_iter->SeekInternalKey(
829
+ current_search_key.GetInternalKey());
830
+ // Invariants (rti) and (phi)
732
831
  if (range_tombstone_iter->Valid()) {
733
- // insert the range tombstone end that is closer to and >=
734
- // current_search_key. Strictly speaking, since the Seek() call above
735
- // is on user key, it is possible that range_tombstone_iter->end_key()
736
- // < current_search_key. This can happen when range_tombstone_iter is
737
- // truncated and range_tombstone_iter.largest_ has the same user key
738
- // as current_search_key.GetUserKey() but with a larger sequence
739
- // number than current_search_key. Correctness is not affected as this
740
- // tombstone end key will be popped during FindNextVisibleKey().
832
+ // If range tombstone starts after `current_search_key`,
833
+ // we should insert start key to heap as the range tombstone is not
834
+ // active yet.
741
835
  InsertRangeTombstoneToMinHeap(
742
836
  level, comparator_->Compare(range_tombstone_iter->start_key(),
743
837
  pik) > 0 /* start_key */);
744
- // current_search_key < end_key guaranteed by the Seek() and Valid()
745
- // calls above. Only interested in user key coverage since older
746
- // sorted runs must have smaller sequence numbers than this range
747
- // tombstone.
838
+ // current_search_key < end_key guaranteed by the SeekInternalKey()
839
+ // and Valid() calls above. Here we only need to compare user_key
840
+ // since if target.user_key ==
841
+ // range_tombstone_iter->start_key().user_key and target <
842
+ // range_tombstone_iter->start_key(), no older level would have any
843
+ // key in range [target, range_tombstone_iter->start_key()], so no
844
+ // keys in range [target, range_tombstone_iter->end_key()) from older
845
+ // level would be visible. So it is safe to seek to
846
+ // range_tombstone_iter->end_key().
748
847
  //
749
848
  // TODO: range_tombstone_iter->Seek() finds the max covering
750
849
  // sequence number, can make it cheaper by not looking for max.
751
850
  if (comparator_->user_comparator()->Compare(
752
851
  range_tombstone_iter->start_key().user_key,
753
852
  current_search_key.GetUserKey()) <= 0) {
754
- // Since range_tombstone_iter->Valid(), seqno should be valid, so
755
- // there is no need to check it.
756
853
  range_tombstone_reseek = true;
757
- // Current target user key is covered by this range tombstone.
758
- // All older sorted runs will seek to range tombstone end key.
759
854
  // Note that for prefix seek case, it is possible that the prefix
760
855
  // is not the same as the original target, it should not affect
761
856
  // correctness. Besides, in most cases, range tombstone start and
762
857
  // end key should have the same prefix?
763
- // If range_tombstone_iter->end_key() is truncated to its largest_
764
- // boundary, the timestamp in user_key will not be max timestamp,
765
- // but the timestamp of `range_tombstone_iter.largest_`. This should
766
- // be fine here as current_search_key is used to Seek into lower
767
- // levels.
768
- current_search_key.SetInternalKey(
769
- range_tombstone_iter->end_key().user_key, kMaxSequenceNumber);
858
+ current_search_key.SetInternalKey(range_tombstone_iter->end_key());
770
859
  }
771
860
  }
772
861
  }
@@ -818,6 +907,8 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
818
907
  // and `active_` is updated accordingly.
819
908
  // See FindNextVisibleKey() for more detail on internal implementation
820
909
  // of advancing child iters.
910
+ // When false is returned, if minHeap is not empty, then minHeap_.top().type
911
+ // == ITERATOR
821
912
  //
822
913
  // REQUIRES:
823
914
  // - min heap is currently not empty, and iter is in kForward direction.
@@ -828,11 +919,14 @@ bool MergingIterator::SkipNextDeleted() {
828
919
  // - file boundary sentinel keys
829
920
  // - range deletion end key
830
921
  auto current = minHeap_.top();
831
- if (current->type == HeapItem::DELETE_RANGE_END) {
922
+ if (current->type == HeapItem::Type::DELETE_RANGE_END) {
923
+ // Invariant(active_): range_tombstone_iters_[current->level] is about to
924
+ // become !Valid() or that its start key is going to be added to minHeap_.
832
925
  active_.erase(current->level);
833
926
  assert(range_tombstone_iters_[current->level] &&
834
927
  range_tombstone_iters_[current->level]->Valid());
835
928
  range_tombstone_iters_[current->level]->Next();
929
+ // Maintain Invariants (rti) and (phi)
836
930
  if (range_tombstone_iters_[current->level]->Valid()) {
837
931
  InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
838
932
  true /* replace_top */);
@@ -847,41 +941,62 @@ bool MergingIterator::SkipNextDeleted() {
847
941
  // SetTombstoneKey()).
848
942
  assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
849
943
  active_.count(current->level) == 0);
850
- // When entering a new file, old range tombstone iter is freed,
851
- // but the last key from that range tombstone iter may still be in the heap.
852
- // We need to ensure the data underlying its corresponding key Slice is
853
- // still alive. We do so by popping the range tombstone key from heap before
854
- // calling iter->Next(). Technically, this change is not needed: if there is
855
- // a range tombstone end key that is after file boundary sentinel key in
856
- // minHeap_, the range tombstone end key must have been truncated at file
857
- // boundary. The underlying data of the range tombstone end key Slice is the
858
- // SST file's largest internal key stored as file metadata in Version.
859
- // However, since there are too many implicit assumptions made, it is safer
860
- // to just ensure range tombstone iter is still alive.
944
+ // When entering a new file, range tombstone iter from the old file is
945
+ // freed, but the last key from that range tombstone iter may still be in
946
+ // the heap. We need to ensure the data underlying its corresponding key
947
+ // Slice is still alive. We do so by popping the range tombstone key from
948
+ // heap before calling iter->Next(). Technically, this change is not needed:
949
+ // if there is a range tombstone end key that is after file boundary
950
+ // sentinel key in minHeap_, the range tombstone end key must have been
951
+ // truncated at file boundary. The underlying data of the range tombstone
952
+ // end key Slice is the SST file's largest internal key stored as file
953
+ // metadata in Version. However, since there are too many implicit
954
+ // assumptions made, it is safer to just ensure range tombstone iter is
955
+ // still alive.
861
956
  minHeap_.pop();
862
957
  // Remove last SST file's range tombstone end key if there is one.
863
958
  // This means file boundary is before range tombstone end key,
864
959
  // which could happen when a range tombstone and a user key
865
960
  // straddle two SST files. Note that in TruncatedRangeDelIterator
866
961
  // constructor, parsed_largest.sequence is decremented 1 in this case.
867
- if (!minHeap_.empty() && minHeap_.top()->level == current->level &&
868
- minHeap_.top()->type == HeapItem::DELETE_RANGE_END) {
869
- minHeap_.pop();
870
- active_.erase(current->level);
962
+ // Maintains Invariant(rti) that at most one
963
+ // pinned_heap_item_[current->level] is in minHeap_.
964
+ if (range_tombstone_iters_[current->level] &&
965
+ range_tombstone_iters_[current->level]->Valid()) {
966
+ if (!minHeap_.empty() && minHeap_.top()->level == current->level) {
967
+ assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END);
968
+ minHeap_.pop();
969
+ // Invariant(active_): we are about to enter a new SST file with new
970
+ // range_tombstone_iters[current->level]. Either it is !Valid() or its
971
+ // start key is going to be added to minHeap_.
972
+ active_.erase(current->level);
973
+ } else {
974
+ // range tombstone is still valid, but it is not on heap.
975
+ // This should only happen if the range tombstone is over iterator
976
+ // upper bound.
977
+ assert(iterate_upper_bound_ &&
978
+ comparator_->user_comparator()->CompareWithoutTimestamp(
979
+ range_tombstone_iters_[current->level]->start_key().user_key,
980
+ true /* a_has_ts */, *iterate_upper_bound_,
981
+ false /* b_has_ts */) >= 0);
982
+ }
871
983
  }
872
984
  // LevelIterator enters a new SST file
873
985
  current->iter.Next();
986
+ // Invariant(children_): current is popped from heap and added back only if
987
+ // it is valid
874
988
  if (current->iter.Valid()) {
875
989
  assert(current->iter.status().ok());
876
990
  minHeap_.push(current);
877
991
  }
992
+ // Invariants (rti) and (phi)
878
993
  if (range_tombstone_iters_[current->level] &&
879
994
  range_tombstone_iters_[current->level]->Valid()) {
880
995
  InsertRangeTombstoneToMinHeap(current->level);
881
996
  }
882
997
  return true /* current key deleted */;
883
998
  }
884
- assert(current->type == HeapItem::ITERATOR);
999
+ assert(current->type == HeapItem::Type::ITERATOR);
885
1000
  // Point key case: check active_ for range tombstone coverage.
886
1001
  ParsedInternalKey pik;
887
1002
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
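To make the invariant maintained by the hunk above easier to see in isolation, here is a minimal, self-contained C++ sketch. It uses toy types (not RocksDB's HeapItem or pinned_heap_item_) and only illustrates the pattern: at most one pinned key per level sits in the min-heap, and that key is popped before the level advances to a new file, so the heap never holds a key whose backing data has been freed.

#include <cassert>
#include <queue>
#include <string>
#include <vector>

// Toy stand-ins (illustrative only; not RocksDB's HeapItem/pinned_heap_item_).
struct Item {
  int level;
  bool pinned_tombstone_key;  // plays the role of a DELETE_RANGE_END entry
  std::string key;            // owns its bytes in this sketch
};

struct MinCmp {
  bool operator()(const Item* a, const Item* b) const { return a->key > b->key; }
};
using MinHeap = std::priority_queue<Item*, std::vector<Item*>, MinCmp>;

int main() {
  Item boundary{0, false, "file0_largest"};  // file-boundary sentinel analogue
  std::vector<Item> pinned = {{0, true, "file0_tombstone_end"},
                              {1, true, "zz_end"}};
  MinHeap heap;
  heap.push(&boundary);
  heap.push(&pinned[0]);
  heap.push(&pinned[1]);

  // Level 0 is about to enter a new file: pop its pinned key first so the
  // heap never references key bytes owned by the old file's tombstone iter.
  Item* current = heap.top();
  heap.pop();
  if (!heap.empty() && heap.top()->level == current->level) {
    assert(heap.top()->pinned_tombstone_key);  // at most one pinned key/level
    heap.pop();
  }
  // Only now is it safe to repoint the pinned slot at the new file's data.
  pinned[0].key = "file1_tombstone_end";
  heap.push(&pinned[0]);
  assert(heap.top() == &pinned[0]);  // "file1_tombstone_end" < "zz_end"
  return 0;
}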
@@ -908,6 +1023,7 @@ bool MergingIterator::SkipNextDeleted() {
908
1023
  if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
909
1024
  // covered by range tombstone
910
1025
  current->iter.Next();
1026
+ // Invariant (children_)
911
1027
  if (current->iter.Valid()) {
912
1028
  minHeap_.replace_top(current);
913
1029
  } else {
@@ -927,7 +1043,7 @@ bool MergingIterator::SkipNextDeleted() {
927
1043
  }
928
1044
  // we can reach here only if active_ is empty
929
1045
  assert(active_.empty());
930
- assert(minHeap_.top()->type == HeapItem::ITERATOR);
1046
+ assert(minHeap_.top()->type == HeapItem::Type::ITERATOR);
931
1047
  return false /* current key not deleted */;
932
1048
  }
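The point-key branch above decides coverage by comparing the key's sequence number with the covering tombstone's. A rough sketch of that predicate, with simplified hypothetical types (bytewise user-key comparison, no timestamps):

#include <cassert>
#include <cstdint>
#include <string>

// Simplified shapes; RocksDB compares ParsedInternalKeys and tracks the
// covering tombstone per level in active_. This only shows the predicate.
struct PointKey {
  std::string user_key;
  uint64_t sequence;
};

struct RangeTombstone {
  std::string start, end;  // deletes user keys in [start, end)
  uint64_t sequence;
};

bool Covers(const RangeTombstone& t, const PointKey& k) {
  // A point key is shadowed only if it lies inside the range and was written
  // before (i.e. has a smaller sequence number than) the tombstone.
  return k.user_key >= t.start && k.user_key < t.end && k.sequence < t.sequence;
}

int main() {
  RangeTombstone tomb{"b", "d", /*sequence=*/100};
  assert(Covers(tomb, {"c", 50}));    // older write inside the range: deleted
  assert(!Covers(tomb, {"c", 150}));  // newer write survives the tombstone
  assert(!Covers(tomb, {"e", 50}));   // outside the range: untouched
  return 0;
}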
933
1049
 
@@ -951,7 +1067,8 @@ void MergingIterator::SeekForPrevImpl(const Slice& target,
951
1067
  if (range_tombstone_iters_[level] &&
952
1068
  range_tombstone_iters_[level]->Valid()) {
953
1069
  assert(static_cast<bool>(active_.count(level)) ==
954
- (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_START));
1070
+ (pinned_heap_item_[level].type ==
1071
+ HeapItem::Type::DELETE_RANGE_START));
955
1072
  maxHeap_->push(&pinned_heap_item_[level]);
956
1073
  } else {
957
1074
  assert(!active_.count(level));
@@ -1056,7 +1173,7 @@ bool MergingIterator::SkipPrevDeleted() {
1056
1173
  // - file boundary sentinel keys
1057
1174
  // - range deletion start key
1058
1175
  auto current = maxHeap_->top();
1059
- if (current->type == HeapItem::DELETE_RANGE_START) {
1176
+ if (current->type == HeapItem::Type::DELETE_RANGE_START) {
1060
1177
  active_.erase(current->level);
1061
1178
  assert(range_tombstone_iters_[current->level] &&
1062
1179
  range_tombstone_iters_[current->level]->Valid());
@@ -1074,7 +1191,7 @@ bool MergingIterator::SkipPrevDeleted() {
1074
1191
  maxHeap_->pop();
1075
1192
  // Remove last SST file's range tombstone key if there is one.
1076
1193
  if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
1077
- maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
1194
+ maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) {
1078
1195
  maxHeap_->pop();
1079
1196
  active_.erase(current->level);
1080
1197
  }
@@ -1090,7 +1207,7 @@ bool MergingIterator::SkipPrevDeleted() {
1090
1207
  }
1091
1208
  return true /* current key deleted */;
1092
1209
  }
1093
- assert(current->type == HeapItem::ITERATOR);
1210
+ assert(current->type == HeapItem::Type::ITERATOR);
1094
1211
  // Point key case: check active_ for range tombstone coverage.
1095
1212
  ParsedInternalKey pik;
1096
1213
  ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
@@ -1136,11 +1253,12 @@ bool MergingIterator::SkipPrevDeleted() {
1136
1253
  }
1137
1254
 
1138
1255
  assert(active_.empty());
1139
- assert(maxHeap_->top()->type == HeapItem::ITERATOR);
1256
+ assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR);
1140
1257
  return false /* current key not deleted */;
1141
1258
  }
1142
1259
 
1143
1260
  void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
1261
+ // Invariant(children_)
1144
1262
  if (child->iter.Valid()) {
1145
1263
  assert(child->iter.status().ok());
1146
1264
  minHeap_.push(child);
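The Invariant(children_) note above says a child sits in the heap only while it is valid. A small illustrative sketch of that push-only-valid-children pattern, using a hypothetical toy iterator rather than RocksDB's IteratorWrapper, together with the usual merge loop that re-adds a child after advancing it:

#include <iostream>
#include <queue>
#include <string>
#include <vector>

// Toy child iterator (hypothetical; stands in for a wrapped child iterator).
struct ChildIter {
  std::vector<std::string> keys;  // sorted ascending
  size_t pos = 0;
  bool Valid() const { return pos < keys.size(); }
  const std::string& key() const { return keys[pos]; }
  bool ok() const { return true; }  // toy: iteration itself never fails here
};

struct Cmp {
  bool operator()(ChildIter* a, ChildIter* b) const { return a->key() > b->key(); }
};
using Heap = std::priority_queue<ChildIter*, std::vector<ChildIter*>, Cmp>;

// Invariant(children_) analogue: a child is in the heap iff it is Valid();
// an invalid child is only inspected for an error status.
void AddToHeapOrCheckStatus(ChildIter* child, Heap& heap, bool& saw_error) {
  if (child->Valid()) {
    heap.push(child);
  } else if (!child->ok()) {
    saw_error = true;  // exhausted is fine; a failed child is surfaced instead
  }
}

int main() {
  Heap heap;
  ChildIter a{{"b", "d"}}, b{{"a", "c"}}, exhausted{};
  bool saw_error = false;
  for (ChildIter* c : {&a, &b, &exhausted}) {
    AddToHeapOrCheckStatus(c, heap, saw_error);
  }
  while (!heap.empty()) {
    ChildIter* top = heap.top();
    heap.pop();
    std::cout << top->key() << '\n';  // prints a b c d, merged in order
    ++top->pos;                       // advance, then re-add only if still valid
    AddToHeapOrCheckStatus(top, heap, saw_error);
  }
  return saw_error ? 1 : 0;
}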
@@ -1164,6 +1282,7 @@ void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
1164
1282
  // Advance all range tombstones iters, including the one corresponding to
1165
1283
  // current_, to the first tombstone with end_key > current_.key().
1166
1284
  // TODO: potentially do cascading seek here too
1285
+ // TODO: show that invariants hold
1167
1286
  void MergingIterator::SwitchToForward() {
1168
1287
  ClearHeaps();
1169
1288
  Slice target = key();
@@ -1177,7 +1296,7 @@ void MergingIterator::SwitchToForward() {
1177
1296
  if (child.iter.status() == Status::TryAgain()) {
1178
1297
  continue;
1179
1298
  }
1180
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1299
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1181
1300
  assert(child.iter.status().ok());
1182
1301
  child.iter.Next();
1183
1302
  }
@@ -1188,7 +1307,7 @@ void MergingIterator::SwitchToForward() {
1188
1307
  for (auto& child : children_) {
1189
1308
  if (child.iter.status() == Status::TryAgain()) {
1190
1309
  child.iter.Seek(target);
1191
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1310
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1192
1311
  assert(child.iter.status().ok());
1193
1312
  child.iter.Next();
1194
1313
  }
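The hunks in SwitchToForward()/SwitchToBackward() now read the key through child.iter rather than the HeapItem wrapper. As a rough sketch of the direction-switch step itself, with hypothetical toy types: every child re-seeks to the key the merged iterator currently sits on, and a child that lands exactly on that key takes one extra step so the key is not returned twice.

#include <cassert>
#include <string>
#include <vector>

struct Child {
  std::vector<std::string> keys;  // sorted ascending
  size_t pos = 0;
  bool Valid() const { return pos < keys.size(); }
  const std::string& key() const { return keys[pos]; }
  void Seek(const std::string& t) {  // position at the first key >= t
    pos = 0;
    while (Valid() && key() < t) ++pos;
  }
  void Next() { ++pos; }
};

// Switch a previously backward-moving set of children to forward iteration.
void SwitchToForward(std::vector<Child>& children, const std::string& target) {
  for (Child& child : children) {
    child.Seek(target);
    // Compare against the child's own key: if it is exactly `target`, step
    // past it, since `target` is the key the merged iterator already holds.
    if (child.Valid() && child.key() == target) {
      child.Next();
    }
  }
}

int main() {
  std::vector<Child> children = {{{"a", "c", "e"}}, {{"b", "c", "d"}}};
  SwitchToForward(children, "c");
  assert(children[0].key() == "e");  // was at "c", stepped past it
  assert(children[1].key() == "d");
  return 0;
}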
@@ -1239,7 +1358,7 @@ void MergingIterator::SwitchToBackward() {
1239
1358
  if (&child.iter != current_) {
1240
1359
  child.iter.SeekForPrev(target);
1241
1360
  TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
1242
- if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
1361
+ if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
1243
1362
  assert(child.iter.status().ok());
1244
1363
  child.iter.Prev();
1245
1364
  }
@@ -1297,32 +1416,201 @@ void MergingIterator::ClearHeaps(bool clear_active) {
1297
1416
 
1298
1417
  void MergingIterator::InitMaxHeap() {
1299
1418
  if (!maxHeap_) {
1300
- maxHeap_ = std::make_unique<MergerMaxIterHeap>(comparator_);
1419
+ maxHeap_ =
1420
+ std::make_unique<MergerMaxIterHeap>(MaxHeapItemComparator(comparator_));
1301
1421
  }
1302
1422
  }
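InitMaxHeap() now constructs the max-heap from an explicit MaxHeapItemComparator. A minimal sketch of the underlying pattern, with hypothetical names, where a max-ordering comparator object wraps the shared key comparator and is handed to the heap explicitly:

#include <functional>
#include <queue>
#include <string>
#include <vector>

using KeyComparator = std::function<int(const std::string&, const std::string&)>;

struct MaxCmp {
  explicit MaxCmp(KeyComparator c) : cmp(std::move(c)) {}
  // priority_queue keeps the "largest" element on top; returning cmp(a,b) < 0
  // here therefore surfaces the greatest key first.
  bool operator()(const std::string& a, const std::string& b) const {
    return cmp(a, b) < 0;
  }
  KeyComparator cmp;
};

int main() {
  KeyComparator bytewise = [](const std::string& a, const std::string& b) {
    return a.compare(b);
  };
  // Analogous to constructing the heap with MaxHeapItemComparator(comparator_).
  MaxCmp cmp{bytewise};
  std::priority_queue<std::string, std::vector<std::string>, MaxCmp> max_heap(cmp);
  for (const char* k : {"b", "a", "c"}) max_heap.push(k);
  return max_heap.top() == "c" ? 0 : 1;
}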
1303
1423
 
1304
- // Repeatedly check and remove heap top key if it is not a point key
1305
- // that is not covered by range tombstones. SeekImpl() is called to seek to end
1306
- // of a range tombstone if the heap top is a point key covered by some range
1307
- // tombstone from a newer sorted run. If the covering tombstone is from current
1308
- // key's level, then the current child iterator is simply advanced to its next
1309
- // key without reseeking.
1424
+ // Assume there is a next key that is not covered by range tombstone.
1425
+ // Pre-condition:
1426
+ // - Invariants (3) and (4)
1427
+ // - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i,
1428
+ // k) for all levels i (LevelNextVisible() defined in Seek()).
1429
+ //
1430
+ // Define NextVisible(k) to be the first key >= k from among children_ that
1431
+ // is not covered by any range tombstone.
1432
+ // Post-condition:
1433
+ // - Invariants (1)-(4) hold
1434
+ // - (*): minHeap_.top()->key() == NextVisible(k)
1435
+ //
1436
+ // Loop invariants:
1437
+ // - Invariants (3) and (4)
1438
+ // - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k)
1439
+ //
1440
+ // Progress: minHeap_.top()->key() is non-decreasing and strictly increases in
1441
+ // a finite number of iterations.
1442
+ // TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with
1443
+ // k2 < k1 in the same FindNextVisibleKey(). For example, l1 has a range
1444
+ // tombstone [2,3) and l2 has a range tombstone [1, 4). Point key 1 from l5
1445
+ // triggers SeekImpl(4 /* target */, 5). Then point key 2 from l3 triggers
1446
+ // SeekImpl(3 /* target */, 3).
1447
+ // Ideally we should only move iterators forward in SeekImpl(), and the
1448
+ // progress condition can be made simpler: iterator only moves forward.
1449
+ //
1450
+ // Proof sketch:
1451
+ // Post-condition:
1452
+ // Invariant (1) holds when this method returns:
1453
+ // Ignoring the empty minHeap_ case, there are two cases:
1454
+ // Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey()
1455
+ // By invariants (rti) and (active_), active_ being empty means if a
1456
+ // pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note
1457
+ // that PopDeleteRangeStart() was called right before the while loop condition,
1458
+ // so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must
1459
+ // be of type ITERATOR.
1460
+ // Case 2: SkipNextDeleted() returns false. The method returns false only when
1461
+ // minHeap_.top().type == ITERATOR.
1462
+ //
1463
+ // Invariant (2) holds when this method returns:
1464
+ // From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is
1465
+ // children_[i] for some i. Suppose that children_[i].iter.key() is covered by
1466
+ // some range tombstone. This means there is a j <= i and a range tombstone from
1467
+ // level j with start_key() < children_[i].iter.key() < end_key().
1468
+ // - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi),
1469
+ // pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik
1470
+ // is either start or end key of this range tombstone. If
1471
+ // pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at
1472
+ // top of minHeap_ which would contradict Invariant (1). So
1473
+ // pinned_heap_item_[j].tombstone_pik > children_[i].iter.key().
1474
+ // By Invariant (3), range_tombstone_iters_[j].prev.end_key() <
1475
+ // children_[i].iter.key(). We assume that in each level, range tombstones
1476
+ // cover non-overlapping ranges. So range_tombstone_iters_[j] is at
1477
+ // the range tombstone with start_key() < children_[i].iter.key() < end_key()
1478
+ // and has its end_key() in minHeap_. By Invariants (phi) and (active_),
1479
+ // j is in active_. From while loop condition, SkipNextDeleted() must have
1480
+ // returned false for this method to return.
1481
+ // - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i)
1482
+ // was called for some j' < i and j' in active_. Note that since j' is in
1483
+ // active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik =
1484
+ // range_tombstone_iters_[j']->end_key(). So
1485
+ // range_tombstone_iters_[j']->end_key() must be larger than
1486
+ // children_[i].iter.key() to not be at top of minHeap_. This means after
1487
+ // SeekImpl(), children_[i] would be at a key > children_[i].iter.key()
1488
+ // -- contradiction.
1489
+ // - If j == i, children_[i]->Next() would have been called and children_[i]
1490
+ // would be at a key > children_[i].iter.key() -- contradiction.
1491
+ // - If !range_tombstone_iters_[j]->Valid(), then range_tombstone_iters_[j]
1492
+ // points to an SST file with all range tombstones from that file exhausted.
1493
+ // The file must come before the file containing the first
1494
+ // range tombstone with start_key() < children_[i].iter.key() < end_key().
1495
+ // Assuming files from the same level have non-overlapping ranges, the current file's
1496
+ // meta.largest is less than children_[i].iter.key(). So the file boundary key,
1497
+ // which has value meta.largest, must have been popped from minHeap_ before
1498
+ // children_[i].iter.key(). So range_tombstone_iters_[j] would not point to
1499
+ // this SST file -- contradiction.
1500
+ // So it is impossible for children_[i].iter.key() to be covered by a range
1501
+ // tombstone.
1502
+ //
1503
+ // Post-condition (*) holds when the function returns:
1504
+ // From loop invariant (*) that k <= children_[i].iter.key() <=
1505
+ // LevelNextVisible(i, k) and Invariant (2) above, when the function returns,
1506
+ // minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels
1507
+ // i. This is equal to NextVisible(k).
1508
+ //
1509
+ // Invariant (3) holds after each iteration:
1510
+ // PopDeleteRangeStart() does not change range tombstone position.
1511
+ // In SkipNextDeleted():
1512
+ // - If DELETE_RANGE_END is popped from minHeap_, it means the range
1513
+ // tombstone's end key is < all other point keys, so it is safe to advance to
1514
+ // next range tombstone.
1515
+ // - If file boundary is popped (current->iter.IsDeleteRangeSentinelKey()),
1516
+ // we assume that file's last range tombstone's
1517
+ // end_key <= file boundary key < all other point keys. So it is safe to
1518
+ // move to the first range tombstone in the next SST file.
1519
+ // - If children_[i]->Next() is called, then it is fine as it is advancing a
1520
+ // point iterator.
1521
+ // - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s
1522
+ // post-condition if its pre-condition holds. First pre-condition follows
1523
+ // from loop invariant where Invariant (3) holds for all levels i.
1524
+ // Now we show the second pre-condition holds. Since Invariant (3) holds for
1525
+ // all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key()
1526
+ // < children_[l].iter.key(). `target` is the value of
1527
+ // range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_.
1528
+ // By Invariant (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and
1529
+ // pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key().
1530
+ // This end_key must be larger than children_[l].key() since it was not at top
1531
+ // of minHeap_. So for all levels j <= l,
1532
+ // range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target
1533
+ //
1534
+ // Invariant (4) holds after each iteration:
1535
+ // A level i is inserted into active_ during calls to PopDeleteRangeStart().
1536
+ // In that case, range_tombstone_iters_[i].start_key() < all point keys
1537
+ // by heap property and the assumption that point keys and range tombstone keys
1538
+ // are distinct.
1539
+ // If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j]
1540
+ // where target = range_tombstone_iters_[j]->end_key() and children_[l]->key()
1541
+ // < target. By the loop invariants, (3) and (4) hold for all levels.
1542
+ // Since target > children_[l]->key(), it also holds that for j < l,
1543
+ // range_tombstone_iters_[j].prev.end_key() < target and that if j in active_,
1544
+ // range_tombstone_iters_[j]->start_key() < target. So all pre-conditions of
1545
+ // SeekImpl(target, l) hold, and (4) follows from its post-condition.
1546
+ // All other places in this function either advance point iterators
1547
+ // or remove some level from active_, so (4) still holds.
1548
+ //
1549
+ // Loop Invariant (*): for all levels i, k <= children_[i] <= LevelNextVisible(i,
1550
+ // k).
1551
+ // k <= children_[i] follows from loop `progress` condition.
1552
+ // Consider when children_[i] is changed for any i. It is through
1553
+ // children_[i].iter.Next() or SeekImpl() in SkipNextDeleted().
1554
+ // If children_[i].iter.Next() is called, there is a range tombstone from level
1555
+ // i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_.
1556
+ // By Invariant (4), tombstone's start_key < children_[i].iter.key(). By
1557
+ // invariants (active_), (phi), and (rti), tombstone's end_key is in minHeap_
1558
+ // and that children_[i].iter.key() < end_key. So children_[i].iter.key() is
1559
+ // not visible, and it is safe to call Next().
1560
+ // If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns,
1561
+ // target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l,
1562
+ // and children_[<l] is not touched. We know `target` is
1563
+ // range_tombstone_iters_[j]->end_key() for some j < i and j is in active_.
1564
+ // By Invariant (4), range_tombstone_iters_[j]->start_key() <
1565
+ // children_[i].iter.key() for all i >= l. So for each level i >= l, the range
1566
+ // [children_[i].iter.key(), target) is not visible. So after SeekImpl(),
1567
+ // children_[i].iter.key() <= LevelNextVisible(i, target) <=
1568
+ // LevelNextVisible(i, k).
1569
+ //
1570
+ // `Progress` holds for each iteration:
1571
+ // Very sloppy intuition:
1572
+ // - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_
1573
+ // is updated from the start key to the end key of the same range tombstone.
1574
+ // We assume that start key <= end key for the same range tombstone.
1575
+ // - in SkipNextDeleted()
1576
+ // - If the top of heap is DELETE_RANGE_END, the range tombstone is advanced
1577
+ // and the relevant pinned_heap_item_.tombstone_pik is increased or popped
1578
+ // from minHeap_.
1579
+ // - If the top of heap is a file boundary key, then both point iter and
1580
+ // range tombstone iter are advanced to the next file.
1581
+ // - If the top of heap is ITERATOR and current->iter.Next() is called, it
1582
+ // moves to a larger point key.
1583
+ // - If the top of heap is ITERATOR and SeekImpl(k, l) is called, then all
1584
+ // iterators from levels >= l are advanced to some key >= k by its contract.
1585
+ // And top of minHeap_ before SeekImpl(k, l) was less than k.
1586
+ // There are special cases where different heap items have the same key,
1587
+ // e.g. when two range tombstone end keys share the same value. In
1588
+ // these cases, iterators are being advanced, so the minimum key should increase
1589
+ // in a finite number of steps.
1310
1590
  inline void MergingIterator::FindNextVisibleKey() {
1311
- // When active_ is empty, we know heap top cannot be a range tombstone end
1312
- // key. It cannot be a range tombstone start key per PopDeleteRangeStart().
1313
1591
  PopDeleteRangeStart();
1314
- while (!minHeap_.empty() &&
1315
- (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) &&
1316
- SkipNextDeleted()) {
1592
+ // PopDeleteRangeStart() implies heap top is not DELETE_RANGE_START
1593
+ // active_ being empty implies no DELETE_RANGE_END in heap.
1594
+ // So minHeap_->top() must be of type ITERATOR.
1595
+ while (
1596
+ !minHeap_.empty() &&
1597
+ (!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) &&
1598
+ SkipNextDeleted()) {
1317
1599
  PopDeleteRangeStart();
1318
1600
  }
1601
+ // Checks Invariant (1)
1602
+ assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR);
1319
1603
  }
1320
1604
 
1321
1605
  inline void MergingIterator::FindPrevVisibleKey() {
1322
1606
  PopDeleteRangeEnd();
1323
- while (!maxHeap_->empty() &&
1324
- (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) &&
1325
- SkipPrevDeleted()) {
1607
+ // PopDeleteRangeEnd() implies heap top is not DELETE_RANGE_END
1608
+ // active_ being empty implies no DELETE_RANGE_START in heap.
1609
+ // So maxHeap_->top() must be of type ITERATOR.
1610
+ while (
1611
+ !maxHeap_->empty() &&
1612
+ (!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) &&
1613
+ SkipPrevDeleted()) {
1326
1614
  PopDeleteRangeEnd();
1327
1615
  }
1328
1616
  }
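Finally, the reshaped FindNextVisibleKey()/FindPrevVisibleKey() loops follow a drain-and-check shape that ends by asserting Invariant (1). A skeletal sketch of that control flow with toy state (no real keys, levels, or heap ordering):

#include <cassert>
#include <deque>

// Toy state only; heap ordering, keys, and levels are all elided.
enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END, SENTINEL };

struct Toy {
  std::deque<Type> heap;  // front() plays the role of minHeap_.top()
  int active = 0;         // number of tombstone end keys currently in the heap

  void PopDeleteRangeStart() {
    while (!heap.empty() && heap.front() == Type::DELETE_RANGE_START) {
      heap.pop_front();
      // Stands in for re-inserting the tombstone's end key and marking the
      // level active.
      heap.push_back(Type::DELETE_RANGE_END);
      ++active;
    }
  }
  // Returns true while something other than a visible point key was removed.
  bool SkipNextDeleted() {
    if (heap.front() == Type::ITERATOR) return false;
    if (heap.front() == Type::DELETE_RANGE_END) --active;
    heap.pop_front();
    return true;
  }
  void FindNextVisibleKey() {
    PopDeleteRangeStart();
    // Top is not DELETE_RANGE_START here; if active == 0 there is also no
    // DELETE_RANGE_END in the heap, so a non-sentinel top must be ITERATOR.
    while (!heap.empty() &&
           (active > 0 || heap.front() == Type::SENTINEL) &&
           SkipNextDeleted()) {
      PopDeleteRangeStart();
    }
    assert(heap.empty() || heap.front() == Type::ITERATOR);  // Invariant (1)
  }
};

int main() {
  Toy t{{Type::DELETE_RANGE_START, Type::SENTINEL, Type::ITERATOR}};
  t.FindNextVisibleKey();
  assert(!t.heap.empty() && t.heap.front() == Type::ITERATOR);
  return 0;
}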