@nxtedition/rocksdb 8.1.17 → 8.2.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +32 -2
- package/binding.gyp +8 -0
- package/deps/liburing/liburing.gyp +20 -0
- package/deps/rocksdb/rocksdb/CMakeLists.txt +4 -0
- package/deps/rocksdb/rocksdb/TARGETS +7 -0
- package/deps/rocksdb/rocksdb/cache/cache.cc +43 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +8 -5
- package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/cache_test.cc +12 -48
- package/deps/rocksdb/rocksdb/cache/charged_cache.cc +26 -18
- package/deps/rocksdb/rocksdb/cache/charged_cache.h +5 -62
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +119 -44
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +34 -29
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +2 -2
- package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +148 -209
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +118 -284
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +23 -71
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +351 -392
- package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +5 -2
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.cc +296 -0
- package/deps/rocksdb/rocksdb/cache/secondary_cache_adapter.h +52 -0
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +22 -19
- package/deps/rocksdb/rocksdb/cache/typed_cache.h +56 -20
- package/deps/rocksdb/rocksdb/db/arena_wrapped_db_iter.cc +3 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +3 -3
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +19 -25
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +216 -0
- package/deps/rocksdb/rocksdb/db/c.cc +90 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +8 -7
- package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +5 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +24 -7
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +17 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +18 -12
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +3 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +245 -302
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +13 -2
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +5 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +75 -15
- package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +2 -3
- package/deps/rocksdb/rocksdb/db/db_filesnapshot.cc +1 -5
- package/deps/rocksdb/rocksdb/db/db_flush_test.cc +91 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +5 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +47 -24
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +32 -3
- package/deps/rocksdb/rocksdb/db/db_iter.cc +28 -29
- package/deps/rocksdb/rocksdb/db/db_iter.h +0 -3
- package/deps/rocksdb/rocksdb/db/db_properties_test.cc +176 -0
- package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +391 -2
- package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +26 -0
- package/deps/rocksdb/rocksdb/db/db_write_test.cc +13 -5
- package/deps/rocksdb/rocksdb/db/dbformat.h +3 -1
- package/deps/rocksdb/rocksdb/db/error_handler_fs_test.cc +0 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_ingestion_job.cc +0 -6
- package/deps/rocksdb/rocksdb/db/forward_iterator.cc +3 -0
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +4 -0
- package/deps/rocksdb/rocksdb/db/import_column_family_job.cc +68 -40
- package/deps/rocksdb/rocksdb/db/import_column_family_job.h +3 -3
- package/deps/rocksdb/rocksdb/db/import_column_family_test.cc +115 -0
- package/deps/rocksdb/rocksdb/db/internal_stats.cc +169 -72
- package/deps/rocksdb/rocksdb/db/internal_stats.h +36 -7
- package/deps/rocksdb/rocksdb/db/memtable.cc +6 -4
- package/deps/rocksdb/rocksdb/db/merge_helper.cc +4 -0
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +151 -0
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.cc +47 -16
- package/deps/rocksdb/rocksdb/db/range_del_aggregator.h +10 -8
- package/deps/rocksdb/rocksdb/db/range_del_aggregator_test.cc +91 -93
- package/deps/rocksdb/rocksdb/db/range_tombstone_fragmenter.h +1 -2
- package/deps/rocksdb/rocksdb/db/version_edit_handler.cc +1 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +30 -14
- package/deps/rocksdb/rocksdb/db/version_set.h +1 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.cc +179 -0
- package/deps/rocksdb/rocksdb/db/write_stall_stats.h +47 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/batched_ops_stress.cc +109 -7
- package/deps/rocksdb/rocksdb/db_stress_tool/cf_consistency_stress.cc +147 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +31 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +22 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +4 -1
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +42 -59
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +7 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_tool.cc +7 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/expected_state.cc +6 -10
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +6 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.h +4 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/no_batched_ops_stress.cc +127 -36
- package/deps/rocksdb/rocksdb/env/fs_posix.cc +8 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +35 -0
- package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.h +29 -8
- package/deps/rocksdb/rocksdb/file/file_util.cc +14 -10
- package/deps/rocksdb/rocksdb/file/prefetch_test.cc +183 -63
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_cache.h +159 -66
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +3 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +52 -5
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/compaction_filter.h +134 -73
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +46 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/file_system.h +6 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/listener.h +0 -6
- package/deps/rocksdb/rocksdb/include/rocksdb/metadata.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/perf_context.h +6 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +3 -3
- package/deps/rocksdb/rocksdb/include/rocksdb/statistics.h +18 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/types.h +28 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +2 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +39 -0
- package/deps/rocksdb/rocksdb/monitoring/perf_context.cc +5 -0
- package/deps/rocksdb/rocksdb/monitoring/statistics.cc +9 -1
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +2 -2
- package/deps/rocksdb/rocksdb/port/stack_trace.cc +17 -7
- package/deps/rocksdb/rocksdb/port/win/env_win.h +1 -0
- package/deps/rocksdb/rocksdb/src.mk +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +38 -34
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +11 -12
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +5 -5
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +126 -132
- package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +16 -16
- package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +0 -16
- package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +3 -4
- package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +370 -0
- package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +44 -0
- package/deps/rocksdb/rocksdb/table/get_context.cc +4 -2
- package/deps/rocksdb/rocksdb/table/merging_iterator.cc +555 -267
- package/deps/rocksdb/rocksdb/table/merging_iterator.h +10 -5
- package/deps/rocksdb/rocksdb/table/table_test.cc +113 -70
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.cc +96 -0
- package/deps/rocksdb/rocksdb/test_util/secondary_cache_test_util.h +117 -0
- package/deps/rocksdb/rocksdb/utilities/checkpoint/checkpoint_impl.cc +5 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +3 -3
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +1 -1
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +9 -2
- package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +5 -1
- package/deps/rocksdb/rocksdb/utilities/ttl/ttl_test.cc +11 -0
- package/deps/rocksdb/rocksdb.gyp +7 -1
- package/package.json +1 -1
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -10,121 +10,45 @@
|
|
|
10
10
|
#include "table/merging_iterator.h"
|
|
11
11
|
|
|
12
12
|
#include "db/arena_wrapped_db_iter.h"
|
|
13
|
-
#include "db/dbformat.h"
|
|
14
|
-
#include "db/pinned_iterators_manager.h"
|
|
15
|
-
#include "memory/arena.h"
|
|
16
|
-
#include "monitoring/perf_context_imp.h"
|
|
17
|
-
#include "rocksdb/comparator.h"
|
|
18
|
-
#include "rocksdb/iterator.h"
|
|
19
|
-
#include "rocksdb/options.h"
|
|
20
|
-
#include "table/internal_iterator.h"
|
|
21
|
-
#include "table/iter_heap.h"
|
|
22
|
-
#include "table/iterator_wrapper.h"
|
|
23
|
-
#include "test_util/sync_point.h"
|
|
24
|
-
#include "util/autovector.h"
|
|
25
|
-
#include "util/heap.h"
|
|
26
|
-
#include "util/stop_watch.h"
|
|
27
13
|
|
|
28
14
|
namespace ROCKSDB_NAMESPACE {
|
|
29
|
-
//
|
|
30
|
-
//
|
|
31
|
-
//
|
|
32
|
-
// of currently "active" range tombstones (the ones whose start keys are popped
|
|
33
|
-
// but end keys are still in the heap) in `active_`. This `active_` set of range
|
|
34
|
-
// tombstones is then used to quickly determine whether the point key at heap
|
|
35
|
-
// top is deleted (by heap property, the point key at heap top must be within
|
|
36
|
-
// internal key range of active range tombstones).
|
|
15
|
+
// MergingIterator uses a min/max heap to combine data from point iterators.
|
|
16
|
+
// Range tombstones can be added and keys covered by range tombstones will be
|
|
17
|
+
// skipped.
|
|
37
18
|
//
|
|
38
|
-
// The
|
|
39
|
-
//
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
return false;
|
|
72
|
-
}
|
|
73
|
-
};
|
|
74
|
-
|
|
75
|
-
class MinHeapItemComparator {
|
|
76
|
-
public:
|
|
77
|
-
MinHeapItemComparator(const InternalKeyComparator* comparator)
|
|
78
|
-
: comparator_(comparator) {}
|
|
79
|
-
bool operator()(HeapItem* a, HeapItem* b) const {
|
|
80
|
-
if (LIKELY(a->type == HeapItem::ITERATOR)) {
|
|
81
|
-
if (LIKELY(b->type == HeapItem::ITERATOR)) {
|
|
82
|
-
return comparator_->Compare(a->key(), b->key()) > 0;
|
|
83
|
-
} else {
|
|
84
|
-
return comparator_->Compare(a->key(), b->parsed_ikey) > 0;
|
|
85
|
-
}
|
|
86
|
-
} else {
|
|
87
|
-
if (LIKELY(b->type == HeapItem::ITERATOR)) {
|
|
88
|
-
return comparator_->Compare(a->parsed_ikey, b->key()) > 0;
|
|
89
|
-
} else {
|
|
90
|
-
return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0;
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
private:
|
|
96
|
-
const InternalKeyComparator* comparator_;
|
|
97
|
-
};
|
|
98
|
-
|
|
99
|
-
class MaxHeapItemComparator {
|
|
100
|
-
public:
|
|
101
|
-
MaxHeapItemComparator(const InternalKeyComparator* comparator)
|
|
102
|
-
: comparator_(comparator) {}
|
|
103
|
-
bool operator()(HeapItem* a, HeapItem* b) const {
|
|
104
|
-
if (LIKELY(a->type == HeapItem::ITERATOR)) {
|
|
105
|
-
if (LIKELY(b->type == HeapItem::ITERATOR)) {
|
|
106
|
-
return comparator_->Compare(a->key(), b->key()) < 0;
|
|
107
|
-
} else {
|
|
108
|
-
return comparator_->Compare(a->key(), b->parsed_ikey) < 0;
|
|
109
|
-
}
|
|
110
|
-
} else {
|
|
111
|
-
if (LIKELY(b->type == HeapItem::ITERATOR)) {
|
|
112
|
-
return comparator_->Compare(a->parsed_ikey, b->key()) < 0;
|
|
113
|
-
} else {
|
|
114
|
-
return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0;
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
private:
|
|
120
|
-
const InternalKeyComparator* comparator_;
|
|
121
|
-
};
|
|
122
|
-
// Without anonymous namespace here, we fail the warning -Wmissing-prototypes
|
|
123
|
-
namespace {
|
|
124
|
-
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
|
|
125
|
-
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
|
|
126
|
-
} // namespace
|
|
127
|
-
|
|
19
|
+
// The following are implementation details and can be ignored by user.
|
|
20
|
+
// For merging iterator to process range tombstones, it treats the start and end
|
|
21
|
+
// keys of a range tombstone as two keys and put them into minHeap_ or maxHeap_
|
|
22
|
+
// together with regular point keys. Each range tombstone is active only within
|
|
23
|
+
// its internal key range [start_key, end_key). An `active_` set is used to
|
|
24
|
+
// track levels that have an active range tombstone. Take forward scanning
|
|
25
|
+
// for example. Level j is in active_ if its current range tombstone has its
|
|
26
|
+
// start_key popped from minHeap_ and its end_key in minHeap_. If the top of
|
|
27
|
+
// minHeap_ is a point key from level L, we can determine if the point key is
|
|
28
|
+
// covered by any range tombstone by checking if there is an l <= L in active_.
|
|
29
|
+
// The case of l == L also involves checking range tombstone's sequence number.
|
|
30
|
+
//
|
|
31
|
+
// The following (non-exhaustive) list of invariants is maintained by
|
|
32
|
+
// MergingIterator during forward scanning. After each InternalIterator API,
|
|
33
|
+
// i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty:
|
|
34
|
+
// (1) minHeap_.top()->type == ITERATOR
|
|
35
|
+
// (2) minHeap_.top()->key() is not covered by any range tombstone.
|
|
36
|
+
//
|
|
37
|
+
// After each call to SeekImpl() in addition to the functions mentioned above:
|
|
38
|
+
// (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() <
|
|
39
|
+
// children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before
|
|
40
|
+
// the first range tombstone from level j with end_key() >
|
|
41
|
+
// children_[i].iter.key().
|
|
42
|
+
// (4) For all level i and j <= i, if j in active_, then
|
|
43
|
+
// range_tombstone_iters_[j]->start_key() < children_[i].iter.key().
|
|
44
|
+
// - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be
|
|
45
|
+
// the last range tombstone from that range tombstone iterator.
|
|
46
|
+
// - When referring to range tombstone start/end keys, assume it is the value of
|
|
47
|
+
// HeapItem::tombstone_pik. This value has op_type = kTypeMaxValid, which makes
|
|
48
|
+
// range tombstone keys have distinct values from point keys.
|
|
49
|
+
//
|
|
50
|
+
// Applicable class variables have their own (forward scanning) invariants
|
|
51
|
+
// listed in the comments above their definition.
|
|
128
52
|
class MergingIterator : public InternalIterator {
|
|
129
53
|
public:
|
|
130
54
|
MergingIterator(const InternalKeyComparator* comparator,
|
|
@@ -136,7 +60,7 @@ class MergingIterator : public InternalIterator {
|
|
|
136
60
|
direction_(kForward),
|
|
137
61
|
comparator_(comparator),
|
|
138
62
|
current_(nullptr),
|
|
139
|
-
minHeap_(comparator_),
|
|
63
|
+
minHeap_(MinHeapItemComparator(comparator_)),
|
|
140
64
|
pinned_iters_mgr_(nullptr),
|
|
141
65
|
iterate_upper_bound_(iterate_upper_bound) {
|
|
142
66
|
children_.resize(n);
|
|
@@ -162,30 +86,26 @@ class MergingIterator : public InternalIterator {
|
|
|
162
86
|
current_ = nullptr;
|
|
163
87
|
}
|
|
164
88
|
|
|
165
|
-
//
|
|
166
|
-
//
|
|
167
|
-
//
|
|
168
|
-
//
|
|
169
|
-
//
|
|
170
|
-
//
|
|
171
|
-
//
|
|
172
|
-
//
|
|
173
|
-
// iterator
|
|
174
|
-
//
|
|
175
|
-
// is responsible for freeing it. Note that during Iterator::Refresh()
|
|
176
|
-
// and when a level iterator moves to a different SST file, the range
|
|
177
|
-
// tombstone iterator could be updated. In that case, the merging iterator
|
|
178
|
-
// is only responsible to freeing the new range tombstone iterator
|
|
179
|
-
// that it has pointers to in range_tombstone_iters_.
|
|
89
|
+
// There must be either no range tombstone iterator or the same number of
|
|
90
|
+
// range tombstone iterators as point iterators after all iters are added.
|
|
91
|
+
// The i-th added range tombstone iterator and the i-th point iterator
|
|
92
|
+
// must point to the same LSM level.
|
|
93
|
+
// Merging iterator takes ownership of `iter` and is responsible for freeing
|
|
94
|
+
// it. One exception to this is when a LevelIterator moves to a different SST
|
|
95
|
+
// file or when Iterator::Refresh() is called, the range tombstone iterator
|
|
96
|
+
// could be updated. In that case, this merging iterator is only responsible
|
|
97
|
+
// for freeing the new range tombstone iterator that it has pointers to in
|
|
98
|
+
// range_tombstone_iters_.
|
|
180
99
|
void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) {
|
|
181
100
|
range_tombstone_iters_.emplace_back(iter);
|
|
182
101
|
}
|
|
183
102
|
|
|
184
103
|
// Called by MergingIteratorBuilder when all point iterators and range
|
|
185
104
|
// tombstone iterators are added. Initializes HeapItems for range tombstone
|
|
186
|
-
// iterators
|
|
105
|
+
// iterators.
|
|
187
106
|
void Finish() {
|
|
188
107
|
if (!range_tombstone_iters_.empty()) {
|
|
108
|
+
assert(range_tombstone_iters_.size() == children_.size());
|
|
189
109
|
pinned_heap_item_.resize(range_tombstone_iters_.size());
|
|
190
110
|
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
|
|
191
111
|
pinned_heap_item_[i].level = i;
|
|
@@ -199,7 +119,7 @@ class MergingIterator : public InternalIterator {
|
|
|
199
119
|
// TruncatedRangeDelIterator since untruncated tombstone end points
|
|
200
120
|
// always have kMaxSequenceNumber and kTypeRangeDeletion (see
|
|
201
121
|
// TruncatedRangeDelIterator::start_key()/end_key()).
|
|
202
|
-
pinned_heap_item_[i].
|
|
122
|
+
pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid;
|
|
203
123
|
}
|
|
204
124
|
}
|
|
205
125
|
}
|
|
@@ -221,12 +141,18 @@ class MergingIterator : public InternalIterator {
|
|
|
221
141
|
|
|
222
142
|
// Add range_tombstone_iters_[level] into min heap.
|
|
223
143
|
// Updates active_ if the end key of a range tombstone is inserted.
|
|
144
|
+
// pinned_heap_item_[level].type is updated based on `start_key`.
|
|
145
|
+
//
|
|
146
|
+
// If range_tombstone_iters_[level] is after iterate_upper_bound_,
|
|
147
|
+
// it is removed from the heap.
|
|
224
148
|
// @param start_key specifies which end point of the range tombstone to add.
|
|
225
149
|
void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true,
|
|
226
150
|
bool replace_top = false) {
|
|
227
151
|
assert(!range_tombstone_iters_.empty() &&
|
|
228
152
|
range_tombstone_iters_[level]->Valid());
|
|
153
|
+
// Maintains Invariant(phi)
|
|
229
154
|
if (start_key) {
|
|
155
|
+
pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
|
|
230
156
|
ParsedInternalKey pik = range_tombstone_iters_[level]->start_key();
|
|
231
157
|
// iterate_upper_bound does not have timestamp
|
|
232
158
|
if (iterate_upper_bound_ &&
|
|
@@ -241,15 +167,16 @@ class MergingIterator : public InternalIterator {
|
|
|
241
167
|
return;
|
|
242
168
|
}
|
|
243
169
|
pinned_heap_item_[level].SetTombstoneKey(std::move(pik));
|
|
244
|
-
|
|
170
|
+
// Checks Invariant(active_)
|
|
245
171
|
assert(active_.count(level) == 0);
|
|
246
172
|
} else {
|
|
247
173
|
// allow end key to go over upper bound (if present) since start key is
|
|
248
174
|
// before upper bound and the range tombstone could still cover a
|
|
249
175
|
// range before upper bound.
|
|
176
|
+
// Maintains Invariant(active_)
|
|
250
177
|
pinned_heap_item_[level].SetTombstoneKey(
|
|
251
178
|
range_tombstone_iters_[level]->end_key());
|
|
252
|
-
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
|
|
179
|
+
pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
|
|
253
180
|
active_.insert(level);
|
|
254
181
|
}
|
|
255
182
|
if (replace_top) {
|
|
@@ -269,12 +196,12 @@ class MergingIterator : public InternalIterator {
|
|
|
269
196
|
if (end_key) {
|
|
270
197
|
pinned_heap_item_[level].SetTombstoneKey(
|
|
271
198
|
range_tombstone_iters_[level]->end_key());
|
|
272
|
-
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END;
|
|
199
|
+
pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END;
|
|
273
200
|
assert(active_.count(level) == 0);
|
|
274
201
|
} else {
|
|
275
202
|
pinned_heap_item_[level].SetTombstoneKey(
|
|
276
203
|
range_tombstone_iters_[level]->start_key());
|
|
277
|
-
pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START;
|
|
204
|
+
pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START;
|
|
278
205
|
active_.insert(level);
|
|
279
206
|
}
|
|
280
207
|
if (replace_top) {
|
|
@@ -290,9 +217,12 @@ class MergingIterator : public InternalIterator {
|
|
|
290
217
|
// so `active_` is updated accordingly.
|
|
291
218
|
void PopDeleteRangeStart() {
|
|
292
219
|
while (!minHeap_.empty() &&
|
|
293
|
-
minHeap_.top()->type == HeapItem::DELETE_RANGE_START) {
|
|
220
|
+
minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) {
|
|
294
221
|
TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr);
|
|
295
|
-
//
|
|
222
|
+
// Invariant(rti) holds since
|
|
223
|
+
// range_tombstone_iters_[minHeap_.top()->level] is still valid, and
|
|
224
|
+
// parameter `replace_top` is set to true here to ensure only one such
|
|
225
|
+
// HeapItem is in minHeap_.
|
|
296
226
|
InsertRangeTombstoneToMinHeap(
|
|
297
227
|
minHeap_.top()->level, false /* start_key */, true /* replace_top */);
|
|
298
228
|
}
|
|
@@ -304,7 +234,7 @@ class MergingIterator : public InternalIterator {
|
|
|
304
234
|
// so `active_` is updated accordingly.
|
|
305
235
|
void PopDeleteRangeEnd() {
|
|
306
236
|
while (!maxHeap_->empty() &&
|
|
307
|
-
maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) {
|
|
237
|
+
maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) {
|
|
308
238
|
// insert start key of this range tombstone and updates active_
|
|
309
239
|
InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */,
|
|
310
240
|
true /* replace_top */);
|
|
@@ -359,44 +289,25 @@ class MergingIterator : public InternalIterator {
|
|
|
359
289
|
// Position this merging iterator at the first key >= target (internal key).
|
|
360
290
|
// If range tombstones are present, keys covered by range tombstones are
|
|
361
291
|
// skipped, and this merging iter points to the first non-range-deleted key >=
|
|
362
|
-
// target after Seek(). If !Valid() and status().ok() then
|
|
363
|
-
//
|
|
364
|
-
//
|
|
365
|
-
// Internally, this involves positioning all child iterators at the first key
|
|
366
|
-
// >= target. If range tombstones are present, we apply a similar
|
|
367
|
-
// optimization, cascading seek, as in Pebble
|
|
368
|
-
// (https://github.com/cockroachdb/pebble). Specifically, if there is a range
|
|
369
|
-
// tombstone [start, end) that covers the target user key at level L, then
|
|
370
|
-
// this range tombstone must cover the range [target key, end) in all levels >
|
|
371
|
-
// L. So for all levels > L, we can pretend the target key is `end`. This
|
|
372
|
-
// optimization is applied at each level and hence the name "cascading seek".
|
|
373
|
-
// After a round of (cascading) seeks, the top of the heap is checked to see
|
|
374
|
-
// if it is covered by a range tombstone (see FindNextVisibleKey() for more
|
|
375
|
-
// detail), and advanced if so. The process is repeated until a
|
|
376
|
-
// non-range-deleted key is at the top of the heap, or heap becomes empty.
|
|
292
|
+
// target after Seek(). If !Valid() and status().ok() then this iterator
|
|
293
|
+
// reaches the end.
|
|
377
294
|
//
|
|
378
|
-
//
|
|
379
|
-
//
|
|
380
|
-
// range
|
|
381
|
-
//
|
|
382
|
-
//
|
|
383
|
-
//
|
|
384
|
-
//
|
|
385
|
-
// range_tombstone_iters_[L] currently points to. For correctness reasoning,
|
|
386
|
-
// one invariant that Seek() (and every other public APIs Seek*(),
|
|
387
|
-
// Next/Prev()) guarantees is as follows. After Seek(), suppose `k` is the
|
|
388
|
-
// current key of level L's point iterator. Then for each range tombstone
|
|
389
|
-
// iterator at level <= L, it is at or before the first range tombstone with
|
|
390
|
-
// end key > `k`. This ensures that when level L's point iterator reaches top
|
|
391
|
-
// of the heap, `active_` is calculated correctly (it contains the covering
|
|
392
|
-
// range tombstone's level if there is one), since no range tombstone iterator
|
|
393
|
-
// was skipped beyond that point iterator's current key during Seek().
|
|
394
|
-
// Next()/Prev() maintains a stronger version of this invariant where all
|
|
395
|
-
// range tombstone iterators from level <= L are *at* the first range
|
|
396
|
-
// tombstone with end key > `k`.
|
|
295
|
+
// If range tombstones are present, cascading seeks may be called (an
|
|
296
|
+
// optimization adapted from Pebble https://github.com/cockroachdb/pebble).
|
|
297
|
+
// Roughly, if there is a range tombstone [start, end) that covers the
|
|
298
|
+
// target user key at level L, then this range tombstone must cover the range
|
|
299
|
+
// [target key, end) in all levels > L. So for all levels > L, we can pretend
|
|
300
|
+
// the target key is `end`. This optimization is applied at each level and
|
|
301
|
+
// hence the name "cascading seek".
|
|
397
302
|
void Seek(const Slice& target) override {
|
|
398
|
-
|
|
399
|
-
|
|
303
|
+
// Define LevelNextVisible(i, k) to be the first key >= k in level i that is
|
|
304
|
+
// not covered by any range tombstone.
|
|
305
|
+
// After SeekImpl(target, 0), invariants (3) and (4) hold.
|
|
306
|
+
// For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i,
|
|
307
|
+
// target). By the contract of FindNextVisibleKey(), Invariants (1)-(4)
|
|
308
|
+
// hold after this call, and minHeap_.top()->iter points to the
|
|
309
|
+
// first key >= target among children_ that is not covered by any range
|
|
310
|
+
// tombstone.
|
|
400
311
|
SeekImpl(target);
|
|
401
312
|
FindNextVisibleKey();
|
|
402
313
|
|
|
@@ -424,7 +335,7 @@ class MergingIterator : public InternalIterator {
|
|
|
424
335
|
assert(Valid());
|
|
425
336
|
// Ensure that all children are positioned after key().
|
|
426
337
|
// If we are moving in the forward direction, it is already
|
|
427
|
-
// true for all
|
|
338
|
+
// true for all the non-current children since current_ is
|
|
428
339
|
// the smallest child and key() == current_->key().
|
|
429
340
|
if (direction_ != kForward) {
|
|
430
341
|
// The loop advanced all non-current children to be > key() so current_
|
|
@@ -448,6 +359,12 @@ class MergingIterator : public InternalIterator {
|
|
|
448
359
|
considerStatus(current_->status());
|
|
449
360
|
minHeap_.pop();
|
|
450
361
|
}
|
|
362
|
+
// Invariants (3) and (4) hold after advancing current_.
|
|
363
|
+
// Let k be the smallest key among children_[i].iter.key().
|
|
364
|
+
// k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all
|
|
365
|
+
// level i. After FindNextVisibleKey(), Invariants (1)-(4) hold and
|
|
366
|
+
// minHeap_.top()->key() is the first key >= k from any children_ that is
|
|
367
|
+
// not covered by any range tombstone.
|
|
451
368
|
FindNextVisibleKey();
|
|
452
369
|
current_ = CurrentForward();
|
|
453
370
|
}
|
|
@@ -467,7 +384,7 @@ class MergingIterator : public InternalIterator {
|
|
|
467
384
|
assert(Valid());
|
|
468
385
|
// Ensure that all children are positioned before key().
|
|
469
386
|
// If we are moving in the reverse direction, it is already
|
|
470
|
-
// true for all
|
|
387
|
+
// true for all the non-current children since current_ is
|
|
471
388
|
// the largest child and key() == current_->key().
|
|
472
389
|
if (direction_ != kReverse) {
|
|
473
390
|
// Otherwise, retreat the non-current children. We retreat current_
|
|
@@ -518,7 +435,6 @@ class MergingIterator : public InternalIterator {
|
|
|
518
435
|
// Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
|
|
519
436
|
// from current child iterator. Potentially as long as one of child iterator
|
|
520
437
|
// report out of bound is not possible, we know current key is within bound.
|
|
521
|
-
|
|
522
438
|
bool MayBeOutOfLowerBound() override {
|
|
523
439
|
assert(Valid());
|
|
524
440
|
return current_->MayBeOutOfLowerBound();
|
|
@@ -549,20 +465,108 @@ class MergingIterator : public InternalIterator {
|
|
|
549
465
|
}
|
|
550
466
|
|
|
551
467
|
private:
|
|
468
|
+
// Represents an element in the min/max heap. Each HeapItem corresponds to a
|
|
469
|
+
// point iterator or a range tombstone iterator, differentiated by
|
|
470
|
+
// HeapItem::type.
|
|
471
|
+
struct HeapItem {
|
|
472
|
+
HeapItem() = default;
|
|
473
|
+
|
|
474
|
+
// corresponding point iterator
|
|
475
|
+
IteratorWrapper iter;
|
|
476
|
+
size_t level = 0;
|
|
477
|
+
// corresponding range tombstone iterator's start or end key value
|
|
478
|
+
// depending on value of `type`.
|
|
479
|
+
ParsedInternalKey tombstone_pik;
|
|
480
|
+
// Will be overwritten before use, initialize here so compiler does not
|
|
481
|
+
// complain.
|
|
482
|
+
enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
|
|
483
|
+
Type type = Type::ITERATOR;
|
|
484
|
+
|
|
485
|
+
explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
|
|
486
|
+
: level(_level), type(Type::ITERATOR) {
|
|
487
|
+
iter.Set(_iter);
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
void SetTombstoneKey(ParsedInternalKey&& pik) {
|
|
491
|
+
// op_type is already initialized in MergingIterator::Finish().
|
|
492
|
+
tombstone_pik.user_key = pik.user_key;
|
|
493
|
+
tombstone_pik.sequence = pik.sequence;
|
|
494
|
+
}
|
|
495
|
+
};
|
|
496
|
+
|
|
497
|
+
class MinHeapItemComparator {
|
|
498
|
+
public:
|
|
499
|
+
explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
|
|
500
|
+
: comparator_(comparator) {}
|
|
501
|
+
|
|
502
|
+
bool operator()(HeapItem* a, HeapItem* b) const {
|
|
503
|
+
if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
|
|
504
|
+
if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
|
|
505
|
+
return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
|
|
506
|
+
} else {
|
|
507
|
+
return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
|
|
508
|
+
}
|
|
509
|
+
} else {
|
|
510
|
+
if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
|
|
511
|
+
return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
|
|
512
|
+
} else {
|
|
513
|
+
return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
private:
|
|
519
|
+
const InternalKeyComparator* comparator_;
|
|
520
|
+
};
|
|
521
|
+
|
|
522
|
+
class MaxHeapItemComparator {
|
|
523
|
+
public:
|
|
524
|
+
explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
|
|
525
|
+
: comparator_(comparator) {}
|
|
526
|
+
|
|
527
|
+
bool operator()(HeapItem* a, HeapItem* b) const {
|
|
528
|
+
if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
|
|
529
|
+
if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
|
|
530
|
+
return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
|
|
531
|
+
} else {
|
|
532
|
+
return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
|
|
533
|
+
}
|
|
534
|
+
} else {
|
|
535
|
+
if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
|
|
536
|
+
return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
|
|
537
|
+
} else {
|
|
538
|
+
return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
|
|
539
|
+
}
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
private:
|
|
544
|
+
const InternalKeyComparator* comparator_;
|
|
545
|
+
};
|
|
546
|
+
|
|
547
|
+
using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
|
|
548
|
+
using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
|
|
549
|
+
|
|
552
550
|
friend class MergeIteratorBuilder;
|
|
553
551
|
// Clears heaps for both directions, used when changing direction or seeking
|
|
554
552
|
void ClearHeaps(bool clear_active = true);
|
|
555
553
|
// Ensures that maxHeap_ is initialized when starting to go in the reverse
|
|
556
554
|
// direction
|
|
557
555
|
void InitMaxHeap();
|
|
558
|
-
|
|
559
|
-
//
|
|
560
|
-
//
|
|
561
|
-
//
|
|
562
|
-
// is not covered by any range tombstone.
|
|
556
|
+
// Advance this merging iterator until the current key (minHeap_.top()) is
|
|
557
|
+
// from a point iterator and is not covered by any range tombstone,
|
|
558
|
+
// or until there are no more keys (heap is empty). SeekImpl() may be called
|
|
559
|
+
// to seek to the end of a range tombstone as an optimization.
|
|
563
560
|
void FindNextVisibleKey();
|
|
564
561
|
void FindPrevVisibleKey();
|
|
565
562
|
|
|
563
|
+
// Advance this merging iterator to the first key >= `target` for all
|
|
564
|
+
// components from levels >= starting_level. All iterators before
|
|
565
|
+
// starting_level are untouched.
|
|
566
|
+
//
|
|
567
|
+
// @param range_tombstone_reseek Whether target is some range tombstone
|
|
568
|
+
// end, i.e., whether this SeekImpl() call is a part of a "cascading seek".
|
|
569
|
+
// This is used only for recording relevant perf_context.
|
|
566
570
|
void SeekImpl(const Slice& target, size_t starting_level = 0,
|
|
567
571
|
bool range_tombstone_reseek = false);
|
|
568
572
|
|
|
@@ -577,40 +581,59 @@ class MergingIterator : public InternalIterator {
|
|
|
577
581
|
enum Direction : uint8_t { kForward, kReverse };
|
|
578
582
|
Direction direction_;
|
|
579
583
|
const InternalKeyComparator* comparator_;
|
|
580
|
-
// We could also use an autovector with a larger reserved size.
|
|
581
584
|
// HeapItem for all child point iterators.
|
|
585
|
+
// Invariant(children_): children_[i] is in minHeap_ iff
|
|
586
|
+
// children_[i].iter.Valid(), and at most one children_[i] is in minHeap_.
|
|
587
|
+
// TODO: We could use an autovector with a larger reserved size.
|
|
582
588
|
std::vector<HeapItem> children_;
|
|
583
|
-
// HeapItem for range tombstone start and end keys.
|
|
584
|
-
//
|
|
585
|
-
//
|
|
586
|
-
// pinned_heap_item_[i]
|
|
587
|
-
//
|
|
589
|
+
// HeapItem for range tombstone start and end keys.
|
|
590
|
+
// pinned_heap_item_[i] corresponds to range_tombstone_iters_[i].
|
|
591
|
+
// Invariant(phi): If range_tombstone_iters_[i]->Valid(),
|
|
592
|
+
// pinned_heap_item_[i].tombstone_pik is equal to
|
|
593
|
+
// range_tombstone_iters_[i]->start_key() when
|
|
594
|
+
// pinned_heap_item_[i].type is DELETE_RANGE_START and
|
|
595
|
+
// range_tombstone_iters_[i]->end_key() when
|
|
596
|
+
// pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is
|
|
597
|
+
// kMaxValid for all pinned_heap_item_.tombstone_pik).
|
|
598
|
+
// pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END.
|
|
588
599
|
std::vector<HeapItem> pinned_heap_item_;
|
|
589
600
|
// range_tombstone_iters_[i] contains range tombstones in the sorted run that
|
|
590
601
|
// corresponds to children_[i]. range_tombstone_iters_.empty() means not
|
|
591
602
|
// handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
|
|
592
603
|
// nullptr means the sorted run of children_[i] does not have range
|
|
593
604
|
// tombstones.
|
|
605
|
+
// Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff
|
|
606
|
+
// range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is
|
|
607
|
+
// in minHeap_.
|
|
594
608
|
std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
|
|
595
609
|
|
|
596
610
|
// Levels (indices into range_tombstone_iters_/children_ ) that currently have
|
|
597
|
-
// "active" range tombstones. See comments above
|
|
598
|
-
// "active".
|
|
611
|
+
// "active" range tombstones. See comments above MergingIterator for meaning
|
|
612
|
+
// of "active".
|
|
613
|
+
// Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid()
|
|
614
|
+
// and pinned_heap_item_[i].type == DELETE_RANGE_END.
|
|
599
615
|
std::set<size_t> active_;
|
|
600
616
|
|
|
601
617
|
bool SkipNextDeleted();
|
|
618
|
+
|
|
602
619
|
bool SkipPrevDeleted();
|
|
603
620
|
|
|
604
|
-
//
|
|
605
|
-
//
|
|
606
|
-
//
|
|
621
|
+
// Invariant: at the end of each InternalIterator API,
|
|
622
|
+
// current_ points to minHeap_.top().iter (maxHeap_ if backward scanning)
|
|
623
|
+
// or nullptr if no child iterator is valid.
|
|
624
|
+
// This holds because current_ = CurrentForward()/CurrentReverse() is
|
|
625
|
+
// called at the end of each InternalIterator API.
|
|
607
626
|
IteratorWrapper* current_;
|
|
608
627
|
// If any of the children have non-ok status, this is one of them.
|
|
609
628
|
Status status_;
|
|
629
|
+
// Invariant: min heap property is maintained (parent is always <= child).
|
|
630
|
+
// This holds by using only BinaryHeap APIs to modify heap. One
|
|
631
|
+
// exception is to modify heap top item directly (by caller iter->Next()), and
|
|
632
|
+
// it should be followed by a call to replace_top() or pop().
|
|
610
633
|
MergerMinIterHeap minHeap_;
|
|
611
634
|
|
|
612
635
|
// Max heap is used for reverse iteration, which is way less common than
|
|
613
|
-
// forward.
|
|
636
|
+
// forward. Lazily initialize it to save memory.
|
|
614
637
|
std::unique_ptr<MergerMaxIterHeap> maxHeap_;
|
|
615
638
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
|
616
639
|
|
|
@@ -634,25 +657,93 @@ class MergingIterator : public InternalIterator {
|
|
|
634
657
|
|
|
635
658
|
IteratorWrapper* CurrentForward() const {
|
|
636
659
|
assert(direction_ == kForward);
|
|
637
|
-
assert(minHeap_.empty() ||
|
|
660
|
+
assert(minHeap_.empty() ||
|
|
661
|
+
minHeap_.top()->type == HeapItem::Type::ITERATOR);
|
|
638
662
|
return !minHeap_.empty() ? &minHeap_.top()->iter : nullptr;
|
|
639
663
|
}
|
|
640
664
|
|
|
641
665
|
IteratorWrapper* CurrentReverse() const {
|
|
642
666
|
assert(direction_ == kReverse);
|
|
643
667
|
assert(maxHeap_);
|
|
644
|
-
assert(maxHeap_->empty() ||
|
|
668
|
+
assert(maxHeap_->empty() ||
|
|
669
|
+
maxHeap_->top()->type == HeapItem::Type::ITERATOR);
|
|
645
670
|
return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr;
|
|
646
671
|
}
|
|
647
672
|
};
|
|
648
673
|
|
|
649
|
-
//
|
|
650
|
-
//
|
|
651
|
-
//
|
|
674
|
+
// Pre-condition:
|
|
675
|
+
// - Invariants (3) and (4) hold for i < starting_level
|
|
676
|
+
// - For i < starting_level, range_tombstone_iters_[i].prev.end_key() <
|
|
677
|
+
// `target`.
|
|
678
|
+
// - For i < starting_level, if i in active_, then
|
|
679
|
+
// range_tombstone_iters_[i]->start_key() < `target`.
|
|
680
|
+
//
|
|
681
|
+
// Post-condition:
|
|
682
|
+
// - Invariants (3) and (4) hold for all level i.
|
|
683
|
+
// - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target)
|
|
684
|
+
// for i >= starting_level
|
|
685
|
+
// - (**) target < pinned_heap_item_[i].tombstone_pik if
|
|
686
|
+
// range_tombstone_iters_[i].Valid() for i >= starting_level
|
|
687
|
+
//
|
|
688
|
+
// Proof sketch:
|
|
689
|
+
// Invariant (3) holds for all level i.
|
|
690
|
+
// For j <= i < starting_level, it follows from Pre-condition that (3) holds
|
|
691
|
+
// and that SeekImpl(-, starting_level) does not update children_[i] or
|
|
692
|
+
// range_tombstone_iters_[j].
|
|
693
|
+
// For j < starting_level and i >= starting_level, it follows from
|
|
694
|
+
// - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target`
|
|
695
|
+
// - range_tombstone_iters_[j] is not updated in SeekImpl(), and
|
|
696
|
+
// - children_[i].iter.Seek(current_search_key) is called with
|
|
697
|
+
// current_search_key >= target (shown below).
|
|
698
|
+
// When current_search_key is updated, it is updated to some
|
|
699
|
+
// range_tombstone_iter->end_key() after
|
|
700
|
+
// range_tombstone_iter->SeekInternalKey(current_search_key) was called. So
|
|
701
|
+
// current_search_key increases if updated and >= target.
|
|
702
|
+
// For starting_level <= j <= i:
|
|
703
|
+
// children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2)
|
|
704
|
+
// are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >=
|
|
705
|
+
// k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at
|
|
706
|
+
// the first range tombstone from level j with end_key() > k2. It suffices to
|
|
707
|
+
// show that k1 >= k2. Since k1 and k2 are values of current_search_key where
|
|
708
|
+
// k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2.
|
|
709
|
+
//
|
|
710
|
+
// Invariant (4) holds for all level >= 0.
|
|
711
|
+
// By Pre-condition Invariant (4) holds for i < starting_level.
|
|
712
|
+
// Since children_[i], range_tombstone_iters_[i] and contents of active_ for
|
|
713
|
+
// i < starting_level do not change (4) holds for j <= i < starting_level.
|
|
714
|
+
// By Pre-condition: for all j < starting_level, if j in active_, then
|
|
715
|
+
// range_tombstone_iters_[j]->start_key() < target. For i >= starting_level,
|
|
716
|
+
// children_[i].iter.Seek(k) is called for k >= target. So
|
|
717
|
+
// children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key()
|
|
718
|
+
// for j < starting_level and i >= starting_level. So invariant (4) holds for
|
|
719
|
+
// j < starting_level and i >= starting_level.
|
|
720
|
+
// For starting_level <= j <= i, j is added to active_ only if
|
|
721
|
+
// - range_tombstone_iters_[j]->SeekInternalKey(k1) was called
|
|
722
|
+
// - range_tombstone_iters_[j]->start_key() <= k1
|
|
723
|
+
// Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all
|
|
724
|
+
// starting_level <= j <= i, (4) also holds for all starting_level <= j <= i.
|
|
652
725
|
//
|
|
653
|
-
//
|
|
654
|
-
//
|
|
655
|
-
//
|
|
726
|
+
// Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i,
|
|
727
|
+
// target) for i >= starting_level.
|
|
728
|
+
// target <= children_[i].iter.key() holds because Seek() is called on some
|
|
729
|
+
// current_search_key >= target for children_[i].iter. If current_search_key
|
|
730
|
+
// is updated from k1 to k2 when level = i, we show that the range [k1, k2) is
|
|
731
|
+
// not visible for children_[j] for any j > i. When current_search_key is
|
|
732
|
+
// updated from k1 to k2,
|
|
733
|
+
// - range_tombstone_iters_[i]->SeekInternalKey(k1) was called
|
|
734
|
+
// - range_tombstone_iters_[i]->Valid()
|
|
735
|
+
// - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key
|
|
736
|
+
// - k2 = range_tombstone_iters_[i]->end_key()
|
|
737
|
+
// We assume that range_tombstone_iters_[i]->start_key() has a higher sequence
|
|
738
|
+
// number compared to any key from levels > i that has the same user key. So no
|
|
739
|
+
// point key from levels > i in range [k1, k2) is visible. So
|
|
740
|
+
// children_[i].iter.key() <= LevelNextVisible(i, target).
|
|
741
|
+
//
|
|
742
|
+
// Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >=
|
|
743
|
+
// starting_level if range_tombstone_iters_[i].Valid(). This follows from that
|
|
744
|
+
// SeekInternalKey() being called for each range_tombstone_iters_ with some key
|
|
745
|
+
// >= `target` and that we pick start/end key that is > `target` to insert to
|
|
746
|
+
// minHeap_.
|
|
656
747
|
void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
|
|
657
748
|
bool range_tombstone_reseek) {
|
|
658
749
|
// active range tombstones before `starting_level` remain active
|
|
@@ -665,6 +756,7 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
|
|
|
665
756
|
|
|
666
757
|
// TODO: perhaps we could save some upheap cost by add all child iters first
|
|
667
758
|
// and then do a single heapify.
|
|
759
|
+
// Invariant(children_) for level < starting_level
|
|
668
760
|
for (size_t level = 0; level < starting_level; ++level) {
|
|
669
761
|
PERF_TIMER_GUARD(seek_min_heap_time);
|
|
670
762
|
AddToMinHeapOrCheckStatus(&children_[level]);
|
|
@@ -677,15 +769,20 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
|
|
|
677
769
|
// - If `level` is in active_, then range_tombstone_iters_[level]->Valid()
|
|
678
770
|
// and pinned_heap_item_[level] is of type DELETE_RANGE_END.
|
|
679
771
|
for (size_t level = 0; level < starting_level; ++level) {
|
|
772
|
+
// Restores Invariants(rti), (phi) and (active_) for level <
|
|
773
|
+
// starting_level
|
|
680
774
|
if (range_tombstone_iters_[level] &&
|
|
681
775
|
range_tombstone_iters_[level]->Valid()) {
|
|
682
776
|
// use an iterator on active_ if performance becomes an issue here
|
|
683
777
|
if (active_.count(level) > 0) {
|
|
684
|
-
assert(pinned_heap_item_[level].type ==
|
|
778
|
+
assert(pinned_heap_item_[level].type ==
|
|
779
|
+
HeapItem::Type::DELETE_RANGE_END);
|
|
685
780
|
// if it was active, then start key must be within upper_bound,
|
|
686
781
|
// so we can add to minHeap_ directly.
|
|
687
782
|
minHeap_.push(&pinned_heap_item_[level]);
|
|
688
783
|
} else {
|
|
784
|
+
assert(pinned_heap_item_[level].type ==
|
|
785
|
+
HeapItem::Type::DELETE_RANGE_START);
|
|
689
786
|
// this takes care of checking iterate_upper_bound, but with an extra
|
|
690
787
|
// key comparison if range_tombstone_iters_[level] was already out of
|
|
691
788
|
// bound. Consider using a new HeapItem type or some flag to remember
|
|
@@ -728,45 +825,37 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
|
|
|
728
825
|
}
|
|
729
826
|
auto range_tombstone_iter = range_tombstone_iters_[level];
|
|
730
827
|
if (range_tombstone_iter) {
|
|
731
|
-
range_tombstone_iter->
|
|
828
|
+
range_tombstone_iter->SeekInternalKey(
|
|
829
|
+
current_search_key.GetInternalKey());
|
|
830
|
+
// Invariants (rti) and (phi)
|
|
732
831
|
if (range_tombstone_iter->Valid()) {
|
|
733
|
-
//
|
|
734
|
-
//
|
|
735
|
-
//
|
|
736
|
-
// < current_search_key. This can happen when range_tombstone_iter is
|
|
737
|
-
// truncated and range_tombstone_iter.largest_ has the same user key
|
|
738
|
-
// as current_search_key.GetUserKey() but with a larger sequence
|
|
739
|
-
// number than current_search_key. Correctness is not affected as this
|
|
740
|
-
// tombstone end key will be popped during FindNextVisibleKey().
|
|
832
|
+
// If range tombstone starts after `current_search_key`,
|
|
833
|
+
// we should insert start key to heap as the range tombstone is not
|
|
834
|
+
// active yet.
|
|
741
835
|
InsertRangeTombstoneToMinHeap(
|
|
742
836
|
level, comparator_->Compare(range_tombstone_iter->start_key(),
|
|
743
837
|
pik) > 0 /* start_key */);
|
|
744
|
-
// current_search_key < end_key guaranteed by the
|
|
745
|
-
// calls above.
|
|
746
|
-
//
|
|
747
|
-
//
|
|
838
|
+
// current_search_key < end_key guaranteed by the SeekInternalKey()
|
|
839
|
+
// and Valid() calls above. Here we only need to compare user_key
|
|
840
|
+
// since if target.user_key ==
|
|
841
|
+
// range_tombstone_iter->start_key().user_key and target <
|
|
842
|
+
// range_tombstone_iter->start_key(), no older level would have any
|
|
843
|
+
// key in range [target, range_tombstone_iter->start_key()], so no
|
|
844
|
+
// keys in range [target, range_tombstone_iter->end_key()) from older
|
|
845
|
+
// level would be visible. So it is safe to seek to
|
|
846
|
+
// range_tombstone_iter->end_key().
|
|
748
847
|
//
|
|
749
848
|
// TODO: range_tombstone_iter->Seek() finds the max covering
|
|
750
849
|
// sequence number, can make it cheaper by not looking for max.
|
|
751
850
|
if (comparator_->user_comparator()->Compare(
|
|
752
851
|
range_tombstone_iter->start_key().user_key,
|
|
753
852
|
current_search_key.GetUserKey()) <= 0) {
|
|
754
|
-
// Since range_tombstone_iter->Valid(), seqno should be valid, so
|
|
755
|
-
// there is no need to check it.
|
|
756
853
|
range_tombstone_reseek = true;
|
|
757
|
-
// Current target user key is covered by this range tombstone.
|
|
758
|
-
// All older sorted runs will seek to range tombstone end key.
|
|
759
854
|
// Note that for prefix seek case, it is possible that the prefix
|
|
760
855
|
// is not the same as the original target, it should not affect
|
|
761
856
|
// correctness. Besides, in most cases, range tombstone start and
|
|
762
857
|
// end key should have the same prefix?
|
|
763
|
-
|
|
764
|
-
// boundary, the timestamp in user_key will not be max timestamp,
|
|
765
|
-
// but the timestamp of `range_tombstone_iter.largest_`. This should
|
|
766
|
-
// be fine here as current_search_key is used to Seek into lower
|
|
767
|
-
// levels.
|
|
768
|
-
current_search_key.SetInternalKey(
|
|
769
|
-
range_tombstone_iter->end_key().user_key, kMaxSequenceNumber);
|
|
858
|
+
current_search_key.SetInternalKey(range_tombstone_iter->end_key());
|
|
770
859
|
}
|
|
771
860
|
}
|
|
772
861
|
}
|
|
@@ -818,6 +907,8 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level,
|
|
|
818
907
|
// and `active_` is updated accordingly.
|
|
819
908
|
// See FindNextVisibleKey() for more detail on internal implementation
|
|
820
909
|
// of advancing child iters.
|
|
910
|
+
// When false is returned, if minHeap is not empty, then minHeap_.top().type
|
|
911
|
+
// == ITERATOR
|
|
821
912
|
//
|
|
822
913
|
// REQUIRES:
|
|
823
914
|
// - min heap is currently not empty, and iter is in kForward direction.
|
|
@@ -828,11 +919,14 @@ bool MergingIterator::SkipNextDeleted() {
|
|
|
828
919
|
// - file boundary sentinel keys
|
|
829
920
|
// - range deletion end key
|
|
830
921
|
auto current = minHeap_.top();
|
|
831
|
-
if (current->type == HeapItem::DELETE_RANGE_END) {
|
|
922
|
+
if (current->type == HeapItem::Type::DELETE_RANGE_END) {
|
|
923
|
+
// Invariant(active_): range_tombstone_iters_[current->level] is about to
|
|
924
|
+
// become !Valid() or that its start key is going to be added to minHeap_.
|
|
832
925
|
active_.erase(current->level);
|
|
833
926
|
assert(range_tombstone_iters_[current->level] &&
|
|
834
927
|
range_tombstone_iters_[current->level]->Valid());
|
|
835
928
|
range_tombstone_iters_[current->level]->Next();
|
|
929
|
+
// Maintain Invariants (rti) and (phi)
|
|
836
930
|
if (range_tombstone_iters_[current->level]->Valid()) {
|
|
837
931
|
InsertRangeTombstoneToMinHeap(current->level, true /* start_key */,
|
|
838
932
|
true /* replace_top */);
|
|
@@ -847,41 +941,62 @@ bool MergingIterator::SkipNextDeleted() {
|
|
|
847
941
|
// SetTombstoneKey()).
|
|
848
942
|
assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion ||
|
|
849
943
|
active_.count(current->level) == 0);
|
|
850
|
-
// When entering a new file,
|
|
851
|
-
// but the last key from that range tombstone iter may still be in
|
|
852
|
-
// We need to ensure the data underlying its corresponding key
|
|
853
|
-
// still alive. We do so by popping the range tombstone key from
|
|
854
|
-
// calling iter->Next(). Technically, this change is not needed:
|
|
855
|
-
// a range tombstone end key that is after file boundary
|
|
856
|
-
// minHeap_, the range tombstone end key must have been
|
|
857
|
-
// boundary. The underlying data of the range tombstone
|
|
858
|
-
// SST file's largest internal key stored as file
|
|
859
|
-
// However, since there are too many implicit
|
|
860
|
-
// to just ensure range tombstone iter is
|
|
944
|
+
// When entering a new file, range tombstone iter from the old file is
|
|
945
|
+
// freed, but the last key from that range tombstone iter may still be in
|
|
946
|
+
// the heap. We need to ensure the data underlying its corresponding key
|
|
947
|
+
// Slice is still alive. We do so by popping the range tombstone key from
|
|
948
|
+
// heap before calling iter->Next(). Technically, this change is not needed:
|
|
949
|
+
// if there is a range tombstone end key that is after file boundary
|
|
950
|
+
// sentinel key in minHeap_, the range tombstone end key must have been
|
|
951
|
+
// truncated at file boundary. The underlying data of the range tombstone
|
|
952
|
+
// end key Slice is the SST file's largest internal key stored as file
|
|
953
|
+
// metadata in Version. However, since there are too many implicit
|
|
954
|
+
// assumptions made, it is safer to just ensure range tombstone iter is
|
|
955
|
+
// still alive.
|
|
861
956
|
minHeap_.pop();
|
|
862
957
|
// Remove last SST file's range tombstone end key if there is one.
|
|
863
958
|
// This means file boundary is before range tombstone end key,
|
|
864
959
|
// which could happen when a range tombstone and a user key
|
|
865
960
|
// straddle two SST files. Note that in TruncatedRangeDelIterator
|
|
866
961
|
// constructor, parsed_largest.sequence is decremented by 1 in this case.
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
962
|
+
// Maintains Invariant(rti) that at most one
|
|
963
|
+
// pinned_heap_item_[current->level] is in minHeap_.
|
|
964
|
+
if (range_tombstone_iters_[current->level] &&
|
|
965
|
+
range_tombstone_iters_[current->level]->Valid()) {
|
|
966
|
+
if (!minHeap_.empty() && minHeap_.top()->level == current->level) {
|
|
967
|
+
assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END);
|
|
968
|
+
minHeap_.pop();
|
|
969
|
+
// Invariant(active_): we are about to enter a new SST file with new
|
|
970
|
+
// range_tombstone_iters[current->level]. Either it is !Valid() or its
|
|
971
|
+
// start key is going to be added to minHeap_.
|
|
972
|
+
active_.erase(current->level);
|
|
973
|
+
} else {
|
|
974
|
+
// range tombstone is still valid, but it is not on heap.
|
|
975
|
+
// This should only happen if the range tombstone is over iterator
|
|
976
|
+
// upper bound.
|
|
977
|
+
assert(iterate_upper_bound_ &&
|
|
978
|
+
comparator_->user_comparator()->CompareWithoutTimestamp(
|
|
979
|
+
range_tombstone_iters_[current->level]->start_key().user_key,
|
|
980
|
+
true /* a_has_ts */, *iterate_upper_bound_,
|
|
981
|
+
false /* b_has_ts */) >= 0);
|
|
982
|
+
}
|
|
871
983
|
}
|
|
872
984
|
// LevelIterator enters a new SST file
|
|
873
985
|
current->iter.Next();
|
|
986
|
+
// Invariant(children_): current is popped from heap and added back only if
|
|
987
|
+
// it is valid
|
|
874
988
|
if (current->iter.Valid()) {
|
|
875
989
|
assert(current->iter.status().ok());
|
|
876
990
|
minHeap_.push(current);
|
|
877
991
|
}
|
|
992
|
+
// Invariants (rti) and (phi)
|
|
878
993
|
if (range_tombstone_iters_[current->level] &&
|
|
879
994
|
range_tombstone_iters_[current->level]->Valid()) {
|
|
880
995
|
InsertRangeTombstoneToMinHeap(current->level);
|
|
881
996
|
}
|
|
882
997
|
return true /* current key deleted */;
|
|
883
998
|
}
|
|
884
|
-
assert(current->type == HeapItem::ITERATOR);
|
|
999
|
+
assert(current->type == HeapItem::Type::ITERATOR);
|
|
885
1000
|
// Point key case: check active_ for range tombstone coverage.
|
|
886
1001
|
ParsedInternalKey pik;
|
|
887
1002
|
ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
|
|
@@ -908,6 +1023,7 @@ bool MergingIterator::SkipNextDeleted() {
|
|
|
908
1023
|
if (pik.sequence < range_tombstone_iters_[current->level]->seq()) {
|
|
909
1024
|
// covered by range tombstone
|
|
910
1025
|
current->iter.Next();
|
|
1026
|
+
// Invariant (children_)
|
|
911
1027
|
if (current->iter.Valid()) {
|
|
912
1028
|
minHeap_.replace_top(current);
|
|
913
1029
|
} else {
|
|
@@ -927,7 +1043,7 @@ bool MergingIterator::SkipNextDeleted() {
|
|
|
927
1043
|
}
|
|
928
1044
|
// we can reach here only if active_ is empty
|
|
929
1045
|
assert(active_.empty());
|
|
930
|
-
assert(minHeap_.top()->type == HeapItem::ITERATOR);
|
|
1046
|
+
assert(minHeap_.top()->type == HeapItem::Type::ITERATOR);
|
|
931
1047
|
return false /* current key not deleted */;
|
|
932
1048
|
}
|
|
933
1049
|
|
|
@@ -951,7 +1067,8 @@ void MergingIterator::SeekForPrevImpl(const Slice& target,
|
|
|
951
1067
|
if (range_tombstone_iters_[level] &&
|
|
952
1068
|
range_tombstone_iters_[level]->Valid()) {
|
|
953
1069
|
assert(static_cast<bool>(active_.count(level)) ==
|
|
954
|
-
(pinned_heap_item_[level].type ==
|
|
1070
|
+
(pinned_heap_item_[level].type ==
|
|
1071
|
+
HeapItem::Type::DELETE_RANGE_START));
|
|
955
1072
|
maxHeap_->push(&pinned_heap_item_[level]);
|
|
956
1073
|
} else {
|
|
957
1074
|
assert(!active_.count(level));
|
|
@@ -1056,7 +1173,7 @@ bool MergingIterator::SkipPrevDeleted() {
|
|
|
1056
1173
|
// - file boundary sentinel keys
|
|
1057
1174
|
// - range deletion start key
|
|
1058
1175
|
auto current = maxHeap_->top();
|
|
1059
|
-
if (current->type == HeapItem::DELETE_RANGE_START) {
|
|
1176
|
+
if (current->type == HeapItem::Type::DELETE_RANGE_START) {
|
|
1060
1177
|
active_.erase(current->level);
|
|
1061
1178
|
assert(range_tombstone_iters_[current->level] &&
|
|
1062
1179
|
range_tombstone_iters_[current->level]->Valid());
|
|
@@ -1074,7 +1191,7 @@ bool MergingIterator::SkipPrevDeleted() {
|
|
|
1074
1191
|
maxHeap_->pop();
|
|
1075
1192
|
// Remove last SST file's range tombstone key if there is one.
|
|
1076
1193
|
if (!maxHeap_->empty() && maxHeap_->top()->level == current->level &&
|
|
1077
|
-
maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) {
|
|
1194
|
+
maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) {
|
|
1078
1195
|
maxHeap_->pop();
|
|
1079
1196
|
active_.erase(current->level);
|
|
1080
1197
|
}
|
|
@@ -1090,7 +1207,7 @@ bool MergingIterator::SkipPrevDeleted() {
|
|
|
1090
1207
|
}
|
|
1091
1208
|
return true /* current key deleted */;
|
|
1092
1209
|
}
|
|
1093
|
-
assert(current->type == HeapItem::ITERATOR);
|
|
1210
|
+
assert(current->type == HeapItem::Type::ITERATOR);
|
|
1094
1211
|
// Point key case: check active_ for range tombstone coverage.
|
|
1095
1212
|
ParsedInternalKey pik;
|
|
1096
1213
|
ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError();
|
|
@@ -1136,11 +1253,12 @@ bool MergingIterator::SkipPrevDeleted() {
|
|
|
1136
1253
|
}
|
|
1137
1254
|
|
|
1138
1255
|
assert(active_.empty());
|
|
1139
|
-
assert(maxHeap_->top()->type == HeapItem::ITERATOR);
|
|
1256
|
+
assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR);
|
|
1140
1257
|
return false /* current key not deleted */;
|
|
1141
1258
|
}
|
|
1142
1259
|
|
|
1143
1260
|
void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
|
|
1261
|
+
// Invariant(children_)
|
|
1144
1262
|
if (child->iter.Valid()) {
|
|
1145
1263
|
assert(child->iter.status().ok());
|
|
1146
1264
|
minHeap_.push(child);
|
|
@@ -1164,6 +1282,7 @@ void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) {
|
|
|
1164
1282
|
// Advance all range tombstones iters, including the one corresponding to
|
|
1165
1283
|
// current_, to the first tombstone with end_key > current_.key().
|
|
1166
1284
|
// TODO: potentially do cascading seek here too
|
|
1285
|
+
// TODO: show that invariants hold
|
|
1167
1286
|
void MergingIterator::SwitchToForward() {
|
|
1168
1287
|
ClearHeaps();
|
|
1169
1288
|
Slice target = key();
|
|
@@ -1177,7 +1296,7 @@ void MergingIterator::SwitchToForward() {
|
|
|
1177
1296
|
if (child.iter.status() == Status::TryAgain()) {
|
|
1178
1297
|
continue;
|
|
1179
1298
|
}
|
|
1180
|
-
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
|
|
1299
|
+
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
|
|
1181
1300
|
assert(child.iter.status().ok());
|
|
1182
1301
|
child.iter.Next();
|
|
1183
1302
|
}
|
|
@@ -1188,7 +1307,7 @@ void MergingIterator::SwitchToForward() {
|
|
|
1188
1307
|
for (auto& child : children_) {
|
|
1189
1308
|
if (child.iter.status() == Status::TryAgain()) {
|
|
1190
1309
|
child.iter.Seek(target);
|
|
1191
|
-
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
|
|
1310
|
+
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
|
|
1192
1311
|
assert(child.iter.status().ok());
|
|
1193
1312
|
child.iter.Next();
|
|
1194
1313
|
}
|
|
@@ -1239,7 +1358,7 @@ void MergingIterator::SwitchToBackward() {
|
|
|
1239
1358
|
if (&child.iter != current_) {
|
|
1240
1359
|
child.iter.SeekForPrev(target);
|
|
1241
1360
|
TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
|
|
1242
|
-
if (child.iter.Valid() && comparator_->Equal(target, child.key())) {
|
|
1361
|
+
if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) {
|
|
1243
1362
|
assert(child.iter.status().ok());
|
|
1244
1363
|
child.iter.Prev();
|
|
1245
1364
|
}
|
|
@@ -1297,32 +1416,201 @@ void MergingIterator::ClearHeaps(bool clear_active) {
|
|
|
1297
1416
|
|
|
1298
1417
|
void MergingIterator::InitMaxHeap() {
|
|
1299
1418
|
if (!maxHeap_) {
|
|
1300
|
-
maxHeap_ =
|
|
1419
|
+
maxHeap_ =
|
|
1420
|
+
std::make_unique<MergerMaxIterHeap>(MaxHeapItemComparator(comparator_));
|
|
1301
1421
|
}
|
|
1302
1422
|
}
|
|
1303
1423
|
|
|
1304
|
-
//
|
|
1305
|
-
//
|
|
1306
|
-
//
|
|
1307
|
-
//
|
|
1308
|
-
//
|
|
1309
|
-
//
|
|
1424
|
+
// Assume there is a next key that is not covered by range tombstone.
|
|
1425
|
+
// Pre-condition:
|
|
1426
|
+
// - Invariants (3) and (4)
|
|
1427
|
+
// - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i,
|
|
1428
|
+
// k) for all levels i (LevelNextVisible() defined in Seek()).
|
|
1429
|
+
//
|
|
1430
|
+
// Define NextVisible(k) to be the first key >= k from among children_ that
|
|
1431
|
+
// is not covered by any range tombstone.
|
|
1432
|
+
// Post-condition:
|
|
1433
|
+
// - Invariants (1)-(4) hold
|
|
1434
|
+
// - (*): minHeap_->top()->key() == NextVisible(k)
|
|
1435
|
+
//
|
|
1436
|
+
// Loop invariants:
|
|
1437
|
+
// - Invariants (3) and (4)
|
|
1438
|
+
// - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k)
|
|
1439
|
+
//
|
|
1440
|
+
// Progress: minHeap_.top()->key() is non-decreasing and strictly increases in
|
|
1441
|
+
// a finite number of iterations.
|
|
1442
|
+
// TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with
|
|
1443
|
+
// k2 < k1 in the same FindNextVisibleKey(). For example, l1 has a range
|
|
1444
|
+
// tombstone [2,3) and l2 has a range tombstone [1, 4). Point key 1 from l5
|
|
1445
|
+
// triggers SeekImpl(4 /* target */, 5). Then point key 2 from l3 triggers
|
|
1446
|
+
// SeekImpl(3 /* target */, 3).
|
|
1447
|
+
// Ideally we should only move iterators forward in SeekImpl(), and the
|
|
1448
|
+
// progress condition can be made simpler: iterator only moves forward.
|
|
1449
|
+
//
|
|
1450
|
+
// Proof sketch:
|
|
1451
|
+
// Post-condition:
|
|
1452
|
+
// Invariant (1) holds when this method returns:
|
|
1453
|
+
// Ignoring the empty minHeap_ case, there are two cases:
|
|
1454
|
+
// Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey()
|
|
1455
|
+
// By invariants (rti) and (active_), active_ being empty means if a
|
|
1456
|
+
// pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note
|
|
1457
|
+
// that PopDeleteRangeStart() was called right before the while loop condition,
|
|
1458
|
+
// so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must
|
|
1459
|
+
// be of type ITERATOR.
|
|
1460
|
+
// Case 2: SkipNextDeleted() returns false. The method returns false only when
|
|
1461
|
+
// minHeap_.top().type == ITERATOR.
|
|
1462
|
+
//
|
|
1463
|
+
// Invariant (2) holds when this method returns:
|
|
1464
|
+
// From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is
|
|
1465
|
+
// children_[i] for some i. Suppose that children_[i].iter.key() is covered by
|
|
1466
|
+
// some range tombstone. This means there is a j <= i and a range tombstone from
|
|
1467
|
+
// level j with start_key() < children_[i].iter.key() < end_key().
|
|
1468
|
+
// - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi),
|
|
1469
|
+
// pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik
|
|
1470
|
+
// is either start or end key of this range tombstone. If
|
|
1471
|
+
// pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at
|
|
1472
|
+
// top of minHeap_ which would contradict Invariant (1). So
|
|
1473
|
+
// pinned_heap_item_[j].tombstone_pik > children_[i].iter.key().
|
|
1474
|
+
// By Invariant (3), range_tombstone_iters_[j].prev.end_key() <
|
|
1475
|
+
// children_[i].iter.key(). We assume that in each level, range tombstones
|
|
1476
|
+
// cover non-overlapping ranges. So range_tombstone_iters_[j] is at
|
|
1477
|
+
// the range tombstone with start_key() < children_[i].iter.key() < end_key()
|
|
1478
|
+
// and has its end_key() in minHeap_. By Invariants (phi) and (active_),
|
|
1479
|
+
// j is in active_. From while loop condition, SkipNextDeleted() must have
|
|
1480
|
+
// returned false for this method to return.
|
|
1481
|
+
// - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i)
|
|
1482
|
+
// was called for some j' < i and j' in active_. Note that since j' is in
|
|
1483
|
+
// active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik =
|
|
1484
|
+
// range_tombstone_iters_[j']->end_key(). So
|
|
1485
|
+
// range_tombstone_iters_[j']->end_key() must be larger than
|
|
1486
|
+
// children_[i].iter.key() to not be at top of minHeap_. This means after
|
|
1487
|
+
// SeekImpl(), children_[i] would be at a key > children_[i].iter.key()
|
|
1488
|
+
// -- contradiction.
|
|
1489
|
+
// - If j == i, children_[i]->Next() would have been called and children_[i]
|
|
1490
|
+
// would be at a key > children_[i].iter.key() -- contradiction.
|
|
1491
|
+
// - If !range_tombstone_iters_[j]->Valid(). Then range_tombstone_iters_[j]
|
|
1492
|
+
// points to an SST file with all range tombstones from that file exhausted.
|
|
1493
|
+
// The file must come before the file containing the first
|
|
1494
|
+
// range tombstone with start_key() < children_[i].iter.key() < end_key().
|
|
1495
|
+
// Assume files from same level have non-overlapping ranges, the current file's
|
|
1496
|
+
// meta.largest is less than children_[i].iter.key(). So the file boundary key,
|
|
1497
|
+
// which has value meta.largest must have been popped from minHeap_ before
|
|
1498
|
+
// children_[i].iter.key(). So range_tombstone_iters_[j] would not point to
|
|
1499
|
+
// this SST file -- contradiction.
|
|
1500
|
+
// So it is impossible for children_[i].iter.key() to be covered by a range
|
|
1501
|
+
// tombstone.
|
|
1502
|
+
//
|
|
1503
|
+
// Post-condition (*) holds when the function returns:
|
|
1504
|
+
// From loop invariant (*) that k <= children_[i].iter.key() <=
|
|
1505
|
+
// LevelNextVisible(i, k) and Invariant (2) above, when the function returns,
|
|
1506
|
+
// minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels
|
|
1507
|
+
// i. This is equal to NextVisible(k).
|
|
1508
|
+
//
|
|
1509
|
+
// Invariant (3) holds after each iteration:
|
|
1510
|
+
// PopDeleteRangeStart() does not change range tombstone position.
|
|
1511
|
+
// In SkipNextDeleted():
|
|
1512
|
+
// - If DELETE_RANGE_END is popped from minHeap_, it means the range
|
|
1513
|
+
// tombstone's end key is < all other point keys, so it is safe to advance to
|
|
1514
|
+
// next range tombstone.
|
|
1515
|
+
// - If file boundary is popped (current->iter.IsDeleteRangeSentinelKey()),
|
|
1516
|
+
// we assume that file's last range tombstone's
|
|
1517
|
+
// end_key <= file boundary key < all other point keys. So it is safe to
|
|
1518
|
+
// move to the first range tombstone in the next SST file.
|
|
1519
|
+
// - If children_[i]->Next() is called, then it is fine as it is advancing a
|
|
1520
|
+
// point iterator.
|
|
1521
|
+
// - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s
|
|
1522
|
+
// post-condition if its pre-condition holds. First pre-condition follows
|
|
1523
|
+
// from loop invariant where Invariant (3) holds for all levels i.
|
|
1524
|
+
// Now we show that the second pre-condition holds. Since Invariant (3) holds for
|
|
1525
|
+
// all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key()
|
|
1526
|
+
// < children_[l].iter.key(). `target` is the value of
|
|
1527
|
+
// range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_.
|
|
1528
|
+
// By Invariant (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and
|
|
1529
|
+
// pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key().
|
|
1530
|
+
// This end_key must be larger than children_[l].key() since it was not at top
|
|
1531
|
+
// of minHeap_. So for all levels j <= l,
|
|
1532
|
+
// range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target
|
|
1533
|
+
//
|
|
1534
|
+
// Invariant (4) holds after each iteration:
|
|
1535
|
+
// A level i is inserted into active_ during calls to PopDeleteRangeStart().
|
|
1536
|
+
// In that case, range_tombstone_iters_[i].start_key() < all point keys
|
|
1537
|
+
// by heap property and the assumption that point keys and range tombstone keys
|
|
1538
|
+
// are distinct.
|
|
1539
|
+
// If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j]
|
|
1540
|
+
// where target = range_tombstone_iters_[j]->end_key() and children_[l]->key()
|
|
1541
|
+
// < target. By loop invariants, (3) and (4) hold for all levels.
|
|
1542
|
+
// Since target > children_[l]->key(), it also holds that for j < l,
|
|
1543
|
+
// range_tombstone_iters_[j].prev.end_key() < target and that if j in active_,
|
|
1544
|
+
// range_tombstone_iters_[j]->start_key() < target. So all pre-conditions of
|
|
1545
|
+
// SeekImpl(target, l) hold, and (4) follows from its post-condition.
|
|
1546
|
+
// All other places in this function either advance point iterators
|
|
1547
|
+
// or remove some level from active_, so (4) still holds.
|
|
1548
|
+
//
|
|
1549
|
+
// Loop invariant (*): for all levels i, k <= children_[i] <= LevelNextVisible(i,
|
|
1550
|
+
// k).
|
|
1551
|
+
// k <= children_[i] follows from loop `progress` condition.
|
|
1552
|
+
// Consider when children_[i] is changed for any i. It is through
|
|
1553
|
+
// children_[i].iter.Next() or SeekImpl() in SkipNextDeleted().
|
|
1554
|
+
// If children_[i].iter.Next() is called, there is a range tombstone from level
|
|
1555
|
+
// i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_.
|
|
1556
|
+
// By Invariant (4), tombstone's start_key < children_[i].iter.key(). By
|
|
1557
|
+
// invariants (active_), (phi), and (rti), tombstone's end_key is in minHeap_
|
|
1558
|
+
// and that children_[i].iter.key() < end_key. So children_[i].iter.key() is
|
|
1559
|
+
// not visible, and it is safe to call Next().
|
|
1560
|
+
// If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns,
|
|
1561
|
+
// target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l,
|
|
1562
|
+
// and children_[<l] is not touched. We know `target` is
|
|
1563
|
+
// range_tombstone_iters_[j]->end_key() for some j < i and j is in active_.
|
|
1564
|
+
// By Invariant (4), range_tombstone_iters_[j]->start_key() <
|
|
1565
|
+
// children_[i].iter.key() for all i >= l. So for each level i >= l, the range
|
|
1566
|
+
// [children_[i].iter.key(), target) is not visible. So after SeekImpl(),
|
|
1567
|
+
// children_[i].iter.key() <= LevelNextVisible(i, target) <=
|
|
1568
|
+
// LevelNextVisible(i, k).
|
|
1569
|
+
//
|
|
1570
|
+
// `Progress` holds for each iteration:
|
|
1571
|
+
// Very sloppy intuition:
|
|
1572
|
+
// - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_
|
|
1573
|
+
// is updated from the start key to the end key of the same range tombstone.
|
|
1574
|
+
// We assume that start key <= end key for the same range tombstone.
|
|
1575
|
+
// - in SkipNextDeleted()
|
|
1576
|
+
// - If the top of heap is DELETE_RANGE_END, the range tombstone is advanced
|
|
1577
|
+
// and the relevant pinned_heap_item_.tombstone_pik is increased or popped
|
|
1578
|
+
// from minHeap_.
|
|
1579
|
+
// - If the top of heap is a file boundary key, then both point iter and
|
|
1580
|
+
// range tombstone iter are advanced to the next file.
|
|
1581
|
+
// - If the top of heap is ITERATOR and current->iter.Next() is called, it
|
|
1582
|
+
// moves to a larger point key.
|
|
1583
|
+
// - If the top of heap is ITERATOR and SeekImpl(k, l) is called, then all
|
|
1584
|
+
// iterators from levels >= l are advanced to some key >= k by its contract.
|
|
1585
|
+
// And top of minHeap_ before SeekImpl(k, l) was less than k.
|
|
1586
|
+
// There are special cases where different heap items have the same key,
|
|
1587
|
+
// e.g. when two range tombstone end keys share the same value. In
|
|
1588
|
+
// these cases, iterators are being advanced, so the minimum key should increase
|
|
1589
|
+
// in a finite number of steps.
|
|
1310
1590
|
inline void MergingIterator::FindNextVisibleKey() {
|
|
1311
|
-
// When active_ is empty, we know heap top cannot be a range tombstone end
|
|
1312
|
-
// key. It cannot be a range tombstone start key per PopDeleteRangeStart().
|
|
1313
1591
|
PopDeleteRangeStart();
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1592
|
+
// PopDeleteRangeStart() implies heap top is not DELETE_RANGE_START
|
|
1593
|
+
// active_ being empty implies no DELETE_RANGE_END in heap.
|
|
1594
|
+
// So minHeap_.top() must be of type ITERATOR.
|
|
1595
|
+
while (
|
|
1596
|
+
!minHeap_.empty() &&
|
|
1597
|
+
(!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) &&
|
|
1598
|
+
SkipNextDeleted()) {
|
|
1317
1599
|
PopDeleteRangeStart();
|
|
1318
1600
|
}
|
|
1601
|
+
// Checks Invariant (1)
|
|
1602
|
+
assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR);
|
|
1319
1603
|
}
|
|
1320
1604
|
|
|
1321
1605
|
inline void MergingIterator::FindPrevVisibleKey() {
|
|
1322
1606
|
PopDeleteRangeEnd();
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1607
|
+
// PopDeleteRangeEnd() implies heap top is not DELETE_RANGE_END
|
|
1608
|
+
// active_ being empty implies no DELETE_RANGE_START in heap.
|
|
1609
|
+
// So maxHeap_->top() must be of type ITERATOR.
|
|
1610
|
+
while (
|
|
1611
|
+
!maxHeap_->empty() &&
|
|
1612
|
+
(!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) &&
|
|
1613
|
+
SkipPrevDeleted()) {
|
|
1326
1614
|
PopDeleteRangeEnd();
|
|
1327
1615
|
}
|
|
1328
1616
|
}
|