@nxtedition/rocksdb 7.0.26 → 7.0.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +67 -25
- package/chained-batch.js +1 -1
- package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -0
- package/deps/rocksdb/rocksdb/Makefile +3 -0
- package/deps/rocksdb/rocksdb/TARGETS +10 -0
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +17 -7
- package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
- package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
- package/deps/rocksdb/rocksdb/cache/charged_cache.cc +117 -0
- package/deps/rocksdb/rocksdb/cache/charged_cache.h +121 -0
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +270 -180
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +412 -124
- package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +1 -0
- package/deps/rocksdb/rocksdb/cache/lru_cache.cc +1 -1
- package/deps/rocksdb/rocksdb/cache/lru_cache.h +2 -2
- package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
- package/deps/rocksdb/rocksdb/cache/sharded_cache.h +1 -1
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +71 -9
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +11 -2
- package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +21 -14
- package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +68 -7
- package/deps/rocksdb/rocksdb/db/blob/blob_source.h +16 -0
- package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +519 -12
- package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +120 -0
- package/deps/rocksdb/rocksdb/db/builder.cc +15 -5
- package/deps/rocksdb/rocksdb/db/builder.h +3 -0
- package/deps/rocksdb/rocksdb/db/c.cc +18 -0
- package/deps/rocksdb/rocksdb/db/c_test.c +18 -0
- package/deps/rocksdb/rocksdb/db/column_family.h +2 -0
- package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +3 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +9 -4
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +15 -10
- package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +36 -34
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +50 -13
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +12 -0
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +13 -17
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +26 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +0 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +93 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +3 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +17 -5
- package/deps/rocksdb/rocksdb/db/db_test.cc +0 -3
- package/deps/rocksdb/rocksdb/db/db_test2.cc +39 -12
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +9 -0
- package/deps/rocksdb/rocksdb/db/db_test_util.h +2 -0
- package/deps/rocksdb/rocksdb/db/dbformat.cc +0 -38
- package/deps/rocksdb/rocksdb/db/dbformat.h +14 -13
- package/deps/rocksdb/rocksdb/db/dbformat_test.cc +5 -2
- package/deps/rocksdb/rocksdb/db/event_helpers.cc +13 -1
- package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +0 -10
- package/deps/rocksdb/rocksdb/db/flush_job.cc +19 -15
- package/deps/rocksdb/rocksdb/db/flush_job.h +7 -0
- package/deps/rocksdb/rocksdb/db/flush_job_test.cc +21 -15
- package/deps/rocksdb/rocksdb/db/forward_iterator.h +4 -3
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +9 -0
- package/deps/rocksdb/rocksdb/db/memtable_list.h +5 -0
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +53 -12
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +14 -2
- package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +10 -10
- package/deps/rocksdb/rocksdb/db/repair.cc +8 -6
- package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +890 -0
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +324 -0
- package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +186 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -0
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -4
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -2
- package/deps/rocksdb/rocksdb/env/env_test.cc +74 -1
- package/deps/rocksdb/rocksdb/env/io_posix.cc +11 -8
- package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +28 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/c.h +14 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +4 -4
- package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +30 -23
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +3 -13
- package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +1 -2
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
- package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
- package/deps/rocksdb/rocksdb/options/cf_options.cc +14 -1
- package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
- package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -56
- package/deps/rocksdb/rocksdb/options/db_options.cc +4 -5
- package/deps/rocksdb/rocksdb/options/options.cc +11 -1
- package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -0
- package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
- package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -0
- package/deps/rocksdb/rocksdb/options/options_test.cc +4 -0
- package/deps/rocksdb/rocksdb/src.mk +3 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +6 -1
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +4 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +36 -3
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +36 -1
- package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +14 -3
- package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
- package/deps/rocksdb/rocksdb/table/meta_blocks.cc +6 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +5 -0
- package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +3 -0
- package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -7
- package/deps/rocksdb/rocksdb/table/table_builder.h +7 -3
- package/deps/rocksdb/rocksdb/table/table_properties.cc +9 -0
- package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +3 -2
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +58 -30
- package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +1 -0
- package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +20 -0
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +29 -154
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +16 -34
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +0 -92
- package/deps/rocksdb/rocksdb/util/timer.h +6 -0
- package/deps/rocksdb/rocksdb/util/vector_iterator.h +4 -3
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -45
- package/deps/rocksdb/rocksdb/utilities/debug.cc +40 -0
- package/deps/rocksdb/rocksdb.gyp +2 -0
- package/index.js +4 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
//
|
|
3
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
4
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
5
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
6
|
+
|
|
7
|
+
#include "db/seqno_to_time_mapping.h"
|
|
8
|
+
|
|
9
|
+
#include "db/version_edit.h"
|
|
10
|
+
#include "util/string_util.h"
|
|
11
|
+
|
|
12
|
+
namespace ROCKSDB_NAMESPACE {
|
|
13
|
+
|
|
14
|
+
uint64_t SeqnoToTimeMapping::GetOldestApproximateTime(
|
|
15
|
+
const SequenceNumber seqno) const {
|
|
16
|
+
assert(is_sorted_);
|
|
17
|
+
auto it = std::upper_bound(seqno_time_mapping_.begin(),
|
|
18
|
+
seqno_time_mapping_.end(), seqno);
|
|
19
|
+
if (it == seqno_time_mapping_.begin()) {
|
|
20
|
+
return 0;
|
|
21
|
+
}
|
|
22
|
+
it--;
|
|
23
|
+
return it->time;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// Appends an arbitrary (seqno, time) pair without any ordering check and
// marks the list as unsorted. Pairs with seqno 0 are silently ignored.
void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) {
  if (seqno != 0) {
    is_sorted_ = false;
    seqno_time_mapping_.emplace_back(seqno, time);
  }
}
|
|
33
|
+
|
|
34
|
+
// Drops entries that are older than `now - max_time_duration_`, keeping the
// newest entry at or before that cut-off so lookups around the cut-off still
// have a lower bound.
//
// Returns 0 when truncation is disabled (max_time_duration_ == 0) or when
// every entry is newer than the cut-off; otherwise returns the sequence
// number of the oldest entry that remains.
// Requires the list to be sorted.
SequenceNumber SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
  assert(is_sorted_);

  if (max_time_duration_ == 0) {
    return 0;
  }

  // Clamp at 0 instead of wrapping when `now` is smaller than the duration.
  uint64_t cut_off_time = 0;
  if (now > max_time_duration_) {
    cut_off_time = now - max_time_duration_;
  }
  assert(cut_off_time <= now);  // no overflow

  const auto is_after = [](uint64_t target, const SeqnoTimePair& entry) {
    return target < entry.time;
  };
  // First entry whose time is strictly after the cut-off.
  auto first_kept =
      std::upper_bound(seqno_time_mapping_.begin(), seqno_time_mapping_.end(),
                       cut_off_time, is_after);
  if (first_kept == seqno_time_mapping_.begin()) {
    return 0;
  }
  // Step back so the last entry at/before the cut-off is retained.
  --first_kept;
  seqno_time_mapping_.erase(seqno_time_mapping_.begin(), first_kept);

  return seqno_time_mapping_.front().seqno;
}
|
|
58
|
+
|
|
59
|
+
// The encoded format is:
|
|
60
|
+
// [num_of_entries][[seqno][time],[seqno][time],...]
|
|
61
|
+
// ^ ^
|
|
62
|
+
// var_int delta_encoded (var_int)
|
|
63
|
+
void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start,
|
|
64
|
+
const SequenceNumber end, const uint64_t now,
|
|
65
|
+
const uint64_t output_size) const {
|
|
66
|
+
assert(is_sorted_);
|
|
67
|
+
if (start > end) {
|
|
68
|
+
// It could happen when the SST file is empty, the initial value of min
|
|
69
|
+
// sequence number is kMaxSequenceNumber and max is 0.
|
|
70
|
+
// The empty output file will be removed in the final step of compaction.
|
|
71
|
+
return;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
auto start_it = std::upper_bound(seqno_time_mapping_.begin(),
|
|
75
|
+
seqno_time_mapping_.end(), start);
|
|
76
|
+
if (start_it != seqno_time_mapping_.begin()) {
|
|
77
|
+
start_it--;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
auto end_it = std::upper_bound(seqno_time_mapping_.begin(),
|
|
81
|
+
seqno_time_mapping_.end(), end);
|
|
82
|
+
if (end_it == seqno_time_mapping_.begin()) {
|
|
83
|
+
return;
|
|
84
|
+
}
|
|
85
|
+
if (start_it >= end_it) {
|
|
86
|
+
return;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// truncate old entries that are not needed
|
|
90
|
+
if (max_time_duration_ > 0) {
|
|
91
|
+
const uint64_t cut_off_time =
|
|
92
|
+
now > max_time_duration_ ? now - max_time_duration_ : 0;
|
|
93
|
+
while (start_it < end_it && start_it->time < cut_off_time) {
|
|
94
|
+
start_it++;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// If there are more data than needed, pick the entries for encoding.
|
|
99
|
+
// It's not the most optimized algorithm for selecting the best representative
|
|
100
|
+
// entries over the time.
|
|
101
|
+
// It starts from the beginning and makes sure the distance is larger than
|
|
102
|
+
// `(end - start) / size` before selecting the number. For example, for the
|
|
103
|
+
// following list, pick 3 entries (it will pick seqno #1, #6, #8):
|
|
104
|
+
// 1 -> 10
|
|
105
|
+
// 5 -> 17
|
|
106
|
+
// 6 -> 25
|
|
107
|
+
// 8 -> 30
|
|
108
|
+
// first, it always picks the first one, then there are 2 num_entries_to_fill
|
|
109
|
+
// and the time difference between current one vs. the last one is
|
|
110
|
+
// (30 - 10) = 20. 20/2 = 10. So it will skip until 10+10 = 20. => it skips
|
|
111
|
+
// #5 and pick #6.
|
|
112
|
+
// But the most optimized solution is picking #1 #5 #8, as it will be more
|
|
113
|
+
// evenly distributed for time. Anyway the following algorithm is simple and
|
|
114
|
+
// may over-select new data, which is good. We do want more accurate time
|
|
115
|
+
// information for recent data.
|
|
116
|
+
std::deque<SeqnoTimePair> output_copy;
|
|
117
|
+
if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) {
|
|
118
|
+
int64_t num_entries_to_fill = static_cast<int64_t>(output_size);
|
|
119
|
+
auto last_it = end_it;
|
|
120
|
+
last_it--;
|
|
121
|
+
uint64_t end_time = last_it->time;
|
|
122
|
+
uint64_t skip_until_time = 0;
|
|
123
|
+
for (auto it = start_it; it < end_it; it++) {
|
|
124
|
+
// skip if it's not reach the skip_until_time yet
|
|
125
|
+
if (std::distance(it, end_it) > num_entries_to_fill &&
|
|
126
|
+
it->time < skip_until_time) {
|
|
127
|
+
continue;
|
|
128
|
+
}
|
|
129
|
+
output_copy.push_back(*it);
|
|
130
|
+
num_entries_to_fill--;
|
|
131
|
+
if (std::distance(it, end_it) > num_entries_to_fill &&
|
|
132
|
+
num_entries_to_fill > 0) {
|
|
133
|
+
// If there are more entries than we need, re-calculate the
|
|
134
|
+
// skip_until_time, which means skip until that time
|
|
135
|
+
skip_until_time =
|
|
136
|
+
it->time + ((end_time - it->time) / num_entries_to_fill);
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Make sure all entries are filled
|
|
141
|
+
assert(num_entries_to_fill == 0);
|
|
142
|
+
start_it = output_copy.begin();
|
|
143
|
+
end_it = output_copy.end();
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// Delta encode the data
|
|
147
|
+
uint64_t size = std::distance(start_it, end_it);
|
|
148
|
+
PutVarint64(&dest, size);
|
|
149
|
+
SeqnoTimePair base;
|
|
150
|
+
for (auto it = start_it; it < end_it; it++) {
|
|
151
|
+
assert(base < *it);
|
|
152
|
+
SeqnoTimePair val = *it - base;
|
|
153
|
+
base = *it;
|
|
154
|
+
val.Encode(dest);
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Decodes `seqno_time_mapping_str` (count followed by delta-encoded pairs)
// and appends the decoded entries to this object, marking the list unsorted.
//
// An empty string is accepted and adds nothing. Returns Corruption when the
// count or any pair cannot be read; entries decoded before the failure remain
// in the list.
Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) {
  Slice input(seqno_time_mapping_str);
  if (input.empty()) {
    return Status::OK();
  }

  uint64_t num_entries;
  if (!GetVarint64(&input, &num_entries)) {
    return Status::Corruption("Invalid sequence number time size");
  }

  is_sorted_ = false;
  // Running sum of the deltas read so far, i.e. the absolute value of the
  // most recently decoded pair.
  SeqnoTimePair current;
  for (uint64_t remaining = num_entries; remaining > 0; --remaining) {
    SeqnoTimePair delta;
    const Status decode_status = delta.Decode(input);
    if (!decode_status.ok()) {
      return decode_status;
    }
    current.Add(delta);
    seqno_time_mapping_.emplace_back(current);
  }
  return Status::OK();
}
|
|
181
|
+
|
|
182
|
+
// Appends this pair to `dest` as two consecutive varints: seqno, then time.
void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
  std::string* const out = &dest;
  PutVarint64Varint64(out, seqno, time);
}
|
|
185
|
+
|
|
186
|
+
// Reads seqno and then time as varints from `input` into this pair.
// Returns Corruption if either value cannot be read. When the time fails to
// decode, `seqno` has already been overwritten.
Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
  const bool got_seqno = GetVarint64(&input, &seqno);
  if (!got_seqno) {
    return Status::Corruption("Invalid sequence number");
  }
  const bool got_time = GetVarint64(&input, &time);
  if (!got_time) {
    return Status::Corruption("Invalid time");
  }
  return Status::OK();
}
|
|
195
|
+
|
|
196
|
+
// Appends (seqno, time) to the back of the sorted list, keeping it sorted.
//
// Returns false without changing the list when seqno is 0, when seqno or time
// is smaller than the newest entry's, or when the time equals the newest
// entry's time while the seqno is larger. When seqno equals the newest
// entry's seqno, that entry's time is updated in place and true is returned.
// After a successful insert the oldest entry is evicted if the list exceeds
// max_capacity_.
bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
  assert(is_sorted_);

  // skip seq number 0, which may have special meaning, like zeroed out data
  if (seqno == 0) {
    return false;
  }

  if (!Empty()) {
    SeqnoTimePair& newest = Last();
    if (seqno < newest.seqno || time < newest.time) {
      // Would break the ordering of either field.
      return false;
    }
    if (seqno == newest.seqno) {
      newest.time = time;
      return true;
    }
    if (time == newest.time) {
      // new sequence has the same time as old one, no need to add new mapping
      return false;
    }
  }

  seqno_time_mapping_.emplace_back(seqno, time);

  const bool over_capacity = seqno_time_mapping_.size() > max_capacity_;
  if (over_capacity) {
    seqno_time_mapping_.pop_front();
  }
  return true;
}
|
|
224
|
+
|
|
225
|
+
bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
|
|
226
|
+
uint64_t max_time_duration) {
|
|
227
|
+
uint64_t new_max_capacity =
|
|
228
|
+
CalculateMaxCapacity(min_time_duration, max_time_duration);
|
|
229
|
+
if (new_max_capacity == max_capacity_) {
|
|
230
|
+
return false;
|
|
231
|
+
} else if (new_max_capacity < seqno_time_mapping_.size()) {
|
|
232
|
+
uint64_t delta = seqno_time_mapping_.size() - new_max_capacity;
|
|
233
|
+
seqno_time_mapping_.erase(seqno_time_mapping_.begin(),
|
|
234
|
+
seqno_time_mapping_.begin() + delta);
|
|
235
|
+
}
|
|
236
|
+
max_capacity_ = new_max_capacity;
|
|
237
|
+
return true;
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// Sorts the list by (seqno, time) and removes entries that carry no
// information:
//   - entries with seqno 0,
//   - duplicates of the same seqno (the largest time is kept),
//   - entries whose time is not newer than that of a smaller seqno.
// Afterwards both seqno and time are strictly increasing and the list is
// marked as sorted. Always returns OK.
Status SeqnoToTimeMapping::Sort() {
  if (is_sorted_) {
    return Status::OK();
  }
  if (seqno_time_mapping_.empty()) {
    is_sorted_ = true;
    return Status::OK();
  }

  std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_);

  std::sort(copy.begin(), copy.end());

  // The moved-from deque is in an unspecified state; reset it before refill.
  seqno_time_mapping_.clear();

  // remove seqno = 0, which may have special meaning, like zeroed out data
  while (!copy.empty() && copy.front().seqno == 0) {
    copy.pop_front();
  }
  if (copy.empty()) {
    // Every entry had seqno 0 (possible with decoded input): nothing is left
    // to keep, and calling front() on the empty deque would be undefined.
    is_sorted_ = true;
    return Status::OK();
  }

  // `prev` is the candidate entry not yet committed to the output list.
  SeqnoTimePair prev = copy.front();
  for (const auto& it : copy) {
    // If sequence number is the same, pick the one with larger time, which is
    // more accurate than the older time.
    if (it.seqno == prev.seqno) {
      assert(it.time >= prev.time);
      prev.time = it.time;
    } else {
      assert(it.seqno > prev.seqno);
      // If a larger sequence number has an older time which is not useful,
      // skip
      if (it.time > prev.time) {
        seqno_time_mapping_.push_back(prev);
        prev = it;
      }
    }
  }
  seqno_time_mapping_.emplace_back(prev);

  is_sorted_ = true;
  return Status::OK();
}
|
|
281
|
+
|
|
282
|
+
std::string SeqnoToTimeMapping::ToHumanString() const {
|
|
283
|
+
std::string ret;
|
|
284
|
+
for (const auto& seq_time : seqno_time_mapping_) {
|
|
285
|
+
AppendNumberTo(&ret, seq_time.seqno);
|
|
286
|
+
ret.append("->");
|
|
287
|
+
AppendNumberTo(&ret, seq_time.time);
|
|
288
|
+
ret.append(",");
|
|
289
|
+
}
|
|
290
|
+
return ret;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
// Returns a new mapping holding the entries needed to answer lookups for
// `smallest_seqno` and above: everything from the last entry with
// seqno <= smallest_seqno (or from the first entry if there is none) to the
// end. The result is default-constructed, so it does not inherit this
// object's max_time_duration_ / max_capacity_.
SeqnoToTimeMapping SeqnoToTimeMapping::Copy(
    SequenceNumber smallest_seqno) const {
  SeqnoToTimeMapping result;
  auto copy_from = std::upper_bound(seqno_time_mapping_.begin(),
                                    seqno_time_mapping_.end(), smallest_seqno);
  if (copy_from != seqno_time_mapping_.begin()) {
    --copy_from;
  }
  result.seqno_time_mapping_.insert(result.seqno_time_mapping_.end(),
                                    copy_from, seqno_time_mapping_.end());
  return result;
}
|
|
305
|
+
|
|
306
|
+
uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration,
|
|
307
|
+
uint64_t max_time_duration) {
|
|
308
|
+
if (min_time_duration == 0) {
|
|
309
|
+
return 0;
|
|
310
|
+
}
|
|
311
|
+
return std::min(
|
|
312
|
+
kMaxSeqnoToTimeEntries,
|
|
313
|
+
max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Field-wise difference (this - other), used to delta encode consecutive
// pairs. Both fields are unsigned, so the caller is expected to pass an
// `other` that is not larger than this pair.
SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-(
    const SeqnoTimePair& other) const {
  return SeqnoTimePair(seqno - other.seqno, time - other.time);
}
|
|
323
|
+
|
|
324
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
//
|
|
3
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
4
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
5
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include <algorithm>
|
|
10
|
+
#include <cinttypes>
|
|
11
|
+
#include <deque>
|
|
12
|
+
#include <functional>
|
|
13
|
+
#include <iterator>
|
|
14
|
+
#include <string>
|
|
15
|
+
|
|
16
|
+
#include "rocksdb/status.h"
|
|
17
|
+
#include "rocksdb/types.h"
|
|
18
|
+
|
|
19
|
+
namespace ROCKSDB_NAMESPACE {
|
|
20
|
+
|
|
21
|
+
constexpr uint64_t kUnknownSeqnoTime = 0;
|
|
22
|
+
|
|
23
|
+
// SeqnoToTimeMapping stores the sequence number to time mapping, so given a
|
|
24
|
+
// sequence number it can estimate the oldest possible time for that sequence
|
|
25
|
+
// number. For example:
|
|
26
|
+
// 10 -> 100
|
|
27
|
+
// 50 -> 300
|
|
28
|
+
// then if a key has seqno 19, the OldestApproximateTime would be 100, for 51 it
|
|
29
|
+
// would be 300.
|
|
30
|
+
// As it's a sorted list, the new entry is inserted from the back. The old data
|
|
31
|
+
// will be popped from the front if they're no longer used.
|
|
32
|
+
//
|
|
33
|
+
// Note: the data struct is not thread safe, both read and write need to be
|
|
34
|
+
// synchronized by caller.
|
|
35
|
+
class SeqnoToTimeMapping {
|
|
36
|
+
public:
|
|
37
|
+
// Maximum number of entries can be encoded into SST. The data is delta encode
|
|
38
|
+
// so the maximum data usage for each SST is < 0.3K
|
|
39
|
+
static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
|
|
40
|
+
|
|
41
|
+
// Maximum number of entries per CF. If there's only CF with this feature on,
|
|
42
|
+
// the max duration divided by this number, so for example, if
|
|
43
|
+
// preclude_last_level_data_seconds = 100000 (~1day), then it will sample the
|
|
44
|
+
// seqno -> time every 1000 seconds (~17minutes). Then the maximum entry it
|
|
45
|
+
// needs is 100.
|
|
46
|
+
// When there are multiple CFs having this feature on, the sampling cadence is
|
|
47
|
+
// determined by the smallest setting, the capacity is determined the largest
|
|
48
|
+
// setting, also it's caped by kMaxSeqnoTimePairsPerCF * 10.
|
|
49
|
+
static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
|
|
50
|
+
|
|
51
|
+
// A simple struct for sequence number to time pair
|
|
52
|
+
struct SeqnoTimePair {
|
|
53
|
+
SequenceNumber seqno = 0;
|
|
54
|
+
uint64_t time = 0;
|
|
55
|
+
|
|
56
|
+
SeqnoTimePair() = default;
|
|
57
|
+
SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
|
|
58
|
+
: seqno(_seqno), time(_time) {}
|
|
59
|
+
|
|
60
|
+
// Encode to dest string
|
|
61
|
+
void Encode(std::string& dest) const;
|
|
62
|
+
|
|
63
|
+
// Decode the value from input Slice and remove it from the input
|
|
64
|
+
Status Decode(Slice& input);
|
|
65
|
+
|
|
66
|
+
// subtraction of 2 SeqnoTimePair
|
|
67
|
+
SeqnoTimePair operator-(const SeqnoTimePair& other) const;
|
|
68
|
+
|
|
69
|
+
// Add 2 values together
|
|
70
|
+
void Add(const SeqnoTimePair& obj) {
|
|
71
|
+
seqno += obj.seqno;
|
|
72
|
+
time += obj.time;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Compare SeqnoTimePair with a sequence number, used for binary search a
|
|
76
|
+
// sequence number in a list of SeqnoTimePair
|
|
77
|
+
bool operator<(const SequenceNumber& other) const { return seqno < other; }
|
|
78
|
+
|
|
79
|
+
// Compare 2 SeqnoTimePair
|
|
80
|
+
bool operator<(const SeqnoTimePair& other) const {
|
|
81
|
+
return std::tie(seqno, time) < std::tie(other.seqno, other.time);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Check if 2 SeqnoTimePair is the same
|
|
85
|
+
bool operator==(const SeqnoTimePair& other) const {
|
|
86
|
+
return std::tie(seqno, time) == std::tie(other.seqno, other.time);
|
|
87
|
+
}
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
// constractor of SeqnoToTimeMapping
|
|
91
|
+
// max_time_duration is the maximum time it should track. For example, if
|
|
92
|
+
// preclude_last_level_data_seconds is 1 day, then if an entry is older than 1
|
|
93
|
+
// day, then it can be removed.
|
|
94
|
+
// max_capacity is the maximum number of entry it can hold. For single CF,
|
|
95
|
+
// it's caped at 100 (kMaxSeqnoTimePairsPerCF), otherwise
|
|
96
|
+
// kMaxSeqnoTimePairsPerCF * 10.
|
|
97
|
+
// If it's set to 0, means it won't truncate any old data.
|
|
98
|
+
explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0,
|
|
99
|
+
uint64_t max_capacity = 0)
|
|
100
|
+
: max_time_duration_(max_time_duration), max_capacity_(max_capacity) {}
|
|
101
|
+
|
|
102
|
+
// Append a new entry to the list. The new entry should be newer than the
|
|
103
|
+
// existing ones. It maintains the internal sorted status.
|
|
104
|
+
bool Append(SequenceNumber seqno, uint64_t time);
|
|
105
|
+
|
|
106
|
+
// Given a sequence number, estimate it's oldest time
|
|
107
|
+
uint64_t GetOldestApproximateTime(SequenceNumber seqno) const;
|
|
108
|
+
|
|
109
|
+
// Truncate the old entries based on the current time and max_time_duration_
|
|
110
|
+
SequenceNumber TruncateOldEntries(uint64_t now);
|
|
111
|
+
|
|
112
|
+
// Encode to a binary string
|
|
113
|
+
void Encode(std::string& des, SequenceNumber start, SequenceNumber end,
|
|
114
|
+
uint64_t now,
|
|
115
|
+
uint64_t output_size = kMaxSeqnoTimePairsPerSST) const;
|
|
116
|
+
|
|
117
|
+
// Add a new random entry, unlike Append(), it can be any data, but also makes
|
|
118
|
+
// the list un-sorted.
|
|
119
|
+
void Add(SequenceNumber seqno, uint64_t time);
|
|
120
|
+
|
|
121
|
+
// Decode and add the entries to the current obj. The list will be unsorted
|
|
122
|
+
Status Add(const std::string& seqno_time_mapping_str);
|
|
123
|
+
|
|
124
|
+
// Return the number of entries
|
|
125
|
+
size_t Size() const { return seqno_time_mapping_.size(); }
|
|
126
|
+
|
|
127
|
+
// Reduce the size of internal list
|
|
128
|
+
bool Resize(uint64_t min_time_duration, uint64_t max_time_duration);
|
|
129
|
+
|
|
130
|
+
// Override the max_time_duration_
|
|
131
|
+
void SetMaxTimeDuration(uint64_t max_time_duration) {
|
|
132
|
+
max_time_duration_ = max_time_duration;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
uint64_t GetCapacity() const { return max_capacity_; }
|
|
136
|
+
|
|
137
|
+
// Sort the list, which also remove the redundant entries, useless entries,
|
|
138
|
+
// which makes sure the seqno is sorted, but also the time
|
|
139
|
+
Status Sort();
|
|
140
|
+
|
|
141
|
+
// copy the current obj from the given smallest_seqno.
|
|
142
|
+
SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const;
|
|
143
|
+
|
|
144
|
+
// If the internal list is empty
|
|
145
|
+
bool Empty() const { return seqno_time_mapping_.empty(); }
|
|
146
|
+
|
|
147
|
+
// clear all entries
|
|
148
|
+
void Clear() { seqno_time_mapping_.clear(); }
|
|
149
|
+
|
|
150
|
+
// return the string for user message
|
|
151
|
+
// Note: Not efficient, okay for print
|
|
152
|
+
std::string ToHumanString() const;
|
|
153
|
+
|
|
154
|
+
#ifndef NDEBUG
|
|
155
|
+
const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
|
|
156
|
+
return seqno_time_mapping_;
|
|
157
|
+
}
|
|
158
|
+
#endif
|
|
159
|
+
|
|
160
|
+
private:
|
|
161
|
+
static constexpr uint64_t kMaxSeqnoToTimeEntries =
|
|
162
|
+
kMaxSeqnoTimePairsPerCF * 10;
|
|
163
|
+
|
|
164
|
+
uint64_t max_time_duration_;
|
|
165
|
+
uint64_t max_capacity_;
|
|
166
|
+
|
|
167
|
+
std::deque<SeqnoTimePair> seqno_time_mapping_;
|
|
168
|
+
|
|
169
|
+
bool is_sorted_ = true;
|
|
170
|
+
|
|
171
|
+
static uint64_t CalculateMaxCapacity(uint64_t min_time_duration,
|
|
172
|
+
uint64_t max_time_duration);
|
|
173
|
+
|
|
174
|
+
SeqnoTimePair& Last() {
|
|
175
|
+
assert(!Empty());
|
|
176
|
+
return seqno_time_mapping_.back();
|
|
177
|
+
}
|
|
178
|
+
};
|
|
179
|
+
|
|
180
|
+
// for searching the sequence number from SeqnoToTimeMapping
|
|
181
|
+
inline bool operator<(const SequenceNumber& seqno,
|
|
182
|
+
const SeqnoToTimeMapping::SeqnoTimePair& other) {
|
|
183
|
+
return seqno < other.seqno;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -141,6 +141,7 @@ DECLARE_bool(charge_compression_dictionary_building_buffer);
|
|
|
141
141
|
DECLARE_bool(charge_filter_construction);
|
|
142
142
|
DECLARE_bool(charge_table_reader);
|
|
143
143
|
DECLARE_bool(charge_file_metadata);
|
|
144
|
+
DECLARE_bool(charge_blob_cache);
|
|
144
145
|
DECLARE_int32(top_level_index_pinning);
|
|
145
146
|
DECLARE_int32(partition_pinning);
|
|
146
147
|
DECLARE_int32(unpartitioned_pinning);
|
|
@@ -272,6 +273,7 @@ DECLARE_bool(use_blob_cache);
|
|
|
272
273
|
DECLARE_bool(use_shared_block_and_blob_cache);
|
|
273
274
|
DECLARE_uint64(blob_cache_size);
|
|
274
275
|
DECLARE_int32(blob_cache_numshardbits);
|
|
276
|
+
DECLARE_int32(prepopulate_blob_cache);
|
|
275
277
|
|
|
276
278
|
DECLARE_int32(approximate_size_one_in);
|
|
277
279
|
DECLARE_bool(sync_fault_injection);
|
|
@@ -316,24 +316,29 @@ DEFINE_bool(cache_index_and_filter_blocks, false,
|
|
|
316
316
|
|
|
317
317
|
DEFINE_bool(charge_compression_dictionary_building_buffer, false,
|
|
318
318
|
"Setting for "
|
|
319
|
-
"CacheEntryRoleOptions::charged of"
|
|
319
|
+
"CacheEntryRoleOptions::charged of "
|
|
320
320
|
"CacheEntryRole::kCompressionDictionaryBuildingBuffer");
|
|
321
321
|
|
|
322
322
|
DEFINE_bool(charge_filter_construction, false,
|
|
323
323
|
"Setting for "
|
|
324
|
-
"CacheEntryRoleOptions::charged of"
|
|
324
|
+
"CacheEntryRoleOptions::charged of "
|
|
325
325
|
"CacheEntryRole::kFilterConstruction");
|
|
326
326
|
|
|
327
327
|
DEFINE_bool(charge_table_reader, false,
|
|
328
328
|
"Setting for "
|
|
329
|
-
"CacheEntryRoleOptions::charged of"
|
|
329
|
+
"CacheEntryRoleOptions::charged of "
|
|
330
330
|
"CacheEntryRole::kBlockBasedTableReader");
|
|
331
331
|
|
|
332
332
|
DEFINE_bool(charge_file_metadata, false,
|
|
333
333
|
"Setting for "
|
|
334
|
-
"CacheEntryRoleOptions::charged of"
|
|
334
|
+
"CacheEntryRoleOptions::charged of "
|
|
335
335
|
"kFileMetadata");
|
|
336
336
|
|
|
337
|
+
DEFINE_bool(charge_blob_cache, false,
|
|
338
|
+
"Setting for "
|
|
339
|
+
"CacheEntryRoleOptions::charged of "
|
|
340
|
+
"kBlobCache");
|
|
341
|
+
|
|
337
342
|
DEFINE_int32(
|
|
338
343
|
top_level_index_pinning,
|
|
339
344
|
static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
|
|
@@ -474,6 +479,10 @@ DEFINE_int32(blob_cache_numshardbits, 6,
|
|
|
474
479
|
"the block and blob caches are different "
|
|
475
480
|
"(use_shared_block_and_blob_cache = false).");
|
|
476
481
|
|
|
482
|
+
DEFINE_int32(prepopulate_blob_cache, 0,
|
|
483
|
+
"[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 "
|
|
484
|
+
"to disable and 1 to insert during flush.");
|
|
485
|
+
|
|
477
486
|
static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
|
|
478
487
|
RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
|
|
479
488
|
|
|
@@ -270,6 +270,8 @@ bool StressTest::BuildOptionsTable() {
|
|
|
270
270
|
std::vector<std::string>{"0", "1M", "4M"});
|
|
271
271
|
options_tbl.emplace("blob_file_starting_level",
|
|
272
272
|
std::vector<std::string>{"0", "1", "2"});
|
|
273
|
+
options_tbl.emplace("prepopulate_blob_cache",
|
|
274
|
+
std::vector<std::string>{"kDisable", "kFlushOnly"});
|
|
273
275
|
}
|
|
274
276
|
|
|
275
277
|
options_table_ = std::move(options_tbl);
|
|
@@ -2401,9 +2403,12 @@ void StressTest::Open(SharedState* shared) {
|
|
|
2401
2403
|
fprintf(stdout,
|
|
2402
2404
|
"Integrated BlobDB: blob cache enabled, block and blob caches "
|
|
2403
2405
|
"shared: %d, blob cache size %" PRIu64
|
|
2404
|
-
", blob cache num shard bits: %d\n",
|
|
2406
|
+
", blob cache num shard bits: %d, blob cache prepopulated: %s\n",
|
|
2405
2407
|
FLAGS_use_shared_block_and_blob_cache, FLAGS_blob_cache_size,
|
|
2406
|
-
FLAGS_blob_cache_numshardbits
|
|
2408
|
+
FLAGS_blob_cache_numshardbits,
|
|
2409
|
+
options_.prepopulate_blob_cache == PrepopulateBlobCache::kFlushOnly
|
|
2410
|
+
? "flush only"
|
|
2411
|
+
: "disable");
|
|
2407
2412
|
} else {
|
|
2408
2413
|
fprintf(stdout, "Integrated BlobDB: blob cache disabled\n");
|
|
2409
2414
|
}
|
|
@@ -2902,6 +2907,11 @@ void InitializeOptionsFromFlags(
|
|
|
2902
2907
|
{/*.charged = */ FLAGS_charge_file_metadata
|
|
2903
2908
|
? CacheEntryRoleOptions::Decision::kEnabled
|
|
2904
2909
|
: CacheEntryRoleOptions::Decision::kDisabled}});
|
|
2910
|
+
block_based_options.cache_usage_options.options_overrides.insert(
|
|
2911
|
+
{CacheEntryRole::kBlobCache,
|
|
2912
|
+
{/*.charged = */ FLAGS_charge_blob_cache
|
|
2913
|
+
? CacheEntryRoleOptions::Decision::kEnabled
|
|
2914
|
+
: CacheEntryRoleOptions::Decision::kDisabled}});
|
|
2905
2915
|
block_based_options.format_version =
|
|
2906
2916
|
static_cast<uint32_t>(FLAGS_format_version);
|
|
2907
2917
|
block_based_options.index_block_restart_interval =
|
|
@@ -3043,6 +3053,17 @@ void InitializeOptionsFromFlags(
|
|
|
3043
3053
|
exit(1);
|
|
3044
3054
|
}
|
|
3045
3055
|
}
|
|
3056
|
+
switch (FLAGS_prepopulate_blob_cache) {
|
|
3057
|
+
case 0:
|
|
3058
|
+
options.prepopulate_blob_cache = PrepopulateBlobCache::kDisable;
|
|
3059
|
+
break;
|
|
3060
|
+
case 1:
|
|
3061
|
+
options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
|
|
3062
|
+
break;
|
|
3063
|
+
default:
|
|
3064
|
+
fprintf(stderr, "Unknown prepopulate blob cache mode\n");
|
|
3065
|
+
exit(1);
|
|
3066
|
+
}
|
|
3046
3067
|
}
|
|
3047
3068
|
|
|
3048
3069
|
options.wal_compression =
|