@nxtedition/rocksdb 7.1.25 → 7.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +15 -7
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +7 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +18 -2
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +5 -1
- package/deps/rocksdb/rocksdb/db/db_iter.cc +2 -1
- package/deps/rocksdb/rocksdb/db/db_test_util.cc +4 -2
- package/deps/rocksdb/rocksdb/db/db_test_util.h +1 -1
- package/deps/rocksdb/rocksdb/db/version_set.cc +1 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/block_cache_trace_writer.h +149 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +8 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/table.h +1 -1
- package/deps/rocksdb/rocksdb/{table → include/rocksdb}/table_reader_caller.h +0 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -1
- package/deps/rocksdb/rocksdb/table/table_reader.h +1 -1
- package/deps/rocksdb/rocksdb/table/table_test.cc +29 -22
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +1 -1
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer.h +1 -1
- package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +25 -16
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +28 -21
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.h +16 -72
- package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer_test.cc +74 -38
- package/deps/rocksdb/rocksdb/util/user_comparator_wrapper.h +11 -31
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +2 -2
- package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator_test.cc +13 -13
- package/deps/rocksdb/rocksdb.gyp +1 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
package/binding.cc
CHANGED
|
@@ -306,13 +306,15 @@ struct BaseIterator : public Closable {
|
|
|
306
306
|
const std::optional<std::string>& gte,
|
|
307
307
|
const int limit,
|
|
308
308
|
const bool fillCache,
|
|
309
|
-
std::shared_ptr<const rocksdb::Snapshot> snapshot
|
|
309
|
+
std::shared_ptr<const rocksdb::Snapshot> snapshot,
|
|
310
|
+
bool tailing = false)
|
|
310
311
|
: database_(database),
|
|
311
312
|
column_(column),
|
|
312
313
|
snapshot_(snapshot),
|
|
313
314
|
reverse_(reverse),
|
|
314
315
|
limit_(limit),
|
|
315
|
-
fillCache_(fillCache)
|
|
316
|
+
fillCache_(fillCache),
|
|
317
|
+
tailing_(tailing) {
|
|
316
318
|
if (lte) {
|
|
317
319
|
upper_bound_ = rocksdb::PinnableSlice();
|
|
318
320
|
*upper_bound_->GetSelf() = std::move(*lte) + '\0';
|
|
@@ -426,6 +428,7 @@ struct BaseIterator : public Closable {
|
|
|
426
428
|
readOptions.snapshot = snapshot_.get();
|
|
427
429
|
readOptions.async_io = true;
|
|
428
430
|
readOptions.adaptive_readahead = true;
|
|
431
|
+
readOptions.tailing = tailing_;
|
|
429
432
|
|
|
430
433
|
iterator_.reset(database_->db->NewIterator(readOptions, column_));
|
|
431
434
|
}
|
|
@@ -436,6 +439,7 @@ struct BaseIterator : public Closable {
|
|
|
436
439
|
std::unique_ptr<rocksdb::Iterator> iterator_;
|
|
437
440
|
const bool reverse_;
|
|
438
441
|
const int limit_;
|
|
442
|
+
const bool tailing_;
|
|
439
443
|
const bool fillCache_;
|
|
440
444
|
};
|
|
441
445
|
|
|
@@ -454,8 +458,9 @@ struct Iterator final : public BaseIterator {
|
|
|
454
458
|
const Encoding keyEncoding,
|
|
455
459
|
const Encoding valueEncoding,
|
|
456
460
|
const size_t highWaterMarkBytes,
|
|
457
|
-
std::shared_ptr<const rocksdb::Snapshot> snapshot
|
|
458
|
-
|
|
461
|
+
std::shared_ptr<const rocksdb::Snapshot> snapshot,
|
|
462
|
+
bool tailing = false)
|
|
463
|
+
: BaseIterator(database, column, reverse, lt, lte, gt, gte, limit, fillCache, snapshot, tailing),
|
|
459
464
|
keys_(keys),
|
|
460
465
|
values_(values),
|
|
461
466
|
keyEncoding_(keyEncoding),
|
|
@@ -1157,6 +1162,9 @@ NAPI_METHOD(iterator_init) {
|
|
|
1157
1162
|
bool values = true;
|
|
1158
1163
|
NAPI_STATUS_THROWS(GetProperty(env, options, "values", values));
|
|
1159
1164
|
|
|
1165
|
+
bool tailing = false;
|
|
1166
|
+
NAPI_STATUS_THROWS(GetProperty(env, options, "tailing", tailing));
|
|
1167
|
+
|
|
1160
1168
|
bool fillCache = false;
|
|
1161
1169
|
NAPI_STATUS_THROWS(GetProperty(env, options, "fillCache", fillCache));
|
|
1162
1170
|
|
|
@@ -1190,9 +1198,9 @@ NAPI_METHOD(iterator_init) {
|
|
|
1190
1198
|
std::shared_ptr<const rocksdb::Snapshot> snapshot(database->db->GetSnapshot(),
|
|
1191
1199
|
[=](const auto ptr) { database->db->ReleaseSnapshot(ptr); });
|
|
1192
1200
|
|
|
1193
|
-
auto iterator =
|
|
1194
|
-
|
|
1195
|
-
|
|
1201
|
+
auto iterator = std::unique_ptr<Iterator>(new Iterator(database, column, reverse, keys, values, limit, lt, lte, gt,
|
|
1202
|
+
gte, fillCache, keyEncoding, valueEncoding, highWaterMarkBytes,
|
|
1203
|
+
snapshot, tailing));
|
|
1196
1204
|
|
|
1197
1205
|
napi_value result;
|
|
1198
1206
|
NAPI_STATUS_THROWS(napi_create_external(env, iterator.get(), Finalize<Iterator>, iterator.get(), &result));
|
|
@@ -919,11 +919,13 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
|
|
|
919
919
|
uint64_t{ClockHandle::kStateConstruction}
|
|
920
920
|
<< ClockHandle::kStateShift,
|
|
921
921
|
std::memory_order_acquire)) {
|
|
922
|
-
// Took ownership
|
|
923
|
-
|
|
922
|
+
// Took ownership.
|
|
923
|
+
// Save info about h to minimize dependences between atomic updates
|
|
924
|
+
// (e.g. fully relaxed Rollback after h released by marking empty)
|
|
925
|
+
const UniqueId64x2 h_hashed_key = h.hashed_key;
|
|
926
|
+
size_t h_total_charge = h.total_charge;
|
|
924
927
|
// TODO? Delay freeing?
|
|
925
928
|
h.FreeData();
|
|
926
|
-
*freed_charge += h.total_charge;
|
|
927
929
|
#ifndef NDEBUG
|
|
928
930
|
// Mark slot as empty, with assertion
|
|
929
931
|
meta = h.meta.exchange(0, std::memory_order_release);
|
|
@@ -934,7 +936,8 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge,
|
|
|
934
936
|
h.meta.store(0, std::memory_order_release);
|
|
935
937
|
#endif
|
|
936
938
|
*freed_count += 1;
|
|
937
|
-
|
|
939
|
+
*freed_charge += h_total_charge;
|
|
940
|
+
Rollback(h_hashed_key, &h);
|
|
938
941
|
}
|
|
939
942
|
}
|
|
940
943
|
|
|
@@ -5782,8 +5782,24 @@ Status DBImpl::NewDefaultReplayer(
|
|
|
5782
5782
|
Status DBImpl::StartBlockCacheTrace(
|
|
5783
5783
|
const TraceOptions& trace_options,
|
|
5784
5784
|
std::unique_ptr<TraceWriter>&& trace_writer) {
|
|
5785
|
-
|
|
5786
|
-
|
|
5785
|
+
BlockCacheTraceOptions block_trace_opts;
|
|
5786
|
+
block_trace_opts.sampling_frequency = trace_options.sampling_frequency;
|
|
5787
|
+
|
|
5788
|
+
BlockCacheTraceWriterOptions trace_writer_opt;
|
|
5789
|
+
trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size;
|
|
5790
|
+
|
|
5791
|
+
std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
|
|
5792
|
+
NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
|
|
5793
|
+
std::move(trace_writer));
|
|
5794
|
+
|
|
5795
|
+
return block_cache_tracer_.StartTrace(block_trace_opts,
|
|
5796
|
+
std::move(block_cache_trace_writer));
|
|
5797
|
+
}
|
|
5798
|
+
|
|
5799
|
+
Status DBImpl::StartBlockCacheTrace(
|
|
5800
|
+
const BlockCacheTraceOptions& trace_options,
|
|
5801
|
+
std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
|
|
5802
|
+
return block_cache_tracer_.StartTrace(trace_options, std::move(trace_writer));
|
|
5787
5803
|
}
|
|
5788
5804
|
|
|
5789
5805
|
Status DBImpl::EndBlockCacheTrace() {
|
|
@@ -568,9 +568,13 @@ class DBImpl : public DB {
|
|
|
568
568
|
|
|
569
569
|
using DB::StartBlockCacheTrace;
|
|
570
570
|
Status StartBlockCacheTrace(
|
|
571
|
-
const TraceOptions&
|
|
571
|
+
const TraceOptions& trace_options,
|
|
572
572
|
std::unique_ptr<TraceWriter>&& trace_writer) override;
|
|
573
573
|
|
|
574
|
+
Status StartBlockCacheTrace(
|
|
575
|
+
const BlockCacheTraceOptions& options,
|
|
576
|
+
std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
|
|
577
|
+
|
|
574
578
|
using DB::EndBlockCacheTrace;
|
|
575
579
|
Status EndBlockCacheTrace() override;
|
|
576
580
|
|
|
@@ -90,7 +90,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
|
|
|
90
90
|
iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
|
|
91
91
|
}
|
|
92
92
|
status_.PermitUncheckedError();
|
|
93
|
-
assert(timestamp_size_ ==
|
|
93
|
+
assert(timestamp_size_ ==
|
|
94
|
+
user_comparator_.user_comparator()->timestamp_size());
|
|
94
95
|
}
|
|
95
96
|
|
|
96
97
|
Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
|
|
@@ -487,8 +487,10 @@ Options DBTestBase::GetOptions(
|
|
|
487
487
|
case kInfiniteMaxOpenFiles:
|
|
488
488
|
options.max_open_files = -1;
|
|
489
489
|
break;
|
|
490
|
-
case
|
|
491
|
-
|
|
490
|
+
case kCRC32cChecksum: {
|
|
491
|
+
// Old default was CRC32c, but XXH3 (new default) is faster on common
|
|
492
|
+
// hardware
|
|
493
|
+
table_options.checksum = kCRC32c;
|
|
492
494
|
// Thrown in here for basic coverage:
|
|
493
495
|
options.DisableExtraChecks();
|
|
494
496
|
break;
|
|
@@ -1048,7 +1048,7 @@ class DBTestBase : public testing::Test {
|
|
|
1048
1048
|
kUniversalCompactionMultiLevel = 20,
|
|
1049
1049
|
kCompressedBlockCache = 21,
|
|
1050
1050
|
kInfiniteMaxOpenFiles = 22,
|
|
1051
|
-
|
|
1051
|
+
kCRC32cChecksum = 23,
|
|
1052
1052
|
kFIFOCompaction = 24,
|
|
1053
1053
|
kOptimizeFiltersForHits = 25,
|
|
1054
1054
|
kRowCache = 26,
|
|
@@ -1239,7 +1239,7 @@ void LevelIterator::Seek(const Slice& target) {
|
|
|
1239
1239
|
prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
|
|
1240
1240
|
!read_options_.auto_prefix_mode &&
|
|
1241
1241
|
file_index_ < flevel_->num_files - 1) {
|
|
1242
|
-
size_t ts_sz = user_comparator_.timestamp_size();
|
|
1242
|
+
size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
|
|
1243
1243
|
Slice target_user_key_without_ts =
|
|
1244
1244
|
ExtractUserKeyAndStripTimestamp(target, ts_sz);
|
|
1245
1245
|
Slice next_file_first_user_key_without_ts =
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
// Copyright (c) 2022, Meta, Inc. All rights reserved.
|
|
2
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
+
|
|
6
|
+
#pragma once
|
|
7
|
+
|
|
8
|
+
#include "rocksdb/options.h"
|
|
9
|
+
#include "rocksdb/system_clock.h"
|
|
10
|
+
#include "rocksdb/table_reader_caller.h"
|
|
11
|
+
#include "rocksdb/trace_reader_writer.h"
|
|
12
|
+
#include "rocksdb/trace_record.h"
|
|
13
|
+
|
|
14
|
+
namespace ROCKSDB_NAMESPACE {
|
|
15
|
+
// A record for block cache lookups/inserts. This is passed by the table
|
|
16
|
+
// reader to the BlockCacheTraceWriter for every block cache op.
|
|
17
|
+
struct BlockCacheTraceRecord {
|
|
18
|
+
// Required fields for all accesses.
|
|
19
|
+
uint64_t access_timestamp = 0;
|
|
20
|
+
|
|
21
|
+
// Info related to the block being looked up or inserted
|
|
22
|
+
//
|
|
23
|
+
// 1. The cache key for the block
|
|
24
|
+
std::string block_key;
|
|
25
|
+
|
|
26
|
+
// 2. The type of block
|
|
27
|
+
TraceType block_type = TraceType::kTraceMax;
|
|
28
|
+
|
|
29
|
+
// 3. Size of the block
|
|
30
|
+
uint64_t block_size = 0;
|
|
31
|
+
|
|
32
|
+
// Info about the SST file the block is in
|
|
33
|
+
//
|
|
34
|
+
// 1. Column family ID
|
|
35
|
+
uint64_t cf_id = 0;
|
|
36
|
+
|
|
37
|
+
// 2. Column family name
|
|
38
|
+
std::string cf_name;
|
|
39
|
+
|
|
40
|
+
// 3. LSM level of the file
|
|
41
|
+
uint32_t level = 0;
|
|
42
|
+
|
|
43
|
+
// 4. SST file number
|
|
44
|
+
uint64_t sst_fd_number = 0;
|
|
45
|
+
|
|
46
|
+
// Info about the calling context
|
|
47
|
+
//
|
|
48
|
+
// 1. The higher level request triggering the block cache request
|
|
49
|
+
TableReaderCaller caller = TableReaderCaller::kMaxBlockCacheLookupCaller;
|
|
50
|
+
|
|
51
|
+
// 2. Cache lookup hit/miss. Not relevant for inserts
|
|
52
|
+
bool is_cache_hit = false;
|
|
53
|
+
|
|
54
|
+
// 3. Whether this request is a lookup
|
|
55
|
+
bool no_insert = false;
|
|
56
|
+
|
|
57
|
+
// Get/MultiGet specific info
|
|
58
|
+
//
|
|
59
|
+
// 1. A unique ID for Get/MultiGet
|
|
60
|
+
uint64_t get_id = kReservedGetId;
|
|
61
|
+
|
|
62
|
+
// 2. Whether the Get/MultiGet is from a user-specified snapshot
|
|
63
|
+
bool get_from_user_specified_snapshot = false;
|
|
64
|
+
|
|
65
|
+
// 3. The target user key in the block
|
|
66
|
+
std::string referenced_key;
|
|
67
|
+
|
|
68
|
+
// Required fields for data block and user Get/Multi-Get only.
|
|
69
|
+
//
|
|
70
|
+
// 1. Size of the useful data in the block
|
|
71
|
+
uint64_t referenced_data_size = 0;
|
|
72
|
+
|
|
73
|
+
// 2. Only for MultiGet, number of keys from the batch found in the block
|
|
74
|
+
uint64_t num_keys_in_block = 0;
|
|
75
|
+
|
|
76
|
+
// 3. Whether the key was found in the block or not (false positive)
|
|
77
|
+
bool referenced_key_exist_in_block = false;
|
|
78
|
+
|
|
79
|
+
static const uint64_t kReservedGetId;
|
|
80
|
+
|
|
81
|
+
BlockCacheTraceRecord() {}
|
|
82
|
+
|
|
83
|
+
BlockCacheTraceRecord(uint64_t _access_timestamp, std::string _block_key,
|
|
84
|
+
TraceType _block_type, uint64_t _block_size,
|
|
85
|
+
uint64_t _cf_id, std::string _cf_name, uint32_t _level,
|
|
86
|
+
uint64_t _sst_fd_number, TableReaderCaller _caller,
|
|
87
|
+
bool _is_cache_hit, bool _no_insert, uint64_t _get_id,
|
|
88
|
+
bool _get_from_user_specified_snapshot = false,
|
|
89
|
+
std::string _referenced_key = "",
|
|
90
|
+
uint64_t _referenced_data_size = 0,
|
|
91
|
+
uint64_t _num_keys_in_block = 0,
|
|
92
|
+
bool _referenced_key_exist_in_block = false)
|
|
93
|
+
: access_timestamp(_access_timestamp),
|
|
94
|
+
block_key(_block_key),
|
|
95
|
+
block_type(_block_type),
|
|
96
|
+
block_size(_block_size),
|
|
97
|
+
cf_id(_cf_id),
|
|
98
|
+
cf_name(_cf_name),
|
|
99
|
+
level(_level),
|
|
100
|
+
sst_fd_number(_sst_fd_number),
|
|
101
|
+
caller(_caller),
|
|
102
|
+
is_cache_hit(_is_cache_hit),
|
|
103
|
+
no_insert(_no_insert),
|
|
104
|
+
get_id(_get_id),
|
|
105
|
+
get_from_user_specified_snapshot(_get_from_user_specified_snapshot),
|
|
106
|
+
referenced_key(_referenced_key),
|
|
107
|
+
referenced_data_size(_referenced_data_size),
|
|
108
|
+
num_keys_in_block(_num_keys_in_block),
|
|
109
|
+
referenced_key_exist_in_block(_referenced_key_exist_in_block) {}
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
// Options for tracing block cache accesses
|
|
113
|
+
struct BlockCacheTraceOptions {
|
|
114
|
+
// Specify trace sampling option, i.e. capture one per how many requests.
|
|
115
|
+
// Default to 1 (capture every request).
|
|
116
|
+
uint64_t sampling_frequency = 1;
|
|
117
|
+
};
|
|
118
|
+
|
|
119
|
+
// Options for the built-in implementation of BlockCacheTraceWriter
|
|
120
|
+
struct BlockCacheTraceWriterOptions {
|
|
121
|
+
uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
|
|
122
|
+
};
|
|
123
|
+
|
|
124
|
+
// BlockCacheTraceWriter is an abstract class that captures all RocksDB block
|
|
125
|
+
// cache accesses. Every RocksDB operation is passed to WriteBlockAccess()
|
|
126
|
+
// with a BlockCacheTraceRecord.
|
|
127
|
+
class BlockCacheTraceWriter {
|
|
128
|
+
public:
|
|
129
|
+
virtual ~BlockCacheTraceWriter() {}
|
|
130
|
+
|
|
131
|
+
// Pass Slice references to avoid copy.
|
|
132
|
+
virtual Status WriteBlockAccess(const BlockCacheTraceRecord& record,
|
|
133
|
+
const Slice& block_key, const Slice& cf_name,
|
|
134
|
+
const Slice& referenced_key) = 0;
|
|
135
|
+
|
|
136
|
+
// Write a trace header at the beginning, typically on initiating a trace,
|
|
137
|
+
// with some metadata like a magic number and RocksDB version.
|
|
138
|
+
virtual Status WriteHeader() = 0;
|
|
139
|
+
};
|
|
140
|
+
|
|
141
|
+
// Allocate an instance of the built-in BlockCacheTraceWriter implementation,
|
|
142
|
+
// that traces all block cache accesses to a user-provided TraceWriter. Each
|
|
143
|
+
// access is traced to a file with a timestamp and type, followed by the
|
|
144
|
+
// payload.
|
|
145
|
+
std::unique_ptr<BlockCacheTraceWriter> NewBlockCacheTraceWriter(
|
|
146
|
+
SystemClock* clock, const BlockCacheTraceWriterOptions& trace_options,
|
|
147
|
+
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
148
|
+
|
|
149
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <unordered_map>
|
|
18
18
|
#include <vector>
|
|
19
19
|
|
|
20
|
+
#include "rocksdb/block_cache_trace_writer.h"
|
|
20
21
|
#include "rocksdb/iterator.h"
|
|
21
22
|
#include "rocksdb/listener.h"
|
|
22
23
|
#include "rocksdb/metadata.h"
|
|
@@ -1748,11 +1749,17 @@ class DB {
|
|
|
1748
1749
|
|
|
1749
1750
|
// Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing.
|
|
1750
1751
|
virtual Status StartBlockCacheTrace(
|
|
1751
|
-
const TraceOptions& /*
|
|
1752
|
+
const TraceOptions& /*trace_options*/,
|
|
1752
1753
|
std::unique_ptr<TraceWriter>&& /*trace_writer*/) {
|
|
1753
1754
|
return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
|
|
1754
1755
|
}
|
|
1755
1756
|
|
|
1757
|
+
virtual Status StartBlockCacheTrace(
|
|
1758
|
+
const BlockCacheTraceOptions& /*options*/,
|
|
1759
|
+
std::unique_ptr<BlockCacheTraceWriter>&& /*trace_writer*/) {
|
|
1760
|
+
return Status::NotSupported("StartBlockCacheTrace() is not implemented.");
|
|
1761
|
+
}
|
|
1762
|
+
|
|
1756
1763
|
virtual Status EndBlockCacheTrace() {
|
|
1757
1764
|
return Status::NotSupported("EndBlockCacheTrace() is not implemented.");
|
|
1758
1765
|
}
|
|
@@ -251,7 +251,7 @@ struct BlockBasedTableOptions {
|
|
|
251
251
|
// Use the specified checksum type. Newly created table files will be
|
|
252
252
|
// protected with this checksum type. Old table files will still be readable,
|
|
253
253
|
// even though they have different checksum type.
|
|
254
|
-
ChecksumType checksum =
|
|
254
|
+
ChecksumType checksum = kXXH3;
|
|
255
255
|
|
|
256
256
|
// Disable block cache. If this is set to true,
|
|
257
257
|
// then no block cache should be used, and the block_cache should
|
|
File without changes
|
|
@@ -404,8 +404,14 @@ class StackableDB : public DB {
|
|
|
404
404
|
|
|
405
405
|
using DB::StartBlockCacheTrace;
|
|
406
406
|
Status StartBlockCacheTrace(
|
|
407
|
-
const TraceOptions&
|
|
407
|
+
const TraceOptions& trace_options,
|
|
408
408
|
std::unique_ptr<TraceWriter>&& trace_writer) override {
|
|
409
|
+
return db_->StartBlockCacheTrace(trace_options, std::move(trace_writer));
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
Status StartBlockCacheTrace(
|
|
413
|
+
const BlockCacheTraceOptions& options,
|
|
414
|
+
std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override {
|
|
409
415
|
return db_->StartBlockCacheTrace(options, std::move(trace_writer));
|
|
410
416
|
}
|
|
411
417
|
|
|
@@ -15,10 +15,10 @@
|
|
|
15
15
|
#include "folly/experimental/coro/Task.h"
|
|
16
16
|
#endif
|
|
17
17
|
#include "rocksdb/slice_transform.h"
|
|
18
|
+
#include "rocksdb/table_reader_caller.h"
|
|
18
19
|
#include "table/get_context.h"
|
|
19
20
|
#include "table/internal_iterator.h"
|
|
20
21
|
#include "table/multiget_context.h"
|
|
21
|
-
#include "table/table_reader_caller.h"
|
|
22
22
|
|
|
23
23
|
namespace ROCKSDB_NAMESPACE {
|
|
24
24
|
|
|
@@ -1147,15 +1147,21 @@ class BlockBasedTableTest
|
|
|
1147
1147
|
test_path_ = test::PerThreadDBPath("block_based_table_tracing_test");
|
|
1148
1148
|
EXPECT_OK(env_->CreateDir(test_path_));
|
|
1149
1149
|
trace_file_path_ = test_path_ + "/block_cache_trace_file";
|
|
1150
|
-
|
|
1150
|
+
|
|
1151
|
+
BlockCacheTraceWriterOptions trace_writer_opt;
|
|
1152
|
+
BlockCacheTraceOptions trace_opt;
|
|
1151
1153
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
1152
1154
|
EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_,
|
|
1153
1155
|
&trace_writer));
|
|
1156
|
+
std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
|
|
1157
|
+
NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
|
|
1158
|
+
std::move(trace_writer));
|
|
1159
|
+
ASSERT_NE(block_cache_trace_writer, nullptr);
|
|
1154
1160
|
// Always return Status::OK().
|
|
1155
1161
|
assert(c->block_cache_tracer_
|
|
1156
|
-
.StartTrace(
|
|
1157
|
-
std::move(trace_writer))
|
|
1162
|
+
.StartTrace(trace_opt, std::move(block_cache_trace_writer))
|
|
1158
1163
|
.ok());
|
|
1164
|
+
|
|
1159
1165
|
{
|
|
1160
1166
|
std::string user_key = "k01";
|
|
1161
1167
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
@@ -1213,10 +1219,10 @@ class BlockBasedTableTest
|
|
|
1213
1219
|
} else {
|
|
1214
1220
|
EXPECT_EQ(access.referenced_key, "");
|
|
1215
1221
|
EXPECT_EQ(access.get_id, 0);
|
|
1216
|
-
|
|
1222
|
+
EXPECT_FALSE(access.get_from_user_specified_snapshot);
|
|
1217
1223
|
EXPECT_EQ(access.referenced_data_size, 0);
|
|
1218
1224
|
EXPECT_EQ(access.num_keys_in_block, 0);
|
|
1219
|
-
|
|
1225
|
+
EXPECT_FALSE(access.referenced_key_exist_in_block);
|
|
1220
1226
|
}
|
|
1221
1227
|
index++;
|
|
1222
1228
|
}
|
|
@@ -2250,7 +2256,8 @@ TEST_P(BlockBasedTableTest, BadChecksumType) {
|
|
|
2250
2256
|
// Corrupt checksum type (123 is invalid)
|
|
2251
2257
|
auto& sink = *c.TEST_GetSink();
|
|
2252
2258
|
size_t len = sink.contents_.size();
|
|
2253
|
-
ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength],
|
|
2259
|
+
ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength],
|
|
2260
|
+
table_options.checksum);
|
|
2254
2261
|
sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123};
|
|
2255
2262
|
|
|
2256
2263
|
// (Re-)Open table file with bad checksum type
|
|
@@ -3051,8 +3058,8 @@ TEST_P(BlockBasedTableTest, TracingGetTest) {
|
|
|
3051
3058
|
BlockCacheTraceRecord record;
|
|
3052
3059
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3053
3060
|
record.caller = TableReaderCaller::kPrefetch;
|
|
3054
|
-
record.is_cache_hit =
|
|
3055
|
-
record.no_insert =
|
|
3061
|
+
record.is_cache_hit = false;
|
|
3062
|
+
record.no_insert = false;
|
|
3056
3063
|
expected_records.push_back(record);
|
|
3057
3064
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
3058
3065
|
expected_records.push_back(record);
|
|
@@ -3061,22 +3068,22 @@ TEST_P(BlockBasedTableTest, TracingGetTest) {
|
|
|
3061
3068
|
record.get_id = 1;
|
|
3062
3069
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
3063
3070
|
record.caller = TableReaderCaller::kUserGet;
|
|
3064
|
-
record.get_from_user_specified_snapshot =
|
|
3071
|
+
record.get_from_user_specified_snapshot = false;
|
|
3065
3072
|
record.referenced_key = encoded_key;
|
|
3066
|
-
record.referenced_key_exist_in_block =
|
|
3067
|
-
record.is_cache_hit =
|
|
3073
|
+
record.referenced_key_exist_in_block = true;
|
|
3074
|
+
record.is_cache_hit = true;
|
|
3068
3075
|
expected_records.push_back(record);
|
|
3069
3076
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3070
3077
|
expected_records.push_back(record);
|
|
3071
|
-
record.is_cache_hit =
|
|
3078
|
+
record.is_cache_hit = false;
|
|
3072
3079
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
3073
3080
|
expected_records.push_back(record);
|
|
3074
3081
|
// The second get should all observe cache hits.
|
|
3075
|
-
record.is_cache_hit =
|
|
3082
|
+
record.is_cache_hit = true;
|
|
3076
3083
|
record.get_id = 2;
|
|
3077
3084
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
3078
3085
|
record.caller = TableReaderCaller::kUserGet;
|
|
3079
|
-
record.get_from_user_specified_snapshot =
|
|
3086
|
+
record.get_from_user_specified_snapshot = false;
|
|
3080
3087
|
record.referenced_key = encoded_key;
|
|
3081
3088
|
expected_records.push_back(record);
|
|
3082
3089
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
@@ -3116,15 +3123,15 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) {
|
|
|
3116
3123
|
BlockCacheTraceRecord record;
|
|
3117
3124
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3118
3125
|
record.caller = TableReaderCaller::kPrefetch;
|
|
3119
|
-
record.is_cache_hit =
|
|
3120
|
-
record.no_insert =
|
|
3126
|
+
record.is_cache_hit = false;
|
|
3127
|
+
record.no_insert = false;
|
|
3121
3128
|
expected_records.push_back(record);
|
|
3122
3129
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
3123
3130
|
expected_records.push_back(record);
|
|
3124
3131
|
// Then we should have two records for only index blocks.
|
|
3125
3132
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3126
3133
|
record.caller = TableReaderCaller::kUserApproximateSize;
|
|
3127
|
-
record.is_cache_hit =
|
|
3134
|
+
record.is_cache_hit = true;
|
|
3128
3135
|
expected_records.push_back(record);
|
|
3129
3136
|
expected_records.push_back(record);
|
|
3130
3137
|
VerifyBlockAccessTrace(&c, expected_records);
|
|
@@ -3169,24 +3176,24 @@ TEST_P(BlockBasedTableTest, TracingIterator) {
|
|
|
3169
3176
|
BlockCacheTraceRecord record;
|
|
3170
3177
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3171
3178
|
record.caller = TableReaderCaller::kPrefetch;
|
|
3172
|
-
record.is_cache_hit =
|
|
3173
|
-
record.no_insert =
|
|
3179
|
+
record.is_cache_hit = false;
|
|
3180
|
+
record.no_insert = false;
|
|
3174
3181
|
expected_records.push_back(record);
|
|
3175
3182
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
3176
3183
|
expected_records.push_back(record);
|
|
3177
3184
|
// Then we should have three records for index and two data block access.
|
|
3178
3185
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3179
3186
|
record.caller = TableReaderCaller::kUserIterator;
|
|
3180
|
-
record.is_cache_hit =
|
|
3187
|
+
record.is_cache_hit = true;
|
|
3181
3188
|
expected_records.push_back(record);
|
|
3182
3189
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
3183
|
-
record.is_cache_hit =
|
|
3190
|
+
record.is_cache_hit = false;
|
|
3184
3191
|
expected_records.push_back(record);
|
|
3185
3192
|
expected_records.push_back(record);
|
|
3186
3193
|
// When we iterate this file for the second time, we should observe all cache
|
|
3187
3194
|
// hits.
|
|
3188
3195
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
3189
|
-
record.is_cache_hit =
|
|
3196
|
+
record.is_cache_hit = true;
|
|
3190
3197
|
expected_records.push_back(record);
|
|
3191
3198
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
3192
3199
|
expected_records.push_back(record);
|
|
@@ -1568,7 +1568,7 @@ Status BlockCacheTraceAnalyzer::Analyze() {
|
|
|
1568
1568
|
trace_end_timestamp_in_seconds_ = access.access_timestamp / kMicrosInSecond;
|
|
1569
1569
|
miss_ratio_stats_.UpdateMetrics(access.access_timestamp,
|
|
1570
1570
|
is_user_access(access.caller),
|
|
1571
|
-
access.is_cache_hit
|
|
1571
|
+
!access.is_cache_hit);
|
|
1572
1572
|
if (cache_simulator_) {
|
|
1573
1573
|
cache_simulator_->Access(access);
|
|
1574
1574
|
}
|
|
@@ -95,7 +95,7 @@ struct BlockAccessInfo {
|
|
|
95
95
|
if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock(access.block_type,
|
|
96
96
|
access.caller)) {
|
|
97
97
|
num_keys = access.num_keys_in_block;
|
|
98
|
-
if (access.referenced_key_exist_in_block
|
|
98
|
+
if (access.referenced_key_exist_in_block) {
|
|
99
99
|
if (key_num_access_map.find(access.referenced_key) ==
|
|
100
100
|
key_num_access_map.end()) {
|
|
101
101
|
referenced_data_size += access.referenced_data_size;
|
|
@@ -114,14 +114,14 @@ class BlockCacheTracerTest : public testing::Test {
|
|
|
114
114
|
} else {
|
|
115
115
|
record.sst_fd_number = kSSTStoringOddKeys;
|
|
116
116
|
}
|
|
117
|
-
record.is_cache_hit =
|
|
118
|
-
record.no_insert =
|
|
117
|
+
record.is_cache_hit = false;
|
|
118
|
+
record.no_insert = false;
|
|
119
119
|
// Provide these fields for all block types.
|
|
120
120
|
// The writer should only write these fields for data blocks and the
|
|
121
121
|
// caller is either GET or MGET.
|
|
122
122
|
record.referenced_key =
|
|
123
123
|
kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0);
|
|
124
|
-
record.referenced_key_exist_in_block =
|
|
124
|
+
record.referenced_key_exist_in_block = true;
|
|
125
125
|
record.num_keys_in_block = kNumKeysInBlock;
|
|
126
126
|
ASSERT_OK(writer->WriteBlockAccess(
|
|
127
127
|
record, record.block_key, record.cf_name, record.referenced_key));
|
|
@@ -223,15 +223,18 @@ class BlockCacheTracerTest : public testing::Test {
|
|
|
223
223
|
TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) {
|
|
224
224
|
{
|
|
225
225
|
// Generate a trace file.
|
|
226
|
-
|
|
226
|
+
BlockCacheTraceWriterOptions trace_writer_opt;
|
|
227
227
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
228
228
|
ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
|
|
229
229
|
&trace_writer));
|
|
230
230
|
const auto& clock = env_->GetSystemClock();
|
|
231
|
-
BlockCacheTraceWriter
|
|
231
|
+
std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
|
|
232
|
+
NewBlockCacheTraceWriter(clock.get(), trace_writer_opt,
|
|
232
233
|
std::move(trace_writer));
|
|
233
|
-
|
|
234
|
-
|
|
234
|
+
ASSERT_NE(block_cache_trace_writer, nullptr);
|
|
235
|
+
ASSERT_OK(block_cache_trace_writer->WriteHeader());
|
|
236
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 0,
|
|
237
|
+
TraceType::kBlockTraceDataBlock, 50);
|
|
235
238
|
ASSERT_OK(env_->FileExists(trace_file_path_));
|
|
236
239
|
}
|
|
237
240
|
{
|
|
@@ -612,21 +615,27 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) {
|
|
|
612
615
|
// It contains two SST files with 25 blocks of odd numbered block_key in
|
|
613
616
|
// kSSTStoringOddKeys and 25 blocks of even numbered blocks_key in
|
|
614
617
|
// kSSTStoringEvenKeys.
|
|
615
|
-
|
|
618
|
+
BlockCacheTraceWriterOptions trace_writer_opt;
|
|
616
619
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
617
620
|
const auto& clock = env_->GetSystemClock();
|
|
618
621
|
ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_,
|
|
619
622
|
&trace_writer));
|
|
620
|
-
BlockCacheTraceWriter
|
|
623
|
+
std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
|
|
624
|
+
NewBlockCacheTraceWriter(clock.get(), trace_writer_opt,
|
|
621
625
|
std::move(trace_writer));
|
|
622
|
-
|
|
626
|
+
ASSERT_NE(block_cache_trace_writer, nullptr);
|
|
627
|
+
ASSERT_OK(block_cache_trace_writer->WriteHeader());
|
|
623
628
|
// Write blocks of different types.
|
|
624
|
-
WriteBlockAccess(
|
|
625
|
-
10);
|
|
626
|
-
WriteBlockAccess(
|
|
627
|
-
|
|
628
|
-
WriteBlockAccess(
|
|
629
|
-
|
|
629
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 0,
|
|
630
|
+
TraceType::kBlockTraceUncompressionDictBlock, 10);
|
|
631
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 10,
|
|
632
|
+
TraceType::kBlockTraceDataBlock, 10);
|
|
633
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 20,
|
|
634
|
+
TraceType::kBlockTraceFilterBlock, 10);
|
|
635
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 30,
|
|
636
|
+
TraceType::kBlockTraceIndexBlock, 10);
|
|
637
|
+
WriteBlockAccess(block_cache_trace_writer.get(), 40,
|
|
638
|
+
TraceType::kBlockTraceRangeDeletionBlock, 10);
|
|
630
639
|
ASSERT_OK(env_->FileExists(trace_file_path_));
|
|
631
640
|
}
|
|
632
641
|
|