@nxtedition/rocksdb 7.1.10 → 7.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +50 -33
- package/deps/rocksdb/rocksdb/CMakeLists.txt +2 -1
- package/deps/rocksdb/rocksdb/TARGETS +2 -0
- package/deps/rocksdb/rocksdb/db/db_basic_test.cc +60 -17
- package/deps/rocksdb/rocksdb/db/db_impl/compacted_db_impl.cc +4 -4
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +81 -37
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +6 -0
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_readonly.cc +6 -6
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_secondary.cc +10 -8
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +14 -9
- package/deps/rocksdb/rocksdb/db/db_memtable_test.cc +3 -3
- package/deps/rocksdb/rocksdb/db/db_write_buffer_manager_test.cc +69 -0
- package/deps/rocksdb/rocksdb/db/flush_job.cc +6 -6
- package/deps/rocksdb/rocksdb/db/memtable.cc +19 -7
- package/deps/rocksdb/rocksdb/db/memtable.h +8 -16
- package/deps/rocksdb/rocksdb/db/memtable_list.cc +27 -16
- package/deps/rocksdb/rocksdb/db/memtable_list.h +18 -11
- package/deps/rocksdb/rocksdb/db/memtable_list_test.cc +70 -55
- package/deps/rocksdb/rocksdb/db/table_cache.cc +9 -11
- package/deps/rocksdb/rocksdb/db/table_cache.h +2 -1
- package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +3 -3
- package/deps/rocksdb/rocksdb/db/version_set.cc +530 -257
- package/deps/rocksdb/rocksdb/db/version_set.h +32 -2
- package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +2 -2
- package/deps/rocksdb/rocksdb/db/wide/db_wide_basic_test.cc +64 -12
- package/deps/rocksdb/rocksdb/db/wide/wide_columns.cc +18 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/db.h +8 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/options.h +13 -1
- package/deps/rocksdb/rocksdb/include/rocksdb/utilities/stackable_db.h +7 -0
- package/deps/rocksdb/rocksdb/include/rocksdb/wide_columns.h +83 -0
- package/deps/rocksdb/rocksdb/options/options.cc +4 -2
- package/deps/rocksdb/rocksdb/src.mk +1 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +3 -10
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_test.cc +5 -4
- package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +10 -28
- package/deps/rocksdb/rocksdb/table/block_based/data_block_hash_index_test.cc +4 -4
- package/deps/rocksdb/rocksdb/table/cuckoo/cuckoo_table_reader_test.cc +11 -9
- package/deps/rocksdb/rocksdb/table/get_context.cc +34 -22
- package/deps/rocksdb/rocksdb/table/get_context.h +6 -3
- package/deps/rocksdb/rocksdb/table/multiget_context.h +69 -5
- package/deps/rocksdb/rocksdb/table/table_reader_bench.cc +2 -2
- package/deps/rocksdb/rocksdb/table/table_test.cc +8 -8
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +23 -0
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +27 -7
- package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +8 -4
- package/deps/rocksdb/rocksdb.gyp +1 -0
- package/index.js +19 -12
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/darwin-x64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -46,6 +46,10 @@
|
|
|
46
46
|
#include "db/version_edit.h"
|
|
47
47
|
#include "db/write_controller.h"
|
|
48
48
|
#include "env/file_system_tracer.h"
|
|
49
|
+
#if USE_COROUTINES
|
|
50
|
+
#include "folly/experimental/coro/BlockingWait.h"
|
|
51
|
+
#include "folly/experimental/coro/Collect.h"
|
|
52
|
+
#endif
|
|
49
53
|
#include "monitoring/instrumented_mutex.h"
|
|
50
54
|
#include "options/db_options.h"
|
|
51
55
|
#include "port/port.h"
|
|
@@ -54,6 +58,7 @@
|
|
|
54
58
|
#include "table/get_context.h"
|
|
55
59
|
#include "table/multiget_context.h"
|
|
56
60
|
#include "trace_replay/block_cache_tracer.h"
|
|
61
|
+
#include "util/autovector.h"
|
|
57
62
|
#include "util/coro_utils.h"
|
|
58
63
|
#include "util/hash_containers.h"
|
|
59
64
|
|
|
@@ -76,6 +81,7 @@ class ColumnFamilySet;
|
|
|
76
81
|
class MergeIteratorBuilder;
|
|
77
82
|
class SystemClock;
|
|
78
83
|
class ManifestTailer;
|
|
84
|
+
class FilePickerMultiGet;
|
|
79
85
|
|
|
80
86
|
// VersionEdit is always supposed to be valid and it is used to point at
|
|
81
87
|
// entries in Manifest. Ideally it should not be used as a container to
|
|
@@ -836,7 +842,8 @@ class Version {
|
|
|
836
842
|
// REQUIRES: lock is not held
|
|
837
843
|
// REQUIRES: pinned_iters_mgr != nullptr
|
|
838
844
|
void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
|
|
839
|
-
std::string* timestamp, Status* status,
|
|
845
|
+
PinnableWideColumns* columns, std::string* timestamp, Status* status,
|
|
846
|
+
MergeContext* merge_context,
|
|
840
847
|
SequenceNumber* max_covering_tombstone_seq,
|
|
841
848
|
PinnedIteratorsManager* pinned_iters_mgr,
|
|
842
849
|
bool* value_found = nullptr, bool* key_exists = nullptr,
|
|
@@ -990,11 +997,34 @@ class Version {
|
|
|
990
997
|
DECLARE_SYNC_AND_ASYNC(
|
|
991
998
|
/* ret_type */ Status, /* func_name */ MultiGetFromSST,
|
|
992
999
|
const ReadOptions& read_options, MultiGetRange file_range,
|
|
993
|
-
int hit_file_level, bool skip_filters,
|
|
1000
|
+
int hit_file_level, bool skip_filters, bool skip_range_deletions,
|
|
1001
|
+
FdWithKeyRange* f,
|
|
994
1002
|
std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
|
|
995
1003
|
Cache::Handle* table_handle, uint64_t& num_filter_read,
|
|
996
1004
|
uint64_t& num_index_read, uint64_t& num_sst_read);
|
|
997
1005
|
|
|
1006
|
+
#ifdef USE_COROUTINES
|
|
1007
|
+
// MultiGet using async IO to read data blocks from SST files in parallel
|
|
1008
|
+
// within and across levels
|
|
1009
|
+
Status MultiGetAsync(
|
|
1010
|
+
const ReadOptions& options, MultiGetRange* range,
|
|
1011
|
+
std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
|
|
1012
|
+
|
|
1013
|
+
// A helper function to lookup a batch of keys in a single level. It will
|
|
1014
|
+
// queue coroutine tasks to mget_tasks. It may also split the input batch
|
|
1015
|
+
// by creating a new batch with keys definitely not in this level and
|
|
1016
|
+
// enqueuing it to to_process.
|
|
1017
|
+
Status ProcessBatch(const ReadOptions& read_options,
|
|
1018
|
+
FilePickerMultiGet* batch,
|
|
1019
|
+
std::vector<folly::coro::Task<Status>>& mget_tasks,
|
|
1020
|
+
std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
|
|
1021
|
+
autovector<FilePickerMultiGet, 4>& batches,
|
|
1022
|
+
std::deque<size_t>& waiting,
|
|
1023
|
+
std::deque<size_t>& to_process,
|
|
1024
|
+
unsigned int& num_tasks_queued, uint64_t& num_filter_read,
|
|
1025
|
+
uint64_t& num_index_read, uint64_t& num_sst_read);
|
|
1026
|
+
#endif
|
|
1027
|
+
|
|
998
1028
|
ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
|
|
999
1029
|
Logger* info_log_;
|
|
1000
1030
|
Statistics* db_statistics_;
|
|
@@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE {
|
|
|
14
14
|
// Lookup a batch of keys in a single SST file
|
|
15
15
|
DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
|
|
16
16
|
(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
|
|
17
|
-
bool skip_filters, FdWithKeyRange* f,
|
|
17
|
+
bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f,
|
|
18
18
|
std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
|
|
19
19
|
Cache::Handle* table_handle, uint64_t& num_filter_read,
|
|
20
20
|
uint64_t& num_index_read, uint64_t& num_sst_read) {
|
|
@@ -27,7 +27,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
|
|
|
27
27
|
read_options, *internal_comparator(), *f->file_metadata, &file_range,
|
|
28
28
|
mutable_cf_options_.prefix_extractor,
|
|
29
29
|
cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
|
|
30
|
-
hit_file_level, table_handle);
|
|
30
|
+
skip_range_deletions, hit_file_level, table_handle);
|
|
31
31
|
// TODO: examine the behavior for corrupted key
|
|
32
32
|
if (timer_enabled) {
|
|
33
33
|
PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
|
|
@@ -22,10 +22,20 @@ class DBWideBasicTest : public DBTestBase {
|
|
|
22
22
|
TEST_F(DBWideBasicTest, PutEntity) {
|
|
23
23
|
Options options = GetDefaultOptions();
|
|
24
24
|
|
|
25
|
+
// Write a couple of wide-column entities and a plain old key-value, then read
|
|
26
|
+
// them back.
|
|
25
27
|
constexpr char first_key[] = "first";
|
|
28
|
+
constexpr char first_value_of_default_column[] = "hello";
|
|
29
|
+
WideColumns first_columns{
|
|
30
|
+
{kDefaultWideColumnName, first_value_of_default_column},
|
|
31
|
+
{"attr_name1", "foo"},
|
|
32
|
+
{"attr_name2", "bar"}};
|
|
33
|
+
|
|
26
34
|
constexpr char second_key[] = "second";
|
|
35
|
+
WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
|
|
27
36
|
|
|
28
|
-
constexpr char
|
|
37
|
+
constexpr char third_key[] = "third";
|
|
38
|
+
constexpr char third_value[] = "baz";
|
|
29
39
|
|
|
30
40
|
auto verify = [&]() {
|
|
31
41
|
{
|
|
@@ -35,6 +45,13 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
35
45
|
ASSERT_EQ(result, first_value_of_default_column);
|
|
36
46
|
}
|
|
37
47
|
|
|
48
|
+
{
|
|
49
|
+
PinnableWideColumns result;
|
|
50
|
+
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
|
|
51
|
+
first_key, &result));
|
|
52
|
+
ASSERT_EQ(result.columns(), first_columns);
|
|
53
|
+
}
|
|
54
|
+
|
|
38
55
|
{
|
|
39
56
|
PinnableSlice result;
|
|
40
57
|
ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
|
|
@@ -43,9 +60,32 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
43
60
|
}
|
|
44
61
|
|
|
45
62
|
{
|
|
46
|
-
|
|
63
|
+
PinnableWideColumns result;
|
|
64
|
+
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
|
|
65
|
+
second_key, &result));
|
|
66
|
+
ASSERT_EQ(result.columns(), second_columns);
|
|
67
|
+
}
|
|
47
68
|
|
|
48
|
-
|
|
69
|
+
{
|
|
70
|
+
PinnableSlice result;
|
|
71
|
+
ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key,
|
|
72
|
+
&result));
|
|
73
|
+
ASSERT_EQ(result, third_value);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
{
|
|
77
|
+
PinnableWideColumns result;
|
|
78
|
+
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
|
|
79
|
+
third_key, &result));
|
|
80
|
+
|
|
81
|
+
const WideColumns expected_columns{{kDefaultWideColumnName, third_value}};
|
|
82
|
+
ASSERT_EQ(result.columns(), expected_columns);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
{
|
|
86
|
+
constexpr size_t num_keys = 3;
|
|
87
|
+
|
|
88
|
+
std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
|
|
49
89
|
std::array<PinnableSlice, num_keys> values;
|
|
50
90
|
std::array<Status, num_keys> statuses;
|
|
51
91
|
|
|
@@ -57,6 +97,9 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
57
97
|
|
|
58
98
|
ASSERT_OK(statuses[1]);
|
|
59
99
|
ASSERT_TRUE(values[1].empty());
|
|
100
|
+
|
|
101
|
+
ASSERT_OK(statuses[2]);
|
|
102
|
+
ASSERT_EQ(values[2], third_value);
|
|
60
103
|
}
|
|
61
104
|
|
|
62
105
|
{
|
|
@@ -74,6 +117,12 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
74
117
|
ASSERT_EQ(iter->key(), second_key);
|
|
75
118
|
ASSERT_TRUE(iter->value().empty());
|
|
76
119
|
|
|
120
|
+
iter->Next();
|
|
121
|
+
ASSERT_TRUE(iter->Valid());
|
|
122
|
+
ASSERT_OK(iter->status());
|
|
123
|
+
ASSERT_EQ(iter->key(), third_key);
|
|
124
|
+
ASSERT_EQ(iter->value(), third_value);
|
|
125
|
+
|
|
77
126
|
iter->Next();
|
|
78
127
|
ASSERT_FALSE(iter->Valid());
|
|
79
128
|
ASSERT_OK(iter->status());
|
|
@@ -81,6 +130,12 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
81
130
|
iter->SeekToLast();
|
|
82
131
|
ASSERT_TRUE(iter->Valid());
|
|
83
132
|
ASSERT_OK(iter->status());
|
|
133
|
+
ASSERT_EQ(iter->key(), third_key);
|
|
134
|
+
ASSERT_EQ(iter->value(), third_value);
|
|
135
|
+
|
|
136
|
+
iter->Prev();
|
|
137
|
+
ASSERT_TRUE(iter->Valid());
|
|
138
|
+
ASSERT_OK(iter->status());
|
|
84
139
|
ASSERT_EQ(iter->key(), second_key);
|
|
85
140
|
ASSERT_TRUE(iter->value().empty());
|
|
86
141
|
|
|
@@ -96,23 +151,20 @@ TEST_F(DBWideBasicTest, PutEntity) {
|
|
|
96
151
|
}
|
|
97
152
|
};
|
|
98
153
|
|
|
99
|
-
// Use the DB::PutEntity API
|
|
100
|
-
WideColumns first_columns{
|
|
101
|
-
{kDefaultWideColumnName, first_value_of_default_column},
|
|
102
|
-
{"attr_name1", "foo"},
|
|
103
|
-
{"attr_name2", "bar"}};
|
|
104
|
-
|
|
154
|
+
// Use the DB::PutEntity API to write the first entity
|
|
105
155
|
ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
|
|
106
156
|
first_key, first_columns));
|
|
107
157
|
|
|
108
|
-
// Use WriteBatch
|
|
109
|
-
WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
|
|
110
|
-
|
|
158
|
+
// Use WriteBatch to write the second entity
|
|
111
159
|
WriteBatch batch;
|
|
112
160
|
ASSERT_OK(
|
|
113
161
|
batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns));
|
|
114
162
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
115
163
|
|
|
164
|
+
// Use Put to write the plain key-value
|
|
165
|
+
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), third_key,
|
|
166
|
+
third_value));
|
|
167
|
+
|
|
116
168
|
// Try reading from memtable
|
|
117
169
|
verify();
|
|
118
170
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// This source code is licensed under both the GPLv2 (found in the
|
|
3
|
+
// COPYING file in the root directory) and Apache 2.0 License
|
|
4
|
+
// (found in the LICENSE.Apache file in the root directory).
|
|
5
|
+
|
|
6
|
+
#include "rocksdb/wide_columns.h"
|
|
7
|
+
|
|
8
|
+
#include "db/wide/wide_column_serialization.h"
|
|
9
|
+
|
|
10
|
+
namespace ROCKSDB_NAMESPACE {
|
|
11
|
+
|
|
12
|
+
Status PinnableWideColumns::CreateIndexForWideColumns() {
|
|
13
|
+
Slice value_copy = value_;
|
|
14
|
+
|
|
15
|
+
return WideColumnSerialization::Deserialize(value_copy, columns_);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -567,6 +567,14 @@ class DB {
|
|
|
567
567
|
return Get(options, DefaultColumnFamily(), key, value, timestamp);
|
|
568
568
|
}
|
|
569
569
|
|
|
570
|
+
// UNDER CONSTRUCTION -- DO NOT USE
|
|
571
|
+
virtual Status GetEntity(const ReadOptions& /* options */,
|
|
572
|
+
ColumnFamilyHandle* /* column_family */,
|
|
573
|
+
const Slice& /* key */,
|
|
574
|
+
PinnableWideColumns* /* columns */) {
|
|
575
|
+
return Status::NotSupported("GetEntity not supported");
|
|
576
|
+
}
|
|
577
|
+
|
|
570
578
|
// Populates the `merge_operands` array with all the merge operands in the DB
|
|
571
579
|
// for `key`. The `merge_operands` array will be populated in the order of
|
|
572
580
|
// insertion. The number of entries populated in `merge_operands` will be
|
|
@@ -905,7 +905,8 @@ struct DBOptions {
|
|
|
905
905
|
// can be passed into multiple DBs and it will track the sum of size of all
|
|
906
906
|
// the DBs. If the total size of all live memtables of all the DBs exceeds
|
|
907
907
|
// a limit, a flush will be triggered in the next DB to which the next write
|
|
908
|
-
// is issued
|
|
908
|
+
// is issued, as long as there is one or more column family not already
|
|
909
|
+
// flushing.
|
|
909
910
|
//
|
|
910
911
|
// If the object is only passed to one DB, the behavior is the same as
|
|
911
912
|
// db_write_buffer_size. When write_buffer_manager is set, the value set will
|
|
@@ -1685,6 +1686,17 @@ struct ReadOptions {
|
|
|
1685
1686
|
// Default: false
|
|
1686
1687
|
bool async_io;
|
|
1687
1688
|
|
|
1689
|
+
// Experimental
|
|
1690
|
+
//
|
|
1691
|
+
// If async_io is set, then this flag controls whether we read SST files
|
|
1692
|
+
// in multiple levels asynchronously. Enabling this flag can help reduce
|
|
1693
|
+
// MultiGet latency by maximizing the number of SST files read in
|
|
1694
|
+
// parallel if the keys in the MultiGet batch are in different levels. It
|
|
1695
|
+
// comes at the expense of slightly higher CPU overhead.
|
|
1696
|
+
//
|
|
1697
|
+
// Default: false
|
|
1698
|
+
bool optimize_multiget_for_io;
|
|
1699
|
+
|
|
1688
1700
|
ReadOptions();
|
|
1689
1701
|
ReadOptions(bool cksum, bool cache);
|
|
1690
1702
|
};
|
|
@@ -99,6 +99,13 @@ class StackableDB : public DB {
|
|
|
99
99
|
return db_->Get(options, column_family, key, value);
|
|
100
100
|
}
|
|
101
101
|
|
|
102
|
+
using DB::GetEntity;
|
|
103
|
+
Status GetEntity(const ReadOptions& options,
|
|
104
|
+
ColumnFamilyHandle* column_family, const Slice& key,
|
|
105
|
+
PinnableWideColumns* columns) override {
|
|
106
|
+
return db_->GetEntity(options, column_family, key, columns);
|
|
107
|
+
}
|
|
108
|
+
|
|
102
109
|
using DB::GetMergeOperands;
|
|
103
110
|
virtual Status GetMergeOperands(
|
|
104
111
|
const ReadOptions& options, ColumnFamilyHandle* column_family,
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
|
|
12
12
|
#include "rocksdb/rocksdb_namespace.h"
|
|
13
13
|
#include "rocksdb/slice.h"
|
|
14
|
+
#include "rocksdb/status.h"
|
|
14
15
|
|
|
15
16
|
namespace ROCKSDB_NAMESPACE {
|
|
16
17
|
|
|
@@ -69,8 +70,90 @@ inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) {
|
|
|
69
70
|
return !(lhs == rhs);
|
|
70
71
|
}
|
|
71
72
|
|
|
73
|
+
// A collection of wide columns.
|
|
72
74
|
using WideColumns = std::vector<WideColumn>;
|
|
73
75
|
|
|
76
|
+
// The anonymous default wide column (an empty Slice).
|
|
74
77
|
extern const Slice kDefaultWideColumnName;
|
|
75
78
|
|
|
79
|
+
// A self-contained collection of wide columns. Used for the results of
|
|
80
|
+
// wide-column queries.
|
|
81
|
+
class PinnableWideColumns {
|
|
82
|
+
public:
|
|
83
|
+
const WideColumns& columns() const { return columns_; }
|
|
84
|
+
size_t serialized_size() const { return value_.size(); }
|
|
85
|
+
|
|
86
|
+
void SetPlainValue(const Slice& value);
|
|
87
|
+
void SetPlainValue(const Slice& value, Cleanable* cleanable);
|
|
88
|
+
|
|
89
|
+
Status SetWideColumnValue(const Slice& value);
|
|
90
|
+
Status SetWideColumnValue(const Slice& value, Cleanable* cleanable);
|
|
91
|
+
|
|
92
|
+
void Reset();
|
|
93
|
+
|
|
94
|
+
private:
|
|
95
|
+
void CopyValue(const Slice& value);
|
|
96
|
+
void PinOrCopyValue(const Slice& value, Cleanable* cleanable);
|
|
97
|
+
void CreateIndexForPlainValue();
|
|
98
|
+
Status CreateIndexForWideColumns();
|
|
99
|
+
|
|
100
|
+
PinnableSlice value_;
|
|
101
|
+
WideColumns columns_;
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
inline void PinnableWideColumns::CopyValue(const Slice& value) {
|
|
105
|
+
value_.PinSelf(value);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
inline void PinnableWideColumns::PinOrCopyValue(const Slice& value,
|
|
109
|
+
Cleanable* cleanable) {
|
|
110
|
+
if (!cleanable) {
|
|
111
|
+
CopyValue(value);
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
value_.PinSlice(value, cleanable);
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
inline void PinnableWideColumns::CreateIndexForPlainValue() {
|
|
119
|
+
columns_ = WideColumns{{kDefaultWideColumnName, value_}};
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
inline void PinnableWideColumns::SetPlainValue(const Slice& value) {
|
|
123
|
+
CopyValue(value);
|
|
124
|
+
CreateIndexForPlainValue();
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
inline void PinnableWideColumns::SetPlainValue(const Slice& value,
|
|
128
|
+
Cleanable* cleanable) {
|
|
129
|
+
PinOrCopyValue(value, cleanable);
|
|
130
|
+
CreateIndexForPlainValue();
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) {
|
|
134
|
+
CopyValue(value);
|
|
135
|
+
return CreateIndexForWideColumns();
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value,
|
|
139
|
+
Cleanable* cleanable) {
|
|
140
|
+
PinOrCopyValue(value, cleanable);
|
|
141
|
+
return CreateIndexForWideColumns();
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
inline void PinnableWideColumns::Reset() {
|
|
145
|
+
value_.Reset();
|
|
146
|
+
columns_.clear();
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
inline bool operator==(const PinnableWideColumns& lhs,
|
|
150
|
+
const PinnableWideColumns& rhs) {
|
|
151
|
+
return lhs.columns() == rhs.columns();
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
inline bool operator!=(const PinnableWideColumns& lhs,
|
|
155
|
+
const PinnableWideColumns& rhs) {
|
|
156
|
+
return !(lhs == rhs);
|
|
157
|
+
}
|
|
158
|
+
|
|
76
159
|
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -696,7 +696,8 @@ ReadOptions::ReadOptions()
|
|
|
696
696
|
io_timeout(std::chrono::microseconds::zero()),
|
|
697
697
|
value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
|
|
698
698
|
adaptive_readahead(false),
|
|
699
|
-
async_io(false)
|
|
699
|
+
async_io(false),
|
|
700
|
+
optimize_multiget_for_io(false) {}
|
|
700
701
|
|
|
701
702
|
ReadOptions::ReadOptions(bool cksum, bool cache)
|
|
702
703
|
: snapshot(nullptr),
|
|
@@ -721,6 +722,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
|
|
|
721
722
|
io_timeout(std::chrono::microseconds::zero()),
|
|
722
723
|
value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
|
|
723
724
|
adaptive_readahead(false),
|
|
724
|
-
async_io(false)
|
|
725
|
+
async_io(false),
|
|
726
|
+
optimize_multiget_for_io(false) {}
|
|
725
727
|
|
|
726
728
|
} // namespace ROCKSDB_NAMESPACE
|
|
@@ -1251,12 +1251,8 @@ Status BlockBasedTable::GetDataBlockFromCache(
|
|
|
1251
1251
|
Statistics* statistics = rep_->ioptions.statistics.get();
|
|
1252
1252
|
bool using_zstd = rep_->blocks_definitely_zstd_compressed;
|
|
1253
1253
|
const FilterPolicy* filter_policy = rep_->filter_policy;
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
// avoid dynamic memory allocation by using the reference (std::ref) of the
|
|
1257
|
-
// callback. Otherwise, binding a functor to std::function will allocate extra
|
|
1258
|
-
// memory from heap.
|
|
1259
|
-
Cache::CreateCallback create_cb(std::ref(callback));
|
|
1254
|
+
Cache::CreateCallback create_cb = GetCreateCallback<TBlocklike>(
|
|
1255
|
+
read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
|
|
1260
1256
|
|
|
1261
1257
|
// Lookup uncompressed cache first
|
|
1262
1258
|
if (block_cache != nullptr) {
|
|
@@ -1286,11 +1282,8 @@ Status BlockBasedTable::GetDataBlockFromCache(
|
|
|
1286
1282
|
BlockContents contents;
|
|
1287
1283
|
if (rep_->ioptions.lowest_used_cache_tier ==
|
|
1288
1284
|
CacheTier::kNonVolatileBlockTier) {
|
|
1289
|
-
|
|
1285
|
+
Cache::CreateCallback create_cb_special = GetCreateCallback<BlockContents>(
|
|
1290
1286
|
read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
|
|
1291
|
-
// avoid dynamic memory allocation by using the reference (std::ref) of the
|
|
1292
|
-
// callback. Make sure the callback is only used within this code block.
|
|
1293
|
-
Cache::CreateCallback create_cb_special(std::ref(special_callback));
|
|
1294
1287
|
block_cache_compressed_handle = block_cache_compressed->Lookup(
|
|
1295
1288
|
cache_key,
|
|
1296
1289
|
BlocklikeTraits<BlockContents>::GetCacheItemHelper(block_type),
|
|
@@ -248,10 +248,11 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) {
|
|
|
248
248
|
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
|
249
249
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
|
250
250
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
251
|
-
get_context.emplace_back(
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
251
|
+
get_context.emplace_back(BytewiseComparator(), nullptr, nullptr, nullptr,
|
|
252
|
+
GetContext::kNotFound, keys[i], &values[i],
|
|
253
|
+
nullptr, nullptr, nullptr, nullptr,
|
|
254
|
+
true /* do_merge */, nullptr, nullptr, nullptr,
|
|
255
|
+
nullptr, nullptr, nullptr);
|
|
255
256
|
key_context.emplace_back(nullptr, keys[i], &values[i], nullptr,
|
|
256
257
|
&statuses.back());
|
|
257
258
|
key_context.back().get_context = &get_context.back();
|
|
@@ -21,42 +21,24 @@ template <typename T, CacheEntryRole R>
|
|
|
21
21
|
Cache::CacheItemHelper* GetCacheItemHelperForRole();
|
|
22
22
|
|
|
23
23
|
template <typename TBlocklike>
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
CacheCreateCallback& operator=(CacheCreateCallback&&) = delete;
|
|
31
|
-
|
|
32
|
-
explicit CacheCreateCallback(size_t read_amp_bytes_per_bit,
|
|
33
|
-
Statistics* statistics, bool using_zstd,
|
|
34
|
-
const FilterPolicy* filter_policy)
|
|
35
|
-
: read_amp_bytes_per_bit_(read_amp_bytes_per_bit),
|
|
36
|
-
statistics_(statistics),
|
|
37
|
-
using_zstd_(using_zstd),
|
|
38
|
-
filter_policy_(filter_policy) {}
|
|
39
|
-
|
|
40
|
-
Status operator()(const void* buf, size_t size, void** out_obj,
|
|
41
|
-
size_t* charge) {
|
|
24
|
+
Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit,
|
|
25
|
+
Statistics* statistics, bool using_zstd,
|
|
26
|
+
const FilterPolicy* filter_policy) {
|
|
27
|
+
return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy](
|
|
28
|
+
const void* buf, size_t size, void** out_obj,
|
|
29
|
+
size_t* charge) -> Status {
|
|
42
30
|
assert(buf != nullptr);
|
|
43
31
|
std::unique_ptr<char[]> buf_data(new char[size]());
|
|
44
32
|
memcpy(buf_data.get(), buf, size);
|
|
45
33
|
BlockContents bc = BlockContents(std::move(buf_data), size);
|
|
46
34
|
TBlocklike* ucd_ptr = BlocklikeTraits<TBlocklike>::Create(
|
|
47
|
-
std::move(bc),
|
|
48
|
-
|
|
35
|
+
std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd,
|
|
36
|
+
filter_policy);
|
|
49
37
|
*out_obj = reinterpret_cast<void*>(ucd_ptr);
|
|
50
38
|
*charge = size;
|
|
51
39
|
return Status::OK();
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
private:
|
|
55
|
-
const size_t read_amp_bytes_per_bit_;
|
|
56
|
-
Statistics* statistics_;
|
|
57
|
-
const bool using_zstd_;
|
|
58
|
-
const FilterPolicy* filter_policy_;
|
|
59
|
-
};
|
|
40
|
+
};
|
|
41
|
+
}
|
|
60
42
|
|
|
61
43
|
template <>
|
|
62
44
|
class BlocklikeTraits<BlockContents> {
|
|
@@ -625,7 +625,7 @@ TEST(DataBlockHashIndex, BlockBoundary) {
|
|
|
625
625
|
InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
|
|
626
626
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
627
627
|
GetContext::kNotFound, seek_ukey, &value, nullptr,
|
|
628
|
-
nullptr, true, nullptr, nullptr);
|
|
628
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
629
629
|
|
|
630
630
|
TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
|
|
631
631
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
@@ -650,7 +650,7 @@ TEST(DataBlockHashIndex, BlockBoundary) {
|
|
|
650
650
|
InternalKey seek_ikey(seek_ukey, 60, kTypeValue);
|
|
651
651
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
652
652
|
GetContext::kNotFound, seek_ukey, &value, nullptr,
|
|
653
|
-
nullptr, true, nullptr, nullptr);
|
|
653
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
654
654
|
|
|
655
655
|
TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
|
|
656
656
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
@@ -675,7 +675,7 @@ TEST(DataBlockHashIndex, BlockBoundary) {
|
|
|
675
675
|
InternalKey seek_ikey(seek_ukey, 120, kTypeValue);
|
|
676
676
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
677
677
|
GetContext::kNotFound, seek_ukey, &value, nullptr,
|
|
678
|
-
nullptr, true, nullptr, nullptr);
|
|
678
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
679
679
|
|
|
680
680
|
TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
|
|
681
681
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
@@ -700,7 +700,7 @@ TEST(DataBlockHashIndex, BlockBoundary) {
|
|
|
700
700
|
InternalKey seek_ikey(seek_ukey, 5, kTypeValue);
|
|
701
701
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
702
702
|
GetContext::kNotFound, seek_ukey, &value, nullptr,
|
|
703
|
-
nullptr, true, nullptr, nullptr);
|
|
703
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
704
704
|
|
|
705
705
|
TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options);
|
|
706
706
|
ASSERT_EQ(get_context.State(), GetContext::kNotFound);
|
|
@@ -119,7 +119,8 @@ class CuckooReaderTest : public testing::Test {
|
|
|
119
119
|
PinnableSlice value;
|
|
120
120
|
GetContext get_context(ucomp, nullptr, nullptr, nullptr,
|
|
121
121
|
GetContext::kNotFound, Slice(user_keys[i]), &value,
|
|
122
|
-
nullptr, nullptr,
|
|
122
|
+
nullptr, nullptr, nullptr, nullptr, true, nullptr,
|
|
123
|
+
nullptr);
|
|
123
124
|
ASSERT_OK(
|
|
124
125
|
reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr));
|
|
125
126
|
ASSERT_STREQ(values[i].c_str(), value.data());
|
|
@@ -341,8 +342,8 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) {
|
|
|
341
342
|
AppendInternalKey(&not_found_key, ikey);
|
|
342
343
|
PinnableSlice value;
|
|
343
344
|
GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound,
|
|
344
|
-
Slice(not_found_key), &value, nullptr, nullptr,
|
|
345
|
-
nullptr, nullptr);
|
|
345
|
+
Slice(not_found_key), &value, nullptr, nullptr,
|
|
346
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
346
347
|
ASSERT_OK(
|
|
347
348
|
reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr));
|
|
348
349
|
ASSERT_TRUE(value.empty());
|
|
@@ -356,7 +357,8 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) {
|
|
|
356
357
|
value.Reset();
|
|
357
358
|
GetContext get_context2(ucmp, nullptr, nullptr, nullptr,
|
|
358
359
|
GetContext::kNotFound, Slice(not_found_key2), &value,
|
|
359
|
-
nullptr, nullptr,
|
|
360
|
+
nullptr, nullptr, nullptr, nullptr, true, nullptr,
|
|
361
|
+
nullptr);
|
|
360
362
|
ASSERT_OK(
|
|
361
363
|
reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr));
|
|
362
364
|
ASSERT_TRUE(value.empty());
|
|
@@ -370,9 +372,9 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) {
|
|
|
370
372
|
AddHashLookups(ExtractUserKey(unused_key).ToString(),
|
|
371
373
|
kNumHashFunc, kNumHashFunc);
|
|
372
374
|
value.Reset();
|
|
373
|
-
GetContext get_context3(
|
|
374
|
-
|
|
375
|
-
|
|
375
|
+
GetContext get_context3(
|
|
376
|
+
ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key),
|
|
377
|
+
&value, nullptr, nullptr, nullptr, nullptr, true, nullptr, nullptr);
|
|
376
378
|
ASSERT_OK(
|
|
377
379
|
reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr));
|
|
378
380
|
ASSERT_TRUE(value.empty());
|
|
@@ -447,7 +449,7 @@ void WriteFile(const std::vector<std::string>& keys,
|
|
|
447
449
|
// Assume only the fast path is triggered
|
|
448
450
|
GetContext get_context(nullptr, nullptr, nullptr, nullptr,
|
|
449
451
|
GetContext::kNotFound, Slice(), &value, nullptr,
|
|
450
|
-
nullptr, true, nullptr, nullptr);
|
|
452
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
451
453
|
for (uint64_t i = 0; i < num; ++i) {
|
|
452
454
|
value.Reset();
|
|
453
455
|
value.clear();
|
|
@@ -496,7 +498,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) {
|
|
|
496
498
|
// Assume only the fast path is triggered
|
|
497
499
|
GetContext get_context(nullptr, nullptr, nullptr, nullptr,
|
|
498
500
|
GetContext::kNotFound, Slice(), &value, nullptr,
|
|
499
|
-
nullptr, true, nullptr, nullptr);
|
|
501
|
+
nullptr, nullptr, true, nullptr, nullptr);
|
|
500
502
|
uint64_t start_time = env->NowMicros();
|
|
501
503
|
if (batch_size > 0) {
|
|
502
504
|
for (uint64_t i = 0; i < num; i += batch_size) {
|