@nxtedition/rocksdb 7.0.38 → 7.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.cc +62 -33
- package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +27 -11
- package/deps/rocksdb/rocksdb/cache/clock_cache.cc +310 -337
- package/deps/rocksdb/rocksdb/cache/clock_cache.h +394 -352
- package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
- package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
- package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
- package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
- package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
- package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
- package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
- package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
- package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
- package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
- package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
- package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
- package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
- package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
- package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
- package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
- package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
- package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
- package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
- package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
- package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
- package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
- package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
- package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
- package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
- package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
- package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +100 -78
- package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
- package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
- package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
- package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
- package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
- package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
- package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
- package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
- package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
- package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
- package/index.js +2 -2
- package/iterator.js +1 -1
- package/max_rev_operator.h +114 -0
- package/package.json +1 -1
- package/prebuilds/darwin-arm64/node.napi.node +0 -0
- package/prebuilds/linux-x64/node.napi.node +0 -0
|
@@ -23,102 +23,137 @@
|
|
|
23
23
|
#include "rocksdb/cache.h"
|
|
24
24
|
#include "rocksdb/secondary_cache.h"
|
|
25
25
|
#include "util/autovector.h"
|
|
26
|
-
#include "util/distributed_mutex.h"
|
|
27
26
|
|
|
28
27
|
namespace ROCKSDB_NAMESPACE {
|
|
29
28
|
|
|
30
29
|
namespace clock_cache {
|
|
31
30
|
|
|
32
|
-
//
|
|
33
|
-
// and clock eviction.
|
|
31
|
+
// An experimental alternative to LRUCache, using a lock-free, open-addressed
|
|
32
|
+
// hash table and clock eviction.
|
|
34
33
|
|
|
35
|
-
|
|
36
|
-
//
|
|
34
|
+
// ----------------------------------------------------------------------------
|
|
35
|
+
// 1. INTRODUCTION
|
|
37
36
|
//
|
|
38
|
-
//
|
|
39
|
-
//
|
|
40
|
-
//
|
|
41
|
-
//
|
|
42
|
-
//
|
|
43
|
-
//
|
|
44
|
-
//
|
|
45
|
-
//
|
|
46
|
-
// referenced. In particular, when an handle becomes referenced, it's
|
|
47
|
-
// temporarily taken out of clock until all references to it are released.
|
|
48
|
-
// (M) Marked for deletion (or invisible): An handle is marked for deletion
|
|
49
|
-
// when an operation attempts to delete it, but the handle is externally
|
|
50
|
-
// referenced, so it can't be immediately deleted. When this mark is placed,
|
|
51
|
-
// lookups will no longer be able to find it. Consequently, no more external
|
|
52
|
-
// references will be taken to the handle. When a handle is marked for
|
|
53
|
-
// deletion, we also say it's invisible.
|
|
54
|
-
// These properties induce 4 different states, with transitions defined as
|
|
55
|
-
// follows:
|
|
56
|
-
// - Not M --> M: When a handle is deleted or replaced by a new version, but
|
|
57
|
-
// not immediately evicted.
|
|
58
|
-
// - M --> not M: This cannot happen. Once a handle is marked for deletion,
|
|
59
|
-
// there is no can't go back.
|
|
60
|
-
// - R --> not R: When all references to an handle are released.
|
|
61
|
-
// - Not R --> R: When an unreferenced handle becomes referenced. This can only
|
|
62
|
-
// happen if the handle is visible, since references to an handle can only be
|
|
63
|
-
// created when it's visible.
|
|
37
|
+
// In RocksDB, a Cache is a concurrent unordered dictionary that supports
|
|
38
|
+
// external references (a.k.a. user references). A ClockCache is a type of Cache
|
|
39
|
+
// that uses the clock algorithm as its eviction policy. Internally, a
|
|
40
|
+
// ClockCache is an open-addressed hash table that stores all KV pairs in a
|
|
41
|
+
// large array. Every slot in the hash table is a ClockHandle, which holds a KV
|
|
42
|
+
// pair plus some additional metadata that controls the different aspects of the
|
|
43
|
+
// cache: external references, the hashing mechanism, concurrent access and the
|
|
44
|
+
// clock algorithm.
|
|
64
45
|
//
|
|
65
|
-
///////////////////////////////////////////////////////////////////////////////
|
|
66
|
-
// Part 2: Hash table structure
|
|
67
46
|
//
|
|
68
|
-
//
|
|
69
|
-
//
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
// the
|
|
73
|
-
//
|
|
74
|
-
//
|
|
75
|
-
//
|
|
76
|
-
//
|
|
77
|
-
//
|
|
47
|
+
// 2. EXTERNAL REFERENCES
|
|
48
|
+
//
|
|
49
|
+
// An externally referenced handle can't be deleted (either evicted by the clock
|
|
50
|
+
// algorithm, or explicitly deleted) or replaced by a new version (via an insert
|
|
51
|
+
// of the same key) until all external references to it have been released by
|
|
52
|
+
// the users. ClockHandles have two members to support external references:
|
|
53
|
+
// - EXTERNAL_REFS counter: The number of external refs. When EXTERNAL_REFS > 0,
|
|
54
|
+
// the handle is externally referenced. Updates that intend to modify the
|
|
55
|
+
// handle will refrain from doing so. Eventually, when all references are
|
|
56
|
+
// released, we have EXTERNAL_REFS == 0, and updates can operate normally on
|
|
57
|
+
// the handle.
|
|
58
|
+
// - WILL_BE_DELETED flag: A handle is marked for deletion when an operation
|
|
59
|
+
// decides the handle should be deleted. This happens either when the last
|
|
60
|
+
// reference to a handle is released (and the release operation is instructed
|
|
61
|
+
// to delete on last reference) or when a delete operation is called on
|
|
62
|
+
// the item. This flag is needed because an externally referenced handle
|
|
63
|
+
// can't be immediately deleted. In these cases, the flag will be later read
|
|
64
|
+
// and acted upon by the eviction algorithm. Importantly, WILL_BE_DELETED is
|
|
65
|
+
// used not only to defer deletions, but also as a barrier for external
|
|
66
|
+
// references: once WILL_BE_DELETED is set, lookups (which are the means to
|
|
67
|
+
// acquire new external references) will ignore the handle. For this reason,
|
|
68
|
+
// when WILL_BE_DELETED is set, we say the handle is invisible (and
|
|
69
|
+
// otherwise, that it's visible).
|
|
70
|
+
//
|
|
71
|
+
//
|
|
72
|
+
// 3. HASHING AND COLLISION RESOLUTION
|
|
73
|
+
//
|
|
74
|
+
// ClockCache uses an open-addressed hash table to store the handles.
|
|
75
|
+
// We use a variant of tombstones to manage collisions: every slot keeps a
|
|
76
|
+
// count of how many KV pairs that are currently in the cache have probed the
|
|
77
|
+
// slot in an attempt to insert. Probes are generated with double-hashing
|
|
78
|
+
// (although the code can be easily modified to use other probing schemes, like
|
|
79
|
+
// linear probing).
|
|
80
|
+
//
|
|
81
|
+
// A slot in the hash table can be in a few different states:
|
|
82
|
+
// - Element: The slot contains an element. This is indicated with the
|
|
83
|
+
// IS_ELEMENT flag. Element can be sub-classified depending on the
|
|
84
|
+
// value of WILL_BE_DELETED:
|
|
85
|
+
// * Visible element.
|
|
86
|
+
// * Invisible element.
|
|
87
|
+
// - Tombstone: The slot doesn't contain an element, but there is some other
|
|
78
88
|
// element that probed this slot during its insertion.
|
|
79
|
-
//
|
|
80
|
-
// When a ghost is removed from the table, it can either transition to being a
|
|
81
|
-
// tombstone or an empty slot, depending on the number of displacements of the
|
|
82
|
-
// slot. In any case, the slot becomes available. When a handle is inserted
|
|
83
|
-
// into that slot, it becomes a visible element again.
|
|
89
|
+
// - Empty: The slot is unused---it's neither an element nor a tombstone.
|
|
84
90
|
//
|
|
85
|
-
|
|
86
|
-
//
|
|
91
|
+
// A slot cycles through the following sequence of states:
|
|
92
|
+
// empty or tombstone --> visible element --> invisible element -->
|
|
93
|
+
// empty or tombstone. Initially a slot is available---it's either
|
|
94
|
+
// empty or a tombstone. As soon as a KV pair is written into the slot, it
|
|
95
|
+
// becomes a visible element. At some point, the handle will be deleted
|
|
96
|
+
// by an explicit delete operation, the eviction algorithm, or an overwriting
|
|
97
|
+
// insert. In any case, the handle is marked for deletion. When an
|
|
98
|
+
// attempt to delete the element finally succeeds, the slot is freed up
|
|
99
|
+
// and becomes available again.
|
|
87
100
|
//
|
|
88
|
-
// We maintain a circular buffer with the handles available for eviction,
|
|
89
|
-
// which the clock algorithm traverses (using a "clock pointer") to pick the
|
|
90
|
-
// next victim. We use the hash table array as the circular buffer, and mark
|
|
91
|
-
// the handles that are evictable. For this we use different clock flags, namely
|
|
92
|
-
// NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and HIGH
|
|
93
|
-
// represent how close an element is from being evictable, LOW being immediately
|
|
94
|
-
// evictable. NONE means the slot is not evictable. This is due to one of the
|
|
95
|
-
// following reasons:
|
|
96
|
-
// (i) the slot doesn't contain an element, or
|
|
97
|
-
// (ii) the slot contains an element that is in R state, or
|
|
98
|
-
// (iii) the slot contains an element that was in R state but it's
|
|
99
|
-
// not any more, and the clock pointer has not swept through the
|
|
100
|
-
// slot since the element stopped being referenced.
|
|
101
101
|
//
|
|
102
|
-
//
|
|
103
|
-
// two cases there are other metadata fields that already capture the state.
|
|
104
|
-
// When an element stops being referenced (and is not deleted), the clock
|
|
105
|
-
// algorithm must acknowledge this, and assign a non-NONE priority to make
|
|
106
|
-
// the element evictable again.
|
|
102
|
+
// 4. CONCURRENCY
|
|
107
103
|
//
|
|
108
|
-
|
|
109
|
-
//
|
|
104
|
+
// ClockCache is lock-free. At a high level, we synchronize the operations
|
|
105
|
+
// using a read-prioritized, non-blocking variant of RW locks on every slot of
|
|
106
|
+
// the hash table. To do this we generalize the concept of reference:
|
|
107
|
+
// - Internal reference: Taken by a thread that is attempting to read a slot
|
|
108
|
+
// or do a very precise type of update.
|
|
109
|
+
// - Exclusive reference: Taken by a thread that is attempting to write a
|
|
110
|
+
// slot extensively.
|
|
110
111
|
//
|
|
111
|
-
// We
|
|
112
|
-
//
|
|
113
|
-
//
|
|
114
|
-
//
|
|
115
|
-
//
|
|
116
|
-
//
|
|
112
|
+
// We defer the precise definitions to the comments in the code below.
|
|
113
|
+
// A crucial feature of our references is that attempting to take one never
|
|
114
|
+
// blocks the thread. Another important feature is that readers are
|
|
115
|
+
// prioritized, as they use extremely fast synchronization primitives---they
|
|
116
|
+
// use atomic arithmetic/bit operations, but no compare-and-swaps (which are
|
|
117
|
+
// much slower).
|
|
117
118
|
//
|
|
118
|
-
//
|
|
119
|
-
//
|
|
119
|
+
// Internal references are used by threads to read slots during a probing
|
|
120
|
+
// sequence, making them the most common references (probing is performed
|
|
121
|
+
// in almost every operation, not just lookups). During a lookup, once
|
|
122
|
+
// the target element is found, and just before the handle is handed over
|
|
123
|
+
// to the user, an internal reference is converted into an external reference.
|
|
124
|
+
// During an update operation, once the target slot is found, an internal
|
|
125
|
+
// reference is converted into an exclusive reference. Interestingly, we
|
|
126
|
+
// can't atomically upgrade from internal to exclusive, or we may run into a
|
|
127
|
+
// deadlock. Releasing the internal reference and then taking an exclusive
|
|
128
|
+
// reference avoids the deadlock, but then the handle may change in between.
|
|
129
|
+
// One of the key observations we use in our implementation is that we can
|
|
130
|
+
// make up for this lack of atomicity using IS_ELEMENT and WILL_BE_DELETED.
|
|
120
131
|
//
|
|
121
|
-
|
|
132
|
+
// Distinguishing internal from external references is useful for two reasons:
|
|
133
|
+
// - Internal references are short lived, but external references are typically
|
|
134
|
+
// not. This is helpful when acquiring an exclusive ref: if there are any
|
|
135
|
+
// external references to the item, it's probably not worth waiting until
|
|
136
|
+
// they go away.
|
|
137
|
+
// - We can precisely determine when there are no more external references to a
|
|
138
|
+
// handle, and proceed to mark it for deletion. This is useful when users
|
|
139
|
+
// release external references.
|
|
140
|
+
//
|
|
141
|
+
//
|
|
142
|
+
// 5. CLOCK ALGORITHM
|
|
143
|
+
//
|
|
144
|
+
// The clock algorithm circularly sweeps through the hash table to find the next
|
|
145
|
+
// victim. Recall that handles that are referenced are not evictable; the clock
|
|
146
|
+
// algorithm never picks those. We use different clock priorities: NONE, LOW,
|
|
147
|
+
// MEDIUM and HIGH. Priorities LOW, MEDIUM and HIGH represent how close an
|
|
148
|
+
// element is to being evicted, LOW being the closest to eviction. NONE means
|
|
149
|
+
// the slot is not evictable. NONE priority is used in one of the following
|
|
150
|
+
// cases:
|
|
151
|
+
// (a) the slot doesn't contain an element, or
|
|
152
|
+
// (b) the slot contains an externally referenced element, or
|
|
153
|
+
// (c) the slot contains an element that used to be externally referenced,
|
|
154
|
+
// and the clock pointer has not swept through the slot since the element
|
|
155
|
+
// stopped being externally referenced.
|
|
156
|
+
// ----------------------------------------------------------------------------
|
|
122
157
|
|
|
123
158
|
// The load factor p is a real number in (0, 1) such that at all
|
|
124
159
|
// times at most a fraction p of all slots, without counting tombstones,
|
|
@@ -138,15 +173,18 @@ constexpr double kLoadFactor = 0.35;
|
|
|
138
173
|
|
|
139
174
|
// The user can exceed kLoadFactor if the sizes of the inserted values don't
|
|
140
175
|
// match estimated_value_size, or if strict_capacity_limit == false. To
|
|
141
|
-
// avoid performance
|
|
176
|
+
// avoid a performance drop, we set a strict upper bound on the load factor.
|
|
142
177
|
constexpr double kStrictLoadFactor = 0.7;
|
|
143
178
|
|
|
179
|
+
// Maximum number of spins when trying to acquire a ref.
|
|
180
|
+
// TODO(Guido) This value was set arbitrarily. Is it appropriate?
|
|
181
|
+
// What's the best way to bound the spinning?
|
|
182
|
+
constexpr uint32_t kSpinsPerTry = 100000;
|
|
183
|
+
|
|
144
184
|
// Arbitrary seeds.
|
|
145
185
|
constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
|
|
146
186
|
constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
|
|
147
187
|
|
|
148
|
-
// An experimental (under development!) alternative to LRUCache.
|
|
149
|
-
|
|
150
188
|
struct ClockHandle {
|
|
151
189
|
void* value;
|
|
152
190
|
Cache::DeleterFn deleter;
|
|
@@ -154,49 +192,6 @@ struct ClockHandle {
|
|
|
154
192
|
size_t total_charge;
|
|
155
193
|
std::array<char, kCacheKeySize> key_data;
|
|
156
194
|
|
|
157
|
-
static constexpr uint8_t kExternalRefsOffset = 0;
|
|
158
|
-
static constexpr uint8_t kSharedRefsOffset = 15;
|
|
159
|
-
static constexpr uint8_t kExclusiveRefOffset = 30;
|
|
160
|
-
static constexpr uint8_t kWillBeDeletedOffset = 31;
|
|
161
|
-
|
|
162
|
-
enum Refs : uint32_t {
|
|
163
|
-
// Number of external references to the slot.
|
|
164
|
-
EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
|
|
165
|
-
<< kExternalRefsOffset, // Bits 0, ..., 14
|
|
166
|
-
// Number of internal references plus external references to the slot.
|
|
167
|
-
SHARED_REFS = ((uint32_t{1} << 15) - 1)
|
|
168
|
-
<< kSharedRefsOffset, // Bits 15, ..., 29
|
|
169
|
-
// Whether a thread has an exclusive reference to the slot.
|
|
170
|
-
EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
|
|
171
|
-
// Whether the handle will be deleted soon. When this bit is set, new
|
|
172
|
-
// internal
|
|
173
|
-
// or external references to this handle stop being accepted.
|
|
174
|
-
// There is an exception: external references can be created from
|
|
175
|
-
// existing external references, or converting from existing internal
|
|
176
|
-
// references.
|
|
177
|
-
WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
|
|
178
|
-
|
|
179
|
-
// Shared references (i.e., external and internal references) and exclusive
|
|
180
|
-
// references are our custom implementation of RW locks---external and
|
|
181
|
-
// internal references are read locks, and exclusive references are write
|
|
182
|
-
// locks. We prioritize readers, which never block; in fact, they don't even
|
|
183
|
-
// use compare-and-swap operations. Using our own implementation of RW locks
|
|
184
|
-
// allows us to save many atomic operations by packing data more carefully.
|
|
185
|
-
// In particular:
|
|
186
|
-
// - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an
|
|
187
|
-
// internal
|
|
188
|
-
// reference into an external reference in a single atomic arithmetic
|
|
189
|
-
// operation.
|
|
190
|
-
// - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take
|
|
191
|
-
// a shared reference and check whether the entry is marked for deletion
|
|
192
|
-
// in a single atomic arithmetic operation.
|
|
193
|
-
};
|
|
194
|
-
|
|
195
|
-
static constexpr uint32_t kOneInternalRef = 0x8000;
|
|
196
|
-
static constexpr uint32_t kOneExternalRef = 0x8001;
|
|
197
|
-
|
|
198
|
-
std::atomic<uint32_t> refs;
|
|
199
|
-
|
|
200
195
|
static constexpr uint8_t kIsElementOffset = 1;
|
|
201
196
|
static constexpr uint8_t kClockPriorityOffset = 2;
|
|
202
197
|
static constexpr uint8_t kIsHitOffset = 4;
|
|
@@ -209,7 +204,7 @@ struct ClockHandle {
|
|
|
209
204
|
CLOCK_PRIORITY = 3 << kClockPriorityOffset,
|
|
210
205
|
// Whether the handle has been looked up after its insertion.
|
|
211
206
|
HAS_HIT = 1 << kIsHitOffset,
|
|
212
|
-
// The value of Cache::Priority
|
|
207
|
+
// The value of Cache::Priority of the handle.
|
|
213
208
|
CACHE_PRIORITY = 1 << kCachePriorityOffset,
|
|
214
209
|
};
|
|
215
210
|
|
|
@@ -226,30 +221,67 @@ struct ClockHandle {
|
|
|
226
221
|
// up in this slot or a higher one.
|
|
227
222
|
std::atomic<uint32_t> displacements;
|
|
228
223
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
224
|
+
static constexpr uint8_t kExternalRefsOffset = 0;
|
|
225
|
+
static constexpr uint8_t kSharedRefsOffset = 15;
|
|
226
|
+
static constexpr uint8_t kExclusiveRefOffset = 30;
|
|
227
|
+
static constexpr uint8_t kWillBeDeletedOffset = 31;
|
|
228
|
+
|
|
229
|
+
enum Refs : uint32_t {
|
|
230
|
+
// Synchronization model:
|
|
231
|
+
// - An external reference guarantees that hash, value, key_data
|
|
232
|
+
// and the IS_ELEMENT flag are not modified. Doesn't allow
|
|
233
|
+
// any writes.
|
|
234
|
+
// - An internal reference has the same guarantees as an
|
|
235
|
+
// external reference, and additionally allows the following
|
|
236
|
+
// idempotent updates on the handle:
|
|
237
|
+
// * set CLOCK_PRIORITY to NONE;
|
|
238
|
+
// * set the HAS_HIT bit;
|
|
239
|
+
// * set the WILL_BE_DELETED bit.
|
|
240
|
+
// - A shared reference is either an external reference or an
|
|
241
|
+
// internal reference.
|
|
242
|
+
// - An exclusive reference guarantees that no other thread has a shared
|
|
243
|
+
// or exclusive reference to the handle, and allows writes
|
|
244
|
+
// on the handle.
|
|
245
|
+
|
|
246
|
+
// Number of external references to the slot.
|
|
247
|
+
EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
|
|
248
|
+
<< kExternalRefsOffset, // Bits 0, ..., 14
|
|
249
|
+
// Number of internal references plus external references to the slot.
|
|
250
|
+
SHARED_REFS = ((uint32_t{1} << 15) - 1)
|
|
251
|
+
<< kSharedRefsOffset, // Bits 15, ..., 29
|
|
252
|
+
// Whether a thread has an exclusive reference to the slot.
|
|
253
|
+
EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
|
|
254
|
+
// Whether the handle will be deleted soon. When this bit is set, new
|
|
255
|
+
// internal
|
|
256
|
+
// or external references to this handle stop being accepted.
|
|
257
|
+
// There is an exception: external references can be created from
|
|
258
|
+
// existing external references, or converting from existing internal
|
|
259
|
+
// references.
|
|
260
|
+
WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
|
|
261
|
+
|
|
262
|
+
// Having these 4 fields in a single variable allows us to support the
|
|
263
|
+
// following operations efficiently:
|
|
264
|
+
// - Convert an internal reference into an external reference in a single
|
|
265
|
+
// atomic arithmetic operation.
|
|
266
|
+
// - Attempt to take a shared reference using a single atomic arithmetic
|
|
267
|
+
// operation. This is because we can increment the internal ref count
|
|
268
|
+
// as well as checking whether the entry is marked for deletion using a
|
|
269
|
+
// single atomic arithmetic operation (and one non-atomic comparison).
|
|
270
|
+
};
|
|
271
|
+
|
|
272
|
+
static constexpr uint32_t kOneInternalRef = 0x8000;
|
|
273
|
+
static constexpr uint32_t kOneExternalRef = 0x8001;
|
|
274
|
+
|
|
275
|
+
std::atomic<uint32_t> refs;
|
|
244
276
|
|
|
245
277
|
ClockHandle()
|
|
246
278
|
: value(nullptr),
|
|
247
279
|
deleter(nullptr),
|
|
248
280
|
hash(0),
|
|
249
281
|
total_charge(0),
|
|
250
|
-
refs(0),
|
|
251
282
|
flags(0),
|
|
252
|
-
displacements(0)
|
|
283
|
+
displacements(0),
|
|
284
|
+
refs(0) {
|
|
253
285
|
SetWillBeDeleted(false);
|
|
254
286
|
SetIsElement(false);
|
|
255
287
|
SetClockPriority(ClockPriority::NONE);
|
|
@@ -257,26 +289,66 @@ struct ClockHandle {
|
|
|
257
289
|
key_data.fill(0);
|
|
258
290
|
}
|
|
259
291
|
|
|
292
|
+
// The copy ctor and assignment operator are only used to copy a handle
|
|
293
|
+
// for immediate deletion. (We need to copy because the slot may become
|
|
294
|
+
// re-used before the deletion is completed.) We only copy the necessary
|
|
295
|
+
// members to carry out the deletion. In particular, we don't need
|
|
296
|
+
// the atomic members.
|
|
260
297
|
ClockHandle(const ClockHandle& other) { *this = other; }
|
|
261
298
|
|
|
262
299
|
void operator=(const ClockHandle& other) {
|
|
263
300
|
value = other.value;
|
|
264
301
|
deleter = other.deleter;
|
|
265
|
-
hash = other.hash;
|
|
266
|
-
total_charge = other.total_charge;
|
|
267
|
-
refs.store(other.refs);
|
|
268
302
|
key_data = other.key_data;
|
|
269
|
-
|
|
270
|
-
SetWillBeDeleted(other.WillBeDeleted());
|
|
271
|
-
SetIsElement(other.IsElement());
|
|
272
|
-
SetClockPriority(other.GetClockPriority());
|
|
273
|
-
SetCachePriority(other.GetCachePriority());
|
|
274
|
-
displacements.store(other.displacements);
|
|
303
|
+
total_charge = other.total_charge;
|
|
275
304
|
}
|
|
276
305
|
|
|
277
306
|
Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
|
|
278
307
|
|
|
279
|
-
|
|
308
|
+
void FreeData() {
|
|
309
|
+
if (deleter) {
|
|
310
|
+
(*deleter)(key(), value);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
// Calculate the memory usage by metadata.
|
|
315
|
+
inline size_t CalcMetaCharge(
|
|
316
|
+
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
317
|
+
if (metadata_charge_policy != kFullChargeCacheMetadata) {
|
|
318
|
+
return 0;
|
|
319
|
+
} else {
|
|
320
|
+
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
|
321
|
+
// return malloc_usable_size(
|
|
322
|
+
// const_cast<void*>(static_cast<const void*>(this)));
|
|
323
|
+
// #else
|
|
324
|
+
// TODO(Guido) malloc_usable_size only works when we call it on
|
|
325
|
+
// a pointer allocated with malloc. Because our handles are all
|
|
326
|
+
// allocated in a single shot as an array, the user can't call
|
|
327
|
+
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
|
|
328
|
+
// pointer returned by the cache. Moreover, malloc_usable_size
|
|
329
|
+
// expects a heap-allocated handle, but sometimes in our code we
|
|
330
|
+
// wish to pass a stack-allocated handle (this is only a performance
|
|
331
|
+
// concern).
|
|
332
|
+
// What is the right way to compute metadata charges with pre-allocated
|
|
333
|
+
// handles?
|
|
334
|
+
return sizeof(ClockHandle);
|
|
335
|
+
// #endif
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
inline void CalcTotalCharge(
|
|
340
|
+
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
|
|
341
|
+
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
inline size_t GetCharge(
|
|
345
|
+
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
346
|
+
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
|
|
347
|
+
assert(total_charge >= meta_charge);
|
|
348
|
+
return total_charge - meta_charge;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
// flags functions.
|
|
280
352
|
|
|
281
353
|
bool IsElement() const { return flags & IS_ELEMENT; }
|
|
282
354
|
|
|
@@ -292,10 +364,6 @@ struct ClockHandle {
|
|
|
292
364
|
|
|
293
365
|
void SetHit() { flags |= HAS_HIT; }
|
|
294
366
|
|
|
295
|
-
bool IsInClock() const {
|
|
296
|
-
return GetClockPriority() != ClockHandle::ClockPriority::NONE;
|
|
297
|
-
}
|
|
298
|
-
|
|
299
367
|
Cache::Priority GetCachePriority() const {
|
|
300
368
|
return static_cast<Cache::Priority>(flags & CACHE_PRIORITY);
|
|
301
369
|
}
|
|
@@ -308,6 +376,10 @@ struct ClockHandle {
|
|
|
308
376
|
}
|
|
309
377
|
}
|
|
310
378
|
|
|
379
|
+
bool IsInClock() const {
|
|
380
|
+
return GetClockPriority() != ClockHandle::ClockPriority::NONE;
|
|
381
|
+
}
|
|
382
|
+
|
|
311
383
|
ClockPriority GetClockPriority() const {
|
|
312
384
|
return static_cast<ClockPriority>(flags & Flags::CLOCK_PRIORITY);
|
|
313
385
|
}
|
|
@@ -328,49 +400,6 @@ struct ClockHandle {
|
|
|
328
400
|
flags |= new_priority;
|
|
329
401
|
}
|
|
330
402
|
|
|
331
|
-
void FreeData() {
|
|
332
|
-
if (deleter) {
|
|
333
|
-
(*deleter)(key(), value);
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
|
|
337
|
-
// Calculate the memory usage by metadata.
|
|
338
|
-
inline size_t CalcMetaCharge(
|
|
339
|
-
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
340
|
-
if (metadata_charge_policy != kFullChargeCacheMetadata) {
|
|
341
|
-
return 0;
|
|
342
|
-
} else {
|
|
343
|
-
// #ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
|
344
|
-
// return malloc_usable_size(
|
|
345
|
-
// const_cast<void*>(static_cast<const void*>(this)));
|
|
346
|
-
// #else
|
|
347
|
-
// TODO(Guido) malloc_usable_size only works when we call it on
|
|
348
|
-
// a pointer allocated with malloc. Because our handles are all
|
|
349
|
-
// allocated in a single shot as an array, the user can't call
|
|
350
|
-
// CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
|
|
351
|
-
// pointer returned by the cache. Moreover, malloc_usable_size
|
|
352
|
-
// expects a heap-allocated handle, but sometimes in our code we
|
|
353
|
-
// wish to pass a stack-allocated handle (this is only a performance
|
|
354
|
-
// concern).
|
|
355
|
-
// What is the right way to compute metadata charges with pre-allocated
|
|
356
|
-
// handles?
|
|
357
|
-
return sizeof(ClockHandle);
|
|
358
|
-
// #endif
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
inline void CalcTotalCharge(
|
|
363
|
-
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
|
|
364
|
-
total_charge = charge + CalcMetaCharge(metadata_charge_policy);
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
inline size_t GetCharge(
|
|
368
|
-
CacheMetadataChargePolicy metadata_charge_policy) const {
|
|
369
|
-
size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
|
|
370
|
-
assert(total_charge >= meta_charge);
|
|
371
|
-
return total_charge - meta_charge;
|
|
372
|
-
}
|
|
373
|
-
|
|
374
403
|
inline bool IsEmpty() const {
|
|
375
404
|
return !this->IsElement() && this->displacements == 0;
|
|
376
405
|
}
|
|
@@ -380,11 +409,12 @@ struct ClockHandle {
|
|
|
380
409
|
}
|
|
381
410
|
|
|
382
411
|
inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
|
|
383
|
-
return this->
|
|
384
|
-
this->key() == some_key;
|
|
412
|
+
return this->hash == some_hash && this->key() == some_key;
|
|
385
413
|
}
|
|
386
414
|
|
|
387
|
-
|
|
415
|
+
// refs functions.
|
|
416
|
+
|
|
417
|
+
inline bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
|
|
388
418
|
|
|
389
419
|
void SetWillBeDeleted(bool will_be_deleted) {
|
|
390
420
|
if (will_be_deleted) {
|
|
@@ -394,28 +424,7 @@ struct ClockHandle {
|
|
|
394
424
|
}
|
|
395
425
|
}
|
|
396
426
|
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
// Tries to take an external ref. Returns true iff it succeeds.
|
|
400
|
-
inline bool TryExternalRef() {
|
|
401
|
-
if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
|
|
402
|
-
return true;
|
|
403
|
-
}
|
|
404
|
-
refs -= kOneExternalRef;
|
|
405
|
-
return false;
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
// Releases an external ref. Returns the new value (this is useful to
|
|
409
|
-
// avoid an extra atomic read).
|
|
410
|
-
inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; }
|
|
411
|
-
|
|
412
|
-
// Take an external ref, assuming there is already one external ref
|
|
413
|
-
// to the handle.
|
|
414
|
-
void Ref() {
|
|
415
|
-
// TODO(Guido) Is it okay to assume that the existing external reference
|
|
416
|
-
// survives until this function returns?
|
|
417
|
-
refs += kOneExternalRef;
|
|
418
|
-
}
|
|
427
|
+
bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
|
|
419
428
|
|
|
420
429
|
// Tries to take an internal ref. Returns true iff it succeeds.
|
|
421
430
|
inline bool TryInternalRef() {
|
|
@@ -426,9 +435,19 @@ struct ClockHandle {
|
|
|
426
435
|
return false;
|
|
427
436
|
}
|
|
428
437
|
|
|
429
|
-
|
|
438
|
+
// Tries to take an external ref. Returns true iff it succeeds.
|
|
439
|
+
inline bool TryExternalRef() {
|
|
440
|
+
if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
|
|
441
|
+
return true;
|
|
442
|
+
}
|
|
443
|
+
refs -= kOneExternalRef;
|
|
444
|
+
return false;
|
|
445
|
+
}
|
|
430
446
|
|
|
431
447
|
// Tries to take an exclusive ref. Returns true iff it succeeds.
|
|
448
|
+
// TODO(Guido) After every TryExclusiveRef call, we always call
|
|
449
|
+
// WillBeDeleted(). We could save an atomic read by having an output parameter
|
|
450
|
+
// with the last value of refs.
|
|
432
451
|
inline bool TryExclusiveRef() {
|
|
433
452
|
uint32_t will_be_deleted = refs & WILL_BE_DELETED;
|
|
434
453
|
uint32_t expected = will_be_deleted;
|
|
@@ -436,15 +455,18 @@ struct ClockHandle {
|
|
|
436
455
|
EXCLUSIVE_REF | will_be_deleted);
|
|
437
456
|
}
|
|
438
457
|
|
|
439
|
-
// Repeatedly tries to take an exclusive reference, but
|
|
440
|
-
// as an external reference is detected (
|
|
441
|
-
// presumably be too long).
|
|
442
|
-
inline bool
|
|
458
|
+
// Repeatedly tries to take an exclusive reference, but aborts as soon
|
|
459
|
+
// as an external or exclusive reference is detected (since the wait
|
|
460
|
+
// would presumably be too long).
|
|
461
|
+
inline bool SpinTryExclusiveRef() {
|
|
443
462
|
uint32_t expected = 0;
|
|
444
463
|
uint32_t will_be_deleted = 0;
|
|
464
|
+
uint32_t spins = kSpinsPerTry;
|
|
445
465
|
while (!refs.compare_exchange_strong(expected,
|
|
446
|
-
EXCLUSIVE_REF | will_be_deleted)
|
|
447
|
-
|
|
466
|
+
EXCLUSIVE_REF | will_be_deleted) &&
|
|
467
|
+
spins--) {
|
|
468
|
+
std::this_thread::yield();
|
|
469
|
+
if (expected & (EXTERNAL_REFS | EXCLUSIVE_REF)) {
|
|
448
470
|
return false;
|
|
449
471
|
}
|
|
450
472
|
will_be_deleted = expected & WILL_BE_DELETED;
|
|
@@ -453,75 +475,88 @@ struct ClockHandle {
|
|
|
453
475
|
return true;
|
|
454
476
|
}
|
|
455
477
|
|
|
456
|
-
|
|
478
|
+
// Take an external ref, assuming there is already one external ref
|
|
479
|
+
// to the handle.
|
|
480
|
+
void Ref() {
|
|
481
|
+
// TODO(Guido) Is it okay to assume that the existing external reference
|
|
482
|
+
// survives until this function returns?
|
|
483
|
+
refs += kOneExternalRef;
|
|
484
|
+
}
|
|
457
485
|
|
|
458
|
-
|
|
459
|
-
// They guarantee atomicity, i.e., no exclusive refs to the handle
|
|
460
|
-
// can be taken by a different thread during the conversion.
|
|
486
|
+
inline void ReleaseExternalRef() { refs -= kOneExternalRef; }
|
|
461
487
|
|
|
462
|
-
inline void
|
|
463
|
-
refs += kOneInternalRef;
|
|
464
|
-
ReleaseExclusiveRef();
|
|
465
|
-
}
|
|
488
|
+
inline void ReleaseInternalRef() { refs -= kOneInternalRef; }
|
|
466
489
|
|
|
490
|
+
inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
|
|
491
|
+
|
|
492
|
+
// Downgrade an exclusive ref to external.
|
|
467
493
|
inline void ExclusiveToExternalRef() {
|
|
468
494
|
refs += kOneExternalRef;
|
|
469
495
|
ReleaseExclusiveRef();
|
|
470
496
|
}
|
|
471
497
|
|
|
472
|
-
//
|
|
473
|
-
// algorithms to react to a failure?
|
|
474
|
-
inline void InternalToExclusiveRef() {
|
|
475
|
-
uint32_t expected = kOneInternalRef;
|
|
476
|
-
uint32_t will_be_deleted = 0;
|
|
477
|
-
while (!refs.compare_exchange_strong(expected,
|
|
478
|
-
EXCLUSIVE_REF | will_be_deleted)) {
|
|
479
|
-
will_be_deleted = expected & WILL_BE_DELETED;
|
|
480
|
-
expected = kOneInternalRef | will_be_deleted;
|
|
481
|
-
}
|
|
482
|
-
}
|
|
483
|
-
|
|
498
|
+
// Convert an internal ref into external.
|
|
484
499
|
inline void InternalToExternalRef() {
|
|
485
500
|
refs += kOneExternalRef - kOneInternalRef;
|
|
486
501
|
}
|
|
487
502
|
|
|
488
|
-
// TODO(Guido) Same concern.
|
|
489
|
-
inline void ExternalToExclusiveRef() {
|
|
490
|
-
uint32_t expected = kOneExternalRef;
|
|
491
|
-
uint32_t will_be_deleted = 0;
|
|
492
|
-
while (!refs.compare_exchange_strong(expected,
|
|
493
|
-
EXCLUSIVE_REF | will_be_deleted)) {
|
|
494
|
-
will_be_deleted = expected & WILL_BE_DELETED;
|
|
495
|
-
expected = kOneExternalRef | will_be_deleted;
|
|
496
|
-
}
|
|
497
|
-
}
|
|
498
|
-
|
|
499
503
|
}; // struct ClockHandle
|
|
500
504
|
|
|
501
505
|
class ClockHandleTable {
|
|
502
506
|
public:
|
|
503
|
-
explicit ClockHandleTable(int hash_bits);
|
|
507
|
+
explicit ClockHandleTable(size_t capacity, int hash_bits);
|
|
504
508
|
~ClockHandleTable();
|
|
505
509
|
|
|
506
|
-
// Returns a pointer to a visible
|
|
507
|
-
// nullptr if not present.
|
|
510
|
+
// Returns a pointer to a visible handle matching the key/hash, or
|
|
511
|
+
// nullptr if not present. When an actual handle is produced, an
|
|
512
|
+
// internal reference is handed over.
|
|
508
513
|
ClockHandle* Lookup(const Slice& key, uint32_t hash);
|
|
509
514
|
|
|
510
|
-
// Inserts a copy of h into the hash table.
|
|
511
|
-
//
|
|
512
|
-
//
|
|
513
|
-
//
|
|
514
|
-
//
|
|
515
|
-
//
|
|
516
|
-
|
|
515
|
+
// Inserts a copy of h into the hash table. Returns a pointer to the
|
|
516
|
+
// inserted handle, or nullptr if no available slot was found. Every
|
|
517
|
+
// existing visible handle matching the key that is already present in the
|
|
518
|
+
// hash table is marked as WILL_BE_DELETED. The deletion is also attempted,
|
|
519
|
+
// and, if the attempt is successful, the handle is inserted into the
|
|
520
|
+
// autovector deleted. When take_reference is true, the function hands
|
|
521
|
+
// over an external reference on the handle, and otherwise no reference is
|
|
522
|
+
// produced.
|
|
523
|
+
ClockHandle* Insert(ClockHandle* h, autovector<ClockHandle>* deleted,
|
|
524
|
+
bool take_reference);
|
|
525
|
+
|
|
526
|
+
// Assigns h the appropriate clock priority, making it evictable.
|
|
527
|
+
void ClockOn(ClockHandle* h);
|
|
517
528
|
|
|
518
|
-
//
|
|
519
|
-
void
|
|
529
|
+
// Makes h non-evictable.
|
|
530
|
+
void ClockOff(ClockHandle* h);
|
|
520
531
|
|
|
521
|
-
//
|
|
522
|
-
//
|
|
523
|
-
|
|
524
|
-
|
|
532
|
+
// Runs the clock eviction algorithm until there is enough space to
|
|
533
|
+
// insert an element with the given charge.
|
|
534
|
+
void ClockRun(size_t charge);
|
|
535
|
+
|
|
536
|
+
// Remove h from the hash table. Requires an exclusive ref to h.
|
|
537
|
+
void Remove(ClockHandle* h, autovector<ClockHandle>* deleted);
|
|
538
|
+
|
|
539
|
+
// Remove from the hash table all handles with matching key/hash along a
|
|
540
|
+
// probe sequence, starting from the given probe number. Doesn't
|
|
541
|
+
// require any references.
|
|
542
|
+
void RemoveAll(const Slice& key, uint32_t hash, uint32_t& probe,
|
|
543
|
+
autovector<ClockHandle>* deleted);
|
|
544
|
+
|
|
545
|
+
void RemoveAll(const Slice& key, uint32_t hash,
|
|
546
|
+
autovector<ClockHandle>* deleted) {
|
|
547
|
+
uint32_t probe = 0;
|
|
548
|
+
RemoveAll(key, hash, probe, deleted);
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
void Free(autovector<ClockHandle>* deleted);
|
|
552
|
+
|
|
553
|
+
// Tries to remove h from the hash table. If the attempt is successful,
|
|
554
|
+
// the function hands over an exclusive ref to h.
|
|
555
|
+
bool TryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
|
|
556
|
+
|
|
557
|
+
// Similar to TryRemove, except that it spins, increasing the chances of
|
|
558
|
+
// success. Requires that the caller thread has no shared ref to h.
|
|
559
|
+
bool SpinTryRemove(ClockHandle* h, autovector<ClockHandle>* deleted);
|
|
525
560
|
|
|
526
561
|
template <typename T>
|
|
527
562
|
void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
|
|
@@ -531,12 +566,9 @@ class ClockHandleTable {
|
|
|
531
566
|
if (h->TryExclusiveRef()) {
|
|
532
567
|
if (h->IsElement() &&
|
|
533
568
|
(apply_if_will_be_deleted || !h->WillBeDeleted())) {
|
|
534
|
-
// Hand the internal ref over to func, which is now responsible
|
|
535
|
-
// to release it.
|
|
536
569
|
func(h);
|
|
537
|
-
} else {
|
|
538
|
-
h->ReleaseExclusiveRef();
|
|
539
570
|
}
|
|
571
|
+
h->ReleaseExclusiveRef();
|
|
540
572
|
}
|
|
541
573
|
}
|
|
542
574
|
}
|
|
@@ -565,53 +597,81 @@ class ClockHandleTable {
|
|
|
565
597
|
|
|
566
598
|
uint32_t GetOccupancy() const { return occupancy_; }
|
|
567
599
|
|
|
600
|
+
size_t GetUsage() const { return usage_; }
|
|
601
|
+
|
|
602
|
+
size_t GetCapacity() const { return capacity_; }
|
|
603
|
+
|
|
568
604
|
// Returns x mod 2^{length_bits_}.
|
|
569
605
|
uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
|
|
570
606
|
|
|
571
607
|
private:
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
//
|
|
582
|
-
// the
|
|
583
|
-
//
|
|
584
|
-
//
|
|
585
|
-
//
|
|
586
|
-
//
|
|
587
|
-
//
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
//
|
|
598
|
-
//
|
|
608
|
+
// Extracts the element information from a handle (src), and assigns it
|
|
609
|
+
// to a hash table slot (dst). Doesn't touch displacements and refs,
|
|
610
|
+
// which are maintained by the hash table algorithm.
|
|
611
|
+
void Assign(ClockHandle* dst, ClockHandle* src);
|
|
612
|
+
|
|
613
|
+
// Returns the first slot in the probe sequence, starting from the given
|
|
614
|
+
// probe number, with a handle e such that match(e) is true. At every
|
|
615
|
+
// step, the function first tests whether match(e) holds. If this is false,
|
|
616
|
+
// it evaluates stop(e) to decide whether the search should be aborted,
|
|
617
|
+
// and in the affirmative returns -1. For every handle e probed except
|
|
618
|
+
// the last one, the function runs update(e).
|
|
619
|
+
// The probe parameter is modified as follows. We say a probe to a handle
|
|
620
|
+
// e is aborting if match(e) is false and stop(e) is true. Then the final
|
|
621
|
+
// value of probe is one more than the last non-aborting probe during the
|
|
622
|
+
// call. This is so that the variable can be used to keep track of
|
|
623
|
+
// progress across consecutive calls to FindSlot.
|
|
624
|
+
inline ClockHandle* FindSlot(const Slice& key,
|
|
625
|
+
std::function<bool(ClockHandle*)> match,
|
|
626
|
+
std::function<bool(ClockHandle*)> stop,
|
|
627
|
+
std::function<void(ClockHandle*)> update,
|
|
628
|
+
uint32_t& probe);
|
|
629
|
+
|
|
630
|
+
// Returns an available slot for the given key. All copies of the
|
|
631
|
+
// key found along the probing sequence until an available slot is
|
|
632
|
+
// found are marked for deletion. On each of them, a deletion is
|
|
633
|
+
// attempted, and when the attempt succeeds the slot is assigned to
|
|
634
|
+
// the new copy of the element.
|
|
635
|
+
ClockHandle* FindAvailableSlot(const Slice& key, uint32_t hash,
|
|
636
|
+
uint32_t& probe,
|
|
637
|
+
autovector<ClockHandle>* deleted);
|
|
638
|
+
|
|
639
|
+
// After a failed FindSlot call (i.e., with answer -1) in
|
|
640
|
+
// FindAvailableSlot, this function fixes all displacements
|
|
641
|
+
// starting from the 0-th probe, until the given probe.
|
|
599
642
|
void Rollback(const Slice& key, uint32_t probe);
|
|
600
643
|
|
|
601
644
|
// Number of hash bits used for table index.
|
|
602
645
|
// The size of the table is 1 << length_bits_.
|
|
603
|
-
int length_bits_;
|
|
646
|
+
const int length_bits_;
|
|
604
647
|
|
|
605
648
|
// For faster computation of ModTableSize.
|
|
606
649
|
const uint32_t length_bits_mask_;
|
|
607
650
|
|
|
608
|
-
// Number of elements in the table.
|
|
609
|
-
uint32_t occupancy_;
|
|
610
|
-
|
|
611
651
|
// Maximum number of elements the user can store in the table.
|
|
612
|
-
uint32_t occupancy_limit_;
|
|
652
|
+
const uint32_t occupancy_limit_;
|
|
653
|
+
|
|
654
|
+
// Maximum total charge of all elements stored in the table.
|
|
655
|
+
const size_t capacity_;
|
|
613
656
|
|
|
657
|
+
// We partition the following members into different cache lines
|
|
658
|
+
// to avoid false sharing among Lookup, Release, Erase and Insert
|
|
659
|
+
// operations in ClockCacheShard.
|
|
660
|
+
|
|
661
|
+
ALIGN_AS(CACHE_LINE_SIZE)
|
|
662
|
+
// Array of slots comprising the hash table.
|
|
614
663
|
std::unique_ptr<ClockHandle[]> array_;
|
|
664
|
+
|
|
665
|
+
ALIGN_AS(CACHE_LINE_SIZE)
|
|
666
|
+
// Clock algorithm sweep pointer.
|
|
667
|
+
std::atomic<uint32_t> clock_pointer_;
|
|
668
|
+
|
|
669
|
+
ALIGN_AS(CACHE_LINE_SIZE)
|
|
670
|
+
// Number of elements in the table.
|
|
671
|
+
std::atomic<uint32_t> occupancy_;
|
|
672
|
+
|
|
673
|
+
// Memory size for entries residing in the cache.
|
|
674
|
+
std::atomic<size_t> usage_;
|
|
615
675
|
}; // class ClockHandleTable
|
|
616
676
|
|
|
617
677
|
// A single shard of sharded cache.
|
|
@@ -652,20 +712,26 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
|
|
|
652
712
|
Statistics* /*stats*/) override {
|
|
653
713
|
return Lookup(key, hash);
|
|
654
714
|
}
|
|
715
|
+
|
|
655
716
|
Cache::Handle* Lookup(const Slice& key, uint32_t hash) override;
|
|
656
717
|
|
|
657
718
|
bool Release(Cache::Handle* handle, bool /*useful*/,
|
|
658
719
|
bool erase_if_last_ref) override {
|
|
659
720
|
return Release(handle, erase_if_last_ref);
|
|
660
721
|
}
|
|
722
|
+
|
|
661
723
|
bool IsReady(Cache::Handle* /*handle*/) override { return true; }
|
|
724
|
+
|
|
662
725
|
void Wait(Cache::Handle* /*handle*/) override {}
|
|
663
726
|
|
|
664
727
|
bool Ref(Cache::Handle* handle) override;
|
|
728
|
+
|
|
665
729
|
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
|
|
730
|
+
|
|
666
731
|
void Erase(const Slice& key, uint32_t hash) override;
|
|
667
732
|
|
|
668
733
|
size_t GetUsage() const override;
|
|
734
|
+
|
|
669
735
|
size_t GetPinnedUsage() const override;
|
|
670
736
|
|
|
671
737
|
void ApplyToSomeEntries(
|
|
@@ -675,20 +741,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
|
|
|
675
741
|
|
|
676
742
|
void EraseUnRefEntries() override;
|
|
677
743
|
|
|
678
|
-
std::string GetPrintableOptions() const override;
|
|
744
|
+
std::string GetPrintableOptions() const override { return std::string{}; }
|
|
679
745
|
|
|
680
746
|
private:
|
|
681
747
|
friend class ClockCache;
|
|
682
748
|
|
|
683
|
-
// Makes an element evictable by clock.
|
|
684
|
-
void ClockOn(ClockHandle* h);
|
|
685
|
-
|
|
686
|
-
// Makes an element non-evictable.
|
|
687
|
-
void ClockOff(ClockHandle* h);
|
|
688
|
-
|
|
689
|
-
// Requires an exclusive ref on h.
|
|
690
|
-
void Evict(ClockHandle* h);
|
|
691
|
-
|
|
692
749
|
// Free some space following strict clock policy until enough space
|
|
693
750
|
// to hold (usage_ + charge) is freed or there are no evictable elements.
|
|
694
751
|
void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
|
|
@@ -703,34 +760,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
|
|
|
703
760
|
static int CalcHashBits(size_t capacity, size_t estimated_value_size,
|
|
704
761
|
CacheMetadataChargePolicy metadata_charge_policy);
|
|
705
762
|
|
|
706
|
-
// Initialized before use.
|
|
707
|
-
size_t capacity_;
|
|
708
|
-
|
|
709
763
|
// Whether to reject insertion if cache reaches its full capacity.
|
|
710
|
-
bool strict_capacity_limit_;
|
|
711
|
-
|
|
712
|
-
uint32_t clock_pointer_;
|
|
713
|
-
|
|
714
|
-
// ------------^^^^^^^^^^^^^-----------
|
|
715
|
-
// Not frequently modified data members
|
|
716
|
-
// ------------------------------------
|
|
717
|
-
//
|
|
718
|
-
// We separate data members that are updated frequently from the ones that
|
|
719
|
-
// are not frequently updated so that they don't share the same cache line
|
|
720
|
-
// which will lead into false cache sharing
|
|
721
|
-
//
|
|
722
|
-
// ------------------------------------
|
|
723
|
-
// Frequently modified data members
|
|
724
|
-
// ------------vvvvvvvvvvvvv-----------
|
|
725
|
-
ClockHandleTable table_;
|
|
764
|
+
std::atomic<bool> strict_capacity_limit_;
|
|
726
765
|
|
|
727
|
-
|
|
728
|
-
size_t usage_;
|
|
729
|
-
|
|
730
|
-
// mutex_ protects the following state.
|
|
731
|
-
// We don't count mutex_ as the cache's internal state so semantically we
|
|
732
|
-
// don't mind mutex_ invoking the non-const actions.
|
|
733
|
-
mutable DMutex mutex_;
|
|
766
|
+
ClockHandleTable table_;
|
|
734
767
|
}; // class ClockCacheShard
|
|
735
768
|
|
|
736
769
|
class ClockCache
|
|
@@ -743,19 +776,28 @@ class ClockCache
|
|
|
743
776
|
bool strict_capacity_limit,
|
|
744
777
|
CacheMetadataChargePolicy metadata_charge_policy =
|
|
745
778
|
kDontChargeCacheMetadata);
|
|
779
|
+
|
|
746
780
|
~ClockCache() override;
|
|
781
|
+
|
|
747
782
|
const char* Name() const override { return "ClockCache"; }
|
|
783
|
+
|
|
748
784
|
CacheShard* GetShard(uint32_t shard) override;
|
|
785
|
+
|
|
749
786
|
const CacheShard* GetShard(uint32_t shard) const override;
|
|
787
|
+
|
|
750
788
|
void* Value(Handle* handle) override;
|
|
789
|
+
|
|
751
790
|
size_t GetCharge(Handle* handle) const override;
|
|
791
|
+
|
|
752
792
|
uint32_t GetHash(Handle* handle) const override;
|
|
793
|
+
|
|
753
794
|
DeleterFn GetDeleter(Handle* handle) const override;
|
|
795
|
+
|
|
754
796
|
void DisownData() override;
|
|
755
797
|
|
|
756
798
|
private:
|
|
757
799
|
ClockCacheShard* shards_ = nullptr;
|
|
758
|
-
int num_shards_
|
|
800
|
+
int num_shards_;
|
|
759
801
|
}; // class ClockCache
|
|
760
802
|
|
|
761
803
|
} // namespace clock_cache
|