@nxtedition/rocksdb 7.0.39 → 7.0.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/binding.cc +59 -30
  2. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +27 -11
  3. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +310 -337
  4. package/deps/rocksdb/rocksdb/cache/clock_cache.h +394 -352
  5. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +1 -1
  6. package/deps/rocksdb/rocksdb/db/column_family.cc +2 -2
  7. package/deps/rocksdb/rocksdb/db/column_family_test.cc +1 -1
  8. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +13 -3
  9. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +273 -134
  10. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +33 -2
  11. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.cc +11 -3
  12. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker.h +2 -1
  13. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_fifo.cc +2 -2
  14. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_level.cc +133 -5
  15. package/deps/rocksdb/rocksdb/db/compaction/compaction_picker_test.cc +130 -1
  16. package/deps/rocksdb/rocksdb/db/compaction/compaction_service_job.cc +8 -4
  17. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +11 -9
  18. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +209 -12
  19. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +54 -39
  20. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +102 -19
  21. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +30 -11
  22. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +1 -1
  23. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_files.cc +28 -25
  24. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +0 -14
  25. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +63 -54
  26. package/deps/rocksdb/rocksdb/db/db_test.cc +6 -6
  27. package/deps/rocksdb/rocksdb/db/error_handler.cc +7 -0
  28. package/deps/rocksdb/rocksdb/db/error_handler.h +10 -9
  29. package/deps/rocksdb/rocksdb/db/log_test.cc +13 -6
  30. package/deps/rocksdb/rocksdb/db/perf_context_test.cc +1 -1
  31. package/deps/rocksdb/rocksdb/db/table_cache.cc +21 -0
  32. package/deps/rocksdb/rocksdb/db/table_cache.h +5 -0
  33. package/deps/rocksdb/rocksdb/db/version_set.cc +3 -2
  34. package/deps/rocksdb/rocksdb/db/version_set.h +6 -4
  35. package/deps/rocksdb/rocksdb/db/version_set_test.cc +8 -6
  36. package/deps/rocksdb/rocksdb/db/wal_edit.cc +22 -15
  37. package/deps/rocksdb/rocksdb/db/wal_edit.h +10 -0
  38. package/deps/rocksdb/rocksdb/db/wal_edit_test.cc +4 -5
  39. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.cc +0 -36
  40. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_driver.cc +1 -12
  41. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -29
  42. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.h +0 -5
  43. package/deps/rocksdb/rocksdb/db_stress_tool/multi_ops_txns_stress.cc +7 -0
  44. package/deps/rocksdb/rocksdb/env/env_test.cc +0 -5
  45. package/deps/rocksdb/rocksdb/env/io_posix.cc +1 -7
  46. package/deps/rocksdb/rocksdb/memtable/hash_linklist_rep.cc +100 -78
  47. package/deps/rocksdb/rocksdb/options/options_test.cc +16 -0
  48. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +51 -0
  49. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +3 -0
  50. package/deps/rocksdb/rocksdb/table/table_reader.h +14 -0
  51. package/deps/rocksdb/rocksdb/table/table_test.cc +52 -0
  52. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +8 -38
  53. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +27 -21
  54. package/deps/rocksdb/rocksdb/util/rate_limiter.h +12 -10
  55. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +11 -8
  56. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine_test.cc +2 -1
  57. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.cc +59 -0
  58. package/deps/rocksdb/rocksdb/utilities/transactions/pessimistic_transaction_db.h +12 -0
  59. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +31 -0
  60. package/deps/rocksdb/rocksdb/utilities/transactions/write_prepared_transaction_test.cc +0 -3
  61. package/max_rev_operator.h +101 -0
  62. package/package.json +1 -1
  63. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  64. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -17,7 +17,6 @@
17
17
  #include "monitoring/perf_context_imp.h"
18
18
  #include "monitoring/statistics.h"
19
19
  #include "port/lang.h"
20
- #include "util/distributed_mutex.h"
21
20
  #include "util/hash.h"
22
21
  #include "util/math.h"
23
22
  #include "util/random.h"
@@ -26,86 +25,91 @@ namespace ROCKSDB_NAMESPACE {
26
25
 
27
26
  namespace clock_cache {
28
27
 
29
- ClockHandleTable::ClockHandleTable(int hash_bits)
28
+ ClockHandleTable::ClockHandleTable(size_t capacity, int hash_bits)
30
29
  : length_bits_(hash_bits),
31
30
  length_bits_mask_((uint32_t{1} << length_bits_) - 1),
32
- occupancy_(0),
33
31
  occupancy_limit_(static_cast<uint32_t>((uint32_t{1} << length_bits_) *
34
32
  kStrictLoadFactor)),
35
- array_(new ClockHandle[size_t{1} << length_bits_]) {
33
+ capacity_(capacity),
34
+ array_(new ClockHandle[size_t{1} << length_bits_]),
35
+ clock_pointer_(0),
36
+ occupancy_(0),
37
+ usage_(0) {
36
38
  assert(hash_bits <= 32);
37
39
  }
38
40
 
39
41
  ClockHandleTable::~ClockHandleTable() {
40
- ApplyToEntriesRange([](ClockHandle* h) { h->FreeData(); }, 0, GetTableSize(),
41
- true);
42
+ // Assumes there are no references (of any type) to any slot in the table.
43
+ for (uint32_t i = 0; i < GetTableSize(); i++) {
44
+ ClockHandle* h = &array_[i];
45
+ if (h->IsElement()) {
46
+ h->FreeData();
47
+ }
48
+ }
42
49
  }
43
50
 
44
51
  ClockHandle* ClockHandleTable::Lookup(const Slice& key, uint32_t hash) {
45
52
  uint32_t probe = 0;
46
- int slot = FindElement(key, hash, probe);
47
- return (slot == -1) ? nullptr : &array_[slot];
53
+ ClockHandle* e = FindSlot(
54
+ key,
55
+ [&](ClockHandle* h) {
56
+ if (h->TryInternalRef()) {
57
+ if (h->IsElement() && h->Matches(key, hash)) {
58
+ return true;
59
+ }
60
+ h->ReleaseInternalRef();
61
+ }
62
+ return false;
63
+ },
64
+ [&](ClockHandle* h) { return h->displacements == 0; },
65
+ [&](ClockHandle* /*h*/) {}, probe);
66
+
67
+ if (e != nullptr) {
68
+ // TODO(Guido) Comment from #10347: Here it looks like we have three atomic
69
+ // updates where it would be possible to combine into one CAS (more metadata
70
+ // under one atomic field) or maybe two atomic updates (one arithmetic, one
71
+ // bitwise). Something to think about optimizing.
72
+ e->InternalToExternalRef();
73
+ e->SetHit();
74
+ // The handle is now referenced, so we take it out of clock.
75
+ ClockOff(e);
76
+ }
77
+
78
+ return e;
48
79
  }
49
80
 
50
- ClockHandle* ClockHandleTable::Insert(ClockHandle* h, ClockHandle** old) {
81
+ ClockHandle* ClockHandleTable::Insert(ClockHandle* h,
82
+ autovector<ClockHandle>* deleted,
83
+ bool take_reference) {
51
84
  uint32_t probe = 0;
52
- int slot = FindElementOrAvailableSlot(h->key(), h->hash, probe);
53
- *old = nullptr;
54
- if (slot == -1) {
55
- // The key is not already present, and there's no available slot to place
56
- // the new copy.
85
+ ClockHandle* e = FindAvailableSlot(h->key(), h->hash, probe, deleted);
86
+ if (e == nullptr) {
87
+ // No available slot to place the handle.
57
88
  return nullptr;
58
89
  }
59
90
 
60
- if (!array_[slot].IsElement()) {
61
- // The slot is empty or is a tombstone.
62
- ClockHandle* new_entry = &array_[slot];
63
- new_entry->InternalToExclusiveRef();
64
- Assign(new_entry, h);
65
- if (new_entry->displacements == 0) {
66
- // The slot was empty.
67
- return new_entry;
68
- }
69
- // It used to be a tombstone, so there may already be a copy of the
91
+ // The slot is empty or is a tombstone. And we have an exclusive ref.
92
+ Assign(e, h);
93
+ // TODO(Guido) The following RemoveAll can probably be run outside of
94
+ // the exclusive ref. I had a bad case in mind: multiple inserts could
95
+ // annihilate each. Although I think this is impossible, I'm not sure
96
+ // my mental proof covers every case.
97
+ if (e->displacements != 0) {
98
+ // It used to be a tombstone, so there may already be copies of the
70
99
  // key in the table.
71
- slot = FindElement(h->key(), h->hash, probe);
72
- if (slot == -1) {
73
- // Nope, no existing copy of the key.
74
- return new_entry;
75
- }
76
- ClockHandle* old_entry = &array_[slot];
77
- old_entry->ReleaseInternalRef();
78
- *old = old_entry;
79
- return new_entry;
80
- } else {
81
- // There is an existing copy of the key.
82
- ClockHandle* old_entry = &array_[slot];
83
- old_entry->ReleaseInternalRef();
84
- *old = old_entry;
85
- // Find an available slot for the new element.
86
- old_entry->displacements++;
87
- slot = FindAvailableSlot(h->key(), probe);
88
- if (slot == -1) {
89
- // No available slots.
90
- return nullptr;
91
- }
92
- ClockHandle* new_entry = &array_[slot];
93
- new_entry->InternalToExclusiveRef();
94
- Assign(new_entry, h);
95
- return new_entry;
100
+ RemoveAll(h->key(), h->hash, probe, deleted);
96
101
  }
97
- }
98
102
 
99
- void ClockHandleTable::Remove(ClockHandle* h) {
100
- assert(!h->IsInClock()); // Already off clock.
101
- uint32_t probe = 0;
102
- FindSlot(
103
- h->key(), [&](ClockHandle* e) { return e == h; },
104
- [&](ClockHandle* /*e*/) { return false; },
105
- [&](ClockHandle* e) { e->displacements--; }, probe);
106
- h->SetWillBeDeleted(false);
107
- h->SetIsElement(false);
108
- occupancy_--;
103
+ if (take_reference) {
104
+ // The user wants to take a reference.
105
+ e->ExclusiveToExternalRef();
106
+ } else {
107
+ // The user doesn't want to immediately take a reference, so we make
108
+ // it evictable.
109
+ ClockOn(e);
110
+ e->ReleaseExclusiveRef();
111
+ }
112
+ return e;
109
113
  }
110
114
 
111
115
  void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) {
@@ -117,19 +121,75 @@ void ClockHandleTable::Assign(ClockHandle* dst, ClockHandle* src) {
117
121
  dst->key_data = src->key_data;
118
122
  dst->flags.store(0);
119
123
  dst->SetIsElement(true);
120
- dst->SetClockPriority(ClockHandle::ClockPriority::NONE);
121
124
  dst->SetCachePriority(src->GetCachePriority());
125
+ usage_ += dst->total_charge;
122
126
  occupancy_++;
123
127
  }
124
128
 
125
- int ClockHandleTable::FindElement(const Slice& key, uint32_t hash,
126
- uint32_t& probe) {
127
- return FindSlot(
129
+ bool ClockHandleTable::TryRemove(ClockHandle* h,
130
+ autovector<ClockHandle>* deleted) {
131
+ if (h->TryExclusiveRef()) {
132
+ if (h->WillBeDeleted()) {
133
+ Remove(h, deleted);
134
+ return true;
135
+ }
136
+ h->ReleaseExclusiveRef();
137
+ }
138
+ return false;
139
+ }
140
+
141
+ bool ClockHandleTable::SpinTryRemove(ClockHandle* h,
142
+ autovector<ClockHandle>* deleted) {
143
+ if (h->SpinTryExclusiveRef()) {
144
+ if (h->WillBeDeleted()) {
145
+ Remove(h, deleted);
146
+ return true;
147
+ }
148
+ h->ReleaseExclusiveRef();
149
+ }
150
+ return false;
151
+ }
152
+
153
+ void ClockHandleTable::ClockOff(ClockHandle* h) {
154
+ h->SetClockPriority(ClockHandle::ClockPriority::NONE);
155
+ }
156
+
157
+ void ClockHandleTable::ClockOn(ClockHandle* h) {
158
+ assert(!h->IsInClock());
159
+ bool is_high_priority =
160
+ h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH;
161
+ h->SetClockPriority(static_cast<ClockHandle::ClockPriority>(
162
+ is_high_priority ? ClockHandle::ClockPriority::HIGH
163
+ : ClockHandle::ClockPriority::MEDIUM));
164
+ }
165
+
166
+ void ClockHandleTable::Remove(ClockHandle* h,
167
+ autovector<ClockHandle>* deleted) {
168
+ deleted->push_back(*h);
169
+ ClockOff(h);
170
+ uint32_t probe = 0;
171
+ FindSlot(
172
+ h->key(), [&](ClockHandle* e) { return e == h; },
173
+ [&](ClockHandle* /*e*/) { return false; },
174
+ [&](ClockHandle* e) { e->displacements--; }, probe);
175
+ h->SetWillBeDeleted(false);
176
+ h->SetIsElement(false);
177
+ }
178
+
179
+ void ClockHandleTable::RemoveAll(const Slice& key, uint32_t hash,
180
+ uint32_t& probe,
181
+ autovector<ClockHandle>* deleted) {
182
+ FindSlot(
128
183
  key,
129
184
  [&](ClockHandle* h) {
130
185
  if (h->TryInternalRef()) {
131
- if (h->Matches(key, hash)) {
132
- return true;
186
+ if (h->IsElement() && h->Matches(key, hash)) {
187
+ h->SetWillBeDeleted(true);
188
+ h->ReleaseInternalRef();
189
+ if (TryRemove(h, deleted)) {
190
+ h->ReleaseExclusiveRef();
191
+ }
192
+ return false;
133
193
  }
134
194
  h->ReleaseInternalRef();
135
195
  }
@@ -139,53 +199,74 @@ int ClockHandleTable::FindElement(const Slice& key, uint32_t hash,
139
199
  [&](ClockHandle* /*h*/) {}, probe);
140
200
  }
141
201
 
142
- int ClockHandleTable::FindAvailableSlot(const Slice& key, uint32_t& probe) {
143
- int slot = FindSlot(
144
- key,
145
- [&](ClockHandle* h) {
146
- if (h->TryInternalRef()) {
147
- if (!h->IsElement()) {
148
- return true;
149
- }
150
- h->ReleaseInternalRef();
151
- }
152
- return false;
153
- },
154
- [&](ClockHandle* /*h*/) { return false; },
155
- [&](ClockHandle* h) { h->displacements++; }, probe);
156
- if (slot == -1) {
157
- Rollback(key, probe);
202
+ void ClockHandleTable::Free(autovector<ClockHandle>* deleted) {
203
+ if (deleted->size() == 0) {
204
+ // Avoid unnecessarily reading usage_ and occupancy_.
205
+ return;
206
+ }
207
+
208
+ size_t deleted_charge = 0;
209
+ for (auto& h : *deleted) {
210
+ deleted_charge += h.total_charge;
211
+ h.FreeData();
158
212
  }
159
- return slot;
213
+ assert(usage_ >= deleted_charge);
214
+ usage_ -= deleted_charge;
215
+ occupancy_ -= static_cast<uint32_t>(deleted->size());
160
216
  }
161
217
 
162
- int ClockHandleTable::FindElementOrAvailableSlot(const Slice& key,
163
- uint32_t hash,
164
- uint32_t& probe) {
165
- int slot = FindSlot(
218
+ ClockHandle* ClockHandleTable::FindAvailableSlot(
219
+ const Slice& key, uint32_t hash, uint32_t& probe,
220
+ autovector<ClockHandle>* deleted) {
221
+ ClockHandle* e = FindSlot(
166
222
  key,
167
223
  [&](ClockHandle* h) {
224
+ // To read the handle, first acquire a shared ref.
168
225
  if (h->TryInternalRef()) {
169
- if (!h->IsElement() || h->Matches(key, hash)) {
170
- return true;
226
+ if (h->IsElement()) {
227
+ // The slot is not available.
228
+ // TODO(Guido) Is it worth testing h->WillBeDeleted()?
229
+ if (h->WillBeDeleted() || h->Matches(key, hash)) {
230
+ // The slot can be freed up, or the key we're inserting is already
231
+ // in the table, so we try to delete it. When the attempt is
232
+ // successful, the slot becomes available, so we stop probing.
233
+ // Notice that in that case TryRemove returns an exclusive ref.
234
+ h->SetWillBeDeleted(true);
235
+ h->ReleaseInternalRef();
236
+ if (TryRemove(h, deleted)) {
237
+ return true;
238
+ }
239
+ return false;
240
+ }
241
+ h->ReleaseInternalRef();
242
+ return false;
171
243
  }
244
+
245
+ // Available slot.
172
246
  h->ReleaseInternalRef();
247
+ // Try to acquire an exclusive ref. If we fail, continue probing.
248
+ if (h->SpinTryExclusiveRef()) {
249
+ // Check that the slot is still available.
250
+ if (!h->IsElement()) {
251
+ return true;
252
+ }
253
+ h->ReleaseExclusiveRef();
254
+ }
173
255
  }
174
256
  return false;
175
257
  },
176
258
  [&](ClockHandle* /*h*/) { return false; },
177
259
  [&](ClockHandle* h) { h->displacements++; }, probe);
178
- if (slot == -1) {
260
+ if (e == nullptr) {
179
261
  Rollback(key, probe);
180
262
  }
181
- return slot;
263
+ return e;
182
264
  }
183
265
 
184
- int ClockHandleTable::FindSlot(const Slice& key,
185
- std::function<bool(ClockHandle*)> match,
186
- std::function<bool(ClockHandle*)> abort,
187
- std::function<void(ClockHandle*)> update,
188
- uint32_t& probe) {
266
+ ClockHandle* ClockHandleTable::FindSlot(
267
+ const Slice& key, std::function<bool(ClockHandle*)> match,
268
+ std::function<bool(ClockHandle*)> abort,
269
+ std::function<void(ClockHandle*)> update, uint32_t& probe) {
189
270
  // We use double-hashing probing. Every probe in the sequence is a
190
271
  // pseudorandom integer, computed as a linear function of two random hashes,
191
272
  // which we call base and increment. Specifically, the i-th probe is base + i
@@ -201,14 +282,14 @@ int ClockHandleTable::FindSlot(const Slice& key,
201
282
  ClockHandle* h = &array_[current];
202
283
  if (current == base && probe > 0) {
203
284
  // We looped back.
204
- return -1;
285
+ return nullptr;
205
286
  }
206
287
  if (match(h)) {
207
288
  probe++;
208
- return current;
289
+ return h;
209
290
  }
210
291
  if (abort(h)) {
211
- return -1;
292
+ return nullptr;
212
293
  }
213
294
  probe++;
214
295
  update(h);
@@ -226,35 +307,73 @@ void ClockHandleTable::Rollback(const Slice& key, uint32_t probe) {
226
307
  }
227
308
  }
228
309
 
310
+ void ClockHandleTable::ClockRun(size_t charge) {
311
+ // TODO(Guido) When an element is in the probe sequence of a
312
+ // hot element, it will be hard to get an exclusive ref.
313
+ // Do we need a mechanism to prevent an element from sitting
314
+ // for a long time in cache waiting to be evicted?
315
+ assert(charge <= capacity_);
316
+ autovector<ClockHandle> deleted;
317
+ uint32_t max_iterations =
318
+ 1 + static_cast<uint32_t>(GetTableSize() * kLoadFactor);
319
+ size_t usage_local = usage_;
320
+ while (usage_local + charge > capacity_ && max_iterations--) {
321
+ uint32_t steps = 1 + static_cast<uint32_t>(1 / kLoadFactor);
322
+ uint32_t clock_pointer_local = (clock_pointer_ += steps) - steps;
323
+ for (uint32_t i = 0; i < steps; i++) {
324
+ ClockHandle* h = &array_[ModTableSize(clock_pointer_local + i)];
325
+
326
+ if (h->TryExclusiveRef()) {
327
+ if (h->WillBeDeleted()) {
328
+ Remove(h, &deleted);
329
+ usage_local -= h->total_charge;
330
+ } else {
331
+ if (!h->IsInClock() && h->IsElement()) {
332
+ // We adjust the clock priority to make the element evictable again.
333
+ // Why? Elements that are not in clock are either currently
334
+ // externally referenced or used to be. Because we are holding an
335
+ // exclusive ref, we know we are in the latter case. This can only
336
+ // happen when the last external reference to an element was
337
+ // released, and the element was not immediately removed.
338
+
339
+ ClockOn(h);
340
+ }
341
+ ClockHandle::ClockPriority priority = h->GetClockPriority();
342
+ if (priority == ClockHandle::ClockPriority::LOW) {
343
+ Remove(h, &deleted);
344
+ usage_local -= h->total_charge;
345
+ } else if (priority > ClockHandle::ClockPriority::LOW) {
346
+ h->DecreaseClockPriority();
347
+ }
348
+ }
349
+ h->ReleaseExclusiveRef();
350
+ }
351
+ }
352
+ }
353
+
354
+ Free(&deleted);
355
+ }
356
+
229
357
  ClockCacheShard::ClockCacheShard(
230
358
  size_t capacity, size_t estimated_value_size, bool strict_capacity_limit,
231
359
  CacheMetadataChargePolicy metadata_charge_policy)
232
- : capacity_(capacity),
233
- strict_capacity_limit_(strict_capacity_limit),
234
- clock_pointer_(0),
235
- table_(
236
- CalcHashBits(capacity, estimated_value_size, metadata_charge_policy)),
237
- usage_(0) {
360
+ : strict_capacity_limit_(strict_capacity_limit),
361
+ table_(capacity, CalcHashBits(capacity, estimated_value_size,
362
+ metadata_charge_policy)) {
238
363
  set_metadata_charge_policy(metadata_charge_policy);
239
364
  }
240
365
 
241
366
  void ClockCacheShard::EraseUnRefEntries() {
242
- autovector<ClockHandle> last_reference_list;
243
- {
244
- DMutexLock l(mutex_);
245
- table_.ApplyToEntriesRange(
246
- [this, &last_reference_list](ClockHandle* h) {
247
- // Externally unreferenced element.
248
- last_reference_list.push_back(*h);
249
- Evict(h);
250
- },
251
- 0, table_.GetTableSize(), true);
252
- }
367
+ autovector<ClockHandle> deleted;
253
368
 
254
- // Free the entry outside of the mutex for performance reasons.
255
- for (auto& h : last_reference_list) {
256
- h.FreeData();
257
- }
369
+ table_.ApplyToEntriesRange(
370
+ [this, &deleted](ClockHandle* h) {
371
+ // Externally unreferenced element.
372
+ table_.Remove(h, &deleted);
373
+ },
374
+ 0, table_.GetTableSize(), true);
375
+
376
+ table_.Free(&deleted);
258
377
  }
259
378
 
260
379
  void ClockCacheShard::ApplyToSomeEntries(
@@ -264,7 +383,6 @@ void ClockCacheShard::ApplyToSomeEntries(
264
383
  // The state is essentially going to be the starting hash, which works
265
384
  // nicely even if we resize between calls because we use upper-most
266
385
  // hash bits for table indexes.
267
- DMutexLock l(mutex_);
268
386
  uint32_t length_bits = table_.GetLengthBits();
269
387
  uint32_t length = table_.GetTableSize();
270
388
 
@@ -276,7 +394,7 @@ void ClockCacheShard::ApplyToSomeEntries(
276
394
  uint32_t index_begin = *state >> (32 - length_bits);
277
395
  uint32_t index_end = index_begin + average_entries_per_lock;
278
396
  if (index_end >= length) {
279
- // Going to end
397
+ // Going to end.
280
398
  index_end = length;
281
399
  *state = UINT32_MAX;
282
400
  } else {
@@ -292,60 +410,6 @@ void ClockCacheShard::ApplyToSomeEntries(
292
410
  index_begin, index_end, false);
293
411
  }
294
412
 
295
- void ClockCacheShard::ClockOff(ClockHandle* h) {
296
- h->SetClockPriority(ClockHandle::ClockPriority::NONE);
297
- }
298
-
299
- void ClockCacheShard::ClockOn(ClockHandle* h) {
300
- assert(!h->IsInClock());
301
- bool is_high_priority =
302
- h->HasHit() || h->GetCachePriority() == Cache::Priority::HIGH;
303
- h->SetClockPriority(static_cast<ClockHandle::ClockPriority>(
304
- is_high_priority * ClockHandle::ClockPriority::HIGH +
305
- (1 - is_high_priority) * ClockHandle::ClockPriority::MEDIUM));
306
- }
307
-
308
- void ClockCacheShard::Evict(ClockHandle* h) {
309
- ClockOff(h);
310
- table_.Remove(h);
311
- assert(usage_ >= h->total_charge);
312
- usage_ -= h->total_charge;
313
- }
314
-
315
- void ClockCacheShard::EvictFromClock(size_t charge,
316
- autovector<ClockHandle>* deleted) {
317
- // TODO(Guido) When an element is in the probe sequence of a
318
- // hot element, it will be hard to get an exclusive ref.
319
- // We may need a mechanism to avoid that an element sits forever
320
- // in cache waiting to be evicted.
321
- assert(charge <= capacity_);
322
- uint32_t max_iterations = table_.GetTableSize();
323
- while (usage_ + charge > capacity_ && max_iterations--) {
324
- ClockHandle* h = &table_.array_[clock_pointer_];
325
- clock_pointer_ = table_.ModTableSize(clock_pointer_ + 1);
326
-
327
- if (h->TryExclusiveRef()) {
328
- if (!h->IsInClock() && h->IsElement()) {
329
- // We adjust the clock priority to make the element evictable again.
330
- // Why? Elements that are not in clock are either currently
331
- // externally referenced or used to be---because we are holding an
332
- // exclusive ref, we know we are in the latter case. This can only
333
- // happen when the last external reference to an element was released,
334
- // and the element was not immediately removed.
335
- ClockOn(h);
336
- }
337
-
338
- if (h->GetClockPriority() == ClockHandle::ClockPriority::LOW) {
339
- deleted->push_back(*h);
340
- Evict(h);
341
- } else if (h->GetClockPriority() > ClockHandle::ClockPriority::LOW) {
342
- h->DecreaseClockPriority();
343
- }
344
- h->ReleaseExclusiveRef();
345
- }
346
- }
347
- }
348
-
349
413
  size_t ClockCacheShard::CalcEstimatedHandleCharge(
350
414
  size_t estimated_value_size,
351
415
  CacheMetadataChargePolicy metadata_charge_policy) {
@@ -366,25 +430,12 @@ int ClockCacheShard::CalcHashBits(
366
430
  return FloorLog2((num_entries << 1) - 1);
367
431
  }
368
432
 
369
- void ClockCacheShard::SetCapacity(size_t capacity) {
370
- assert(false); // Not supported. TODO(Guido) Support it?
371
- autovector<ClockHandle> last_reference_list;
372
- {
373
- DMutexLock l(mutex_);
374
- capacity_ = capacity;
375
- EvictFromClock(0, &last_reference_list);
376
- }
377
-
378
- // Free the entry outside of the mutex for performance reasons.
379
- for (auto& h : last_reference_list) {
380
- h.FreeData();
381
- }
433
+ void ClockCacheShard::SetCapacity(size_t /*capacity*/) {
434
+ assert(false); // Not supported.
382
435
  }
383
436
 
384
- void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
385
- assert(false); // Not supported. TODO(Guido) Support it?
386
- DMutexLock l(mutex_);
387
- strict_capacity_limit_ = strict_capacity_limit;
437
+ void ClockCacheShard::SetStrictCapacityLimit(bool /*strict_capacity_limit*/) {
438
+ assert(false); // Not supported.
388
439
  }
389
440
 
390
441
  Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
@@ -407,87 +458,60 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
407
458
  }
408
459
 
409
460
  Status s = Status::OK();
410
- autovector<ClockHandle> last_reference_list;
411
- {
412
- DMutexLock l(mutex_);
413
-
414
- assert(table_.GetOccupancy() <= table_.GetOccupancyLimit());
415
- // Free the space following strict clock policy until enough space
416
- // is freed or there are no evictable elements.
417
- EvictFromClock(tmp.total_charge, &last_reference_list);
418
- if ((usage_ + tmp.total_charge > capacity_ &&
419
- (strict_capacity_limit_ || handle == nullptr)) ||
420
- table_.GetOccupancy() == table_.GetOccupancyLimit()) {
421
- if (handle == nullptr) {
422
- // Don't insert the entry but still return ok, as if the entry inserted
423
- // into cache and get evicted immediately.
424
- last_reference_list.push_back(tmp);
425
- } else {
426
- if (table_.GetOccupancy() == table_.GetOccupancyLimit()) {
427
- // TODO: Consider using a distinct status for this case, but usually
428
- // it will be handled the same way as reaching charge capacity limit
429
- s = Status::MemoryLimit(
430
- "Insert failed because all slots in the hash table are full.");
431
- } else {
432
- s = Status::MemoryLimit(
433
- "Insert failed because the total charge has exceeded the "
434
- "capacity.");
435
- }
436
- }
461
+
462
+ // Free space with the clock policy until enough space is freed or there are
463
+ // no evictable elements.
464
+ table_.ClockRun(tmp.total_charge);
465
+
466
+ // occupancy_ and usage_ are contended members across concurrent updates
467
+ // on the same shard, so we use a single copy to reduce cache synchronization.
468
+ uint32_t occupancy_local = table_.GetOccupancy();
469
+ size_t usage_local = table_.GetUsage();
470
+ assert(occupancy_local <= table_.GetOccupancyLimit());
471
+
472
+ autovector<ClockHandle> deleted;
473
+
474
+ if ((usage_local + tmp.total_charge > table_.GetCapacity() &&
475
+ (strict_capacity_limit_ || handle == nullptr)) ||
476
+ occupancy_local > table_.GetOccupancyLimit()) {
477
+ if (handle == nullptr) {
478
+ // Don't insert the entry but still return ok, as if the entry inserted
479
+ // into cache and get evicted immediately.
480
+ deleted.push_back(tmp);
437
481
  } else {
438
- // Insert into the cache. Note that the cache might get larger than its
439
- // capacity if not enough space was freed up.
440
- ClockHandle* old;
441
- ClockHandle* h = table_.Insert(&tmp, &old);
442
- assert(h != nullptr); // We're below occupancy, so this insertion should
443
- // never fail.
444
- usage_ += h->total_charge;
445
- if (old != nullptr) {
446
- s = Status::OkOverwritten();
447
- assert(!old->WillBeDeleted());
448
- old->SetWillBeDeleted(true);
449
- // Try to evict the old copy of the element.
450
- if (old->TryExclusiveRef()) {
451
- last_reference_list.push_back(*old);
452
- Evict(old);
453
- old->ReleaseExclusiveRef();
454
- }
455
- }
456
- if (handle == nullptr) {
457
- // If the user didn't provide a handle, no reference is taken,
458
- // so we make the element evictable.
459
- ClockOn(h);
460
- h->ReleaseExclusiveRef();
482
+ if (occupancy_local > table_.GetOccupancyLimit()) {
483
+ // TODO: Consider using a distinct status for this case, but usually
484
+ // it will be handled the same way as reaching charge capacity limit
485
+ s = Status::MemoryLimit(
486
+ "Insert failed because all slots in the hash table are full.");
461
487
  } else {
462
- // The caller already holds a ref.
463
- h->ExclusiveToExternalRef();
464
- *handle = reinterpret_cast<Cache::Handle*>(h);
488
+ s = Status::MemoryLimit(
489
+ "Insert failed because the total charge has exceeded the "
490
+ "capacity.");
465
491
  }
466
492
  }
467
- }
493
+ } else {
494
+ // Insert into the cache. Note that the cache might get larger than its
495
+ // capacity if not enough space was freed up.
496
+ ClockHandle* h = table_.Insert(&tmp, &deleted, handle != nullptr);
497
+ assert(h != nullptr); // The occupancy is way below the table size, so this
498
+ // insertion should never fail.
499
+ if (handle != nullptr) {
500
+ *handle = reinterpret_cast<Cache::Handle*>(h);
501
+ }
468
502
 
469
- // Free the entry outside of the mutex for performance reasons.
470
- for (auto& h : last_reference_list) {
471
- h.FreeData();
503
+ if (deleted.size() > 0) {
504
+ s = Status::OkOverwritten();
505
+ }
472
506
  }
473
507
 
508
+ table_.Free(&deleted);
509
+
474
510
  return s;
475
511
  }
476
512
 
477
513
  Cache::Handle* ClockCacheShard::Lookup(const Slice& key, uint32_t hash) {
478
- ClockHandle* h = nullptr;
479
- h = table_.Lookup(key, hash);
480
- if (h != nullptr) {
481
- // TODO(Guido) Comment from #10347: Here it looks like we have three atomic
482
- // updates where it would be possible to combine into one CAS (more metadata
483
- // under one atomic field) or maybe two atomic updates (one arithmetic, one
484
- // bitwise). Something to think about optimizing.
485
- h->InternalToExternalRef();
486
- h->SetHit();
487
- // The handle is now referenced, so we take it out of clock.
488
- ClockOff(h);
489
- }
490
- return reinterpret_cast<Cache::Handle*>(h);
514
+ return reinterpret_cast<Cache::Handle*>(table_.Lookup(key, hash));
491
515
  }
492
516
 
493
517
  bool ClockCacheShard::Ref(Cache::Handle* h) {
@@ -498,97 +522,50 @@ bool ClockCacheShard::Ref(Cache::Handle* h) {
498
522
 
499
523
  bool ClockCacheShard::Release(Cache::Handle* handle, bool erase_if_last_ref) {
500
524
  // In contrast with LRUCache's Release, this function won't delete the handle
501
- // when the reference is the last one and the cache is above capacity. Space
525
+ // when the cache is above capacity and the reference is the last one. Space
502
526
  // is only freed up by EvictFromClock (called by Insert when space is needed)
503
- // and Erase.
527
+ // and Erase. We do this to avoid an extra atomic read of the variable usage_.
504
528
  if (handle == nullptr) {
505
529
  return false;
506
530
  }
507
531
 
508
532
  ClockHandle* h = reinterpret_cast<ClockHandle*>(handle);
509
- uint32_t hash = h->hash;
510
- uint32_t refs = h->ReleaseExternalRef();
511
- bool last_reference = !(refs & ClockHandle::EXTERNAL_REFS);
533
+ uint32_t refs = h->refs;
534
+ bool last_reference = ((refs & ClockHandle::EXTERNAL_REFS) == 1);
512
535
  bool will_be_deleted = refs & ClockHandle::WILL_BE_DELETED;
513
536
 
514
537
  if (last_reference && (will_be_deleted || erase_if_last_ref)) {
515
- // At this point we want to evict the element, so we need to take
516
- // a lock and an exclusive reference. But there's a problem:
517
- // as soon as we released the last reference, an Insert or Erase could've
518
- // replaced this element, and by the time we take the lock and ref
519
- // we could potentially be referencing a different element.
520
- // Thus, before evicting the (potentially different) element, we need to
521
- // re-check that it's unreferenced and marked as WILL_BE_DELETED, so the
522
- // eviction is safe. Additionally, we check that the hash doesn't change,
523
- // which will detect, most of the time, whether the element is a different
524
- // one. The bottomline is that we only guarantee that the input handle will
525
- // be deleted, and occasionally also another handle, but in any case all
526
- // deleted handles are safe to delete.
527
- // TODO(Guido) With lock-free inserts and deletes we may be able to
528
- // "atomically" transition to an exclusive ref, without creating a deadlock.
529
- ClockHandle copy;
530
- {
531
- DMutexLock l(mutex_);
532
- if (h->TrySpinExclusiveRef()) {
533
- will_be_deleted = h->refs & ClockHandle::WILL_BE_DELETED;
534
- // Check that it's still safe to delete.
535
- if (h->IsElement() && (will_be_deleted || erase_if_last_ref) &&
536
- h->hash == hash) {
537
- copy = *h;
538
- Evict(h);
539
- }
540
- h->ReleaseExclusiveRef();
541
- } else {
542
- // An external ref was detected.
543
- return false;
544
- }
538
+ autovector<ClockHandle> deleted;
539
+ h->SetWillBeDeleted(true);
540
+ h->ReleaseExternalRef();
541
+ if (table_.SpinTryRemove(h, &deleted)) {
542
+ h->ReleaseExclusiveRef();
543
+ table_.Free(&deleted);
544
+ return true;
545
545
  }
546
-
547
- // Free the entry outside of the mutex for performance reasons.
548
- copy.FreeData();
549
- return true;
546
+ } else {
547
+ h->ReleaseExternalRef();
550
548
  }
551
549
 
552
550
  return false;
553
551
  }
554
552
 
555
553
  void ClockCacheShard::Erase(const Slice& key, uint32_t hash) {
556
- ClockHandle copy;
557
- bool last_reference = false;
558
- {
559
- DMutexLock l(mutex_);
560
- ClockHandle* h = table_.Lookup(key, hash);
561
- if (h != nullptr) {
562
- h->SetWillBeDeleted(true);
563
- h->ReleaseInternalRef();
564
- if (h->TryExclusiveRef()) {
565
- copy = *h;
566
- Evict(h);
567
- last_reference = true;
568
- h->ReleaseExclusiveRef();
569
- }
570
- }
571
- }
572
- // Free the entry outside of the mutex for performance reasons.
573
- if (last_reference) {
574
- copy.FreeData();
575
- }
554
+ autovector<ClockHandle> deleted;
555
+ uint32_t probe = 0;
556
+ table_.RemoveAll(key, hash, probe, &deleted);
557
+ table_.Free(&deleted);
576
558
  }
577
559
 
578
- size_t ClockCacheShard::GetUsage() const {
579
- DMutexLock l(mutex_);
580
- return usage_;
581
- }
560
+ size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); }
582
561
 
583
562
  size_t ClockCacheShard::GetPinnedUsage() const {
584
- // Computes the pinned usage scanning the whole hash table. This
585
- // is slow, but avoid keeping an exact counter on the clock usage,
563
+ // Computes the pinned usage by scanning the whole hash table. This
564
+ // is slow, but avoids keeping an exact counter on the clock usage,
586
565
  // i.e., the number of not externally referenced elements.
587
- // Why avoid this? Because Lookup removes elements from the clock
566
+ // Why avoid this counter? Because Lookup removes elements from the clock
588
567
  // list, so it would need to update the pinned usage every time,
589
568
  // which creates additional synchronization costs.
590
- DMutexLock l(mutex_);
591
-
592
569
  size_t clock_usage = 0;
593
570
 
594
571
  table_.ConstApplyToEntriesRange(
@@ -602,17 +579,13 @@ size_t ClockCacheShard::GetPinnedUsage() const {
602
579
  return clock_usage;
603
580
  }
604
581
 
605
- std::string ClockCacheShard::GetPrintableOptions() const {
606
- return std::string{};
607
- }
608
-
609
582
  ClockCache::ClockCache(size_t capacity, size_t estimated_value_size,
610
583
  int num_shard_bits, bool strict_capacity_limit,
611
584
  CacheMetadataChargePolicy metadata_charge_policy)
612
- : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) {
585
+ : ShardedCache(capacity, num_shard_bits, strict_capacity_limit),
586
+ num_shards_(1 << num_shard_bits) {
613
587
  assert(estimated_value_size > 0 ||
614
588
  metadata_charge_policy != kDontChargeCacheMetadata);
615
- num_shards_ = 1 << num_shard_bits;
616
589
  shards_ = reinterpret_cast<ClockCacheShard*>(
617
590
  port::cacheline_aligned_alloc(sizeof(ClockCacheShard) * num_shards_));
618
591
  size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_;