@nxtedition/rocksdb 7.0.26 → 7.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. package/binding.cc +67 -25
  2. package/chained-batch.js +1 -1
  3. package/deps/rocksdb/rocksdb/CMakeLists.txt +3 -0
  4. package/deps/rocksdb/rocksdb/Makefile +3 -0
  5. package/deps/rocksdb/rocksdb/TARGETS +10 -0
  6. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +17 -7
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +2 -0
  8. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +1 -0
  9. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +117 -0
  10. package/deps/rocksdb/rocksdb/cache/charged_cache.h +121 -0
  11. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +270 -180
  12. package/deps/rocksdb/rocksdb/cache/clock_cache.h +412 -124
  13. package/deps/rocksdb/rocksdb/cache/fast_lru_cache.cc +1 -0
  14. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +1 -1
  15. package/deps/rocksdb/rocksdb/cache/lru_cache.h +2 -2
  16. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +2 -2
  17. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +1 -1
  18. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +71 -9
  19. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.h +11 -2
  20. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder_test.cc +21 -14
  21. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +68 -7
  22. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +16 -0
  23. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +519 -12
  24. package/deps/rocksdb/rocksdb/db/blob/db_blob_basic_test.cc +120 -0
  25. package/deps/rocksdb/rocksdb/db/builder.cc +15 -5
  26. package/deps/rocksdb/rocksdb/db/builder.h +3 -0
  27. package/deps/rocksdb/rocksdb/db/c.cc +18 -0
  28. package/deps/rocksdb/rocksdb/db/c_test.c +18 -0
  29. package/deps/rocksdb/rocksdb/db/column_family.h +2 -0
  30. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +3 -2
  31. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +9 -4
  32. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +15 -10
  33. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +36 -34
  34. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +50 -13
  35. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.h +12 -0
  36. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +8 -1
  37. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +2 -1
  38. package/deps/rocksdb/rocksdb/db/compaction/tiered_compaction_test.cc +13 -17
  39. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +26 -9
  40. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +0 -11
  41. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +93 -0
  42. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +16 -1
  43. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +3 -8
  44. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_debug.cc +8 -1
  45. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_open.cc +17 -5
  46. package/deps/rocksdb/rocksdb/db/db_test.cc +0 -3
  47. package/deps/rocksdb/rocksdb/db/db_test2.cc +39 -12
  48. package/deps/rocksdb/rocksdb/db/db_test_util.cc +9 -0
  49. package/deps/rocksdb/rocksdb/db/db_test_util.h +2 -0
  50. package/deps/rocksdb/rocksdb/db/dbformat.cc +0 -38
  51. package/deps/rocksdb/rocksdb/db/dbformat.h +14 -13
  52. package/deps/rocksdb/rocksdb/db/dbformat_test.cc +5 -2
  53. package/deps/rocksdb/rocksdb/db/event_helpers.cc +13 -1
  54. package/deps/rocksdb/rocksdb/db/external_sst_file_basic_test.cc +0 -10
  55. package/deps/rocksdb/rocksdb/db/flush_job.cc +19 -15
  56. package/deps/rocksdb/rocksdb/db/flush_job.h +7 -0
  57. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +21 -15
  58. package/deps/rocksdb/rocksdb/db/forward_iterator.h +4 -3
  59. package/deps/rocksdb/rocksdb/db/memtable_list.cc +9 -0
  60. package/deps/rocksdb/rocksdb/db/memtable_list.h +5 -0
  61. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.cc +53 -12
  62. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler.h +14 -2
  63. package/deps/rocksdb/rocksdb/db/periodic_work_scheduler_test.cc +10 -10
  64. package/deps/rocksdb/rocksdb/db/repair.cc +8 -6
  65. package/deps/rocksdb/rocksdb/db/seqno_time_test.cc +890 -0
  66. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.cc +324 -0
  67. package/deps/rocksdb/rocksdb/db/seqno_to_time_mapping.h +186 -0
  68. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_common.h +2 -0
  69. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_gflags.cc +13 -4
  70. package/deps/rocksdb/rocksdb/db_stress_tool/db_stress_test_base.cc +23 -2
  71. package/deps/rocksdb/rocksdb/env/env_test.cc +74 -1
  72. package/deps/rocksdb/rocksdb/env/io_posix.cc +11 -8
  73. package/deps/rocksdb/rocksdb/include/rocksdb/advanced_options.h +28 -0
  74. package/deps/rocksdb/rocksdb/include/rocksdb/c.h +14 -1
  75. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +4 -4
  76. package/deps/rocksdb/rocksdb/include/rocksdb/comparator.h +30 -23
  77. package/deps/rocksdb/rocksdb/include/rocksdb/db.h +1 -1
  78. package/deps/rocksdb/rocksdb/include/rocksdb/rate_limiter.h +3 -13
  79. package/deps/rocksdb/rocksdb/include/rocksdb/table_properties.h +5 -0
  80. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/debug.h +1 -2
  81. package/deps/rocksdb/rocksdb/include/rocksdb/utilities/ldb_cmd.h +1 -0
  82. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  83. package/deps/rocksdb/rocksdb/monitoring/stats_history_test.cc +26 -26
  84. package/deps/rocksdb/rocksdb/options/cf_options.cc +14 -1
  85. package/deps/rocksdb/rocksdb/options/cf_options.h +5 -0
  86. package/deps/rocksdb/rocksdb/options/customizable_test.cc +0 -56
  87. package/deps/rocksdb/rocksdb/options/db_options.cc +4 -5
  88. package/deps/rocksdb/rocksdb/options/options.cc +11 -1
  89. package/deps/rocksdb/rocksdb/options/options_helper.cc +8 -0
  90. package/deps/rocksdb/rocksdb/options/options_helper.h +4 -0
  91. package/deps/rocksdb/rocksdb/options/options_settable_test.cc +4 -0
  92. package/deps/rocksdb/rocksdb/options/options_test.cc +4 -0
  93. package/deps/rocksdb/rocksdb/src.mk +3 -0
  94. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +6 -1
  95. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +4 -0
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +36 -3
  97. package/deps/rocksdb/rocksdb/table/block_based/index_builder.cc +36 -1
  98. package/deps/rocksdb/rocksdb/table/block_based/index_builder.h +14 -3
  99. package/deps/rocksdb/rocksdb/table/internal_iterator.h +1 -1
  100. package/deps/rocksdb/rocksdb/table/meta_blocks.cc +6 -0
  101. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.cc +5 -0
  102. package/deps/rocksdb/rocksdb/table/plain/plain_table_builder.h +3 -0
  103. package/deps/rocksdb/rocksdb/table/sst_file_writer.cc +10 -7
  104. package/deps/rocksdb/rocksdb/table/table_builder.h +7 -3
  105. package/deps/rocksdb/rocksdb/table/table_properties.cc +9 -0
  106. package/deps/rocksdb/rocksdb/test_util/mock_time_env.h +3 -2
  107. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +58 -30
  108. package/deps/rocksdb/rocksdb/tools/db_bench_tool_test.cc +1 -0
  109. package/deps/rocksdb/rocksdb/tools/ldb_cmd.cc +20 -0
  110. package/deps/rocksdb/rocksdb/util/rate_limiter.cc +29 -154
  111. package/deps/rocksdb/rocksdb/util/rate_limiter.h +16 -34
  112. package/deps/rocksdb/rocksdb/util/rate_limiter_test.cc +0 -92
  113. package/deps/rocksdb/rocksdb/util/timer.h +6 -0
  114. package/deps/rocksdb/rocksdb/util/vector_iterator.h +4 -3
  115. package/deps/rocksdb/rocksdb/utilities/backup/backup_engine.cc +4 -45
  116. package/deps/rocksdb/rocksdb/utilities/debug.cc +40 -0
  117. package/deps/rocksdb/rocksdb.gyp +2 -0
  118. package/index.js +4 -0
  119. package/package.json +1 -1
  120. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  121. package/prebuilds/linux-x64/node.napi.node +0 -0
@@ -10,6 +10,8 @@
10
10
  #pragma once
11
11
 
12
12
  #include <array>
13
+ #include <atomic>
14
+ #include <cstdint>
13
15
  #include <memory>
14
16
  #include <string>
15
17
 
@@ -27,116 +29,254 @@ namespace ROCKSDB_NAMESPACE {
27
29
 
28
30
  namespace clock_cache {
29
31
 
30
- // Clock cache implementation. This is based on FastLRUCache's open-addressed
31
- // hash table. Importantly, it stores elements in an array, and resolves
32
- // collision using a probing strategy. Visibility and referenceability of
33
- // elements works as usual. See fast_lru_cache.h for a detailed description.
32
+ // Block cache implementation using a lock-free open-address hash table
33
+ // and clock eviction.
34
+
35
+ ///////////////////////////////////////////////////////////////////////////////
36
+ // Part 1: Handles
37
+ //
38
+ // Every slot in the hash table is a ClockHandle. A handle can be in a few
39
+ // different states, that stem from the fact that handles can be externally
40
+ // referenced and, thus, can't always be immediately evicted when a delete
41
+ // operation is executed or when they are replaced by a new version (via an
42
+ // insert of the same key). Concretely, the state of a handle is defined by the
43
+ // following two properties:
44
+ // (R) Externally referenced: A handle can be referenced externally, or not.
45
+ // Importantly, a handle can be evicted if and only if it's not
46
+ // referenced. In particular, when a handle becomes referenced, it's
47
+ // temporarily taken out of clock until all references to it are released.
48
+ // (M) Marked for deletion (or invisible): A handle is marked for deletion
49
+ // when an operation attempts to delete it, but the handle is externally
50
+ // referenced, so it can't be immediately deleted. When this mark is placed,
51
+ // lookups will no longer be able to find it. Consequently, no more external
52
+ // references will be taken to the handle. When a handle is marked for
53
+ // deletion, we also say it's invisible.
54
+ // These properties induce 4 different states, with transitions defined as
55
+ // follows:
56
+ // - Not M --> M: When a handle is deleted or replaced by a new version, but
57
+ // not immediately evicted.
58
+ // - M --> not M: This cannot happen. Once a handle is marked for deletion,
59
+ // there is no going back.
60
+ // - R --> not R: When all references to a handle are released.
61
+ // - Not R --> R: When an unreferenced handle becomes referenced. This can only
62
+ // happen if the handle is visible, since references to a handle can only be
63
+ // created when it's visible.
64
+ //
65
+ ///////////////////////////////////////////////////////////////////////////////
66
+ // Part 2: Hash table structure
67
+ //
68
+ // Internally, the cache uses an open-addressed hash table to index the handles.
69
+ // We use tombstone counters to keep track of displacements. Probes are
70
+ // generated with double-hashing (but the code can be easily modified to use
71
+ // other probing schemes, like linear hashing). Because of the tombstones and
72
+ // the two possible visibility states of a handle, the table slots (we use the
73
+ // word "slot" to refer to handles that are not necessarily valid key-value
74
+ // elements) can be in 4 different states:
75
+ // 1. Visible element: The slot contains an element in not M state.
76
+ // 2. To-be-deleted element: The slot contains an element in M state.
77
+ // 3. Tombstone: The slot doesn't contain an element, but there is some other
78
+ // element that probed this slot during its insertion.
79
+ // 4. Empty: The slot is unused.
80
+ // When a to-be-deleted element (a "ghost") is removed from the table, its slot
81
+ // becomes either a tombstone or an empty slot, depending on the number of
82
+ // displacements of the slot. In any case, the slot becomes available. When a
83
+ // handle is inserted into that slot, it becomes a visible element again.
84
+ //
85
+ ///////////////////////////////////////////////////////////////////////////////
86
+ // Part 3: The clock algorithm
87
+ //
88
+ // We maintain a circular buffer with the handles available for eviction,
89
+ // which the clock algorithm traverses (using a "clock pointer") to pick the
90
+ // next victim. We use the hash table array as the circular buffer, and mark
91
+ // the handles that are evictable. For this we use different clock flags, namely
92
+ // NONE, LOW, MEDIUM, HIGH, that represent priorities: LOW, MEDIUM and HIGH
93
+ // represent how close an element is from being evictable, LOW being immediately
94
+ // evictable. NONE means the slot is not evictable. This is due to one of the
95
+ // following reasons:
96
+ // (i) the slot doesn't contain an element, or
97
+ // (ii) the slot contains an element that is in R state, or
98
+ // (iii) the slot contains an element that was in R state but it's
99
+ // not any more, and the clock pointer has not swept through the
100
+ // slot since the element stopped being referenced.
101
+ //
102
+ // The priority NONE is really only important for case (iii), as in the other
103
+ // two cases there are other metadata fields that already capture the state.
104
+ // When an element stops being referenced (and is not deleted), the clock
105
+ // algorithm must acknowledge this, and assign a non-NONE priority to make
106
+ // the element evictable again.
107
+ //
108
+ ///////////////////////////////////////////////////////////////////////////////
109
+ // Part 4: Synchronization
110
+ //
111
+ // We provide the following synchronization guarantees:
112
+ // - Lookup is lock-free.
113
+ // - Release is lock-free, unless (i) no references to the element are left,
114
+ // and (ii) it was marked for deletion or the user wishes to delete if
115
+ // releasing the last reference.
116
+ // - Insert and Erase still use a per-shard lock.
117
+ //
118
+ // Our hash table is lock-free, in the sense that system-wide progress is
119
+ // guaranteed, i.e., some thread is always able to make progress.
34
120
  //
35
- // The main difference with FastLRUCache is, not surprisingly, the eviction
36
- // algorithm
37
- // ---instead of an LRU list, we maintain a circular list with the elements
38
- // available for eviction, which the clock algorithm traverses to pick the next
39
- // victim. The clock list is represented using the array of handles, and we
40
- // simply mark those elements that are present in the list. This is done using
41
- // different clock flags, namely NONE, LOW, MEDIUM, HIGH, that represent
42
- // priorities: NONE means that the element is not part of the clock list, and
43
- // LOW to HIGH represent how close an element is from being evictable (LOW being
44
- // immediately evictable). When the clock pointer steps on an element that is
45
- // not immediately evictable, it decreases its priority.
46
-
47
- constexpr double kLoadFactor = 0.35; // See fast_lru_cache.h.
48
-
49
- constexpr double kStrictLoadFactor = 0.7; // See fast_lru_cache.h.
121
+ ///////////////////////////////////////////////////////////////////////////////
122
+
123
+ // The load factor p is a real number in (0, 1) such that at all
124
+ // times at most a fraction p of all slots, without counting tombstones,
125
+ // are occupied by elements. This means that the probability that a
126
+ // random probe hits an occupied slot is at most p, and thus at most 1/(1-p)
127
+ // probes are required on average. For example, p = 70% implies that at most
128
+ // about 3.3 probes are needed on average (bear in mind that this reasoning
129
+ // doesn't consider the effects of clustering over time).
130
+ // Because the size of the hash table is always rounded up to the next
131
+ // power of 2, p is really an upper bound on the actual load factor---the
132
+ // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
133
+ // but bear in mind that slots only hold metadata, not actual values.
134
+ // Since space cost is dominated by the values (the LSM blocks),
135
+ // overprovisioning the table with metadata only increases the total cache space
136
+ // usage by a tiny fraction.
137
+ constexpr double kLoadFactor = 0.35;
138
+
139
+ // The user can exceed kLoadFactor if the sizes of the inserted values don't
140
+ // match estimated_value_size, or if strict_capacity_limit == false. To
141
+ // keep performance from plunging, we set a strict upper bound on the load factor.
142
+ constexpr double kStrictLoadFactor = 0.7;
50
143
 
51
144
  // Arbitrary seeds.
52
145
  constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
53
146
  constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
54
147
 
55
- // An experimental (under development!) alternative to LRUCache
148
+ // An experimental (under development!) alternative to LRUCache.
56
149
 
57
150
  struct ClockHandle {
58
151
  void* value;
59
152
  Cache::DeleterFn deleter;
60
153
  uint32_t hash;
61
- size_t total_charge; // TODO(opt): Only allow uint32_t?
62
- // The number of external refs to this entry.
63
- uint32_t refs;
154
+ size_t total_charge;
155
+ std::array<char, kCacheKeySize> key_data;
156
+
157
+ static constexpr uint8_t kExternalRefsOffset = 0;
158
+ static constexpr uint8_t kSharedRefsOffset = 15;
159
+ static constexpr uint8_t kExclusiveRefOffset = 30;
160
+ static constexpr uint8_t kWillBeDeletedOffset = 31;
161
+
162
+ enum Refs : uint32_t {
163
+ // Number of external references to the slot.
164
+ EXTERNAL_REFS = ((uint32_t{1} << 15) - 1)
165
+ << kExternalRefsOffset, // Bits 0, ..., 14
166
+ // Number of internal references plus external references to the slot.
167
+ SHARED_REFS = ((uint32_t{1} << 15) - 1)
168
+ << kSharedRefsOffset, // Bits 15, ..., 29
169
+ // Whether a thread has an exclusive reference to the slot.
170
+ EXCLUSIVE_REF = uint32_t{1} << kExclusiveRefOffset, // Bit 30
171
+ // Whether the handle will be deleted soon. When this bit is set, new
172
+ // internal
173
+ // or external references to this handle stop being accepted.
174
+ // There is an exception: external references can be created from
175
+ // existing external references, or converting from existing internal
176
+ // references.
177
+ WILL_BE_DELETED = uint32_t{1} << kWillBeDeletedOffset // Bit 31
178
+
179
+ // Shared references (i.e., external and internal references) and exclusive
180
+ // references are our custom implementation of RW locks---external and
181
+ // internal references are read locks, and exclusive references are write
182
+ // locks. We prioritize readers, which never block; in fact, they don't even
183
+ // use compare-and-swap operations. Using our own implementation of RW locks
184
+ // allows us to save many atomic operations by packing data more carefully.
185
+ // In particular:
186
+ // - Combining EXTERNAL_REFS and SHARED_REFS allows us to convert an
187
+ // internal
188
+ // reference into an external reference in a single atomic arithmetic
189
+ // operation.
190
+ // - Combining SHARED_REFS and WILL_BE_DELETED allows us to attempt to take
191
+ // a shared reference and check whether the entry is marked for deletion
192
+ // in a single atomic arithmetic operation.
193
+ };
194
+
195
+ static constexpr uint32_t kOneInternalRef = 0x8000;
196
+ static constexpr uint32_t kOneExternalRef = 0x8001;
197
+
198
+ std::atomic<uint32_t> refs;
64
199
 
65
- static constexpr int kIsVisibleOffset = 0;
66
- static constexpr int kIsElementOffset = 1;
67
- static constexpr int kClockPriorityOffset = 2;
68
- static constexpr int kIsHitOffset = 4;
69
- static constexpr int kCachePriorityOffset = 5;
200
+ static constexpr uint8_t kIsElementOffset = 1;
201
+ static constexpr uint8_t kClockPriorityOffset = 2;
202
+ static constexpr uint8_t kIsHitOffset = 4;
203
+ static constexpr uint8_t kCachePriorityOffset = 5;
70
204
 
71
205
  enum Flags : uint8_t {
72
- // Whether the handle is visible to Lookups.
73
- IS_VISIBLE = (1 << kIsVisibleOffset),
74
206
  // Whether the slot is in use by an element.
75
- IS_ELEMENT = (1 << kIsElementOffset),
76
- // Clock priorities. Represents how close a handle is from
77
- // being evictable.
78
- CLOCK_PRIORITY = (3 << kClockPriorityOffset),
207
+ IS_ELEMENT = 1 << kIsElementOffset,
208
+ // Clock priorities. Represents how close a handle is from being evictable.
209
+ CLOCK_PRIORITY = 3 << kClockPriorityOffset,
79
210
  // Whether the handle has been looked up after its insertion.
80
- HAS_HIT = (1 << kIsHitOffset),
81
- CACHE_PRIORITY = (1 << kCachePriorityOffset),
211
+ HAS_HIT = 1 << kIsHitOffset,
212
+ // The value of Cache::Priority for the handle.
213
+ CACHE_PRIORITY = 1 << kCachePriorityOffset,
82
214
  };
83
- uint8_t flags;
215
+
216
+ std::atomic<uint8_t> flags;
84
217
 
85
218
  enum ClockPriority : uint8_t {
86
- NONE = (0 << kClockPriorityOffset), // Not an element in the eyes of clock.
87
- LOW = (1 << kClockPriorityOffset), // Immediately evictable.
219
+ NONE = (0 << kClockPriorityOffset),
220
+ LOW = (1 << kClockPriorityOffset),
88
221
  MEDIUM = (2 << kClockPriorityOffset),
89
222
  HIGH = (3 << kClockPriorityOffset)
90
- // Priority is NONE if and only if
91
- // (i) the handle is not an element, or
92
- // (ii) the handle is an element but it is being referenced.
93
223
  };
94
224
 
95
- // The number of elements that hash to this slot or a lower one,
96
- // but wind up in a higher slot.
97
- uint32_t displacements;
98
-
99
- std::array<char, kCacheKeySize> key_data;
100
-
101
- ClockHandle() {
102
- value = nullptr;
103
- deleter = nullptr;
104
- hash = 0;
105
- total_charge = 0;
106
- refs = 0;
107
- flags = 0;
108
- SetIsVisible(false);
225
+ // The number of elements that hash to this slot or a lower one, but wind
226
+ // up in this slot or a higher one.
227
+ std::atomic<uint32_t> displacements;
228
+
229
+ // Synchronization rules:
230
+ // - Use a shared reference when we want the handle's identity
231
+ // members (key_data, hash, value and IS_ELEMENT flag) to
232
+ // remain untouched, but not modify them. The only updates
233
+ // that a shared reference allows are:
234
+ // * set CLOCK_PRIORITY to NONE;
235
+ // * set the HAS_HIT bit.
236
+ // Notice that these two types of updates are idempotent, so
237
+ // they don't require synchronization across shared references.
238
+ // - Use an exclusive reference when we want identity members
239
+ // to remain untouched, as well as modify any identity member
240
+ // or flag.
241
+ // - displacements can be modified without holding a reference.
242
+ // - refs is only modified through appropriate functions to
243
+ // take or release references.
244
+
245
+ ClockHandle()
246
+ : value(nullptr),
247
+ deleter(nullptr),
248
+ hash(0),
249
+ total_charge(0),
250
+ refs(0),
251
+ flags(0),
252
+ displacements(0) {
253
+ SetWillBeDeleted(false);
109
254
  SetIsElement(false);
110
255
  SetClockPriority(ClockPriority::NONE);
111
256
  SetCachePriority(Cache::Priority::LOW);
112
- displacements = 0;
113
257
  key_data.fill(0);
114
258
  }
115
259
 
116
- Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
117
-
118
- // Increase the reference count by 1.
119
- void Ref() { refs++; }
120
-
121
- // Just reduce the reference count by 1. Return true if it was last reference.
122
- bool Unref() {
123
- assert(refs > 0);
124
- refs--;
125
- return refs == 0;
260
+ ClockHandle(const ClockHandle& other) { *this = other; }
261
+
262
+ void operator=(const ClockHandle& other) {
263
+ value = other.value;
264
+ deleter = other.deleter;
265
+ hash = other.hash;
266
+ total_charge = other.total_charge;
267
+ refs.store(other.refs);
268
+ key_data = other.key_data;
269
+ flags.store(other.flags);
270
+ SetWillBeDeleted(other.WillBeDeleted());
271
+ SetIsElement(other.IsElement());
272
+ SetClockPriority(other.GetClockPriority());
273
+ SetCachePriority(other.GetCachePriority());
274
+ displacements.store(other.displacements);
126
275
  }
127
276
 
128
- // Return true if there are external refs, false otherwise.
129
- bool HasRefs() const { return refs > 0; }
277
+ Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
130
278
 
131
- bool IsVisible() const { return flags & IS_VISIBLE; }
132
-
133
- void SetIsVisible(bool is_visible) {
134
- if (is_visible) {
135
- flags |= IS_VISIBLE;
136
- } else {
137
- flags &= ~IS_VISIBLE;
138
- }
139
- }
279
+ bool HasExternalRefs() const { return (refs & EXTERNAL_REFS) > 0; }
140
280
 
141
281
  bool IsElement() const { return flags & IS_ELEMENT; }
142
282
 
@@ -144,7 +284,7 @@ struct ClockHandle {
144
284
  if (is_element) {
145
285
  flags |= IS_ELEMENT;
146
286
  } else {
147
- flags &= ~IS_ELEMENT;
287
+ flags &= static_cast<uint8_t>(~IS_ELEMENT);
148
288
  }
149
289
  }
150
290
 
@@ -152,7 +292,7 @@ struct ClockHandle {
152
292
 
153
293
  void SetHit() { flags |= HAS_HIT; }
154
294
 
155
- bool IsInClockList() const {
295
+ bool IsInClock() const {
156
296
  return GetClockPriority() != ClockHandle::ClockPriority::NONE;
157
297
  }
158
298
 
@@ -164,7 +304,7 @@ struct ClockHandle {
164
304
  if (priority == Cache::Priority::HIGH) {
165
305
  flags |= Flags::CACHE_PRIORITY;
166
306
  } else {
167
- flags &= ~Flags::CACHE_PRIORITY;
307
+ flags &= static_cast<uint8_t>(~Flags::CACHE_PRIORITY);
168
308
  }
169
309
  }
170
310
 
@@ -173,7 +313,7 @@ struct ClockHandle {
173
313
  }
174
314
 
175
315
  void SetClockPriority(ClockPriority priority) {
176
- flags &= ~Flags::CLOCK_PRIORITY;
316
+ flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
177
317
  flags |= priority;
178
318
  }
179
319
 
@@ -182,14 +322,13 @@ struct ClockHandle {
182
322
  kClockPriorityOffset;
183
323
  assert(p > 0);
184
324
  p--;
185
- flags &= ~Flags::CLOCK_PRIORITY;
325
+ flags &= static_cast<uint8_t>(~Flags::CLOCK_PRIORITY);
186
326
  ClockPriority new_priority =
187
327
  static_cast<ClockPriority>(p << kClockPriorityOffset);
188
328
  flags |= new_priority;
189
329
  }
190
330
 
191
331
  void FreeData() {
192
- assert(refs == 0);
193
332
  if (deleter) {
194
333
  (*deleter)(key(), value);
195
334
  }
@@ -232,17 +371,131 @@ struct ClockHandle {
232
371
  return total_charge - meta_charge;
233
372
  }
234
373
 
235
- inline bool IsEmpty() {
374
+ inline bool IsEmpty() const {
236
375
  return !this->IsElement() && this->displacements == 0;
237
376
  }
238
377
 
239
- inline bool IsTombstone() {
378
+ inline bool IsTombstone() const {
240
379
  return !this->IsElement() && this->displacements > 0;
241
380
  }
242
381
 
243
- inline bool Matches(const Slice& some_key) {
244
- return this->IsElement() && this->key() == some_key;
382
+ inline bool Matches(const Slice& some_key, uint32_t some_hash) const {
383
+ return this->IsElement() && this->hash == some_hash &&
384
+ this->key() == some_key;
385
+ }
386
+
387
+ bool WillBeDeleted() const { return refs & WILL_BE_DELETED; }
388
+
389
+ void SetWillBeDeleted(bool will_be_deleted) {
390
+ if (will_be_deleted) {
391
+ refs |= WILL_BE_DELETED;
392
+ } else {
393
+ refs &= ~WILL_BE_DELETED;
394
+ }
395
+ }
396
+
397
+ // The following functions are for taking and releasing refs.
398
+
399
+ // Tries to take an external ref. Returns true iff it succeeds.
400
+ inline bool TryExternalRef() {
401
+ if (!((refs += kOneExternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
402
+ return true;
403
+ }
404
+ refs -= kOneExternalRef;
405
+ return false;
406
+ }
407
+
408
+ // Releases an external ref. Returns the new value (this is useful to
409
+ // avoid an extra atomic read).
410
+ inline uint32_t ReleaseExternalRef() { return refs -= kOneExternalRef; }
411
+
412
+ // Take an external ref, assuming there is already one external ref
413
+ // to the handle.
414
+ void Ref() {
415
+ // TODO(Guido) Is it okay to assume that the existing external reference
416
+ // survives until this function returns?
417
+ refs += kOneExternalRef;
418
+ }
419
+
420
+ // Tries to take an internal ref. Returns true iff it succeeds.
421
+ inline bool TryInternalRef() {
422
+ if (!((refs += kOneInternalRef) & (EXCLUSIVE_REF | WILL_BE_DELETED))) {
423
+ return true;
424
+ }
425
+ refs -= kOneInternalRef;
426
+ return false;
427
+ }
428
+
429
+ inline void ReleaseInternalRef() { refs -= kOneInternalRef; }
430
+
431
+ // Tries to take an exclusive ref. Returns true iff it succeeds.
432
+ inline bool TryExclusiveRef() {
433
+ uint32_t will_be_deleted = refs & WILL_BE_DELETED;
434
+ uint32_t expected = will_be_deleted;
435
+ return refs.compare_exchange_strong(expected,
436
+ EXCLUSIVE_REF | will_be_deleted);
245
437
  }
438
+
439
+ // Repeatedly tries to take an exclusive reference, but stops as soon
440
+ // as an external reference is detected (in this case the wait would
441
+ // presumably be too long).
442
+ inline bool TrySpinExclusiveRef() {
443
+ uint32_t expected = 0;
444
+ uint32_t will_be_deleted = 0;
445
+ while (!refs.compare_exchange_strong(expected,
446
+ EXCLUSIVE_REF | will_be_deleted)) {
447
+ if (expected & EXTERNAL_REFS) {
448
+ return false;
449
+ }
450
+ will_be_deleted = expected & WILL_BE_DELETED;
451
+ expected = will_be_deleted;
452
+ }
453
+ return true;
454
+ }
455
+
456
+ inline void ReleaseExclusiveRef() { refs.fetch_and(~EXCLUSIVE_REF); }
457
+
458
+ // The following functions are for upgrading and downgrading refs.
459
+ // They guarantee atomicity, i.e., no exclusive refs to the handle
460
+ // can be taken by a different thread during the conversion.
461
+
462
+ inline void ExclusiveToInternalRef() {
463
+ refs += kOneInternalRef;
464
+ ReleaseExclusiveRef();
465
+ }
466
+
467
+ inline void ExclusiveToExternalRef() {
468
+ refs += kOneExternalRef;
469
+ ReleaseExclusiveRef();
470
+ }
471
+
472
+ // TODO(Guido) Do we want to bound the loop and prepare the
473
+ // algorithms to react to a failure?
474
+ inline void InternalToExclusiveRef() {
475
+ uint32_t expected = kOneInternalRef;
476
+ uint32_t will_be_deleted = 0;
477
+ while (!refs.compare_exchange_strong(expected,
478
+ EXCLUSIVE_REF | will_be_deleted)) {
479
+ will_be_deleted = expected & WILL_BE_DELETED;
480
+ expected = kOneInternalRef | will_be_deleted;
481
+ }
482
+ }
483
+
484
+ inline void InternalToExternalRef() {
485
+ refs += kOneExternalRef - kOneInternalRef;
486
+ }
487
+
488
+ // TODO(Guido) Same concern.
489
+ inline void ExternalToExclusiveRef() {
490
+ uint32_t expected = kOneExternalRef;
491
+ uint32_t will_be_deleted = 0;
492
+ while (!refs.compare_exchange_strong(expected,
493
+ EXCLUSIVE_REF | will_be_deleted)) {
494
+ will_be_deleted = expected & WILL_BE_DELETED;
495
+ expected = kOneExternalRef | will_be_deleted;
496
+ }
497
+ }
498
+
246
499
  }; // struct ClockHandle
247
500
 
248
501
  class ClockHandleTable {
@@ -252,31 +505,54 @@ class ClockHandleTable {
252
505
 
253
506
  // Returns a pointer to a visible element matching the key/hash, or
254
507
  // nullptr if not present.
255
- ClockHandle* Lookup(const Slice& key);
508
+ ClockHandle* Lookup(const Slice& key, uint32_t hash);
256
509
 
257
510
  // Inserts a copy of h into the hash table.
258
511
  // Returns a pointer to the inserted handle, or nullptr if no slot
259
512
  // available was found. If an existing visible element matching the
260
513
  // key/hash is already present in the hash table, the argument old
261
- // is set to pointe to it; otherwise, it's set to nullptr.
514
+ // is set to point to it; otherwise, it's set to nullptr.
515
+ // Returns an exclusive reference to h, and no references to old.
262
516
  ClockHandle* Insert(ClockHandle* h, ClockHandle** old);
263
517
 
264
- // Removes h from the hash table. The handle must already be off
265
- // the clock list.
518
+ // Removes h from the hash table. The handle must already be off clock.
266
519
  void Remove(ClockHandle* h);
267
520
 
268
- // Turns a visible element h into a ghost (i.e., not visible).
269
- void Exclude(ClockHandle* h);
521
+ // Extracts the element information from a handle (src), and assigns it
522
+ // to a hash table slot (dst). Doesn't touch displacements and refs,
523
+ // which are maintained by the hash table algorithm.
524
+ void Assign(ClockHandle* dst, ClockHandle* src);
270
525
 
271
- // Assigns a copy of h to the given slot.
272
- void Assign(int slot, ClockHandle* h);
526
+ template <typename T>
527
+ void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end,
528
+ bool apply_if_will_be_deleted) {
529
+ for (uint32_t i = index_begin; i < index_end; i++) {
530
+ ClockHandle* h = &array_[i];
531
+ if (h->TryExclusiveRef()) {
532
+ if (h->IsElement() &&
533
+ (apply_if_will_be_deleted || !h->WillBeDeleted())) {
534
+ // Hand the exclusive ref over to func, which is now responsible
535
+ // for releasing it.
536
+ func(h);
537
+ } else {
538
+ h->ReleaseExclusiveRef();
539
+ }
540
+ }
541
+ }
542
+ }
273
543
 
274
544
  template <typename T>
275
- void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) {
545
+ void ConstApplyToEntriesRange(T func, uint32_t index_begin,
546
+ uint32_t index_end,
547
+ bool apply_if_will_be_deleted) const {
276
548
  for (uint32_t i = index_begin; i < index_end; i++) {
277
549
  ClockHandle* h = &array_[i];
278
- if (h->IsVisible()) {
279
- func(h);
550
+ if (h->TryExclusiveRef()) {
551
+ if (h->IsElement() &&
552
+ (apply_if_will_be_deleted || !h->WillBeDeleted())) {
553
+ func(h);
554
+ }
555
+ h->ReleaseExclusiveRef();
280
556
  }
281
557
  }
282
558
  }
@@ -295,28 +571,38 @@ class ClockHandleTable {
295
571
  private:
296
572
  friend class ClockCacheShard;
297
573
 
298
- int FindVisibleElement(const Slice& key, int& probe, int displacement);
574
+ int FindElement(const Slice& key, uint32_t hash, uint32_t& probe);
299
575
 
300
- int FindAvailableSlot(const Slice& key, int& probe, int displacement);
576
+ int FindAvailableSlot(const Slice& key, uint32_t& probe);
301
577
 
302
- int FindVisibleElementOrAvailableSlot(const Slice& key, int& probe,
303
- int displacement);
578
+ int FindElementOrAvailableSlot(const Slice& key, uint32_t hash,
579
+ uint32_t& probe);
304
580
 
305
581
  // Returns the index of the first slot probed (hashing with
306
- // the given key) with a handle e such that cond(e) is true.
307
- // Otherwise, if no match is found, returns -1.
308
- // For every handle e probed except the final slot, updates
309
- // e->displacements += displacement.
310
- // The argument probe is modified such that consecutive calls
311
- // to FindSlot continue probing right after where the previous
312
- // call left.
313
- int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> cond,
314
- int& probe, int displacement);
582
+ // the given key) with a handle e such that match(e) is true.
583
+ // At every step, the function first tests whether match(e) holds.
584
+ // If it's false, it evaluates stop(e) to decide whether the
585
+ // search should be aborted, and in the affirmative returns -1.
586
+ // For every handle e probed except the last one, the function runs
587
+ // update(e). We say a probe to a handle e is aborting if match(e) is
588
+ // false and stop(e) is true. The argument probe is one more than the
589
+ // last non-aborting probe during the call. This is so that the
590
+ // variable can be used to keep track of progress across consecutive
591
+ // calls to FindSlot.
592
+ inline int FindSlot(const Slice& key, std::function<bool(ClockHandle*)> match,
593
+ std::function<bool(ClockHandle*)> stop,
594
+ std::function<void(ClockHandle*)> update,
595
+ uint32_t& probe);
596
+
597
+ // After a failed FindSlot call (i.e., with answer -1), this function
598
+ // decrements all displacements, starting from the 0-th probe.
599
+ void Rollback(const Slice& key, uint32_t probe);
315
600
 
316
601
  // Number of hash bits used for table index.
317
602
  // The size of the table is 1 << length_bits_.
318
603
  int length_bits_;
319
604
 
605
+ // For faster computation of ModTableSize.
320
606
  const uint32_t length_bits_mask_;
321
607
 
322
608
  // Number of elements in the table.
@@ -345,10 +631,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
345
631
  void SetStrictCapacityLimit(bool strict_capacity_limit) override;
346
632
 
347
633
  // Like Cache methods, but with an extra "hash" parameter.
348
- // Insert an item into the hash table and, if handle is null, insert into
349
- // the clock list. Older items are evicted as necessary. If the cache is full
350
- // and free_handle_on_fail is true, the item is deleted and handle is set to
351
- // nullptr.
634
+ // Insert an item into the hash table and, if handle is null, make it
635
+ // evictable by the clock algorithm. Older items are evicted as necessary.
636
+ // If the cache is full and free_handle_on_fail is true, the item is deleted
637
+ // and handle is set to nullptr.
352
638
  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
353
639
  Cache::DeleterFn deleter, Cache::Handle** handle,
354
640
  Cache::Priority priority) override;
@@ -393,13 +679,18 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
393
679
 
394
680
  private:
395
681
  friend class ClockCache;
396
- void ClockRemove(ClockHandle* e);
397
- void ClockInsert(ClockHandle* e);
682
+
683
+ // Makes an element evictable by clock.
684
+ void ClockOn(ClockHandle* h);
685
+
686
+ // Makes an element non-evictable.
687
+ void ClockOff(ClockHandle* h);
688
+
689
+ // Requires an exclusive ref on h.
690
+ void Evict(ClockHandle* h);
398
691
 
399
692
  // Free some space following strict clock policy until enough space
400
- // to hold (usage_ + charge) is freed or the clock list is empty
401
- // This function is not thread safe - it needs to be executed while
402
- // holding the mutex_.
693
+ // to hold (usage_ + charge) is freed or there are no evictable elements.
403
694
  void EvictFromClock(size_t charge, autovector<ClockHandle>* deleted);
404
695
 
405
696
  // Returns the charge of a single handle.
@@ -436,9 +727,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShard {
436
727
  // Memory size for entries residing in the cache.
437
728
  size_t usage_;
438
729
 
439
- // Memory size for unpinned entries in the clock list.
440
- size_t clock_usage_;
441
-
442
730
  // mutex_ protects the following state.
443
731
  // We don't count mutex_ as the cache's internal state so semantically we
444
732
  // don't mind mutex_ invoking the non-const actions.