tigerbeetle-node 0.6.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -83
- package/dist/benchmark.js +102 -100
- package/dist/benchmark.js.map +1 -1
- package/dist/index.d.ts +82 -82
- package/dist/index.js +74 -93
- package/dist/index.js.map +1 -1
- package/dist/test.js +135 -112
- package/dist/test.js.map +1 -1
- package/package.json +13 -14
- package/scripts/download_node_headers.sh +3 -1
- package/src/benchmark.ts +114 -118
- package/src/index.ts +102 -111
- package/src/node.zig +53 -51
- package/src/test.ts +146 -125
- package/src/tigerbeetle/scripts/benchmark.bat +46 -46
- package/src/tigerbeetle/scripts/benchmark.sh +5 -0
- package/src/tigerbeetle/scripts/install_zig.bat +109 -109
- package/src/tigerbeetle/scripts/install_zig.sh +7 -3
- package/src/tigerbeetle/scripts/vopr.bat +47 -47
- package/src/tigerbeetle/src/benchmark.zig +63 -96
- package/src/tigerbeetle/src/config.zig +23 -19
- package/src/tigerbeetle/src/demo.zig +2 -15
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +10 -10
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
- package/src/tigerbeetle/src/{demo_04_create_transfers_two_phase_commit.zig → demo_04_create_pending_transfers.zig} +18 -12
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +37 -0
- package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +24 -0
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +1 -1
- package/src/tigerbeetle/src/io/linux.zig +4 -4
- package/src/tigerbeetle/src/main.zig +19 -3
- package/src/tigerbeetle/src/message_pool.zig +5 -2
- package/src/tigerbeetle/src/ring_buffer.zig +48 -3
- package/src/tigerbeetle/src/simulator.zig +104 -8
- package/src/tigerbeetle/src/state_machine.zig +1813 -816
- package/src/tigerbeetle/src/test/cluster.zig +165 -32
- package/src/tigerbeetle/src/test/packet_simulator.zig +14 -1
- package/src/tigerbeetle/src/test/state_checker.zig +3 -1
- package/src/tigerbeetle/src/test/state_machine.zig +8 -7
- package/src/tigerbeetle/src/test/storage.zig +99 -40
- package/src/tigerbeetle/src/tigerbeetle.zig +103 -98
- package/src/tigerbeetle/src/vsr/journal.zig +1387 -459
- package/src/tigerbeetle/src/vsr/replica.zig +1204 -417
- package/src/tigerbeetle/src/vsr.zig +203 -49
- package/src/translate.zig +10 -0
- package/.yarn/releases/yarn-berry.cjs +0 -55
- package/.yarnrc.yml +0 -1
- package/scripts/postinstall.sh +0 -6
- package/src/tigerbeetle/src/demo_05_accept_transfers.zig +0 -23
- package/src/tigerbeetle/src/demo_06_reject_transfers.zig +0 -17
- package/src/tigerbeetle/src/format_test.zig +0 -69
- package/yarn.lock +0 -42
package/src/tigerbeetle/src/vsr/journal.zig

@@ -12,6 +12,74 @@ const Header = vsr.Header;
 
 const log = std.log.scoped(.journal);
 
+/// There are two contiguous circular buffers on disk in the journal storage zone.
+///
+/// In both rings, the `op` for each reserved header is set to the slot index.
+/// This helps WAL recovery detect misdirected reads/writes.
+const Ring = enum {
+    /// A circular buffer of prepare message headers.
+    headers,
+    /// A circular buffer of prepare messages. Each slot is padded to `config.message_size_max`.
+    prepares,
+};
+
+const headers_per_sector = @divExact(config.sector_size, @sizeOf(Header));
+comptime {
+    assert(headers_per_sector > 0);
+}
+
+/// A slot is `op % config.journal_slot_count`.
+const Slot = struct { index: u64 };
+
+/// An inclusive, non-empty range of slots.
+const SlotRange = struct {
+    head: Slot,
+    tail: Slot,
+
+    /// Returns whether this range (inclusive) includes the specified slot.
+    ///
+    /// Cases (`·`=included, ` `=excluded):
+    ///
+    /// * `head < tail` → ` head··tail `
+    /// * `head > tail` → `··tail  head··` (The range wraps around).
+    /// * `head = tail` → panic (Caller must handle this case separately).
+    fn contains(self: *const SlotRange, slot: Slot) bool {
+        // To avoid confusion, the empty range must be checked separately by the caller.
+        assert(self.head.index != self.tail.index);
+
+        if (self.head.index < self.tail.index) {
+            return self.head.index <= slot.index and slot.index <= self.tail.index;
+        }
+        if (self.head.index > self.tail.index) {
+            return slot.index <= self.tail.index or self.head.index <= slot.index;
+        }
+        unreachable;
+    }
+};
+
+const slot_count = config.journal_slot_count;
+const headers_size = config.journal_size_headers;
+const prepares_size = config.journal_size_prepares;
+
+pub const write_ahead_log_zone_size = headers_size + prepares_size;
+
+comptime {
+    assert(slot_count > 0);
+    assert(slot_count % 2 == 0);
+    assert(slot_count % headers_per_sector == 0);
+    assert(slot_count >= headers_per_sector);
+    // The length of the prepare pipeline is the upper bound on how many ops can be
+    // reordered during a view change. See `recover_prepares_callback()` for more detail.
+    assert(slot_count > config.pipeline_max);
+
+    assert(headers_size > 0);
+    assert(headers_size % config.sector_size == 0);
+
+    assert(prepares_size > 0);
+    assert(prepares_size % config.sector_size == 0);
+    assert(prepares_size % config.message_size_max == 0);
+}
+
 pub fn Journal(comptime Replica: type, comptime Storage: type) type {
     return struct {
         const Self = @This();
@@ -28,7 +96,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         };
 
         pub const Write = struct {
-            pub const Trigger = enum { append, repair };
+            pub const Trigger = enum { append, repair, pipeline };
 
             self: *Self,
             callback: fn (self: *Replica, wrote: ?*Message, trigger: Trigger) void,
@@ -39,6 +107,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             /// True if this Write has acquired a lock on a sector of headers.
             /// This also means that the Write is currently writing sectors or queuing to do so.
             header_sector_locked: bool = false,
+
             /// Linked list of Writes waiting to acquire the same header sector as this Write.
             header_sector_next: ?*Write = null,
 
@@ -46,18 +115,17 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             range: Range,
 
             const Sector = *align(config.sector_size) [config.sector_size]u8;
+
             fn header_sector(write: *Self.Write, journal: *Self) Sector {
                 assert(journal.writes.items.len == journal.headers_iops.len);
-                const i = @divExact(
-
-
+                const i = @divExact(
+                    @ptrToInt(write) - @ptrToInt(&journal.writes.items),
+                    @sizeOf(Self.Write),
+                );
+                // TODO The compiler should not need this align cast as the type of `headers_iops`
+                // ensures that each buffer is properly aligned.
                 return @alignCast(config.sector_size, &journal.headers_iops[i]);
             }
-
-            fn header_sector_same(write: *Self.Write, other: *Self.Write) bool {
-                return write_prepare_header_offset(write.message) ==
-                    write_prepare_header_offset(other.message);
-            }
         };
 
         /// State that needs to be persisted while waiting for an overlapping
@@ -85,27 +153,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         storage: *Storage,
         replica: u8,
-        size: u64,
-        size_headers: u64,
-        size_circular_buffer: u64,
 
+        /// A header is located at `slot == header.op % headers.len`.
+        ///
+        /// Each slot's `header.command` is either `prepare` or `reserved`.
+        /// When the slot's header is `reserved`, the header's `op` is the slot index.
+        ///
+        /// During recovery, store the (unvalidated) headers of the prepare ring.
+        // TODO Use 2 separate header lists: "staging" and "working".
+        // When participating in a view change, each replica should only send the headers from its
+        // working set that it knows it prepared.
+        // This also addresses the problem of redundant headers being written prematurely due to
+        // batching (after the first log cycle — for the first log cycle we write an invalid message).
         headers: []align(config.sector_size) Header,
-
+
+        /// Store the redundant headers (unvalidated) during recovery.
+        // TODO When "headers" is split into "staging" and "working", reuse one of those instead.
+        headers_redundant: []align(config.sector_size) Header,
+
+        /// We copy-on-write to these buffers, as the in-memory headers may change while writing.
         /// The buffers belong to the IOP at the corresponding index in IOPS.
         headers_iops: *align(config.sector_size) [config.io_depth_write][config.sector_size]u8,
-        /// Apart from the header written with the entry, we also store two redundant copies of each
-        /// header at different locations on disk, and we alternate between these for each append.
-        /// This tracks which version (0 or 1) should be written to next:
-        headers_version: u1 = 0,
 
         /// Statically allocated read IO operation context data.
         reads: IOPS(Read, config.io_depth_read) = .{},
+
        /// Statically allocated write IO operation context data.
        writes: IOPS(Write, config.io_depth_write) = .{},
 
        /// Whether an entry is in memory only and needs to be written or is being written:
        /// We use this in the same sense as a dirty bit in the kernel page cache.
-        /// A dirty bit means that we have not
+        /// A dirty bit means that we have not prepared the entry, or need to repair a faulty entry.
        dirty: BitSet,
 
        /// Whether an entry was written to disk and this write was subsequently lost due to:
@@ -113,43 +191,65 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// * a misdirected write (or a misdirected read, we do not distinguish), or else
         /// * a latent sector error, where the sector can no longer be read.
         /// A faulty bit means that we prepared and then lost the entry.
-        /// A faulty bit requires the dirty bit to also be set so that
+        /// A faulty bit requires the dirty bit to also be set so that callers need not check both.
         /// A faulty bit is used then only to qualify the severity of the dirty bit.
         faulty: BitSet,
 
-
+        /// The checksum of the prepare in the corresponding slot.
+        /// This is used to respond to `request_prepare` messages even when the slot is faulty.
+        /// For example, the slot may be faulty because the redundant header is faulty.
+        ///
+        /// The checksum will missing (`prepare_checksums[i]=0`, `prepare_inhabited[i]=false`) when:
+        /// * the message in the slot is reserved,
+        /// * the message in the slot is being written, or when
+        /// * the message in the slot is corrupt.
+        // TODO: `prepare_checksums` and `prepare_inhabited` should be combined into a []?u128,
+        // but that type is currently unusable (as of Zig 0.9.1).
+        // See: https://github.com/ziglang/zig/issues/9871
+        prepare_checksums: []u128,
+        /// When prepare_inhabited[i]==false, prepare_checksums[i]==0.
+        /// (`undefined` would may more sense than `0`, but `0` allows it to be asserted).
+        prepare_inhabited: []bool,
+
+        recovered: bool = false,
         recovering: bool = false,
 
-        pub fn init(
-
-            storage: *Storage,
-            replica: u8,
-            size: u64,
-            headers_count: u32,
-            init_prepare: *Header,
-        ) !Self {
-            if (@mod(size, config.sector_size) != 0) return error.SizeMustBeAMultipleOfSectorSize;
-            if (!math.isPowerOfTwo(headers_count)) return error.HeadersCountMustBeAPowerOfTwo;
-            assert(storage.size == size);
-
-            const headers_per_sector = @divExact(config.sector_size, @sizeOf(Header));
-            assert(headers_per_sector > 0);
-            assert(headers_count >= headers_per_sector);
+        pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Self {
+            assert(write_ahead_log_zone_size <= storage.size);
 
             var headers = try allocator.allocAdvanced(
                 Header,
                 config.sector_size,
-
+                slot_count,
                 .exact,
             );
             errdefer allocator.free(headers);
-
+            for (headers) |*header| header.* = undefined;
 
-            var
+            var headers_redundant = try allocator.allocAdvanced(
+                Header,
+                config.sector_size,
+                slot_count,
+                .exact,
+            );
+            errdefer allocator.free(headers_redundant);
+            for (headers_redundant) |*header| header.* = undefined;
+
+            var dirty = try BitSet.init(allocator, slot_count);
             errdefer dirty.deinit(allocator);
+            for (headers) |_, index| dirty.set(Slot{ .index = index });
 
-            var faulty = try BitSet.init(allocator,
+            var faulty = try BitSet.init(allocator, slot_count);
             errdefer faulty.deinit(allocator);
+            for (headers) |_, index| faulty.set(Slot{ .index = index });
+
+            var prepare_checksums = try allocator.alloc(u128, slot_count);
+            errdefer allocator.free(prepare_checksums);
+            std.mem.set(u128, prepare_checksums, 0);
+
+            var prepare_inhabited = try allocator.alloc(bool, slot_count);
+            errdefer allocator.free(prepare_inhabited);
+            std.mem.set(bool, prepare_inhabited, false);
 
             const headers_iops = (try allocator.allocAdvanced(
                 [config.sector_size]u8,
@@ -159,45 +259,36 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             ))[0..config.io_depth_write];
             errdefer allocator.free(headers_iops);
 
-
-            const size_headers = headers.len * @sizeOf(Header);
-            const size_headers_copies = size_headers * header_copies;
-            if (size_headers_copies >= size) return error.SizeTooSmallForHeadersCount;
-
-            const size_circular_buffer = size - size_headers_copies;
-            if (size_circular_buffer < 64 * 1024 * 1024) return error.SizeTooSmallForCircularBuffer;
-
-            log.debug("{}: size={} headers_len={} headers={} circular_buffer={}", .{
+            log.debug("{}: slot_count={} size={} headers_size={} prepares_size={}", .{
                 replica,
-
-
-                std.fmt.fmtIntSizeBin(
-                std.fmt.fmtIntSizeBin(
+                slot_count,
+                std.fmt.fmtIntSizeBin(write_ahead_log_zone_size),
+                std.fmt.fmtIntSizeBin(headers_size),
+                std.fmt.fmtIntSizeBin(prepares_size),
             });
 
             var self = Self{
                 .storage = storage,
                 .replica = replica,
-                .size = size,
-                .size_headers = size_headers,
-                .size_circular_buffer = size_circular_buffer,
                 .headers = headers,
+                .headers_redundant = headers_redundant,
                 .dirty = dirty,
                 .faulty = faulty,
+                .prepare_checksums = prepare_checksums,
+                .prepare_inhabited = prepare_inhabited,
                 .headers_iops = headers_iops,
             };
 
-            assert(@mod(self.size_circular_buffer, config.sector_size) == 0);
             assert(@mod(@ptrToInt(&self.headers[0]), config.sector_size) == 0);
-            assert(self.dirty.bits.
-            assert(self.faulty.bits.
+            assert(self.dirty.bits.bit_length == slot_count);
+            assert(self.faulty.bits.bit_length == slot_count);
+            assert(self.dirty.count == slot_count);
+            assert(self.faulty.count == slot_count);
+            assert(self.prepare_checksums.len == slot_count);
+            assert(self.prepare_inhabited.len == slot_count);
 
-
-
-            assert(init_prepare.valid_checksum());
-            assert(init_prepare.invalid() == null);
-            self.headers[0] = init_prepare.*;
-            self.assert_headers_reserved_from(init_prepare.op + 1);
+            for (self.headers) |*h| assert(!h.valid_checksum());
+            for (self.headers_redundant) |*h| assert(!h.valid_checksum());
 
             return self;
         }
@@ -208,7 +299,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             self.dirty.deinit(allocator);
             self.faulty.deinit(allocator);
             allocator.free(self.headers);
+            allocator.free(self.headers_redundant);
             allocator.free(self.headers_iops);
+            allocator.free(self.prepare_checksums);
+            allocator.free(self.prepare_inhabited);
 
             {
                 var it = self.reads.iterate();
@@ -220,74 +314,168 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             }
         }
 
-        ///
-
-
-
+        /// Returns whether this is a fresh database WAL; no prepares (except the root) have ever
+        /// been written. This determines whether a replica can transition immediately to normal
+        /// status, or if it needs to run recovery protocol.
+        ///
+        /// Called by the replica immediately after WAL recovery completes, but before the replica
+        /// issues any I/O from handling messages.
+        pub fn is_empty(self: *const Self) bool {
+            assert(!self.recovering);
+            assert(self.recovered);
+            assert(self.writes.executing() == 0);
+            assert(self.headers[0].valid_checksum());
+
+            const replica = @fieldParentPtr(Replica, "journal", self);
+            if (self.headers[0].operation != .root) return false;
+
+            assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
+            assert(self.headers[0].checksum == self.prepare_checksums[0]);
+            assert(self.prepare_inhabited[0]);
+
+            // If any message is faulty, we must fall back to VSR recovery protocol (i.e. treat
+            // this as a non-empty WAL) since that message may have been a prepare.
+            if (self.faulty.count > 0) return false;
+
+            for (self.headers[1..]) |*header| {
+                if (header.command == .prepare) return false;
+            }
+
+            for (self.prepare_inhabited[1..]) |inhabited| {
+                if (inhabited) return false;
+            }
+
+            return true;
+        }
+
+        pub fn slot_for_op(_: *const Self, op: u64) Slot {
+            return Slot{ .index = op % slot_count };
+        }
+
+        pub fn slot_with_op(self: *const Self, op: u64) ?Slot {
+            if (self.header_with_op(op)) |_| {
+                return self.slot_for_op(op);
+            } else {
+                return null;
+            }
+        }
+
+        pub fn slot_with_op_and_checksum(self: *const Self, op: u64, checksum: u128) ?Slot {
+            if (self.header_with_op_and_checksum(op, checksum)) |_| {
+                return self.slot_for_op(op);
+            } else {
+                return null;
+            }
+        }
+
+        pub fn slot_for_header(self: *const Self, header: *const Header) Slot {
+            assert(header.command == .prepare);
+            return self.slot_for_op(header.op);
+        }
+
+        pub fn slot_with_header(self: *const Self, header: *const Header) ?Slot {
+            assert(header.command == .prepare);
+            return self.slot_with_op(header.op);
         }
 
         /// Returns any existing entry at the location indicated by header.op.
         /// This existing entry may have an older or newer op number.
-        pub fn
+        pub fn header_for_entry(self: *const Self, header: *const Header) ?*const Header {
             assert(header.command == .prepare);
-            return self.
+            return self.header_for_op(header.op);
         }
 
-        /// We use
-
-        /// this is to prevent variable offsets from impacting the location of an op. Otherwise, the
-        /// same op number but for different views could exist at multiple locations in the journal.
-        pub fn entry_for_op(self: *Self, op: u64) ?*const Header {
+        /// We use `op` directly to index into the headers array and locate ops without a scan.
+        pub fn header_for_op(self: *const Self, op: u64) ?*const Header {
             // TODO Snapshots
-            const
-
-
-
+            const slot = self.slot_for_op(op);
+            const existing = &self.headers[slot.index];
+            switch (existing.command) {
+                .prepare => {
+                    assert(self.slot_for_op(existing.op).index == slot.index);
+                    return existing;
+                },
+                .reserved => {
+                    assert(existing.op == slot.index);
+                    return null;
+                },
+                else => unreachable,
+            }
         }
 
         /// Returns the entry at `@mod(op)` location, but only if `entry.op == op`, else `null`.
         /// Be careful of using this without considering that there may still be an existing op.
-        pub fn
-            if (self.
+        pub fn header_with_op(self: *const Self, op: u64) ?*const Header {
+            if (self.header_for_op(op)) |existing| {
                 if (existing.op == op) return existing;
             }
             return null;
         }
 
-        /// As per `
-        pub fn
-            self: *Self,
+        /// As per `header_with_op()`, but only if there is an optional checksum match.
+        pub fn header_with_op_and_checksum(
+            self: *const Self,
             op: u64,
             checksum: ?u128,
         ) ?*const Header {
-            if (self.
+            if (self.header_with_op(op)) |existing| {
                 assert(existing.op == op);
                 if (checksum == null or existing.checksum == checksum.?) return existing;
             }
             return null;
         }
 
-
-
-
-
+        // TODO How should we handle the case where the current header argument is the same as
+        // op_checkpoint?
+        pub fn previous_entry(self: *const Self, header: *const Header) ?*const Header {
+            if (header.op == 0) {
+                return null;
+            } else {
+                return self.header_for_op(header.op - 1);
+            }
         }
 
-        pub fn next_entry(self: *Self, header: *const Header) ?*const Header {
-
-            if (header.op + 1 == self.headers.len) return null;
-            return self.entry_for_op(header.op + 1);
+        pub fn next_entry(self: *const Self, header: *const Header) ?*const Header {
+            return self.header_for_op(header.op + 1);
        }
 
-
-
-            assert(
-
+        /// Returns the highest op number prepared, in any slot without reference to the checkpoint.
+        pub fn op_maximum(self: *const Self) u64 {
+            assert(self.recovered);
+
+            var op: u64 = 0;
+            for (self.headers) |*header| {
+                if (header.command == .prepare) {
+                    if (header.op > op) op = header.op;
+                } else {
+                    assert(header.command == .reserved);
+                }
+            }
+            return op;
        }
 
-
+        /// Returns the highest op number prepared, as per `header_ok()` in the untrusted headers.
+        fn op_maximum_headers_untrusted(cluster: u32, headers_untrusted: []const Header) u64 {
+            var op: u64 = 0;
+            for (headers_untrusted) |*header_untrusted, slot_index| {
+                const slot = Slot{ .index = slot_index };
+                if (header_ok(cluster, slot, header_untrusted)) |header| {
+                    if (header.command == .prepare) {
+                        if (header.op > op) op = header.op;
+                    } else {
+                        assert(header.command == .reserved);
+                    }
+                }
+            }
+            return op;
+        }
+
+        pub fn has(self: *const Self, header: *const Header) bool {
+            assert(self.recovered);
+            assert(header.command == .prepare);
             // TODO Snapshots
-            const
+            const slot = self.slot_for_op(header.op);
+            const existing = &self.headers[slot.index];
             if (existing.command == .reserved) {
                 return false;
             } else {
@@ -301,40 +489,49 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             }
         }
 
-        pub fn has_clean(self: *Self, header: *const Header) bool {
+        pub fn has_clean(self: *const Self, header: *const Header) bool {
             // TODO Snapshots
-
+            if (self.slot_with_op_and_checksum(header.op, header.checksum)) |slot| {
+                if (!self.dirty.bit(slot)) {
+                    assert(self.prepare_inhabited[slot.index]);
+                    assert(self.prepare_checksums[slot.index] == header.checksum);
+                    return true;
+                }
+            }
+            return false;
         }
 
-        pub fn has_dirty(self: *Self, header: *const Header) bool {
+        pub fn has_dirty(self: *const Self, header: *const Header) bool {
             // TODO Snapshots
-            return self.has(header) and self.dirty.bit(header
+            return self.has(header) and self.dirty.bit(self.slot_with_header(header).?);
         }
 
-        /// Copies latest headers between `op_min` and `op_max` (both inclusive) as
-        /// Reverses the order when copying so that latest headers are copied first, which
+        /// Copies latest headers between `op_min` and `op_max` (both inclusive) as fit in `dest`.
+        /// Reverses the order when copying so that latest headers are copied first, which protects
         /// against the callsite slicing the buffer the wrong way and incorrectly.
         /// Skips .reserved headers (gaps between headers).
         /// Zeroes the `dest` buffer in case the copy would underflow and leave a buffer bleed.
         /// Returns the number of headers actually copied.
         pub fn copy_latest_headers_between(
-            self: *Self,
+            self: *const Self,
             op_min: u64,
             op_max: u64,
             dest: []Header,
         ) usize {
+            assert(self.recovered);
             assert(op_min <= op_max);
             assert(dest.len > 0);
 
             var copied: usize = 0;
-
+            // Poison all slots; only slots less than `copied` are used.
+            std.mem.set(Header, dest, undefined);
 
             // Start at op_max + 1 and do the decrement upfront to avoid overflow when op_min == 0:
             var op = op_max + 1;
             while (op > op_min) {
                 op -= 1;
 
-                if (self.
+                if (self.header_with_op(op)) |header| {
                     dest[copied] = header.*;
                     assert(dest[copied].invalid() == null);
                     copied += 1;
@@ -342,12 +539,16 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 }
             }
 
-            log.debug(
-                op_min,
-
-
-
-
+            log.debug(
+                "{}: copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}",
+                .{
+                    self.replica,
+                    op_min,
+                    op_max,
+                    dest.len,
+                    copied,
+                },
+            );
 
             return copied;
         }
@@ -360,7 +561,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// We expect that `op_min` and `op_max` (`replica.commit_min` and `replica.op`) must exist.
         /// A range will never include `op_min` because this is already committed.
         /// A range will never include `op_max` because this must be up to date as the latest op.
-        /// We must therefore first resolve any
+        /// We must therefore first resolve any op uncertainty so that we can trust `op_max` here.
         ///
         /// For example: If ops 3, 9 and 10 are missing, returns: `{ .op_min = 9, .op_max = 10 }`.
         ///
@@ -382,11 +583,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 op -= 1;
 
                 // Get the entry at @mod(op) location, but only if entry.op == op, else null:
-                var A = self.
+                var A = self.header_with_op(op);
                 if (A) |a| {
                     if (B) |b| {
                         // If A was reordered then A may have a newer op than B (but an older view).
-                        // However, here we use
+                        // However, here we use header_with_op() to assert a.op + 1 == b.op:
                         assert(a.op + 1 == b.op);
 
                         // We do not assert a.view <= b.view here unless the chain is intact because
@@ -417,15 +618,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                     } else if (a.checksum == b.parent) {
                         // A is connected to B, and B is connected or B is op_max.
                         assert(a.view <= b.view);
-                    } else if (a.view
-                        // A is not connected to B,
+                    } else if (a.view != b.view) {
+                        // A is not connected to B, open range:
                         assert(a.op > op_min);
+                        assert(b.op <= op_max);
                         range = .{ .op_min = a.op, .op_max = a.op };
-                    } else if (a.view > b.view) {
-                        // A is not connected to B, but A is newer than B, open and close range:
-                        assert(b.op < op_max);
-                        range = .{ .op_min = b.op, .op_max = b.op };
-                        break;
                     } else {
                         // Op numbers in the same view must be connected.
                         unreachable;
@@ -478,6 +675,9 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             checksum: u128,
             destination_replica: ?u8,
         ) void {
+            assert(self.recovered);
+            assert(checksum != 0);
+
             const replica = @fieldParentPtr(Replica, "journal", self);
             if (op > replica.op) {
                 self.read_prepare_log(op, checksum, "beyond replica.op");
@@ -487,39 +687,57 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
             // Do not use this pointer beyond this function's scope, as the
             // header memory may then change:
-            const exact = self.
+            const exact = self.header_with_op_and_checksum(op, checksum) orelse {
                 self.read_prepare_log(op, checksum, "no entry exactly");
                 callback(replica, null, null);
                 return;
             };
 
-
-
+            const slot = self.slot_with_op_and_checksum(op, checksum).?;
+            if (self.faulty.bit(slot)) {
+                assert(self.dirty.bit(slot));
 
                 self.read_prepare_log(op, checksum, "faulty");
                 callback(replica, null, null);
                 return;
             }
 
-            if (self.dirty.bit(
+            if (self.dirty.bit(slot)) {
                 self.read_prepare_log(op, checksum, "dirty");
                 callback(replica, null, null);
                 return;
             }
 
-            const physical_size = vsr.sector_ceil(exact.size);
-            assert(physical_size >= exact.size);
-
-            const message = replica.message_bus.get_message();
-            defer replica.message_bus.unref(message);
-
             // Skip the disk read if the header is all we need:
             if (exact.size == @sizeOf(Header)) {
+                const message = replica.message_bus.get_message();
+                defer replica.message_bus.unref(message);
+
                 message.header.* = exact.*;
                 callback(replica, message, destination_replica);
                 return;
             }
 
+            self.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
+        }
+
+        /// Read a prepare from disk. There may or may not be an in-memory header.
+        pub fn read_prepare_with_op_and_checksum(
+            self: *Self,
+            callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
+            op: u64,
+            checksum: u128,
+            destination_replica: ?u8,
+        ) void {
+            const replica = @fieldParentPtr(Replica, "journal", self);
+            const slot = self.slot_for_op(op);
+            assert(self.recovered);
+            assert(self.prepare_inhabited[slot.index]);
+            assert(self.prepare_checksums[slot.index] == checksum);
+
+            const message = replica.message_bus.get_message();
+            defer replica.message_bus.unref(message);
+
             const read = self.reads.acquire() orelse {
                 self.read_prepare_log(op, checksum, "waiting for IOP");
                 callback(replica, null, null);
@@ -536,29 +754,34 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 .destination_replica = destination_replica,
             };
 
-
-
-            const buffer = message.buffer[0..physical_size];
-            const offset = self.offset_in_circular_buffer(exact.offset);
-
-            // Memory must not be owned by `self.headers` as these may be modified concurrently:
-            assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
-                @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + self.size_headers);
+            const buffer: []u8 = message.buffer[0..config.message_size_max];
+            const offset = offset_physical(.prepares, slot);
 
             log.debug(
                 "{}: read_sectors: offset={} len={}",
                 .{ replica.replica, offset, buffer.len },
             );
 
-            self.
+            // Memory must not be owned by `self.headers` as these may be modified concurrently:
+            assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
+                @ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
+
+            assert_bounds(.prepares, offset, buffer.len);
+            self.storage.read_sectors(
+                read_prepare_with_op_and_checksum_callback,
+                &read.completion,
+                buffer,
+                offset,
+            );
         }
 
-        fn
+        fn read_prepare_with_op_and_checksum_callback(completion: *Storage.Read) void {
             const read = @fieldParentPtr(Self.Read, "completion", completion);
             const self = read.self;
             const replica = @fieldParentPtr(Replica, "journal", self);
             const op = read.op;
             const checksum = read.checksum;
+            assert(self.recovered);
 
             defer {
                 replica.message_bus.unref(read.message);
@@ -571,43 +794,84 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 return;
             }
 
-
-
+            const checksum_inhabited = self.prepare_inhabited[self.slot_for_op(op).index];
+            const checksum_match = self.prepare_checksums[self.slot_for_op(op).index] == checksum;
+            if (!checksum_inhabited or !checksum_match) {
+                self.read_prepare_log(op, checksum, "prepare changed during read");
                 read.callback(replica, null, null);
                 return;
-            }
+            }
+
+            // Check that the `headers` slot belongs to the same op that it did when the read began.
+            // The slot may not match the Read's op/checksum due to either:
+            // * The in-memory header changed since the read began.
+            // * The in-memory header is reserved+faulty; the read was via `prepare_checksums`
+            const slot = self.slot_with_op_and_checksum(op, checksum);
 
             if (!read.message.header.valid_checksum()) {
-
-
+                if (slot) |s| {
+                    self.faulty.set(s);
+                    self.dirty.set(s);
+                }
 
                 self.read_prepare_log(op, checksum, "corrupt header after read");
                 read.callback(replica, null, null);
                 return;
             }
 
-
-
-
-
+            if (read.message.header.cluster != replica.cluster) {
+                // This could be caused by a misdirected read or write.
+                // Though when a prepare spans multiple sectors, a misdirected read/write will
+                // likely manifest as a checksum failure instead.
+                if (slot) |s| {
+                    self.faulty.set(s);
+                    self.dirty.set(s);
+                }
 
-                self.read_prepare_log(op, checksum, "
+                self.read_prepare_log(op, checksum, "wrong cluster");
                 read.callback(replica, null, null);
                 return;
             }
 
             if (read.message.header.op != op) {
+                // Possible causes:
+                // * The prepare was rewritten since the read began.
+                // * Misdirected read/write.
+                // * The combination of:
+                // * The leader is responding to a `request_prepare`.
+                // * The `request_prepare` did not include a checksum.
+                // * The requested op's slot is faulty, but the prepare is valid. Since the
+                // prepare is valid, WAL recovery set `prepare_checksums[slot]`. But on reading
+                // this entry it turns out not to have the right op.
+                // (This case (and the accompanying unnessary read) could be prevented by storing
+                // the op along with the checksum in `prepare_checksums`.)
+                assert(slot == null);
+
                 self.read_prepare_log(op, checksum, "op changed during read");
                 read.callback(replica, null, null);
                 return;
             }
 
             if (read.message.header.checksum != checksum) {
+                // This can also be caused by a misdirected read/write.
+                assert(slot == null);
+
                 self.read_prepare_log(op, checksum, "checksum changed during read");
                 read.callback(replica, null, null);
                 return;
             }
 
+            if (!read.message.header.valid_checksum_body(read.message.body())) {
+                if (slot) |s| {
+                    self.faulty.set(s);
+                    self.dirty.set(s);
+                }
+
+                self.read_prepare_log(op, checksum, "corrupt body after read");
+                read.callback(replica, null, null);
+                return;
+            }
+
             read.callback(replica, read.message, read.destination_replica);
         }
 
@@ -620,46 +884,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         pub fn recover(self: *Self) void {
             assert(!self.recovered);
+            assert(!self.recovering);
+            assert(self.dirty.count == slot_count);
+            assert(self.faulty.count == slot_count);
 
-            if (self.recovering) return;
             self.recovering = true;
 
             log.debug("{}: recover: recovering", .{self.replica});
 
-            self.recover_headers(0
-            self.recover_headers(0, 1);
+            self.recover_headers(0);
         }
 
-        fn recover_headers(self: *Self, offset: u64
+        fn recover_headers(self: *Self, offset: u64) void {
             const replica = @fieldParentPtr(Replica, "journal", self);
 
             assert(!self.recovered);
             assert(self.recovering);
+            assert(self.dirty.count == slot_count);
+            assert(self.faulty.count == slot_count);
 
-            if (offset ==
-                log.debug("{}: recover_headers:
-
-                    version,
-                });
-                if (self.reads.executing() == 0) {
-                    log.debug("{}: recover_headers: both versions recovered", .{self.replica});
-                    self.recovered = true;
-                    self.recovering = false;
-                    // The initialization op (TODO Snapshots):
-                    assert(!self.dirty.bit(0));
-                    assert(!self.faulty.bit(0));
-                    // From here it's over to the Recovery protocol from VRR 2012.
-                }
+            if (offset == headers_size) {
+                log.debug("{}: recover_headers: complete", .{self.replica});
+                self.recover_prepares(Slot{ .index = 0 });
                 return;
             }
-            assert(offset <
+            assert(offset < headers_size);
 
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
-            // We use the count of reads executing to know when both versions have finished reading:
             // We expect that no other process is issuing reads while we are recovering.
-            assert(self.reads.executing()
+            assert(self.reads.executing() == 0);
 
             const read = self.reads.acquire() orelse unreachable;
             read.* = .{
@@ -669,148 +924,545 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
669
924
|
.callback = undefined,
|
|
670
925
|
.op = undefined,
|
|
671
926
|
.checksum = offset,
|
|
672
|
-
.destination_replica =
|
|
927
|
+
.destination_replica = null,
|
|
673
928
|
};
|
|
674
929
|
|
|
675
|
-
const buffer =
|
|
930
|
+
const buffer = recover_headers_buffer(message, offset);
|
|
676
931
|
assert(buffer.len > 0);
|
|
677
932
|
|
|
678
|
-
log.debug("{}: recover_headers:
|
|
933
|
+
log.debug("{}: recover_headers: offset={} size={} recovering", .{
|
|
679
934
|
self.replica,
|
|
680
|
-
version,
|
|
681
935
|
offset,
|
|
682
936
|
buffer.len,
|
|
683
937
|
});
|
|
684
938
|
|
|
685
939
|
self.storage.read_sectors(
|
|
686
|
-
|
|
940
|
+
recover_headers_callback,
|
|
687
941
|
&read.completion,
|
|
688
942
|
buffer,
|
|
689
|
-
|
|
943
|
+
offset_physical_for_logical(.headers, offset),
|
|
690
944
|
);
|
|
691
945
|
}
|
|
692
946
|
|
|
693
|
-
fn
|
|
694
|
-
const max = std.math.min(message.buffer.len, self.size_headers - offset);
|
|
695
|
-
assert(max % config.sector_size == 0);
|
|
696
|
-
return message.buffer[0..max];
|
|
697
|
-
}
|
|
698
|
-
|
|
699
|
-
fn recover_headers_on_read(completion: *Storage.Read) void {
|
|
947
|
+
fn recover_headers_callback(completion: *Storage.Read) void {
|
|
700
948
|
const read = @fieldParentPtr(Self.Read, "completion", completion);
|
|
701
949
|
const self = read.self;
|
|
702
950
|
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
703
951
|
const message = read.message;
|
|
704
952
|
|
|
705
953
|
const offset = @intCast(u64, read.checksum);
|
|
706
|
-
const
|
|
707
|
-
const buffer = self.recover_headers_buffer(message, offset);
|
|
954
|
+
const buffer = recover_headers_buffer(message, offset);
|
|
708
955
|
|
|
709
|
-
log.debug("{}: recover_headers:
|
|
956
|
+
log.debug("{}: recover_headers: offset={} size={} recovered", .{
|
|
710
957
|
self.replica,
|
|
711
|
-
version,
|
|
712
958
|
offset,
|
|
713
959
|
buffer.len,
|
|
714
960
|
});
|
|
715
961
|
|
|
962
|
+
assert(!self.recovered);
|
|
963
|
+
assert(self.recovering);
|
|
716
964
|
assert(offset % @sizeOf(Header) == 0);
|
|
717
965
|
assert(buffer.len >= @sizeOf(Header));
|
|
718
966
|
assert(buffer.len % @sizeOf(Header) == 0);
|
|
967
|
+
assert(read.destination_replica == null);
|
|
968
|
+
assert(self.dirty.count == slot_count);
|
|
969
|
+
assert(self.faulty.count == slot_count);
|
|
970
|
+
|
|
971
|
+
// Directly store all the redundant headers in `self.headers_redundant` (including any
|
|
972
|
+
// that are invalid or corrupt). As the prepares are recovered, these will be replaced
|
|
973
|
+
// or removed as necessary.
|
|
974
|
+
const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
|
|
975
|
+
std.mem.copy(
|
|
976
|
+
Header,
|
|
977
|
+
self.headers_redundant[@divExact(offset, @sizeOf(Header))..][0..buffer_headers.len],
|
|
978
|
+
buffer_headers,
|
|
979
|
+
);
|
|
719
980
|
|
|
720
|
-
|
|
721
|
-
const op = offset / @sizeOf(Header) + index;
|
|
722
|
-
|
|
723
|
-
if (header.valid_checksum()) {
|
|
724
|
-
// This header is valid.
|
|
725
|
-
if (self.entry_for_op(op)) |existing| {
|
|
726
|
-
if (existing.checksum == header.checksum) {
|
|
727
|
-
// We also have the same header from the other version.
|
|
728
|
-
assert(!self.faulty.bit(op));
|
|
729
|
-
} else if (existing.command == .reserved) {
|
|
730
|
-
self.set_entry_as_dirty(header);
|
|
731
|
-
self.faulty.clear(op);
|
|
732
|
-
} else {
|
|
733
|
-
// Don't replace any existing op from the other version.
|
|
734
|
-
// First come, first served.
|
|
735
|
-
// We'll sort out the right order later when we recover higher up.
|
|
736
|
-
assert(!self.faulty.bit(op));
|
|
737
|
-
}
|
|
738
|
-
} else if (header.command == .reserved) {
|
|
739
|
-
self.dirty.set(op);
|
|
740
|
-
self.faulty.clear(op);
|
|
741
|
-
} else {
|
|
742
|
-
self.set_entry_as_dirty(header);
|
|
743
|
-
}
|
|
744
|
-
} else {
|
|
745
|
-
// This header is corrupt.
|
|
746
|
-
if (self.entry_for_op(op)) |_| {
|
|
747
|
-
// However, we have a valid header from the other version.
|
|
748
|
-
} else {
|
|
749
|
-
self.dirty.set(op);
|
|
750
|
-
self.faulty.set(op);
|
|
751
|
-
}
|
|
752
|
-
}
|
|
753
|
-
}
|
|
754
|
-
|
|
981
|
+
const offset_next = offset + buffer.len;
|
|
755
982
|
// We must release before we call `recover_headers()` in case Storage is synchronous.
|
|
756
983
|
// Otherwise, we would run out of messages and reads.
|
|
757
984
|
replica.message_bus.unref(read.message);
|
|
758
985
|
self.reads.release(read);
|
|
759
986
|
|
|
760
|
-
self.recover_headers(
|
|
987
|
+
self.recover_headers(offset_next);
|
|
761
988
|
}
|
|
762
989
|
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
990
|
+
fn recover_headers_buffer(message: *Message, offset: u64) []u8 {
|
|
991
|
+
const max = std.math.min(message.buffer.len, headers_size - offset);
|
|
992
|
+
assert(max % config.sector_size == 0);
|
|
993
|
+
assert(max % @sizeOf(Header) == 0);
|
|
994
|
+
return message.buffer[0..max];
|
|
995
|
+
}
|
|
996
|
+
|
|
997
|
+
fn recover_prepares(self: *Self, slot: Slot) void {
|
|
998
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
999
|
+
assert(!self.recovered);
|
|
1000
|
+
assert(self.recovering);
|
|
1001
|
+
assert(self.dirty.count == slot_count);
|
|
1002
|
+
assert(self.faulty.count == slot_count);
|
|
1003
|
+
// We expect that no other process is issuing reads while we are recovering.
|
|
1004
|
+
assert(self.reads.executing() == 0);
|
|
1005
|
+
|
|
1006
|
+
if (slot.index == slot_count) {
|
|
1007
|
+
self.recover_slots();
|
|
1008
|
+
return;
|
|
1009
|
+
}
|
|
1010
|
+
assert(slot.index < slot_count);
|
|
1011
|
+
|
|
1012
|
+
const message = replica.message_bus.get_message();
|
|
1013
|
+
defer replica.message_bus.unref(message);
|
|
1014
|
+
|
|
1015
|
+
const read = self.reads.acquire() orelse unreachable;
|
|
1016
|
+
read.* = .{
|
|
1017
|
+
.self = self,
|
|
1018
|
+
.completion = undefined,
|
|
1019
|
+
.message = message.ref(),
|
|
1020
|
+
.callback = undefined,
|
|
1021
|
+
.op = undefined,
|
|
1022
|
+
.checksum = slot.index,
|
|
1023
|
+
.destination_replica = null,
|
|
1024
|
+
};
|
|
1025
|
+
|
|
1026
|
+
log.debug("{}: recover_prepares: recovering slot={}", .{
|
|
768
1027
|
self.replica,
|
|
769
|
-
|
|
770
|
-
|
|
1028
|
+
slot.index,
|
|
1029
|
+
});
|
|
1030
|
+
|
|
1031
|
+
self.storage.read_sectors(
|
|
1032
|
+
recover_prepares_callback,
|
|
1033
|
+
&read.completion,
|
|
1034
|
+
// We load the entire message to verify that it isn't torn or corrupt.
|
|
1035
|
+
// We don't know the message's size, so use the entire buffer.
|
|
1036
|
+
message.buffer[0..config.message_size_max],
|
|
1037
|
+
offset_physical(.prepares, slot),
|
|
1038
|
+
);
|
|
1039
|
+
}
|
|
1040
|
+
|
|
1041
|
+
fn recover_prepares_callback(completion: *Storage.Read) void {
|
|
1042
|
+
const read = @fieldParentPtr(Self.Read, "completion", completion);
|
|
1043
|
+
const self = read.self;
|
|
1044
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1045
|
+
|
|
1046
|
+
assert(!self.recovered);
|
|
1047
|
+
assert(self.recovering);
|
|
1048
|
+
assert(self.dirty.count == slot_count);
|
|
1049
|
+
assert(self.faulty.count == slot_count);
|
|
1050
|
+
assert(read.destination_replica == null);
|
|
1051
|
+
|
|
1052
|
+
const slot = Slot{ .index = @intCast(u64, read.checksum) };
|
|
1053
|
+
assert(slot.index < slot_count);
|
|
1054
|
+
|
|
1055
|
+
// Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
|
|
1056
|
+
// to hold onto the whole message (just the header).
|
|
1057
|
+
if (read.message.header.valid_checksum() and
|
|
1058
|
+
read.message.header.valid_checksum_body(read.message.body()))
|
|
1059
|
+
{
|
|
1060
|
+
self.headers[slot.index] = read.message.header.*;
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
replica.message_bus.unref(read.message);
|
|
1064
|
+
self.reads.release(read);
|
|
1065
|
+
|
|
1066
|
+
self.recover_prepares(Slot{ .index = slot.index + 1 });
|
|
1067
|
+
}
|
|
1068
|
+
|
|
1069
|
+
/// When in doubt about whether a particular message was received, it must be marked as
|
|
1070
|
+
/// faulty to avoid nacking a prepare which was received then lost/misdirected/corrupted.
|
|
1071
|
+
///
|
|
1072
|
+
///
|
|
1073
|
+
/// There are two special cases where faulty slots must be carefully handled:
|
|
1074
|
+
///
|
|
1075
|
+
/// A) Redundant headers are written in batches. Slots that are marked faulty are written
|
|
1076
|
+
/// as invalid (zeroed). This ensures that if the replica crashes and recovers, the
|
|
1077
|
+
/// entries are still faulty rather than reserved.
|
|
1078
|
+
/// The recovery process must be conservative about which headers are stored in
|
|
1079
|
+
/// `journal.headers`. To understand why this is important, consider what happens if it did
|
|
1080
|
+
/// load the faulty header into `journal.headers`, and then reads it back after a restart:
|
|
1081
|
+
///
|
|
1082
|
+
/// 1. Suppose slot 8 is in case @D. Per the table below, mark slot 8 faulty.
|
|
1083
|
+
/// 2. Suppose slot 9 is also loaded as faulty.
|
|
1084
|
+
/// 3. Journal recovery finishes. The replica beings to repair its missing/broken messages.
|
|
1085
|
+
/// 4. VSR recovery protocol fetches the true prepare for slot 9.
|
|
1086
|
+
/// 5. The message from step 4 is written to slot 9 of the prepares.
|
|
1087
|
+
/// 6. The header from step 4 is written to slot 9 of the redundant headers.
|
|
1088
|
+
/// But writes to the redundant headers are done in batches of `headers_per_sector`!
|
|
1089
|
+
/// So if step 1 loaded slot 8's prepare header into `journal.headers`, slot 8's
|
|
1090
|
+
/// redundant header would be updated at the same time (in the same write) as slot 9.
|
|
1091
|
+
/// 7! Immediately after step 6's write finishes, suppose the replica crashes (e.g. due to
|
|
1092
|
+
/// power failure.
|
|
1093
|
+
/// 8! Journal recovery again — but now slot 8 is loaded *without* being marked faulty.
|
|
1094
|
+
/// So we may incorrectly nack slot 8's message.
|
|
1095
|
+
///
|
|
1096
|
+
/// Therefore, recovery will never load a header into a slot *and* mark that slot faulty.
|
|
1097
|
+
///
|
|
1098
|
+
///
|
|
1099
|
+
/// B) When replica_count=1, repairing broken/lost prepares over VSR is not an option,
|
|
1100
|
+
/// so if a message is faulty the replica will abort.
|
|
1101
|
+
///
|
|
1102
|
+
///
|
|
1103
|
+
/// Recovery decision table:
|
|
1104
|
+
///
|
|
1105
|
+
/// label @A @B @C @D @E @F @G @H @I @J @K @L @M @N
|
|
1106
|
+
/// header valid 0 1 1 0 0 0 1 1 1 1 1 1 1 1
|
|
1107
|
+
/// header reserved _ 1 0 _ _ _ 1 1 0 1 0 0 0 0
|
|
1108
|
+
/// prepare valid 0 0 0 1 1 1 1 1 1 1 1 1 1 1
|
|
1109
|
+
/// prepare reserved _ _ _ 1 0 0 0 0 1 1 0 0 0 0
|
|
1110
|
+
/// prepare.op is maximum _ _ _ _ 0 1 0 1 _ _ _ _ _ _
|
|
1111
|
+
/// match checksum _ _ _ _ _ _ _ _ _ !1 0 0 0 1
|
|
1112
|
+
/// match op _ _ _ _ _ _ _ _ _ !1 < > 1 !1
|
|
1113
|
+
/// match view _ _ _ _ _ _ _ _ _ !1 _ _ !0 !1
|
|
1114
|
+
/// decision (replicas>1) vsr vsr vsr vsr vsr fix vsr fix vsr nil fix vsr vsr eql
|
|
1115
|
+
/// decision (replicas=1) fix fix
|
|
1116
|
+
///
|
|
1117
|
+
/// Legend:
|
|
1118
|
+
///
|
|
1119
|
+
/// 0 false
|
|
1120
|
+
/// 1 true
|
|
1121
|
+
/// !0 assert false
|
|
1122
|
+
/// !1 assert true
|
|
1123
|
+
/// _ ignore
|
|
1124
|
+
/// < header.op < prepare.op
|
|
1125
|
+
/// > header.op > prepare.op
|
|
1126
|
+
/// eql The header and prepare are identical; no repair necessary.
|
|
1127
|
+
/// nil Reserved; dirty/faulty are clear, no repair necessary.
|
|
1128
|
+
/// fix When replicas=1, use intact prepare. When replicas>1, use VSR `request_prepare`.
|
|
1129
|
+
/// vsr Repair with VSR `request_prepare`.
|
|
1130
|
+
///
|
|
1131
|
+
/// A "valid" header/prepare:
|
|
1132
|
+
/// 1. has a valid checksum
|
|
1133
|
+
/// 2. has the correct cluster
|
|
1134
|
+
/// 3. is in the correct slot (op % slot_count)
|
|
1135
|
+
/// 4. has command=reserved or command=prepare
|
|
1136
|
+
fn recover_slots(self: *Self) void {
|
|
1137
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1138
|
+
|
|
1139
|
+
assert(!self.recovered);
|
|
1140
|
+
assert(self.recovering);
|
|
1141
|
+
assert(self.reads.executing() == 0);
|
|
1142
|
+
assert(self.writes.executing() == 0);
|
|
1143
|
+
assert(self.dirty.count == slot_count);
|
|
1144
|
+
assert(self.faulty.count == slot_count);
|
|
1145
|
+
|
|
1146
|
+
const prepare_op_max = std.math.max(
|
|
1147
|
+
replica.op_checkpoint,
|
|
1148
|
+
op_maximum_headers_untrusted(replica.cluster, self.headers),
|
|
1149
|
+
);
|
|
1150
|
+
|
|
1151
|
+
var cases: [slot_count]*const Case = undefined;
|
|
1152
|
+
|
|
1153
|
+
for (self.headers) |_, index| {
|
|
1154
|
+
const slot = Slot{ .index = index };
|
|
1155
|
+
const header = header_ok(replica.cluster, slot, &self.headers_redundant[index]);
|
|
1156
|
+
const prepare = header_ok(replica.cluster, slot, &self.headers[index]);
|
|
1157
|
+
|
|
1158
|
+
cases[index] = recovery_case(header, prepare, prepare_op_max);
|
|
1159
|
+
|
|
1160
|
+
// `prepare_checksums` improves the availability of `request_prepare` by being more
|
|
1161
|
+
// flexible than `headers` regarding the prepares it references. It may hold a
|
|
1162
|
+
// prepare whose redundant header is broken, as long as the prepare itself is valid.
|
|
1163
|
+
if (prepare != null and prepare.?.command == .prepare) {
|
|
1164
|
+
assert(!self.prepare_inhabited[index]);
|
|
1165
|
+
self.prepare_inhabited[index] = true;
|
|
1166
|
+
self.prepare_checksums[index] = prepare.?.checksum;
|
|
1167
|
+
}
|
|
1168
|
+
}
|
|
1169
|
+
assert(self.headers.len == cases.len);
|
|
1170
|
+
|
|
1171
|
+
// Refine cases @B and @C: Repair (truncate) a prepare if it was torn during a crash.
|
|
1172
|
+
if (self.recover_torn_prepare(&cases)) |torn_slot| {
|
|
1173
|
+
assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
|
|
1174
|
+
cases[torn_slot.index] = &case_cut;
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
for (cases) |case, index| self.recover_slot(Slot{ .index = index }, case);
|
|
1178
|
+
assert(cases.len == slot_count);
|
|
1179
|
+
|
|
1180
|
+
log.debug("{}: recover_slots: dirty={} faulty={}", .{
|
|
1181
|
+
self.replica,
|
|
1182
|
+
self.dirty.count,
|
|
1183
|
+
self.faulty.count,
|
|
771
1184
|
});
|
|
772
1185
|
|
|
773
|
-
|
|
774
|
-
|
|
1186
|
+
self.recovered = true;
|
|
1187
|
+
self.recovering = false;
|
|
1188
|
+
self.assert_recovered();
|
|
1189
|
+
// From here it's over to the Recovery protocol from VRR 2012.
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
/// Returns a slot that is safe to truncate.
|
|
1193
|
+
//
|
|
1194
|
+
/// Truncate any prepare that was torn while being appended to the log before a crash, when:
|
|
1195
|
+
/// * the maximum valid op is the same in the prepare headers and redundant headers,
|
|
1196
|
+
/// * in the slot following the maximum valid op:
|
|
1197
|
+
/// - the redundant header is valid,
|
|
1198
|
+
/// - the redundant header is reserved, and/or the op is at least a log cycle behind,
|
|
1199
|
+
/// - the prepare is corrupt, and
|
|
1200
|
+
/// * there are no faults except for those between `op_checkpoint` and `op_max + 1`,
|
|
1201
|
+
/// so that we can be sure that the maximum valid op is in fact the maximum.
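A small standalone sketch of the arithmetic described above (the slot_count of 8 is again a hypothetical stand-in): the torn candidate is the op one past the maximum valid op, and it wraps into its slot modulo the slot count.

    const std = @import("std");

    // Hypothetical value for illustration; the real slot_count comes from config.zig.
    const slot_count_example: u64 = 8;

    // The candidate torn prepare is the op that would have been appended next.
    fn torn_slot_index(op_max: u64) u64 {
        const torn_op = op_max + 1;
        return torn_op % slot_count_example;
    }

    test "torn slot sketch" {
        // If the maximum valid op is 11, the torn candidate is op 12, which wraps to slot 4.
        try std.testing.expectEqual(@as(u64, 4), torn_slot_index(11));
        // An empty journal (op_max=0) has its torn candidate at slot 1.
        try std.testing.expectEqual(@as(u64, 1), torn_slot_index(0));
    }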
|
|
1202
|
+
fn recover_torn_prepare(self: *const Self, cases: []const *const Case) ?Slot {
|
|
1203
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1204
|
+
|
|
1205
|
+
assert(!self.recovered);
|
|
1206
|
+
assert(self.recovering);
|
|
1207
|
+
assert(self.dirty.count == slot_count);
|
|
1208
|
+
assert(self.faulty.count == slot_count);
|
|
1209
|
+
|
|
1210
|
+
const op_max = op_maximum_headers_untrusted(replica.cluster, self.headers_redundant);
|
|
1211
|
+
if (op_max != op_maximum_headers_untrusted(replica.cluster, self.headers)) return null;
|
|
1212
|
+
if (op_max < replica.op_checkpoint) return null;
|
|
1213
|
+
// We can't assume that the header at `op_max` is a prepare — an empty journal with a
|
|
1214
|
+
// corrupt root prepare (op_max=0) will be repaired later.
|
|
1215
|
+
|
|
1216
|
+
const torn_op = op_max + 1;
|
|
1217
|
+
const torn_slot = self.slot_for_op(torn_op);
|
|
1218
|
+
|
|
1219
|
+
const torn_prepare_untrusted = &self.headers[torn_slot.index];
|
|
1220
|
+
if (torn_prepare_untrusted.valid_checksum()) return null;
|
|
1221
|
+
// The prepare is at least corrupt, possibly torn, but not valid and simply misdirected.
|
|
1222
|
+
|
|
1223
|
+
const header_untrusted = &self.headers_redundant[torn_slot.index];
|
|
1224
|
+
const header = header_ok(replica.cluster, torn_slot, header_untrusted) orelse return null;
|
|
1225
|
+
// The redundant header is valid, also for the correct cluster and not misdirected.
|
|
1226
|
+
|
|
1227
|
+
if (header.command == .prepare) {
|
|
1228
|
+
// The redundant header was already written, so the prepare is corrupt, not torn.
|
|
1229
|
+
if (header.op == torn_op) return null;
|
|
1230
|
+
|
|
1231
|
+
assert(header.op < torn_op); // Since torn_op > op_max.
|
|
1232
|
+
// The redundant header is from any previous log cycle.
|
|
1233
|
+
} else {
|
|
1234
|
+
assert(header.command == .reserved);
|
|
1235
|
+
|
|
1236
|
+
// This is the first log cycle.
|
|
1237
|
+
|
|
1238
|
+
// TODO Can we be more sure about this? What if op_max is clearly many cycles ahead?
|
|
1239
|
+
// Any previous log cycle is then expected to have a prepare, not a reserved header,
|
|
1240
|
+
// unless the prepare header was lost, in which case this slot may also not be torn.
|
|
1241
|
+
}
|
|
1242
|
+
|
|
1243
|
+
const checkpoint_index = self.slot_for_op(replica.op_checkpoint).index;
|
|
1244
|
+
if (checkpoint_index == torn_slot.index) {
|
|
1245
|
+
// The checkpoint and the torn op are in the same slot.
|
|
1246
|
+
assert(cases[checkpoint_index].decision(replica.replica_count) == .vsr);
|
|
1247
|
+
assert(slot_count > 1);
|
|
1248
|
+
assert(op_max >= replica.op_checkpoint);
|
|
1249
|
+
assert(torn_op == op_max + 1);
|
|
1250
|
+
assert(torn_op > replica.op_checkpoint);
|
|
1251
|
+
return null;
|
|
1252
|
+
}
|
|
1253
|
+
|
|
1254
|
+
const known_range = SlotRange{
|
|
1255
|
+
.head = Slot{ .index = checkpoint_index },
|
|
1256
|
+
.tail = torn_slot,
|
|
1257
|
+
};
|
|
1258
|
+
|
|
1259
|
+
// We must be certain that the torn prepare really was being appended to the WAL.
|
|
1260
|
+
// Return if any faults do not lie between the checkpoint and the torn prepare, such as:
|
|
1261
|
+
//
|
|
1262
|
+
// (fault [checkpoint..........torn] fault)
|
|
1263
|
+
// (...torn] fault fault [checkpoint......)
|
|
1264
|
+
for (cases) |case, index| {
|
|
1265
|
+
// Do not use `faulty.bit()` because the decisions have not been processed yet.
|
|
1266
|
+
if (case.decision(replica.replica_count) == .vsr and
|
|
1267
|
+
!known_range.contains(Slot{ .index = index }))
|
|
1268
|
+
{
|
|
1269
|
+
return null;
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
1272
|
+
|
|
1273
|
+
// The prepare is torn.
|
|
1274
|
+
assert(!self.prepare_inhabited[torn_slot.index]);
|
|
1275
|
+
assert(!torn_prepare_untrusted.valid_checksum());
|
|
1276
|
+
assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
|
|
1277
|
+
return torn_slot;
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
fn recover_slot(self: *Self, slot: Slot, case: *const Case) void {
|
|
1281
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1282
|
+
const cluster = replica.cluster;
|
|
1283
|
+
|
|
1284
|
+
assert(!self.recovered);
|
|
1285
|
+
assert(self.recovering);
|
|
1286
|
+
assert(self.dirty.bit(slot));
|
|
1287
|
+
assert(self.faulty.bit(slot));
|
|
1288
|
+
|
|
1289
|
+
const header = header_ok(cluster, slot, &self.headers_redundant[slot.index]);
|
|
1290
|
+
const prepare = header_ok(cluster, slot, &self.headers[slot.index]);
|
|
1291
|
+
const decision = case.decision(replica.replica_count);
|
|
1292
|
+
switch (decision) {
|
|
1293
|
+
.eql => {
|
|
1294
|
+
assert(header.?.command == .prepare);
|
|
1295
|
+
assert(prepare.?.command == .prepare);
|
|
1296
|
+
assert(header.?.checksum == prepare.?.checksum);
|
|
1297
|
+
assert(self.prepare_inhabited[slot.index]);
|
|
1298
|
+
assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
|
|
1299
|
+
self.headers[slot.index] = header.?.*;
|
|
1300
|
+
self.dirty.clear(slot);
|
|
1301
|
+
self.faulty.clear(slot);
|
|
1302
|
+
},
|
|
1303
|
+
.nil => {
|
|
1304
|
+
assert(header.?.command == .reserved);
|
|
1305
|
+
assert(prepare.?.command == .reserved);
|
|
1306
|
+
assert(header.?.checksum == prepare.?.checksum);
|
|
1307
|
+
assert(header.?.checksum == Header.reserved(cluster, slot.index).checksum);
|
|
1308
|
+
assert(!self.prepare_inhabited[slot.index]);
|
|
1309
|
+
assert(self.prepare_checksums[slot.index] == 0);
|
|
1310
|
+
self.headers[slot.index] = header.?.*;
|
|
1311
|
+
self.dirty.clear(slot);
|
|
1312
|
+
self.faulty.clear(slot);
|
|
1313
|
+
},
|
|
1314
|
+
.fix => {
|
|
1315
|
+
// TODO Perhaps we should have 3 separate branches here for the different cases.
|
|
1316
|
+
// The header may be valid or invalid.
|
|
1317
|
+
// The header may be reserved or a prepare.
|
|
1318
|
+
assert(prepare.?.command == .prepare);
|
|
1319
|
+
assert(self.prepare_inhabited[slot.index]);
|
|
1320
|
+
assert(self.prepare_checksums[slot.index] == prepare.?.checksum);
|
|
1321
|
+
|
|
1322
|
+
self.headers[slot.index] = prepare.?.*;
|
|
1323
|
+
self.faulty.clear(slot);
|
|
1324
|
+
if (replica.replica_count == 1) {
|
|
1325
|
+
// @E, @F, @G, @H, @K:
|
|
1326
|
+
self.dirty.clear(slot);
|
|
1327
|
+
// TODO Repair header on disk to restore durability.
|
|
1328
|
+
} else {
|
|
1329
|
+
// @F, @H, @K:
|
|
1330
|
+
// TODO Repair without retrieving remotely (i.e. don't set dirty or faulty).
|
|
1331
|
+
assert(self.dirty.bit(slot));
|
|
1332
|
+
}
|
|
1333
|
+
},
|
|
1334
|
+
.vsr => {
|
|
1335
|
+
self.headers[slot.index] = Header.reserved(cluster, slot.index);
|
|
1336
|
+
assert(self.dirty.bit(slot));
|
|
1337
|
+
assert(self.faulty.bit(slot));
|
|
1338
|
+
},
|
|
1339
|
+
.cut => {
|
|
1340
|
+
assert(header != null);
|
|
1341
|
+
assert(prepare == null);
|
|
1342
|
+
assert(!self.prepare_inhabited[slot.index]);
|
|
1343
|
+
assert(self.prepare_checksums[slot.index] == 0);
|
|
1344
|
+
self.headers[slot.index] = Header.reserved(cluster, slot.index);
|
|
1345
|
+
self.dirty.clear(slot);
|
|
1346
|
+
self.faulty.clear(slot);
|
|
1347
|
+
},
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
switch (decision) {
|
|
1351
|
+
.eql, .nil => {
|
|
1352
|
+
log.debug("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
|
|
1353
|
+
self.replica,
|
|
1354
|
+
slot.index,
|
|
1355
|
+
case.label,
|
|
1356
|
+
@tagName(decision),
|
|
1357
|
+
});
|
|
1358
|
+
},
|
|
1359
|
+
.fix, .vsr, .cut => {
|
|
1360
|
+
log.warn("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
|
|
1361
|
+
self.replica,
|
|
1362
|
+
slot.index,
|
|
1363
|
+
case.label,
|
|
1364
|
+
@tagName(decision),
|
|
1365
|
+
});
|
|
1366
|
+
},
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
|
|
1370
|
+
fn assert_recovered(self: *const Self) void {
|
|
1371
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1372
|
+
|
|
1373
|
+
assert(self.recovered);
|
|
1374
|
+
assert(!self.recovering);
|
|
775
1375
|
|
|
776
|
-
|
|
777
|
-
self.
|
|
778
|
-
self.faulty.
|
|
1376
|
+
assert(self.dirty.count <= slot_count);
|
|
1377
|
+
assert(self.faulty.count <= slot_count);
|
|
1378
|
+
assert(self.faulty.count <= self.dirty.count);
|
|
1379
|
+
|
|
1380
|
+
// Abort if all slots are faulty, since something is very wrong.
|
|
1381
|
+
if (self.faulty.count == slot_count) @panic("WAL is completely corrupt");
|
|
1382
|
+
if (self.faulty.count > 0 and replica.replica_count == 1) @panic("WAL is corrupt");
|
|
1383
|
+
|
|
1384
|
+
if (self.headers[0].op == 0 and self.headers[0].command == .prepare) {
|
|
1385
|
+
assert(self.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
|
|
1386
|
+
assert(!self.faulty.bit(Slot{ .index = 0 }));
|
|
1387
|
+
}
|
|
1388
|
+
|
|
1389
|
+
for (self.headers) |*header, index| {
|
|
1390
|
+
assert(header.valid_checksum());
|
|
1391
|
+
assert(header.cluster == replica.cluster);
|
|
1392
|
+
if (header.command == .reserved) {
|
|
1393
|
+
assert(header.op == index);
|
|
1394
|
+
} else {
|
|
1395
|
+
assert(header.command == .prepare);
|
|
1396
|
+
assert(header.op % slot_count == index);
|
|
1397
|
+
assert(self.prepare_inhabited[index]);
|
|
1398
|
+
assert(self.prepare_checksums[index] == header.checksum);
|
|
1399
|
+
assert(!self.faulty.bit(Slot{ .index = index }));
|
|
1400
|
+
}
|
|
1401
|
+
}
|
|
779
1402
|
}
|
|
780
1403
|
|
|
781
1404
|
/// Removes entries from `op_min` (inclusive) onwards.
|
|
782
|
-
///
|
|
1405
|
+
/// Used after a view change to remove uncommitted entries discarded by the new leader.
|
|
783
1406
|
pub fn remove_entries_from(self: *Self, op_min: u64) void {
|
|
784
|
-
|
|
785
|
-
|
|
1407
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
1408
|
+
|
|
1409
|
+
assert(self.recovered);
|
|
786
1410
|
assert(op_min > 0);
|
|
1411
|
+
|
|
787
1412
|
log.debug("{}: remove_entries_from: op_min={}", .{ self.replica, op_min });
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
1413
|
+
|
|
1414
|
+
for (self.headers) |*header, index| {
|
|
1415
|
+
// We must remove the header regardless of whether it is a prepare or reserved,
|
|
1416
|
+
// since a reserved header may have been marked faulty for case @G, and
|
|
1417
|
+
// since the caller expects the WAL to be truncated, with clean slots.
|
|
1418
|
+
if (header.op >= op_min) {
|
|
1419
|
+
// TODO Explore scenarios where the data on disk may resurface after a crash.
|
|
1420
|
+
const slot = self.slot_for_op(header.op);
|
|
1421
|
+
assert(slot.index == index);
|
|
1422
|
+
self.headers[slot.index] = Header.reserved(replica.cluster, slot.index);
|
|
1423
|
+
self.dirty.clear(slot);
|
|
1424
|
+
self.faulty.clear(slot);
|
|
1425
|
+
// Do not clear `prepare_inhabited`/`prepare_checksums`. The prepare is
|
|
1426
|
+
// untouched on disk, and may be useful later. Consider this scenario:
|
|
1427
|
+
//
|
|
1428
|
+
// 1. Op 4 is received; start writing it.
|
|
1429
|
+
// 2. Op 4's prepare is written (setting `prepare_checksums`), start writing
|
|
1430
|
+
// the headers.
|
|
1431
|
+
// 3. View change. Op 4 is discarded by `remove_entries_from`.
|
|
1432
|
+
// 4. View change. Op 4 (the same one from before) is back, marked as dirty. But
|
|
1433
|
+
// we don't start a write, because `journal.writing()` says it is already in
|
|
1434
|
+
// progress.
|
|
1435
|
+
// 5. Op 4's header write finishes (`write_prepare_on_write_header`).
|
|
1436
|
+
//
|
|
1437
|
+
// If `remove_entries_from` cleared `prepare_checksums`,
|
|
1438
|
+
// `write_prepare_on_write_header` would clear `dirty`/`faulty` for a slot with
|
|
1439
|
+
// `prepare_inhabited=false`.
|
|
791
1440
|
}
|
|
792
1441
|
}
|
|
793
|
-
self.assert_headers_reserved_from(op_min);
|
|
794
|
-
// TODO At startup we need to handle entries that may have been removed but now reappear.
|
|
795
|
-
// This is because we do not call `write_headers_between()` here.
|
|
796
1442
|
}
|
|
797
1443
|
|
|
798
|
-
pub fn
|
|
799
|
-
|
|
1444
|
+
pub fn set_header_as_dirty(self: *Self, header: *const Header) void {
|
|
1445
|
+
assert(self.recovered);
|
|
1446
|
+
assert(header.command == .prepare);
|
|
1447
|
+
|
|
1448
|
+
log.debug("{}: set_header_as_dirty: op={} checksum={}", .{
|
|
800
1449
|
self.replica,
|
|
801
1450
|
header.op,
|
|
802
1451
|
header.checksum,
|
|
803
1452
|
});
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
1453
|
+
const slot = self.slot_for_header(header);
|
|
1454
|
+
|
|
1455
|
+
if (self.has(header)) {
|
|
1456
|
+
assert(self.dirty.bit(slot));
|
|
1457
|
+
// Do not clear any faulty bit for the same entry.
|
|
1458
|
+
} else {
|
|
1459
|
+
self.headers[slot.index] = header.*;
|
|
1460
|
+
self.dirty.set(slot);
|
|
1461
|
+
self.faulty.clear(slot);
|
|
808
1462
|
}
|
|
809
|
-
self.headers[header.op] = header.*;
|
|
810
|
-
self.dirty.set(header.op);
|
|
811
|
-
// Do not clear any faulty bit for the same entry.
|
|
812
1463
|
}
|
|
813
1464
|
|
|
1465
|
+
/// `write_prepare` uses `write_sectors` to prevent concurrent disk writes.
|
|
814
1466
|
pub fn write_prepare(
|
|
815
1467
|
self: *Self,
|
|
816
1468
|
callback: fn (self: *Replica, wrote: ?*Message, trigger: Write.Trigger) void,
|
|
@@ -819,17 +1471,23 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
819
1471
|
) void {
|
|
820
1472
|
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
821
1473
|
|
|
1474
|
+
assert(self.recovered);
|
|
822
1475
|
assert(message.header.command == .prepare);
|
|
823
1476
|
assert(message.header.size >= @sizeOf(Header));
|
|
824
1477
|
assert(message.header.size <= message.buffer.len);
|
|
1478
|
+
assert(self.has(message.header));
|
|
825
1479
|
|
|
826
1480
|
// The underlying header memory must be owned by the buffer and not by self.headers:
|
|
827
1481
|
// Otherwise, concurrent writes may modify the memory of the pointer while we write.
|
|
828
1482
|
assert(@ptrToInt(message.header) == @ptrToInt(message.buffer.ptr));
|
|
829
1483
|
|
|
830
|
-
|
|
1484
|
+
const slot = self.slot_with_header(message.header).?;
|
|
1485
|
+
|
|
1486
|
+
if (!self.dirty.bit(slot)) {
|
|
831
1487
|
// Any function that sets the faulty bit should also set the dirty bit:
|
|
832
|
-
assert(!self.faulty.bit(
|
|
1488
|
+
assert(!self.faulty.bit(slot));
|
|
1489
|
+
assert(self.prepare_inhabited[slot.index]);
|
|
1490
|
+
assert(self.prepare_checksums[slot.index] == message.header.checksum);
|
|
833
1491
|
self.write_prepare_debug(message.header, "skipping (clean)");
|
|
834
1492
|
callback(replica, message, trigger);
|
|
835
1493
|
return;
|
|
@@ -854,22 +1512,21 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
854
1512
|
};
|
|
855
1513
|
|
|
856
1514
|
// Slice the message to the nearest sector, we don't want to write the whole buffer:
|
|
857
|
-
const
|
|
858
|
-
|
|
1515
|
+
const buffer = message.buffer[0..vsr.sector_ceil(message.header.size)];
|
|
1516
|
+
const offset = offset_physical(.prepares, slot);
|
|
859
1517
|
|
|
860
1518
|
if (builtin.mode == .Debug) {
|
|
861
1519
|
// Assert that any sector padding has already been zeroed:
|
|
862
|
-
var sum_of_sector_padding_bytes:
|
|
863
|
-
for (
|
|
1520
|
+
var sum_of_sector_padding_bytes: u8 = 0;
|
|
1521
|
+
for (buffer[message.header.size..]) |byte| sum_of_sector_padding_bytes |= byte;
|
|
864
1522
|
assert(sum_of_sector_padding_bytes == 0);
|
|
865
1523
|
}
|
|
866
1524
|
|
|
867
|
-
self.
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
);
|
|
1525
|
+
self.prepare_inhabited[slot.index] = false;
|
|
1526
|
+
self.prepare_checksums[slot.index] = 0;
|
|
1527
|
+
|
|
1528
|
+
assert_bounds(.prepares, offset, buffer.len);
|
|
1529
|
+
self.write_sectors(write_prepare_header, write, buffer, offset);
|
|
873
1530
|
}
|
|
874
1531
|
|
|
875
1532
|
/// Attempt to lock the in-memory sector containing the header being written.
|
|
@@ -877,8 +1534,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
877
1534
|
fn write_prepare_header(write: *Self.Write) void {
|
|
878
1535
|
const self = write.self;
|
|
879
1536
|
const message = write.message;
|
|
1537
|
+
assert(self.recovered);
|
|
880
1538
|
|
|
881
|
-
if (
|
|
1539
|
+
if (self.slot_with_op_and_checksum(message.header.op, message.header.checksum)) |slot| {
|
|
1540
|
+
assert(!self.prepare_inhabited[slot.index]);
|
|
1541
|
+
self.prepare_inhabited[slot.index] = true;
|
|
1542
|
+
self.prepare_checksums[slot.index] = message.header.checksum;
|
|
1543
|
+
} else {
|
|
882
1544
|
self.write_prepare_debug(message.header, "entry changed while writing sectors");
|
|
883
1545
|
self.write_prepare_release(write, null);
|
|
884
1546
|
return;
|
|
@@ -887,14 +1549,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
887
1549
|
assert(!write.header_sector_locked);
|
|
888
1550
|
assert(write.header_sector_next == null);
|
|
889
1551
|
|
|
1552
|
+
const write_offset = self.offset_logical_in_headers_for_message(message);
|
|
1553
|
+
|
|
890
1554
|
var it = self.writes.iterate();
|
|
891
1555
|
while (it.next()) |other| {
|
|
892
1556
|
if (other == write) continue;
|
|
893
1557
|
if (!other.header_sector_locked) continue;
|
|
894
1558
|
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
other
|
|
1559
|
+
const other_offset = self.offset_logical_in_headers_for_message(other.message);
|
|
1560
|
+
if (other_offset == write_offset) {
|
|
1561
|
+
// The `other` and `write` target the same sector; append to the list.
|
|
1562
|
+
var tail = other;
|
|
1563
|
+
while (tail.header_sector_next) |next| tail = next;
|
|
1564
|
+
tail.header_sector_next = write;
|
|
898
1565
|
return;
|
|
899
1566
|
}
|
|
900
1567
|
}
|
|
@@ -904,6 +1571,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
904
1571
|
}
|
|
905
1572
|
|
|
906
1573
|
fn write_prepare_on_lock_header_sector(self: *Self, write: *Write) void {
|
|
1574
|
+
assert(self.recovered);
|
|
907
1575
|
assert(write.header_sector_locked);
|
|
908
1576
|
|
|
909
1577
|
// TODO It's possible within this section that the header has since been replaced but we
|
|
@@ -912,13 +1580,66 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
912
1580
|
// For this, we'll need to have a way to tweak write_prepare_release() to release locks.
|
|
913
1581
|
// At present, we don't return early here simply because it doesn't yet do that.
|
|
914
1582
|
|
|
1583
|
+
const replica = @fieldParentPtr(Replica, "journal", self);
|
|
915
1584
|
const message = write.message;
|
|
916
|
-
const
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
);
|
|
1585
|
+
const slot_of_message = self.slot_for_header(message.header);
|
|
1586
|
+
const slot_first = Slot{
|
|
1587
|
+
.index = @divFloor(slot_of_message.index, headers_per_sector) * headers_per_sector,
|
|
1588
|
+
};
|
|
1589
|
+
|
|
1590
|
+
const offset = offset_physical(.headers, slot_of_message);
|
|
1591
|
+
assert(offset % config.sector_size == 0);
|
|
1592
|
+
assert(offset == slot_first.index * @sizeOf(Header));
|
|
1593
|
+
|
|
1594
|
+
const buffer: []u8 = write.header_sector(self);
|
|
1595
|
+
const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
|
|
1596
|
+
assert(buffer_headers.len == headers_per_sector);
|
|
1597
|
+
|
|
1598
|
+
var i: usize = 0;
|
|
1599
|
+
while (i < headers_per_sector) : (i += 1) {
|
|
1600
|
+
const slot = Slot{ .index = slot_first.index + i };
|
|
1601
|
+
|
|
1602
|
+
if (self.faulty.bit(slot)) {
|
|
1603
|
+
// Redundant faulty headers are deliberately written as invalid.
|
|
1604
|
+
// This ensures that faulty headers are still faulty when they are read back
|
|
1605
|
+
// from disk during recovery. This prevents faulty entries from changing to
|
|
1606
|
+
// reserved (and clean) after a crash and restart (e.g. accidentally converting
|
|
1607
|
+
// a case `@D` to a `@J` after a restart).
|
|
1608
|
+
buffer_headers[i] = .{
|
|
1609
|
+
.checksum = 0,
|
|
1610
|
+
.cluster = replica.cluster,
|
|
1611
|
+
.command = .reserved,
|
|
1612
|
+
};
|
|
1613
|
+
assert(!buffer_headers[i].valid_checksum());
|
|
1614
|
+
} else if (message.header.op < slot_count and
|
|
1615
|
+
!self.prepare_inhabited[slot.index] and
|
|
1616
|
+
message.header.command == .prepare and
|
|
1617
|
+
self.dirty.bit(slot))
|
|
1618
|
+
{
|
|
1619
|
+
// When:
|
|
1620
|
+
// * this is the first wrap of the WAL, and
|
|
1621
|
+
// * this prepare slot is not inhabited (never has been), and
|
|
1622
|
+
// * this prepare slot is a dirty prepare,
|
|
1623
|
+
// write a reserved header instead of the in-memory prepare header.
|
|
1624
|
+
//
|
|
1625
|
+
// This can be triggered by the following sequence of events:
|
|
1626
|
+
// 1. Ops 6 and 7 arrive.
|
|
1627
|
+
// 2. The write of prepare 7 finishes (before prepare 6).
|
|
1628
|
+
// 3. Op 7 continues on to write the redundant headers.
|
|
1629
|
+
// Because prepare 6 is not yet written, header 6 is written as reserved.
|
|
1630
|
+
// 4. (If at this point the replica crashes & restarts, slot 6 is in case `@J`
|
|
1631
|
+
// (decision=nil) which can be locally repaired. In contrast, if op 6's
|
|
1632
|
+
// header was written in step 3, it would be case `@I`, which requires
|
|
1633
|
+
// remote repair.)
|
|
1634
|
+
//
|
|
1635
|
+
// * When `replica_count=1`, case `@I` is not recoverable.
|
|
1636
|
+
// * When `replica_count>1` this marginally improves availability by enabling
|
|
1637
|
+
// local repair.
|
|
1638
|
+
buffer_headers[i] = Header.reserved(replica.cluster, slot.index);
|
|
1639
|
+
} else {
|
|
1640
|
+
buffer_headers[i] = self.headers[slot.index];
|
|
1641
|
+
}
|
|
1642
|
+
}
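For reference, a quick standalone sketch of the sector grouping that this header write relies on (not part of the diff). The 4 KiB sector and 128-byte header, i.e. 32 headers per sector, are illustrative assumptions; the real constants come from config.zig and vsr.zig.

    const std = @import("std");

    // Illustrative sizes only; the real constants live in config.zig.
    const sector_size_example: u64 = 4096;
    const header_size_example: u64 = 128;
    const headers_per_sector_example: u64 = sector_size_example / header_size_example; // 32

    // The first slot of the sector that a given slot's redundant header belongs to.
    fn sector_first_slot(slot_index: u64) u64 {
        return @divFloor(slot_index, headers_per_sector_example) * headers_per_sector_example;
    }

    test "headers per sector sketch" {
        // Slot 37's redundant header is written together with slots 32..63.
        try std.testing.expectEqual(@as(u64, 32), sector_first_slot(37));
        // Slot 31 is still in the first sector.
        try std.testing.expectEqual(@as(u64, 0), sector_first_slot(31));
    }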
|
|
922
1643
|
|
|
923
1644
|
log.debug("{}: write_header: op={} sectors[{}..{}]", .{
|
|
924
1645
|
self.replica,
|
|
@@ -927,35 +1648,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
927
1648
|
offset + config.sector_size,
|
|
928
1649
|
});
|
|
929
1650
|
|
|
930
|
-
//
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
self.write_prepare_header_to_version(write, write_prepare_on_write_header, version, write.header_sector(self), offset);
|
|
934
|
-
} else {
|
|
935
|
-
// Versions must be incremented upfront:
|
|
936
|
-
// If we don't increment upfront we could end up writing to the same copy twice.
|
|
937
|
-
// We would then lose the redundancy required to locate headers or even overwrite all copies.
|
|
938
|
-
const version = self.write_headers_increment_version();
|
|
939
|
-
_ = self.write_headers_increment_version();
|
|
940
|
-
switch (version) {
|
|
941
|
-
0 => self.write_prepare_header_to_version(write, write_prepare_on_write_header_version_0, 0, write.header_sector(self), offset),
|
|
942
|
-
1 => self.write_prepare_header_to_version(write, write_prepare_on_write_header_version_1, 1, write.header_sector(self), offset),
|
|
943
|
-
}
|
|
944
|
-
}
|
|
945
|
-
}
|
|
946
|
-
|
|
947
|
-
fn write_prepare_on_write_header_version_0(write: *Self.Write) void {
|
|
948
|
-
const self = write.self;
|
|
949
|
-
const offset = write_prepare_header_offset(write.message);
|
|
950
|
-
// Pass the opposite version bit from the one we just finished writing.
|
|
951
|
-
self.write_prepare_header_to_version(write, write_prepare_on_write_header, 1, write.header_sector(self), offset);
|
|
952
|
-
}
|
|
1651
|
+
// Memory must not be owned by self.headers as these may be modified concurrently:
|
|
1652
|
+
assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
|
|
1653
|
+
@ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + headers_size);
|
|
953
1654
|
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
const offset = write_prepare_header_offset(write.message);
|
|
957
|
-
// Pass the opposite version bit from the one we just finished writing.
|
|
958
|
-
self.write_prepare_header_to_version(write, write_prepare_on_write_header, 0, write.header_sector(self), offset);
|
|
1655
|
+
assert_bounds(.headers, offset, buffer.len);
|
|
1656
|
+
self.write_sectors(write_prepare_on_write_header, write, buffer, offset);
|
|
959
1657
|
}
|
|
960
1658
|
|
|
961
1659
|
fn write_prepare_on_write_header(write: *Self.Write) void {
|
|
@@ -973,9 +1671,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
973
1671
|
|
|
974
1672
|
self.write_prepare_debug(message.header, "complete, marking clean");
|
|
975
1673
|
// TODO Snapshots
|
|
976
|
-
|
|
977
|
-
self.
|
|
978
|
-
self.
|
|
1674
|
+
|
|
1675
|
+
const slot = self.slot_with_header(message.header).?;
|
|
1676
|
+
self.dirty.clear(slot);
|
|
1677
|
+
self.faulty.clear(slot);
|
|
979
1678
|
|
|
980
1679
|
self.write_prepare_release(write, message);
|
|
981
1680
|
}
|
|
@@ -1006,105 +1705,68 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1006
1705
|
self.writes.release(write);
|
|
1007
1706
|
}
|
|
1008
1707
|
|
|
1009
|
-
fn write_prepare_debug(self: *Self, header: *const Header, status: []const u8) void {
|
|
1010
|
-
log.debug("{}: write: view={} op={}
|
|
1708
|
+
fn write_prepare_debug(self: *const Self, header: *const Header, status: []const u8) void {
|
|
1709
|
+
log.debug("{}: write: view={} op={} len={}: {} {s}", .{
|
|
1011
1710
|
self.replica,
|
|
1012
1711
|
header.view,
|
|
1013
1712
|
header.op,
|
|
1014
|
-
header.offset,
|
|
1015
1713
|
header.size,
|
|
1016
1714
|
header.checksum,
|
|
1017
1715
|
status,
|
|
1018
1716
|
});
|
|
1019
1717
|
}
|
|
1020
1718
|
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1719
|
+
fn assert_bounds(ring: Ring, offset: u64, size: u64) void {
|
|
1720
|
+
switch (ring) {
|
|
1721
|
+
.headers => assert(offset + size <= headers_size),
|
|
1722
|
+
.prepares => {
|
|
1723
|
+
assert(offset >= headers_size);
|
|
1724
|
+
assert(offset + size <= headers_size + prepares_size);
|
|
1725
|
+
},
|
|
1726
|
+
}
|
|
1024
1727
|
}
|
|
1025
1728
|
|
|
1026
|
-
fn
|
|
1027
|
-
assert(
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1729
|
+
fn offset_logical(ring: Ring, slot: Slot) u64 {
|
|
1730
|
+
assert(slot.index < slot_count);
|
|
1731
|
+
|
|
1732
|
+
switch (ring) {
|
|
1733
|
+
.headers => {
|
|
1734
|
+
comptime assert(config.sector_size % @sizeOf(Header) == 0);
|
|
1735
|
+
const offset = vsr.sector_floor(slot.index * @sizeOf(Header));
|
|
1736
|
+
assert(offset < headers_size);
|
|
1737
|
+
return offset;
|
|
1738
|
+
},
|
|
1739
|
+
.prepares => {
|
|
1740
|
+
const offset = config.message_size_max * slot.index;
|
|
1741
|
+
assert(offset < prepares_size);
|
|
1742
|
+
return offset;
|
|
1743
|
+
},
|
|
1744
|
+
}
|
|
1032
1745
|
}
|
|
1033
1746
|
|
|
1034
|
-
fn
|
|
1035
|
-
|
|
1036
|
-
|
|
1747
|
+
fn offset_physical(ring: Ring, slot: Slot) u64 {
|
|
1748
|
+
return switch (ring) {
|
|
1749
|
+
.headers => offset_logical(.headers, slot),
|
|
1750
|
+
.prepares => headers_size + offset_logical(.prepares, slot),
|
|
1751
|
+
};
|
|
1037
1752
|
}
|
|
1038
1753
|
|
|
1039
|
-
fn
|
|
1040
|
-
self.
|
|
1041
|
-
return self.headers_version;
|
|
1754
|
+
fn offset_logical_in_headers_for_message(self: *const Self, message: *Message) u64 {
|
|
1755
|
+
return offset_logical(.headers, self.slot_for_header(message.header));
|
|
1042
1756
|
}
|
|
1043
1757
|
|
|
1044
|
-
///
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
// TODO Snapshots
|
|
1056
|
-
if (header.command == .reserved) {
|
|
1057
|
-
log.debug("{}: write_prepare_header_once: dirty reserved header", .{
|
|
1058
|
-
self.replica,
|
|
1059
|
-
});
|
|
1060
|
-
return false;
|
|
1758
|
+
/// Where `offset` is a logical offset relative to the start of the respective ring.
|
|
1759
|
+
fn offset_physical_for_logical(ring: Ring, offset: u64) u64 {
|
|
1760
|
+
switch (ring) {
|
|
1761
|
+
.headers => {
|
|
1762
|
+
assert(offset < headers_size);
|
|
1763
|
+
return offset;
|
|
1764
|
+
},
|
|
1765
|
+
.prepares => {
|
|
1766
|
+
assert(offset < prepares_size);
|
|
1767
|
+
return headers_size + offset;
|
|
1768
|
+
},
|
|
1061
1769
|
}
|
|
1062
|
-
if (self.previous_entry(header)) |previous| {
|
|
1063
|
-
assert(previous.command == .prepare);
|
|
1064
|
-
if (previous.checksum != header.parent) {
|
|
1065
|
-
log.debug("{}: write_headers_once: no hash chain", .{self.replica});
|
|
1066
|
-
return false;
|
|
1067
|
-
}
|
|
1068
|
-
// TODO Add is_dirty(header)
|
|
1069
|
-
// TODO Snapshots
|
|
1070
|
-
if (self.dirty.bit(previous.op)) {
|
|
1071
|
-
log.debug("{}: write_prepare_header_once: previous entry is dirty", .{
|
|
1072
|
-
self.replica,
|
|
1073
|
-
});
|
|
1074
|
-
return false;
|
|
1075
|
-
}
|
|
1076
|
-
} else {
|
|
1077
|
-
log.debug("{}: write_prepare_header_once: no previous entry", .{self.replica});
|
|
1078
|
-
return false;
|
|
1079
|
-
}
|
|
1080
|
-
return true;
|
|
1081
|
-
}
|
|
1082
|
-
|
|
1083
|
-
fn write_prepare_header_to_version(
|
|
1084
|
-
self: *Self,
|
|
1085
|
-
write: *Self.Write,
|
|
1086
|
-
callback: fn (completion: *Self.Write) void,
|
|
1087
|
-
version: u1,
|
|
1088
|
-
buffer: []const u8,
|
|
1089
|
-
offset: u64,
|
|
1090
|
-
) void {
|
|
1091
|
-
log.debug("{}: write_prepare_header_to_version: version={} offset={} len={}", .{
|
|
1092
|
-
self.replica,
|
|
1093
|
-
version,
|
|
1094
|
-
offset,
|
|
1095
|
-
buffer.len,
|
|
1096
|
-
});
|
|
1097
|
-
assert(offset + buffer.len <= self.size_headers);
|
|
1098
|
-
// Memory must not be owned by self.headers as self.headers may be modified concurrently:
|
|
1099
|
-
assert(@ptrToInt(buffer.ptr) < @ptrToInt(self.headers.ptr) or
|
|
1100
|
-
@ptrToInt(buffer.ptr) > @ptrToInt(self.headers.ptr) + self.size_headers);
|
|
1101
|
-
|
|
1102
|
-
self.write_sectors(
|
|
1103
|
-
callback,
|
|
1104
|
-
write,
|
|
1105
|
-
buffer,
|
|
1106
|
-
self.offset_in_headers_version(offset, version),
|
|
1107
|
-
);
|
|
1108
1770
|
}
|
|
1109
1771
|
|
|
1110
1772
|
fn write_sectors(
|
|
@@ -1136,8 +1798,9 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1136
1798
|
if (!other.range.locked) continue;
|
|
1137
1799
|
|
|
1138
1800
|
if (other.range.overlaps(&write.range)) {
|
|
1139
|
-
|
|
1140
|
-
|
|
1801
|
+
var tail = &other.range;
|
|
1802
|
+
while (tail.next) |next| tail = next;
|
|
1803
|
+
tail.next = &write.range;
|
|
1141
1804
|
return;
|
|
1142
1805
|
}
|
|
1143
1806
|
}
|
|
@@ -1155,8 +1818,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1155
1818
|
write.range.buffer,
|
|
1156
1819
|
write.range.offset,
|
|
1157
1820
|
);
|
|
1158
|
-
// We rely on the Storage.write_sectors() implementation being
|
|
1159
|
-
// in which case writes never actually need to be queued, or always
|
|
1821
|
+
// We rely on the Storage.write_sectors() implementation being always synchronous,
|
|
1822
|
+
// in which case writes never actually need to be queued, or always asynchronous,
|
|
1160
1823
|
// in which case write_sectors_on_write() doesn't have to handle lock_sectors()
|
|
1161
1824
|
// synchronously completing a write and making a nested write_sectors_on_write() call.
|
|
1162
1825
|
//
|
|
@@ -1193,7 +1856,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1193
1856
|
self.lock_sectors(@fieldParentPtr(Self.Write, "range", waiting));
|
|
1194
1857
|
}
|
|
1195
1858
|
|
|
1196
|
-
// The callback may set range, so we can't set range to undefined after
|
|
1859
|
+
// The callback may set range, so we can't set range to undefined after the callback.
|
|
1197
1860
|
const callback = range.callback;
|
|
1198
1861
|
range.* = undefined;
|
|
1199
1862
|
callback(write);
|
|
@@ -1208,7 +1871,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1208
1871
|
// However, we compare against the 64-bit op first, since it's a cheap machine word.
|
|
1209
1872
|
if (write.message.header.op == op and write.message.header.checksum == checksum) {
|
|
1210
1873
|
// If we truly are writing, then the dirty bit must be set:
|
|
1211
|
-
assert(self.dirty.bit(op));
|
|
1874
|
+
assert(self.dirty.bit(self.slot_for_op(op)));
|
|
1212
1875
|
return true;
|
|
1213
1876
|
}
|
|
1214
1877
|
}
|
|
@@ -1219,132 +1882,125 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1219
1882
|
|
|
1220
1883
|
// TODO Snapshots
|
|
1221
1884
|
pub const BitSet = struct {
|
|
1222
|
-
bits:
|
|
1885
|
+
bits: std.DynamicBitSetUnmanaged,
|
|
1223
1886
|
|
|
1224
1887
|
/// The number of bits set (updated incrementally as bits are set or cleared):
|
|
1225
|
-
|
|
1888
|
+
count: u64 = 0,
|
|
1226
1889
|
|
|
1227
|
-
fn init(allocator: Allocator, count:
|
|
1228
|
-
const bits = try
|
|
1229
|
-
errdefer
|
|
1230
|
-
std.mem.set(bool, bits, false);
|
|
1890
|
+
fn init(allocator: Allocator, count: usize) !BitSet {
|
|
1891
|
+
const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
|
|
1892
|
+
errdefer bits.deinit(allocator);
|
|
1231
1893
|
|
|
1232
1894
|
return BitSet{ .bits = bits };
|
|
1233
1895
|
}
|
|
1234
1896
|
|
|
1235
1897
|
fn deinit(self: *BitSet, allocator: Allocator) void {
|
|
1236
|
-
|
|
1898
|
+
self.bits.deinit(allocator);
|
|
1237
1899
|
}
|
|
1238
1900
|
|
|
1239
|
-
/// Clear the bit for
|
|
1240
|
-
pub fn clear(self: *BitSet,
|
|
1241
|
-
if (self.bits
|
|
1242
|
-
self.bits
|
|
1243
|
-
self.
|
|
1901
|
+
/// Clear the bit for a slot (idempotent):
|
|
1902
|
+
pub fn clear(self: *BitSet, slot: Slot) void {
|
|
1903
|
+
if (self.bits.isSet(slot.index)) {
|
|
1904
|
+
self.bits.unset(slot.index);
|
|
1905
|
+
self.count -= 1;
|
|
1244
1906
|
}
|
|
1245
1907
|
}
|
|
1246
1908
|
|
|
1247
|
-
/// Whether the bit for
|
|
1248
|
-
pub fn bit(self: *BitSet,
|
|
1249
|
-
return self.bits
|
|
1909
|
+
/// Whether the bit for a slot is set:
|
|
1910
|
+
pub fn bit(self: *const BitSet, slot: Slot) bool {
|
|
1911
|
+
return self.bits.isSet(slot.index);
|
|
1250
1912
|
}
|
|
1251
1913
|
|
|
1252
|
-
/// Set the bit for
|
|
1253
|
-
pub fn set(self: *BitSet,
|
|
1254
|
-
if (!self.bits
|
|
1255
|
-
self.bits
|
|
1256
|
-
self.
|
|
1257
|
-
assert(self.
|
|
1914
|
+
/// Set the bit for a slot (idempotent):
|
|
1915
|
+
pub fn set(self: *BitSet, slot: Slot) void {
|
|
1916
|
+
if (!self.bits.isSet(slot.index)) {
|
|
1917
|
+
self.bits.set(slot.index);
|
|
1918
|
+
self.count += 1;
|
|
1919
|
+
assert(self.count <= self.bits.bit_length);
|
|
1258
1920
|
}
|
|
1259
1921
|
}
|
|
1260
1922
|
};
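A small self-contained sketch of the same idempotent set/clear-with-count pattern that BitSet implements above, built directly on std.DynamicBitSetUnmanaged with the same calls this file uses (the mini type below is illustrative, not the journal's own):

    const std = @import("std");

    const CountingBitSet = struct {
        bits: std.DynamicBitSetUnmanaged,
        count: u64 = 0,

        fn set(self: *CountingBitSet, index: usize) void {
            if (!self.bits.isSet(index)) {
                self.bits.set(index);
                self.count += 1;
            }
        }

        fn clear(self: *CountingBitSet, index: usize) void {
            if (self.bits.isSet(index)) {
                self.bits.unset(index);
                self.count -= 1;
            }
        }
    };

    test "counting bitset sketch" {
        var faulty = CountingBitSet{
            .bits = try std.DynamicBitSetUnmanaged.initEmpty(std.testing.allocator, 8),
        };
        defer faulty.bits.deinit(std.testing.allocator);

        faulty.set(3);
        faulty.set(3); // Idempotent: the count is not double-incremented.
        try std.testing.expectEqual(@as(u64, 1), faulty.count);

        faulty.clear(3);
        try std.testing.expectEqual(@as(u64, 0), faulty.count);
    }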
|
|
1261
1923
|
|
|
1262
1924
|
/// Take a u6 to limit to 64 items max (2^6 = 64)
|
|
1263
1925
|
pub fn IOPS(comptime T: type, comptime size: u6) type {
|
|
1264
|
-
const Map = std.
|
|
1265
|
-
const MapLog2 = math.Log2Int(Map);
|
|
1926
|
+
const Map = std.StaticBitSet(size);
|
|
1266
1927
|
return struct {
|
|
1267
1928
|
const Self = @This();
|
|
1268
1929
|
|
|
1269
1930
|
items: [size]T = undefined,
|
|
1270
|
-
/// 1 bits are free items
|
|
1271
|
-
free: Map =
|
|
1931
|
+
/// 1 bits are free items.
|
|
1932
|
+
free: Map = Map.initFull(),
|
|
1272
1933
|
|
|
1273
1934
|
pub fn acquire(self: *Self) ?*T {
|
|
1274
|
-
const i =
|
|
1275
|
-
|
|
1276
|
-
if (i == @bitSizeOf(Map)) return null;
|
|
1277
|
-
self.free &= ~(@as(Map, 1) << @intCast(MapLog2, i));
|
|
1935
|
+
const i = self.free.findFirstSet() orelse return null;
|
|
1936
|
+
self.free.unset(i);
|
|
1278
1937
|
return &self.items[i];
|
|
1279
1938
|
}
|
|
1280
1939
|
|
|
1281
1940
|
pub fn release(self: *Self, item: *T) void {
|
|
1282
1941
|
item.* = undefined;
|
|
1283
1942
|
const i = (@ptrToInt(item) - @ptrToInt(&self.items)) / @sizeOf(T);
|
|
1284
|
-
assert(self.free
|
|
1285
|
-
self.free
|
|
1943
|
+
assert(!self.free.isSet(i));
|
|
1944
|
+
self.free.set(i);
|
|
1286
1945
|
}
|
|
1287
1946
|
|
|
1288
|
-
/// Returns
|
|
1289
|
-
pub fn available(self: *const Self)
|
|
1290
|
-
return
|
|
1947
|
+
/// Returns the count of IOPs available.
|
|
1948
|
+
pub fn available(self: *const Self) usize {
|
|
1949
|
+
return self.free.count();
|
|
1291
1950
|
}
|
|
1292
1951
|
|
|
1293
|
-
/// Returns
|
|
1294
|
-
pub fn executing(self: *const Self)
|
|
1295
|
-
return
|
|
1952
|
+
/// Returns the count of IOPs in use.
|
|
1953
|
+
pub fn executing(self: *const Self) usize {
|
|
1954
|
+
return size - self.available();
|
|
1296
1955
|
}
|
|
1297
1956
|
|
|
1298
1957
|
pub const Iterator = struct {
|
|
1299
1958
|
iops: *Self,
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
pub fn next(iterator: *Iterator) ?*T {
|
|
1305
|
-
const i = @ctz(Map, iterator.unseen);
|
|
1306
|
-
assert(i <= @bitSizeOf(Map));
|
|
1307
|
-
if (i == @bitSizeOf(Map)) return null;
|
|
1308
|
-
// Set this bit of unseen to 1 to indicate this slot has been seen.
|
|
1309
|
-
iterator.unseen &= ~(@as(Map, 1) << @intCast(MapLog2, i));
|
|
1959
|
+
bitset_iterator: Map.Iterator(.{ .kind = .unset }),
|
|
1960
|
+
|
|
1961
|
+
pub fn next(iterator: *@This()) ?*T {
|
|
1962
|
+
const i = iterator.bitset_iterator.next() orelse return null;
|
|
1310
1963
|
return &iterator.iops.items[i];
|
|
1311
1964
|
}
|
|
1312
1965
|
};
|
|
1313
1966
|
|
|
1314
1967
|
pub fn iterate(self: *Self) Iterator {
|
|
1315
|
-
return .{
|
|
1968
|
+
return .{
|
|
1969
|
+
.iops = self,
|
|
1970
|
+
.bitset_iterator = self.free.iterator(.{ .kind = .unset }),
|
|
1971
|
+
};
|
|
1316
1972
|
}
|
|
1317
1973
|
};
|
|
1318
1974
|
}
|
|
1319
1975
|
|
|
1320
|
-
test {
|
|
1976
|
+
test "IOPS" {
|
|
1321
1977
|
const testing = std.testing;
|
|
1322
1978
|
var iops = IOPS(u32, 4){};
|
|
1323
1979
|
|
|
1324
|
-
try testing.expectEqual(@as(
|
|
1325
|
-
try testing.expectEqual(@as(
|
|
1980
|
+
try testing.expectEqual(@as(usize, 4), iops.available());
|
|
1981
|
+
try testing.expectEqual(@as(usize, 0), iops.executing());
|
|
1326
1982
|
|
|
1327
1983
|
var one = iops.acquire().?;
|
|
1328
1984
|
|
|
1329
|
-
try testing.expectEqual(@as(
|
|
1330
|
-
try testing.expectEqual(@as(
|
|
1985
|
+
try testing.expectEqual(@as(usize, 3), iops.available());
|
|
1986
|
+
try testing.expectEqual(@as(usize, 1), iops.executing());
|
|
1331
1987
|
|
|
1332
1988
|
var two = iops.acquire().?;
|
|
1333
1989
|
var three = iops.acquire().?;
|
|
1334
1990
|
|
|
1335
|
-
try testing.expectEqual(@as(
|
|
1336
|
-
try testing.expectEqual(@as(
|
|
1991
|
+
try testing.expectEqual(@as(usize, 1), iops.available());
|
|
1992
|
+
try testing.expectEqual(@as(usize, 3), iops.executing());
|
|
1337
1993
|
|
|
1338
1994
|
var four = iops.acquire().?;
|
|
1339
1995
|
try testing.expectEqual(@as(?*u32, null), iops.acquire());
|
|
1340
1996
|
|
|
1341
|
-
try testing.expectEqual(@as(
|
|
1342
|
-
try testing.expectEqual(@as(
|
|
1997
|
+
try testing.expectEqual(@as(usize, 0), iops.available());
|
|
1998
|
+
try testing.expectEqual(@as(usize, 4), iops.executing());
|
|
1343
1999
|
|
|
1344
2000
|
iops.release(two);
|
|
1345
2001
|
|
|
1346
|
-
try testing.expectEqual(@as(
|
|
1347
|
-
try testing.expectEqual(@as(
|
|
2002
|
+
try testing.expectEqual(@as(usize, 1), iops.available());
|
|
2003
|
+
try testing.expectEqual(@as(usize, 3), iops.executing());
|
|
1348
2004
|
|
|
1349
2005
|
// there is only one slot free, so we will get the same pointer back.
|
|
1350
2006
|
try testing.expectEqual(@as(?*u32, two), iops.acquire());
|
|
@@ -1354,8 +2010,8 @@ test {
|
|
|
1354
2010
|
iops.release(one);
|
|
1355
2011
|
iops.release(three);
|
|
1356
2012
|
|
|
1357
|
-
try testing.expectEqual(@as(
|
|
1358
|
-
try testing.expectEqual(@as(
|
|
2013
|
+
try testing.expectEqual(@as(usize, 4), iops.available());
|
|
2014
|
+
try testing.expectEqual(@as(usize, 0), iops.executing());
|
|
1359
2015
|
|
|
1360
2016
|
one = iops.acquire().?;
|
|
1361
2017
|
two = iops.acquire().?;
|
|
@@ -1363,3 +2019,275 @@ test {
|
|
|
1363
2019
|
four = iops.acquire().?;
|
|
1364
2020
|
try testing.expectEqual(@as(?*u32, null), iops.acquire());
|
|
1365
2021
|
}
|
|
2022
|
+
|
|
2023
|
+
/// @B and @C:
|
|
2024
|
+
/// This prepare header is corrupt.
|
|
2025
|
+
/// We may have a valid redundant header, but need to recover the full message.
|
|
2026
|
+
///
|
|
2027
|
+
/// Case @B may be caused by crashing while writing the prepare (torn write).
|
|
2028
|
+
///
|
|
2029
|
+
/// @E:
|
|
2030
|
+
/// Valid prepare, corrupt header. One of:
|
|
2031
|
+
///
|
|
2032
|
+
/// 1. The replica crashed while writing the redundant header (torn write).
|
|
2033
|
+
/// 2. The read of the header was corrupt or misdirected.
|
|
2034
|
+
/// 3. Multiple faults, for example: the redundant header read is corrupt, and the prepare read is
|
|
2035
|
+
/// misdirected.
|
|
2036
|
+
///
|
|
2037
|
+
///
|
|
2038
|
+
/// @F and @H:
|
|
2039
|
+
/// The replica is recovering from a crash after writing the prepare, but before writing the
|
|
2040
|
+
/// redundant header.
|
|
2041
|
+
///
|
|
2042
|
+
///
|
|
2043
|
+
/// @G:
|
|
2044
|
+
/// One of:
|
|
2045
|
+
///
|
|
2046
|
+
/// * A misdirected read to a reserved header.
|
|
2047
|
+
/// * The redundant header's write was lost or misdirected.
|
|
2048
|
+
///
|
|
2049
|
+
/// For multi-replica clusters, don't repair locally to prevent data loss in case of 2 lost writes.
|
|
2050
|
+
///
|
|
2051
|
+
///
|
|
2052
|
+
/// @I:
|
|
2053
|
+
/// The redundant header is present & valid, but the corresponding prepare was a lost or misdirected
|
|
2054
|
+
/// read or write.
|
|
2055
|
+
///
|
|
2056
|
+
///
|
|
2057
|
+
/// @J:
|
|
2058
|
+
/// This slot is legitimately reserved — this may be the first fill of the log.
|
|
2059
|
+
///
|
|
2060
|
+
///
|
|
2061
|
+
/// @K and @L:
|
|
2062
|
+
/// When the redundant header & prepare header are both valid but distinct ops, always pick the
|
|
2063
|
+
/// higher op.
|
|
2064
|
+
///
|
|
2065
|
+
/// For example, consider slot_count=10, the op to the left is 12, the op to the right is 14, and
|
|
2066
|
+
/// the tiebreak is between an op=3 and op=13. Choosing op=13 over op=3 is safe because the op=3
|
|
2067
|
+
/// must be from a previous wrap — it is too far back (>pipeline) to have been replaced by a view
|
|
2068
|
+
/// change.
|
|
2069
|
+
///
|
|
2070
|
+
/// The length of the prepare pipeline is the upper bound on how many ops can be reordered during a
|
|
2071
|
+
/// view change.
|
|
2072
|
+
///
|
|
2073
|
+
/// @K:
|
|
2074
|
+
/// When the higher op belongs to the prepare, repair locally.
|
|
2075
|
+
/// The most likely cause for this case is that the log wrapped, but the redundant header write was
|
|
2076
|
+
/// lost.
|
|
2077
|
+
///
|
|
2078
|
+
/// @L:
|
|
2079
|
+
/// When the higher op belongs to the header, mark faulty.
|
|
2080
|
+
///
|
|
2081
|
+
///
|
|
2082
|
+
/// @M:
|
|
2083
|
+
/// The message was rewritten due to a view change.
|
|
2084
|
+
/// A single-replica cluster doesn't ever change views.
|
|
2085
|
+
///
|
|
2086
|
+
///
|
|
2087
|
+
/// @N:
|
|
2088
|
+
/// The redundant header matches the message's header.
|
|
2089
|
+
/// This is the usual case: both the prepare and header are correct and equivalent.
|
|
2090
|
+
const recovery_cases = table: {
|
|
2091
|
+
const __ = Matcher.any;
|
|
2092
|
+
const _0 = Matcher.is_false;
|
|
2093
|
+
const _1 = Matcher.is_true;
|
|
2094
|
+
// The replica will abort if any of these checks fail:
|
|
2095
|
+
const a0 = Matcher.assert_is_false;
|
|
2096
|
+
const a1 = Matcher.assert_is_true;
|
|
2097
|
+
|
|
2098
|
+
break :table [_]Case{
|
|
2099
|
+
// Legend:
|
|
2100
|
+
//
|
|
2101
|
+
// R>1 replica_count > 1
|
|
2102
|
+
// R=1 replica_count = 1
|
|
2103
|
+
// ok valid checksum ∧ valid cluster ∧ valid slot ∧ valid command
|
|
2104
|
+
// nil command == reserved
|
|
2105
|
+
// ✓∑ header.checksum == prepare.checksum
|
|
2106
|
+
// op⌈ prepare.op is maximum of all prepare.ops
|
|
2107
|
+
// op= header.op == prepare.op
|
|
2108
|
+
// op< header.op < prepare.op
|
|
2109
|
+
// view header.view == prepare.view
|
|
2110
|
+
//
|
|
2111
|
+
// Label Decision Header Prepare Compare
|
|
2112
|
+
// R>1 R=1 ok nil ok nil op⌈ ✓∑ op= op< view
|
|
2113
|
+
Case.init("@A", .vsr, .vsr, .{ _0, __, _0, __, __, __, __, __, __ }),
|
|
2114
|
+
Case.init("@B", .vsr, .vsr, .{ _1, _1, _0, __, __, __, __, __, __ }),
|
|
2115
|
+
Case.init("@C", .vsr, .vsr, .{ _1, _0, _0, __, __, __, __, __, __ }),
|
|
2116
|
+
Case.init("@D", .vsr, .vsr, .{ _0, __, _1, _1, __, __, __, __, __ }),
|
|
2117
|
+
Case.init("@E", .vsr, .fix, .{ _0, __, _1, _0, _0, __, __, __, __ }),
|
|
2118
|
+
Case.init("@F", .fix, .fix, .{ _0, __, _1, _0, _1, __, __, __, __ }),
|
|
2119
|
+
Case.init("@G", .vsr, .fix, .{ _1, _1, _1, _0, _0, __, __, __, __ }),
|
|
2120
|
+
Case.init("@H", .fix, .fix, .{ _1, _1, _1, _0, _1, __, __, __, __ }),
|
|
2121
|
+
Case.init("@I", .vsr, .vsr, .{ _1, _0, _1, _1, __, __, __, __, __ }),
|
|
2122
|
+
Case.init("@J", .nil, .nil, .{ _1, _1, _1, _1, __, a1, a1, a0, a1 }), // normal path: reserved
|
|
2123
|
+
Case.init("@K", .fix, .fix, .{ _1, _0, _1, _0, __, _0, _0, _1, __ }), // header.op < prepare.op
|
|
2124
|
+
Case.init("@L", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _0, _0, __ }), // header.op > prepare.op
|
|
2125
|
+
Case.init("@M", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _1, a0, a0 }),
|
|
2126
|
+
Case.init("@N", .eql, .eql, .{ _1, _0, _1, _0, __, _1, a1, a0, a1 }), // normal path: prepare
|
|
2127
|
+
};
|
|
2128
|
+
};
|
|
2129
|
+
|
|
2130
|
+
const case_cut = Case{
|
|
2131
|
+
.label = "@Truncate",
|
|
2132
|
+
.decision_multiple = .cut,
|
|
2133
|
+
.decision_single = .cut,
|
|
2134
|
+
.pattern = undefined,
|
|
2135
|
+
};
|
|
2136
|
+
|
|
2137
|
+
const RecoveryDecision = enum {
|
|
2138
|
+
/// The header and prepare are identical; no repair necessary.
|
|
2139
|
+
eql,
|
|
2140
|
+
/// Reserved; dirty/faulty are clear, no repair necessary.
|
|
2141
|
+
nil,
|
|
2142
|
+
/// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, clear faulty.
|
|
2143
|
+
/// If replica_count=1: Use intact prepare. Clear dirty, clear faulty.
|
|
2144
|
+
/// (Don't set faulty, because we have the valid message.)
|
|
2145
|
+
fix,
|
|
2146
|
+
/// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, mark faulty.
|
|
2147
|
+
/// If replica_count=1: Fail; cannot recover safely.
|
|
2148
|
+
vsr,
|
|
2149
|
+
/// Truncate the op, setting it to reserved. Dirty/faulty are clear.
|
|
2150
|
+
cut,
|
|
2151
|
+
};
|
|
2152
|
+
|
|
2153
|
+
const Matcher = enum { any, is_false, is_true, assert_is_false, assert_is_true };
|
|
2154
|
+
|
|
2155
|
+
const Case = struct {
|
|
2156
|
+
label: []const u8,
|
|
2157
|
+
/// Decision when replica_count>1.
|
|
2158
|
+
decision_multiple: RecoveryDecision,
|
|
2159
|
+
/// Decision when replica_count=1.
|
|
2160
|
+
decision_single: RecoveryDecision,
|
|
2161
|
+
/// 0: header_ok(header)
|
|
2162
|
+
/// 1: header.command == reserved
|
|
2163
|
+
/// 2: header_ok(prepare) ∧ valid_checksum_body
|
|
2164
|
+
/// 3: prepare.command == reserved
|
|
2165
|
+
/// 4: prepare.op is maximum of all prepare.ops
|
|
2166
|
+
/// 5: header.checksum == prepare.checksum
|
|
2167
|
+
/// 6: header.op == prepare.op
|
|
2168
|
+
/// 7: header.op < prepare.op
|
|
2169
|
+
/// 8: header.view == prepare.view
|
|
2170
|
+
pattern: [9]Matcher,
|
|
2171
|
+
|
|
2172
|
+
fn init(
|
|
2173
|
+
label: []const u8,
|
|
2174
|
+
decision_multiple: RecoveryDecision,
|
|
2175
|
+
decision_single: RecoveryDecision,
|
|
2176
|
+
pattern: [9]Matcher,
|
|
2177
|
+
) Case {
|
|
2178
|
+
return .{
|
|
2179
|
+
.label = label,
|
|
2180
|
+
.decision_multiple = decision_multiple,
|
|
2181
|
+
.decision_single = decision_single,
|
|
2182
|
+
.pattern = pattern,
|
|
2183
|
+
};
|
|
2184
|
+
}
|
|
2185
|
+
|
|
2186
|
+
fn check(self: *const Case, parameters: [9]bool) !bool {
|
|
2187
|
+
for (parameters) |b, i| {
|
|
2188
|
+
switch (self.pattern[i]) {
|
|
2189
|
+
.any => {},
|
|
2190
|
+
.is_false => if (b) return false,
|
|
2191
|
+
.is_true => if (!b) return false,
|
|
2192
|
+
.assert_is_false => if (b) return error.ExpectFalse,
|
|
2193
|
+
.assert_is_true => if (!b) return error.ExpectTrue,
|
|
2194
|
+
}
|
|
2195
|
+
}
|
|
2196
|
+
return true;
|
|
2197
|
+
}
|
|
2198
|
+
|
|
2199
|
+
fn decision(self: *const Case, replica_count: u8) RecoveryDecision {
|
|
2200
|
+
assert(replica_count > 0);
|
|
2201
|
+
if (replica_count == 1) {
|
|
2202
|
+
return self.decision_single;
|
|
2203
|
+
} else {
|
|
2204
|
+
return self.decision_multiple;
|
|
2205
|
+
}
|
|
2206
|
+
}
|
|
2207
|
+
};
|
|
2208
|
+
|
|
2209
|
+
fn recovery_case(header: ?*const Header, prepare: ?*const Header, prepare_op_max: u64) *const Case {
|
|
2210
|
+
const h_ok = header != null;
|
|
2211
|
+
const p_ok = prepare != null;
|
|
2212
|
+
|
|
2213
|
+
if (h_ok) assert(header.?.invalid() == null);
|
|
2214
|
+
if (p_ok) assert(prepare.?.invalid() == null);
|
|
2215
|
+
|
|
2216
|
+
const parameters = .{
|
|
2217
|
+
h_ok,
|
|
2218
|
+
if (h_ok) header.?.command == .reserved else false,
|
|
2219
|
+
p_ok,
|
|
2220
|
+
if (p_ok) prepare.?.command == .reserved else false,
|
|
2221
|
+
if (p_ok) prepare.?.op == prepare_op_max else false,
|
|
2222
|
+
if (h_ok and p_ok) header.?.checksum == prepare.?.checksum else false,
|
|
2223
|
+
if (h_ok and p_ok) header.?.op == prepare.?.op else false,
|
|
2224
|
+
if (h_ok and p_ok) header.?.op < prepare.?.op else false,
|
|
2225
|
+
if (h_ok and p_ok) header.?.view == prepare.?.view else false,
|
|
2226
|
+
};
|
|
2227
|
+
|
|
2228
|
+
var result: ?*const Case = null;
|
|
2229
|
+
for (recovery_cases) |*case| {
|
|
2230
|
+
const match = case.check(parameters) catch {
|
|
2231
|
+
log.err("recovery_case: impossible state: case={s} parameters={any}", .{
|
|
2232
|
+
case.label,
|
|
2233
|
+
parameters,
|
|
2234
|
+
});
|
|
2235
|
+
unreachable;
|
|
2236
|
+
};
|
|
2237
|
+
if (match) {
|
|
2238
|
+
assert(result == null);
|
|
2239
|
+
result = case;
|
|
2240
|
+
}
|
|
2241
|
+
}
|
|
2242
|
+
// The recovery table is exhaustive.
|
|
2243
|
+
// Every combination of parameters matches exactly one case.
|
|
2244
|
+
return result.?;
|
|
2245
|
+
}
|
|
2246
|
+
|
|
2247
|
+
/// Returns the header, only if the header:
|
|
2248
|
+
/// * has a valid checksum, and
|
|
2249
|
+
/// * has the expected cluster, and
|
|
2250
|
+
/// * has an expected command, and
|
|
2251
|
+
/// * resides in the correct slot.
|
|
2252
|
+
fn header_ok(cluster: u32, slot: Slot, header: *const Header) ?*const Header {
|
|
2253
|
+
// We must first validate the header checksum before accessing any fields.
|
|
2254
|
+
// Otherwise, we may hit undefined data or an out-of-bounds enum and cause a runtime crash.
|
|
2255
|
+
if (!header.valid_checksum()) return null;
|
|
2256
|
+
|
|
2257
|
+
// A header with the wrong cluster, or in the wrong slot, may indicate a misdirected read/write.
|
|
2258
|
+
// All journalled headers should be reserved or else prepares.
|
|
2259
|
+
// A misdirected read/write to or from another storage zone may return the wrong message.
|
|
2260
|
+
const valid_cluster_command_and_slot = switch (header.command) {
|
|
2261
|
+
.prepare => header.cluster == cluster and slot.index == header.op % slot_count,
|
|
2262
|
+
.reserved => header.cluster == cluster and slot.index == header.op,
|
|
2263
|
+
else => false,
|
|
2264
|
+
};
|
|
2265
|
+
|
|
2266
|
+
// Do not check the checksum here, because that would run only after the other field accesses.
|
|
2267
|
+
return if (valid_cluster_command_and_slot) header else null;
|
|
2268
|
+
}
|
|
2269
|
+
|
|
2270
|
+
test "recovery_cases" {
|
|
2271
|
+
// Verify that every pattern matches exactly one case.
|
|
2272
|
+
//
|
|
2273
|
+
// Every possible combination of parameters must either:
|
|
2274
|
+
// * have a matching case
|
|
2275
|
+
// * have a case that fails (which would result in a panic).
|
|
2276
|
+
var i: usize = 0;
|
|
2277
|
+
while (i <= std.math.maxInt(u8)) : (i += 1) {
|
|
2278
|
+
var parameters: [9]bool = undefined;
|
|
2279
|
+
comptime var j: usize = 0;
|
|
2280
|
+
inline while (j < parameters.len) : (j += 1) {
|
|
2281
|
+
parameters[j] = i & (1 << j) != 0;
|
|
2282
|
+
}
|
|
2283
|
+
|
|
2284
|
+
var case_match: ?*const Case = null;
|
|
2285
|
+
for (recovery_cases) |*case| {
|
|
2286
|
+
if (case.check(parameters) catch true) {
|
|
2287
|
+
try std.testing.expectEqual(case_match, null);
|
|
2288
|
+
case_match = case;
|
|
2289
|
+
}
|
|
2290
|
+
}
|
|
2291
|
+
if (case_match == null) @panic("no matching case");
|
|
2292
|
+
}
|
|
2293
|
+
}
|
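Finally, a standalone sketch of the pattern-matching idea behind Case.check and recovery_case above, reduced to three parameters (the pattern and parameters here are made up for illustration; the real nine-parameter table is the one defined in this file):

    const std = @import("std");

    const MatcherSketch = enum { any, is_false, is_true };

    // A pattern matches when every parameter satisfies its matcher.
    fn matches(pattern: [3]MatcherSketch, parameters: [3]bool) bool {
        for (parameters) |b, i| {
            switch (pattern[i]) {
                .any => {},
                .is_false => if (b) return false,
                .is_true => if (!b) return false,
            }
        }
        return true;
    }

    test "recovery case matching sketch" {
        // "Neither header nor prepare is valid" matches a pattern requiring both to be false.
        try std.testing.expect(matches(.{ .is_false, .any, .is_false }, .{ false, true, false }));
        // The same parameters do not match a pattern that requires the first to be true.
        try std.testing.expect(!matches(.{ .is_true, .any, .any }, .{ false, true, false }));
    }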