tigerbeetle-node 0.11.5 → 0.11.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/dist/index.d.ts +41 -42
- package/dist/index.js +41 -42
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/src/index.ts +0 -1
- package/src/tigerbeetle/scripts/benchmark.bat +7 -3
- package/src/tigerbeetle/scripts/benchmark.sh +2 -3
- package/src/tigerbeetle/scripts/install.bat +7 -0
- package/src/tigerbeetle/scripts/install.sh +2 -3
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/config.zig +24 -3
- package/src/tigerbeetle/src/constants.zig +8 -5
- package/src/tigerbeetle/src/ewah.zig +6 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/io/darwin.zig +19 -0
- package/src/tigerbeetle/src/io/linux.zig +8 -0
- package/src/tigerbeetle/src/io/windows.zig +20 -2
- package/src/tigerbeetle/src/iops.zig +7 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +27 -72
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +10 -11
- package/src/tigerbeetle/src/lsm/grid.zig +267 -267
- package/src/tigerbeetle/src/lsm/groove.zig +3 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +19 -11
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +106 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
- package/src/tigerbeetle/src/lsm/table.zig +56 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +29 -2
- package/src/tigerbeetle/src/lsm/table_mutable.zig +49 -15
- package/src/tigerbeetle/src/lsm/test.zig +10 -7
- package/src/tigerbeetle/src/lsm/tree.zig +27 -6
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +302 -263
- package/src/tigerbeetle/src/message_pool.zig +2 -1
- package/src/tigerbeetle/src/simulator.zig +22 -84
- package/src/tigerbeetle/src/{test/accounting → state_machine}/auditor.zig +8 -8
- package/src/tigerbeetle/src/{test/accounting → state_machine}/workload.zig +108 -48
- package/src/tigerbeetle/src/state_machine.zig +20 -14
- package/src/tigerbeetle/src/storage.zig +58 -6
- package/src/tigerbeetle/src/test/cluster.zig +14 -11
- package/src/tigerbeetle/src/test/conductor.zig +2 -3
- package/src/tigerbeetle/src/test/id.zig +10 -0
- package/src/tigerbeetle/src/test/state_checker.zig +1 -1
- package/src/tigerbeetle/src/test/state_machine.zig +151 -46
- package/src/tigerbeetle/src/test/storage.zig +22 -1
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -1
- package/src/tigerbeetle/src/tracer.zig +50 -28
- package/src/tigerbeetle/src/unit_tests.zig +11 -6
- package/src/tigerbeetle/src/vopr.zig +4 -4
- package/src/tigerbeetle/src/vsr/client.zig +5 -5
- package/src/tigerbeetle/src/vsr/clock.zig +2 -2
- package/src/tigerbeetle/src/vsr/journal.zig +647 -537
- package/src/tigerbeetle/src/vsr/replica.zig +333 -333
- package/src/tigerbeetle/src/vsr/replica_format.zig +7 -4
- package/src/tigerbeetle/src/vsr/superblock.zig +87 -39
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +11 -8
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +3 -3
- package/src/tigerbeetle/src/vsr.zig +60 -13
- package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
- package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
- package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
- package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
- package/src/tigerbeetle/src/c/tb_client.h +0 -221
- package/src/tigerbeetle/src/c/tb_client.zig +0 -177
- package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
- package/src/tigerbeetle/src/c/test.zig +0 -371
- package/src/tigerbeetle/src/cli.zig +0 -375
- package/src/tigerbeetle/src/main.zig +0 -245
package/src/tigerbeetle/src/vsr/journal.zig

@@ -46,8 +46,10 @@ const Ring = enum {
 };
 
 const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
+const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
 comptime {
     assert(headers_per_sector > 0);
+    assert(headers_per_message > 0);
 }
 
 /// A slot is an index within:
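A quick sanity check of the arithmetic behind the two `@divExact` constants above (a minimal sketch; the real values live in `constants.zig` and are configuration-dependent, and the 128-byte `Header` size is an assumption here):

```zig
const std = @import("std");

// Illustrative values only; the actual constants are configuration-dependent.
const sector_size = 4096; // bytes
const message_size_max = 1024 * 1024; // bytes
const header_size = 128; // assumed @sizeOf(Header)

test "headers per sector and per message" {
    // Mirrors the @divExact derivations in the diff above.
    try std.testing.expectEqual(@as(u64, 32), sector_size / header_size);
    try std.testing.expectEqual(@as(u64, 8192), message_size_max / header_size);
}
```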
@@ -60,7 +62,7 @@ comptime {
 /// - `journal.faulty`
 ///
 /// A header's slot is `header.op % constants.journal_slot_count`.
-const Slot = struct { index:
+const Slot = struct { index: usize };
 
 /// An inclusive, non-empty range of slots.
 const SlotRange = struct {
@@ -74,26 +76,20 @@ const SlotRange = struct {
     /// * `head < tail` → ` head··tail `
     /// * `head > tail` → `··tail head··` (The range wraps around).
     /// * `head = tail` → panic (Caller must handle this case separately).
-    fn contains(
+    fn contains(range: *const SlotRange, slot: Slot) bool {
         // To avoid confusion, the empty range must be checked separately by the caller.
-        assert(
+        assert(range.head.index != range.tail.index);
 
-        if (
-            return
+        if (range.head.index < range.tail.index) {
+            return range.head.index <= slot.index and slot.index <= range.tail.index;
         }
-        if (
-            return slot.index <=
+        if (range.head.index > range.tail.index) {
+            return slot.index <= range.tail.index or range.head.index <= slot.index;
         }
         unreachable;
     }
 };
 
-const Status = enum {
-    init,
-    recovering,
-    recovered,
-};
-
 const slot_count = constants.journal_slot_count;
 const headers_size = constants.journal_size_headers;
 const prepares_size = constants.journal_size_prepares;
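The rewritten `contains` spells out the wrap-around semantics documented in the comments above it. A standalone sketch of the same predicate, using bare indices in place of the project's `Slot` type:

```zig
const std = @import("std");
const assert = std.debug.assert;

// Same containment test as SlotRange.contains, on plain indices
// (indices are ops modulo the slot count).
fn contains(head: usize, tail: usize, slot: usize) bool {
    assert(head != tail); // The empty range is the caller's problem.
    if (head < tail) return head <= slot and slot <= tail; // head··tail
    return slot <= tail or head <= slot; // ··tail head·· (wraps around)
}

test "wrap-around slot range" {
    // Plain range 2..5 contains 3 but not 6.
    try std.testing.expect(contains(2, 5, 3));
    try std.testing.expect(!contains(2, 5, 6));
    // Wrapped range 6..1, i.e. {6, 7, 0, 1}, contains 7 and 0 but not 3.
    try std.testing.expect(contains(6, 1, 7));
    try std.testing.expect(contains(6, 1, 0));
    try std.testing.expect(!contains(6, 1, 3));
}
```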
@@ -117,14 +113,21 @@ comptime {
     assert(prepares_size % constants.message_size_max == 0);
 }
 
-pub fn
+pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
     return struct {
-        const
+        const Journal = @This();
+        const Sector = *align(constants.sector_size) [constants.sector_size]u8;
+
+        const Status = union(enum) {
+            init: void,
+            recovering: fn (journal: *Journal) void,
+            recovered: void,
+        };
 
         pub const Read = struct {
-
+            journal: *Journal,
             completion: Storage.Read,
-            callback: fn (
+            callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
 
             message: *Message,
             op: u64,
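The `Status` type moves inside the generic struct and becomes a tagged union, so the `.recovering` state carries its own completion callback rather than storing it in a separate field that could disagree with the status. A minimal self-contained sketch of that pattern, with illustrative names (not the real `Journal`, and using the bare `fn` field types of the Zig version this code targets):

```zig
const std = @import("std");

const Demo = struct {
    // The in-flight state owns its continuation.
    const Status = union(enum) {
        init: void,
        recovering: fn (demo: *Demo) void,
        recovered: void,
    };

    status: Status = .init,

    fn recover(demo: *Demo, callback: fn (demo: *Demo) void) void {
        demo.status = .{ .recovering = callback };
        // ...asynchronous work would happen here; we complete synchronously.
        demo.recovery_done();
    }

    fn recovery_done(demo: *Demo) void {
        const callback = demo.status.recovering;
        demo.status = .recovered;
        callback(demo);
    }
};

test "status carries its continuation" {
    var demo = Demo{};
    demo.recover(struct {
        fn done(d: *Demo) void {
            std.debug.assert(d.status == .recovered);
        }
    }.done);
}
```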
@@ -133,10 +136,10 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         };
 
         pub const Write = struct {
-            pub const Trigger = enum { append, repair, pipeline };
+            pub const Trigger = enum { append, fix, repair, pipeline };
 
-
-            callback: fn (
+            journal: *Journal,
+            callback: fn (replica: *Replica, wrote: ?*Message, trigger: Trigger) void,
 
             message: *Message,
             trigger: Trigger,
@@ -150,26 +153,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
             /// This is reset to undefined and reused for each Storage.write_sectors() call.
             range: Range,
-
-            const Sector = *align(constants.sector_size) [constants.sector_size]u8;
-
-            fn header_sector(write: *Self.Write, journal: *Self) Sector {
-                assert(journal.writes.items.len == journal.headers_iops.len);
-                const i = @divExact(
-                    @ptrToInt(write) - @ptrToInt(&journal.writes.items),
-                    @sizeOf(Self.Write),
-                );
-                // TODO The compiler should not need this align cast as the type of `headers_iops`
-                // ensures that each buffer is properly aligned.
-                return @alignCast(constants.sector_size, &journal.headers_iops[i]);
-            }
         };
 
         /// State that needs to be persisted while waiting for an overlapping
         /// concurrent write to complete. This is a range on the physical disk.
         const Range = struct {
             completion: Storage.Write,
-            callback: fn (write: *
+            callback: fn (write: *Journal.Write) void,
             buffer: []const u8,
             ring: Ring,
             /// Offset within the ring.
@@ -181,17 +171,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             /// True if a Storage.write_sectors() operation is in progress for this buffer/offset.
             locked: bool,
 
-            fn overlaps(
-                if (
+            fn overlaps(journal: *const Range, other: *const Range) bool {
+                if (journal.ring != other.ring) return false;
 
-                if (
-                    return
+                if (journal.offset < other.offset) {
+                    return journal.offset + journal.buffer.len > other.offset;
                 } else {
-                    return other.offset + other.buffer.len >
+                    return other.offset + other.buffer.len > journal.offset;
                 }
             }
         };
 
+        const HeaderChunks = std.StaticBitSet(util.div_ceil(slot_count, headers_per_message));
+
         storage: *Storage,
         replica: u8,
 
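`HeaderChunks` allocates one bit per message-sized chunk of the redundant-header ring, rounding up via `util.div_ceil` so a partial final chunk still gets a bit. A sketch with assumed, illustrative counts (the real ones come from `constants.zig`):

```zig
const std = @import("std");

// Plausible stand-in for util.div_ceil.
fn div_ceil(numerator: usize, denominator: usize) usize {
    return (numerator + denominator - 1) / denominator;
}

test "one bit per message-sized chunk of the header ring" {
    // Illustrative numbers: 8192 headers fit in one 1 MiB message.
    try std.testing.expectEqual(@as(usize, 1), div_ceil(1024, 8192)); // fewer slots than one chunk
    try std.testing.expectEqual(@as(usize, 2), div_ceil(16384, 8192)); // exact multiple
    try std.testing.expectEqual(@as(usize, 3), div_ceil(16385, 8192)); // remainder rounds up

    // std.StaticBitSet sizes the set at comptime from this count.
    const HeaderChunks = std.StaticBitSet(div_ceil(16384, 8192));
    var chunks = HeaderChunks.initFull();
    try std.testing.expectEqual(@as(usize, 2), chunks.count());
}
```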
@@ -224,6 +216,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// The buffers belong to the IOP at the corresponding index in IOPS.
         headers_iops: *align(constants.sector_size) [constants.journal_iops_write_max][constants.sector_size]u8,
 
+        /// A set bit indicates a chunk of redundant headers that no read has been issued to yet.
+        header_chunks_requested: HeaderChunks = HeaderChunks.initFull(),
+        /// A set bit indicates a chunk of redundant headers that has been recovered.
+        header_chunks_recovered: HeaderChunks = HeaderChunks.initEmpty(),
+
         /// Statically allocated read IO operation context data.
         reads: IOPS(Read, constants.journal_iops_read_max) = .{},
 
@@ -262,7 +259,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         status: Status = .init,
 
-        pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !
+        pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Journal {
            // TODO Fix this assertion:
            // assert(write_ahead_log_zone_size <= storage.size);
 
@@ -284,13 +281,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             errdefer allocator.free(headers_redundant);
             for (headers_redundant) |*header| header.* = undefined;
 
-            var dirty = try BitSet.
+            var dirty = try BitSet.init_full(allocator, slot_count);
             errdefer dirty.deinit(allocator);
-            for (headers) |_, index| dirty.set(Slot{ .index = index });
 
-            var faulty = try BitSet.
+            var faulty = try BitSet.init_full(allocator, slot_count);
             errdefer faulty.deinit(allocator);
-            for (headers) |_, index| faulty.set(Slot{ .index = index });
 
             var prepare_checksums = try allocator.alloc(u128, slot_count);
             errdefer allocator.free(prepare_checksums);
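`BitSet.init_full` replaces the per-slot initialization loops above. A sketch of a counted bit set with that constructor (an illustration of the pattern, not the project's actual `BitSet`; note the `std.DynamicBitSetUnmanaged` argument order follows recent Zig releases and has changed across versions):

```zig
const std = @import("std");
const Allocator = std.mem.Allocator;

// A dynamic bit set that also tracks its population count, constructed with
// every bit set so callers need no initialization loop.
const BitSet = struct {
    bits: std.DynamicBitSetUnmanaged,
    count: usize,

    fn init_full(allocator: Allocator, bit_count: usize) !BitSet {
        return BitSet{
            .bits = try std.DynamicBitSetUnmanaged.initFull(allocator, bit_count),
            .count = bit_count,
        };
    }

    fn deinit(set: *BitSet, allocator: Allocator) void {
        set.bits.deinit(allocator);
    }

    fn clear(set: *BitSet, index: usize) void {
        if (set.bits.isSet(index)) {
            set.bits.unset(index);
            set.count -= 1;
        }
    }
};

test "init_full starts with every bit set" {
    var dirty = try BitSet.init_full(std.testing.allocator, 64);
    defer dirty.deinit(std.testing.allocator);
    try std.testing.expectEqual(@as(usize, 64), dirty.count);
    dirty.clear(3);
    try std.testing.expectEqual(@as(usize, 63), dirty.count);
}
```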
@@ -316,7 +311,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 std.fmt.fmtIntSizeBin(prepares_size),
             });
 
-            var
+            var journal = Journal{
                 .storage = storage,
                 .replica = replica,
                 .headers = headers,
@@ -328,37 +323,37 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 .headers_iops = headers_iops,
             };
 
-            assert(@mod(@ptrToInt(&
-            assert(
-            assert(
-            assert(
-            assert(
-            assert(
-            assert(
+            assert(@mod(@ptrToInt(&journal.headers[0]), constants.sector_size) == 0);
+            assert(journal.dirty.bits.bit_length == slot_count);
+            assert(journal.faulty.bits.bit_length == slot_count);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.prepare_checksums.len == slot_count);
+            assert(journal.prepare_inhabited.len == slot_count);
 
-            for (
-            for (
+            for (journal.headers) |*h| assert(!h.valid_checksum());
+            for (journal.headers_redundant) |*h| assert(!h.valid_checksum());
 
-            return
+            return journal;
         }
 
-        pub fn deinit(
-            const replica = @fieldParentPtr(Replica, "journal",
+        pub fn deinit(journal: *Journal, allocator: Allocator) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-
-
-            allocator.free(
-            allocator.free(
-            allocator.free(
-            allocator.free(
-            allocator.free(
+            journal.dirty.deinit(allocator);
+            journal.faulty.deinit(allocator);
+            allocator.free(journal.headers);
+            allocator.free(journal.headers_redundant);
+            allocator.free(journal.headers_iops);
+            allocator.free(journal.prepare_checksums);
+            allocator.free(journal.prepare_inhabited);
 
             {
-                var it =
+                var it = journal.reads.iterate();
                 while (it.next()) |read| replica.message_bus.unref(read.message);
             }
             {
-                var it =
+                var it = journal.writes.iterate();
                 while (it.next()) |write| replica.message_bus.unref(write.message);
             }
         }
@@ -369,79 +364,78 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         ///
         /// Called by the replica immediately after WAL recovery completes, but before the replica
         /// issues any I/O from handling messages.
-        pub fn is_empty(
-            assert(
-            assert(
+        pub fn is_empty(journal: *const Journal) bool {
+            assert(journal.status == .recovered);
+            assert(journal.writes.executing() == 0);
 
-            if (!
-            if (
+            if (!journal.headers[0].valid_checksum()) return false;
+            if (journal.headers[0].operation != .root) return false;
 
-            const replica = @fieldParentPtr(Replica, "journal",
-            assert(
-            assert(
-            assert(
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
+            assert(journal.headers[0].checksum == journal.prepare_checksums[0]);
+            assert(journal.prepare_inhabited[0]);
 
             // If any message is faulty, we must fall back to VSR recovery protocol (i.e. treat
             // this as a non-empty WAL) since that message may have been a prepare.
-            if (
+            if (journal.faulty.count > 0) return false;
 
-            for (
+            for (journal.headers[1..]) |*header| {
                 if (header.command == .prepare) return false;
             }
 
-            for (
+            for (journal.prepare_inhabited[1..]) |inhabited| {
                 if (inhabited) return false;
             }
 
             return true;
         }
 
-        pub fn slot_for_op(_: *const
+        pub fn slot_for_op(_: *const Journal, op: u64) Slot {
             return Slot{ .index = op % slot_count };
         }
 
-        pub fn slot_with_op(
-            if (
-                return
+        pub fn slot_with_op(journal: *const Journal, op: u64) ?Slot {
+            if (journal.header_with_op(op)) |_| {
+                return journal.slot_for_op(op);
             } else {
                 return null;
             }
         }
 
-        pub fn slot_with_op_and_checksum(
-            if (
-                return
+        pub fn slot_with_op_and_checksum(journal: *const Journal, op: u64, checksum: u128) ?Slot {
+            if (journal.header_with_op_and_checksum(op, checksum)) |_| {
+                return journal.slot_for_op(op);
             } else {
                 return null;
             }
         }
 
-        pub fn slot_for_header(
+        pub fn slot_for_header(journal: *const Journal, header: *const Header) Slot {
             assert(header.command == .prepare);
-            return
+            return journal.slot_for_op(header.op);
         }
 
-        pub fn slot_with_header(
+        pub fn slot_with_header(journal: *const Journal, header: *const Header) ?Slot {
             assert(header.command == .prepare);
-            return
+            return journal.slot_with_op(header.op);
         }
 
         /// Returns any existing header at the location indicated by header.op.
         /// The existing header may have an older or newer op number.
-        pub fn header_for_prepare(
+        pub fn header_for_prepare(journal: *const Journal, header: *const Header) ?*const Header {
             assert(header.command == .prepare);
-            return
+            return journal.header_for_op(header.op);
         }
 
         /// We use `op` directly to index into the headers array and locate ops without a scan.
         /// The existing header may have an older or newer op number.
-        pub fn header_for_op(
-
-            const
-            const existing = &self.headers[slot.index];
+        pub fn header_for_op(journal: *const Journal, op: u64) ?*const Header {
+            const slot = journal.slot_for_op(op);
+            const existing = &journal.headers[slot.index];
             switch (existing.command) {
                 .prepare => {
-                    assert(
+                    assert(journal.slot_for_op(existing.op).index == slot.index);
                     return existing;
                 },
                 .reserved => {
@@ -454,8 +448,8 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// Returns the entry at `@mod(op)` location, but only if `entry.op == op`, else `null`.
         /// Be careful of using this without considering that there may still be an existing op.
-        pub fn header_with_op(
-            if (
+        pub fn header_with_op(journal: *const Journal, op: u64) ?*const Header {
+            if (journal.header_for_op(op)) |existing| {
                 if (existing.op == op) return existing;
             }
             return null;
@@ -463,37 +457,35 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// As per `header_with_op()`, but only if there is an optional checksum match.
         pub fn header_with_op_and_checksum(
-
+            journal: *const Journal,
             op: u64,
             checksum: ?u128,
         ) ?*const Header {
-            if (
+            if (journal.header_with_op(op)) |existing| {
                 assert(existing.op == op);
                 if (checksum == null or existing.checksum == checksum.?) return existing;
             }
             return null;
         }
 
-
-        // op_checkpoint?
-        pub fn previous_entry(self: *const Self, header: *const Header) ?*const Header {
+        pub fn previous_entry(journal: *const Journal, header: *const Header) ?*const Header {
             if (header.op == 0) {
                 return null;
             } else {
-                return
+                return journal.header_for_op(header.op - 1);
             }
         }
 
-        pub fn next_entry(
-            return
+        pub fn next_entry(journal: *const Journal, header: *const Header) ?*const Header {
+            return journal.header_for_op(header.op + 1);
        }
 
         /// Returns the highest op number prepared, in any slot without reference to the checkpoint.
-        pub fn op_maximum(
-            assert(
+        pub fn op_maximum(journal: *const Journal) u64 {
+            assert(journal.status == .recovered);
 
             var op: u64 = 0;
-            for (
+            for (journal.headers) |*header| {
                 if (header.command == .prepare) {
                     if (header.op > op) op = header.op;
                 } else {
@@ -519,12 +511,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             return op;
         }
 
-        pub fn has(
-            assert(
+        pub fn has(journal: *const Journal, header: *const Header) bool {
+            assert(journal.status == .recovered);
             assert(header.command == .prepare);
 
-            const slot =
-            const existing = &
+            const slot = journal.slot_for_op(header.op);
+            const existing = &journal.headers[slot.index];
             if (existing.command == .reserved) {
                 return false;
             } else {
@@ -538,19 +530,19 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             }
         }
 
-        pub fn has_clean(
-            if (
-                if (!
-                    assert(
-                    assert(
+        pub fn has_clean(journal: *const Journal, header: *const Header) bool {
+            if (journal.slot_with_op_and_checksum(header.op, header.checksum)) |slot| {
+                if (!journal.dirty.bit(slot)) {
+                    assert(journal.prepare_inhabited[slot.index]);
+                    assert(journal.prepare_checksums[slot.index] == header.checksum);
                     return true;
                 }
             }
             return false;
         }
 
-        pub fn has_dirty(
-            return
+        pub fn has_dirty(journal: *const Journal, header: *const Header) bool {
+            return journal.has(header) and journal.dirty.bit(journal.slot_with_header(header).?);
        }
 
         /// Copies latest headers between `op_min` and `op_max` (both inclusive) as fit in `dest`.
@@ -561,12 +553,12 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// Zeroes the `dest` buffer in case the copy would underflow and leave a buffer bleed.
         /// Returns the number of headers actually copied.
         pub fn copy_latest_headers_between(
-
+            journal: *const Journal,
             op_min: u64,
             op_max: u64,
             dest: []Header,
         ) usize {
-            assert(
+            assert(journal.status == .recovered);
             assert(op_min <= op_max);
             assert(dest.len > 0);
 
@@ -579,7 +571,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             while (op > op_min) {
                 op -= 1;
 
-                if (
+                if (journal.header_with_op(op)) |header| {
                     dest[copied] = header.*;
                     assert(dest[copied].invalid() == null);
                     copied += 1;
@@ -590,7 +582,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             log.debug(
                 "{}: copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}",
                 .{
-
+                    journal.replica,
                     op_min,
                     op_max,
                     dest.len,
@@ -616,7 +608,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// Another example: If op 17 is disconnected from op 18, 16 is connected to 17, and 12-15
         /// are missing, returns: `{ .op_min = 12, .op_max = 17 }`.
         pub fn find_latest_headers_break_between(
-
+            journal: *const Journal,
             op_min: u64,
             op_max: u64,
         ) ?HeaderRange {
@@ -632,7 +624,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 op -= 1;
 
                 // Get the entry at @mod(op) location, but only if entry.op == op, else null:
-                var A =
+                var A = journal.header_with_op(op);
                 if (A) |a| {
                     if (B) |b| {
                         // If A was reordered then A may have a newer op than B (but an older view).
@@ -719,51 +711,51 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
         /// Read a prepare from disk. There must be a matching in-memory header.
         pub fn read_prepare(
-
+            journal: *Journal,
             callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
             op: u64,
             checksum: u128,
             destination_replica: ?u8,
         ) void {
-            assert(
+            assert(journal.status == .recovered);
             assert(checksum != 0);
 
-            const replica = @fieldParentPtr(Replica, "journal",
+            const replica = @fieldParentPtr(Replica, "journal", journal);
             if (op > replica.op) {
-
+                journal.read_prepare_log(op, checksum, "beyond replica.op");
                 callback(replica, null, null);
                 return;
             }
 
-            const slot =
-
+            const slot = journal.slot_with_op_and_checksum(op, checksum) orelse {
+                journal.read_prepare_log(op, checksum, "no entry exactly");
                 callback(replica, null, null);
                 return;
             };
 
-            if (
-
+            if (journal.prepare_inhabited[slot.index] and
+                journal.prepare_checksums[slot.index] == checksum)
             {
-
+                journal.read_prepare_with_op_and_checksum(callback, op, checksum, destination_replica);
             } else {
-
+                journal.read_prepare_log(op, checksum, "no matching prepare");
                 callback(replica, null, null);
             }
         }
 
         /// Read a prepare from disk. There may or may not be an in-memory header.
         pub fn read_prepare_with_op_and_checksum(
-
+            journal: *Journal,
             callback: fn (replica: *Replica, prepare: ?*Message, destination_replica: ?u8) void,
             op: u64,
             checksum: u128,
             destination_replica: ?u8,
         ) void {
-            const replica = @fieldParentPtr(Replica, "journal",
-            const slot =
-            assert(
-            assert(
-            assert(
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            const slot = journal.slot_for_op(op);
+            assert(journal.status == .recovered);
+            assert(journal.prepare_inhabited[slot.index]);
+            assert(journal.prepare_checksums[slot.index] == checksum);
 
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
@@ -771,7 +763,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             var message_size: usize = constants.message_size_max;
 
             // If the header is in-memory, we can skip the read from the disk.
-            if (
+            if (journal.header_with_op_and_checksum(op, checksum)) |exact| {
                 if (exact.size == @sizeOf(Header)) {
                     message.header.* = exact.*;
                     // Normally the message's padding would have been zeroed by the MessageBus,
@@ -787,14 +779,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 }
             }
 
-            const read =
-
+            const read = journal.reads.acquire() orelse {
+                journal.read_prepare_log(op, checksum, "waiting for IOP");
                 callback(replica, null, null);
                 return;
             };
 
             read.* = .{
-                .
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = callback,
@@ -805,11 +797,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
 
             const buffer: []u8 = message.buffer[0..message_size];
 
-            // Memory must not be owned by `
-            assert(@ptrToInt(buffer.ptr) < @ptrToInt(
-                @ptrToInt(buffer.ptr) > @ptrToInt(
+            // Memory must not be owned by `journal.headers` as these may be modified concurrently:
+            assert(@ptrToInt(buffer.ptr) < @ptrToInt(journal.headers.ptr) or
+                @ptrToInt(buffer.ptr) > @ptrToInt(journal.headers.ptr) + headers_size);
 
-
+            journal.storage.read_sectors(
                 read_prepare_with_op_and_checksum_callback,
                 &read.completion,
                 buffer,
@@ -819,28 +811,28 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         }
 
         fn read_prepare_with_op_and_checksum_callback(completion: *Storage.Read) void {
-            const read = @fieldParentPtr(
-            const
-            const replica = @fieldParentPtr(Replica, "journal",
+            const read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
             const op = read.op;
             const checksum = read.checksum;
-            assert(
+            assert(journal.status == .recovered);
 
             defer {
                 replica.message_bus.unref(read.message);
-
+                journal.reads.release(read);
             }
 
             if (op > replica.op) {
-
+                journal.read_prepare_log(op, checksum, "beyond replica.op");
                 read.callback(replica, null, null);
                 return;
             }
 
-            const checksum_inhabited =
-            const checksum_match =
+            const checksum_inhabited = journal.prepare_inhabited[journal.slot_for_op(op).index];
+            const checksum_match = journal.prepare_checksums[journal.slot_for_op(op).index] == checksum;
             if (!checksum_inhabited or !checksum_match) {
-
+                journal.read_prepare_log(op, checksum, "prepare changed during read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -849,15 +841,15 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             // The slot may not match the Read's op/checksum due to either:
             // * The in-memory header changed since the read began.
             // * The in-memory header is reserved+faulty; the read was via `prepare_checksums`
-            const slot =
+            const slot = journal.slot_with_op_and_checksum(op, checksum);
 
             if (!read.message.header.valid_checksum()) {
                 if (slot) |s| {
-
-
+                    journal.faulty.set(s);
+                    journal.dirty.set(s);
                 }
 
-
+                journal.read_prepare_log(op, checksum, "corrupt header after read");
                 read.callback(replica, null, null);
                 return;
             }
@@ -868,11 +860,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             // Though when a prepare spans multiple sectors, a misdirected read/write will
             // likely manifest as a checksum failure instead.
             if (slot) |s| {
-
-
+                journal.faulty.set(s);
+                journal.dirty.set(s);
             }
 
-
+            journal.read_prepare_log(op, checksum, "wrong cluster");
             read.callback(replica, null, null);
             return;
         }
@@ -882,7 +874,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             // * The prepare was rewritten since the read began.
             // * Misdirected read/write.
             // * The combination of:
-            //   * The
+            //   * The primary is responding to a `request_prepare`.
             //   * The `request_prepare` did not include a checksum.
             //   * The requested op's slot is faulty, but the prepare is valid. Since the
             //     prepare is valid, WAL recovery set `prepare_checksums[slot]`. But on reading
@@ -891,7 +883,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             //     the op along with the checksum in `prepare_checksums`.)
             assert(slot == null);
 
-
+            journal.read_prepare_log(op, checksum, "op changed during read");
             read.callback(replica, null, null);
             return;
         }
@@ -900,18 +892,18 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             // This can also be caused by a misdirected read/write.
             assert(slot == null);
 
-
+            journal.read_prepare_log(op, checksum, "checksum changed during read");
             read.callback(replica, null, null);
             return;
         }
 
         if (!read.message.header.valid_checksum_body(read.message.body())) {
             if (slot) |s| {
-
-
+                journal.faulty.set(s);
+                journal.dirty.set(s);
            }
 
-
+            journal.read_prepare_log(op, checksum, "corrupt body after read");
             read.callback(replica, null, null);
             return;
         }
@@ -919,68 +911,80 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             read.callback(replica, read.message, read.destination_replica);
         }
 
-        fn read_prepare_log(
+        fn read_prepare_log(journal: *Journal, op: u64, checksum: ?u128, notice: []const u8) void {
             log.info(
                 "{}: read_prepare: op={} checksum={}: {s}",
-                .{
+                .{ journal.replica, op, checksum, notice },
             );
         }
 
-        pub fn recover(
-            assert(
-            assert(
-            assert(
+        pub fn recover(journal: *Journal, callback: fn (journal: *Journal) void) void {
+            assert(journal.status == .init);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+            assert(journal.header_chunks_requested.count() == HeaderChunks.bit_length);
+            assert(journal.header_chunks_recovered.count() == 0);
 
-
+            journal.status = .{ .recovering = callback };
+            log.debug("{}: recover: recovering", .{journal.replica});
 
-
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_headers();
 
-
+            assert(journal.header_chunks_recovered.count() == 0);
+            assert(journal.header_chunks_requested.count() ==
+                HeaderChunks.bit_length - journal.reads.executing());
         }
 
-        fn recover_headers(
-            const replica = @fieldParentPtr(Replica, "journal",
-
-            assert(
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
+        fn recover_headers(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(journal.reads.available() > 0);
 
-            if (
-
-
+            if (journal.header_chunks_recovered.count() == HeaderChunks.bit_length) {
+                assert(journal.header_chunks_requested.count() == 0);
+                log.debug("{}: recover_headers: complete", .{journal.replica});
+                journal.recover_prepares();
                 return;
             }
-
+
+            const chunk_index = journal.header_chunks_requested.findFirstSet() orelse return;
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
 
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
-
-
-
-            const read = self.reads.acquire() orelse unreachable;
-            read.* = .{
-                .self = self,
+            const chunk_read = journal.reads.acquire() orelse unreachable;
+            chunk_read.* = .{
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op =
-                .checksum =
+                .op = chunk_index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
 
+            const offset = constants.message_size_max * chunk_index;
+            assert(offset < headers_size);
+
             const buffer = recover_headers_buffer(message, offset);
             assert(buffer.len > 0);
+            assert(buffer.len <= constants.message_size_max);
+            assert(buffer.len + offset <= headers_size);
 
             log.debug("{}: recover_headers: offset={} size={} recovering", .{
-
+                journal.replica,
                 offset,
                 buffer.len,
             });
 
-
+            journal.header_chunks_requested.unset(chunk_index);
+            journal.storage.read_sectors(
                 recover_headers_callback,
-                &
+                &chunk_read.completion,
                 buffer,
                 .wal_headers,
                 offset,
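`recover()` now seeds one `recover_headers()` call per available read IOP, and the two chunk bitsets guarantee each chunk is requested and recovered exactly once even when completions re-enter the scheduler. A toy model of that loop, with synchronous "reads" standing in for `Storage` I/O and an illustrative chunk count:

```zig
const std = @import("std");
const assert = std.debug.assert;

// `requested` starts full (no chunk issued yet), `recovered` starts empty;
// each chunk's bit moves from one set to the other exactly once.
const chunk_count = 4;
const Chunks = std.StaticBitSet(chunk_count);

var requested = Chunks.initFull();
var recovered = Chunks.initEmpty();

fn recover_chunk() void {
    if (recovered.count() == chunk_count) return; // All chunks recovered.
    const chunk = requested.findFirstSet() orelse return; // All remaining reads in flight.
    requested.unset(chunk);
    // A real implementation issues an asynchronous read here; the callback
    // below runs on completion.
    recover_chunk_callback(chunk);
}

fn recover_chunk_callback(chunk: usize) void {
    assert(!requested.isSet(chunk));
    recovered.set(chunk);
    recover_chunk(); // Keep the pipeline full.
}

test "every chunk is requested and recovered exactly once" {
    // Seed the pipeline as recover() does, once per available read IOP.
    recover_chunk();
    recover_chunk();
    try std.testing.expectEqual(@as(usize, chunk_count), recovered.count());
    try std.testing.expectEqual(@as(usize, 0), requested.count());
}
```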
@@ -988,90 +992,116 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         }
 
         fn recover_headers_callback(completion: *Storage.Read) void {
-            const
-            const
-            const replica = @fieldParentPtr(Replica, "journal",
-
-
-
-            const
+            const chunk_read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = chunk_read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(chunk_read.destination_replica == null);
+
+            const chunk_index = chunk_read.op;
+            assert(!journal.header_chunks_requested.isSet(chunk_index));
+            assert(!journal.header_chunks_recovered.isSet(chunk_index));
+
+            const chunk_buffer = recover_headers_buffer(
+                chunk_read.message,
+                chunk_index * constants.message_size_max,
+            );
+            assert(chunk_buffer.len >= @sizeOf(Header));
+            assert(chunk_buffer.len % @sizeOf(Header) == 0);
 
             log.debug("{}: recover_headers: offset={} size={} recovered", .{
-
-
-
+                journal.replica,
+                chunk_index * constants.message_size_max,
+                chunk_buffer.len,
             });
 
-
-            assert(offset % @sizeOf(Header) == 0);
-            assert(buffer.len >= @sizeOf(Header));
-            assert(buffer.len % @sizeOf(Header) == 0);
-            assert(read.destination_replica == null);
-            assert(self.dirty.count == slot_count);
-            assert(self.faulty.count == slot_count);
-
-            // Directly store all the redundant headers in `self.headers_redundant` (including any
+            // Directly store all the redundant headers in `journal.headers_redundant` (including any
             // that are invalid or corrupt). As the prepares are recovered, these will be replaced
             // or removed as necessary.
-            const
+            const chunk_headers = std.mem.bytesAsSlice(Header, chunk_buffer);
             util.copy_disjoint(
                 .exact,
                 Header,
-
-
+                journal.headers_redundant[chunk_index * headers_per_message ..][0..chunk_headers.len],
+                chunk_headers,
             );
 
-            const offset_next = offset + buffer.len;
             // We must release before we call `recover_headers()` in case Storage is synchronous.
             // Otherwise, we would run out of messages and reads.
-            replica.message_bus.unref(
-
+            replica.message_bus.unref(chunk_read.message);
+            journal.reads.release(chunk_read);
 
-
+            journal.header_chunks_recovered.set(chunk_index);
+            journal.recover_headers();
         }
 
         fn recover_headers_buffer(message: *Message, offset: u64) []align(@alignOf(Header)) u8 {
-            const max = std.math.min(
+            const max = std.math.min(constants.message_size_max, headers_size - offset);
             assert(max % constants.sector_size == 0);
             assert(max % @sizeOf(Header) == 0);
             return message.buffer[0..max];
         }
 
-
-
-
-
-
-
-
+        /// Recover the prepares ring. Reads are issued concurrently.
+        /// - `dirty` is initially full.
+        ///   Bits are cleared when a read is issued to the slot.
+        ///   All bits are set again before recover_slots() is called.
+        /// - `faulty` is initially full.
+        ///   Bits are cleared when the slot's read finishes.
+        ///   All bits are set again before recover_slots() is called.
+        /// - The prepare's headers are loaded into `journal.headers`.
+        fn recover_prepares(journal: *Journal) void {
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+
+            var available: usize = journal.reads.available();
+            while (available > 0) : (available -= 1) journal.recover_prepare();
+
+            assert(journal.writes.executing() == 0);
+            assert(journal.reads.executing() > 0);
+            assert(journal.reads.executing() + journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
+        }
 
-
-
-
+        fn recover_prepare(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
+            assert(journal.status == .recovering);
+            assert(journal.reads.available() > 0);
+            assert(journal.dirty.count <= journal.faulty.count);
+
+            if (journal.faulty.count == 0) {
+                for (journal.headers) |_, index| journal.dirty.set(Slot{ .index = index });
+                for (journal.headers) |_, index| journal.faulty.set(Slot{ .index = index });
+                return journal.recover_slots();
             }
-            assert(slot.index < slot_count);
 
+            const slot_index = journal.dirty.bits.findFirstSet() orelse return;
+            const slot = Slot{ .index = slot_index };
             const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
-            const read =
+            const read = journal.reads.acquire() orelse unreachable;
             read.* = .{
-                .
+                .journal = journal,
                 .completion = undefined,
                 .message = message.ref(),
                 .callback = undefined,
-                .op =
-                .checksum =
+                .op = slot.index,
+                .checksum = undefined,
                 .destination_replica = null,
             };
 
-            log.debug("{}:
-
+            log.debug("{}: recover_prepare: recovering slot={}", .{
+                journal.replica,
                 slot.index,
             });
 
-
-
+            journal.dirty.clear(slot);
+            journal.storage.read_sectors(
+                recover_prepare_callback,
                 &read.completion,
                 // We load the entire message to verify that it isn't torn or corrupt.
                 // We don't know the message's size, so use the entire buffer.
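The prepare phase reuses the `dirty`/`faulty` bitsets as a read pipeline: `dirty` is cleared when a slot's read is issued, `faulty` when it completes, so `faulty.count == 0` signals that every slot has been read and `recover_slots()` can run. A toy model under the same assumptions as the sketch above (synchronous reads, illustrative slot count):

```zig
const std = @import("std");

const slot_count = 8;
const Slots = std.StaticBitSet(slot_count);

var dirty = Slots.initFull(); // Slots whose read has not been issued.
var faulty = Slots.initFull(); // Slots whose read has not completed.

fn recover_prepare() void {
    if (faulty.count() == 0) return; // Every slot recovered: recover_slots() would run.
    const slot = dirty.findFirstSet() orelse return; // All remaining reads in flight.
    dirty.unset(slot); // Cleared when the read is issued...
    read_completed(slot);
}

fn read_completed(slot: usize) void {
    faulty.unset(slot); // ...and faulty cleared when it completes.
    recover_prepare();
}

test "faulty reaches zero only after every slot's read completes" {
    recover_prepare();
    try std.testing.expectEqual(@as(usize, 0), dirty.count());
    try std.testing.expectEqual(@as(usize, 0), faulty.count());
}
```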
@@ -1081,31 +1111,33 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             );
         }
 
-        fn
-            const read = @fieldParentPtr(
-            const
-            const replica = @fieldParentPtr(Replica, "journal",
+        fn recover_prepare_callback(completion: *Storage.Read) void {
+            const read = @fieldParentPtr(Journal.Read, "completion", completion);
+            const journal = read.journal;
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(
-            assert(
-            assert(self.faulty.count == slot_count);
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count <= journal.faulty.count);
             assert(read.destination_replica == null);
 
-            const slot = Slot{ .index = @intCast(u64, read.
+            const slot = Slot{ .index = @intCast(u64, read.op) };
             assert(slot.index < slot_count);
+            assert(!journal.dirty.bit(slot));
+            assert(journal.faulty.bit(slot));
 
             // Check `valid_checksum_body` here rather than in `recover_done` so that we don't need
             // to hold onto the whole message (just the header).
             if (read.message.header.valid_checksum() and
                 read.message.header.valid_checksum_body(read.message.body()))
             {
-
+                journal.headers[slot.index] = read.message.header.*;
             }
 
             replica.message_bus.unref(read.message);
-
+            journal.reads.release(read);
 
-
+            journal.faulty.clear(slot);
+            journal.recover_prepare();
         }
 
         /// When in doubt about whether a particular message was received, it must be marked as
@@ -1175,65 +1207,63 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         /// 2. has the correct cluster
         /// 3. is in the correct slot (op % slot_count)
         /// 4. has command=reserved or command=prepare
-        fn recover_slots(
-            const replica = @fieldParentPtr(Replica, "journal",
+        fn recover_slots(journal: *Journal) void {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(
-            assert(
-            assert(
-            assert(
-            assert(
+            assert(journal.status == .recovering);
+            assert(journal.reads.executing() == 0);
+            assert(journal.writes.executing() == 0);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
 
             const prepare_op_max = std.math.max(
                 replica.op_checkpoint,
-                op_maximum_headers_untrusted(replica.cluster,
+                op_maximum_headers_untrusted(replica.cluster, journal.headers),
             );
 
             var cases: [slot_count]*const Case = undefined;
 
-            for (
+            for (journal.headers) |_, index| {
                 const slot = Slot{ .index = index };
-                const header = header_ok(replica.cluster, slot, &
-                const prepare = header_ok(replica.cluster, slot, &
+                const header = header_ok(replica.cluster, slot, &journal.headers_redundant[index]);
+                const prepare = header_ok(replica.cluster, slot, &journal.headers[index]);
 
                 cases[index] = recovery_case(header, prepare, prepare_op_max);
 
                 // `prepare_checksums` improves the availability of `request_prepare` by being more
                 // flexible than `headers` regarding the prepares it references. It may hold a
-                // prepare whose redundant header is broken, as long as the prepare
+                // prepare whose redundant header is broken, as long as the prepare itself is valid.
                 if (prepare != null and prepare.?.command == .prepare) {
-                    assert(!
-
-
+                    assert(!journal.prepare_inhabited[index]);
+                    journal.prepare_inhabited[index] = true;
+                    journal.prepare_checksums[index] = prepare.?.checksum;
                 }
             }
-            assert(
+            assert(journal.headers.len == cases.len);
 
             // Refine cases @B and @C: Repair (truncate) a prepare if it was torn during a crash.
-            if (
+            if (journal.recover_torn_prepare(&cases)) |torn_slot| {
                 assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
                 cases[torn_slot.index] = &case_cut;
 
                 log.warn("{}: recover_slots: torn prepare in slot={}", .{
-
+                    journal.replica,
                     torn_slot.index,
                 });
             }
 
-            for (cases) |case, index|
+            for (cases) |case, index| journal.recover_slot(Slot{ .index = index }, case);
             assert(cases.len == slot_count);
 
-            util.copy_disjoint(.exact, Header,
+            util.copy_disjoint(.exact, Header, journal.headers_redundant, journal.headers);
 
             log.debug("{}: recover_slots: dirty={} faulty={}", .{
-
-
-
+                journal.replica,
+                journal.dirty.count,
+                journal.faulty.count,
             });
 
-
-            self.assert_recovered();
-            // From here it's over to the Recovery protocol from VRR 2012.
+            journal.recover_fix();
         }
 
         /// Returns a slot that is safe to truncate.
@@ -1246,27 +1276,27 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
         ///   - the prepare is corrupt, and
         /// * there are no faults except for those between `op_checkpoint` and `op_max + 1`,
         ///   so that we can be sure that the maximum valid op is in fact the maximum.
-        fn recover_torn_prepare(
-            const replica = @fieldParentPtr(Replica, "journal",
+        fn recover_torn_prepare(journal: *const Journal, cases: []const *const Case) ?Slot {
+            const replica = @fieldParentPtr(Replica, "journal", journal);
 
-            assert(
-            assert(
-            assert(
+            assert(journal.status == .recovering);
+            assert(journal.dirty.count == slot_count);
+            assert(journal.faulty.count == slot_count);
 
-            const op_max = op_maximum_headers_untrusted(replica.cluster,
-            if (op_max != op_maximum_headers_untrusted(replica.cluster,
+            const op_max = op_maximum_headers_untrusted(replica.cluster, journal.headers_redundant);
+            if (op_max != op_maximum_headers_untrusted(replica.cluster, journal.headers)) return null;
             if (op_max < replica.op_checkpoint) return null;
             // We can't assume that the header at `op_max` is a prepare — an empty journal with a
             // corrupt root prepare (op_max=0) will be repaired later.
 
             const torn_op = op_max + 1;
-            const torn_slot =
+            const torn_slot = journal.slot_for_op(torn_op);
 
-            const torn_prepare_untrusted = &
+            const torn_prepare_untrusted = &journal.headers[torn_slot.index];
             if (torn_prepare_untrusted.valid_checksum()) return null;
             // The prepare is at least corrupt, possibly torn, but not valid and simply misdirected.
 
-            const header_untrusted = &
+            const header_untrusted = &journal.headers_redundant[torn_slot.index];
             const header = header_ok(replica.cluster, torn_slot, header_untrusted) orelse return null;
             // The redundant header is valid, also for the correct cluster and not misdirected.
 
@@ -1286,7 +1316,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
                 // unless the prepare header was lost, in which case this slot may also not be torn.
             }
 
-            const checkpoint_index =
+            const checkpoint_index = journal.slot_for_op(replica.op_checkpoint).index;
             const known_range = SlotRange{
                 .head = Slot{ .index = checkpoint_index },
                 .tail = torn_slot,
@@ -1304,7 +1334,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1304
1334
|
// truncate).
|
|
1305
1335
|
//
|
|
1306
1336
|
// When the checkpoint and torn op are in the same slot, then we can only be certain
|
|
1307
|
-
// if there are no faults other than the torn op
|
|
1337
|
+
// if there are no faults other than the torn op itjournal.
|
|
1308
1338
|
for (cases) |case, index| {
|
|
1309
1339
|
// Do not use `faulty.bit()` because the decisions have not been processed yet.
|
|
1310
1340
|
if (case.decision(replica.replica_count) == .vsr) {
|
|
@@ -1319,81 +1349,78 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
|
|
|
1319
1349
|
}
|
|
1320
1350
|
|
|
1321
1351
|
// The prepare is torn.
|
|
1322
|
-
assert(!
|
|
1352
|
+
assert(!journal.prepare_inhabited[torn_slot.index]);
|
|
1323
1353
|
assert(!torn_prepare_untrusted.valid_checksum());
|
|
1324
1354
|
assert(cases[torn_slot.index].decision(replica.replica_count) == .vsr);
|
|
1325
1355
|
return torn_slot;
|
|
1326
1356
|
}
|
|
1327
1357
|
|
|
1328      -      fn recover_slot(
1329      -          const replica = @fieldParentPtr(Replica, "journal",
     1358 +      fn recover_slot(journal: *Journal, slot: Slot, case: *const Case) void {
     1359 +          const replica = @fieldParentPtr(Replica, "journal", journal);
1330 1360            const cluster = replica.cluster;
1331 1361
1332      -          assert(
1333      -          assert(
1334      -          assert(
     1362 +          assert(journal.status == .recovering);
     1363 +          assert(journal.dirty.bit(slot));
     1364 +          assert(journal.faulty.bit(slot));
1335 1365
1336      -          const header = header_ok(cluster, slot, &
1337      -          const prepare = header_ok(cluster, slot, &
     1366 +          const header = header_ok(cluster, slot, &journal.headers_redundant[slot.index]);
     1367 +          const prepare = header_ok(cluster, slot, &journal.headers[slot.index]);
1338 1368            const decision = case.decision(replica.replica_count);
1339 1369            switch (decision) {
1340 1370                .eql => {
1341 1371                    assert(header.?.command == .prepare);
1342 1372                    assert(prepare.?.command == .prepare);
1343 1373                    assert(header.?.checksum == prepare.?.checksum);
1344      -                  assert(
1345      -                  assert(
1346      -
1347      -
1348      -
     1374 +                  assert(journal.prepare_inhabited[slot.index]);
     1375 +                  assert(journal.prepare_checksums[slot.index] == prepare.?.checksum);
     1376 +                  journal.headers[slot.index] = header.?.*;
     1377 +                  journal.dirty.clear(slot);
     1378 +                  journal.faulty.clear(slot);
1349 1379                },
1350 1380                .nil => {
1351 1381                    assert(header.?.command == .reserved);
1352 1382                    assert(prepare.?.command == .reserved);
1353 1383                    assert(header.?.checksum == prepare.?.checksum);
1354 1384                    assert(header.?.checksum == Header.reserved(cluster, slot.index).checksum);
1355      -                  assert(!
1356      -                  assert(
1357      -
1358      -
1359      -
     1385 +                  assert(!journal.prepare_inhabited[slot.index]);
     1386 +                  assert(journal.prepare_checksums[slot.index] == 0);
     1387 +                  journal.headers[slot.index] = header.?.*;
     1388 +                  journal.dirty.clear(slot);
     1389 +                  journal.faulty.clear(slot);
1360 1390                },
1361 1391                .fix => {
1362      -
1363      -
     1392 +                  journal.headers[slot.index] = prepare.?.*;
     1393 +                  journal.faulty.clear(slot);
     1394 +                  assert(journal.dirty.bit(slot));
1364 1395                    if (replica.replica_count == 1) {
1365      -                      // @D, @E, @F, @G, @H, @K
1366      -                      self.dirty.clear(slot);
1367      -                      // TODO Repair header on disk to restore durability.
     1396 +                      // @D, @E, @F, @G, @H, @K
1368 1397                    } else {
1369 1398                        assert(prepare.?.command == .prepare);
1370      -                      assert(
1371      -                      assert(
1372      -                      // @F, @H, @K
1373      -                      // TODO Repair without retrieving remotely (i.e. don't set dirty or faulty).
1374      -                      assert(self.dirty.bit(slot));
     1399 +                      assert(journal.prepare_inhabited[slot.index]);
     1400 +                      assert(journal.prepare_checksums[slot.index] == prepare.?.checksum);
     1401 +                      // @F, @H, @K
1375 1402                    }
1376 1403                },
1377 1404                .vsr => {
1378      -
1379      -                  assert(
1380      -                  assert(
     1405 +                  journal.headers[slot.index] = Header.reserved(cluster, slot.index);
     1406 +                  assert(journal.dirty.bit(slot));
     1407 +                  assert(journal.faulty.bit(slot));
1381 1408                },
1382 1409                .cut => {
1383 1410                    assert(header != null);
1384 1411                    assert(prepare == null);
1385      -                  assert(!
1386      -                  assert(
1387      -
1388      -
1389      -
     1412 +                  assert(!journal.prepare_inhabited[slot.index]);
     1413 +                  assert(journal.prepare_checksums[slot.index] == 0);
     1414 +                  journal.headers[slot.index] = Header.reserved(cluster, slot.index);
     1415 +                  journal.dirty.clear(slot);
     1416 +                  journal.faulty.clear(slot);
1390 1417                },
1391 1418            }
1392 1419
1393 1420            switch (decision) {
1394 1421                .eql, .nil => {
1395 1422                    log.debug("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
1396      -
     1423 +                      journal.replica,
1397 1424                        slot.index,
1398 1425                        case.label,
1399 1426                        @tagName(decision),

@@ -1401,7 +1428,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1401 1428                },
1402 1429                .fix, .vsr, .cut => {
1403 1430                    log.warn("{}: recover_slot: recovered slot={} label={s} decision={s}", .{
1404      -
     1431 +                      journal.replica,
1405 1432                        slot.index,
1406 1433                        case.label,
1407 1434                        @tagName(decision),

@@ -1410,69 +1437,126 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1410 1437            }
1411 1438        }
1412 1439
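
Read together with recover_fix below, the switch above leaves each slot in a fixed dirty/faulty state per decision. A hypothetical helper (not part of the package) that restates those post-conditions:

    // Derived from the asserts in recover_slot; for illustration only.
    fn recovered_state(decision: RecoveryDecision) struct { dirty: bool, faulty: bool } {
        return switch (decision) {
            // Both copies agree, are reserved, or the torn prepare was cut: clean.
            .eql, .nil, .cut => .{ .dirty = false, .faulty = false },
            // The prepare is intact; only the redundant header remains to be rewritten.
            .fix => .{ .dirty = true, .faulty = false },
            // Neither copy can be trusted; repair through VSR.
            .vsr => .{ .dirty = true, .faulty = true },
        };
    }
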
1413      -
1414      -
     1440 +      /// Repair the redundant headers for slots with decision=fix, one sector at a time.
     1441 +      fn recover_fix(journal: *Journal) void {
     1442 +          assert(journal.status == .recovering);
     1443 +          assert(journal.writes.executing() == 0);
     1444 +          assert(journal.dirty.count >= journal.faulty.count);
     1445 +          assert(journal.dirty.count <= slot_count);
     1446 +
     1447 +          var fix_sector: ?usize = null;
     1448 +          var dirty_iterator = journal.dirty.bits.iterator(.{ .kind = .set });
     1449 +          while (dirty_iterator.next()) |dirty_slot| {
     1450 +              if (journal.faulty.bit(Slot{ .index = dirty_slot })) continue;
     1451 +
     1452 +              const dirty_slot_sector = @divFloor(dirty_slot, headers_per_sector);
     1453 +              if (fix_sector) |fix_sector_| {
     1454 +                  if (fix_sector_ != dirty_slot_sector) break;
     1455 +              } else {
     1456 +                  fix_sector = dirty_slot_sector;
     1457 +              }
     1458 +              journal.dirty.clear(Slot{ .index = dirty_slot });
     1459 +          }
     1460 +
     1461 +          if (fix_sector == null) return journal.recover_done();
     1462 +
     1463 +          const write = journal.writes.acquire().?;
     1464 +          write.* = .{
     1465 +              .journal = journal,
     1466 +              .callback = undefined,
     1467 +              .message = undefined,
     1468 +              .trigger = .fix,
     1469 +              .range = undefined,
     1470 +          };
     1471 +
     1472 +          const buffer: []u8 = journal.header_sector(fix_sector.?, write);
     1473 +          const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
     1474 +          assert(buffer_headers.len == headers_per_sector);
     1475 +
     1476 +          const offset = Ring.headers.offset(Slot{ .index = fix_sector.? * headers_per_sector });
     1477 +          journal.write_sectors(recover_fix_callback, write, buffer, .headers, offset);
     1478 +      }
     1479 +
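
Each recover_fix pass gathers only the dirty-but-not-faulty slots that fall in one redundant-header sector, clears their dirty bits, and issues a single sector write; recover_fix_callback then re-runs the scan until nothing dirty remains. A tiny test of the grouping arithmetic (headers_per_sector_example is an illustrative value, not the real constant):

    test "dirty slots sharing a sector are repaired by one write" {
        const headers_per_sector_example = 4; // illustrative only
        // Slots 5 and 7 land in sector 1 and are fixed together; slot 9 starts the next pass.
        try std.testing.expectEqual(@divFloor(5, headers_per_sector_example), @divFloor(7, headers_per_sector_example));
        try std.testing.expect(@divFloor(9, headers_per_sector_example) != @divFloor(5, headers_per_sector_example));
    }
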
     1480 +      fn recover_fix_callback(write: *Journal.Write) void {
     1481 +          const journal = write.journal;
     1482 +          assert(journal.status == .recovering);
     1483 +          assert(write.trigger == .fix);
1415 1484
1416      -
     1485 +          journal.writes.release(write);
     1486 +          journal.recover_fix();
     1487 +      }
1417 1488
1418      -
1419      -          assert(
1420      -          assert(
     1489 +      fn recover_done(journal: *Journal) void {
     1490 +          assert(journal.status == .recovering);
     1491 +          assert(journal.reads.executing() == 0);
     1492 +          assert(journal.writes.executing() == 0);
     1493 +          assert(journal.dirty.count <= slot_count);
     1494 +          assert(journal.faulty.count <= slot_count);
     1495 +          assert(journal.faulty.count == journal.dirty.count);
     1496 +          assert(journal.header_chunks_requested.count() == 0);
     1497 +          assert(journal.header_chunks_recovered.count() == HeaderChunks.bit_length);
     1498 +
     1499 +          const replica = @fieldParentPtr(Replica, "journal", journal);
     1500 +          const callback = journal.status.recovering;
     1501 +          journal.status = .recovered;
1421 1502
1422 1503            // Abort if all slots are faulty, since something is very wrong.
1423      -          if (
1424      -          if (
     1504 +          if (journal.faulty.count == slot_count) @panic("WAL is completely corrupt");
     1505 +          if (journal.faulty.count > 0 and replica.replica_count == 1) @panic("WAL is corrupt");
1425 1506
1426      -          if (
1427      -              assert(
1428      -              assert(!
     1507 +          if (journal.headers[0].op == 0 and journal.headers[0].command == .prepare) {
     1508 +              assert(journal.headers[0].checksum == Header.root_prepare(replica.cluster).checksum);
     1509 +              assert(!journal.faulty.bit(Slot{ .index = 0 }));
1429 1510            }
1430 1511
1431      -          for (
     1512 +          for (journal.headers) |*header, index| {
1432 1513                assert(header.valid_checksum());
1433 1514                assert(header.cluster == replica.cluster);
1434      -              assert(std.meta.eql(header.*,
     1515 +              assert(std.meta.eql(header.*, journal.headers_redundant[index]));
1435 1516                if (header.command == .reserved) {
1436 1517                    assert(header.op == index);
1437 1518                } else {
1438 1519                    assert(header.command == .prepare);
1439 1520                    assert(header.op % slot_count == index);
1440      -                  assert(
1441      -                  assert(
1442      -                  assert(!
     1521 +                  assert(journal.prepare_inhabited[index]);
     1522 +                  assert(journal.prepare_checksums[index] == header.checksum);
     1523 +                  assert(!journal.faulty.bit(Slot{ .index = index }));
1443 1524                }
1444 1525            }
     1526 +
     1527 +          // From here it's over to the Recovery protocol from VRR 2012.
     1528 +          callback(journal);
1445 1529        }
1446 1530
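
Note how recover_done reads the caller's callback out of `journal.status.recovering` before flipping the status to `.recovered`: the status field evidently doubles as storage for the recovery callback. The declaration is not part of this diff; an assumed shape consistent with these accesses:

    // Assumed shape (not shown in this diff): a tagged union that carries the
    // recovery callback until recovery completes.
    const Status = union(enum) {
        init,
        recovering: fn (journal: *Journal) void,
        recovered,
    };
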
1447 1531        /// Removes entries from `op_min` (inclusive) onwards.
1448      -      /// Used after a view change to remove uncommitted entries discarded by the new
1449      -      pub fn remove_entries_from(
1450      -          assert(
     1532 +      /// Used after a view change to remove uncommitted entries discarded by the new primary.
     1533 +      pub fn remove_entries_from(journal: *Journal, op_min: u64) void {
     1534 +          assert(journal.status == .recovered);
1451 1535            assert(op_min > 0);
1452 1536
1453      -          log.debug("{}: remove_entries_from: op_min={}", .{
     1537 +          log.debug("{}: remove_entries_from: op_min={}", .{ journal.replica, op_min });
1454 1538
1455      -          for (
     1539 +          for (journal.headers) |*header, index| {
1456 1540                // We must remove the header regardless of whether it is a prepare or reserved,
1457 1541                // since a reserved header may have been marked faulty for case @G, and
1458 1542                // since the caller expects the WAL to be truncated, with clean slots.
1459 1543                if (header.op >= op_min) {
1460 1544                    // TODO Explore scenarios where the data on disk may resurface after a crash.
1461      -                  const slot =
     1545 +                  const slot = journal.slot_for_op(header.op);
1462 1546                    assert(slot.index == index);
1463      -
     1547 +                  journal.remove_entry(slot);
1464 1548                }
1465 1549            }
1466 1550        }
1467 1551
1468      -      pub fn remove_entry(
1469      -          const replica = @fieldParentPtr(Replica, "journal",
     1552 +      pub fn remove_entry(journal: *Journal, slot: Slot) void {
     1553 +          const replica = @fieldParentPtr(Replica, "journal", journal);
1470 1554
1471 1555            const reserved = Header.reserved(replica.cluster, slot.index);
1472      -
1473      -
1474      -
1475      -
     1556 +          journal.headers[slot.index] = reserved;
     1557 +          journal.headers_redundant[slot.index] = reserved;
     1558 +          journal.dirty.clear(slot);
     1559 +          journal.faulty.clear(slot);
1476 1560            // Do not clear `prepare_inhabited`/`prepare_checksums`. The prepare is
1477 1561            // untouched on disk, and may be useful later. Consider this scenario:
1478 1562            //

@@ -1490,29 +1574,29 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1490 1574            // `prepare_inhabited=false`.
1491 1575        }
1492 1576
1493      -      pub fn set_header_as_dirty(
1494      -          assert(
     1577 +      pub fn set_header_as_dirty(journal: *Journal, header: *const Header) void {
     1578 +          assert(journal.status == .recovered);
1495 1579            assert(header.command == .prepare);
1496 1580
1497 1581            log.debug("{}: set_header_as_dirty: op={} checksum={}", .{
1498      -
     1582 +              journal.replica,
1499 1583                header.op,
1500 1584                header.checksum,
1501 1585            });
1502 1586
1503      -          const slot =
     1587 +          const slot = journal.slot_for_header(header);
1504 1588
1505      -          if (
1506      -              assert(
     1589 +          if (journal.has(header)) {
     1590 +              assert(journal.dirty.bit(slot));
1507 1591                // Do not clear any faulty bit for the same entry.
1508 1592            } else {
1509 1593                // Overwriting a new op with an old op would be a correctness bug; it could cause a
1510 1594                // message to be uncommitted.
1511      -              assert(
     1595 +              assert(journal.headers[slot.index].op <= header.op);
1512 1596
1513      -
1514      -
1515      -
     1597 +              journal.headers[slot.index] = header.*;
     1598 +              journal.dirty.set(slot);
     1599 +              journal.faulty.clear(slot);
1516 1600            }
1517 1601        }
1518 1602

@@ -1520,49 +1604,49 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1520 1604        // TODO To guard against torn writes, don't write simultaneously to all redundant header
1521 1605        // sectors. (This is mostly a risk for single-replica clusters with small WALs).
1522 1606        pub fn write_prepare(
1523      -
1524      -          callback: fn (
     1607 +          journal: *Journal,
     1608 +          callback: fn (journal: *Replica, wrote: ?*Message, trigger: Write.Trigger) void,
1525 1609            message: *Message,
1526      -          trigger:
     1610 +          trigger: Journal.Write.Trigger,
1527 1611        ) void {
1528      -          const replica = @fieldParentPtr(Replica, "journal",
     1612 +          const replica = @fieldParentPtr(Replica, "journal", journal);
1529 1613
1530      -          assert(
     1614 +          assert(journal.status == .recovered);
1531 1615            assert(message.header.command == .prepare);
1532 1616            assert(message.header.size >= @sizeOf(Header));
1533 1617            assert(message.header.size <= message.buffer.len);
1534      -          assert(
1535      -          assert(replica.replica_count != 1 or
     1618 +          assert(journal.has(message.header));
     1619 +          assert(replica.replica_count != 1 or journal.writes.executing() == 0);
1536 1620
1537      -          // The underlying header memory must be owned by the buffer and not by
     1621 +          // The underlying header memory must be owned by the buffer and not by journal.headers:
1538 1622            // Otherwise, concurrent writes may modify the memory of the pointer while we write.
1539 1623            assert(@ptrToInt(message.header) == @ptrToInt(message.buffer.ptr));
1540 1624
1541      -          const slot =
     1625 +          const slot = journal.slot_with_header(message.header).?;
1542 1626
1543      -          if (!
     1627 +          if (!journal.dirty.bit(slot)) {
1544 1628                // Any function that sets the faulty bit should also set the dirty bit:
1545      -              assert(!
1546      -              assert(
1547      -              assert(
1548      -              assert(
1549      -
     1629 +              assert(!journal.faulty.bit(slot));
     1630 +              assert(journal.prepare_inhabited[slot.index]);
     1631 +              assert(journal.prepare_checksums[slot.index] == message.header.checksum);
     1632 +              assert(journal.headers_redundant[slot.index].checksum == message.header.checksum);
     1633 +              journal.write_prepare_debug(message.header, "skipping (clean)");
1550 1634                callback(replica, message, trigger);
1551 1635                return;
1552 1636            }
1553 1637
1554      -          assert(
     1638 +          assert(journal.has_dirty(message.header));
1555 1639
1556      -          const write =
1557      -
     1640 +          const write = journal.writes.acquire() orelse {
     1641 +              journal.write_prepare_debug(message.header, "waiting for IOP");
1558 1642                callback(replica, null, trigger);
1559 1643                return;
1560 1644            };
1561 1645
1562      -
     1646 +          journal.write_prepare_debug(message.header, "starting");
1563 1647
1564 1648            write.* = .{
1565      -              .
     1649 +              .journal = journal,
1566 1650                .callback = callback,
1567 1651                .message = message.ref(),
1568 1652                .trigger = trigger,

@@ -1580,47 +1664,47 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1580 1664                assert(sum_of_sector_padding_bytes == 0);
1581 1665            }
1582 1666
1583      -
1584      -
     1667 +          journal.prepare_inhabited[slot.index] = false;
     1668 +          journal.prepare_checksums[slot.index] = 0;
1585 1669
1586      -
     1670 +          journal.write_sectors(write_prepare_header, write, buffer, .prepares, offset);
1587 1671        }
1588 1672
1589 1673        /// Attempt to lock the in-memory sector containing the header being written.
1590 1674        /// If the sector is already locked, add this write to the wait queue.
1591      -      fn write_prepare_header(write: *
1592      -          const
     1675 +      fn write_prepare_header(write: *Journal.Write) void {
     1676 +          const journal = write.journal;
1593 1677            const message = write.message;
1594      -          assert(
     1678 +          assert(journal.status == .recovered);
1595 1679
1596 1680            {
1597 1681                // `prepare_inhabited[slot.index]` is usually false here, but may be true if two
1598 1682                // (or more) writes to the same slot were queued concurrently and this is not the
1599 1683                // first to finish writing its prepare.
1600      -              const slot =
1601      -
1602      -
     1684 +              const slot = journal.slot_for_header(message.header);
     1685 +              journal.prepare_inhabited[slot.index] = true;
     1686 +              journal.prepare_checksums[slot.index] = message.header.checksum;
1603 1687            }
1604 1688
1605      -          if (
1606      -
     1689 +          if (journal.slot_with_op_and_checksum(message.header.op, message.header.checksum)) |slot| {
     1690 +              journal.headers_redundant[slot.index] = message.header.*;
1607 1691            } else {
1608      -
1609      -
     1692 +              journal.write_prepare_debug(message.header, "entry changed while writing sectors");
     1693 +              journal.write_prepare_release(write, null);
1610 1694                return;
1611 1695            }
1612 1696
1613 1697            assert(!write.header_sector_locked);
1614 1698            assert(write.header_sector_next == null);
1615 1699
1616      -          const write_offset =
     1700 +          const write_offset = journal.offset_logical_in_headers_for_message(message);
1617 1701
1618      -          var it =
     1702 +          var it = journal.writes.iterate();
1619 1703            while (it.next()) |other| {
1620 1704                if (other == write) continue;
1621 1705                if (!other.header_sector_locked) continue;
1622 1706
1623      -              const other_offset =
     1707 +              const other_offset = journal.offset_logical_in_headers_for_message(other.message);
1624 1708                if (other_offset == write_offset) {
1625 1709                    // The `other` and `write` target the same sector; append to the list.
1626 1710                    var tail = other;

@@ -1631,11 +1715,11 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1631 1715            }
1632 1716
1633 1717            write.header_sector_locked = true;
1634      -
     1718 +          journal.write_prepare_on_lock_header_sector(write);
1635 1719        }
1636 1720
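
The loop above serializes writes that target the same in-memory header sector: the first write keeps the lock, later writes append themselves to the `header_sector_next` chain, and write_prepare_unlock_header_sector (below) passes the lock to the next waiter. A stripped-down sketch of that hand-off, with hypothetical names standing in for the fields of Journal.Write:

    // Hypothetical sketch; the real fields are header_sector_locked/header_sector_next.
    const Waiter = struct {
        locked: bool = false,
        next: ?*Waiter = null,

        fn unlock(waiter: *Waiter) void {
            waiter.locked = false;
            if (waiter.next) |waiting| {
                waiter.next = null;
                waiting.locked = true; // The waiter resumes as if it had acquired the lock.
            }
        }
    };
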
1637      -      fn write_prepare_on_lock_header_sector(
1638      -          assert(
     1721 +      fn write_prepare_on_lock_header_sector(journal: *Journal, write: *Write) void {
     1722 +          assert(journal.status == .recovered);
1639 1723            assert(write.header_sector_locked);
1640 1724
1641 1725            // TODO It's possible within this section that the header has since been replaced but we

@@ -1644,84 +1728,55 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1644 1728            // For this, we'll need to have a way to tweak write_prepare_release() to release locks.
1645 1729            // At present, we don't return early here simply because it doesn't yet do that.
1646 1730
1647      -          const replica = @fieldParentPtr(Replica, "journal", self);
1648 1731            const message = write.message;
1649      -          const slot_of_message =
1650      -          const slot_first = Slot{
1651      -              .index = @divFloor(slot_of_message.index, headers_per_sector) * headers_per_sector,
1652      -          };
1653      -
     1732 +          const slot_of_message = journal.slot_for_header(message.header);
1654 1733            const offset = Ring.headers.offset(slot_of_message);
1655 1734            assert(offset % constants.sector_size == 0);
1656 1735
1657      -          const buffer: []u8 =
1658      -
1659      -
1660      -
1661      -          var i: usize = 0;
1662      -          while (i < headers_per_sector) : (i += 1) {
1663      -              const slot = Slot{ .index = slot_first.index + i };
1664      -
1665      -              if (self.faulty.bit(slot)) {
1666      -                  // Redundant faulty headers are deliberately written as invalid.
1667      -                  // This ensures that faulty headers are still faulty when they are read back
1668      -                  // from disk during recovery. This prevents faulty entries from changing to
1669      -                  // reserved (and clean) after a crash and restart (e.g. accidentally converting
1670      -                  // a case `@D` to a `@J` after a restart).
1671      -                  buffer_headers[i] = .{
1672      -                      .checksum = 0,
1673      -                      .cluster = replica.cluster,
1674      -                      .command = .reserved,
1675      -                  };
1676      -                  assert(!buffer_headers[i].valid_checksum());
1677      -              } else {
1678      -                  // Write headers from `headers_redundant` instead of `headers` — we need to
1679      -                  // avoid writing (leaking) a redundant header before its corresponding prepare
1680      -                  // is on disk.
1681      -                  buffer_headers[i] = self.headers_redundant[slot.index];
1682      -              }
1683      -          }
     1736 +          const buffer: []u8 = journal.header_sector(
     1737 +              @divFloor(slot_of_message.index, headers_per_sector),
     1738 +              write,
     1739 +          );
1684 1740
1685 1741            log.debug("{}: write_header: op={} sectors[{}..{}]", .{
1686      -
     1742 +              journal.replica,
1687 1743                message.header.op,
1688 1744                offset,
1689 1745                offset + constants.sector_size,
1690 1746            });
1691 1747
1692      -          // Memory must not be owned by
1693      -          assert(@ptrToInt(buffer.ptr) < @ptrToInt(
1694      -              @ptrToInt(buffer.ptr) > @ptrToInt(
     1748 +          // Memory must not be owned by journal.headers as these may be modified concurrently:
     1749 +          assert(@ptrToInt(buffer.ptr) < @ptrToInt(journal.headers.ptr) or
     1750 +              @ptrToInt(buffer.ptr) > @ptrToInt(journal.headers.ptr) + headers_size);
1695 1751
1696      -
     1752 +          journal.write_sectors(write_prepare_on_write_header, write, buffer, .headers, offset);
1697 1753        }
1698 1754
1699      -      fn write_prepare_on_write_header(write: *
1700      -          const
     1755 +      fn write_prepare_on_write_header(write: *Journal.Write) void {
     1756 +          const journal = write.journal;
1701 1757            const message = write.message;
1702 1758
1703 1759            assert(write.header_sector_locked);
1704      -
     1760 +          journal.write_prepare_unlock_header_sector(write);
1705 1761
1706      -          if (!
1707      -
1708      -
     1762 +          if (!journal.has(message.header)) {
     1763 +              journal.write_prepare_debug(message.header, "entry changed while writing headers");
     1764 +              journal.write_prepare_release(write, null);
1709 1765                return;
1710 1766            }
1711 1767
1712      -
1713      -          // TODO Snapshots
     1768 +          journal.write_prepare_debug(message.header, "complete, marking clean");
1714 1769
1715      -          const slot =
1716      -
1717      -
     1770 +          const slot = journal.slot_with_header(message.header).?;
     1771 +          journal.dirty.clear(slot);
     1772 +          journal.faulty.clear(slot);
1718 1773
1719      -
     1774 +          journal.write_prepare_release(write, message);
1720 1775        }
1721 1776
1722 1777        /// Release the lock held by a write on an in-memory header sector and pass
1723 1778        /// it to a waiting Write, if any.
1724      -      fn write_prepare_unlock_header_sector(
     1779 +      fn write_prepare_unlock_header_sector(journal: *Journal, write: *Journal.Write) void {
1725 1780            assert(write.header_sector_locked);
1726 1781            write.header_sector_locked = false;
1727 1782

@@ -1733,13 +1788,13 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1733 1788
1734 1789                assert(waiting.header_sector_locked == false);
1735 1790                waiting.header_sector_locked = true;
1736      -
     1791 +              journal.write_prepare_on_lock_header_sector(waiting);
1737 1792            }
1738 1793            assert(write.header_sector_next == null);
1739 1794        }
1740 1795

1741      -      fn write_prepare_release(
1742      -          const replica = @fieldParentPtr(Replica, "journal",
     1796 +      fn write_prepare_release(journal: *Journal, write: *Journal.Write, wrote: ?*Message) void {
     1797 +          const replica = @fieldParentPtr(Replica, "journal", journal);
1743 1798            const write_callback = write.callback;
1744 1799            const write_trigger = write.trigger;
1745 1800            const write_message = write.message;

@@ -1747,14 +1802,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1747 1802            // Release the write prior to returning control to the caller.
1748 1803            // This allows us to enforce journal.writes.len≤1 when replica_count=1, because the
1749 1804            // callback may immediately start the next write.
1750      -
     1805 +          journal.writes.release(write);
1751 1806            write_callback(replica, wrote, write_trigger);
1752 1807            replica.message_bus.unref(write_message);
1753 1808        }
1754 1809

1755      -      fn write_prepare_debug(
     1810 +      fn write_prepare_debug(journal: *const Journal, header: *const Header, status: []const u8) void {
1756 1811            log.debug("{}: write: view={} op={} len={}: {} {s}", .{
1757      -
     1812 +              journal.replica,
1758 1813                header.view,
1759 1814                header.op,
1760 1815                header.size,

@@ -1763,14 +1818,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1763 1818            });
1764 1819        }
1765 1820

1766      -      fn offset_logical_in_headers_for_message(
1767      -          return Ring.headers.offset(
     1821 +      fn offset_logical_in_headers_for_message(journal: *const Journal, message: *Message) u64 {
     1822 +          return Ring.headers.offset(journal.slot_for_header(message.header));
1768 1823        }
1769 1824

1770 1825        fn write_sectors(
1771      -
1772      -          callback: fn (write: *
1773      -          write: *
     1826 +          journal: *Journal,
     1827 +          callback: fn (write: *Journal.Write) void,
     1828 +          write: *Journal.Write,
1774 1829            buffer: []const u8,
1775 1830            ring: Ring,
1776 1831            offset: u64, // Offset within the Ring.

@@ -1783,16 +1838,16 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1783 1838                .offset = offset,
1784 1839                .locked = false,
1785 1840            };
1786      -
     1841 +          journal.lock_sectors(write);
1787 1842        }
1788 1843

1789 1844        /// Start the write on the current range or add it to the proper queue
1790 1845        /// if an overlapping range is currently being written.
1791      -      fn lock_sectors(
     1846 +      fn lock_sectors(journal: *Journal, write: *Journal.Write) void {
1792 1847            assert(!write.range.locked);
1793 1848            assert(write.range.next == null);
1794 1849
1795      -          var it =
     1850 +          var it = journal.writes.iterate();
1796 1851            while (it.next()) |other| {
1797 1852                if (other == write) continue;
1798 1853                if (!other.range.locked) continue;

@@ -1806,14 +1861,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1806 1861            }
1807 1862
1808 1863            log.debug("{}: write_sectors: ring={} offset={} len={} locked", .{
1809      -
     1864 +              journal.replica,
1810 1865                write.range.ring,
1811 1866                write.range.offset,
1812 1867                write.range.buffer.len,
1813 1868            });
1814 1869
1815 1870            write.range.locked = true;
1816      -
     1871 +          journal.storage.write_sectors(
1817 1872                write_sectors_on_write,
1818 1873                &write.range.completion,
1819 1874                write.range.buffer,

@@ -1839,14 +1894,14 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1839 1894
1840 1895        fn write_sectors_on_write(completion: *Storage.Write) void {
1841 1896            const range = @fieldParentPtr(Range, "completion", completion);
1842      -          const write = @fieldParentPtr(
1843      -          const
     1897 +          const write = @fieldParentPtr(Journal.Write, "range", range);
     1898 +          const journal = write.journal;
1844 1899
1845 1900            assert(write.range.locked);
1846 1901            write.range.locked = false;
1847 1902
1848 1903            log.debug("{}: write_sectors: ring={} offset={} len={} unlocked", .{
1849      -
     1904 +              journal.replica,
1850 1905                write.range.ring,
1851 1906                write.range.offset,
1852 1907                write.range.buffer.len,

@@ -1859,18 +1914,70 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1859 1914                assert(waiting.locked == false);
1860 1915                current = waiting.next;
1861 1916                waiting.next = null;
1862      -
     1917 +              journal.lock_sectors(@fieldParentPtr(Journal.Write, "range", waiting));
1863 1918            }
1864 1919
1865 1920            range.callback(write);
1866 1921        }
1867 1922
1868      -
1869      -
     1923 +      /// Returns a sector of redundant headers, ready to be written to the specified sector.
     1924 +      /// `sector_index` is relative to the start of the redundant header zone.
     1925 +      fn header_sector(
     1926 +          journal: *const Journal,
     1927 +          sector_index: usize,
     1928 +          write: *const Journal.Write,
     1929 +      ) Sector {
     1930 +          assert(journal.status != .init);
     1931 +          assert(journal.writes.items.len == journal.headers_iops.len);
     1932 +          assert(sector_index < @divFloor(slot_count, headers_per_sector));
     1933 +
     1934 +          const replica = @fieldParentPtr(Replica, "journal", journal);
     1935 +          const sector_slot = Slot{ .index = sector_index * headers_per_sector };
     1936 +          assert(sector_slot.index < slot_count);
     1937 +
     1938 +          const write_index = @divExact(
     1939 +              @ptrToInt(write) - @ptrToInt(&journal.writes.items),
     1940 +              @sizeOf(Journal.Write),
     1941 +          );
     1942 +
     1943 +          // TODO The compiler should not need this align cast as the type of `headers_iops`
     1944 +          // ensures that each buffer is properly aligned.
     1945 +          const sector = @alignCast(constants.sector_size, &journal.headers_iops[write_index]);
     1946 +          const sector_headers = std.mem.bytesAsSlice(Header, sector);
     1947 +          assert(sector_headers.len == headers_per_sector);
     1948 +
     1949 +          var i: usize = 0;
     1950 +          while (i < headers_per_sector) : (i += 1) {
     1951 +              const slot = Slot{ .index = sector_slot.index + i };
     1952 +
     1953 +              if (journal.faulty.bit(slot)) {
     1954 +                  // Redundant faulty headers are deliberately written as invalid.
     1955 +                  // This ensures that faulty headers are still faulty when they are read back
     1956 +                  // from disk during recovery. This prevents faulty entries from changing to
     1957 +                  // reserved (and clean) after a crash and restart (e.g. accidentally converting
     1958 +                  // a case `@D` to a `@J` after a restart).
     1959 +                  sector_headers[i] = .{
     1960 +                      .checksum = 0,
     1961 +                      .cluster = replica.cluster,
     1962 +                      .command = .reserved,
     1963 +                  };
     1964 +                  assert(!sector_headers[i].valid_checksum());
     1965 +              } else {
     1966 +                  // Write headers from `headers_redundant` instead of `headers` — we need to
     1967 +                  // avoid writing (leaking) a redundant header before its corresponding prepare
     1968 +                  // is on disk.
     1969 +                  sector_headers[i] = journal.headers_redundant[slot.index];
     1970 +              }
     1971 +          }
     1972 +          return sector;
     1973 +      }
     1974 +
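
One detail worth noting in header_sector: a faulty slot's redundant header is emitted with checksum = 0, which can never validate, so the fault is still a fault after a crash and restart rather than decaying into a clean reserved slot. A minimal test mirroring the assert above (it assumes, as the construction above does, that Header's remaining fields have defaults):

    test "a zeroed-checksum reserved header never reads back as valid" {
        const header = Header{ .checksum = 0, .cluster = 0, .command = .reserved };
        try std.testing.expect(!header.valid_checksum());
    }
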
     1975 +      pub fn writing(journal: *Journal, op: u64, checksum: u128) bool {
     1976 +          const slot = journal.slot_for_op(op);
1870 1977            var found: bool = false;
1871      -          var it =
     1978 +          var it = journal.writes.iterate();
1872 1979            while (it.next()) |write| {
1873      -              const write_slot =
     1980 +              const write_slot = journal.slot_for_op(write.message.header.op);
1874 1981
1875 1982                // It's possible that we might be writing the same op but with a different checksum.
1876 1983                // For example, if the op we are writing did not survive the view change and was

@@ -1878,7 +1985,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1878 1985                // However, we compare against the 64-bit op first, since it's a cheap machine word.
1879 1986                if (write.message.header.op == op and write.message.header.checksum == checksum) {
1880 1987                    // If we truly are writing, then the dirty bit must be set:
1881      -                  assert(
     1988 +                  assert(journal.dirty.bit(journal.slot_for_op(op)));
1882 1989                    found = true;
1883 1990                } else if (write_slot.index == slot.index) {
1884 1991                    // If the in-progress write of '{op, checksum}' will be overwritten by another

@@ -1891,46 +1998,6 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
1891 1998            };
1892 1999        }
1893 2000
1894      -      pub const BitSet = struct {
1895      -          bits: std.DynamicBitSetUnmanaged,
1896      -
1897      -          /// The number of bits set (updated incrementally as bits are set or cleared):
1898      -          count: u64 = 0,
1899      -
1900      -          fn init(allocator: Allocator, count: usize) !BitSet {
1901      -              const bits = try std.DynamicBitSetUnmanaged.initEmpty(allocator, count);
1902      -              errdefer bits.deinit(allocator);
1903      -
1904      -              return BitSet{ .bits = bits };
1905      -          }
1906      -
1907      -          fn deinit(self: *BitSet, allocator: Allocator) void {
1908      -              self.bits.deinit(allocator);
1909      -          }
1910      -
1911      -          /// Clear the bit for a slot (idempotent):
1912      -          pub fn clear(self: *BitSet, slot: Slot) void {
1913      -              if (self.bits.isSet(slot.index)) {
1914      -                  self.bits.unset(slot.index);
1915      -                  self.count -= 1;
1916      -              }
1917      -          }
1918      -
1919      -          /// Whether the bit for a slot is set:
1920      -          pub fn bit(self: *const BitSet, slot: Slot) bool {
1921      -              return self.bits.isSet(slot.index);
1922      -          }
1923      -
1924      -          /// Set the bit for a slot (idempotent):
1925      -          pub fn set(self: *BitSet, slot: Slot) void {
1926      -              if (!self.bits.isSet(slot.index)) {
1927      -                  self.bits.set(slot.index);
1928      -                  self.count += 1;
1929      -                  assert(self.count <= self.bits.bit_length);
1930      -              }
1931      -          }
1932      -      };
1933      -
1934 2001    /// @B and @C:
1935 2002    /// This prepare header is corrupt.
1936 2003    /// We may have a valid redundant header, but need to recover the full message.

@@ -2058,9 +2125,7 @@ const RecoveryDecision = enum {
2058 2125        eql,
2059 2126        /// Reserved; dirty/faulty are clear, no repair necessary.
2060 2127        nil,
2061      -      ///
2062      -      /// If replica_count=1: Use intact prepare. Clear dirty, clear faulty.
2063      -      /// (Don't set faulty, because we have the valid message.)
     2128 +      /// Use intact prepare to repair redundant header. Dirty/faulty are clear.
2064 2129        fix,
2065 2130        /// If replica_count>1: Repair with VSR `request_prepare`. Mark dirty, mark faulty.
2066 2131        /// If replica_count=1: Fail; cannot recover safely.

@@ -2102,9 +2167,9 @@ const Case = struct {
2102 2167            };
2103 2168        }
2104 2169
2105      -      fn check(
     2170 +      fn check(case: *const Case, parameters: [9]bool) !bool {
2106 2171            for (parameters) |b, i| {
2107      -              switch (
     2172 +              switch (case.pattern[i]) {
2108 2173                    .any => {},
2109 2174                    .is_false => if (b) return false,
2110 2175                    .is_true => if (!b) return false,

@@ -2115,12 +2180,12 @@ const Case = struct {
2115 2180            return true;
2116 2181        }
2117 2182
2118      -      fn decision(
     2183 +      fn decision(case: *const Case, replica_count: u8) RecoveryDecision {
2119 2184            assert(replica_count > 0);
2120 2185            if (replica_count == 1) {
2121      -              return
     2186 +              return case.decision_single;
2122 2187            } else {
2123      -              return
     2188 +              return case.decision_multiple;
2124 2189            }
2125 2190        }
2126 2191    };
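
For reference, check walks the nine boolean recovery parameters against a per-case pattern and rejects on the first position that disagrees. An illustrative test with a 3-entry pattern (the real table uses 9):

    test "a pattern matches only where non-.any positions agree" {
        const Pattern = enum { any, is_false, is_true };
        const pattern = [_]Pattern{ .any, .is_true, .is_false };
        const parameters = [_]bool{ false, true, false };
        for (parameters) |b, i| {
            switch (pattern[i]) {
                .any => {},
                .is_false => try std.testing.expect(!b),
                .is_true => try std.testing.expect(b),
            }
        }
    }
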
@@ -2211,6 +2276,51 @@ test "recovery_cases" {
2211 2276        }
2212 2277    }
2213 2278
     2279 +  pub const BitSet = struct {
     2280 +      bits: std.DynamicBitSetUnmanaged,
     2281 +
     2282 +      /// The number of bits set (updated incrementally as bits are set or cleared):
     2283 +      count: u64 = 0,
     2284 +
     2285 +      fn init_full(allocator: Allocator, count: usize) !BitSet {
     2286 +          const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
     2287 +          errdefer bits.deinit(allocator);
     2288 +
     2289 +          return BitSet{
     2290 +              .bits = bits,
     2291 +              .count = count,
     2292 +          };
     2293 +      }
     2294 +
     2295 +      fn deinit(bit_set: *BitSet, allocator: Allocator) void {
     2296 +          assert(bit_set.count == bit_set.bits.count());
     2297 +
     2298 +          bit_set.bits.deinit(allocator);
     2299 +      }
     2300 +
     2301 +      /// Clear the bit for a slot (idempotent):
     2302 +      pub fn clear(bit_set: *BitSet, slot: Slot) void {
     2303 +          if (bit_set.bits.isSet(slot.index)) {
     2304 +              bit_set.bits.unset(slot.index);
     2305 +              bit_set.count -= 1;
     2306 +          }
     2307 +      }
     2308 +
     2309 +      /// Whether the bit for a slot is set:
     2310 +      pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
     2311 +          return bit_set.bits.isSet(slot.index);
     2312 +      }
     2313 +
     2314 +      /// Set the bit for a slot (idempotent):
     2315 +      pub fn set(bit_set: *BitSet, slot: Slot) void {
     2316 +          if (!bit_set.bits.isSet(slot.index)) {
     2317 +              bit_set.bits.set(slot.index);
     2318 +              bit_set.count += 1;
     2319 +              assert(bit_set.count <= bit_set.bits.bit_length);
     2320 +          }
     2321 +      }
     2322 +  };
     2323 +
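
The relocated BitSet now starts full, matching recovery's starting point where every slot is dirty and faulty, and deinit checks that count stayed in lockstep with the underlying bits. A usage sketch as a test (hypothetical; it relies only on the declarations above):

    test "BitSet.count tracks set bits incrementally" {
        var faulty = try BitSet.init_full(std.testing.allocator, 8);
        defer faulty.deinit(std.testing.allocator);

        try std.testing.expectEqual(@as(u64, 8), faulty.count);
        faulty.clear(Slot{ .index = 3 });
        faulty.clear(Slot{ .index = 3 }); // idempotent
        try std.testing.expect(!faulty.bit(Slot{ .index = 3 }));
        try std.testing.expectEqual(@as(u64, 7), faulty.count);
        faulty.set(Slot{ .index = 3 });
        try std.testing.expectEqual(@as(u64, 8), faulty.count);
    }
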
2214 2324    /// Format part of a new WAL's Zone.wal_headers, writing to `target`.
2215 2325    ///
2216 2326    /// `offset_logical` is relative to the beginning of the `wal_headers` zone.