tigerbeetle-node 0.11.9 → 0.11.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +1 -1
- package/scripts/build_lib.sh +22 -10
- package/src/tigerbeetle/src/config.zig +2 -0
- package/src/tigerbeetle/src/constants.zig +18 -0
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -2
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +6 -5
- package/src/tigerbeetle/src/lsm/test.zig +5 -4
- package/src/tigerbeetle/src/lsm/tree.zig +3 -30
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -2
- package/src/tigerbeetle/src/simulator.zig +34 -53
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +1 -1
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +6 -6
- package/src/tigerbeetle/src/testing/cluster.zig +5 -99
- package/src/tigerbeetle/src/testing/storage.zig +26 -23
- package/src/tigerbeetle/src/vsr/journal.zig +86 -90
- package/src/tigerbeetle/src/vsr/replica.zig +631 -687
- package/src/tigerbeetle/src/vsr/replica_format.zig +12 -12
- package/src/tigerbeetle/src/vsr/superblock.zig +192 -110
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +35 -8
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +48 -48
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +50 -50
- package/src/tigerbeetle/src/vsr.zig +93 -29
package/dist/.client.node.sha256
CHANGED

@@ -1 +1 @@
-
+21c6105d76e0efc68fe5cbe799d363a083a9b44359e16d7dbfbb172e381ea0c3 dist/client.node
package/package.json
CHANGED
package/scripts/build_lib.sh
CHANGED

@@ -5,25 +5,37 @@ set -e
 # macOS 13 Ventura is not supported on Zig 0.9.x.
 # Overriding -target is one workaround Andrew suggests.
 # https://github.com/ziglang/zig/issues/10478#issuecomment-1294313967
-
-# because the rest of it doesn't always seem to be valid when passed
-# back in to `-target`.
-target="$(./zig/zig targets | grep triple |cut -d '"' -f 4 | cut -d '.' -f 1)"
+target=""
 if [ "$(./zig/zig targets | grep triple |cut -d '"' -f 4 | cut -d '.' -f 1,2)" = "aarch64-macos.13" ]; then
-  target="native-macos.11"
+  target="-target native-macos.11"
 fi
 
-
+# Zig picks musl libc on RHEL instead of glibc, incorrectly
+# https://github.com/ziglang/zig/issues/12156
+if [ -f "/etc/redhat-release" ]; then
+  if ! grep Fedora /etc/redhat-release; then
+    target="-target native-native-gnu"
+  fi
+fi
+
+if [ "$target" = "" ]; then
+  echo "Building default target"
+else
+  echo "Building for '$target'"
+fi
 
 mkdir -p dist
 
-
+# Need to do string eval-ing because of shellcheck's strict string
+# interpolation rules.
+cmd="./zig/zig build-lib \
 -mcpu=baseline \
 -OReleaseSafe \
 -dynamic \
 -lc \
--isystem build/node
+-isystem build/node-$(node --version)/include/node \
 -fallow-shlib-undefined \
 -femit-bin=dist/client.node \
-
-
+$target src/node.zig"
+
+eval "$cmd"
package/src/tigerbeetle/src/config.zig
CHANGED

@@ -80,6 +80,7 @@ const ConfigCluster = struct {
     cache_line_size: comptime_int = 64,
     clients_max: usize,
     pipeline_prepare_queue_max: usize = 8,
+    view_change_headers_max: usize = 8,
     quorum_replication_max: u8 = 3,
     journal_slot_count: usize = 1024,
     message_size_max: usize = 1 * 1024 * 1024,

@@ -180,6 +181,7 @@ pub const configs = struct {
     .cluster = .{
         .clients_max = 4 + 3,
         .pipeline_prepare_queue_max = 4,
+        .view_change_headers_max = 4,
         .journal_slot_count = Config.Cluster.journal_slot_count_min,
         .message_size_max = Config.Cluster.message_size_max_min(4),
         .storage_size_max = 4 * 1024 * 1024 * 1024,
package/src/tigerbeetle/src/constants.zig
CHANGED

@@ -157,6 +157,9 @@ comptime {
     assert(message_size_max >= @sizeOf(vsr.Header));
     assert(message_size_max >= sector_size);
     assert(message_size_max >= Config.Cluster.message_size_max_min(clients_max));
+
+    // Ensure that DVC/SV messages can fit all necessary headers.
+    assert(message_body_size_max >= view_change_headers_max * @sizeOf(vsr.Header));
 }
 
 /// The maximum number of Viewstamped Replication prepare messages that can be inflight at a time.

@@ -184,6 +187,21 @@ comptime {
     assert(pipeline_request_queue_max >= 0);
 }
 
+/// The number of prepare headers to include in the body of a DVC/SV.
+///
+/// CRITICAL:
+/// We must provide enough headers to cover all uncommitted headers so that the new
+/// primary (if we are in a view change) can decide whether to discard uncommitted headers
+/// that cannot be repaired because they are gaps. See DVCQuorum for more detail.
+pub const view_change_headers_max = config.cluster.view_change_headers_max;
+
+comptime {
+    assert(view_change_headers_max > 0);
+    assert(view_change_headers_max >= pipeline_prepare_queue_max);
+    assert(view_change_headers_max <= journal_slot_count);
+    assert(view_change_headers_max <= @divFloor(message_body_size_max, @sizeOf(vsr.Header)));
+}
+
 /// The minimum and maximum amount of time in milliseconds to wait before initiating a connection.
 /// Exponential backoff and jitter are applied within this range.
 pub const connection_delay_min_ms = config.process.connection_delay_min_ms;
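Note: the new comptime block pins `view_change_headers_max` between the pipeline depth and the WAL/message capacity. A standalone sketch of those bounds with made-up values (the 128-byte header size and all the numbers here are assumptions for illustration, not the shipped config plumbing):

    const std = @import("std");
    const assert = std.debug.assert;

    const header_size = 128; // assumed @sizeOf(vsr.Header)
    const pipeline_prepare_queue_max = 4;
    const journal_slot_count = 32;
    const message_body_size_max = 1024 * 1024 - header_size;
    const view_change_headers_max = 4;

    comptime {
        assert(view_change_headers_max > 0);
        // Must cover every op that may still sit uncommitted in the pipeline.
        assert(view_change_headers_max >= pipeline_prepare_queue_max);
        // Cannot reference more slots than the WAL holds.
        assert(view_change_headers_max <= journal_slot_count);
        // All headers must fit in a single message body.
        assert(view_change_headers_max <= @divFloor(message_body_size_max, header_size));
    }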
package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig
CHANGED

@@ -419,16 +419,17 @@ const Environment = struct {
     env.manifest_log.checkpoint(checkpoint_callback);
     env.wait(&env.manifest_log);
 
-
-    vsr_state.commit_min += 1;
-    vsr_state.commit_min_checksum += 1;
-    vsr_state.commit_max += 1;
+    const vsr_state = &env.manifest_log.superblock.working.vsr_state;
 
     env.pending += 1;
     env.manifest_log.superblock.checkpoint(
         checkpoint_superblock_callback,
         &env.superblock_context,
-
+        .{
+            .commit_min_checksum = vsr_state.commit_min_checksum + 1,
+            .commit_min = vsr_state.commit_min + 1,
+            .commit_max = vsr_state.commit_max + 1,
+        },
     );
     env.wait(&env.manifest_log);
 
package/src/tigerbeetle/src/lsm/test.zig
CHANGED

@@ -202,15 +202,16 @@ const Environment = struct {
 
     log.debug("forest checkpointing completed!", .{});
 
-
-    vsr_state.commit_min += 1;
-    vsr_state.commit_min_checkpoint += 1;
+    const vsr_state = &env.superblock.staging.vsr_state;
 
     env.state = .superblock_checkpointing;
     env.superblock.checkpoint(
         superblock_checkpoint_callback,
         &env.superblock_context,
-
+        .{
+            .commit_min_checkpoint = vsr_state.commit_min_checkpoint + 1,
+            .commit_min = vsr_state.commit_min + 1,
+        },
     );
 }
 
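Note: both fuzzers above reflect the same API change: `SuperBlock.checkpoint` now receives the next VSR state as an options struct, rather than the caller mutating `vsr_state` in place beforehand. A minimal sketch of the new call shape (simplified stand-in types, not the real SuperBlock):

    const std = @import("std");

    const CheckpointOptions = struct {
        commit_min_checksum: u128,
        commit_min: u64,
        commit_max: u64,
    };

    const SuperBlock = struct {
        commit_min_checksum: u128 = 0,
        commit_min: u64 = 0,
        commit_max: u64 = 0,

        // The caller passes the *next* state explicitly; the superblock applies
        // it as one atomic update instead of reading caller-mutated fields.
        fn checkpoint(superblock: *SuperBlock, options: CheckpointOptions) void {
            superblock.commit_min_checksum = options.commit_min_checksum;
            superblock.commit_min = options.commit_min;
            superblock.commit_max = options.commit_max;
        }
    };

    test "checkpoint applies the next vsr state atomically" {
        var superblock = SuperBlock{};
        superblock.checkpoint(.{
            .commit_min_checksum = superblock.commit_min_checksum + 1,
            .commit_min = superblock.commit_min + 1,
            .commit_max = superblock.commit_max + 1,
        });
        try std.testing.expectEqual(@as(u64, 1), superblock.commit_min);
    }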
package/src/tigerbeetle/src/lsm/tree.zig
CHANGED

@@ -145,7 +145,6 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
 
     compaction_io_pending: usize,
     compaction_callback: ?fn (*Tree) void,
-    compaction_next_tick: Grid.NextTick = undefined,
 
     checkpoint_callback: ?fn (*Tree) void,
     open_callback: ?fn (*Tree) void,

@@ -337,8 +336,7 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
     }
 
     if (index_block_count == 0) {
-        context
-        tree.grid.on_next_tick(lookup_invalid_tick_callback, &context.next_tick);
+        callback(context, null);
         return;
     }
 

@@ -348,7 +346,6 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
     context.* = .{
         .tree = tree,
         .completion = undefined,
-        .next_tick = undefined,
 
         .key = key,
         .fingerprint = fingerprint,

@@ -363,18 +360,12 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
         context.read_index_block();
     }
 
-    fn lookup_invalid_tick_callback(next_tick: *Grid.NextTick) void {
-        const context = @fieldParentPtr(LookupContext, "next_tick", next_tick);
-        context.callback(context, null);
-    }
-
     pub const LookupContext = struct {
         const Read = Grid.Read;
         const BlockPtrConst = Grid.BlockPtrConst;
 
         tree: *Tree,
         completion: Read,
-        next_tick: Grid.NextTick,
 
         key: Key,
         fingerprint: bloom_filter.Fingerprint,

@@ -569,8 +560,7 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
         tree.compact_mutable_table_into_immutable();
     }
 
-    tree
-    tree.grid.on_next_tick(compact_skip_tick_callback, &tree.compaction_next_tick);
+    callback(tree);
     return;
 }
 

@@ -598,13 +588,6 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
     tree.compact_drive();
 }
 
-fn compact_skip_tick_callback(next_tick: *Grid.NextTick) void {
-    const tree = @fieldParentPtr(Tree, "compaction_next_tick", next_tick);
-    const callback = tree.compaction_callback.?;
-    tree.compaction_callback = null;
-    callback(tree);
-}
-
 fn compact_start(tree: *Tree, callback: fn (*Tree) void) void {
     assert(tree.compaction_io_pending == 0);
     assert(tree.compaction_callback == null);

@@ -874,7 +857,7 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
     // We are at the end of a half-bar, but the compactions have not finished.
     // We keep ticking them until they finish.
     log.debug(tree_name ++ ": compact_done: driving outstanding compactions", .{});
-    tree.
+    tree.compact_drive();
     return;
 }
 

@@ -952,16 +935,6 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
     tree.manifest.compact(compact_manifest_callback);
 }
 
-/// Asynchronously continue to drive the compactions when they haven't finished at the time
-/// they were supposed to at the end of a half-bar.
-fn compact_drive_tick_callback(next_tick: *Grid.NextTick) void {
-    const tree = @fieldParentPtr(Tree, "compaction_next_tick", next_tick);
-    assert(tree.compaction_io_pending == 0);
-    assert(tree.compaction_callback != null);
-    assert(tree.compaction_op == tree.lookup_snapshot_max);
-    tree.compact_drive();
-}
-
 /// Called after the last beat of a full compaction bar.
 fn compact_mutable_table_into_immutable(tree: *Tree) void {
     assert(tree.table_immutable.free);
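Note: the tree.zig hunks all remove one mechanism: completions with no I/O to do were previously deferred through `Grid.NextTick` plus a `@fieldParentPtr` trampoline, and are now invoked synchronously. A sketch of the before/after control flow (simplified stand-ins, not the real Tree/Grid):

    const Tree = struct {
        compaction_callback: ?fn (*Tree) void = null,

        fn compact(tree: *Tree, callback: fn (*Tree) void, nothing_to_do: bool) void {
            tree.compaction_callback = callback;
            if (nothing_to_do) {
                // Before: tree.grid.on_next_tick(trampoline, &tree.compaction_next_tick)
                // deferred completion to the next event-loop tick.
                // Now: complete synchronously, so callers must tolerate the
                // callback running before compact() returns.
                tree.compaction_callback = null;
                callback(tree);
                return;
            }
            // ... otherwise, start the asynchronous compaction work ...
        }
    };

The trade-off: less per-tree state (no `compaction_next_tick`) and fewer trampolines, at the cost of re-entrant callback invocation.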
package/src/tigerbeetle/src/simulator.zig
CHANGED

@@ -143,7 +143,7 @@ pub fn main() !void {
     const simulator_options = Simulator.Options{
         .cluster = cluster_options,
         .workload = workload_options,
-        .replica_crash_probability = 0.
+        .replica_crash_probability = 0.00002,
         .replica_crash_stability = random.uintLessThan(u32, 1_000),
         .replica_restart_probability = 0.0001,
         .replica_restart_stability = random.uintLessThan(u32, 1_000),

@@ -444,43 +444,11 @@ pub const Simulator = struct {
     }
 
     fn tick_crash(simulator: *Simulator) void {
-
-
-
-
-
-        var replica_normal_min = if (simulator.options.cluster.replica_count == 1)
-            0
-        else
-            vsr.quorums(simulator.options.cluster.replica_count).view_change;
-            break :blk simulator.cluster.replica_normal_count() -| replica_normal_min;
-        };
-
-        for (simulator.cluster.storages) |*storage, replica| {
-            if (simulator.cluster.replicas[replica].journal.status == .recovered) {
-                // TODO Remove this workaround when VSR recovery protocol is disabled.
-                // When only the minimum number of replicas are healthy (no more crashes allowed),
-                // disable storage faults on all healthy replicas.
-                //
-                // This is a workaround to avoid the deadlock that occurs when (for example) in a
-                // cluster of 3 replicas, one is down, another has a corrupt prepare, and the last does
-                // not have the prepare. The two healthy replicas can never complete a view change,
-                // because two replicas are not enough to nack, and the unhealthy replica cannot
-                // complete the VSR recovery protocol either.
-                if (simulator.cluster.replica_health[replica] == .up and crashes == 0) {
-                    if (storage.faulty) {
-                        log_simulator.debug("{}: disable storage faults", .{replica});
-                        storage.faulty = false;
-                    }
-                } else {
-                    // When a journal recovers for the first time, enable its storage faults.
-                    // Future crashes will recover in the presence of faults.
-                    if (!storage.faulty) {
-                        log_simulator.debug("{}: enable storage faults", .{replica});
-                        storage.faulty = true;
-                    }
-                }
-            }
+        const recoverable_count_min =
+            vsr.quorums(simulator.options.cluster.replica_count).view_change;
+        var recoverable_count: usize = 0;
+        for (simulator.cluster.replicas) |*replica| {
+            recoverable_count += @boolToInt(replica.status != .recovering_head);
         }
 
         for (simulator.cluster.replicas) |*replica| {

@@ -490,28 +458,41 @@ pub const Simulator = struct {
 
             switch (simulator.cluster.replica_health[replica.replica]) {
                 .up => {
-
-                    const replica_writes =
+                    const storage = &simulator.cluster.storages[replica.replica];
+                    const replica_writes = storage.writes.count();
                     const crash_probability = simulator.options.replica_crash_probability *
                         @as(f64, if (replica_writes == 0) 1.0 else 10.0);
                     if (!chance_f64(simulator.random, crash_probability)) continue;
 
-                    const
-
-
-
-
-
-
-
-
-
-
+                    const fault = recoverable_count > recoverable_count_min;
+                    replica.superblock.storage.faulty = fault;
+
+                    if (!fault) {
+                        // The journal writes redundant headers of faulty ops as zeroes to ensure
+                        // that they remain faulty after a crash/recover. Since that fault cannot
+                        // be disabled by `storage.faulty`, we must manually repair it here to
+                        // ensure a cluster cannot become stuck in status=recovering_head.
+                        // See recover_slots() for more detail.
+                        const offset = vsr.Zone.wal_headers.offset(0);
+                        const size = vsr.Zone.wal_headers.size().?;
+                        const headers_bytes = storage.memory[offset..][0..size];
+                        const headers = mem.bytesAsSlice(vsr.Header, headers_bytes);
+                        for (headers) |*h, slot| {
+                            if (h.checksum == 0) h.* = storage.wal_prepares()[slot].header;
+                        }
                     }
+
+                    log_simulator.debug("{}: crash replica (faults={})", .{ replica.replica, fault });
+                    simulator.cluster.crash_replica(replica.replica) catch unreachable;
+                    replica.superblock.storage.faulty = true;
+
+                    recoverable_count -= @boolToInt(replica.status == .recovering_head);
+                    assert(replica.status != .recovering_head or fault);
+
+                    simulator.replica_stability[replica.replica] =
+                        simulator.options.replica_crash_stability;
                 },
                 .down => {
-                    assert(replica.status == .recovering);
                     if (chance_f64(simulator.random, simulator.options.replica_restart_probability)) {
                         simulator.cluster.restart_replica(replica.replica);
                         log_simulator.debug("{}: restart replica", .{replica.replica});
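Note: the rewritten `tick_crash` replaces the journal-status workaround with a simpler invariant: count the replicas that can still participate in recovery (`status != .recovering_head`), and only crash a replica with storage faults enabled while more than a view-change quorum of recoverable replicas would remain. A standalone sketch of that gate (the quorum formula here is an assumed majority quorum; the real one is `vsr.quorums(...).view_change`):

    const std = @import("std");

    const Status = enum { normal, recovering_head };

    fn quorum_view_change(replica_count: u8) u8 {
        // Assumed majority quorum: f = (n - 1) / 2, quorum = n - f.
        return replica_count - (replica_count - 1) / 2;
    }

    fn may_crash_with_faults(statuses: []const Status, replica_count: u8) bool {
        var recoverable_count: usize = 0;
        for (statuses) |status| {
            // A replica stuck in recovering_head cannot help others recover.
            recoverable_count += @boolToInt(status != .recovering_head);
        }
        return recoverable_count > quorum_view_change(replica_count);
    }

    test "faults are withheld at the quorum boundary" {
        const statuses = [_]Status{ .normal, .normal, .recovering_head };
        try std.testing.expect(!may_crash_with_faults(&statuses, 3));
    }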
package/src/tigerbeetle/src/testing/cluster/state_checker.zig
CHANGED

@@ -72,7 +72,7 @@ pub fn StateCheckerType(comptime Client: type, comptime Replica: type) type {
     const commit_b = replica.commit_min;
 
     const header_b = replica.journal.header_with_op(replica.commit_min);
-    assert(header_b != null or replica.commit_min == replica.op_checkpoint);
+    assert(header_b != null or replica.commit_min == replica.op_checkpoint());
     assert(header_b == null or header_b.?.op == commit_b);
 
     const checksum_a = state_checker.commits.items[commit_a].header.checksum;
package/src/tigerbeetle/src/testing/cluster/storage_checker.zig
CHANGED

@@ -14,7 +14,7 @@
 //! - Acquired Grid blocks
 //!
 //! Areas not verified:
-//! - SuperBlock
+//! - SuperBlock headers, which hold replica-specific state.
 //! - WAL headers, which may differ because the WAL writes deliberately corrupt redundant headers
 //!   to faulty slots to ensure recovery is consistent.
 //! - Non-allocated Grid blocks, which may differ due to state transfer.

@@ -25,7 +25,7 @@ const log = std.log.scoped(.storage_checker);
 const constants = @import("../../constants.zig");
 const vsr = @import("../../vsr.zig");
 const superblock = @import("../../vsr/superblock.zig");
-const
+const SuperBlockHeader = superblock.SuperBlockHeader;
 const Storage = @import("../storage.zig").Storage;
 
 /// After each compaction half measure, save the cumulative hash of all acquired grid blocks.

@@ -44,7 +44,7 @@ const Checkpoints = std.AutoHashMap(u64, Checkpoint);
 
 const Checkpoint = struct {
     // The superblock trailers are an XOR of all copies of all respective trailers, not the
-    // `
+    // `SuperBlockHeader.{trailer}_checksum`.
     checksum_superblock_manifest: u128,
     checksum_superblock_free_set: u128,
     checksum_superblock_client_table: u128,

@@ -145,15 +145,15 @@ pub fn StorageCheckerType(comptime Replica: type) type {
     inline for (std.meta.fields(Checkpoint)) |field| {
         log.debug("{}: replica_checkpoint: checkpoint={} area={s} value={}", .{
             replica.replica,
-            replica.op_checkpoint,
+            replica.op_checkpoint(),
             field.name,
             @field(checkpoint, field.name),
         });
     }
 
-    const checkpoint_expect = checker.checkpoints.get(replica.op_checkpoint) orelse {
+    const checkpoint_expect = checker.checkpoints.get(replica.op_checkpoint()) orelse {
         // This replica is the first to reach op_checkpoint.
-        try checker.checkpoints.putNoClobber(replica.op_checkpoint, checkpoint);
+        try checker.checkpoints.putNoClobber(replica.op_checkpoint(), checkpoint);
         return;
     };
 
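Note: per the comment fixed above, each trailer checksum in `Checkpoint` folds all superblock copies together with XOR, so the comparison between replicas does not depend on copy order. A minimal sketch of that fold:

    const std = @import("std");

    fn fold_copies(copy_checksums: []const u128) u128 {
        var folded: u128 = 0;
        // XOR is commutative and associative: the result is independent of
        // the order in which the copies are visited.
        for (copy_checksums) |checksum| folded ^= checksum;
        return folded;
    }

    test "fold is order-independent" {
        const forward = [_]u128{ 1, 2, 3, 4 };
        const reverse = [_]u128{ 4, 3, 2, 1 };
        try std.testing.expectEqual(fold_copies(&forward), fold_copies(&reverse));
    }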
package/src/tigerbeetle/src/testing/cluster.zig
CHANGED

@@ -142,7 +142,7 @@ pub fn ClusterType(comptime StateMachineType: fn (comptime Storage: type, compti
     storage_options.replica_index = @intCast(u8, replica_index);
     storage_options.fault_atlas = storage_fault_atlas;
     storage.* = try Storage.init(allocator, options.storage_size_limit, storage_options);
-    // Disable most faults at startup, so that the replicas don't get stuck
+    // Disable most faults at startup, so that the replicas don't get stuck recovering_head.
     storage.faulty = replica_index >= vsr.quorums(options.replica_count).view_change;
 }
 errdefer for (storages) |*storage| storage.deinit(allocator);

@@ -301,99 +301,17 @@ pub fn ClusterType(comptime StateMachineType: fn (comptime Storage: type, compti
 ///
 /// Returns whether the replica was crashed.
 /// Returns an error when the replica was unable to recover (open).
-pub fn crash_replica(cluster: *Self, replica_index: u8) !
+pub fn crash_replica(cluster: *Self, replica_index: u8) !void {
     assert(cluster.replica_health[replica_index] == .up);
 
-    const replica = &cluster.replicas[replica_index];
-    if (replica.op == 0) {
-        // Only crash when `replica.op > 0` — an empty WAL would skip recovery after a crash.
-        return false;
-    }
-
-    // TODO Remove this workaround when VSR recovery protocol is disabled.
-    for (replica.journal.prepare_inhabited) |inhabited, i| {
-        if (i == 0) {
-            // Ignore the root header.
-        } else {
-            if (inhabited) break;
-        }
-    } else {
-        // Only crash when at least one header has been written to the WAL.
-        // An empty WAL would skip recovery after a crash.
-        return false;
-    }
-
-    // Ensure that the cluster can eventually recover without this replica.
-    // Verify that each op is recoverable by the current healthy cluster (minus the replica we
-    // are trying to crash).
-    // TODO Remove this workaround when VSR recovery protocol is disabled.
-    if (cluster.options.replica_count != 1) {
-        var parent: u128 = undefined;
-        const cluster_op_max = op_max: {
-            var v: ?u32 = null;
-            var op_max: ?u64 = null;
-            for (cluster.replicas) |other_replica, i| {
-                if (cluster.replica_health[i] == .down) continue;
-                if (other_replica.status == .recovering) continue;
-
-                if (v == null or other_replica.log_view > v.? or
-                    (other_replica.log_view == v.? and other_replica.op > op_max.?))
-                {
-                    v = other_replica.log_view;
-                    op_max = other_replica.op;
-                    parent = other_replica.journal.header_with_op(op_max.?).?.checksum;
-                }
-            }
-            break :op_max op_max.?;
-        };
-
-        // This whole workaround doesn't handle log wrapping correctly.
-        // If the log has wrapped, don't crash the replica.
-        if (cluster_op_max >= constants.journal_slot_count) {
-            return false;
-        }
-
-        var op: u64 = cluster_op_max + 1;
-        while (op > 0) {
-            op -= 1;
-
-            var cluster_op_known: bool = false;
-            for (cluster.replicas) |other_replica, i| {
-                // Ignore replicas that are ineligible to assist recovery.
-                if (replica_index == i) continue;
-                if (cluster.replica_health[i] == .down) continue;
-                if (other_replica.status == .recovering) continue;
-
-                if (other_replica.journal.header_with_op_and_checksum(op, parent)) |header| {
-                    parent = header.parent;
-                    if (!other_replica.journal.dirty.bit(.{ .index = op })) {
-                        // The op is recoverable if this replica crashes.
-                        break;
-                    }
-                    cluster_op_known = true;
-                }
-            } else {
-                if (op == cluster_op_max and !cluster_op_known) {
-                    // The replica can crash; it will be able to truncate the last op.
-                } else {
-                    // The op isn't recoverable if this replica is crashed.
-                    return false;
-                }
-            }
-        }
-
-        // We can't crash this replica because without it we won't be able to repair a broken
-        // hash chain.
-        if (parent != 0) return false;
-    }
-
-    cluster.replica_health[replica_index] = .down;
-
     // Reset the storage before the replica so that pending writes can (partially) finish.
     cluster.storages[replica_index].reset();
+
+    const replica = &cluster.replicas[replica_index];
     const replica_time = replica.time;
     replica.deinit(cluster.allocator);
     cluster.network.process_disable(.{ .replica = replica_index });
+    cluster.replica_health[replica_index] = .down;
 
     // Ensure that none of the replica's messages leaked when it was deinitialized.
     var messages_in_pool: usize = 0;

@@ -411,18 +329,6 @@ pub fn ClusterType(comptime StateMachineType: fn (comptime Storage: type, compti
     // Pass the old replica's Time through to the new replica. It will continue to tick while
     // the replica is crashed, to ensure the clocks don't desyncronize too far to recover.
     try cluster.open_replica(replica_index, replica_time);
-
-    return true;
-}
-
-/// Returns the number of replicas capable of helping a crashed node recover (i.e. with
-/// replica.status=normal).
-pub fn replica_normal_count(cluster: *const Self) u8 {
-    var count: u8 = 0;
-    for (cluster.replicas) |*replica| {
-        if (replica.status == .normal) count += 1;
-    }
-    return count;
 }
 
 fn open_replica(cluster: *Self, replica_index: u8, time: Time) !void {