tigerbeetle-node 0.11.11 → 0.11.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +212 -196
- package/dist/.client.node.sha256 +1 -1
- package/package.json +6 -8
- package/scripts/build_lib.sh +22 -2
- package/src/node.zig +1 -0
- package/src/tigerbeetle/scripts/benchmark.bat +9 -5
- package/src/tigerbeetle/scripts/benchmark.sh +1 -4
- package/src/tigerbeetle/scripts/fail_on_diff.sh +9 -0
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +12 -0
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +55 -0
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +66 -0
- package/src/tigerbeetle/scripts/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +9 -0
- package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +12 -0
- package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/scripts/install.bat +7 -0
- package/src/tigerbeetle/scripts/scripts/install.sh +21 -0
- package/src/tigerbeetle/scripts/scripts/install_zig.bat +113 -0
- package/src/tigerbeetle/scripts/scripts/install_zig.sh +90 -0
- package/src/tigerbeetle/scripts/scripts/lint.zig +199 -0
- package/src/tigerbeetle/scripts/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +48 -0
- package/src/tigerbeetle/scripts/scripts/validate_docs.sh +23 -0
- package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +46 -0
- package/src/tigerbeetle/src/benchmark.zig +253 -231
- package/src/tigerbeetle/src/config.zig +2 -3
- package/src/tigerbeetle/src/constants.zig +2 -10
- package/src/tigerbeetle/src/io/linux.zig +15 -6
- package/src/tigerbeetle/src/lsm/forest.zig +1 -0
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +63 -14
- package/src/tigerbeetle/src/lsm/groove.zig +134 -70
- package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
- package/src/tigerbeetle/src/lsm/manifest_level.zig +1 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +7 -4
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1 -0
- package/src/tigerbeetle/src/lsm/table.zig +29 -51
- package/src/tigerbeetle/src/lsm/table_immutable.zig +6 -17
- package/src/tigerbeetle/src/lsm/table_iterator.zig +2 -2
- package/src/tigerbeetle/src/lsm/table_mutable.zig +9 -26
- package/src/tigerbeetle/src/lsm/test.zig +1 -0
- package/src/tigerbeetle/src/lsm/tree.zig +2 -26
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +7 -2
- package/src/tigerbeetle/src/message_bus.zig +2 -1
- package/src/tigerbeetle/src/simulator.zig +14 -3
- package/src/tigerbeetle/src/state_machine/auditor.zig +1 -0
- package/src/tigerbeetle/src/state_machine.zig +402 -184
- package/src/tigerbeetle/src/stdx.zig +32 -0
- package/src/tigerbeetle/src/testing/cluster/network.zig +6 -7
- package/src/tigerbeetle/src/testing/cluster.zig +6 -5
- package/src/tigerbeetle/src/testing/packet_simulator.zig +19 -10
- package/src/tigerbeetle/src/testing/state_machine.zig +1 -0
- package/src/tigerbeetle/src/unit_tests.zig +20 -22
- package/src/tigerbeetle/src/vsr/README.md +209 -0
- package/src/tigerbeetle/src/vsr/client.zig +4 -4
- package/src/tigerbeetle/src/vsr/clock.zig +2 -0
- package/src/tigerbeetle/src/vsr/journal.zig +2 -0
- package/src/tigerbeetle/src/vsr/replica.zig +646 -578
- package/src/tigerbeetle/src/vsr/superblock.zig +14 -17
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +5 -5
- package/src/tigerbeetle/src/vsr.zig +370 -37
|
@@ -96,6 +96,9 @@ pub const Command = enum(u8) {
|
|
|
96
96
|
ping,
|
|
97
97
|
pong,
|
|
98
98
|
|
|
99
|
+
ping_client,
|
|
100
|
+
pong_client,
|
|
101
|
+
|
|
99
102
|
request,
|
|
100
103
|
prepare,
|
|
101
104
|
prepare_ok,
|
|
@@ -159,6 +162,8 @@ pub const Operation = enum(u8) {
|
|
|
159
162
|
/// We reuse the same header for both so that prepare messages from the primary can simply be
|
|
160
163
|
/// journalled as is by the backups without requiring any further modification.
|
|
161
164
|
pub const Header = extern struct {
|
|
165
|
+
const checksum_body_empty = checksum(&.{});
|
|
166
|
+
|
|
162
167
|
comptime {
|
|
163
168
|
assert(@sizeOf(Header) == 128);
|
|
164
169
|
// Assert that there is no implicit padding in the struct.
|
|
@@ -250,6 +255,7 @@ pub const Header = extern struct {
|
|
|
250
255
|
/// * A `pong` sets this to the sender's wall clock value.
|
|
251
256
|
/// * A `request_prepare` sets this to `1` when `context` is set to a checksum, and `0`
|
|
252
257
|
/// otherwise.
|
|
258
|
+
/// * A `commit` message sets this to the replica's monotonic timestamp.
|
|
253
259
|
timestamp: u64 = 0,
|
|
254
260
|
|
|
255
261
|
/// The size of the Header structure (always), plus any associated body.
|
|
@@ -312,6 +318,8 @@ pub const Header = extern struct {
|
|
|
312
318
|
.reserved => self.invalid_reserved(),
|
|
313
319
|
.ping => self.invalid_ping(),
|
|
314
320
|
.pong => self.invalid_pong(),
|
|
321
|
+
.ping_client => self.invalid_ping_client(),
|
|
322
|
+
.pong_client => self.invalid_pong_client(),
|
|
315
323
|
.request => self.invalid_request(),
|
|
316
324
|
.prepare => self.invalid_prepare(),
|
|
317
325
|
.prepare_ok => self.invalid_prepare_ok(),
|
|
@@ -348,10 +356,14 @@ pub const Header = extern struct {
|
|
|
348
356
|
fn invalid_ping(self: *const Header) ?[]const u8 {
|
|
349
357
|
assert(self.command == .ping);
|
|
350
358
|
if (self.parent != 0) return "parent != 0";
|
|
359
|
+
if (self.client != 0) return "client != 0";
|
|
351
360
|
if (self.context != 0) return "context != 0";
|
|
352
361
|
if (self.request != 0) return "request != 0";
|
|
362
|
+
if (self.view != 0) return "view != 0";
|
|
353
363
|
if (self.commit != 0) return "commit != 0";
|
|
354
364
|
if (self.timestamp != 0) return "timestamp != 0";
|
|
365
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
366
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
355
367
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
356
368
|
return null;
|
|
357
369
|
}
|
|
@@ -362,7 +374,43 @@ pub const Header = extern struct {
|
|
|
362
374
|
if (self.client != 0) return "client != 0";
|
|
363
375
|
if (self.context != 0) return "context != 0";
|
|
364
376
|
if (self.request != 0) return "request != 0";
|
|
377
|
+
if (self.view != 0) return "view != 0";
|
|
378
|
+
if (self.commit != 0) return "commit != 0";
|
|
379
|
+
if (self.timestamp == 0) return "timestamp == 0";
|
|
380
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
381
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
382
|
+
if (self.operation != .reserved) return "operation != .reserved";
|
|
383
|
+
return null;
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
fn invalid_ping_client(self: *const Header) ?[]const u8 {
|
|
387
|
+
assert(self.command == .ping_client);
|
|
388
|
+
if (self.parent != 0) return "parent != 0";
|
|
389
|
+
if (self.client == 0) return "client == 0";
|
|
390
|
+
if (self.context != 0) return "context != 0";
|
|
391
|
+
if (self.request != 0) return "request != 0";
|
|
392
|
+
if (self.view != 0) return "view != 0";
|
|
393
|
+
if (self.op != 0) return "op != 0";
|
|
394
|
+
if (self.commit != 0) return "commit != 0";
|
|
395
|
+
if (self.timestamp != 0) return "timestamp != 0";
|
|
396
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
397
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
398
|
+
if (self.replica != 0) return "replica != 0";
|
|
399
|
+
if (self.operation != .reserved) return "operation != .reserved";
|
|
400
|
+
return null;
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
fn invalid_pong_client(self: *const Header) ?[]const u8 {
|
|
404
|
+
assert(self.command == .pong_client);
|
|
405
|
+
if (self.parent != 0) return "parent != 0";
|
|
406
|
+
if (self.client != 0) return "client != 0";
|
|
407
|
+
if (self.context != 0) return "context != 0";
|
|
408
|
+
if (self.request != 0) return "request != 0";
|
|
409
|
+
if (self.op != 0) return "op != 0";
|
|
365
410
|
if (self.commit != 0) return "commit != 0";
|
|
411
|
+
if (self.timestamp != 0) return "timestamp != 0";
|
|
412
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
413
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
366
414
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
367
415
|
return null;
|
|
368
416
|
}
|
|
@@ -379,10 +427,11 @@ pub const Header = extern struct {
|
|
|
379
427
|
.root => return "operation == .root",
|
|
380
428
|
.register => {
|
|
381
429
|
// The first request a client makes must be to register with the cluster:
|
|
382
|
-
if (self.parent != 0) return "parent != 0";
|
|
383
|
-
if (self.context != 0) return "context != 0";
|
|
384
|
-
if (self.request != 0) return "request != 0";
|
|
430
|
+
if (self.parent != 0) return "register: parent != 0";
|
|
431
|
+
if (self.context != 0) return "register: context != 0";
|
|
432
|
+
if (self.request != 0) return "register: request != 0";
|
|
385
433
|
// The .register operation carries no payload:
|
|
434
|
+
if (self.checksum_body != checksum_body_empty) return "register: checksum_body != expected";
|
|
386
435
|
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
387
436
|
},
|
|
388
437
|
else => {
|
|
@@ -408,6 +457,7 @@ pub const Header = extern struct {
|
|
|
408
457
|
if (self.op != 0) return "root: op != 0";
|
|
409
458
|
if (self.commit != 0) return "root: commit != 0";
|
|
410
459
|
if (self.timestamp != 0) return "root: timestamp != 0";
|
|
460
|
+
if (self.checksum_body != checksum_body_empty) return "root: checksum_body != expected";
|
|
411
461
|
if (self.size != @sizeOf(Header)) return "root: size != @sizeOf(Header)";
|
|
412
462
|
if (self.replica != 0) return "root: replica != 0";
|
|
413
463
|
},
|
|
@@ -430,6 +480,7 @@ pub const Header = extern struct {
|
|
|
430
480
|
|
|
431
481
|
fn invalid_prepare_ok(self: *const Header) ?[]const u8 {
|
|
432
482
|
assert(self.command == .prepare_ok);
|
|
483
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
433
484
|
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
434
485
|
switch (self.operation) {
|
|
435
486
|
.reserved => return "operation == .reserved",
|
|
@@ -448,6 +499,7 @@ pub const Header = extern struct {
|
|
|
448
499
|
if (self.client == 0) return "client == 0";
|
|
449
500
|
if (self.op == 0) return "op == 0";
|
|
450
501
|
if (self.op <= self.commit) return "op <= commit";
|
|
502
|
+
if (self.timestamp == 0) return "timestamp == 0";
|
|
451
503
|
if (self.operation == .register) {
|
|
452
504
|
if (self.request != 0) return "request != 0";
|
|
453
505
|
} else {
|
|
@@ -483,7 +535,9 @@ pub const Header = extern struct {
|
|
|
483
535
|
if (self.client != 0) return "client != 0";
|
|
484
536
|
if (self.request != 0) return "request != 0";
|
|
485
537
|
if (self.op != 0) return "op != 0";
|
|
486
|
-
if (self.timestamp
|
|
538
|
+
if (self.timestamp == 0) return "timestamp == 0";
|
|
539
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
540
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
487
541
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
488
542
|
return null;
|
|
489
543
|
}
|
|
@@ -497,6 +551,8 @@ pub const Header = extern struct {
|
|
|
497
551
|
if (self.op != 0) return "op != 0";
|
|
498
552
|
if (self.commit != 0) return "commit != 0";
|
|
499
553
|
if (self.timestamp != 0) return "timestamp != 0";
|
|
554
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
555
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
500
556
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
501
557
|
return null;
|
|
502
558
|
}
|
|
@@ -531,6 +587,8 @@ pub const Header = extern struct {
|
|
|
531
587
|
if (self.op != 0) return "op != 0";
|
|
532
588
|
if (self.commit != 0) return "commit != 0";
|
|
533
589
|
if (self.timestamp != 0) return "timestamp != 0";
|
|
590
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
591
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
534
592
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
535
593
|
return null;
|
|
536
594
|
}
|
|
@@ -541,8 +599,10 @@ pub const Header = extern struct {
|
|
|
541
599
|
if (self.client != 0) return "client != 0";
|
|
542
600
|
if (self.context != 0) return "context != 0";
|
|
543
601
|
if (self.request != 0) return "request != 0";
|
|
544
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
545
602
|
if (self.commit > self.op) return "op_min > op_max";
|
|
603
|
+
if (self.timestamp != 0) return "timestamp != 0";
|
|
604
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
605
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
546
606
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
547
607
|
return null;
|
|
548
608
|
}
|
|
@@ -553,6 +613,8 @@ pub const Header = extern struct {
|
|
|
553
613
|
if (self.client != 0) return "client != 0";
|
|
554
614
|
if (self.request != 0) return "request != 0";
|
|
555
615
|
if (self.commit != 0) return "commit != 0";
|
|
616
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
617
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
556
618
|
switch (self.timestamp) {
|
|
557
619
|
0 => if (self.context != 0) return "context != 0",
|
|
558
620
|
1 => {}, // context is a checksum, which may be 0.
|
|
@@ -581,6 +643,8 @@ pub const Header = extern struct {
|
|
|
581
643
|
if (self.request != 0) return "request != 0";
|
|
582
644
|
if (self.commit != 0) return "commit != 0";
|
|
583
645
|
if (self.timestamp != 0) return "timestamp != 0";
|
|
646
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
647
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
584
648
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
585
649
|
return null;
|
|
586
650
|
}
|
|
@@ -593,6 +657,8 @@ pub const Header = extern struct {
|
|
|
593
657
|
if (self.op != 0) return "op != 0";
|
|
594
658
|
if (self.commit != 0) return "commit != 0";
|
|
595
659
|
if (self.timestamp != 0) return "timestamp != 0";
|
|
660
|
+
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
661
|
+
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
596
662
|
if (self.operation != .reserved) return "operation != .reserved";
|
|
597
663
|
return null;
|
|
598
664
|
}
|
|
@@ -611,15 +677,7 @@ pub const Header = extern struct {
|
|
|
611
677
|
},
|
|
612
678
|
.prepare => return .unknown,
|
|
613
679
|
// These messages identify the peer as either a replica or a client:
|
|
614
|
-
|
|
615
|
-
.ping, .pong => {
|
|
616
|
-
if (self.client > 0) {
|
|
617
|
-
assert(self.replica == 0);
|
|
618
|
-
return .client;
|
|
619
|
-
} else {
|
|
620
|
-
return .replica;
|
|
621
|
-
}
|
|
622
|
-
},
|
|
680
|
+
.ping_client => return .client,
|
|
623
681
|
// All other messages identify the peer as a replica:
|
|
624
682
|
else => return .replica,
|
|
625
683
|
}
|
|
@@ -679,7 +737,7 @@ pub const Timeout = struct {
|
|
|
679
737
|
|
|
680
738
|
/// It's important to check that when fired() is acted on that the timeout is stopped/started,
|
|
681
739
|
/// otherwise further ticks around the event loop may trigger a thundering herd of messages.
|
|
682
|
-
pub fn fired(self: *Timeout) bool {
|
|
740
|
+
pub fn fired(self: *const Timeout) bool {
|
|
683
741
|
if (self.ticking and self.ticks >= self.after) {
|
|
684
742
|
log.debug("{}: {s} fired", .{ self.id, self.name });
|
|
685
743
|
if (self.ticks > self.after) {
|
|
@@ -967,6 +1025,8 @@ pub fn sector_ceil(offset: u64) u64 {
|
|
|
967
1025
|
}
|
|
968
1026
|
|
|
969
1027
|
pub fn checksum(source: []const u8) u128 {
|
|
1028
|
+
@setEvalBranchQuota(4000);
|
|
1029
|
+
|
|
970
1030
|
var target: [32]u8 = undefined;
|
|
971
1031
|
std.crypto.hash.Blake3.hash(source, target[0..], .{});
|
|
972
1032
|
return @bitCast(u128, target[0..@sizeOf(u128)].*);
|
|
@@ -976,22 +1036,30 @@ pub fn quorums(replica_count: u8) struct {
|
|
|
976
1036
|
replication: u8,
|
|
977
1037
|
view_change: u8,
|
|
978
1038
|
} {
|
|
979
|
-
|
|
980
|
-
assert(majority <= replica_count);
|
|
1039
|
+
assert(replica_count > 0);
|
|
981
1040
|
|
|
982
1041
|
assert(constants.quorum_replication_max >= 2);
|
|
983
|
-
|
|
1042
|
+
// For replica_count=2, set quorum_replication=2 even though =1 would intersect.
|
|
1043
|
+
// This improves durability of small clusters.
|
|
1044
|
+
const quorum_replication = if (replica_count == 2) 2 else std.math.min(
|
|
1045
|
+
constants.quorum_replication_max,
|
|
1046
|
+
stdx.div_ceil(replica_count, 2),
|
|
1047
|
+
);
|
|
1048
|
+
assert(quorum_replication <= replica_count);
|
|
984
1049
|
assert(quorum_replication >= 2 or quorum_replication == replica_count);
|
|
985
1050
|
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
1051
|
+
// For replica_count=2, set quorum_view_change=2 even though =1 would intersect.
|
|
1052
|
+
// This avoids special cases for a single-replica view-change in Replica.
|
|
1053
|
+
const quorum_view_change =
|
|
1054
|
+
if (replica_count == 2) 2 else replica_count - quorum_replication + 1;
|
|
990
1055
|
// The view change quorum may be more expensive to make the replication quorum cheaper.
|
|
991
1056
|
// The insight is that the replication phase is by far more common than the view change.
|
|
992
1057
|
// This trade-off allows us to optimize for the common case.
|
|
993
1058
|
// See the comments in `constants.zig` for further explanation.
|
|
994
|
-
assert(quorum_view_change
|
|
1059
|
+
assert(quorum_view_change <= replica_count);
|
|
1060
|
+
assert(quorum_view_change >= 2 or quorum_view_change == replica_count);
|
|
1061
|
+
assert(quorum_view_change >= @divFloor(replica_count, 2) + 1);
|
|
1062
|
+
assert(quorum_view_change + quorum_replication > replica_count);
|
|
995
1063
|
|
|
996
1064
|
return .{
|
|
997
1065
|
.replication = quorum_replication,
|
|
@@ -999,26 +1067,43 @@ pub fn quorums(replica_count: u8) struct {
|
|
|
999
1067
|
};
|
|
1000
1068
|
}
|
|
1001
1069
|
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1070
|
+
test "quorums" {
|
|
1071
|
+
if (constants.quorum_replication_max != 3) return error.SkipZigTest;
|
|
1072
|
+
|
|
1073
|
+
const expect_replication = [_]u8{ 1, 2, 2, 2, 3, 3, 3, 3 };
|
|
1074
|
+
const expect_view_change = [_]u8{ 1, 2, 2, 3, 3, 4, 5, 6 };
|
|
1075
|
+
|
|
1076
|
+
for (expect_replication[0..]) |_, i| {
|
|
1077
|
+
const actual = quorums(@intCast(u8, i) + 1);
|
|
1078
|
+
try std.testing.expectEqual(actual.replication, expect_replication[i]);
|
|
1079
|
+
try std.testing.expectEqual(actual.view_change, expect_view_change[i]);
|
|
1080
|
+
}
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
pub const Headers = struct {
|
|
1084
|
+
pub const Array = std.BoundedArray(Header, constants.view_change_headers_max);
|
|
1085
|
+
/// The SuperBlock's persisted VSR headers.
|
|
1086
|
+
/// One of the following:
|
|
1087
|
+
///
|
|
1088
|
+
/// - SV headers (consecutive chain)
|
|
1089
|
+
/// - DVC headers (disjoint chain)
|
|
1090
|
+
pub const ViewChangeSlice = ViewChangeHeadersSlice;
|
|
1091
|
+
pub const ViewChangeArray = ViewChangeHeadersArray;
|
|
1092
|
+
};
|
|
1093
|
+
|
|
1094
|
+
const ViewChangeHeadersSlice = struct {
|
|
1008
1095
|
/// Headers are ordered from high-to-low op.
|
|
1009
1096
|
slice: []const Header,
|
|
1010
1097
|
|
|
1011
|
-
pub
|
|
1012
|
-
|
|
1013
|
-
pub fn init(slice: []const Header) ViewChangeHeaders {
|
|
1014
|
-
ViewChangeHeaders.verify(slice);
|
|
1098
|
+
pub fn init(slice: []const Header) ViewChangeHeadersSlice {
|
|
1099
|
+
ViewChangeHeadersSlice.verify(slice);
|
|
1015
1100
|
|
|
1016
1101
|
return .{ .slice = slice };
|
|
1017
1102
|
}
|
|
1018
1103
|
|
|
1019
1104
|
pub fn verify(slice: []const Header) void {
|
|
1020
1105
|
assert(slice.len > 0);
|
|
1021
|
-
assert(slice.len <= constants.
|
|
1106
|
+
assert(slice.len <= constants.view_change_headers_max);
|
|
1022
1107
|
|
|
1023
1108
|
var child: ?*const Header = null;
|
|
1024
1109
|
for (slice) |*header| {
|
|
@@ -1053,7 +1138,7 @@ pub const ViewChangeHeaders = struct {
|
|
|
1053
1138
|
/// - When these are SV headers for a log_view=V, we can continue to add to them (by preparing
|
|
1054
1139
|
/// more ops), but those ops will always be part of the log_view. If they were prepared during
|
|
1055
1140
|
/// a view prior to the log_view, they would already be part of the headers.
|
|
1056
|
-
pub fn view_for_op(headers:
|
|
1141
|
+
pub fn view_for_op(headers: ViewChangeHeadersSlice, op: u64, log_view: u32) ViewRange {
|
|
1057
1142
|
const header_newest = &headers.slice[0];
|
|
1058
1143
|
const header_oldest = &headers.slice[headers.slice.len - 1];
|
|
1059
1144
|
|
|
@@ -1074,13 +1159,13 @@ pub const ViewChangeHeaders = struct {
|
|
|
1074
1159
|
}
|
|
1075
1160
|
};
|
|
1076
1161
|
|
|
1077
|
-
test "
|
|
1162
|
+
test "Headers.ViewChangeSlice.view_for_op" {
|
|
1078
1163
|
var headers_array = [_]Header{
|
|
1079
1164
|
std.mem.zeroInit(Header, .{ .op = 9, .view = 10 }),
|
|
1080
1165
|
std.mem.zeroInit(Header, .{ .op = 6, .view = 7 }),
|
|
1081
1166
|
};
|
|
1082
1167
|
|
|
1083
|
-
const headers =
|
|
1168
|
+
const headers = Headers.ViewChangeSlice{ .slice = &headers_array };
|
|
1084
1169
|
try std.testing.expect(std.meta.eql(headers.view_for_op(11, 12), .{ .min = 12, .max = 12 }));
|
|
1085
1170
|
try std.testing.expect(std.meta.eql(headers.view_for_op(10, 12), .{ .min = 12, .max = 12 }));
|
|
1086
1171
|
try std.testing.expect(std.meta.eql(headers.view_for_op(9, 12), .{ .min = 10, .max = 10 }));
|
|
@@ -1090,3 +1175,251 @@ test "ViewChangeHeaders.view_for_op" {
|
|
|
1090
1175
|
try std.testing.expect(std.meta.eql(headers.view_for_op(5, 12), .{ .min = 0, .max = 7 }));
|
|
1091
1176
|
try std.testing.expect(std.meta.eql(headers.view_for_op(0, 12), .{ .min = 0, .max = 7 }));
|
|
1092
1177
|
}
|
|
1178
|
+
|
|
1179
|
+
/// The headers of a SV or DVC message.
|
|
1180
|
+
const ViewChangeHeadersArray = struct {
|
|
1181
|
+
array: Headers.Array,
|
|
1182
|
+
|
|
1183
|
+
pub fn root(cluster: u32) ViewChangeHeadersArray {
|
|
1184
|
+
var array = Headers.Array{ .buffer = undefined };
|
|
1185
|
+
array.appendAssumeCapacity(Header.root_prepare(cluster));
|
|
1186
|
+
return ViewChangeHeadersArray.init(array);
|
|
1187
|
+
}
|
|
1188
|
+
|
|
1189
|
+
fn init(array: Headers.Array) ViewChangeHeadersArray {
|
|
1190
|
+
Headers.ViewChangeSlice.verify(array.constSlice());
|
|
1191
|
+
return .{ .array = array };
|
|
1192
|
+
}
|
|
1193
|
+
|
|
1194
|
+
/// This function generates either DVC headers or SV headers:
|
|
1195
|
+
/// - When `current.log_view < current.view`, generate headers for a DVC message.
|
|
1196
|
+
/// - When `current.log_view = current.view`, generate headers for a SV message.
|
|
1197
|
+
///
|
|
1198
|
+
/// Additionally, the current log_view/view/primary state informs the sort of "faults"
|
|
1199
|
+
/// (gaps/breaks/etc) that we expect to find in the journal headers (`current.headers`).
|
|
1200
|
+
/// For example, backups generating a DVC can safely skip over gaps (if the gap is after the DVC
|
|
1201
|
+
/// anchor).
|
|
1202
|
+
///
|
|
1203
|
+
/// Primaries and backups both generate DVCs and SVs.
|
|
1204
|
+
/// - However, SVs are broadcast only by the primary.
|
|
1205
|
+
/// - Backups generate a SV for persisting to the superblock.
|
|
1206
|
+
/// (For convenience/symmetry, not correctness).
|
|
1207
|
+
///
|
|
1208
|
+
/// DVCs and SVs have different invariants they must abide by.
|
|
1209
|
+
/// - Read DVCQuorum's comments to understand DVC invariants.
|
|
1210
|
+
/// - SV headers are much simpler: no gaps or breaks, and all uncommitted ops must be included.
|
|
1211
|
+
pub fn build(
|
|
1212
|
+
results: *ViewChangeHeadersArray,
|
|
1213
|
+
options: struct {
|
|
1214
|
+
op_checkpoint: u64,
|
|
1215
|
+
/// The last view_change_headers_max headers of the journal, starting with the head op
|
|
1216
|
+
/// then descending, skipping over all gaps.
|
|
1217
|
+
current: struct {
|
|
1218
|
+
headers: *const Headers.Array,
|
|
1219
|
+
view: u32,
|
|
1220
|
+
log_view: u32,
|
|
1221
|
+
log_view_primary: bool,
|
|
1222
|
+
},
|
|
1223
|
+
// The vsr_headers from the working superblock.
|
|
1224
|
+
// The durable headers are useful (complementing `current.headers`) because:
|
|
1225
|
+
// - They simplify generation of DVCs in the case where we are recovering from a crash,
|
|
1226
|
+
// when we were generating the same DVC prior to the crash.
|
|
1227
|
+
// - They enable additional verification of header gaps/breaks based on the
|
|
1228
|
+
// gap's/break's position relative to the durable headers.
|
|
1229
|
+
durable: struct {
|
|
1230
|
+
headers: Headers.ViewChangeSlice,
|
|
1231
|
+
view: u32,
|
|
1232
|
+
log_view: u32,
|
|
1233
|
+
log_view_primary: bool,
|
|
1234
|
+
},
|
|
1235
|
+
},
|
|
1236
|
+
) void {
|
|
1237
|
+
defer Headers.ViewChangeSlice.verify(results.array.constSlice());
|
|
1238
|
+
|
|
1239
|
+
const headers = &results.array;
|
|
1240
|
+
const current = options.current;
|
|
1241
|
+
const durable = options.durable;
|
|
1242
|
+
|
|
1243
|
+
assert(headers.len == 0);
|
|
1244
|
+
assert(durable.headers.slice.len > 0);
|
|
1245
|
+
assert(current.headers.len > 0);
|
|
1246
|
+
for (current.headers.constSlice()[1..]) |*header, i| {
|
|
1247
|
+
assert(current.headers.get(i).op > header.op);
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
assert(current.view >= durable.view);
|
|
1251
|
+
assert(current.log_view >= durable.log_view);
|
|
1252
|
+
assert(current.view >= current.log_view);
|
|
1253
|
+
assert(durable.view >= durable.log_view);
|
|
1254
|
+
|
|
1255
|
+
const op_head_current = current.headers.get(0).op;
|
|
1256
|
+
const op_head_durable = durable.headers.slice[0].op;
|
|
1257
|
+
|
|
1258
|
+
// The rules for generating DVCs and SVs differ. We use the current view numbers to
|
|
1259
|
+
// determine which is being generated:
|
|
1260
|
+
// - When `log_view < view`, generate a DVC.
|
|
1261
|
+
// - When `log_view = view`, generate a SV.
|
|
1262
|
+
const command_current: enum { start_view, do_view_change } =
|
|
1263
|
+
if (current.log_view == current.view) .start_view else .do_view_change;
|
|
1264
|
+
// Likewise, the durable view numbers identify whether the durable headers were from a past
|
|
1265
|
+
// DVC or SV. The durable headers are only useful if they are from the same view as our
|
|
1266
|
+
// current headers, though.
|
|
1267
|
+
const command_durable: enum { start_view, do_view_change, outdated } = command: {
|
|
1268
|
+
if (durable.log_view == current.log_view) {
|
|
1269
|
+
if (durable.log_view == durable.view) {
|
|
1270
|
+
break :command .start_view;
|
|
1271
|
+
} else {
|
|
1272
|
+
break :command .do_view_change;
|
|
1273
|
+
}
|
|
1274
|
+
} else {
|
|
1275
|
+
break :command .outdated;
|
|
1276
|
+
}
|
|
1277
|
+
};
|
|
1278
|
+
|
|
1279
|
+
if (command_durable == .do_view_change and command_current == .do_view_change) {
|
|
1280
|
+
assert(op_head_durable == op_head_current);
|
|
1281
|
+
// Ensure that if we started a DVC before a crash, that we will resume sending the exact
|
|
1282
|
+
// same DVC after recovery. (An alternative implementation would be to load the
|
|
1283
|
+
// superblock's DVC headers (including gaps) into the journal during Replica.open(), but
|
|
1284
|
+
// that is more complicated to implement correctly).
|
|
1285
|
+
for (durable.headers.slice) |*header| headers.appendAssumeCapacity(header.*);
|
|
1286
|
+
return;
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
// What is the relationship between two prepares?
|
|
1290
|
+
const Chain = enum {
|
|
1291
|
+
// The ops are sequential, and the hash-chain is valid.
|
|
1292
|
+
chain_sequence,
|
|
1293
|
+
// The ops are sequential, and the hash-chain is invalid.
|
|
1294
|
+
chain_break,
|
|
1295
|
+
// The ops are non-sequential, and belong to the same view.
|
|
1296
|
+
// This gap never hides a break.
|
|
1297
|
+
chain_view,
|
|
1298
|
+
// The ops are non-sequential, and belong to different views.
|
|
1299
|
+
// Depending on the replica state, this gap may hide a break.
|
|
1300
|
+
chain_gap,
|
|
1301
|
+
};
|
|
1302
|
+
|
|
1303
|
+
// The DVC anchor: Within the log suffix following the anchor, we have additional
|
|
1304
|
+
// guarantees about the state of the log headers which allow us to tolerate certain
|
|
1305
|
+
// gaps (by locally guaranteeing that the gap does not hide a break).
|
|
1306
|
+
const op_dvc_anchor = std.math.max(
|
|
1307
|
+
options.op_checkpoint,
|
|
1308
|
+
// +1: We may have a full pipeline, but not yet have performed any repair.
|
|
1309
|
+
// In such a case, we want to send those pipeline_prepare_queue_max headers in
|
|
1310
|
+
// the DVC, but not the preceding op (which may belong to a different chain).
|
|
1311
|
+
// This satisfies the DVC invariant because the first op in the pipeline is
|
|
1312
|
+
// "connected" to the canonical chain (via its "parent" checksum).
|
|
1313
|
+
1 + op_head_current -| constants.pipeline_prepare_queue_max,
|
|
1314
|
+
);
|
|
1315
|
+
|
|
1316
|
+
// Within the "suffix" we can make additional assumptions about gaps/etc.
|
|
1317
|
+
// After the suffix, we just add as many extra (valid) headers as we can fit.
|
|
1318
|
+
var suffix_done = false;
|
|
1319
|
+
|
|
1320
|
+
for (current.headers.constSlice()) |*header, i| {
|
|
1321
|
+
const op = header.op;
|
|
1322
|
+
const chain = chain: {
|
|
1323
|
+
// Always include the head message.
|
|
1324
|
+
if (i == 0) break :chain Chain.chain_sequence;
|
|
1325
|
+
|
|
1326
|
+
const child = headers.get(i - 1);
|
|
1327
|
+
if (child.op == header.op + 1) {
|
|
1328
|
+
break :chain if (child.parent == header.checksum) Chain.chain_sequence else Chain.chain_break;
|
|
1329
|
+
} else {
|
|
1330
|
+
break :chain if (child.view == header.view) Chain.chain_view else Chain.chain_gap;
|
|
1331
|
+
}
|
|
1332
|
+
};
|
|
1333
|
+
|
|
1334
|
+
if (command_current == .start_view) {
|
|
1335
|
+
// Primary: Collect headers for a start_view message.
|
|
1336
|
+
// Backup: these headers are stored in the superblock's vsr_headers.
|
|
1337
|
+
switch (chain) {
|
|
1338
|
+
.chain_sequence => {},
|
|
1339
|
+
// Gaps are due to either:
|
|
1340
|
+
// - entries before checkpoint, which are not repaired, or
|
|
1341
|
+
// - backup missed prepares and has not repaired headers. (Immediately after
|
|
1342
|
+
// receiving a start_view this is not a concern, but the view_durable_update()
|
|
1343
|
+
// may be delayed if another is in progress).
|
|
1344
|
+
.chain_view, .chain_gap => {
|
|
1345
|
+
assert(op <= options.op_checkpoint or !current.log_view_primary);
|
|
1346
|
+
break;
|
|
1347
|
+
},
|
|
1348
|
+
// Breaks are due to:
|
|
1349
|
+
// - entries before checkpoint, which are not repaired
|
|
1350
|
+
.chain_break => {
|
|
1351
|
+
assert(op <= options.op_checkpoint);
|
|
1352
|
+
break;
|
|
1353
|
+
},
|
|
1354
|
+
}
|
|
1355
|
+
} else if (suffix_done) {
|
|
1356
|
+
// Add extra headers to the DVC. These are not required for correctness or
|
|
1357
|
+
// availability, but including extra (correct) headers minimizes header repair at
|
|
1358
|
+
// the new primary.
|
|
1359
|
+
switch (chain) {
|
|
1360
|
+
.chain_sequence => {},
|
|
1361
|
+
.chain_view => {},
|
|
1362
|
+
// Outside of the log suffix, repair may not have been finished, so gaps and
|
|
1363
|
+
// breaks are possible. Non-same-view gaps may hide breaks.
|
|
1364
|
+
.chain_gap => break,
|
|
1365
|
+
.chain_break => break,
|
|
1366
|
+
}
|
|
1367
|
+
} else if (current.log_view_primary and command_durable == .start_view) {
|
|
1368
|
+
switch (chain) {
|
|
1369
|
+
.chain_sequence => {},
|
|
1370
|
+
// Gaps to the right of the (durable) SV originate from:
|
|
1371
|
+
// 1. The primary (durable SV: 1,2,3) prepares several ops (4,5,6).
|
|
1372
|
+
// 2. However, the WAL writes are reordered such that some later ops (5,6)
|
|
1373
|
+
// finish before an earlier op (4).
|
|
1374
|
+
// 3. Crash, recover. Start sending a DVC for the next view. Either:
|
|
1375
|
+
// - There is a gap in the WAL at op=4, but this is to the right of the
|
|
1376
|
+
// durable SV, so it may be safely skipped.
|
|
1377
|
+
// - Same as above, except op=4 was a torn write (or bit rot).
|
|
1378
|
+
.chain_view, .chain_gap => assert(op + 1 > op_head_durable),
|
|
1379
|
+
// Breaks are impossible to the right of the durable SV — journal recovery uses
|
|
1380
|
+
// the durable SV to prune bad headers by their view numbers.
|
|
1381
|
+
.chain_break => unreachable,
|
|
1382
|
+
}
|
|
1383
|
+
suffix_done = op <= op_head_durable;
|
|
1384
|
+
} else if (current.log_view_primary and command_durable != .start_view) {
|
|
1385
|
+
switch (chain) {
|
|
1386
|
+
.chain_sequence => {},
|
|
1387
|
+
.chain_view => {},
|
|
1388
|
+
// The retiring primary may have gap-breaks or breaks in its suffix iff:
|
|
1389
|
+
// - it didn't finish repairs before the second view-change, and
|
|
1390
|
+
// - some uncommitted ops were truncated during the first view-change.
|
|
1391
|
+
// (Truncation "moves" the suffix backwards).
|
|
1392
|
+
.chain_gap => break,
|
|
1393
|
+
.chain_break => break,
|
|
1394
|
+
}
|
|
1395
|
+
suffix_done = op <= op_dvc_anchor;
|
|
1396
|
+
} else if (!current.log_view_primary and command_durable == .start_view) {
|
|
1397
|
+
switch (chain) {
|
|
1398
|
+
.chain_sequence => {},
|
|
1399
|
+
// Backups load a full suffix of headers from the view's SV message. If there
|
|
1400
|
+
// is now a gap in the backup's suffix, this must be due to missed prepares.
|
|
1401
|
+
.chain_view, .chain_gap => assert(op + 1 > op_head_durable),
|
|
1402
|
+
// Breaks are impossible to the right of the durable SV — journal recovery uses
|
|
1403
|
+
// the durable SV to prune bad headers by their view numbers.
|
|
1404
|
+
.chain_break => unreachable,
|
|
1405
|
+
}
|
|
1406
|
+
suffix_done = op <= op_head_durable;
|
|
1407
|
+
} else if (!current.log_view_primary and command_durable != .start_view) {
|
|
1408
|
+
switch (chain) {
|
|
1409
|
+
.chain_sequence => {},
|
|
1410
|
+
.chain_view => {},
|
|
1411
|
+
// Backups load a full suffix of headers from the view's SV message.
|
|
1412
|
+
// That SV isn't durable, but it is part of the journal, so any gaps to its
|
|
1413
|
+
// right must be due to missed prepares.
|
|
1414
|
+
.chain_gap => {},
|
|
1415
|
+
// Breaks are impossible to the right of the ephemeral SV, since the log was
|
|
1416
|
+
// truncated when the SV was installed.
|
|
1417
|
+
.chain_break => unreachable,
|
|
1418
|
+
}
|
|
1419
|
+
suffix_done = op <= op_dvc_anchor;
|
|
1420
|
+
} else unreachable;
|
|
1421
|
+
|
|
1422
|
+
headers.appendAssumeCapacity(header.*);
|
|
1423
|
+
}
|
|
1424
|
+
}
|
|
1425
|
+
};
|