tigerbeetle-node 0.5.2 → 0.6.0
This diff compares the publicly available contents of the two package versions as published to their respective public registries, and is provided for informational purposes only.
- package/README.md +3 -4
- package/package.json +1 -1
- package/src/node.zig +2 -12
- package/src/tigerbeetle/scripts/benchmark.bat +46 -0
- package/src/tigerbeetle/scripts/install_zig.bat +2 -2
- package/src/tigerbeetle/scripts/install_zig.sh +1 -1
- package/src/tigerbeetle/scripts/vopr.sh +2 -2
- package/src/tigerbeetle/src/benchmark.zig +2 -6
- package/src/tigerbeetle/src/cli.zig +39 -18
- package/src/tigerbeetle/src/config.zig +24 -9
- package/src/tigerbeetle/src/demo.zig +1 -1
- package/src/tigerbeetle/src/io/benchmark.zig +24 -49
- package/src/tigerbeetle/src/io/darwin.zig +175 -44
- package/src/tigerbeetle/src/io/linux.zig +177 -72
- package/src/tigerbeetle/src/io/test.zig +61 -39
- package/src/tigerbeetle/src/io/windows.zig +1161 -0
- package/src/tigerbeetle/src/io.zig +2 -0
- package/src/tigerbeetle/src/main.zig +13 -8
- package/src/tigerbeetle/src/message_bus.zig +49 -61
- package/src/tigerbeetle/src/message_pool.zig +63 -57
- package/src/tigerbeetle/src/ring_buffer.zig +7 -0
- package/src/tigerbeetle/src/simulator.zig +4 -4
- package/src/tigerbeetle/src/storage.zig +0 -230
- package/src/tigerbeetle/src/test/cluster.zig +3 -6
- package/src/tigerbeetle/src/test/message_bus.zig +4 -3
- package/src/tigerbeetle/src/test/network.zig +13 -16
- package/src/tigerbeetle/src/test/state_checker.zig +3 -2
- package/src/tigerbeetle/src/tigerbeetle.zig +5 -3
- package/src/tigerbeetle/src/time.zig +58 -11
- package/src/tigerbeetle/src/vsr/client.zig +18 -32
- package/src/tigerbeetle/src/vsr/clock.zig +1 -1
- package/src/tigerbeetle/src/vsr/journal.zig +2 -6
- package/src/tigerbeetle/src/vsr/replica.zig +146 -169
- package/src/tigerbeetle/src/vsr.zig +263 -5
package/src/tigerbeetle/src/vsr/journal.zig

@@ -510,11 +510,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             const physical_size = vsr.sector_ceil(exact.size);
             assert(physical_size >= exact.size);
 
-            const message = replica.message_bus.get_message() orelse {
-                self.read_prepare_log(op, checksum, "no message available");
-                callback(replica, null, null);
-                return;
-            };
+            const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
             // Skip the disk read if the header is all we need:
@@ -658,7 +654,7 @@ pub fn Journal(comptime Replica: type, comptime Storage: type) type {
             }
             assert(offset < self.size_headers);
 
-            const message = replica.message_bus.get_message()
+            const message = replica.message_bus.get_message();
             defer replica.message_bus.unref(message);
 
             // We use the count of reads executing to know when both versions have finished reading:
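The journal change above is one instance of a pattern repeated throughout this release: `get_message()` no longer returns an optional, so callers drop their `orelse` fallback blocks. A minimal sketch of the two calling conventions, with stand-in `Message`/`get_message` definitions rather than the real message pool:

```zig
const std = @import("std");

// Stand-in types for illustration only; this is not the TigerBeetle MessagePool API.
const Message = struct { references: u32 = 0 };
var storage: Message = .{};

// Old-style: the pool may be exhausted, so the return type is optional.
fn get_message_optional() ?*Message {
    return &storage;
}

// New-style: the pool is sized so that a message is always available.
fn get_message() *Message {
    return &storage;
}

pub fn main() void {
    // Every old call site needs its own fallback branch:
    const a = get_message_optional() orelse {
        std.debug.print("no message available\n", .{});
        return;
    };

    // The new call site has no branch and no per-caller error logging:
    const b = get_message();

    std.debug.assert(a == b);
}
```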
package/src/tigerbeetle/src/vsr/replica.zig

@@ -52,7 +52,7 @@ const Prepare = struct {
     message: *Message,
 
     /// Unique prepare_ok messages for the same view, op number and checksum from ALL replicas.
-    ok_from_all_replicas: QuorumMessages = QuorumMessagesNull,
+    ok_from_all_replicas: QuorumCounter = QuorumCounterNull,
 
     /// Whether a quorum of prepare_ok messages has been received for this prepare.
     ok_quorum_received: bool = false,
@@ -61,6 +61,9 @@ const Prepare = struct {
 const QuorumMessages = [config.replicas_max]?*Message;
 const QuorumMessagesNull = [_]?*Message{null} ** config.replicas_max;
 
+const QuorumCounter = std.StaticBitSet(config.replicas_max);
+const QuorumCounterNull = QuorumCounter.initEmpty();
+
 pub fn Replica(
     comptime StateMachine: type,
     comptime MessageBus: type,
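The new `QuorumCounter` replaces arrays of `?*Message` (one retained message reference per replica) with one bit per replica. A self-contained sketch of how such a `std.StaticBitSet` behaves; the size 6 is assumed here purely for illustration, not taken from config.zig:

```zig
const std = @import("std");
const assert = std.debug.assert;

// Illustration only: `replicas_max = 6` is assumed; TigerBeetle takes it from config.zig.
const replicas_max = 6;
const QuorumCounter = std.StaticBitSet(replicas_max);

pub fn main() void {
    var ok_from_all_replicas = QuorumCounter.initEmpty();

    // Record prepare_ok messages by sender index; setting the same bit twice is a no-op.
    ok_from_all_replicas.set(0);
    ok_from_all_replicas.set(2);
    ok_from_all_replicas.set(2);

    assert(ok_from_all_replicas.count() == 2); // Unique senders, not raw messages.
    assert(ok_from_all_replicas.isSet(2));
    assert(!ok_from_all_replicas.isSet(1));

    // Iterate the replicas that have *not* acknowledged yet:
    var it = ok_from_all_replicas.iterator(.{ .kind = .unset });
    while (it.next()) |replica| {
        std.debug.print("still waiting for replica {}\n", .{replica});
    }
}
```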
@@ -141,13 +144,13 @@ pub fn Replica(
         loopback_queue: ?*Message = null,
 
         /// Unique start_view_change messages for the same view from OTHER replicas (excluding ourself).
-        start_view_change_from_other_replicas: QuorumMessages = QuorumMessagesNull,
+        start_view_change_from_other_replicas: QuorumCounter = QuorumCounterNull,
 
         /// Unique do_view_change messages for the same view from ALL replicas (including ourself).
         do_view_change_from_all_replicas: QuorumMessages = QuorumMessagesNull,
 
         /// Unique nack_prepare messages for the same view from OTHER replicas (excluding ourself).
-        nack_prepare_from_other_replicas: QuorumMessages = QuorumMessagesNull,
+        nack_prepare_from_other_replicas: QuorumCounter = QuorumCounterNull,
 
         /// Whether a replica has received a quorum of start_view_change messages for the view change:
         start_view_change_quorum: bool = false,
@@ -379,9 +382,6 @@ pub fn Replica(
             var it = self.pipeline.iterator();
             while (it.next()) |prepare| {
                 self.message_bus.unref(prepare.message);
-                for (prepare.ok_from_all_replicas) |message| {
-                    if (message) |m| self.message_bus.unref(m);
-                }
             }
         }
 
@@ -391,15 +391,9 @@ pub fn Replica(
                 self.loopback_queue = null;
             }
 
-            for (self.start_view_change_from_other_replicas) |message| {
-                if (message) |m| self.message_bus.unref(m);
-            }
             for (self.do_view_change_from_all_replicas) |message| {
                 if (message) |m| self.message_bus.unref(m);
             }
-            for (self.nack_prepare_from_other_replicas) |message| {
-                if (message) |m| self.message_bus.unref(m);
-            }
         }
 
         /// Time is measured in logical ticks that are incremented on every call to tick().
@@ -608,7 +602,7 @@ pub fn Replica(
 
             log.debug("{}: on_request: prepare {}", .{ self.replica, message.header.checksum });
 
-            self.pipeline.
+            self.pipeline.push_assume_capacity(.{ .message = message.ref() });
             assert(self.pipeline.count >= 1);
 
             if (self.pipeline.count == 1) {
@@ -735,7 +729,7 @@ pub fn Replica(
             // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
             const threshold = self.quorum_replication;
 
-            const count = self.
+            const count = self.count_message_and_receive_quorum_exactly_once(
                 &prepare.ok_from_all_replicas,
                 message,
                 threshold,
@@ -878,32 +872,18 @@ pub fn Replica(
             assert(self.status == .view_change);
             assert(message.header.view == self.view);
 
-            if (self.leader_index(self.view) == self.replica) {
-                // If we are the leader of the new view, then wait until we have a message to send a
-                // do_view_change message to ourself. The on_do_view_change() handler will panic if
-                // we received a start_view_change quorum without a do_view_change to ourself.
-                if (self.message_bus.get_message()) |available| {
-                    self.message_bus.unref(available);
-                } else {
-                    log.err("{}: on_start_view_change: waiting for message for do_view_change", .{
-                        self.replica,
-                    });
-                    return;
-                }
-            }
-
             // Wait until we have `f` messages (excluding ourself) for quorum:
             assert(self.replica_count > 1);
             const threshold = self.quorum_view_change - 1;
 
-            const count = self.
+            const count = self.count_message_and_receive_quorum_exactly_once(
                 &self.start_view_change_from_other_replicas,
                 message,
                 threshold,
             ) orelse return;
 
             assert(count == threshold);
-            assert(self.start_view_change_from_other_replicas
+            assert(!self.start_view_change_from_other_replicas.isSet(self.replica));
             log.debug("{}: on_start_view_change: view={} quorum received", .{
                 self.replica,
                 self.view,
@@ -956,7 +936,7 @@ pub fn Replica(
             assert(self.replica_count > 1);
             const threshold = self.quorum_view_change;
 
-            const count = self.
+            const count = self.reference_message_and_receive_quorum_exactly_once(
                 &self.do_view_change_from_all_replicas,
                 message,
                 threshold,
@@ -1097,12 +1077,7 @@ pub fn Replica(
             assert(message.header.replica != self.replica);
             assert(self.leader());
 
-            const start_view = self.create_view_change_message(.start_view) orelse {
-                log.err("{}: on_request_start_view: dropping start_view, no message available", .{
-                    self.replica,
-                });
-                return;
-            };
+            const start_view = self.create_view_change_message(.start_view);
             defer self.message_bus.unref(start_view);
 
             assert(start_view.references == 1);
@@ -1126,10 +1101,7 @@ pub fn Replica(
                 return;
             }
 
-            const response = self.message_bus.get_message() orelse {
-                log.err("{}: on_recovery: ignoring (waiting for message)", .{self.replica});
-                return;
-            };
+            const response = self.message_bus.get_message();
             defer self.message_bus.unref(response);
 
             response.header.* = .{
@@ -1276,14 +1248,7 @@ pub fn Replica(
             assert(message.header.view == self.view);
             assert(message.header.replica != self.replica);
 
-            const response = self.message_bus.get_message() orelse {
-                log.err("{}: on_request_headers: ignoring (op={}..{}, no message available)", .{
-                    self.replica,
-                    message.header.commit,
-                    message.header.op,
-                });
-                return;
-            };
+            const response = self.message_bus.get_message();
             defer self.message_bus.unref(response);
 
             response.header.* = .{
@@ -1410,14 +1375,14 @@ pub fn Replica(
             assert(threshold < self.replica_count);
 
             // Wait until we have `threshold` messages for quorum:
-            const count = self.
+            const count = self.count_message_and_receive_quorum_exactly_once(
                 &self.nack_prepare_from_other_replicas,
                 message,
                 threshold,
             ) orelse return;
 
             assert(count == threshold);
-            assert(self.nack_prepare_from_other_replicas
+            assert(!self.nack_prepare_from_other_replicas.isSet(self.replica));
             log.debug("{}: on_nack_prepare: quorum received", .{self.replica});
 
             self.discard_uncommitted_ops_from(op, checksum);
@@ -1487,18 +1452,32 @@ pub fn Replica(
             // The list of remote replicas yet to send a prepare_ok:
             var waiting: [config.replicas_max]u8 = undefined;
             var waiting_len: usize = 0;
-
-
+            var ok_from_all_replicas_iterator = prepare.ok_from_all_replicas.iterator(.{
+                .kind = .unset,
+            });
+            while (ok_from_all_replicas_iterator.next()) |replica| {
+                // Ensure we don't wait for replicas that don't exist.
+                // The bits between `replica_count` and `replicas_max` are always unset,
+                // since they don't actually represent replicas.
+                if (replica == self.replica_count) {
+                    assert(self.replica_count < config.replicas_max);
+                    break;
+                }
+                assert(replica < self.replica_count);
+
+                if (replica != self.replica) {
                     waiting[waiting_len] = @intCast(u8, replica);
                     waiting_len += 1;
                 }
+            } else {
+                assert(self.replica_count == config.replicas_max);
             }
 
             if (waiting_len == 0) {
                 self.prepare_timeout.reset();
 
                 log.debug("{}: on_prepare_timeout: waiting for journal", .{self.replica});
-                assert(prepare.ok_from_all_replicas
+                assert(!prepare.ok_from_all_replicas.isSet(self.replica));
 
                 // We may be slow and waiting for the write to complete.
                 //
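The `} else {` added in this hunk is Zig's `while ... else`: the else branch runs only when the loop finishes without `break`, here meaning every unset bit corresponded to a real replica. A small sketch of that control flow under assumed sizes (not taken from config.zig):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Illustration of Zig's `while ... else`: the else branch runs only when the loop
// ends without `break`. The sizes are assumed; they are not read from config.zig.
pub fn main() void {
    const replicas_max = 6;
    const replica_count: usize = 6; // a full cluster, so there are no padding bits

    var acked = std.StaticBitSet(replicas_max).initEmpty();
    acked.set(1);

    var waiting_count: usize = 0;
    var it = acked.iterator(.{ .kind = .unset });
    while (it.next()) |replica| {
        // Padding bits at or beyond replica_count would not represent real replicas.
        if (replica == replica_count) break;
        waiting_count += 1;
    } else {
        // Reached only when we never broke out: every bit maps to a real replica.
        assert(replica_count == replicas_max);
    }

    assert(waiting_count == replicas_max - 1);
}
```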
@@ -1590,7 +1569,7 @@ pub fn Replica(
             self.repair();
         }
 
-        fn
+        fn reference_message_and_receive_quorum_exactly_once(
             self: *Self,
             messages: *QuorumMessages,
             message: *Message,
@@ -1604,18 +1583,6 @@ pub fn Replica(
             assert(message.header.replica < self.replica_count);
             assert(message.header.view == self.view);
             switch (message.header.command) {
-                .prepare_ok => {
-                    if (self.replica_count <= 2) assert(threshold == self.replica_count);
-
-                    assert(self.status == .normal);
-                    assert(self.leader());
-                },
-                .start_view_change => {
-                    assert(self.replica_count > 1);
-                    if (self.replica_count == 2) assert(threshold == 1);
-
-                    assert(self.status == .view_change);
-                },
                 .do_view_change => {
                     assert(self.replica_count > 1);
                     if (self.replica_count == 2) assert(threshold == 2);
@@ -1623,13 +1590,6 @@ pub fn Replica(
                     assert(self.status == .view_change);
                     assert(self.leader_index(self.view) == self.replica);
                 },
-                .nack_prepare => {
-                    assert(self.replica_count > 1);
-                    if (self.replica_count == 2) assert(threshold >= 1);
-
-                    assert(self.status == .view_change);
-                    assert(self.leader_index(self.view) == self.replica);
-                },
                 else => unreachable,
             }
 
@@ -1673,6 +1633,76 @@ pub fn Replica(
             return count;
         }
 
+        fn count_message_and_receive_quorum_exactly_once(
+            self: *Self,
+            counter: *QuorumCounter,
+            message: *Message,
+            threshold: u32,
+        ) ?usize {
+            assert(threshold >= 1);
+            assert(threshold <= self.replica_count);
+
+            assert(QuorumCounter.bit_length == config.replicas_max);
+            assert(message.header.cluster == self.cluster);
+            assert(message.header.replica < self.replica_count);
+            assert(message.header.view == self.view);
+
+            switch (message.header.command) {
+                .prepare_ok => {
+                    if (self.replica_count <= 2) assert(threshold == self.replica_count);
+
+                    assert(self.status == .normal);
+                    assert(self.leader());
+                },
+                .start_view_change => {
+                    assert(self.replica_count > 1);
+                    if (self.replica_count == 2) assert(threshold == 1);
+
+                    assert(self.status == .view_change);
+                },
+                .nack_prepare => {
+                    assert(self.replica_count > 1);
+                    if (self.replica_count == 2) assert(threshold >= 1);
+
+                    assert(self.status == .view_change);
+                    assert(self.leader_index(self.view) == self.replica);
+                },
+                else => unreachable,
+            }
+
+            const command: []const u8 = @tagName(message.header.command);
+
+            // Do not allow duplicate messages to trigger multiple passes through a state transition:
+            if (counter.isSet(message.header.replica)) {
+                log.debug("{}: on_{s}: ignoring (duplicate message)", .{ self.replica, command });
+                return null;
+            }
+
+            // Record the first receipt of this message:
+            counter.set(message.header.replica);
+            assert(counter.isSet(message.header.replica));
+
+            // Count the number of unique messages now received:
+            const count = counter.count();
+            log.debug("{}: on_{s}: {} message(s)", .{ self.replica, command, count });
+            assert(count <= self.replica_count);
+
+            // Wait until we have exactly `threshold` messages for quorum:
+            if (count < threshold) {
+                log.debug("{}: on_{s}: waiting for quorum", .{ self.replica, command });
+                return null;
+            }
+
+            // This is not the first time we have had quorum, the state transition has already happened:
+            if (count > threshold) {
+                log.debug("{}: on_{s}: ignoring (quorum received already)", .{ self.replica, command });
+                return null;
+            }
+
+            assert(count == threshold);
+            return count;
+        }
+
         fn append(self: *Self, message: *Message) void {
             assert(self.status == .normal);
             assert(message.header.command == .prepare);
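`count_message_and_receive_quorum_exactly_once()` returns a non-null count only on the call that reaches the threshold, so each quorum triggers its state transition exactly once. A reduced, self-contained sketch of that pattern (the free function and the size 6 are assumptions, not the Replica API):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Reduced sketch of the "exactly once" pattern above, as a free function over a plain
// bit set. Names and the size 6 are illustrative; this is not the Replica API.
fn count_exactly_once(counter: *std.StaticBitSet(6), replica: usize, threshold: usize) ?usize {
    if (counter.isSet(replica)) return null; // duplicate message, already counted
    counter.set(replica);
    const count = counter.count();
    if (count < threshold) return null; // still waiting for quorum
    if (count > threshold) return null; // quorum already triggered on an earlier call
    return count; // returned exactly once, on the call that reaches the threshold
}

pub fn main() void {
    var counter = std.StaticBitSet(6).initEmpty();

    assert(count_exactly_once(&counter, 0, 2) == null); // 1 of 2
    assert(count_exactly_once(&counter, 0, 2) == null); // duplicate from replica 0
    assert(count_exactly_once(&counter, 3, 2).? == 2); // quorum reached exactly here
    assert(count_exactly_once(&counter, 5, 2) == null); // past quorum: no second trigger
}
```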
@@ -1834,14 +1864,7 @@ pub fn Replica(
                 return;
             }
 
-
-            const reply = self.message_bus.get_message() orelse {
-                log.err("{}: commit_ops_commit: waiting for message", .{self.replica});
-                return;
-            };
-            defer self.message_bus.unref(reply);
-
-            self.commit_op(prepare.?, reply);
+            self.commit_op(prepare.?);
 
             assert(self.commit_min == op);
             assert(self.commit_min <= self.commit_max);
@@ -1851,7 +1874,7 @@ pub fn Replica(
             self.commit_ops_read();
         }
 
-        fn commit_op(self: *Self, prepare: *const Message, reply: *Message) void {
+        fn commit_op(self: *Self, prepare: *const Message) void {
             // TODO Can we add more checks around allowing commit_op() during a view change?
             assert(self.status == .normal or self.status == .view_change);
             assert(prepare.header.command == .prepare);
@@ -1875,6 +1898,9 @@ pub fn Replica(
                 @tagName(prepare.header.operation.cast(StateMachine)),
             });
 
+            const reply = self.message_bus.get_message();
+            defer self.message_bus.unref(reply);
+
             const reply_body_size = @intCast(u32, self.state_machine.commit(
                 prepare.header.client,
                 prepare.header.operation.cast(StateMachine),
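commit_op() now acquires its own reply message and releases it with `defer`, instead of every caller pre-allocating one and handling the out-of-messages case. A minimal sketch of that acquire-in-callee pattern with stand-in `Pool`/`Message` types (not the TigerBeetle message pool):

```zig
const std = @import("std");

// Sketch of the ownership change: commit_op() acquires its own reply and releases
// it with `defer`. `Pool` and `Message` here are stand-ins, not the real types.
const Message = struct { references: u32 = 0 };

const Pool = struct {
    slot: Message = .{},

    fn get_message(pool: *Pool) *Message {
        pool.slot.references += 1;
        return &pool.slot;
    }

    fn unref(pool: *Pool, message: *Message) void {
        _ = pool;
        message.references -= 1;
    }
};

fn commit_op(pool: *Pool) void {
    const reply = pool.get_message();
    defer pool.unref(reply); // released on every return path out of commit_op()

    // ... fill in reply.header, run the state machine, send the reply ...
}

pub fn main() void {
    var pool = Pool{};
    commit_op(&pool);
    std.debug.assert(pool.slot.references == 0);
}
```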
@@ -1939,27 +1965,16 @@ pub fn Replica(
                 return;
             }
 
-            const count =
-                &prepare.ok_from_all_replicas,
-                .prepare_ok,
-                prepare.message.header.checksum,
-            );
+            const count = prepare.ok_from_all_replicas.count();
             assert(count >= self.quorum_replication);
+            assert(count <= self.replica_count);
 
-
-            const reply = self.message_bus.get_message() orelse {
-                // Eventually handled by on_prepare_timeout().
-                log.err("{}: commit_pipeline: waiting for message", .{self.replica});
-                return;
-            };
-            defer self.message_bus.unref(reply);
-
-            self.commit_op(prepare.message, reply);
+            self.commit_op(prepare.message);
 
             assert(self.commit_min == self.commit_max);
             assert(self.commit_max == prepare.message.header.op);
 
-            self.
+            self.message_bus.unref(prepare.message);
             assert(self.pipeline.pop() != null);
         }
 
@@ -1984,15 +1999,6 @@ pub fn Replica(
             assert(m.header.context == context);
             assert(m.header.replica == replica);
             switch (command) {
-                .prepare_ok => {
-                    if (self.status == .normal) {
-                        assert(self.leader());
-                        assert(m.header.view == self.view);
-                    } else {
-                        assert(self.status == .view_change);
-                        assert(m.header.view < self.view);
-                    }
-                },
                 .start_view_change => {
                     assert(m.header.replica != self.replica);
                     assert(m.header.view == self.view);
@@ -2090,13 +2096,13 @@ pub fn Replica(
         }
 
         /// The caller owns the returned message, if any, which has exactly 1 reference.
-        fn create_view_change_message(self: *Self, command: Command) ?*Message {
+        fn create_view_change_message(self: *Self, command: Command) *Message {
             assert(command == .do_view_change or command == .start_view);
 
             // We may send a start_view message in normal status to resolve a follower's view jump:
             assert(self.status == .normal or self.status == .view_change);
 
-            const message = self.message_bus.get_message()
+            const message = self.message_bus.get_message();
             defer self.message_bus.unref(message);
 
             message.header.* = .{
@@ -2146,12 +2152,12 @@ pub fn Replica(
         }
 
         /// The caller owns the returned message, if any, which has exactly 1 reference.
-        fn create_message_from_header(self: *Self, header: Header) ?*Message {
+        fn create_message_from_header(self: *Self, header: Header) *Message {
            assert(header.replica == self.replica);
            assert(header.view == self.view or header.command == .request_start_view);
            assert(header.size == @sizeOf(Header));
 
-            const message = self.message_bus.pool.
+            const message = self.message_bus.pool.get_message();
             defer self.message_bus.unref(message);
 
             message.header.* = header;
@@ -3206,7 +3212,7 @@ pub fn Replica(
                 prepare.?.header.checksum,
             });
 
-            self.pipeline.
+            self.pipeline.push_assume_capacity(.{ .message = prepare.?.ref() });
             assert(self.pipeline.count >= 1);
 
             self.repairing_pipeline = true;
@@ -3336,17 +3342,11 @@ pub fn Replica(
                 assert(nack_prepare_op <= op);
                 if (nack_prepare_op != op) {
                     self.nack_prepare_op = op;
-                    self.reset_quorum_messages(
-                        &self.nack_prepare_from_other_replicas,
-                        .nack_prepare,
-                    );
+                    self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
                 }
             } else {
                 self.nack_prepare_op = op;
-                self.reset_quorum_messages(
-                    &self.nack_prepare_from_other_replicas,
-                    .nack_prepare,
-                );
+                self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
             }
 
             assert(self.nack_prepare_op.? == op);
@@ -3421,7 +3421,7 @@ pub fn Replica(
         /// Stops the prepare timeout and resets the timeouts counter.
         fn reset_pipeline(self: *Self) void {
             while (self.pipeline.pop()) |prepare| {
-                self.
+                self.message_bus.unref(prepare.message);
             }
 
             self.prepare_timeout.stop();
@@ -3460,18 +3460,33 @@ pub fn Replica(
             log.debug("{}: reset {} {s} message(s)", .{ self.replica, count, @tagName(command) });
         }
 
+        fn reset_quorum_counter(self: *Self, counter: *QuorumCounter) void {
+            var counter_iterator = counter.iterator(.{});
+            while (counter_iterator.next()) |replica| {
+                assert(replica < self.replica_count);
+            }
+
+            counter.setIntersection(QuorumCounterNull);
+            assert(counter.count() == 0);
+
+            var replica: usize = 0;
+            while (replica < self.replica_count) : (replica += 1) {
+                assert(!counter.isSet(replica));
+            }
+        }
+
         fn reset_quorum_do_view_change(self: *Self) void {
             self.reset_quorum_messages(&self.do_view_change_from_all_replicas, .do_view_change);
             self.do_view_change_quorum = false;
         }
 
         fn reset_quorum_nack_prepare(self: *Self) void {
-            self.
+            self.reset_quorum_counter(&self.nack_prepare_from_other_replicas);
             self.nack_prepare_op = null;
         }
 
         fn reset_quorum_start_view_change(self: *Self) void {
-            self.
+            self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
             self.start_view_change_quorum = false;
         }
 
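`reset_quorum_counter()` clears a counter by intersecting it with the empty set. A tiny sketch of that operation in isolation (the size 6 is assumed for illustration):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Sketch of the clearing step in reset_quorum_counter(): intersecting with the empty
// set zeroes every bit. The size 6 is assumed for illustration.
pub fn main() void {
    const QuorumCounter = std.StaticBitSet(6);

    var counter = QuorumCounter.initFull();
    assert(counter.count() == 6);

    counter.setIntersection(QuorumCounter.initEmpty());
    assert(counter.count() == 0);
}
```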
@@ -3576,17 +3591,12 @@ pub fn Replica(
             assert(self.status == .view_change);
             assert(self.start_view_change_quorum);
             assert(!self.do_view_change_quorum);
-
-
-                .start_view_change,
-                0,
-            );
+
+            const count_start_view_change = self.start_view_change_from_other_replicas.count();
             assert(count_start_view_change >= self.quorum_view_change - 1);
+            assert(count_start_view_change <= self.replica_count - 1);
 
-            const message = self.create_view_change_message(.do_view_change) orelse {
-                log.err("{}: send_do_view_change: waiting for message", .{self.replica});
-                return;
-            };
+            const message = self.create_view_change_message(.do_view_change);
             defer self.message_bus.unref(message);
 
             assert(message.references == 1);
@@ -3618,25 +3628,14 @@ pub fn Replica(
         }
 
         fn send_header_to_client(self: *Self, client: u128, header: Header) void {
-            const message = self.create_message_from_header(header) orelse {
-                log.err("{}: no header-only message available, dropping message to client {}", .{
-                    self.replica,
-                    client,
-                });
-                return;
-            };
+            const message = self.create_message_from_header(header);
             defer self.message_bus.unref(message);
 
             self.message_bus.send_message_to_client(client, message);
         }
 
         fn send_header_to_other_replicas(self: *Self, header: Header) void {
-            const message = self.create_message_from_header(header) orelse {
-                log.err("{}: no header-only message available, dropping message to replicas", .{
-                    self.replica,
-                });
-                return;
-            };
+            const message = self.create_message_from_header(header);
             defer self.message_bus.unref(message);
 
             var replica: u8 = 0;
|
|
|
3648
3647
|
}
|
|
3649
3648
|
|
|
3650
3649
|
fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
|
|
3651
|
-
const message = self.create_message_from_header(header)
|
|
3652
|
-
log.err("{}: no header-only message available, dropping message to replica {}", .{
|
|
3653
|
-
self.replica,
|
|
3654
|
-
replica,
|
|
3655
|
-
});
|
|
3656
|
-
return;
|
|
3657
|
-
};
|
|
3650
|
+
const message = self.create_message_from_header(header);
|
|
3658
3651
|
defer self.message_bus.unref(message);
|
|
3659
3652
|
|
|
3660
3653
|
self.send_message_to_replica(replica, message);
|
|
@@ -3912,10 +3905,7 @@ pub fn Replica(
             assert(self.journal.faulty.len == 0);
             assert(self.nack_prepare_op == null);
 
-            const start_view = self.create_view_change_message(.start_view) orelse {
-                log.err("{}: start_view_as_the_new_leader: waiting for message", .{self.replica});
-                return;
-            };
+            const start_view = self.create_view_change_message(.start_view);
             defer self.message_bus.unref(start_view);
 
             self.transition_to_normal_status(self.view);
@@ -4022,19 +4012,6 @@ pub fn Replica(
             self.send_start_view_change();
         }
 
-        fn unref_prepare_message_and_quorum_messages(
-            self: *Self,
-            prepare: *const Prepare,
-        ) void {
-            self.message_bus.unref(prepare.message);
-            for (prepare.ok_from_all_replicas) |received, replica| {
-                if (received) |prepare_ok| {
-                    assert(replica < self.replica_count);
-                    self.message_bus.unref(prepare_ok);
-                }
-            }
-        }
-
         fn update_client_table_entry(self: *Self, reply: *Message) void {
             assert(reply.header.command == .reply);
             assert(reply.header.operation != .register);