tigerbeetle-node 0.11.13 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -10
- package/dist/bin/aarch64-linux-gnu/client.node +0 -0
- package/dist/bin/aarch64-linux-musl/client.node +0 -0
- package/dist/bin/aarch64-macos/client.node +0 -0
- package/dist/bin/x86_64-linux-gnu/client.node +0 -0
- package/dist/bin/x86_64-linux-musl/client.node +0 -0
- package/dist/bin/x86_64-macos/client.node +0 -0
- package/dist/index.js +33 -1
- package/dist/index.js.map +1 -1
- package/package-lock.json +66 -0
- package/package.json +6 -16
- package/src/index.ts +56 -1
- package/src/node.zig +9 -9
- package/dist/.client.node.sha256 +0 -1
- package/scripts/build_lib.sh +0 -61
- package/scripts/download_node_headers.sh +0 -32
- package/src/tigerbeetle/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/src/benchmark.zig +0 -336
- package/src/tigerbeetle/src/config.zig +0 -233
- package/src/tigerbeetle/src/constants.zig +0 -428
- package/src/tigerbeetle/src/ewah.zig +0 -286
- package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
- package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
- package/src/tigerbeetle/src/fifo.zig +0 -120
- package/src/tigerbeetle/src/io/benchmark.zig +0 -213
- package/src/tigerbeetle/src/io/darwin.zig +0 -814
- package/src/tigerbeetle/src/io/linux.zig +0 -1071
- package/src/tigerbeetle/src/io/test.zig +0 -643
- package/src/tigerbeetle/src/io/windows.zig +0 -1183
- package/src/tigerbeetle/src/io.zig +0 -34
- package/src/tigerbeetle/src/iops.zig +0 -107
- package/src/tigerbeetle/src/lsm/README.md +0 -308
- package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
- package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
- package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
- package/src/tigerbeetle/src/lsm/direction.zig +0 -11
- package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
- package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
- package/src/tigerbeetle/src/lsm/forest.zig +0 -205
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
- package/src/tigerbeetle/src/lsm/grid.zig +0 -573
- package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
- package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
- package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
- package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
- package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
- package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
- package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
- package/src/tigerbeetle/src/lsm/table.zig +0 -1009
- package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
- package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
- package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
- package/src/tigerbeetle/src/lsm/test.zig +0 -439
- package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
- package/src/tigerbeetle/src/message_bus.zig +0 -1013
- package/src/tigerbeetle/src/message_pool.zig +0 -156
- package/src/tigerbeetle/src/ring_buffer.zig +0 -399
- package/src/tigerbeetle/src/simulator.zig +0 -580
- package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
- package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
- package/src/tigerbeetle/src/state_machine.zig +0 -2099
- package/src/tigerbeetle/src/static_allocator.zig +0 -65
- package/src/tigerbeetle/src/stdx.zig +0 -171
- package/src/tigerbeetle/src/storage.zig +0 -393
- package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
- package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
- package/src/tigerbeetle/src/testing/cluster.zig +0 -444
- package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
- package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
- package/src/tigerbeetle/src/testing/id.zig +0 -99
- package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
- package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
- package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
- package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
- package/src/tigerbeetle/src/testing/storage.zig +0 -757
- package/src/tigerbeetle/src/testing/table.zig +0 -247
- package/src/tigerbeetle/src/testing/time.zig +0 -84
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
- package/src/tigerbeetle/src/time.zig +0 -112
- package/src/tigerbeetle/src/tracer.zig +0 -529
- package/src/tigerbeetle/src/unit_tests.zig +0 -40
- package/src/tigerbeetle/src/vopr.zig +0 -495
- package/src/tigerbeetle/src/vsr/README.md +0 -209
- package/src/tigerbeetle/src/vsr/client.zig +0 -544
- package/src/tigerbeetle/src/vsr/clock.zig +0 -855
- package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
- package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
- package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
- package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
- package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
- package/src/tigerbeetle/src/vsr.zig +0 -1425
|
@@ -1,1425 +0,0 @@
|
|
|
1
|
-
const std = @import("std");
|
|
2
|
-
const math = std.math;
|
|
3
|
-
const Allocator = std.mem.Allocator;
|
|
4
|
-
const assert = std.debug.assert;
|
|
5
|
-
const log = std.log.scoped(.vsr);
|
|
6
|
-
|
|
7
|
-
// vsr.zig is the root of a Zig package; re-export all public APIs.
|
|
8
|
-
//
|
|
9
|
-
// Note that we don't promise any stability of these interfaces yet.
|
|
10
|
-
pub const constants = @import("constants.zig");
|
|
11
|
-
pub const io = @import("io.zig");
|
|
12
|
-
pub const message_bus = @import("message_bus.zig");
|
|
13
|
-
pub const message_pool = @import("message_pool.zig");
|
|
14
|
-
pub const state_machine = @import("state_machine.zig");
|
|
15
|
-
pub const storage = @import("storage.zig");
|
|
16
|
-
pub const tigerbeetle = @import("tigerbeetle.zig");
|
|
17
|
-
pub const time = @import("time.zig");
|
|
18
|
-
pub const tracer = @import("tracer.zig");
|
|
19
|
-
pub const config = @import("config.zig");
|
|
20
|
-
pub const stdx = @import("stdx.zig");
|
|
21
|
-
pub const superblock = @import("vsr/superblock.zig");
|
|
22
|
-
pub const lsm = .{
|
|
23
|
-
.tree = @import("lsm/tree.zig"),
|
|
24
|
-
.grid = @import("lsm/grid.zig"),
|
|
25
|
-
.groove = @import("lsm/groove.zig"),
|
|
26
|
-
.forest = @import("lsm/forest.zig"),
|
|
27
|
-
.posted_groove = @import("lsm/posted_groove.zig"),
|
|
28
|
-
};
|
|
29
|
-
pub const testing = .{
|
|
30
|
-
.cluster = @import("testing/cluster.zig"),
|
|
31
|
-
};
|
|
32
|
-
|
|
33
|
-
pub const ReplicaType = @import("vsr/replica.zig").ReplicaType;
|
|
34
|
-
pub const format = @import("vsr/replica_format.zig").format;
|
|
35
|
-
pub const Status = @import("vsr/replica.zig").Status;
|
|
36
|
-
pub const Client = @import("vsr/client.zig").Client;
|
|
37
|
-
pub const Clock = @import("vsr/clock.zig").Clock;
|
|
38
|
-
pub const JournalType = @import("vsr/journal.zig").JournalType;
|
|
39
|
-
pub const SlotRange = @import("vsr/journal.zig").SlotRange;
|
|
40
|
-
pub const SuperBlockType = superblock.SuperBlockType;
|
|
41
|
-
pub const VSRState = superblock.SuperBlockHeader.VSRState;
|
|
42
|
-
|
|
43
|
-
/// The version of our Viewstamped Replication protocol in use, including customizations.
|
|
44
|
-
/// For backwards compatibility through breaking changes (e.g. upgrading checksums/ciphers).
|
|
45
|
-
pub const Version: u8 = 0;
|
|
46
|
-
|
|
47
|
-
pub const ProcessType = enum { replica, client };
|
|
48
|
-
|
|
49
|
-
pub const Zone = enum {
|
|
50
|
-
superblock,
|
|
51
|
-
wal_headers,
|
|
52
|
-
wal_prepares,
|
|
53
|
-
grid,
|
|
54
|
-
|
|
55
|
-
const size_superblock = superblock.superblock_zone_size;
|
|
56
|
-
const size_wal_headers = constants.journal_size_headers;
|
|
57
|
-
const size_wal_prepares = constants.journal_size_prepares;
|
|
58
|
-
|
|
59
|
-
comptime {
|
|
60
|
-
for (.{
|
|
61
|
-
size_superblock,
|
|
62
|
-
size_wal_headers,
|
|
63
|
-
size_wal_prepares,
|
|
64
|
-
}) |zone_size| {
|
|
65
|
-
assert(zone_size % constants.sector_size == 0);
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
pub fn offset(zone: Zone, offset_logical: u64) u64 {
|
|
70
|
-
if (zone.size()) |zone_size| {
|
|
71
|
-
assert(offset_logical < zone_size);
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
return offset_logical + switch (zone) {
|
|
75
|
-
.superblock => 0,
|
|
76
|
-
.wal_headers => size_superblock,
|
|
77
|
-
.wal_prepares => size_superblock + size_wal_headers,
|
|
78
|
-
.grid => size_superblock + size_wal_headers + size_wal_prepares,
|
|
79
|
-
};
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
pub fn size(zone: Zone) ?u64 {
|
|
83
|
-
return switch (zone) {
|
|
84
|
-
.superblock => size_superblock,
|
|
85
|
-
.wal_headers => size_wal_headers,
|
|
86
|
-
.wal_prepares => size_wal_prepares,
|
|
87
|
-
.grid => null,
|
|
88
|
-
};
|
|
89
|
-
}
|
|
90
|
-
};
|
|
91
|
-
|
|
92
|
-
/// Viewstamped Replication protocol commands:
|
|
93
|
-
pub const Command = enum(u8) {
|
|
94
|
-
reserved,
|
|
95
|
-
|
|
96
|
-
ping,
|
|
97
|
-
pong,
|
|
98
|
-
|
|
99
|
-
ping_client,
|
|
100
|
-
pong_client,
|
|
101
|
-
|
|
102
|
-
request,
|
|
103
|
-
prepare,
|
|
104
|
-
prepare_ok,
|
|
105
|
-
reply,
|
|
106
|
-
commit,
|
|
107
|
-
|
|
108
|
-
start_view_change,
|
|
109
|
-
do_view_change,
|
|
110
|
-
start_view,
|
|
111
|
-
|
|
112
|
-
request_start_view,
|
|
113
|
-
request_headers,
|
|
114
|
-
request_prepare,
|
|
115
|
-
headers,
|
|
116
|
-
nack_prepare,
|
|
117
|
-
|
|
118
|
-
eviction,
|
|
119
|
-
|
|
120
|
-
request_block,
|
|
121
|
-
block,
|
|
122
|
-
};
|
|
123
|
-
|
|
124
|
-
/// This type exists to avoid making the Header type dependent on the state
|
|
125
|
-
/// machine used, which would cause awkward circular type dependencies.
|
|
126
|
-
pub const Operation = enum(u8) {
|
|
127
|
-
/// Operations reserved by the VR protocol (for all state machines):
|
|
128
|
-
/// The value 0 is reserved to prevent a spurious zero from being interpreted as an operation.
|
|
129
|
-
reserved = 0,
|
|
130
|
-
/// The value 1 is reserved to initialize the cluster.
|
|
131
|
-
root = 1,
|
|
132
|
-
/// The value 2 is reserved to register a client session with the cluster.
|
|
133
|
-
register = 2,
|
|
134
|
-
|
|
135
|
-
/// Operations exported by the state machine (all other values are free):
|
|
136
|
-
_,
|
|
137
|
-
|
|
138
|
-
pub fn from(comptime StateMachine: type, op: StateMachine.Operation) Operation {
|
|
139
|
-
check_state_machine_operations(StateMachine.Operation);
|
|
140
|
-
return @intToEnum(Operation, @enumToInt(op));
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
pub fn cast(self: Operation, comptime StateMachine: type) StateMachine.Operation {
|
|
144
|
-
check_state_machine_operations(StateMachine.Operation);
|
|
145
|
-
return @intToEnum(StateMachine.Operation, @enumToInt(self));
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
fn check_state_machine_operations(comptime Op: type) void {
|
|
149
|
-
if (!@hasField(Op, "reserved") or std.meta.fieldInfo(Op, .reserved).value != 0) {
|
|
150
|
-
@compileError("StateMachine.Operation must have a 'reserved' field with value 0");
|
|
151
|
-
}
|
|
152
|
-
if (!@hasField(Op, "root") or std.meta.fieldInfo(Op, .root).value != 1) {
|
|
153
|
-
@compileError("StateMachine.Operation must have a 'root' field with value 1");
|
|
154
|
-
}
|
|
155
|
-
if (!@hasField(Op, "register") or std.meta.fieldInfo(Op, .register).value != 2) {
|
|
156
|
-
@compileError("StateMachine.Operation must have a 'register' field with value 2");
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
};
|
|
160
|
-
|
|
161
|
-
/// Network message and journal entry header:
|
|
162
|
-
/// We reuse the same header for both so that prepare messages from the primary can simply be
|
|
163
|
-
/// journalled as is by the backups without requiring any further modification.
|
|
164
|
-
pub const Header = extern struct {
|
|
165
|
-
const checksum_body_empty = checksum(&.{});
|
|
166
|
-
|
|
167
|
-
comptime {
|
|
168
|
-
assert(@sizeOf(Header) == 128);
|
|
169
|
-
// Assert that there is no implicit padding in the struct.
|
|
170
|
-
assert(@bitSizeOf(Header) == @sizeOf(Header) * 8);
|
|
171
|
-
}
|
|
172
|
-
/// A checksum covering only the remainder of this header.
|
|
173
|
-
/// This allows the header to be trusted without having to recv() or read() the associated body.
|
|
174
|
-
/// This checksum is enough to uniquely identify a network message or journal entry.
|
|
175
|
-
checksum: u128 = 0,
|
|
176
|
-
|
|
177
|
-
/// A checksum covering only the associated body after this header.
|
|
178
|
-
checksum_body: u128 = 0,
|
|
179
|
-
|
|
180
|
-
/// A backpointer to the previous request or prepare checksum for hash chain verification.
|
|
181
|
-
/// This provides a cryptographic guarantee for linearizability:
|
|
182
|
-
/// 1. across our distributed log of prepares, and
|
|
183
|
-
/// 2. across a client's requests and our replies.
|
|
184
|
-
/// This may also be used as the initialization vector for AEAD encryption at rest, provided
|
|
185
|
-
/// that the primary ratchets the encryption key every view change to ensure that prepares
|
|
186
|
-
/// reordered through a view change never repeat the same IV for the same encryption key.
|
|
187
|
-
parent: u128 = 0,
|
|
188
|
-
|
|
189
|
-
/// Each client process generates a unique, random and ephemeral client ID at initialization.
|
|
190
|
-
/// The client ID identifies connections made by the client to the cluster for the sake of
|
|
191
|
-
/// routing messages back to the client.
|
|
192
|
-
///
|
|
193
|
-
/// With the client ID in hand, the client then registers a monotonically increasing session
|
|
194
|
-
/// number (committed through the cluster) to allow the client's session to be evicted safely
|
|
195
|
-
/// from the client table if too many concurrent clients cause the client table to overflow.
|
|
196
|
-
/// The monotonically increasing session number prevents duplicate client requests from being
|
|
197
|
-
/// replayed.
|
|
198
|
-
///
|
|
199
|
-
/// The problem of routing is therefore solved by the 128-bit client ID, and the problem of
|
|
200
|
-
/// detecting whether a session has been evicted is solved by the session number.
|
|
201
|
-
client: u128 = 0,
|
|
202
|
-
|
|
203
|
-
/// The checksum of the message to which this message refers, or a unique recovery nonce.
|
|
204
|
-
///
|
|
205
|
-
/// We use this cryptographic context in various ways, for example:
|
|
206
|
-
///
|
|
207
|
-
/// * A `request` sets this to the client's session number.
|
|
208
|
-
/// * A `prepare` sets this to the checksum of the client's request.
|
|
209
|
-
/// * A `prepare_ok` sets this to the checksum of the prepare being acked.
|
|
210
|
-
/// * A `commit` sets this to the checksum of the latest committed prepare.
|
|
211
|
-
/// * A `request_prepare` sets this to the checksum of the prepare being requested.
|
|
212
|
-
/// * A `nack_prepare` sets this to the checksum of the prepare being nacked.
|
|
213
|
-
/// * A `recovery` or `recovery_response` sets this to the nonce.
|
|
214
|
-
///
|
|
215
|
-
/// This allows for cryptographic guarantees beyond request, op, and commit numbers, which have
|
|
216
|
-
/// low entropy and may otherwise collide in the event of any correctness bugs.
|
|
217
|
-
context: u128 = 0,
|
|
218
|
-
|
|
219
|
-
/// Each request is given a number by the client and later requests must have larger numbers
|
|
220
|
-
/// than earlier ones. The request number is used by the replicas to avoid running requests more
|
|
221
|
-
/// than once; it is also used by the client to discard duplicate responses to its requests.
|
|
222
|
-
/// A client is allowed to have at most one request inflight at a time.
|
|
223
|
-
request: u32 = 0,
|
|
224
|
-
|
|
225
|
-
/// The cluster number binds intention into the header, so that a client or replica can indicate
|
|
226
|
-
/// the cluster it believes it is speaking to, instead of accidentally talking to the wrong
|
|
227
|
-
/// cluster (for example, staging vs production).
|
|
228
|
-
cluster: u32,
|
|
229
|
-
|
|
230
|
-
/// The cluster reconfiguration epoch number (for future use).
|
|
231
|
-
epoch: u32 = 0,
|
|
232
|
-
|
|
233
|
-
/// Every message sent from one replica to another contains the sending replica's current view.
|
|
234
|
-
/// A `u32` allows for a minimum lifetime of 136 years at a rate of one view change per second.
|
|
235
|
-
view: u32 = 0,
|
|
236
|
-
|
|
237
|
-
/// The op number of the latest prepare that may or may not yet be committed. Uncommitted ops
|
|
238
|
-
/// may be replaced by different ops if they do not survive through a view change.
|
|
239
|
-
op: u64 = 0,
|
|
240
|
-
|
|
241
|
-
/// The commit number of the latest committed prepare. Committed ops are immutable.
|
|
242
|
-
///
|
|
243
|
-
/// * A `do_view_change` sets this to `commit_min`, to indicate the sending replica's progress.
|
|
244
|
-
/// The sending replica may continue to commit after sending the DVC.
|
|
245
|
-
/// * A `start_view` sets this to `commit_max`.
|
|
246
|
-
commit: u64 = 0,
|
|
247
|
-
|
|
248
|
-
/// This field is used in various ways:
|
|
249
|
-
///
|
|
250
|
-
/// * A `prepare` sets this to the primary's state machine `prepare_timestamp`.
|
|
251
|
-
/// For `create_accounts` and `create_transfers` this is the batch's highest timestamp.
|
|
252
|
-
/// * A `reply` sets this to the corresponding `prepare`'s timestamp.
|
|
253
|
-
/// This allows the test workload to verify transfer timeouts.
|
|
254
|
-
/// * A `do_view_change` sets this to the latest normal view number.
|
|
255
|
-
/// * A `pong` sets this to the sender's wall clock value.
|
|
256
|
-
/// * A `request_prepare` sets this to `1` when `context` is set to a checksum, and `0`
|
|
257
|
-
/// otherwise.
|
|
258
|
-
/// * A `commit` message sets this to the replica's monotonic timestamp.
|
|
259
|
-
timestamp: u64 = 0,
|
|
260
|
-
|
|
261
|
-
/// The size of the Header structure (always), plus any associated body.
|
|
262
|
-
size: u32 = @sizeOf(Header),
|
|
263
|
-
|
|
264
|
-
/// The index of the replica in the cluster configuration array that authored this message.
|
|
265
|
-
/// This identifies only the ultimate author because messages may be forwarded amongst replicas.
|
|
266
|
-
replica: u8 = 0,
|
|
267
|
-
|
|
268
|
-
/// The Viewstamped Replication protocol command for this message.
|
|
269
|
-
command: Command,
|
|
270
|
-
|
|
271
|
-
/// The state machine operation to apply.
|
|
272
|
-
operation: Operation = .reserved,
|
|
273
|
-
|
|
274
|
-
/// The version of the protocol implementation that originated this message.
|
|
275
|
-
version: u8 = Version,
|
|
276
|
-
|
|
277
|
-
pub fn calculate_checksum(self: *const Header) u128 {
|
|
278
|
-
const checksum_size = @sizeOf(@TypeOf(self.checksum));
|
|
279
|
-
assert(checksum_size == 16);
|
|
280
|
-
const checksum_value = checksum(std.mem.asBytes(self)[checksum_size..]);
|
|
281
|
-
assert(@TypeOf(checksum_value) == @TypeOf(self.checksum));
|
|
282
|
-
return checksum_value;
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
pub fn calculate_checksum_body(self: *const Header, body: []const u8) u128 {
|
|
286
|
-
assert(self.size == @sizeOf(Header) + body.len);
|
|
287
|
-
const checksum_size = @sizeOf(@TypeOf(self.checksum_body));
|
|
288
|
-
assert(checksum_size == 16);
|
|
289
|
-
const checksum_value = checksum(body);
|
|
290
|
-
assert(@TypeOf(checksum_value) == @TypeOf(self.checksum_body));
|
|
291
|
-
return checksum_value;
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
/// This must be called only after set_checksum_body() so that checksum_body is also covered:
|
|
295
|
-
pub fn set_checksum(self: *Header) void {
|
|
296
|
-
self.checksum = self.calculate_checksum();
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
pub fn set_checksum_body(self: *Header, body: []const u8) void {
|
|
300
|
-
self.checksum_body = self.calculate_checksum_body(body);
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
pub fn valid_checksum(self: *const Header) bool {
|
|
304
|
-
return self.checksum == self.calculate_checksum();
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
pub fn valid_checksum_body(self: *const Header, body: []const u8) bool {
|
|
308
|
-
return self.checksum_body == self.calculate_checksum_body(body);
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
/// Returns null if all fields are set correctly according to the command, or else a warning.
|
|
312
|
-
/// This does not verify that checksum is valid, and expects that this has already been done.
|
|
313
|
-
pub fn invalid(self: *const Header) ?[]const u8 {
|
|
314
|
-
if (self.version != Version) return "version != Version";
|
|
315
|
-
if (self.size < @sizeOf(Header)) return "size < @sizeOf(Header)";
|
|
316
|
-
if (self.epoch != 0) return "epoch != 0";
|
|
317
|
-
return switch (self.command) {
|
|
318
|
-
.reserved => self.invalid_reserved(),
|
|
319
|
-
.ping => self.invalid_ping(),
|
|
320
|
-
.pong => self.invalid_pong(),
|
|
321
|
-
.ping_client => self.invalid_ping_client(),
|
|
322
|
-
.pong_client => self.invalid_pong_client(),
|
|
323
|
-
.request => self.invalid_request(),
|
|
324
|
-
.prepare => self.invalid_prepare(),
|
|
325
|
-
.prepare_ok => self.invalid_prepare_ok(),
|
|
326
|
-
.reply => self.invalid_reply(),
|
|
327
|
-
.commit => self.invalid_commit(),
|
|
328
|
-
.start_view_change => self.invalid_start_view_change(),
|
|
329
|
-
.do_view_change => self.invalid_do_view_change(),
|
|
330
|
-
.start_view => self.invalid_start_view(),
|
|
331
|
-
.request_start_view => self.invalid_request_start_view(),
|
|
332
|
-
.request_headers => self.invalid_request_headers(),
|
|
333
|
-
.request_prepare => self.invalid_request_prepare(),
|
|
334
|
-
.request_block => null, // TODO
|
|
335
|
-
.headers => self.invalid_headers(),
|
|
336
|
-
.nack_prepare => self.invalid_nack_prepare(),
|
|
337
|
-
.eviction => self.invalid_eviction(),
|
|
338
|
-
.block => null, // TODO
|
|
339
|
-
};
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
fn invalid_reserved(self: *const Header) ?[]const u8 {
|
|
343
|
-
assert(self.command == .reserved);
|
|
344
|
-
if (self.parent != 0) return "parent != 0";
|
|
345
|
-
if (self.client != 0) return "client != 0";
|
|
346
|
-
if (self.context != 0) return "context != 0";
|
|
347
|
-
if (self.request != 0) return "request != 0";
|
|
348
|
-
if (self.view != 0) return "view != 0";
|
|
349
|
-
if (self.commit != 0) return "commit != 0";
|
|
350
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
351
|
-
if (self.replica != 0) return "replica != 0";
|
|
352
|
-
if (self.operation != .reserved) return "operation != .reserved";
|
|
353
|
-
return null;
|
|
354
|
-
}
|
|
355
|
-
|
|
356
|
-
fn invalid_ping(self: *const Header) ?[]const u8 {
|
|
357
|
-
assert(self.command == .ping);
|
|
358
|
-
if (self.parent != 0) return "parent != 0";
|
|
359
|
-
if (self.client != 0) return "client != 0";
|
|
360
|
-
if (self.context != 0) return "context != 0";
|
|
361
|
-
if (self.request != 0) return "request != 0";
|
|
362
|
-
if (self.view != 0) return "view != 0";
|
|
363
|
-
if (self.commit != 0) return "commit != 0";
|
|
364
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
365
|
-
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
366
|
-
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
367
|
-
if (self.operation != .reserved) return "operation != .reserved";
|
|
368
|
-
return null;
|
|
369
|
-
}
|
|
370
|
-
|
|
371
|
-
fn invalid_pong(self: *const Header) ?[]const u8 {
|
|
372
|
-
assert(self.command == .pong);
|
|
373
|
-
if (self.parent != 0) return "parent != 0";
|
|
374
|
-
if (self.client != 0) return "client != 0";
|
|
375
|
-
if (self.context != 0) return "context != 0";
|
|
376
|
-
if (self.request != 0) return "request != 0";
|
|
377
|
-
if (self.view != 0) return "view != 0";
|
|
378
|
-
if (self.commit != 0) return "commit != 0";
|
|
379
|
-
if (self.timestamp == 0) return "timestamp == 0";
|
|
380
|
-
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
381
|
-
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
382
|
-
if (self.operation != .reserved) return "operation != .reserved";
|
|
383
|
-
return null;
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
fn invalid_ping_client(self: *const Header) ?[]const u8 {
|
|
387
|
-
assert(self.command == .ping_client);
|
|
388
|
-
if (self.parent != 0) return "parent != 0";
|
|
389
|
-
if (self.client == 0) return "client == 0";
|
|
390
|
-
if (self.context != 0) return "context != 0";
|
|
391
|
-
if (self.request != 0) return "request != 0";
|
|
392
|
-
if (self.view != 0) return "view != 0";
|
|
393
|
-
if (self.op != 0) return "op != 0";
|
|
394
|
-
if (self.commit != 0) return "commit != 0";
|
|
395
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
396
|
-
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
397
|
-
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
398
|
-
if (self.replica != 0) return "replica != 0";
|
|
399
|
-
if (self.operation != .reserved) return "operation != .reserved";
|
|
400
|
-
return null;
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
fn invalid_pong_client(self: *const Header) ?[]const u8 {
|
|
404
|
-
assert(self.command == .pong_client);
|
|
405
|
-
if (self.parent != 0) return "parent != 0";
|
|
406
|
-
if (self.client != 0) return "client != 0";
|
|
407
|
-
if (self.context != 0) return "context != 0";
|
|
408
|
-
if (self.request != 0) return "request != 0";
|
|
409
|
-
if (self.op != 0) return "op != 0";
|
|
410
|
-
if (self.commit != 0) return "commit != 0";
|
|
411
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
412
|
-
if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
|
|
413
|
-
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
414
|
-
if (self.operation != .reserved) return "operation != .reserved";
|
|
415
|
-
return null;
|
|
416
|
-
}
|
|
417
|
-
|
|
418
|
-
fn invalid_request(self: *const Header) ?[]const u8 {
|
|
419
|
-
assert(self.command == .request);
|
|
420
|
-
if (self.client == 0) return "client == 0";
|
|
421
|
-
if (self.op != 0) return "op != 0";
|
|
422
|
-
if (self.commit != 0) return "commit != 0";
|
|
423
|
-
if (self.timestamp != 0) return "timestamp != 0";
|
|
424
|
-
if (self.replica != 0) return "replica != 0";
|
|
425
|
-
switch (self.operation) {
|
|
426
|
-
.reserved => return "operation == .reserved",
|
|
427
|
-
.root => return "operation == .root",
|
|
428
|
-
.register => {
|
|
429
|
-
// The first request a client makes must be to register with the cluster:
|
|
430
|
-
if (self.parent != 0) return "register: parent != 0";
|
|
431
|
-
if (self.context != 0) return "register: context != 0";
|
|
432
|
-
if (self.request != 0) return "register: request != 0";
|
|
433
|
-
// The .register operation carries no payload:
|
|
434
|
-
if (self.checksum_body != checksum_body_empty) return "register: checksum_body != expected";
|
|
435
|
-
if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";
|
|
436
|
-
},
|
|
437
|
-
else => {
|
|
438
|
-
// Thereafter, the client must provide the session number in the context:
|
|
439
|
-
// These requests should set `parent` to the `checksum` of the previous reply.
|
|
440
|
-
if (self.context == 0) return "context == 0";
|
|
441
|
-
if (self.request == 0) return "request == 0";
|
|
442
|
-
},
|
|
443
|
-
}
|
|
444
|
-
return null;
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
/// Validates the fields of a `.prepare` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The checks run in a fixed order.
fn invalid_prepare(self: *const Header) ?[]const u8 {
    assert(self.command == .prepare);

    if (self.operation == .reserved) return "operation == .reserved";

    if (self.operation == .root) {
        // The root prepare must have every one of these fields zeroed and carry no body:
        if (self.parent != 0) return "root: parent != 0";
        if (self.client != 0) return "root: client != 0";
        if (self.context != 0) return "root: context != 0";
        if (self.request != 0) return "root: request != 0";
        if (self.view != 0) return "root: view != 0";
        if (self.op != 0) return "root: op != 0";
        if (self.commit != 0) return "root: commit != 0";
        if (self.timestamp != 0) return "root: timestamp != 0";
        if (self.checksum_body != checksum_body_empty) return "root: checksum_body != expected";
        if (self.size != @sizeOf(Header)) return "root: size != @sizeOf(Header)";
        if (self.replica != 0) return "root: replica != 0";
        return null;
    }

    // Every other operation:
    if (self.client == 0) return "client == 0";
    if (self.op == 0) return "op == 0";
    if (self.op <= self.commit) return "op <= commit";
    if (self.timestamp == 0) return "timestamp == 0";

    // Client session numbers are replaced by the reference to the previous prepare.
    // A `.register` prepare must have request == 0; all other prepares must not.
    const is_register = self.operation == .register;
    if (is_register and self.request != 0) return "request != 0";
    if (!is_register and self.request == 0) return "request == 0";

    return null;
}
|
|
480
|
-
|
|
481
|
-
/// Validates the fields of a `.prepare_ok` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The checks run in a fixed order.
fn invalid_prepare_ok(self: *const Header) ?[]const u8 {
    assert(self.command == .prepare_ok);

    // A prepare_ok is header-only, whatever the operation:
    if (self.checksum_body != checksum_body_empty) return "checksum_body != expected";
    if (self.size != @sizeOf(Header)) return "size != @sizeOf(Header)";

    if (self.operation == .reserved) return "operation == .reserved";

    if (self.operation == .root) {
        if (self.parent != 0) return "root: parent != 0";
        if (self.client != 0) return "root: client != 0";
        if (self.context != 0) return "root: context != 0";
        if (self.request != 0) return "root: request != 0";
        if (self.view != 0) return "root: view != 0";
        if (self.op != 0) return "root: op != 0";
        if (self.commit != 0) return "root: commit != 0";
        if (self.timestamp != 0) return "root: timestamp != 0";
        if (self.replica != 0) return "root: replica != 0";
        return null;
    }

    // Every other operation:
    if (self.client == 0) return "client == 0";
    if (self.op == 0) return "op == 0";
    if (self.op <= self.commit) return "op <= commit";
    if (self.timestamp == 0) return "timestamp == 0";

    // A `.register` must have request == 0; all other operations must not.
    const is_register = self.operation == .register;
    if (is_register and self.request != 0) return "request != 0";
    if (!is_register and self.request == 0) return "request == 0";

    return null;
}
|
|
512
|
-
|
|
513
|
-
/// Validates the fields of a `.reply` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The checks run in a fixed order.
fn invalid_reply(self: *const Header) ?[]const u8 {
    assert(self.command == .reply);

    // Initialization within `client.zig` asserts that client `id` is greater than zero:
    if (self.client == 0) return "client == 0";
    if (self.context != 0) return "context != 0";
    if (self.op != self.commit) return "op != commit";
    if (self.timestamp == 0) return "timestamp == 0";

    // The `0` commit number is reserved for cluster initialization, so no reply may carry it.
    // For a `.register` reply, the commit number is the newly registered session number.
    if (self.commit == 0) return "commit == 0";

    // A `.register` reply must have request == 0; all other replies must not.
    const is_register = self.operation == .register;
    if (is_register and self.request != 0) return "request != 0";
    if (!is_register and self.request == 0) return "request == 0";

    return null;
}
|
|
531
|
-
|
|
532
|
-
/// Validates the fields of a `.commit` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_commit(self: *const Header) ?[]const u8 {
    assert(self.command == .commit);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.op != 0)
        "op != 0"
    else if (self.timestamp == 0)
        "timestamp == 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
544
|
-
|
|
545
|
-
/// Validates the fields of a `.start_view_change` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_start_view_change(self: *const Header) ?[]const u8 {
    assert(self.command == .start_view_change);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.op != 0)
        "op != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
559
|
-
|
|
560
|
-
/// Validates the fields of a `.do_view_change` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_do_view_change(self: *const Header) ?[]const u8 {
    assert(self.command == .do_view_change);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
569
|
-
|
|
570
|
-
/// Validates the fields of a `.start_view` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_start_view(self: *const Header) ?[]const u8 {
    assert(self.command == .start_view);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
580
|
-
|
|
581
|
-
/// Validates the fields of a `.request_start_view` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_request_start_view(self: *const Header) ?[]const u8 {
    assert(self.command == .request_start_view);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.op != 0)
        "op != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
595
|
-
|
|
596
|
-
/// Validates the fields of a `.request_headers` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_request_headers(self: *const Header) ?[]const u8 {
    assert(self.command == .request_headers);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.commit > self.op)
        // NOTE(review): the message suggests `commit`/`op` carry the requested op range
        // (op_min/op_max) for this command; confirm against the sender.
        "op_min > op_max"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
609
|
-
|
|
610
|
-
/// Validates the fields of a `.request_prepare` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_request_prepare(self: *const Header) ?[]const u8 {
    assert(self.command == .request_prepare);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.timestamp > 1)
        // Only the values 0 and 1 are accepted for `timestamp` here.
        "timestamp > 1"
    else if (self.timestamp == 0 and self.context != 0)
        // With timestamp == 0 the context must be zero.
        // With timestamp == 1 the context is a checksum, which may be 0, so it is not checked.
        "context != 0"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
626
|
-
|
|
627
|
-
/// Validates the fields of a `.headers` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_headers(self: *const Header) ?[]const u8 {
    assert(self.command == .headers);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.op != 0)
        "op != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
638
|
-
|
|
639
|
-
/// Validates the fields of a `.nack_prepare` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
fn invalid_nack_prepare(self: *const Header) ?[]const u8 {
    assert(self.command == .nack_prepare);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.client != 0)
        "client != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
651
|
-
|
|
652
|
-
/// Validates the fields of an `.eviction` header.
/// Returns null when every check passes, otherwise a static string naming the first
/// constraint that was violated. The conditions are evaluated top to bottom.
/// Note that `client` is not constrained here.
fn invalid_eviction(self: *const Header) ?[]const u8 {
    assert(self.command == .eviction);

    return if (self.parent != 0)
        "parent != 0"
    else if (self.context != 0)
        "context != 0"
    else if (self.request != 0)
        "request != 0"
    else if (self.op != 0)
        "op != 0"
    else if (self.commit != 0)
        "commit != 0"
    else if (self.timestamp != 0)
        "timestamp != 0"
    else if (self.checksum_body != checksum_body_empty)
        "checksum_body != expected"
    else if (self.size != @sizeOf(Header))
        "size != @sizeOf(Header)"
    else if (self.operation != .reserved)
        "operation != .reserved"
    else
        null;
}
|
|
665
|
-
|
|
666
|
-
/// Classifies the immediate sender of this message as a replica or a client, or reports
/// `.unknown` when the header alone cannot tell.
/// Commands such as .request or .prepare may be forwarded between replicas, in which case
/// Header.replica or Header.client identifies only the ultimate origin, not the latest peer.
/// Must not be called on a header whose command is `.reserved`.
pub fn peer_type(self: *const Header) enum { unknown, replica, client } {
    return switch (self.command) {
        .reserved => unreachable,
        // Requests may be forwarded, so the peer is generally unknown. The exception is the
        // first .register request sent by a client, which is not forwarded:
        .request => if (self.operation == .register) .client else .unknown,
        // Prepares may be forwarded as well:
        .prepare => .unknown,
        // Only a client sends this command:
        .ping_client => .client,
        // All other messages identify the peer as a replica:
        else => .replica,
    };
}
|
|
685
|
-
|
|
686
|
-
/// Builds a `.reserved` header for the given journal slot.
/// The slot index is stored in `op`; the header carries an empty body and has both checksums
/// set. Asserts that `slot` is within `constants.journal_slot_count` and that the resulting
/// header passes `invalid()`.
pub fn reserved(cluster: u32, slot: u64) Header {
    assert(slot < constants.journal_slot_count);

    const empty_body = [0]u8{};
    var result = Header{
        .command = .reserved,
        .cluster = cluster,
        .op = slot,
    };
    // The body checksum is set before the header checksum (same order as the original code).
    result.set_checksum_body(&empty_body);
    result.set_checksum();

    assert(result.invalid() == null);
    return result;
}
|
|
699
|
-
|
|
700
|
-
/// Builds the `.prepare` header with operation `.root` for the given cluster.
/// The header carries an empty body (size is exactly one header) and has both checksums set.
/// Asserts that the resulting header passes `invalid()`.
pub fn root_prepare(cluster: u32) Header {
    const empty_body = [0]u8{};
    var result = Header{
        .cluster = cluster,
        .size = @sizeOf(Header),
        .command = .prepare,
        .operation = .root,
    };
    // The body checksum is set before the header checksum (same order as the original code).
    result.set_checksum_body(&empty_body);
    result.set_checksum();

    assert(result.invalid() == null);
    return result;
}
|
|
712
|
-
};
|
|
713
|
-
|
|
714
|
-
/// A tick-driven timeout with optional exponential backoff.
/// The owner calls `tick()` once per tick, polls `fired()`, and must then `reset()`,
/// `backoff()` or `stop()` the timeout before the next tick.
pub const Timeout = struct {
    /// Label used in log messages.
    name: []const u8,
    /// Identifier used in log messages.
    id: u128,
    /// Number of ticks after which the timeout fires.
    after: u64,
    /// Number of consecutive backoffs since the last start/reset/stop. Wraps on overflow.
    attempts: u8 = 0,
    rtt: u64 = constants.rtt_ticks,
    rtt_multiple: u8 = constants.rtt_multiple,
    /// Ticks elapsed since the timeout was last started, reset or backed off.
    ticks: u64 = 0,
    ticking: bool = false,

    /// Increments the attempts counter and resets the timeout with exponential backoff and jitter.
    /// Allows the attempts counter to wrap from time to time.
    /// The overflow period is kept short to surface any related bugs sooner rather than later.
    /// We do not saturate the counter as this would cause round-robin retries to get stuck.
    pub fn backoff(self: *Timeout, random: std.rand.Random) void {
        assert(self.ticking);

        self.attempts +%= 1;
        self.ticks = 0;

        log.debug("{}: {s} backing off", .{ self.id, self.name });
        self.set_after_for_rtt_and_attempts(random);
    }

    /// Returns true when the timeout is ticking and has reached `after` ticks.
    /// It's important to check that when fired() is acted on that the timeout is stopped/started,
    /// otherwise further ticks around the event loop may trigger a thundering herd of messages.
    /// Panics if the timeout has already run past `after`, i.e. was left firing every tick.
    pub fn fired(self: *const Timeout) bool {
        const due = self.ticking and self.ticks >= self.after;
        if (!due) return false;

        log.debug("{}: {s} fired", .{ self.id, self.name });
        if (self.ticks > self.after) {
            log.err("{}: {s} is firing every tick", .{ self.id, self.name });
            @panic("timeout was not reset correctly");
        }
        return true;
    }

    /// Clears the attempts and ticks counters of a ticking timeout. `after` is left untouched.
    pub fn reset(self: *Timeout) void {
        self.ticks = 0;
        self.attempts = 0;
        assert(self.ticking);
        // TODO Use self.prng to adjust for rtt and attempts.
        log.debug("{}: {s} reset", .{ self.id, self.name });
    }

    /// Sets the value of `after` as a function of `rtt` and `attempts`.
    /// Adds exponential backoff and jitter.
    /// May be called only after a timeout has been stopped or reset, to prevent backward jumps.
    pub fn set_after_for_rtt_and_attempts(self: *Timeout, random: std.rand.Random) void {
        // If `after` is reduced by this function to less than `ticks`, then `fired()` will panic:
        assert(self.ticks == 0);
        assert(self.rtt > 0);

        const base = self.rtt * self.rtt_multiple;
        const jitter = exponential_backoff_with_jitter(
            random,
            constants.backoff_min_ticks,
            constants.backoff_max_ticks,
            self.attempts,
        );
        const after_new = base + jitter;

        // TODO Clamp `after` to min/max tick bounds for timeout.

        // Log the transition from the old value to the new one before applying it.
        log.debug("{}: {s} after={}..{} (rtt={} min={} max={} attempts={})", .{
            self.id,
            self.name,
            self.after,
            after_new,
            self.rtt,
            constants.backoff_min_ticks,
            constants.backoff_max_ticks,
            self.attempts,
        });

        self.after = after_new;
        assert(self.after > 0);
    }

    /// Replaces the round-trip time (in ticks). Does not recompute `after`.
    pub fn set_rtt(self: *Timeout, rtt_ticks: u64) void {
        assert(self.rtt > 0);
        assert(rtt_ticks > 0);

        log.debug("{}: {s} rtt={}..{}", .{
            self.id,
            self.name,
            self.rtt,
            rtt_ticks,
        });

        self.rtt = rtt_ticks;
    }

    /// Starts (or restarts) the timeout with cleared counters.
    pub fn start(self: *Timeout) void {
        self.ticking = true;
        self.ticks = 0;
        self.attempts = 0;
        // TODO Use self.prng to adjust for rtt and attempts.
        log.debug("{}: {s} started", .{ self.id, self.name });
    }

    /// Stops the timeout and clears its counters.
    pub fn stop(self: *Timeout) void {
        self.ticking = false;
        self.ticks = 0;
        self.attempts = 0;
        log.debug("{}: {s} stopped", .{ self.id, self.name });
    }

    /// Advances the timeout by one tick. Has no effect while the timeout is stopped.
    pub fn tick(self: *Timeout) void {
        if (!self.ticking) return;
        self.ticks += 1;
    }
};
|
|
826
|
-
|
|
827
|
-
/// Calculates exponential backoff with jitter to prevent cascading failure due to thundering herds.
/// Returns a value in `[min, max]`: `min` plus a random jitter drawn from
/// `[0, min(max - min, max(1, min) * 2^attempt)]`, where the exponent saturates at 63.
/// Requires `max > min`.
pub fn exponential_backoff_with_jitter(
    random: std.rand.Random,
    min: u64,
    max: u64,
    attempt: u64,
) u64 {
    const span = max - min;
    assert(span > 0);

    // Saturate the exponent rather than using `@truncate(u6, attempt)`, which would only
    // discard the high bits and make large attempt counts wrap around to small exponents.
    const shift = @intCast(u6, std.math.min(std.math.maxInt(u6), attempt));

    // 2^shift, computed in u128 so that the shift itself can never lose bits:
    // 1<<0 = 1, 1<<1 = 2, 1<<2 = 4, 1<<3 = 8
    const multiplier = std.math.shlExact(u128, 1, shift) catch unreachable;
    assert(multiplier > 0);

    // When min is 0 the product below would always be 0, and so would the jitter.
    // This was an actual bug we encountered, hence the floor of 1.
    const base = std.math.max(1, min);
    assert(base > 0);

    // The capped exponential component, `min(span, base * 2 ^ attempt)`:
    const ceiling = std.math.min(span, base * multiplier);
    const jitter = random.uintAtMostBiased(u64, ceiling);

    const result = @intCast(u64, min + jitter);
    assert(result >= min);
    assert(result <= max);

    return result;
}
|
|
861
|
-
|
|
862
|
-
// Exercises the extremes: a [min, max] window at the very top of the u64 range, combined
// with attempt counts close to the u64 maximum, must stay in bounds without overflowing.
test "exponential_backoff_with_jitter" {
    var prng = std.rand.DefaultPrng.init(0);
    const random = prng.random();

    const attempts = 1000;
    const max: u64 = std.math.maxInt(u64);
    const min = max - attempts;

    var offset: u64 = 0;
    while (offset < attempts) : (offset += 1) {
        const attempt = min + offset;
        const ebwj = exponential_backoff_with_jitter(random, min, max, attempt);
        try std.testing.expect(ebwj >= min);
        try std.testing.expect(ebwj <= max);
    }
}
|
|
877
|
-
|
|
878
|
-
/// Parses a comma-separated list of addresses and returns them in the order given.
/// The result holds the remote or local addresses of each of the 2f + 1 replicas.
/// Unlike the VRR paper, we do not sort the array but leave the order explicitly to the user.
/// There are several advantages to this:
/// * The operator may deploy a cluster with proximity in mind since replication follows order.
/// * A replica's IP address may be changed without reconfiguration.
/// This does require that the user specify the same order to all replicas.
///
/// Errors:
/// * `error.AddressLimitExceeded` if `raw` holds more than `address_limit` entries.
/// * `error.AddressHasTrailingComma` if any entry is empty (this includes an empty `raw`).
/// * Any error returned by `parse_address` or by the allocator.
///
/// The caller owns the memory of the returned slice of addresses.
pub fn parse_addresses(allocator: std.mem.Allocator, raw: []const u8, address_limit: usize) ![]std.net.Address {
    // N commas always separate exactly N + 1 entries (some of which may be empty).
    const address_count = 1 + std.mem.count(u8, raw, ",");
    if (address_count > address_limit) return error.AddressLimitExceeded;

    const addresses = try allocator.alloc(std.net.Address, address_count);
    errdefer allocator.free(addresses);
    assert(addresses.len <= address_limit);

    var entries = std.mem.split(u8, raw, ",");
    for (addresses) |*address| {
        // The iterator yields exactly `address_count` entries, so this cannot be null:
        const entry = entries.next().?;
        if (entry.len == 0) return error.AddressHasTrailingComma;
        address.* = try parse_address(entry);
    }
    assert(entries.next() == null);

    return addresses;
}
|
|
903
|
-
|
|
904
|
-
/// Parses a single address in one of three forms:
/// * `ipv4:port`
/// * `port` alone, combined with the default address `constants.address`
/// * `ipv4` alone, combined with the default port `constants.port`
///
/// Errors: `error.AddressHasMoreThanOneColon`, `error.PortOverflow`, `error.PortInvalid`,
/// `error.AddressInvalid`.
pub fn parse_address(raw: []const u8) !std.net.Address {
    var fields = std.mem.split(u8, raw, ":");
    // The split iterator will always return non-null once, even if the delimiter is not found:
    const raw_ipv4 = fields.next().?;
    const maybe_raw_port = fields.next();

    if (maybe_raw_port == null) {
        // There was no colon in the address so there are now two cases:
        // 1. an IPv4 address with the default port, or
        // 2. a port with the default IPv4 address.
        // A port is tried first:
        const port = std.fmt.parseUnsigned(u16, raw, 10) catch |err| switch (err) {
            error.Overflow => return error.PortOverflow,
            // Something was not a digit, so try parsing as an IPv4 address instead:
            error.InvalidCharacter => return std.net.Address.parseIp4(raw, constants.port) catch {
                return error.AddressInvalid;
            },
        };
        return std.net.Address.parseIp4(constants.address, port) catch unreachable;
    }

    const raw_port = maybe_raw_port.?;
    if (fields.next() != null) return error.AddressHasMoreThanOneColon;

    const port = std.fmt.parseUnsigned(u16, raw_port, 10) catch |err| switch (err) {
        error.Overflow => return error.PortOverflow,
        error.InvalidCharacter => return error.PortInvalid,
    };
    return std.net.Address.parseIp4(raw_ipv4, port) catch {
        return error.AddressInvalid;
    };
}
|
|
938
|
-
|
|
939
|
-
// Table-driven test for `parse_addresses`: positive vectors are parsed with a limit of 3
// addresses and compared field by field; negative vectors are parsed with a limit of 2 and
// must produce the listed error.
test "parse_addresses" {
    const vectors_positive = &[_]struct {
        raw: []const u8,
        addresses: []const std.net.Address,
    }{
        .{
            // Test the minimum/maximum address/port.
            .raw = "1.2.3.4:567,0.0.0.0:0,255.255.255.255:65535",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 567),
                std.net.Address.initIp4([_]u8{ 0, 0, 0, 0 }, 0),
                std.net.Address.initIp4([_]u8{ 255, 255, 255, 255 }, 65535),
            },
        },
        .{
            // Addresses are not reordered.
            .raw = "3.4.5.6:7777,200.3.4.5:6666,1.2.3.4:5555",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 3, 4, 5, 6 }, 7777),
                std.net.Address.initIp4([_]u8{ 200, 3, 4, 5 }, 6666),
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5555),
            },
        },
        .{
            // Test default address and port.
            .raw = "1.2.3.4:5,4321,2.3.4.5",
            .addresses = &[3]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
                try std.net.Address.parseIp4(constants.address, 4321),
                std.net.Address.initIp4([_]u8{ 2, 3, 4, 5 }, constants.port),
            },
        },
        .{
            // Test addresses less than address_limit.
            .raw = "1.2.3.4:5,4321",
            .addresses = &[2]std.net.Address{
                std.net.Address.initIp4([_]u8{ 1, 2, 3, 4 }, 5),
                try std.net.Address.parseIp4(constants.address, 4321),
            },
        },
    };

    const vectors_negative = &[_]struct {
        raw: []const u8,
        err: anyerror![]std.net.Address,
    }{
        .{ .raw = "", .err = error.AddressHasTrailingComma },
        .{ .raw = "1.2.3.4:5,2.3.4.5:6,4.5.6.7:8", .err = error.AddressLimitExceeded },
        .{ .raw = "1.2.3.4:7777,", .err = error.AddressHasTrailingComma },
        .{ .raw = "1.2.3.4:7777,2.3.4.5::8888", .err = error.AddressHasMoreThanOneColon },
        .{ .raw = "1.2.3.4:5,A", .err = error.AddressInvalid }, // default port
        .{ .raw = "1.2.3.4:5,2.a.4.5", .err = error.AddressInvalid }, // default port
        .{ .raw = "1.2.3.4:5,2.a.4.5:6", .err = error.AddressInvalid }, // specified port
        .{ .raw = "1.2.3.4:5,2.3.4.5:", .err = error.PortInvalid },
        .{ .raw = "1.2.3.4:5,2.3.4.5:A", .err = error.PortInvalid },
        .{ .raw = "1.2.3.4:5,65536", .err = error.PortOverflow }, // default address
        .{ .raw = "1.2.3.4:5,2.3.4.5:65536", .err = error.PortOverflow },
    };

    for (vectors_positive) |vector| {
        const addresses_actual = try parse_addresses(std.testing.allocator, vector.raw, 3);
        defer std.testing.allocator.free(addresses_actual);

        // `expectEqual` takes (expected, actual); keeping that order makes a failure report
        // the expected and found values the right way round. A length mismatch returns here,
        // before the indexed comparison below could go out of bounds.
        try std.testing.expectEqual(vector.addresses.len, addresses_actual.len);
        for (vector.addresses) |address_expect, i| {
            const address_actual = addresses_actual[i];
            try std.testing.expectEqual(address_expect.in.sa.family, address_actual.in.sa.family);
            try std.testing.expectEqual(address_expect.in.sa.port, address_actual.in.sa.port);
            try std.testing.expectEqual(address_expect.in.sa.addr, address_actual.in.sa.addr);
            try std.testing.expectEqual(address_expect.in.sa.zero, address_actual.in.sa.zero);
        }
    }

    for (vectors_negative) |vector| {
        try std.testing.expectEqual(vector.err, parse_addresses(std.testing.allocator, vector.raw, 2));
    }
}
|
|
1016
|
-
|
|
1017
|
-
/// Rounds `offset` down to the nearest multiple of `constants.sector_size`.
pub fn sector_floor(offset: u64) u64 {
    // divFloor only fails for a zero denominator, which the original code treats as impossible.
    const whole_sectors = math.divFloor(u64, offset, constants.sector_size) catch unreachable;
    return constants.sector_size * whole_sectors;
}
|
|
1021
|
-
|
|
1022
|
-
/// Rounds `offset` up to the nearest multiple of `constants.sector_size`.
pub fn sector_ceil(offset: u64) u64 {
    // divCeil only fails for a zero denominator, which the original code treats as impossible.
    const covering_sectors = math.divCeil(u64, offset, constants.sector_size) catch unreachable;
    return constants.sector_size * covering_sectors;
}
|
|
1026
|
-
|
|
1027
|
-
/// Returns a 128-bit checksum of `source`: the first 16 bytes of its 32-byte BLAKE3 digest,
/// reinterpreted as a `u128` in native byte order.
pub fn checksum(source: []const u8) u128 {
    // Raised so that the hash can also be evaluated at compile time.
    @setEvalBranchQuota(4000);

    const Blake3 = std.crypto.hash.Blake3;

    var digest: [Blake3.digest_length]u8 = undefined;
    Blake3.hash(source, &digest, .{});

    const prefix: [@sizeOf(u128)]u8 = digest[0..@sizeOf(u128)].*;
    return @bitCast(u128, prefix);
}
|
|
1034
|
-
|
|
1035
|
-
/// Computes the replication and view-change quorum sizes for a cluster of `replica_count`
/// replicas. The two quorums are asserted to intersect:
/// `view_change + replication > replica_count`.
pub fn quorums(replica_count: u8) struct {
    replication: u8,
    view_change: u8,
} {
    assert(replica_count > 0);
    assert(constants.quorum_replication_max >= 2);

    // Replication quorum: half the cluster rounded up, capped at the configured maximum.
    // For replica_count=2, use 2 even though 1 would intersect.
    // This improves durability of small clusters.
    const replication = if (replica_count == 2) 2 else std.math.min(
        constants.quorum_replication_max,
        stdx.div_ceil(replica_count, 2),
    );
    assert(replication <= replica_count);
    assert(replication >= 2 or replication == replica_count);

    // View-change quorum: the smallest size that still intersects every replication quorum.
    // For replica_count=2, use 2 even though 1 would intersect.
    // This avoids special cases for a single-replica view-change in Replica.
    const view_change =
        if (replica_count == 2) 2 else replica_count - replication + 1;

    // The view change quorum may be more expensive to make the replication quorum cheaper.
    // The insight is that the replication phase is by far more common than the view change.
    // This trade-off allows us to optimize for the common case.
    // See the comments in `constants.zig` for further explanation.
    assert(view_change <= replica_count);
    assert(view_change >= 2 or view_change == replica_count);
    assert(view_change >= @divFloor(replica_count, 2) + 1);
    assert(view_change + replication > replica_count);

    return .{
        .replication = replication,
        .view_change = view_change,
    };
}
|
|
1069
|
-
|
|
1070
|
-
// Checks `quorums()` for replica counts 1 through 8 against hand-computed tables.
// The tables assume `constants.quorum_replication_max == 3`; other configurations are skipped.
test "quorums" {
    if (constants.quorum_replication_max != 3) return error.SkipZigTest;

    // Index i holds the expected quorum for replica_count = i + 1.
    const expect_replication = [_]u8{ 1, 2, 2, 2, 3, 3, 3, 3 };
    const expect_view_change = [_]u8{ 1, 2, 2, 3, 3, 4, 5, 6 };

    for (expect_replication[0..]) |_, i| {
        const actual = quorums(@intCast(u8, i) + 1);
        // `expectEqual` takes (expected, actual); keeping that order makes a failure report
        // the expected and found values the right way round.
        try std.testing.expectEqual(expect_replication[i], actual.replication);
        try std.testing.expectEqual(expect_view_change[i], actual.view_change);
    }
}
|
|
1082
|
-
|
|
1083
|
-
/// Namespace grouping the header-collection types used by view changes.
pub const Headers = struct {
    /// A bounded array of headers with capacity `constants.view_change_headers_max`.
    pub const Array = std.BoundedArray(Header, constants.view_change_headers_max);
    /// The SuperBlock's persisted VSR headers.
    /// One of the following:
    ///
    /// - SV headers (consecutive chain)
    /// - DVC headers (disjoint chain)
    pub const ViewChangeSlice = ViewChangeHeadersSlice;
    /// Public alias for `ViewChangeHeadersArray`.
    pub const ViewChangeArray = ViewChangeHeadersArray;
};
|
|
1093
|
-
|
|
1094
|
-
/// A verified, read-only view over a chain of prepare headers.
const ViewChangeHeadersSlice = struct {
    /// Headers are ordered from high-to-low op.
    slice: []const Header,

    /// Wraps `slice` after asserting the invariants checked by `verify`.
    pub fn init(slice: []const Header) ViewChangeHeadersSlice {
        verify(slice);
        return ViewChangeHeadersSlice{ .slice = slice };
    }

    /// Asserts that `slice` is a well-formed chain of prepare headers:
    /// non-empty, at most `constants.view_change_headers_max` long, every header a `.prepare`
    /// with a valid checksum, and ops and timestamps strictly decreasing along the slice.
    pub fn verify(slice: []const Header) void {
        assert(slice.len > 0);
        assert(slice.len <= constants.view_change_headers_max);

        for (slice) |*header, index| {
            assert(header.valid_checksum());
            assert(header.command == .prepare);

            // The first header has no newer neighbour to compare against.
            if (index == 0) continue;

            // The preceding element is the next-newer header, i.e. the "child" of this one.
            const child_header = &slice[index - 1];
            assert(header.op < child_header.op);
            assert(header.view <= child_header.view);
            // Consecutive ops must be hash-chained, and only consecutive ops may be:
            assert((header.op + 1 == child_header.op) ==
                (header.checksum == child_header.parent));
            assert(header.timestamp < child_header.timestamp);
        }
    }

    const ViewRange = struct {
        min: u32, // inclusive
        max: u32, // inclusive

        /// Returns whether `view` lies within the inclusive range.
        pub fn contains(range: ViewRange, view: u32) bool {
            if (view < range.min) return false;
            return view <= range.max;
        }
    };

    /// Returns the range of possible views (of prepare, not commit) for a message that is part of
    /// the same log_view as these headers.
    ///
    /// - When these are DVC headers for a log_view=V, we must be in view_change status working to
    ///   transition to a view beyond V. So we will never prepare anything else as part of view V.
    /// - When these are SV headers for a log_view=V, we can continue to add to them (by preparing
    ///   more ops), but those ops will always be part of the log_view. If they were prepared during
    ///   a view prior to the log_view, they would already be part of the headers.
    pub fn view_for_op(headers: ViewChangeHeadersSlice, op: u64, log_view: u32) ViewRange {
        const items = headers.slice;
        const newest = &items[0];
        const oldest = &items[items.len - 1];

        // Older than everything we hold: any view up to the oldest header's view is possible.
        if (op < oldest.op) return ViewRange{ .min = 0, .max = oldest.view };
        // Newer than everything we hold: the op can only belong to the log_view itself.
        if (op > newest.op) return ViewRange{ .min = log_view, .max = log_view };

        // An exact match pins the view.
        for (items) |*header| {
            if (header.op == op) return ViewRange{ .min = header.view, .max = header.view };
        }

        // Otherwise the op falls in a gap between two adjacent headers,
        // and its view is bounded by the views on either side of that gap.
        var index: usize = 1;
        while (index < items.len) : (index += 1) {
            const newer = &items[index - 1];
            const older = &items[index];
            if (older.op < op and op < newer.op) {
                return ViewRange{ .min = older.view, .max = newer.view };
            }
        }
        unreachable;
    }
};
|
|
1161
|
-
|
|
1162
|
-
test "Headers.ViewChangeSlice.view_for_op" {
|
|
1163
|
-
var headers_array = [_]Header{
|
|
1164
|
-
std.mem.zeroInit(Header, .{ .op = 9, .view = 10 }),
|
|
1165
|
-
std.mem.zeroInit(Header, .{ .op = 6, .view = 7 }),
|
|
1166
|
-
};
|
|
1167
|
-
|
|
1168
|
-
const headers = Headers.ViewChangeSlice{ .slice = &headers_array };
|
|
1169
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(11, 12), .{ .min = 12, .max = 12 }));
|
|
1170
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(10, 12), .{ .min = 12, .max = 12 }));
|
|
1171
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(9, 12), .{ .min = 10, .max = 10 }));
|
|
1172
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(8, 12), .{ .min = 7, .max = 10 }));
|
|
1173
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(7, 12), .{ .min = 7, .max = 10 }));
|
|
1174
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(6, 12), .{ .min = 7, .max = 7 }));
|
|
1175
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(5, 12), .{ .min = 0, .max = 7 }));
|
|
1176
|
-
try std.testing.expect(std.meta.eql(headers.view_for_op(0, 12), .{ .min = 0, .max = 7 }));
|
|
1177
|
-
}
|
|
1178
|
-
|
|
1179
|
-
/// The headers of a SV or DVC message.
|
|
1180
|
-
const ViewChangeHeadersArray = struct {
|
|
1181
|
-
array: Headers.Array,
|
|
1182
|
-
|
|
1183
|
-
pub fn root(cluster: u32) ViewChangeHeadersArray {
|
|
1184
|
-
var array = Headers.Array{ .buffer = undefined };
|
|
1185
|
-
array.appendAssumeCapacity(Header.root_prepare(cluster));
|
|
1186
|
-
return ViewChangeHeadersArray.init(array);
|
|
1187
|
-
}
|
|
1188
|
-
|
|
1189
|
-
fn init(array: Headers.Array) ViewChangeHeadersArray {
|
|
1190
|
-
Headers.ViewChangeSlice.verify(array.constSlice());
|
|
1191
|
-
return .{ .array = array };
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
|
-
/// This function generates either DVC headers or SV headers:
|
|
1195
|
-
/// - When `current.log_view < current.view`, generate headers for a DVC message.
|
|
1196
|
-
/// - When `current.log_view = current.view`, generate headers for a SV message.
|
|
1197
|
-
///
|
|
1198
|
-
/// Additionally, the current log_view/view/primary state informs the sort of "faults"
|
|
1199
|
-
/// (gaps/breaks/etc) that we expect to find in the journal headers (`current.headers`).
|
|
1200
|
-
/// For example, backups generating a DVC can safely skip over gaps (if the gap is after the DVC
|
|
1201
|
-
/// anchor).
|
|
1202
|
-
///
|
|
1203
|
-
/// Primaries and backups both generate DVCs and SVs.
|
|
1204
|
-
/// - However, SVs are broadcast only by the primary.
|
|
1205
|
-
/// - Backups generate a SV for persisting to the superblock.
|
|
1206
|
-
/// (For convenience/symmetry, not correctness).
|
|
1207
|
-
///
|
|
1208
|
-
/// DVCs and SVs have different invariants they must abide by.
|
|
1209
|
-
/// - Read DVCQuorum's comments to understand DVC invariants.
|
|
1210
|
-
/// - SV headers are much simpler: no gaps or breaks, and all uncommitted ops must be included.
|
|
1211
|
-
pub fn build(
|
|
1212
|
-
results: *ViewChangeHeadersArray,
|
|
1213
|
-
options: struct {
|
|
1214
|
-
op_checkpoint: u64,
|
|
1215
|
-
/// The last view_change_headers_max headers of the journal, starting with the head op
|
|
1216
|
-
/// then descending, skipping over all gaps.
|
|
1217
|
-
current: struct {
|
|
1218
|
-
headers: *const Headers.Array,
|
|
1219
|
-
view: u32,
|
|
1220
|
-
log_view: u32,
|
|
1221
|
-
log_view_primary: bool,
|
|
1222
|
-
},
|
|
1223
|
-
// The vsr_headers from the working superblock.
|
|
1224
|
-
// The durable headers are useful (complementing `current.headers`) because:
|
|
1225
|
-
// - They simplify generation of DVCs in the case where we are recovering from a crash,
|
|
1226
|
-
// when we were generating the same DVC prior to the crash.
|
|
1227
|
-
// - They enable additional verification of header gaps/breaks based on the
|
|
1228
|
-
// gap's/break's position relative to the durable headers.
|
|
1229
|
-
durable: struct {
|
|
1230
|
-
headers: Headers.ViewChangeSlice,
|
|
1231
|
-
view: u32,
|
|
1232
|
-
log_view: u32,
|
|
1233
|
-
log_view_primary: bool,
|
|
1234
|
-
},
|
|
1235
|
-
},
|
|
1236
|
-
) void {
|
|
1237
|
-
defer Headers.ViewChangeSlice.verify(results.array.constSlice());
|
|
1238
|
-
|
|
1239
|
-
const headers = &results.array;
|
|
1240
|
-
const current = options.current;
|
|
1241
|
-
const durable = options.durable;
|
|
1242
|
-
|
|
1243
|
-
assert(headers.len == 0);
|
|
1244
|
-
assert(durable.headers.slice.len > 0);
|
|
1245
|
-
assert(current.headers.len > 0);
|
|
1246
|
-
for (current.headers.constSlice()[1..]) |*header, i| {
|
|
1247
|
-
assert(current.headers.get(i).op > header.op);
|
|
1248
|
-
}
|
|
1249
|
-
|
|
1250
|
-
assert(current.view >= durable.view);
|
|
1251
|
-
assert(current.log_view >= durable.log_view);
|
|
1252
|
-
assert(current.view >= current.log_view);
|
|
1253
|
-
assert(durable.view >= durable.log_view);
|
|
1254
|
-
|
|
1255
|
-
const op_head_current = current.headers.get(0).op;
|
|
1256
|
-
const op_head_durable = durable.headers.slice[0].op;
|
|
1257
|
-
|
|
1258
|
-
// The rules for generating DVCs and SVs differ. We use the current view numbers to
|
|
1259
|
-
// determine which is being generated:
|
|
1260
|
-
// - When `log_view < view`, generate a DVC.
|
|
1261
|
-
// - When `log_view = view`, generate a SV.
|
|
1262
|
-
const command_current: enum { start_view, do_view_change } =
|
|
1263
|
-
if (current.log_view == current.view) .start_view else .do_view_change;
|
|
1264
|
-
// Likewise, the durable view numbers identify whether the durable headers were from a past
|
|
1265
|
-
// DVC or SV. The durable headers are only useful if they are from the same view as our
|
|
1266
|
-
// current headers, though.
|
|
1267
|
-
const command_durable: enum { start_view, do_view_change, outdated } = command: {
|
|
1268
|
-
if (durable.log_view == current.log_view) {
|
|
1269
|
-
if (durable.log_view == durable.view) {
|
|
1270
|
-
break :command .start_view;
|
|
1271
|
-
} else {
|
|
1272
|
-
break :command .do_view_change;
|
|
1273
|
-
}
|
|
1274
|
-
} else {
|
|
1275
|
-
break :command .outdated;
|
|
1276
|
-
}
|
|
1277
|
-
};
|
|
1278
|
-
|
|
1279
|
-
if (command_durable == .do_view_change and command_current == .do_view_change) {
|
|
1280
|
-
assert(op_head_durable == op_head_current);
|
|
1281
|
-
// Ensure that if we started a DVC before a crash, that we will resume sending the exact
|
|
1282
|
-
// same DVC after recovery. (An alternative implementation would be to load the
|
|
1283
|
-
// superblock's DVC headers (including gaps) into the journal during Replica.open(), but
|
|
1284
|
-
// that is more complicated to implement correctly).
|
|
1285
|
-
for (durable.headers.slice) |*header| headers.appendAssumeCapacity(header.*);
|
|
1286
|
-
return;
|
|
1287
|
-
}
|
|
1288
|
-
|
|
1289
|
-
// What is the relationship between two prepares?
|
|
1290
|
-
const Chain = enum {
|
|
1291
|
-
// The ops are sequential, and the hash-chain is valid.
|
|
1292
|
-
chain_sequence,
|
|
1293
|
-
// The ops are sequential, and the hash-chain is invalid.
|
|
1294
|
-
chain_break,
|
|
1295
|
-
// The ops are non-sequential, and belong to the same view.
|
|
1296
|
-
// This gap never hides a break.
|
|
1297
|
-
chain_view,
|
|
1298
|
-
// The ops are non-sequential, and belong to different views.
|
|
1299
|
-
// Depending on the replica state, this gap may hide a break.
|
|
1300
|
-
chain_gap,
|
|
1301
|
-
};
|
|
1302
|
-
|
|
1303
|
-
// The DVC anchor: Within the log suffix following the anchor, we have additional
|
|
1304
|
-
// guarantees about the state of the log headers which allow us to tolerate certain
|
|
1305
|
-
// gaps (by locally guaranteeing that the gap does not hide a break).
|
|
1306
|
-
const op_dvc_anchor = std.math.max(
|
|
1307
|
-
options.op_checkpoint,
|
|
1308
|
-
// +1: We may have a full pipeline, but not yet have performed any repair.
|
|
1309
|
-
// In such a case, we want to send those pipeline_prepare_queue_max headers in
|
|
1310
|
-
// the DVC, but not the preceding op (which may belong to a different chain).
|
|
1311
|
-
// This satisfies the DVC invariant because the first op in the pipeline is
|
|
1312
|
-
// "connected" to the canonical chain (via its "parent" checksum).
|
|
1313
|
-
1 + op_head_current -| constants.pipeline_prepare_queue_max,
|
|
1314
|
-
);
|
|
1315
|
-
|
|
1316
|
-
// Within the "suffix" we can make additional assumptions about gaps/etc.
|
|
1317
|
-
// After the suffix, we just add as many extra (valid) headers as we can fit.
|
|
1318
|
-
var suffix_done = false;
|
|
1319
|
-
|
|
1320
|
-
for (current.headers.constSlice()) |*header, i| {
|
|
1321
|
-
const op = header.op;
|
|
1322
|
-
const chain = chain: {
|
|
1323
|
-
// Always include the head message.
|
|
1324
|
-
if (i == 0) break :chain Chain.chain_sequence;
|
|
1325
|
-
|
|
1326
|
-
const child = headers.get(i - 1);
|
|
1327
|
-
if (child.op == header.op + 1) {
|
|
1328
|
-
break :chain if (child.parent == header.checksum) Chain.chain_sequence else Chain.chain_break;
|
|
1329
|
-
} else {
|
|
1330
|
-
break :chain if (child.view == header.view) Chain.chain_view else Chain.chain_gap;
|
|
1331
|
-
}
|
|
1332
|
-
};
|
|
1333
|
-
|
|
1334
|
-
if (command_current == .start_view) {
|
|
1335
|
-
// Primary: Collect headers for a start_view message.
|
|
1336
|
-
// Backup: these headers are stored in the superblock's vsr_headers.
|
|
1337
|
-
switch (chain) {
|
|
1338
|
-
.chain_sequence => {},
|
|
1339
|
-
// Gaps are due to either:
|
|
1340
|
-
// - entries before checkpoint, which are not repaired, or
|
|
1341
|
-
// - backup missed prepares and has not repaired headers. (Immediately after
|
|
1342
|
-
// receiving a start_view this is not a concern, but the view_durable_update()
|
|
1343
|
-
// may be delayed if another is in progress).
|
|
1344
|
-
.chain_view, .chain_gap => {
|
|
1345
|
-
assert(op <= options.op_checkpoint or !current.log_view_primary);
|
|
1346
|
-
break;
|
|
1347
|
-
},
|
|
1348
|
-
// Breaks are due to:
|
|
1349
|
-
// - entries before checkpoint, which are not repaired
|
|
1350
|
-
.chain_break => {
|
|
1351
|
-
assert(op <= options.op_checkpoint);
|
|
1352
|
-
break;
|
|
1353
|
-
},
|
|
1354
|
-
}
|
|
1355
|
-
} else if (suffix_done) {
|
|
1356
|
-
// Add extra headers to the DVC. These are not required for correctness or
|
|
1357
|
-
// availability, but including extra (correct) headers minimizes header repair at
|
|
1358
|
-
// the new primary.
|
|
1359
|
-
switch (chain) {
|
|
1360
|
-
.chain_sequence => {},
|
|
1361
|
-
.chain_view => {},
|
|
1362
|
-
// Outside of the log suffix, repair may not have been finished, so gaps and
|
|
1363
|
-
// breaks are possible. Non-same-view gaps may hide breaks.
|
|
1364
|
-
.chain_gap => break,
|
|
1365
|
-
.chain_break => break,
|
|
1366
|
-
}
|
|
1367
|
-
} else if (current.log_view_primary and command_durable == .start_view) {
|
|
1368
|
-
switch (chain) {
|
|
1369
|
-
.chain_sequence => {},
|
|
1370
|
-
// Gaps to the right of the (durable) SV originate from:
|
|
1371
|
-
// 1. The primary (durable SV: 1,2,3) prepares several ops (4,5,6).
|
|
1372
|
-
// 2. However, the WAL writes are reordered such that some later ops (5,6)
|
|
1373
|
-
// finish before an earlier op (4).
|
|
1374
|
-
// 3. Crash, recover. Start sending a DVC for the next view. Either:
|
|
1375
|
-
// - There is a gap in the WAL at op=4, but this is to the right of the
|
|
1376
|
-
// durable SV, so it may be safely skipped.
|
|
1377
|
-
// - Same as above, except op=4 was a torn write (or bit rot).
|
|
1378
|
-
.chain_view, .chain_gap => assert(op + 1 > op_head_durable),
|
|
1379
|
-
// Breaks are impossible to the right of the durable SV — journal recovery uses
|
|
1380
|
-
// the durable SV to prune bad headers by their view numbers.
|
|
1381
|
-
.chain_break => unreachable,
|
|
1382
|
-
}
|
|
1383
|
-
suffix_done = op <= op_head_durable;
|
|
1384
|
-
} else if (current.log_view_primary and command_durable != .start_view) {
|
|
1385
|
-
switch (chain) {
|
|
1386
|
-
.chain_sequence => {},
|
|
1387
|
-
.chain_view => {},
|
|
1388
|
-
// The retiring primary may have gap-breaks or breaks in its suffix iff:
|
|
1389
|
-
// - it didn't finish repairs before the second view-change, and
|
|
1390
|
-
// - some uncommitted ops were truncated during the first view-change.
|
|
1391
|
-
// (Truncation "moves" the suffix backwards).
|
|
1392
|
-
.chain_gap => break,
|
|
1393
|
-
.chain_break => break,
|
|
1394
|
-
}
|
|
1395
|
-
suffix_done = op <= op_dvc_anchor;
|
|
1396
|
-
} else if (!current.log_view_primary and command_durable == .start_view) {
|
|
1397
|
-
switch (chain) {
|
|
1398
|
-
.chain_sequence => {},
|
|
1399
|
-
// Backups load a full suffix of headers from the view's SV message. If there
|
|
1400
|
-
// is now a gap in the backup's suffix, this must be due to missed prepares.
|
|
1401
|
-
.chain_view, .chain_gap => assert(op + 1 > op_head_durable),
|
|
1402
|
-
// Breaks are impossible to the right of the durable SV — journal recovery uses
|
|
1403
|
-
// the durable SV to prune bad headers by their view numbers.
|
|
1404
|
-
.chain_break => unreachable,
|
|
1405
|
-
}
|
|
1406
|
-
suffix_done = op <= op_head_durable;
|
|
1407
|
-
} else if (!current.log_view_primary and command_durable != .start_view) {
|
|
1408
|
-
switch (chain) {
|
|
1409
|
-
.chain_sequence => {},
|
|
1410
|
-
.chain_view => {},
|
|
1411
|
-
// Backups load a full suffix of headers from the view's SV message.
|
|
1412
|
-
// That SV isn't durable, but it is part of the journal, so any gaps to its
|
|
1413
|
-
// right must be due to missed prepares.
|
|
1414
|
-
.chain_gap => {},
|
|
1415
|
-
// Breaks are impossible to the right of the ephemeral SV, since the log was
|
|
1416
|
-
// truncated when the SV was installed.
|
|
1417
|
-
.chain_break => unreachable,
|
|
1418
|
-
}
|
|
1419
|
-
suffix_done = op <= op_dvc_anchor;
|
|
1420
|
-
} else unreachable;
|
|
1421
|
-
|
|
1422
|
-
headers.appendAssumeCapacity(header.*);
|
|
1423
|
-
}
|
|
1424
|
-
}
|
|
1425
|
-
};
|