tigerbeetle-node 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +305 -103
- package/dist/index.d.ts +70 -67
- package/dist/index.js +70 -67
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +11 -10
- package/src/node.zig +22 -20
- package/src/tigerbeetle/scripts/benchmark.bat +4 -3
- package/src/tigerbeetle/scripts/benchmark.sh +25 -10
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +20 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +32 -26
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +12 -3
- package/src/tigerbeetle/src/benchmark.zig +19 -9
- package/src/tigerbeetle/src/benchmark_array_search.zig +317 -0
- package/src/tigerbeetle/src/benchmarks/perf.zig +299 -0
- package/src/tigerbeetle/src/c/tb_client/context.zig +103 -0
- package/src/tigerbeetle/src/c/tb_client/packet.zig +80 -0
- package/src/tigerbeetle/src/c/tb_client/signal.zig +288 -0
- package/src/tigerbeetle/src/c/tb_client/thread.zig +328 -0
- package/src/tigerbeetle/src/c/tb_client.h +221 -0
- package/src/tigerbeetle/src/c/tb_client.zig +104 -0
- package/src/tigerbeetle/src/c/test.zig +1 -0
- package/src/tigerbeetle/src/cli.zig +143 -84
- package/src/tigerbeetle/src/config.zig +161 -20
- package/src/tigerbeetle/src/demo.zig +14 -8
- package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +2 -2
- package/src/tigerbeetle/src/ewah.zig +318 -0
- package/src/tigerbeetle/src/ewah_benchmark.zig +121 -0
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +317 -0
- package/src/tigerbeetle/src/fifo.zig +17 -1
- package/src/tigerbeetle/src/io/darwin.zig +12 -10
- package/src/tigerbeetle/src/io/linux.zig +25 -9
- package/src/tigerbeetle/src/io/windows.zig +13 -9
- package/src/tigerbeetle/src/iops.zig +101 -0
- package/src/tigerbeetle/src/lsm/README.md +214 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +341 -0
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +125 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +557 -0
- package/src/tigerbeetle/src/lsm/composite_key.zig +77 -0
- package/src/tigerbeetle/src/lsm/direction.zig +11 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +587 -0
- package/src/tigerbeetle/src/lsm/forest.zig +204 -0
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
- package/src/tigerbeetle/src/lsm/grid.zig +549 -0
- package/src/tigerbeetle/src/lsm/groove.zig +1002 -0
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +474 -0
- package/src/tigerbeetle/src/lsm/level_iterator.zig +315 -0
- package/src/tigerbeetle/src/lsm/manifest.zig +580 -0
- package/src/tigerbeetle/src/lsm/manifest_level.zig +925 -0
- package/src/tigerbeetle/src/lsm/manifest_log.zig +953 -0
- package/src/tigerbeetle/src/lsm/node_pool.zig +231 -0
- package/src/tigerbeetle/src/lsm/posted_groove.zig +387 -0
- package/src/tigerbeetle/src/lsm/segmented_array.zig +1318 -0
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +894 -0
- package/src/tigerbeetle/src/lsm/table.zig +967 -0
- package/src/tigerbeetle/src/lsm/table_immutable.zig +203 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +306 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +174 -0
- package/src/tigerbeetle/src/lsm/test.zig +423 -0
- package/src/tigerbeetle/src/lsm/tree.zig +1090 -0
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
- package/src/tigerbeetle/src/main.zig +141 -109
- package/src/tigerbeetle/src/message_bus.zig +49 -48
- package/src/tigerbeetle/src/message_pool.zig +22 -12
- package/src/tigerbeetle/src/ring_buffer.zig +126 -30
- package/src/tigerbeetle/src/simulator.zig +205 -140
- package/src/tigerbeetle/src/state_machine.zig +1268 -721
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +40 -14
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
- package/src/tigerbeetle/src/test/cluster.zig +104 -88
- package/src/tigerbeetle/src/test/conductor.zig +365 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/message_bus.zig +15 -24
- package/src/tigerbeetle/src/test/network.zig +26 -17
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +94 -68
- package/src/tigerbeetle/src/test/state_machine.zig +135 -69
- package/src/tigerbeetle/src/test/storage.zig +78 -28
- package/src/tigerbeetle/src/tigerbeetle.zig +19 -16
- package/src/tigerbeetle/src/unit_tests.zig +15 -0
- package/src/tigerbeetle/src/util.zig +51 -0
- package/src/tigerbeetle/src/vopr.zig +494 -0
- package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
- package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
- package/src/tigerbeetle/src/vsr/client.zig +34 -7
- package/src/tigerbeetle/src/vsr/journal.zig +164 -174
- package/src/tigerbeetle/src/vsr/replica.zig +1602 -651
- package/src/tigerbeetle/src/vsr/superblock.zig +1761 -0
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +255 -0
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +644 -0
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +561 -0
- package/src/tigerbeetle/src/vsr.zig +118 -170
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
package/src/tigerbeetle/src/vsr/replica.zig

```diff
@@ -4,14 +4,20 @@ const assert = std.debug.assert;
 
 const config = @import("../config.zig");
 
+const StaticAllocator = @import("../static_allocator.zig");
+const GridType = @import("../lsm/grid.zig").GridType;
+const MessagePool = @import("../message_pool.zig").MessagePool;
 const Message = @import("../message_pool.zig").MessagePool.Message;
 const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
+const ClientTable = @import("superblock_client_table.zig").ClientTable;
+const format_journal = @import("./journal.zig").format_journal;
 
 const vsr = @import("../vsr.zig");
 const Header = vsr.Header;
 const Timeout = vsr.Timeout;
 const Command = vsr.Command;
 const Version = vsr.Version;
+const VSRState = vsr.VSRState;
 
 const log = std.log.scoped(.replica);
 
@@ -39,32 +45,6 @@ pub const Status = enum {
     recovering,
 };
 
-const ClientTable = std.AutoHashMapUnmanaged(u128, ClientTableEntry);
-
-/// We found two bugs in the VRR paper relating to the client table:
-///
-/// 1. a correctness bug, where successive client crashes may cause request numbers to collide for
-/// different request payloads, resulting in requests receiving the wrong reply, and
-///
-/// 2. a liveness bug, where if the client table is updated for request and prepare messages with
-/// the client's latest request number, then the client may be locked out from the cluster if the
-/// request is ever reordered through a view change.
-///
-/// We therefore take a different approach with the implementation of our client table, to:
-///
-/// 1. register client sessions explicitly through the state machine to ensure that client session
-/// numbers always increase, and
-///
-/// 2. make a more careful distinction between uncommitted and committed request numbers,
-/// considering that uncommitted requests may not survive a view change.
-const ClientTableEntry = struct {
-    /// The client's session number as committed to the cluster by a register request.
-    session: u64,
-
-    /// The reply sent to the client's latest committed request.
-    reply: *Message,
-};
-
 const Nonce = u128;
 
 const Prepare = struct {
@@ -84,18 +64,40 @@ const quorum_messages_null = [_]?*Message{null} ** config.replicas_max;
 const QuorumCounter = std.StaticBitSet(config.replicas_max);
 const quorum_counter_null = QuorumCounter.initEmpty();
 
-pub fn Replica(
+// CRITICAL: The number of prepare headers to include in the body:
+// We must provide enough headers to cover all uncommitted headers so that the new
+// leader (if we are in a view change) can decide whether to discard uncommitted headers
+// that cannot be repaired because they are gaps, and this must be relative to the
+// cluster as a whole (not relative to the difference between our op and commit number)
+// as otherwise we would break correctness.
+const view_change_headers_count = config.pipeline_max;
+
+comptime {
+    assert(view_change_headers_count > 0);
+    assert(view_change_headers_count >= config.pipeline_max);
+    assert(view_change_headers_count <=
+        @divFloor(config.message_size_max - @sizeOf(Header), @sizeOf(Header)));
+}
+
+pub fn ReplicaType(
     comptime StateMachine: type,
    comptime MessageBus: type,
     comptime Storage: type,
     comptime Time: type,
 ) type {
+    const Grid = GridType(Storage);
+    const SuperBlock = vsr.SuperBlockType(Storage);
+
     return struct {
         const Self = @This();
 
         const Journal = vsr.Journal(Self, Storage);
         const Clock = vsr.Clock(Time);
 
+        /// We use this allocator during open/init and then disable it.
+        /// An accidental dynamic allocation after open/init will cause an assertion failure.
+        static_allocator: StaticAllocator,
+
         /// The number of the cluster to which this replica belongs:
         cluster: u32,
 
```
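The `comptime` block added above turns a bad configuration into a compile error: if `view_change_headers_count` could ever exceed what fits in a message body, the build fails before any replica runs. Below is a standalone sketch of the same pattern, using hypothetical stand-ins for the real `config` constants (Zig 0.9-era syntax, matching the vendored sources):

```zig
const std = @import("std");
const assert = std.debug.assert;

// Hypothetical stand-ins for the real config.zig values.
const pipeline_max = 8;
const message_size_max = 1 << 20;
const header_size = 128; // Stand-in for @sizeOf(Header) in the real code.

const view_change_headers_count = pipeline_max;

// Evaluated entirely during compilation: a bad combination of constants
// fails the build, so no runtime check (or crash) is ever needed.
comptime {
    assert(view_change_headers_count > 0);
    assert(view_change_headers_count <= (message_size_max - header_size) / header_size);
}

pub fn main() void {
    std.debug.print("view_change_headers_count={}\n", .{view_change_headers_count});
}
```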
```diff
@@ -111,6 +113,8 @@ pub fn Replica(
         /// The minimum number of replicas required to form a view change quorum:
         quorum_view_change: u8,
 
+        time: Time,
+
         /// A distributed fault-tolerant clock for lower and upper bounds on the leader's wall clock:
         clock: Clock,
 
@@ -118,14 +122,17 @@ pub fn Replica(
         journal: Journal,
 
         /// An abstraction to send messages from the replica to another replica or client.
-        /// The message bus will also deliver messages to this replica by calling `on_message()`.
-        message_bus: *MessageBus,
+        /// The message bus will also deliver messages to this replica by calling `on_message_from_bus()`.
+        message_bus: MessageBus,
 
         /// For executing service up-calls after an operation has been committed:
-        state_machine: *StateMachine,
+        state_machine: StateMachine,
 
-        /// The client table records for each client the latest session and the latest committed reply.
-        client_table: ClientTable,
+        // TODO Document.
+        superblock: SuperBlock,
+        superblock_context: SuperBlock.Context = undefined,
+        grid: Grid,
+        opened: bool,
 
         /// The current view, initially 0:
         view: u32,
@@ -136,24 +143,46 @@ pub fn Replica(
         /// The current status, either normal, view_change, or recovering:
         status: Status = .recovering,
 
-        /// The op number assigned to the most recently prepared operation
+        /// The op number assigned to the most recently prepared operation.
+        ///
+        /// Invariants (not applicable during status=recovering):
+        /// * `replica.op` exists in the Journal.
+        /// * `replica.op ≥ replica.commit_min`.
+        /// * `replica.op ≤ replica.op_checkpoint_trigger`: don't wrap the WAL until we are sure
+        ///   that the overwritten entry will not be required for recovery.
+        // TODO: When recovery protocol is removed, load the `op` from the WAL, and verify that it is ≥op_checkpoint.
+        // Also verify that a corresponding header exists in the WAL.
         op: u64,
 
         /// The op of the highest checkpointed message.
-        // TODO Update this to use LSM storage.
         // TODO Refuse to store/ack any op>op_checkpoint+journal_slot_count.
-
-        op_checkpoint: u64 = 0,
+        op_checkpoint: u64,
 
         /// The op number of the latest committed and executed operation (according to the replica):
         /// The replica may have to wait for repairs to complete before commit_min reaches commit_max.
+        ///
+        /// Invariants (not applicable during status=recovering):
+        /// * `replica.commit_min` exists in the Journal.
+        /// * `replica.commit_min ≤ replica.op`
+        /// * `replica.commit_min ≥ replica.op_checkpoint`.
+        /// * never decreases
         commit_min: u64,
 
         /// The op number of the latest committed operation (according to the cluster):
         /// This is the commit number in terms of the VRR paper.
+        ///
+        /// Invariants:
+        /// * `replica.commit_max ≥ replica.commit_min`.
+        /// * never decreases
         commit_max: u64,
 
-        ///
+        /// Guards against concurrent commits.
+        ///
+        /// Set while:
+        /// * prefetching from storage, in preparation for a commit
+        /// * reading a prepare from storage in order to commit
+        /// * compacting storage
+        /// * checkpointing
         committing: bool = false,
 
         /// Whether we are reading a prepare from storage in order to push to the pipeline.
```
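Taken together, the invariants documented above chain into `op_checkpoint ≤ commit_min ≤ op`, with `commit_min ≤ commit_max` since `commit_max` may run ahead of `op` while repairs are pending. A throwaway checker that encodes just the documented chain follows; `Counters` is a hypothetical stand-in, not a type from the codebase:

```zig
const std = @import("std");
const assert = std.debug.assert;

// Hypothetical snapshot of the counters documented above.
const Counters = struct {
    op: u64,
    op_checkpoint: u64,
    commit_min: u64,
    commit_max: u64,
};

fn check(c: Counters) void {
    // From the doc comments (outside of status=recovering):
    assert(c.op_checkpoint <= c.commit_min); // commit_min ≥ op_checkpoint.
    assert(c.commit_min <= c.op); // commit_min ≤ op.
    assert(c.commit_min <= c.commit_max); // commit_max never lags commit_min.
}

pub fn main() void {
    check(.{ .op = 10, .op_checkpoint = 4, .commit_min = 7, .commit_max = 9 });
    std.debug.print("invariants hold\n", .{});
}
```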
```diff
@@ -164,7 +193,7 @@ pub fn Replica(
         ///
         /// After a view change, the old leader's pipeline is left untouched so that it is able to
         /// help the new leader repair, even in the face of local storage faults.
-        pipeline: RingBuffer(Prepare, config.pipeline_max) = .{},
+        pipeline: RingBuffer(Prepare, config.pipeline_max, .array) = .{},
 
         /// In some cases, a replica may send a message to itself. We do not submit these messages
         /// to the message bus but rather queue them here for guaranteed immediate delivery, which
```
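The new third argument to `RingBuffer` (`.array`) reflects `ring_buffer.zig` gaining a comptime buffer-type parameter in this release (+126 −30 in the file list). The sketch below is not the real RingBuffer API, only a minimal illustration of selecting a backing store from a comptime enum:

```zig
const std = @import("std");

const BufferType = enum { array, slice };

// Hypothetical generic: the real ring_buffer.zig has head/tail indices and
// push/pop; only the comptime storage choice is shown here.
fn RingBufferType(comptime T: type, comptime count: usize, comptime buffer_type: BufferType) type {
    const Buffer = switch (buffer_type) {
        .array => [count]T, // Capacity fixed at compile time, stored inline.
        .slice => []T, // Capacity chosen at runtime, allocated by the caller.
    };
    return struct {
        buffer: Buffer,
    };
}

test "array-backed buffer embeds its storage inline" {
    const R = RingBufferType(u64, 4, .array);
    try std.testing.expect(@sizeOf(R) == 4 * @sizeOf(u64));
}
```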
```diff
@@ -236,18 +265,112 @@ pub fn Replica(
 
         on_change_state: ?fn (replica: *Self) void = null,
 
-        pub fn init(
-            allocator: Allocator,
+        /// Called when `commit_prepare` finishes committing.
+        commit_callback: ?fn (*Self) void = null,
+
+        /// The prepare message being committed.
+        commit_prepare: ?*Message = null,
+
+        const OpenOptions = struct {
+            replica_count: u8,
+            storage: *Storage,
+            message_pool: *MessagePool,
+            time: Time,
+            state_machine_options: StateMachine.Options,
+            message_bus_options: MessageBus.Options,
+        };
+
+        /// Initializes and opens the provided replica using the options.
+        pub fn open(self: *Self, parent_allocator: std.mem.Allocator, options: OpenOptions) !void {
+            self.static_allocator = StaticAllocator.init(parent_allocator);
+            const allocator = self.static_allocator.allocator();
+
+            self.superblock = try SuperBlock.init(
+                allocator,
+                options.storage,
+                options.message_pool,
+            );
+
+            // Once initialzed, the replica is in charge of calling superblock.deinit()
+            var initialized = false;
+            errdefer if (!initialized) self.superblock.deinit(allocator);
+
+            // Open the superblock:
+            self.opened = false;
+            self.superblock.open(superblock_open_callback, &self.superblock_context);
+            while (!self.opened) self.superblock.storage.tick();
+            assert(self.superblock.working.vsr_state.internally_consistent());
+
+            if (self.superblock.working.replica >= options.replica_count) {
+                log.err("{}: open: no address for replica (replica_count={})", .{
+                    self.superblock.working.replica,
+                    options.replica_count,
+                });
+                return error.NoAddress;
+            }
+
+            // Intiaize the replica:
+            try self.init(allocator, .{
+                .cluster = self.superblock.working.cluster,
+                .replica_index = self.superblock.working.replica,
+                .replica_count = options.replica_count,
+                .storage = options.storage,
+                .time = options.time,
+                .message_pool = options.message_pool,
+                .state_machine_options = options.state_machine_options,
+                .message_bus_options = options.message_bus_options,
+            });
+
+            // Disable all dynamic allocation from this point onwards.
+            self.static_allocator.transition_from_init_to_static();
+
+            initialized = true;
+            errdefer self.deinit(allocator);
+
+            // Open the (Forest inside) StateMachine:
+            self.opened = false;
+            self.state_machine.open(state_machine_open_callback);
+            while (!self.opened) {
+                self.grid.tick();
+                self.superblock.storage.tick();
+            }
+        }
+
+        fn superblock_open_callback(superblock_context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+            assert(!self.opened);
+            self.opened = true;
+        }
+
+        fn state_machine_open_callback(state_machine: *StateMachine) void {
+            const self = @fieldParentPtr(Self, "state_machine", state_machine);
+            assert(!self.opened);
+            self.opened = true;
+        }
+
+        const Options = struct {
             cluster: u32,
             replica_count: u8,
-            replica: u8,
-            time: *Time,
+            replica_index: u8,
+            time: Time,
             storage: *Storage,
-            message_bus: *MessageBus,
-            state_machine: *StateMachine,
-        ) !Self {
+            message_pool: *MessagePool,
+            // TODO With https://github.com/coilhq/tigerbeetle/issues/71,
+            // the separate message_bus_options won't be necessary.
+            message_bus_options: MessageBus.Options,
+            state_machine_options: StateMachine.Options,
+        };
+
+        /// NOTE: self.superblock must be initialized and opened prior to this call.
+        fn init(self: *Self, allocator: Allocator, options: Options) !void {
+            const replica_count = options.replica_count;
+            const replica_index = options.replica_index;
             assert(replica_count > 0);
-            assert(replica < replica_count);
+            assert(replica_index < replica_count);
+
+            assert(self.opened);
+            assert(self.superblock.opened);
+            assert(self.superblock.working.vsr_state.internally_consistent());
 
             const majority = (replica_count / 2) + 1;
             assert(majority <= replica_count);
```
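`superblock_open_callback` and `state_machine_open_callback` above recover `self` from a pointer to one of the replica's own fields via `@fieldParentPtr`, and `on_message_from_bus` further down uses the same trick. Here is a reduced, self-contained sketch of the pattern with hypothetical types (the three-argument `@fieldParentPtr` matches the Zig 0.9-era syntax used by the vendored sources):

```zig
const std = @import("std");
const assert = std.debug.assert;

const Completion = struct { pending: bool = true };

const Owner = struct {
    completion: Completion = .{},
    opened: bool = false,

    // The callback receives only *Completion, yet recovers its Owner:
    // `completion` is embedded by value, so the parent lives at a fixed,
    // compile-time-known offset from the field pointer.
    fn open_callback(completion: *Completion) void {
        const self = @fieldParentPtr(Owner, "completion", completion);
        assert(!self.opened);
        completion.pending = false;
        self.opened = true;
    }
};

pub fn main() void {
    var owner = Owner{};
    // Simulate an asynchronous completion delivering only the field pointer.
    Owner.open_callback(&owner.completion);
    assert(owner.opened and !owner.completion.pending);
    std.debug.print("opened={}\n", .{owner.opened});
}
```

Because the field is embedded by value, recovering the parent is plain pointer arithmetic: no closure or allocation is needed, which suits callbacks that must run after dynamic allocation has been disabled.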
```diff
@@ -277,91 +400,112 @@ pub fn Replica(
             // Flexible quorums are safe if these two quorums intersect so that this relation holds:
             assert(quorum_replication + quorum_view_change > replica_count);
 
-
-
-
-
+            self.time = options.time;
+            self.clock = try Clock.init(
+                allocator,
+                replica_count,
+                replica_index,
+                &self.time,
+            );
+            errdefer self.clock.deinit(allocator);
 
-            var journal = try Journal.init(allocator, storage, replica);
+            self.journal = try Journal.init(allocator, options.storage, replica_index);
+            errdefer self.journal.deinit(allocator);
 
-            var message_bus = try MessageBus.init(
+            self.message_bus = try MessageBus.init(
                 allocator,
-                cluster,
-                replica,
-
+                options.cluster,
+                .{ .replica = options.replica_index },
+                options.message_pool,
+                Self.on_message_from_bus,
+                options.message_bus_options,
             );
-            errdefer
+            errdefer self.message_bus.deinit(allocator);
+
+            self.grid = try Grid.init(allocator, &self.superblock);
+            errdefer self.grid.deinit(allocator);
 
-
-
+            self.state_machine = try StateMachine.init(
+                allocator,
+                &self.grid,
+                options.state_machine_options,
+            );
+            errdefer self.state_machine.deinit(allocator);
 
             const recovery_nonce = blk: {
                 var nonce: [@sizeOf(Nonce)]u8 = undefined;
                 var hash = std.crypto.hash.Blake3.init(.{});
-                hash.update(std.mem.asBytes(&clock.monotonic()));
-                hash.update(&[_]u8{replica});
+                hash.update(std.mem.asBytes(&self.clock.monotonic()));
+                hash.update(&[_]u8{replica_index});
                 hash.final(&nonce);
                 break :blk @bitCast(Nonce, nonce);
             };
 
-            var self = Self{
-                .cluster = cluster,
+            self.* = Self{
+                .static_allocator = self.static_allocator,
+                .cluster = options.cluster,
                 .replica_count = replica_count,
-                .replica = replica,
+                .replica = replica_index,
                 .quorum_replication = quorum_replication,
                 .quorum_view_change = quorum_view_change,
-
-
-                .
-                .
-                .
-                .
-                .
-                .
-                .
-                .
+                // Copy the (already-initialized) time back, to avoid regressing the monotonic
+                // clock guard.
+                .time = self.time,
+                .clock = self.clock,
+                .journal = self.journal,
+                .message_bus = self.message_bus,
+                .state_machine = self.state_machine,
+                .superblock = self.superblock,
+                .grid = self.grid,
+                .opened = self.opened,
+                .view = self.superblock.working.vsr_state.view,
+                .view_normal = self.superblock.working.vsr_state.view_normal,
+                .op = 0,
+                .op_checkpoint = self.superblock.working.vsr_state.commit_min,
+                .commit_min = self.superblock.working.vsr_state.commit_min,
+                .commit_max = self.superblock.working.vsr_state.commit_max,
                 .ping_timeout = Timeout{
                     .name = "ping_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 100,
                 },
                 .prepare_timeout = Timeout{
                     .name = "prepare_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 50,
                 },
                 .commit_timeout = Timeout{
                     .name = "commit_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 100,
                 },
                 .normal_status_timeout = Timeout{
                     .name = "normal_status_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 500,
                 },
                 .view_change_status_timeout = Timeout{
                     .name = "view_change_status_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 500,
                 },
                 .view_change_message_timeout = Timeout{
                     .name = "view_change_message_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 50,
                 },
                 .repair_timeout = Timeout{
                     .name = "repair_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 50,
                 },
                 .recovery_timeout = Timeout{
                     .name = "recovery_timeout",
-                    .id = replica,
+                    .id = replica_index,
                     .after = 200,
                 },
                 .recovery_nonce = recovery_nonce,
-                .prng = std.rand.DefaultPrng.init(replica),
+                .prng = std.rand.DefaultPrng.init(replica_index),
             };
 
             log.debug("{}: init: replica_count={} quorum_view_change={} quorum_replication={}", .{
```
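The assertion `quorum_replication + quorum_view_change > replica_count` above is the flexible-quorum intersection condition: every replication quorum must share at least one replica with every view-change quorum, or a view change could miss a committed prepare. A worked check with assumed cluster sizes:

```zig
const std = @import("std");
const assert = std.debug.assert;

// True iff every replication quorum intersects every view-change quorum,
// the safety relation asserted during replica initialization.
fn quorums_intersect(replica_count: u8, quorum_replication: u8, quorum_view_change: u8) bool {
    return @as(u16, quorum_replication) + quorum_view_change > replica_count;
}

pub fn main() void {
    // Simple majorities always intersect: 3 + 3 > 5.
    assert(quorums_intersect(5, 3, 3));
    // A flexible split may lower the replication quorum for write latency,
    // provided the view-change quorum grows to compensate: 2 + 4 > 5.
    assert(quorums_intersect(5, 2, 4));
    // But 2 + 3 = 5 is not strictly greater, so the quorums can miss each other.
    assert(!quorums_intersect(5, 2, 3));
    std.debug.print("flexible quorum checks passed\n", .{});
}
```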
```diff
@@ -375,28 +519,24 @@ pub fn Replica(
             // always overallocate capacity by a factor of two.
             log.debug("{}: init: client_table.capacity()={} for config.clients_max={} entries", .{
                 self.replica,
-                self.client_table.capacity(),
+                self.client_table().capacity(),
                 config.clients_max,
             });
 
             assert(self.status == .recovering);
-
-            return self;
         }
 
         /// Free all memory and unref all messages held by the replica
         /// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
         pub fn deinit(self: *Self, allocator: Allocator) void {
+            self.static_allocator.transition_from_static_to_deinit();
+
             self.journal.deinit(allocator);
             self.clock.deinit(allocator);
-
-            {
-                var it = self.client_table.iterator();
-                while (it.next()) |entry| {
-                    self.message_bus.unref(entry.value_ptr.reply);
-                }
-                self.client_table.deinit(allocator);
-            }
+            self.state_machine.deinit(allocator);
+            self.superblock.deinit(allocator);
+            self.grid.deinit(allocator);
+            defer self.message_bus.deinit(allocator);
 
             while (self.pipeline.pop()) |prepare| self.message_bus.unref(prepare.message);
 
@@ -406,6 +546,15 @@ pub fn Replica(
                 self.loopback_queue = null;
             }
 
+            if (self.commit_prepare) |message| {
+                assert(self.committing);
+                assert(self.commit_callback != null);
+                self.message_bus.unref(message);
+                self.commit_prepare = null;
+            } else {
+                assert(self.commit_callback == null);
+            }
+
             for (self.do_view_change_from_all_replicas) |message| {
                 if (message) |m| self.message_bus.unref(m);
             }
@@ -415,6 +564,11 @@ pub fn Replica(
             }
         }
 
+        /// The client table records for each client the latest session and the latest committed reply.
+        inline fn client_table(self: *Self) *ClientTable {
+            return &self.superblock.client_table;
+        }
+
         /// Time is measured in logical ticks that are incremented on every call to tick().
         /// This eliminates a dependency on the system time and enables deterministic testing.
         pub fn tick(self: *Self) void {
@@ -424,8 +578,15 @@ pub fn Replica(
             // decrease throughput significantly.
             assert(self.loopback_queue == null);
 
+            // TODO Replica owns Time; should it tick() here instead of Clock?
             self.clock.tick();
 
+            // Storage/IO is ticked by top-level in case of multiple replicas sharing the same IO.
+            // self.journal.storage.tick();
+
+            self.grid.tick();
+            self.message_bus.tick();
+
             if (!self.journal.recovered) {
                 if (!self.journal.recovering) self.journal.recover();
                 return;
@@ -442,6 +603,10 @@ pub fn Replica(
                 // The data file is brand new — no messages have ever been written.
                 // Transition to normal status; no need to run the VSR recovery protocol.
                 assert(self.journal.faulty.count == 0);
+                assert(self.commit_min == 0);
+                assert(self.commit_max == 0);
+                assert(self.op_checkpoint == 0);
+                assert(self.op == 0);
                 self.transition_to_normal_from_recovering_status(0);
                 assert(self.status == .normal);
             } else if (self.replica_count == 1) {
@@ -449,8 +614,13 @@ pub fn Replica(
                 if (self.journal.faulty.count != 0) @panic("journal is corrupt");
                 if (self.committing) return;
                 assert(self.op == 0);
+                // TODO Assert that this path isn't taken more than once.
                 self.op = self.journal.op_maximum();
-                self.
+                assert(self.op >= self.commit_min);
+                assert(self.op >= self.op_checkpoint);
+                assert(self.op <= self.op_checkpoint_trigger());
+                assert(self.journal.header_with_op(self.op) != null);
+                self.commit_journal(self.op);
                 // The recovering→normal transition is deferred until all ops are committed.
             } else {
                 // The journal just finished recovery.
@@ -482,6 +652,11 @@ pub fn Replica(
         }
 
         /// Called by the MessageBus to deliver a message to the replica.
+        fn on_message_from_bus(message_bus: *MessageBus, message: *Message) void {
+            const self = @fieldParentPtr(Self, "message_bus", message_bus);
+            self.on_message(message);
+        }
+
         pub fn on_message(self: *Self, message: *Message) void {
             assert(self.loopback_queue == null);
             assert(message.references > 0);
@@ -533,6 +708,7 @@ pub fn Replica(
                 .request_start_view => self.on_request_start_view(message),
                 .request_prepare => self.on_request_prepare(message),
                 .request_headers => self.on_request_headers(message),
+                .request_block => unreachable, // TODO
                 .headers => self.on_headers(message),
                 .nack_prepare => self.on_nack_prepare(message),
                 // A replica should never handle misdirected messages intended for a client:
@@ -543,6 +719,7 @@ pub fn Replica(
                     });
                     return;
                 },
+                .block => unreachable, // TODO
                 .reserved => unreachable,
             }
 
@@ -731,7 +908,7 @@ pub fn Replica(
             }
 
             // Verify that the new request will fit in the WAL.
-            if (message.header.op
+            if (message.header.op > self.op_checkpoint_trigger()) {
                 log.debug("{}: on_prepare: ignoring op={} (too far ahead, checkpoint={})", .{
                     self.replica,
                     message.header.op,
@@ -749,13 +926,15 @@ pub fn Replica(
             assert(message.header.op > self.op_checkpoint);
             assert(message.header.op > self.op);
             assert(message.header.op > self.commit_min);
-            assert(message.header.op
+            assert(message.header.op <= self.op_checkpoint_trigger());
 
             if (self.follower()) self.normal_status_timeout.reset();
 
             if (message.header.op > self.op + 1) {
                 log.debug("{}: on_prepare: newer op", .{self.replica});
                 self.jump_to_newer_op_in_normal_status(message.header);
+                // "`replica.op` exists" invariant is temporarily broken.
+                assert(self.journal.header_with_op(message.header.op - 1) == null);
             }
 
             if (self.journal.previous_entry(message.header)) |previous| {
```
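Several of the new assertions bound incoming ops by `self.op_checkpoint_trigger()`, whose derivation is not part of this diff. Assuming the trigger is offset from the checkpoint by the WAL size, which is consistent with the TODO above (`op>op_checkpoint+journal_slot_count`), the arithmetic looks like the following sketch; all numbers are invented for illustration:

```zig
const std = @import("std");
const assert = std.debug.assert;

pub fn main() void {
    // Assumed values, for illustration only.
    const journal_slot_count: u64 = 1024; // WAL capacity, in prepares.
    const op_checkpoint: u64 = 2048; // Highest checkpointed op so far.

    // Assumption: the trigger sits one full WAL past the checkpoint, so any
    // op at or below it can be written without overwriting a slot that the
    // last checkpoint still depends on.
    const op_checkpoint_trigger: u64 = op_checkpoint + journal_slot_count;

    const incoming_op: u64 = 2500;
    assert(incoming_op > op_checkpoint); // Checkpointed ops are never re-prepared.
    assert(incoming_op <= op_checkpoint_trigger); // Safe to store and ack.
    std.debug.print("op_checkpoint_trigger={}\n", .{op_checkpoint_trigger});
}
```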
```diff
@@ -782,7 +961,7 @@ pub fn Replica(
 
             if (self.follower()) {
                 // A prepare may already be committed if requested by repair() so take the max:
-                self.
+                self.commit_journal(std.math.max(message.header.commit, self.commit_max));
                 assert(self.commit_max >= message.header.commit);
             }
         }
@@ -802,7 +981,10 @@ pub fn Replica(
             assert(prepare.message.header.op <= self.op);
 
             // Wait until we have `f + 1` prepare_ok messages (including ourself) for quorum:
-            const threshold = self.quorum_replication;
+            // const threshold = self.quorum_replication;
+            // TODO: When Block recover & state transfer are implemented, this can be removed.
+            const threshold =
+                if (prepare.message.header.op == self.op_checkpoint_trigger()) self.replica_count else self.quorum_replication;
 
             const count = self.count_message_and_receive_quorum_exactly_once(
                 &prepare.ok_from_all_replicas,
@@ -867,7 +1049,7 @@ pub fn Replica(
             }
 
             self.normal_status_timeout.reset();
-            self.
+            self.commit_journal(message.header.commit);
         }
 
         fn on_repair(self: *Self, message: *Message) void {
@@ -894,7 +1076,9 @@ pub fn Replica(
             }
 
             if (self.status == .view_change and !self.do_view_change_quorum) {
-                log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{self.replica});
+                log.debug("{}: on_repair: ignoring (view change, waiting for quorum)", .{
+                    self.replica,
+                });
                 return;
             }
 
@@ -911,6 +1095,7 @@ pub fn Replica(
 
             if (self.journal.has_clean(message.header)) {
                 log.debug("{}: on_repair: ignoring (duplicate)", .{self.replica});
+
                 self.send_prepare_ok(message.header);
                 defer self.flush_loopback_queue();
                 return;
@@ -985,6 +1170,28 @@ pub fn Replica(
         /// informs the other replicas of the completion of the view change by sending
         /// ⟨start_view v, l, n, k⟩ messages to the other replicas, where l is the new log, n is the
         /// op number, and k is the commit number.
+        ///
+        /// For each DVC in the quorum:
+        ///
+        /// * The headers must all belong to the same hash chain. (Gaps are allowed).
+        ///   (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+        ///   loaded into the new leader with `replace_header()`, not `repair_header()`).
+        ///
+        /// Across all DVCs in the quorum:
+        ///
+        /// * The headers of every DVC with the same view_normal must agree. In other words:
+        ///   dvc₁.headers[i].op == dvc₂.headers[j].op implies
+        ///   dvc₁.headers[i].checksum == dvc₂.headers[j].checksum.
+        ///   (Reason: the headers bundled with the DVC(s) with the highest view_normal will be
+        ///   loaded into the new leader with `replace_header()`, not `repair_header()`).
+        ///
+        /// Perhaps unintuitively, it is safe to advertise a header before its message is prepared
+        /// (e.g. the write is still queued). The header is either:
+        ///
+        /// * committed — so another replica in the quorum must have a copy, according to the quorum
+        ///   intersection property. Or,
+        /// * uncommitted — if the header is chosen, but cannot be recovered from any replica, then
+        ///   it will be discarded by the nack protocol.
         fn on_do_view_change(self: *Self, message: *Message) void {
             if (self.ignore_view_change_message(message)) return;
 
```
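The agreement rule stated in the new doc comment, that DVCs with the same view_normal may cover different ops but must never assign different checksums to the same op, reduces to a small predicate over header sets. A sketch with a pared-down stand-in for `vsr.Header`:

```zig
const std = @import("std");

// Pared-down stand-in for vsr.Header: just the fields the rule mentions.
const Header = struct { op: u64, checksum: u128 };

/// True iff the two header sets never assign different checksums to the same
/// op. Gaps (ops present in one set but not the other) are allowed.
fn headers_agree(a: []const Header, b: []const Header) bool {
    for (a) |ha| {
        for (b) |hb| {
            if (ha.op == hb.op and ha.checksum != hb.checksum) return false;
        }
    }
    return true;
}

test "DVCs with the same view_normal must agree on shared ops" {
    const dvc1 = [_]Header{ .{ .op = 4, .checksum = 0xa }, .{ .op = 5, .checksum = 0xb } };
    const dvc2 = [_]Header{ .{ .op = 5, .checksum = 0xb }, .{ .op = 6, .checksum = 0xc } };
    try std.testing.expect(headers_agree(&dvc1, &dvc2));

    // A conflicting checksum at op=5 violates the invariant.
    const dvc3 = [_]Header{.{ .op = 5, .checksum = 0xd }};
    try std.testing.expect(!headers_agree(&dvc1, &dvc3));
}
```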
```diff
@@ -1000,8 +1207,9 @@ pub fn Replica(
             // We may receive a `do_view_change` quorum from other replicas, which already have a
             // `start_view_change_quorum`, before we receive a `start_view_change_quorum`:
             if (!self.start_view_change_quorum) {
-                log.debug("{}: on_do_view_change: waiting for start_view_change quorum", .{
+                log.debug("{}: on_do_view_change: waiting for start_view_change quorum (view={})", .{
                     self.replica,
+                    self.view,
                 });
                 return;
             }
@@ -1023,75 +1231,14 @@ pub fn Replica(
                 self.view,
             });
 
-            var v: ?u32 = null;
-            var k: ?u64 = null;
-            var latest = Header.reserved(self.cluster, 0);
-
-            for (self.do_view_change_from_all_replicas) |received, replica| {
-                if (received) |m| {
-                    assert(m.header.command == .do_view_change);
-                    assert(m.header.cluster == self.cluster);
-                    assert(m.header.replica == replica);
-                    assert(m.header.view == self.view);
-
-                    // The latest normal view experienced by this replica:
-                    // This may be higher than the view in any of the prepare headers.
-                    var replica_view_normal = @intCast(u32, m.header.timestamp);
-                    assert(replica_view_normal < m.header.view);
-
-                    var replica_latest = Header.reserved(self.cluster, 0);
-                    set_latest_op(self.message_body_as_headers(m), &replica_latest);
-                    assert(replica_latest.op == m.header.op);
-
-                    log.debug(
-                        "{}: on_do_view_change: replica={} v'={} op={} commit={} latest={}",
-                        .{
-                            self.replica,
-                            m.header.replica,
-                            replica_view_normal,
-                            m.header.op,
-                            m.header.commit,
-                            replica_latest,
-                        },
-                    );
-
-                    if (v == null or replica_view_normal > v.?) {
-                        v = replica_view_normal;
-                        latest = replica_latest;
-                    } else if (replica_view_normal == v.? and replica_latest.op > latest.op) {
-                        v = replica_view_normal;
-                        latest = replica_latest;
-                    }
-
-                    if (k == null or m.header.commit > k.?) k = m.header.commit;
-                }
-            }
-
-            self.set_latest_op_and_k(&latest, k.?, "on_do_view_change");
-
-            // Now that we have the latest op in place, repair any other headers:
-            for (self.do_view_change_from_all_replicas) |received| {
-                if (received) |m| {
-                    for (self.message_body_as_headers(m)) |*h| {
-                        _ = self.repair_header(h);
-                    }
-                }
-            }
-
-            // Verify that the repairs above have not replaced or advanced the latest op:
-            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
-
             assert(self.start_view_change_quorum);
             assert(!self.do_view_change_quorum);
             self.do_view_change_quorum = true;
 
-            self.
+            self.set_log_from_do_view_change_messages();
             assert(self.op >= self.commit_max);
-
-            const prepare_timestamp = self.journal.header_with_op(self.op).?.timestamp;
-            if (self.state_machine.prepare_timestamp < prepare_timestamp) {
-                self.state_machine.prepare_timestamp = prepare_timestamp;
-            }
+            assert(self.state_machine.prepare_timestamp >=
+                self.journal.header_with_op(self.op).?.timestamp);
 
             // Start repairs according to the CTRL protocol:
             assert(!self.repair_timeout.ticking);
@@ -1109,6 +1256,16 @@ pub fn Replica(
         fn on_start_view(self: *Self, message: *const Message) void {
             if (self.ignore_view_change_message(message)) return;
 
+            if (message.header.op > self.op_checkpoint_trigger()) {
+                // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
+                // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
+                // precluding recovery.
+                //
+                // TODO State transfer. Currently this is unreachable because the
+                // leader won't checkpoint until all replicas are caught up.
+                unreachable;
+            }
+
             assert(self.status == .view_change or self.status == .normal);
             assert(message.header.view >= self.view);
             assert(message.header.replica != self.replica);
@@ -1118,20 +1275,12 @@ pub fn Replica(
 
             assert(self.status == .view_change);
             assert(message.header.view == self.view);
+            assert(message.header.op == op_highest(message_body_as_headers(message)));
 
-            var latest = Header.reserved(self.cluster, 0);
-            set_latest_op(self.message_body_as_headers(message), &latest);
-            assert(latest.op == message.header.op);
-
-            self.set_latest_op_and_k(&latest, message.header.commit, "on_start_view");
+            self.set_op_and_commit_max(message.header.op, message.header.commit, "on_start_view");
+            self.replace_headers(message_body_as_headers(message));
 
-            // Now that we have the latest op in place, repair any other headers:
-            for (self.message_body_as_headers(message)) |*h| {
-                _ = self.repair_header(h);
-            }
-
-            // Verify that the repairs above have not replaced or advanced the latest op:
-            assert(self.journal.header_with_op(self.op).?.checksum == latest.checksum);
+            assert(self.op == message.header.op);
 
             if (self.status == .view_change) {
                 self.transition_to_normal_from_view_change_status(message.header.view);
@@ -1142,7 +1291,7 @@ pub fn Replica(
             assert(message.header.view == self.view);
             assert(self.follower());
 
-            self.
+            self.commit_journal(self.commit_max);
 
             self.repair();
         }
@@ -1201,8 +1350,45 @@ pub fn Replica(
                 .commit = self.commit_max,
             };
 
-
-
+            // A recovery response attaches at least as many headers as a DVC message attaches.
+            // To understand why, consider this scenario, where:
+            //
+            //   replica_count                  3
+            //   do_view_change.headers.len     3 (= pipeline_max)
+            //   recovery_response.headers.len  2 (!)
+            //   replica 0 log  3, 4a, 5a, 6a, 7a, 8a  (status=normal, leader)
+            //   replica 1 log  3, 4a, 5a, --, --, --  (status=normal, follower)
+            //   replica 2 log  3, 4b, 5b, --, --, --  (status=recovering)
+            //
+            // 1. Replica 2 receives a recovery_response quorum.
+            // 2. Replica 2 sets `replica.op` to 8a.
+            // 3. Replica 2 sets its headers from the leader's recovery_response (8a, 7a)
+            //    (via `replace_header()`).
+            // 4. Replica 2 transitions to status=normal.
+            // 5. Replica 0 fails (before replica 2 has a chance to repair its hash chain.)
+            // 6. Replica 1 initiates a view change.
+            // 7. Replica 1 collects a DVC quorum:
+            //    replica 1: 3, 4a, 5a (view_normal=latest)
+            //    replica 2: 5b, 7a, 8a (view_normal=latest)
+            //    Replicas 1 and 2 share the highest view_normal, so both sets of headers are canonical.
+            // 8. Replica 1 loads the canonical headers (via `replace_header()`) from both DVCs.
+            //    Messages 8a and 7a will be dropped via `do_view_change_op_max()` (due to the
+            //    gap at op 6). But there is a conflict at op=5. For correctness, replica 1 must
+            //    pick 5a — 5a may be committed by replica 0.
+            //    Without replica 0's assistance, replica 1 has no way to pick between 5a/5b.
+            //
+            // Including at least as many headers in the recovery response as the DVC maintains the
+            // invariant: DVCs with the same view_normal must never disagree on the identity of a
+            // message.
+            //
+            // (DVCs can still safely include gaps — but they must be of the form [4a,__,6a],
+            // not [4a,__,6b]).
+            const count = self.copy_latest_headers_and_set_size(
+                0,
+                self.op,
+                view_change_headers_count,
+                response,
+            );
             assert(count > 0); // We expect that self.op always exists.
             assert(@divExact(response.header.size, @sizeOf(Header)) == 1 + count);
 
@@ -1258,7 +1444,7 @@ pub fn Replica(
             // receiver's state changed in the mean time.
 
             log.debug(
-                "{}: on_recovery_response: replica={} view={}..{} op={}..{} commit={}..{}",
+                "{}: on_recovery_response: replacing response replica={} view={}..{} op={}..{} commit={}..{}",
                 .{
                     self.replica,
                     existing.header.replica,
@@ -1371,17 +1557,41 @@ pub fn Replica(
             // protocol), if the view number indicates that this replica is a leader, it must
             // transition to status=view_change instead of status=normal.
 
-            const leader_headers = self.message_body_as_headers(leader_response.?);
+            const leader_headers = message_body_as_headers(leader_response.?);
             assert(leader_headers.len > 0);
 
             const commit = leader_response.?.header.commit;
             {
-                var latest = Header.reserved(self.cluster, 0);
-                set_latest_op(leader_headers, &latest);
-
+                const op = op_highest(leader_headers);
+                assert(op == leader_response.?.header.op);
+
+                self.set_op_and_commit_max(op, commit, "on_recovery_response");
+
+                // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
+                // problems. We don't want to jump this far ahead to repair, but we still need to
+                // use the hash chain to figure out which headers to request. Maybe include our
+                // `op_checkpoint` in the recovery (request) message so that the response can give
+                // more useful (i.e. older) headers.
+                self.replace_headers(leader_headers);
+
+                if (self.op < config.journal_slot_count) {
+                    if (self.journal.header_with_op(0)) |header| {
+                        assert(header.command == .prepare);
+                        assert(header.operation == .root);
+                    } else {
+                        // This is the first wrap of the log, and the root prepare is corrupt.
+                        // Repair the root repair. This is necessary to maintain the invariant that
+                        // the op=commit_min exists in-memory.
+                        //
+                        // op=0 wouldn't have been repaired by replace_headers above, because it is
+                        // already "checkpointed".
+                        const header = Header.root_prepare(self.cluster);
+                        self.journal.set_header_as_dirty(&header);
+                        log.debug("{}: on_recovery_response: repair root op", .{self.replica});
+                    }
+                }
 
-                self.set_latest_op_and_k(&latest, commit, "on_recovery_response");
-                assert(self.op == latest.op);
+                assert(self.op == op);
                 assert(self.journal.header_with_op(self.op) != null);
             }
 
@@ -1390,30 +1600,7 @@ pub fn Replica(
             assert(self.status == .normal);
             assert(self.follower());
 
-            // TODO If the view's primary is >1 WAL ahead of us, these headers could cause
-            // problems. We don't want to jump this far ahead to repair, but we still need to use
-            // the hash chain to figure out which headers to request. Maybe include our
-            // `op_checkpoint` in the recovery (request) message so that the response can give more
-            // useful (i.e. older) headers.
-            for (leader_headers) |*header| {
-                _ = self.repair_header(header);
-            }
-
-            if (self.op < config.journal_slot_count) {
-                if (self.journal.header_with_op(0)) |header| {
-                    assert(header.command == .prepare);
-                    assert(header.operation == .root);
-                } else {
-                    // This is the first wrap of the log, and the root prepare is corrupt.
-                    // Repair the root repair. This is necessary to maintain the invariant that the
-                    // op=commit_min exists in-memory.
-                    const header = Header.root_prepare(self.cluster);
-                    self.journal.set_header_as_dirty(&header);
-                    log.debug("{}: on_recovery_response: repair root op", .{self.replica});
-                }
-            }
-
-            log.debug("{}: on_recovery_response: responses={} view={} headers={}..{}" ++
+            log.info("{}: on_recovery_response: recovery done responses={} view={} headers={}..{}" ++
                 " commit={} dirty={} faulty={}", .{
                 self.replica,
                 count,
@@ -1429,7 +1616,7 @@ pub fn Replica(
             // `state_machine.commit_timestamp` is updated as messages are committed.
 
             self.reset_quorum_recovery_response();
-            self.
+            self.commit_journal(commit);
             self.repair();
         }
 
@@ -1486,28 +1673,18 @@ pub fn Replica(
                     checksum,
                 });
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            } else {
-                // TODO Do not reissue the read if we are already reading in order to send to
-                // this particular destination replica.
-                self.journal.read_prepare_with_op_and_checksum(
-                    on_request_prepare_read,
-                    op,
-                    prepare_checksum,
-                    message.header.replica,
-                );
-            }
+                // Improve availability by calling `read_prepare_with_op_and_checksum` instead
+                // of `read_prepare` — even if `journal.headers` contains the target message.
+                // The latter skips the read when the target prepare is present but dirty (e.g.
+                // it was recovered with decision=fix).
+                // TODO Do not reissue the read if we are already reading in order to send to
+                // this particular destination replica.
+                self.journal.read_prepare_with_op_and_checksum(
+                    on_request_prepare_read,
+                    op,
+                    prepare_checksum,
+                    message.header.replica,
+                );
 
             // We have guaranteed the prepare (not safe to nack).
             // Our copy may or may not be valid, but we will try to read & forward it.
@@ -1734,7 +1911,7 @@ pub fn Replica(
 
             var op_min: ?u64 = null;
             var op_max: ?u64 = null;
-            for (self.message_body_as_headers(message)) |*h| {
+            for (message_body_as_headers(message)) |*h| {
                 if (op_min == null or h.op < op_min.?) op_min = h.op;
                 if (op_max == null or h.op > op_max.?) op_max = h.op;
                 _ = self.repair_header(h);
@@ -1944,10 +2121,44 @@ pub fn Replica(
                 assert(m.header.replica == message.header.replica);
                 assert(m.header.view == message.header.view);
                 assert(m.header.op == message.header.op);
-                assert(m.header.commit == message.header.commit);
                 assert(m.header.checksum_body == message.header.checksum_body);
-
-
+
+                if (message.header.command == .do_view_change) {
+                    // Replicas don't resend `do_view_change` messages to themselves.
+                    assert(message.header.replica != self.replica);
+                    // A replica may resend a `do_view_change` with a different commit if it was
+                    // committing originally. Keep the one with the highest commit.
+                    // This is *not* necessary for correctness.
+                    if (m.header.commit < message.header.commit) {
+                        log.debug("{}: on_{s}: replacing (newer message replica={} commit={}..{})", .{
+                            self.replica,
+                            command,
+                            message.header.replica,
+                            m.header.commit,
+                            message.header.commit,
+                        });
+                        // TODO(Buggify): skip updating the DVC, since it isn't required for correctness.
+                        self.message_bus.unref(m);
+                        messages[message.header.replica] = message.ref();
+                    } else if (m.header.commit > message.header.commit) {
+                        log.debug("{}: on_{s}: ignoring (older message replica={})", .{
+                            self.replica,
+                            command,
+                            message.header.replica,
+                        });
+                    } else {
+                        assert(m.header.checksum == message.header.checksum);
+                    }
+                } else {
+                    assert(m.header.commit == message.header.commit);
+                    assert(m.header.checksum == message.header.checksum);
+                }
+
+                log.debug("{}: on_{s}: ignoring (duplicate message replica={})", .{
+                    self.replica,
+                    command,
+                    message.header.replica,
+                });
                 return null;
             }
 
```
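The new `do_view_change` branch above keeps whichever duplicate carries the higher commit and insists on identity when commits tie. Stripped of messaging concerns, this is a keep-max-by-key update; `Stored` below is a hypothetical record, not a codebase type:

```zig
const std = @import("std");
const assert = std.debug.assert;

const Stored = struct { commit: u64, checksum: u128 };

// Mirrors the branch above: replace on strictly higher commit, ignore on
// strictly lower, and require identity when the commit is equal.
fn dedup(existing: *Stored, incoming: Stored) void {
    if (existing.commit < incoming.commit) {
        existing.* = incoming; // Newer resend: keep the higher commit.
    } else if (existing.commit > incoming.commit) {
        // Older resend: ignore.
    } else {
        // The same commit must mean the same message.
        assert(existing.checksum == incoming.checksum);
    }
}

pub fn main() void {
    var slot = Stored{ .commit = 7, .checksum = 0xaa };
    dedup(&slot, .{ .commit = 9, .checksum = 0xbb });
    assert(slot.commit == 9 and slot.checksum == 0xbb);
    dedup(&slot, .{ .commit = 8, .checksum = 0xcc }); // Ignored.
    assert(slot.commit == 9);
    std.debug.print("commit={}\n", .{slot.commit});
}
```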
```diff
@@ -2004,6 +2215,7 @@ pub fn Replica(
                     if (self.replica_count == 2) assert(threshold == 1);
 
                     assert(self.status == .view_change);
+                    assert(self.replica != message.header.replica);
                 },
                 .nack_prepare => {
                     assert(self.replica_count > 1);
@@ -2011,6 +2223,8 @@ pub fn Replica(
 
                     assert(self.status == .view_change);
                     assert(self.leader_index(self.view) == self.replica);
+                    assert(message.header.replica != self.replica);
+                    assert(message.header.op == self.nack_prepare_op.?);
                 },
                 else => unreachable,
             }
@@ -2065,9 +2279,15 @@ pub fn Replica(
                 // In a cluster-of-one, the prepares must always be written to the WAL sequentially
                 // (never concurrently). This ensures that there will be no gaps in the WAL during
                 // crash recovery.
-                log.debug("{}: append: serializing append op={}", .{ self.replica, message.header.op });
+                log.debug("{}: append: serializing append op={}", .{
+                    self.replica,
+                    message.header.op,
+                });
             } else {
-                log.debug("{}: append: appending to journal", .{self.replica});
+                log.debug("{}: append: appending to journal op={}", .{
+                    self.replica,
+                    message.header.op,
+                });
                 self.write_prepare(message, .append);
             }
         }
@@ -2115,9 +2335,9 @@ pub fn Replica(
         }
 
         /// Commit ops up to commit number `commit` (inclusive).
-        /// A function which calls `
-        /// Otherwise, we may fork the log.
-        fn
+        /// A function which calls `commit_journal()` to set `commit_max` must first call
+        /// `view_jump()`. Otherwise, we may fork the log.
+        fn commit_journal(self: *Self, commit: u64) void {
            // TODO Restrict `view_change` status only to the leader purely as defense-in-depth.
             // Be careful of concurrency when doing this, as successive view changes can happen quickly.
             assert(self.status == .normal or self.status == .view_change or
@@ -2131,9 +2351,9 @@ pub fn Replica(
             if (commit <= self.commit_min) return;
 
             // We must update `commit_max` even if we are already committing, otherwise we will lose
-            // information that we should know, and `
+            // information that we should know, and `set_op_and_commit_max()` will catch us out:
             if (commit > self.commit_max) {
-                log.debug("{}:
+                log.debug("{}: commit_journal: advancing commit_max={}..{}", .{
                     self.replica,
                     self.commit_max,
                     commit,
@@ -2141,9 +2361,9 @@ pub fn Replica(
                 self.commit_max = commit;
             }
 
-            // Guard against multiple concurrent invocations of
+            // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
             if (self.committing) {
-                log.debug("{}:
+                log.debug("{}: commit_journal: already committing...", .{self.replica});
                 return;
             }
 
@@ -2160,19 +2380,19 @@ pub fn Replica(
             assert(!self.committing);
             self.committing = true;
 
-            self.
+            self.commit_journal_next();
         }
 
-        fn
+        fn commit_journal_next(self: *Self) void {
             assert(self.committing);
             assert(self.status == .normal or self.status == .view_change or
                 (self.status == .recovering and self.replica_count == 1));
             assert(self.commit_min <= self.commit_max);
             assert(self.commit_min <= self.op);
 
-            if (!self.valid_hash_chain("
-                self.committing = false;
+            if (!self.valid_hash_chain("commit_journal_next")) {
                 assert(self.replica_count > 1);
+                self.commit_ops_done();
                 return;
             }
             assert(self.op >= self.commit_max);
@@ -2182,9 +2402,9 @@ pub fn Replica(
             if (self.commit_min < self.commit_max and self.commit_min < self.op) {
                 const op = self.commit_min + 1;
                 const checksum = self.journal.header_with_op(op).?.checksum;
-                self.journal.read_prepare(
+                self.journal.read_prepare(commit_journal_next_callback, op, checksum, null);
             } else {
-                self.committing = false;
+                self.commit_ops_done();
                 // This is an optimization to expedite the view change before the `repair_timeout`:
                 if (self.status == .view_change and self.repairs_allowed()) self.repair();
 
@@ -2194,33 +2414,43 @@ pub fn Replica(
                 assert(self.commit_min == self.op);
                 self.transition_to_normal_from_recovering_status(0);
             } else {
-                // We expect that a cluster-of-one only calls
+                // We expect that a cluster-of-one only calls commit_journal() in recovering status.
                 assert(self.replica_count > 1);
             }
                 }
             }
         }
 
-        fn
-        assert(destination_replica == null);
-
+        fn commit_journal_next_callback(self: *Self, prepare: ?*Message, destination_replica: ?u8) void {
             assert(self.committing);
-
+            assert(destination_replica == null);
 
             if (prepare == null) {
-
+                self.commit_ops_done();
+                log.debug("{}: commit_journal_next_callback: prepare == null", .{self.replica});
                if (self.replica_count == 1) @panic("cannot recover corrupt prepare");
```
|
|
2212
2431
|
return;
|
|
2213
2432
|
}
|
|
2214
2433
|
|
|
2434
|
+
const slot = self.journal.slot_with_op_and_checksum(
|
|
2435
|
+
prepare.?.header.op,
|
|
2436
|
+
prepare.?.header.checksum,
|
|
2437
|
+
).?;
|
|
2438
|
+
assert(self.journal.prepare_inhabited[slot.index]);
|
|
2439
|
+
assert(self.journal.prepare_checksums[slot.index] == prepare.?.header.checksum);
|
|
2440
|
+
assert(self.journal.has(prepare.?.header));
|
|
2441
|
+
|
|
2215
2442
|
switch (self.status) {
|
|
2216
2443
|
.normal => {},
|
|
2217
2444
|
.view_change => {
|
|
2218
2445
|
if (self.leader_index(self.view) != self.replica) {
|
|
2219
|
-
|
|
2446
|
+
self.commit_ops_done();
|
|
2447
|
+
log.debug("{}: commit_journal_next_callback: no longer leader view={}", .{
|
|
2448
|
+
self.replica,
|
|
2449
|
+
self.view,
|
|
2450
|
+
});
|
|
2220
2451
|
assert(self.replica_count > 1);
|
|
2221
2452
|
return;
|
|
2222
2453
|
}
|
|
2223
|
-
|
|
2224
2454
|
// Only the leader may commit during a view change before starting the new view.
|
|
2225
2455
|
// Fall through if this is indeed the case.
|
|
2226
2456
|
},
|
|
@@ -2231,31 +2461,194 @@ pub fn Replica(
|
|
|
2231
2461
|
}
|
|
2232
2462
|
|
|
2233
2463
|
const op = self.commit_min + 1;
|
|
2464
|
+
assert(prepare.?.header.op == op);
|
|
2234
2465
|
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2466
|
+
self.commit_op_prefetch(prepare.?, commit_journal_callback);
|
|
2467
|
+
}
|
|
2468
|
+
|
|
2469
|
+
fn commit_journal_callback(self: *Self) void {
|
|
2470
|
+
assert(self.committing);
|
|
2471
|
+
assert(self.commit_min <= self.commit_max);
|
|
2472
|
+
assert(self.commit_min <= self.op);
|
|
2473
|
+
|
|
2474
|
+
self.commit_journal_next();
|
|
2475
|
+
}
|
|
2476
|
+
|
|
2477
|
+
/// Begin the commit path that is common between `commit_pipeline` and `commit_journal`:
|
|
2478
|
+
///
|
|
2479
|
+
/// 1. prefetch
|
|
2480
|
+
/// 2. commit_op: Update the state machine and the replica's commit_min/commit_max.
|
|
2481
|
+
/// 3. compact
|
|
2482
|
+
/// 4. checkpoint: (Only called when `commit_min == op_checkpoint_trigger`).
|
|
2483
|
+
/// 5. done: Call the `callback` that was passed to `commit_op_prefetch`.
|
|
2484
|
+
fn commit_op_prefetch(
|
|
2485
|
+
self: *Self,
|
|
2486
|
+
prepare: *Message,
|
|
2487
|
+
callback: fn (*Self) void,
|
|
2488
|
+
) void {
|
|
2489
|
+
assert(self.committing);
|
|
2490
|
+
assert(self.status == .normal or self.status == .view_change or
|
|
2491
|
+
(self.status == .recovering and self.replica_count == 1));
|
|
2492
|
+
assert(self.commit_prepare == null);
|
|
2493
|
+
assert(self.commit_callback == null);
|
|
2494
|
+
assert(prepare.header.command == .prepare);
|
|
2495
|
+
assert(prepare.header.operation != .root);
|
|
2496
|
+
assert(prepare.header.op == self.commit_min + 1);
|
|
2497
|
+
assert(prepare.header.op <= self.op);
|
|
2498
|
+
|
|
2499
|
+
self.commit_prepare = prepare.ref();
|
|
2500
|
+
self.commit_callback = callback;
|
|
2501
|
+
self.state_machine.prefetch(
|
|
2502
|
+
commit_op_prefetch_callback,
|
|
2503
|
+
prepare.header.op,
|
|
2504
|
+
prepare.header.operation.cast(StateMachine),
|
|
2505
|
+
prepare.body(),
|
|
2506
|
+
);
|
|
2507
|
+
}
|
|
2508
|
+
|
|
2509
|
+
fn commit_op_prefetch_callback(state_machine: *StateMachine) void {
|
|
2510
|
+
const self = @fieldParentPtr(Self, "state_machine", state_machine);
|
|
2511
|
+
assert(self.committing);
|
|
2512
|
+
assert(self.commit_prepare != null);
|
|
2513
|
+
assert(self.commit_callback != null);
|
|
2514
|
+
assert(self.commit_prepare.?.header.op == self.commit_min + 1);
|
|
2515
|
+
|
|
2516
|
+
self.commit_op(self.commit_prepare.?);
|
|
2517
|
+
assert(self.commit_min == self.commit_prepare.?.header.op);
|
|
2518
|
+
assert(self.commit_min <= self.commit_max);
|
|
2519
|
+
|
|
2520
|
+
if (self.status == .normal and self.leader()) {
|
|
2521
|
+
const prepare = self.pipeline.pop().?;
|
|
2522
|
+
assert(self.commit_min == self.commit_max);
|
|
2523
|
+
assert(prepare.message.header.checksum == self.commit_prepare.?.header.checksum);
|
|
2524
|
+
assert(prepare.message.header.op == self.commit_min);
|
|
2525
|
+
assert(prepare.message.header.op == self.commit_max);
|
|
2526
|
+
assert(self.prepare_timeout.ticking);
|
|
2527
|
+
|
|
2528
|
+
self.message_bus.unref(prepare.message);
|
|
2529
|
+
|
|
2530
|
+
if (self.pipeline.head_ptr()) |next| {
|
|
2531
|
+
assert(next.message.header.op == self.commit_min + 1);
|
|
2532
|
+
assert(next.message.header.op == self.commit_prepare.?.header.op + 1);
|
|
2533
|
+
|
|
2534
|
+
if (self.replica_count == 1) {
|
|
2535
|
+
// Write the next message in the queue.
|
|
2536
|
+
// A cluster-of-one writes prepares sequentially to avoid gaps in the
|
|
2537
|
+
// WAL caused by reordered writes.
|
|
2538
|
+
log.debug("{}: append: appending to journal op={}", .{
|
|
2539
|
+
self.replica,
|
|
2540
|
+
next.message.header.op,
|
|
2541
|
+
});
|
|
2542
|
+
self.write_prepare(next.message, .append);
|
|
2543
|
+
}
|
|
2544
|
+
} else {
|
|
2545
|
+
// When the pipeline is empty, stop the prepare timeout.
|
|
2546
|
+
// The timeout will be restarted when another entry arrives for the pipeline.
|
|
2547
|
+
self.prepare_timeout.stop();
|
|
2548
|
+
}
|
|
2239
2549
|
}
|
|
2240
2550
|
|
|
2241
|
-
|
|
2242
|
-
|
|
2243
|
-
|
|
2244
|
-
|
|
2551
|
+
self.state_machine.compact(commit_op_compact_callback, self.commit_prepare.?.header.op);
|
|
2552
|
+
}
|
|
2553
|
+
|
|
2554
|
+
fn commit_op_compact_callback(state_machine: *StateMachine) void {
|
|
2555
|
+
const self = @fieldParentPtr(Self, "state_machine", state_machine);
|
|
2556
|
+
assert(self.committing);
|
|
2557
|
+
assert(self.commit_callback != null);
|
|
2558
|
+
assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
|
|
2559
|
+
assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
|
|
2560
|
+
|
|
2561
|
+
const op = self.commit_prepare.?.header.op;
|
|
2562
|
+
assert(op == self.commit_min);
|
|
2563
|
+
|
|
2564
|
+
if (op == self.op_checkpoint_trigger()) {
|
|
2565
|
+
assert(op == self.op);
|
|
2566
|
+
assert((op + 1) % config.lsm_batch_multiple == 0);
|
|
2567
|
+
log.debug("{}: commit_op_compact_callback: checkpoint start " ++
|
|
2568
|
+
"(op={} current_checkpoint={} next_checkpoint={})", .{
|
|
2569
|
+
self.replica,
|
|
2570
|
+
self.op,
|
|
2571
|
+
self.op_checkpoint,
|
|
2572
|
+
self.op_checkpoint_next(),
|
|
2573
|
+
});
|
|
2574
|
+
self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
|
|
2575
|
+
} else {
|
|
2576
|
+
assert(op < self.op_checkpoint_trigger());
|
|
2577
|
+
self.commit_op_done();
|
|
2245
2578
|
}
|
|
2579
|
+
}
|
|
2246
2580
|
|
|
2247
|
-
|
|
2581
|
+
fn commit_op_checkpoint_state_machine_callback(state_machine: *StateMachine) void {
|
|
2582
|
+
const self = @fieldParentPtr(Self, "state_machine", state_machine);
|
|
2583
|
+
assert(self.committing);
|
|
2584
|
+
assert(self.commit_callback != null);
|
|
2585
|
+
assert(self.commit_prepare.?.header.op == self.op);
|
|
2586
|
+
assert(self.commit_prepare.?.header.op == self.commit_min);
|
|
2587
|
+
assert(self.commit_prepare.?.header.op == self.op_checkpoint_trigger());
|
|
2248
2588
|
|
|
2249
|
-
|
|
2250
|
-
|
|
2251
|
-
|
|
2589
|
+
// For the given WAL (journal_slot_count=8, lsm_batch_multiple=2, op=commit_min=7):
|
|
2590
|
+
//
|
|
2591
|
+
// A B C D E
|
|
2592
|
+
// |01|23|45|67|
|
|
2593
|
+
//
|
|
2594
|
+
// The checkpoint is triggered at "E".
|
|
2595
|
+
// At this point, ops 6 and 7 are in the in-memory immutable table.
|
|
2596
|
+
// They will only be compacted to disk in the next bar.
|
|
2597
|
+
// Therefore, only ops "A..D" are committed to disk.
|
|
2598
|
+
// Thus, the SuperBlock's `commit_min` is set to 7-2=5.
|
|
2599
|
+
const vsr_state_new = .{
|
|
2600
|
+
.commit_min = self.op_checkpoint_next(),
|
|
2601
|
+
.commit_max = self.commit_max,
|
|
2602
|
+
.view_normal = self.view_normal,
|
|
2603
|
+
.view = self.view,
|
|
2604
|
+
};
|
|
2605
|
+
assert(VSRState.monotonic(self.superblock.working.vsr_state, vsr_state_new));
|
|
2252
2606
|
|
|
2253
|
-
self.
|
|
2254
|
-
self.
|
|
2607
|
+
self.superblock.staging.vsr_state = vsr_state_new;
|
|
2608
|
+
self.superblock.checkpoint(
|
|
2609
|
+
commit_op_checkpoint_superblock_callback,
|
|
2610
|
+
&self.superblock_context,
|
|
2611
|
+
);
|
|
2612
|
+
}
|
|
2613
|
+
|
|
2614
|
+
fn commit_op_checkpoint_superblock_callback(superblock_context: *SuperBlock.Context) void {
|
|
2615
|
+
const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
|
|
2616
|
+
assert(self.committing);
|
|
2617
|
+
assert(self.commit_callback != null);
|
|
2618
|
+
assert(self.commit_prepare.?.header.op == self.op);
|
|
2619
|
+
assert(self.commit_prepare.?.header.op == self.commit_min);
|
|
2620
|
+
|
|
2621
|
+
self.op_checkpoint = self.op_checkpoint_next();
|
|
2622
|
+
assert(self.op_checkpoint == self.commit_min - config.lsm_batch_multiple);
|
|
2623
|
+
assert(self.op_checkpoint == self.superblock.staging.vsr_state.commit_min);
|
|
2624
|
+
assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
|
|
2625
|
+
|
|
2626
|
+
log.debug("{}: commit_op_compact_callback: checkpoint done (op={} new_checkpoint={})", .{
|
|
2627
|
+
self.replica,
|
|
2628
|
+
self.op,
|
|
2629
|
+
self.op_checkpoint,
|
|
2630
|
+
});
|
|
2631
|
+
|
|
2632
|
+
self.commit_op_done();
|
|
2633
|
+
}
|
|
2634
|
+
|
|
2635
|
+
fn commit_op_done(self: *Self) void {
|
|
2636
|
+
const callback = self.commit_callback.?;
|
|
2637
|
+
assert(self.committing);
|
|
2638
|
+
assert(self.commit_prepare.?.header.op == self.commit_min);
|
|
2639
|
+
assert(self.commit_prepare.?.header.op < self.op_checkpoint_trigger());
|
|
2640
|
+
|
|
2641
|
+
self.message_bus.unref(self.commit_prepare.?);
|
|
2642
|
+
self.commit_prepare = null;
|
|
2643
|
+
self.commit_callback = null;
|
|
2644
|
+
callback(self);
|
|
2255
2645
|
}
|
|
2256
2646
|
|
|
2257
2647
|
fn commit_op(self: *Self, prepare: *const Message) void {
|
|
2258
2648
|
// TODO Can we add more checks around allowing commit_op() during a view change?
|
|
2649
|
+
assert(self.committing);
|
|
2650
|
+
assert(self.commit_prepare.? == prepare);
|
|
2651
|
+
assert(self.commit_callback != null);
|
|
2259
2652
|
assert(self.status == .normal or self.status == .view_change or
|
|
2260
2653
|
(self.status == .recovering and self.replica_count == 1));
|
|
2261
2654
|
assert(prepare.header.command == .prepare);
|
|
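
The commit path introduced above threads one logical commit through a chain of asynchronous callbacks (prefetch, commit_op, compact, checkpoint, done), and each callback recovers the owning replica from the embedded field pointer with `@fieldParentPtr`. A minimal sketch of that recovery pattern, in Zig 0.9-era syntax with toy `Replica`/`StateMachine` types rather than the real ones from this diff:

    const std = @import("std");
    const assert = std.debug.assert;

    const StateMachine = struct {
        commit_timestamp: u64 = 0,
    };

    const Replica = struct {
        committing: bool = false,
        state_machine: StateMachine = .{},

        // An async step completes and invokes this callback with only the
        // field pointer; the parent struct is recovered structurally.
        fn step_callback(state_machine: *StateMachine) void {
            const self = @fieldParentPtr(Replica, "state_machine", state_machine);
            assert(self.committing);
            self.committing = false;
        }
    };

    pub fn main() void {
        var replica = Replica{ .committing = true };
        Replica.step_callback(&replica.state_machine);
        assert(!replica.committing);
        std.debug.print("parent recovered via @fieldParentPtr\n", .{});
    }

This keeps the callbacks free of explicit context arguments: the state machine only ever sees its own pointer, and the replica is derived from the field's offset at comptime.
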
@@ -2263,10 +2656,12 @@ pub fn Replica(
             assert(prepare.header.op == self.commit_min + 1);
             assert(prepare.header.op <= self.op);
 
-            // If we are a follower committing through `
-            // happened since we last checked in `
-            // subsequent ops, since by now we have already verified the hash chain for
+            // If we are a follower committing through `commit_journal()` then a view change may
+            // have happened since we last checked in `commit_journal_next()`. However, this would
+            // relate to subsequent ops, since by now we have already verified the hash chain for
+            // this commit.
 
+            assert(self.journal.has(prepare.header));
             assert(self.journal.header_with_op(self.commit_min).?.checksum ==
                 prepare.header.parent);
 
@@ -2282,10 +2677,16 @@ pub fn Replica(
             const reply = self.message_bus.get_message();
             defer self.message_bus.unref(reply);
 
+            log.debug("{}: commit_op: commit_timestamp={} prepare.header.timestamp={}", .{
+                self.replica,
+                self.state_machine.commit_timestamp,
+                prepare.header.timestamp,
+            });
             assert(self.state_machine.commit_timestamp < prepare.header.timestamp);
 
             const reply_body_size = @intCast(u32, self.state_machine.commit(
                 prepare.header.client,
+                prepare.header.op,
                 prepare.header.operation.cast(StateMachine),
                 prepare.buffer[@sizeOf(Header)..prepare.header.size],
                 reply.buffer[@sizeOf(Header)..],
@@ -2310,20 +2711,38 @@ pub fn Replica(
                 .replica = prepare.header.replica,
                 .view = prepare.header.view,
                 .op = prepare.header.op,
+                .timestamp = prepare.header.timestamp,
                 .commit = prepare.header.op,
                 .size = @sizeOf(Header) + reply_body_size,
             };
-            assert(reply.header.timestamp == 0);
             assert(reply.header.epoch == 0);
 
             reply.header.set_checksum_body(reply.body());
             reply.header.set_checksum();
 
-            if (
-
-
-            self.
-
+            if (self.superblock.working.vsr_state.op_compacted(prepare.header.op)) {
+                // We are recovering from a checkpoint. Prior to the crash, the client table was
+                // updated with entries for one bar beyond the op_checkpoint.
+                assert(self.op_checkpoint == self.superblock.working.vsr_state.commit_min);
+                if (self.client_table().get(prepare.header.client)) |entry| {
+                    assert(entry.reply.header.command == .reply);
+                    assert(entry.reply.header.op >= prepare.header.op);
+                } else {
+                    assert(self.client_table().count() == self.client_table().capacity());
+                }
+
+                log.debug("{}: commit_op: skip client table update: prepare.op={} checkpoint={}", .{
+                    self.replica,
+                    prepare.header.op,
+                    self.op_checkpoint,
+                });
+            } else {
+                if (reply.header.operation == .register) {
+                    self.create_client_table_entry(reply);
+                } else {
+                    self.update_client_table_entry(reply);
+                }
+            }
 
             if (self.leader_index(self.view) == self.replica) {
                 log.debug("{}: commit_op: replying to client: {}", .{ self.replica, reply.header });
@@ -2332,22 +2751,38 @@ pub fn Replica(
         }
 
         /// Commits, frees and pops as many prepares at the head of the pipeline as have quorum.
+        /// Can be called only when the replica is the leader.
         /// Can be called only when the pipeline has at least one prepare.
-        /// Stops the prepare timeout and resets the timeouts counter if the pipeline becomes empty.
         fn commit_pipeline(self: *Self) void {
             assert(self.status == .normal);
             assert(self.leader());
             assert(self.pipeline.count > 0);
 
-
-
+            // Guard against multiple concurrent invocations of commit_journal()/commit_pipeline():
+            if (self.committing) {
+                log.debug("{}: commit_pipeline: already committing...", .{self.replica});
+                return;
+            }
+
+            self.committing = true;
+            self.commit_pipeline_next();
+        }
+
+        fn commit_pipeline_next(self: *Self) void {
+            assert(self.committing);
+            assert(self.status == .normal);
+            assert(self.leader());
+
+            if (self.pipeline.head_ptr()) |prepare| {
                 assert(self.commit_min == self.commit_max);
-                assert(self.
-                assert(self.
+                assert(self.commit_min + 1 == prepare.message.header.op);
+                assert(self.commit_min + self.pipeline.count == self.op);
+                assert(self.journal.has(prepare.message.header));
 
                 if (!prepare.ok_quorum_received) {
                     // Eventually handled by on_prepare_timeout().
                     log.debug("{}: commit_pipeline: waiting for quorum", .{self.replica});
+                    self.commit_ops_done();
                     return;
                 }
 
@@ -2355,26 +2790,30 @@ pub fn Replica(
                 assert(count >= self.quorum_replication);
                 assert(count <= self.replica_count);
 
-                self.
-
-
-
+                self.commit_op_prefetch(prepare.message, commit_pipeline_callback);
+            } else {
+                self.commit_ops_done();
+            }
+        }
 
-
+        fn commit_pipeline_callback(self: *Self) void {
+            assert(self.committing);
+            assert(self.commit_min <= self.commit_max);
+            assert(self.commit_min <= self.op);
 
-
-
-
-            // A cluster-of-one writes prepares sequentially to avoid gaps in the WAL.
-            self.write_prepare(head.message, .append);
-            // The loop will wrap around and exit when `!ok_quorum_received`.
-            }
+            if (self.status == .normal and self.leader()) {
+                if (self.pipeline.head_ptr()) |pipeline_head| {
+                    assert(pipeline_head.message.header.op == self.commit_min + 1);
                 }
+                self.commit_pipeline_next();
+            } else {
+                self.commit_ops_done();
             }
+        }
 
-
-
-
+        fn commit_ops_done(self: *Self) void {
+            assert(self.committing);
+            self.committing = false;
         }
 
         fn copy_latest_headers_and_set_size(
@@ -2402,7 +2841,10 @@ pub fn Replica(
             const count = self.journal.copy_latest_headers_between(
                 op_min,
                 op_max,
-                std.mem.bytesAsSlice(
+                std.mem.bytesAsSlice(
+                    Header,
+                    message.buffer[@sizeOf(Header)..][0..body_size_max],
+                ),
             );
 
             message.header.size = @intCast(u32, @sizeOf(Header) * (1 + count));
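
`copy_latest_headers_and_set_size` above hands the journal a destination slice by reinterpreting the message body (everything after the leading `Header`) with `std.mem.bytesAsSlice`. A small sketch of that reinterpretation, using a toy 16-byte header in place of the real VSR `Header`:

    const std = @import("std");
    const assert = std.debug.assert;

    // A toy fixed-size header; the real VSR Header is much larger.
    const TestHeader = extern struct {
        op: u64,
        parent: u64,
    };

    pub fn main() void {
        var buffer: [@sizeOf(TestHeader) * 4]u8 align(@alignOf(TestHeader)) = undefined;

        // View the raw bytes as a slice of headers, as the diff does for
        // message.buffer[@sizeOf(Header)..][0..body_size_max]:
        const headers = std.mem.bytesAsSlice(TestHeader, buffer[0..]);
        assert(headers.len == 4);

        headers[0] = .{ .op = 7, .parent = 0 };
        assert(headers[0].op == 7);
        std.debug.print("headers.len={}\n", .{headers.len});
    }

No bytes are copied: the journal writes headers directly into the message buffer, and the header count then determines `message.header.size`.
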
@@ -2426,17 +2868,8 @@ pub fn Replica(
             assert(m.header.context == context);
             assert(m.header.replica == replica);
             switch (command) {
-                .start_view_change => {
-                    assert(m.header.replica != self.replica);
-                    assert(m.header.view == self.view);
-                },
                 .do_view_change => assert(m.header.view == self.view),
                 .recovery_response => assert(m.header.replica != self.replica),
-                .nack_prepare => {
-                    // TODO See if we can restrict this branch further.
-                    assert(m.header.replica != self.replica);
-                    assert(m.header.op == self.nack_prepare_op.?);
-                },
                 else => unreachable,
             }
             count += 1;
@@ -2473,12 +2906,12 @@ pub fn Replica(
             // we do require that all entries have different commit numbers and are iterated.
             // This ensures that we will always pick the entry with the oldest commit number.
             // We also check that a client has only one entry in the hash map (or it's buggy).
-            const clients = self.client_table.count();
+            const clients = self.client_table().count();
             assert(clients <= config.clients_max);
             if (clients == config.clients_max) {
                 var evictee: ?*Message = null;
                 var iterated: usize = 0;
-                var iterator = self.client_table.
+                var iterator = self.client_table().iterator();
                 while (iterator.next()) |entry| : (iterated += 1) {
                     assert(entry.reply.header.command == .reply);
                     assert(entry.reply.header.context == 0);
@@ -2503,8 +2936,7 @@ pub fn Replica(
                     config.clients_max,
                     evictee.?.header.client,
                 });
-
-                assert(!self.client_table.contains(evictee.?.header.client));
+                self.client_table().remove(evictee.?.header.client);
                 self.message_bus.unref(evictee.?);
             }
 
@@ -2517,11 +2949,11 @@ pub fn Replica(
 
             // Any duplicate .register requests should have received the same session number if the
             // client table entry already existed, or been dropped if a session was being committed:
-            self.client_table.
+            self.client_table().put(&.{
                 .session = session,
                 .reply = reply.ref(),
             });
-            assert(self.client_table.count() <= config.clients_max);
+            assert(self.client_table().count() <= config.clients_max);
         }
 
         /// The caller owns the returned message, if any, which has exactly 1 reference.
@@ -2545,19 +2977,16 @@ pub fn Replica(
                 // We use the `timestamp` field to send this in addition to the current view number:
                 .timestamp = if (command == .do_view_change) self.view_normal else 0,
                 .op = self.op,
-
+                // See the comment in `on_do_view_change()` for why `commit_min` is crucial:
+                .commit = if (command == .do_view_change) self.commit_min else self.commit_max,
             };
 
-
-
-
-
-
-
-            const count_max = config.pipeline_max;
-            assert(count_max > 0);
-
-            const count = self.copy_latest_headers_and_set_size(0, self.op, count_max, message);
+            const count = self.copy_latest_headers_and_set_size(
+                0,
+                self.op,
+                view_change_headers_count,
+                message,
+            );
             assert(count > 0); // We expect that self.op always exists.
             assert(@divExact(message.header.size, @sizeOf(Header)) == 1 + count);
 
@@ -2585,106 +3014,65 @@ pub fn Replica(
             return message.ref();
         }
 
-        ///
-        ///
-        ///
+        /// Returns the op of the highest canonical message, according to this replica (the new
+        /// leader) prior to loading the current view change's DVC quorum headers.
+        /// When this replica participated in the last `view_normal`, this is just `replica.op`.
         ///
-        ///
-        ///
-        ///
-        ///
+        /// - A *canonical* message was part of the last view_normal.
+        /// - An *uncanonical* message may have been removed/changed by a prior view.
+        /// - Canonical messages do not necessarily survive into the new view, but they take
+        ///   precedence over uncanonical messages.
+        /// - Canonical messages may be committed or uncommitted.
         ///
-        ///
-        /// the rest of the cluster may have already discarded it. We therefore iterate over our
-        /// uncommitted header gaps and compare them with the quorum of do_view_change messages
-        /// received from other replicas, before starting the new view, to discard any that may be
-        /// impossible to repair.
+        /// Consider these logs:
         ///
-        ///
-        ///
-        ///
-
-
-
+        ///   replica 0: 4, 5, 6b, 7b, 8b (commit_min=6b, leader, status=normal, view=X)
+        ///   replica 1: 4, 5, 6b, --, -- (commit_min=5, follower, status=normal, view=X)
+        ///   replica 2: 4, 5, 6a, --, 8b (view<X)
+        ///
+        /// 1. Replica 0 crashes immediately after committing 6b.
+        /// 2. Replicas 1 and 2 must determine the new chain HEAD.
+        /// 3. 8b is discarded due to the gap in 7.
+        /// 4. To distinguish between 6a and 6b (and safely discard 6a), the new leader trusts ops
+        ///    from the DVC(s) with the greatest `view_normal`.
+        fn op_canonical_max(self: *const Self, view_normal_canonical: u64) usize {
+            assert(self.replica_count > 1);
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
             assert(self.do_view_change_quorum);
             assert(!self.repair_timeout.ticking);
-            assert(self.op
-            assert(self.
-            assert(self.op - self.commit_max <= config.journal_slot_count);
+            assert(self.journal.header_with_op(self.op) != null);
+            assert(self.view_normal <= view_normal_canonical);
 
-
-            if (threshold == 0) {
-                assert(self.replica_count == 2);
-                return;
-            }
+            if (self.view_normal == view_normal_canonical) return self.op;
 
-
-
-
-
-
-
-
-
-            log.debug("{}: discard_uncommitted_headers: op={} gap", .{ self.replica, op });
-
-            var nacks: usize = 0;
-            for (self.do_view_change_from_all_replicas) |received, replica| {
-                if (received) |m| {
-                    assert(m.header.command == .do_view_change);
-                    assert(m.header.cluster == self.cluster);
-                    assert(m.header.replica == replica);
-                    assert(m.header.view == self.view);
-                    assert(m.header.commit <= self.commit_max);
-
-                    if (replica != self.replica) {
-                        // Check for a gap in the uncommitted headers from this replica.
-                        const received_headers = self.message_body_as_headers(m);
-                        assert(received_headers.len >= 1);
-
-                        const received_op_min = received_headers[received_headers.len - 1].op;
-                        const received_op_max = received_headers[0].op;
-                        assert(received_op_max >= received_op_min);
-
-                        const nack = for (received_headers) |*h| {
-                            if (h.op == op) break false;
-                        } else nack: {
-                            // Don't nack ops that didn't fit in the message's attached headers.
-                            break :nack op >= received_op_min;
-                        };
-
-                        if (nack) nacks += 1;
-                        log.debug("{}: discard_uncommitted_headers: replica={} op={} nack={}", .{
-                            self.replica,
-                            m.header.replica,
-                            op,
-                            nack,
-                        });
-                    }
-                }
-            }
+            const uncanonical_op_count = std.math.min(
+                // Do not reset any ops that we have already committed.
+                self.op - self.commit_min,
+                // The number of uncommitted ops cannot be more than the length of the pipeline.
+                // Do not reset any ops that we did not include in our do_view_change message.
+                config.pipeline_max,
+            );
 
-
-
-                op,
-                nacks,
-                threshold,
-            });
+            assert(uncanonical_op_count <= config.pipeline_max);
+            if (uncanonical_op_count == 0) return self.op;
 
-
-
+            // * When uncanonical_op_count = self.op - self.commit_min,
+            //   self.op - uncanonical_op_count = self.commit_min.
+            // * When uncanonical_op_count = config.pipeline_max,
+            //   config.pipeline_max < self.op - self.commit_min holds.
+            const canonical_op_max = self.op - uncanonical_op_count;
 
-
-
+            log.debug("{}: on_do_view_change: not canonical ops={}..{}", .{
+                self.replica,
+                canonical_op_max + 1,
+                self.op,
+            });
 
-
-
-
-
-            }
-        }
+            assert(canonical_op_max <= self.op);
+            assert(canonical_op_max >= self.commit_min);
+            assert(canonical_op_max + config.pipeline_max >= self.op);
+            return canonical_op_max;
         }
 
         /// Discards uncommitted ops during a view change from after and including `op`.
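
`op_canonical_max` above bounds the suspect suffix of the log by both the number of uncommitted ops and the pipeline depth. A worked sketch of that arithmetic, with hypothetical values (`pipeline_max = 8` stands in for `config.pipeline_max` and is not necessarily the real constant):

    const std = @import("std");
    const assert = std.debug.assert;

    pub fn main() void {
        const pipeline_max: u64 = 8; // stand-in for config.pipeline_max
        const commit_min: u64 = 20;
        const op: u64 = 25;

        // Ops beyond canonical_op_max are uncanonical: they may have been
        // removed or changed by a prior view and cannot yet be trusted.
        const uncanonical_op_count = std.math.min(op - commit_min, pipeline_max);
        const canonical_op_max = op - uncanonical_op_count;

        assert(uncanonical_op_count == 5); // bounded by op - commit_min here
        assert(canonical_op_max == commit_min); // nothing committed is reset
        std.debug.print("canonical_op_max={}\n", .{canonical_op_max});
    }

The min() is the safety argument in miniature: committed ops are never reset, and a replica never distrusts more ops than could possibly have been in flight through the pipeline.
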
@@ -2710,8 +3098,8 @@ pub fn Replica(
                 self.view,
             });
 
-            self.journal.remove_entries_from(op);
             self.op = op - 1;
+            self.journal.remove_entries_from(op);
 
             assert(self.journal.header_for_op(op) == null);
             assert(!self.journal.dirty.bit(slot));
@@ -2729,8 +3117,8 @@ pub fn Replica(
         }
 
         fn flush_loopback_queue(self: *Self) void {
-            // There are
-            // However, of these
+            // There are four cases where a replica will send a message to itself:
+            // However, of these four cases, all but one call send_message_to_replica().
             //
             // 1. In on_request(), the leader sends a synchronous prepare to itself, but this is
             //    done by calling on_prepare() directly, and subsequent prepare timeout retries will
@@ -2739,6 +3127,8 @@ pub fn Replica(
             //    asynchronous prepare_ok to itself.
             // 3. In on_start_view_change(), after receiving a quorum of start_view_change
             //    messages, the new leader sends a synchronous do_view_change to itself.
+            // 4. In start_view_as_the_new_leader(), the new leader sends itself a prepare_ok
+            //    message for each uncommitted message.
             if (self.loopback_queue) |message| {
                 defer self.message_bus.unref(message);
 
@@ -2891,10 +3281,10 @@ pub fn Replica(
 
             // Verify that the new request will fit in the WAL.
             // The message's op hasn't been assigned yet, but it will be `self.op + 1`.
-            if (self.op
+            if (self.op == self.op_checkpoint_trigger()) {
                 log.debug("{}: on_request: ignoring op={} (too far ahead, checkpoint={})", .{
                     self.replica,
-
+                    self.op + 1,
                     self.op_checkpoint,
                 });
                 return true;
@@ -2915,7 +3305,7 @@ pub fn Replica(
             assert(message.header.context == 0 or message.header.operation != .register);
             assert(message.header.request == 0 or message.header.operation != .register);
 
-            if (self.client_table.
+            if (self.client_table().get(message.header.client)) |entry| {
                 assert(entry.reply.header.command == .reply);
                 assert(entry.reply.header.client == message.header.client);
 
@@ -3105,6 +3495,81 @@ pub fn Replica(
             return false;
         }
 
+        fn is_repair(self: *const Self, message: *const Message) bool {
+            assert(message.header.command == .prepare);
+
+            if (self.status == .normal) {
+                if (message.header.view < self.view) return true;
+                if (message.header.view == self.view and message.header.op <= self.op) return true;
+            } else if (self.status == .view_change) {
+                if (message.header.view < self.view) return true;
+                // The view has already started or is newer.
+            }
+
+            return false;
+        }
+
+        /// Returns whether the replica is the leader for the current view.
+        /// This may be used only when the replica status is normal.
+        fn leader(self: *const Self) bool {
+            assert(self.status == .normal);
+            return self.leader_index(self.view) == self.replica;
+        }
+
+        /// Returns the index into the configuration of the leader for a given view.
+        fn leader_index(self: *const Self, view: u32) u8 {
+            return @intCast(u8, @mod(view, self.replica_count));
+        }
+
+        /// Advances `op` to where we need to be before `header` can be processed as a prepare.
+        ///
+        /// This function temporarily violates the "replica.op must exist in WAL" invariant.
+        fn jump_to_newer_op_in_normal_status(self: *Self, header: *const Header) void {
+            assert(self.status == .normal);
+            assert(self.follower());
+            assert(header.view == self.view);
+            assert(header.op > self.op + 1);
+            // We may have learned of a higher `commit_max` through a commit message before jumping
+            // to a newer op that is less than `commit_max` but greater than `commit_min`:
+            assert(header.op > self.commit_min);
+            // Never overwrite an op that still needs to be checkpointed.
+            assert(header.op <= self.op_checkpoint_trigger());
+
+            log.debug("{}: jump_to_newer_op: advancing: op={}..{} checksum={}..{}", .{
+                self.replica,
+                self.op,
+                header.op - 1,
+                self.journal.header_with_op(self.op).?.checksum,
+                header.parent,
+            });
+
+            self.op = header.op - 1;
+            assert(self.op >= self.commit_min);
+            assert(self.op + 1 == header.op);
+            assert(self.journal.header_with_op(self.op) == null);
+        }
+
+        fn message_body_as_headers(message: *const Message) []const Header {
+            assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
+            assert(message.header.command == .do_view_change or
+                message.header.command == .start_view or
+                message.header.command == .headers or
+                message.header.command == .recovery_response);
+
+            const headers = std.mem.bytesAsSlice(
+                Header,
+                message.buffer[@sizeOf(Header)..message.header.size],
+            );
+
+            for (headers[0 .. headers.len - 1]) |header, index| {
+                // Headers must be provided in reverse order for the sake of `repair_header()`.
+                // Otherwise, headers may never be repaired where the hash chain never connects.
+                assert(header.op > headers[index + 1].op);
+            }
+
+            return headers;
+        }
+
         /// Returns whether the highest known op is certain.
         ///
         /// After recovering the WAL, there are 2 possible outcomes:
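
The new `leader()`/`leader_index()` helpers make the round-robin leader schedule explicit: the leader of view `v` is replica `v mod replica_count`. A minimal sketch of that schedule:

    const std = @import("std");
    const assert = std.debug.assert;

    fn leader_index(view: u32, replica_count: u8) u8 {
        return @intCast(u8, @mod(view, replica_count));
    }

    pub fn main() void {
        // With 3 replicas, leadership rotates 0, 1, 2, 0, 1, ... as the view advances.
        assert(leader_index(0, 3) == 0);
        assert(leader_index(4, 3) == 1);
        assert(leader_index(5, 3) == 2);
        std.debug.print("leader of view 5 is replica {}\n", .{leader_index(5, 3)});
    }

Because the schedule is a pure function of the view number, every replica independently agrees on who leads each view without any extra coordination.
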
@@ -3169,70 +3634,83 @@ pub fn Replica(
             return true;
         }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        /// Returns the op that will be `op_checkpoint` after the next checkpoint.
+        ///
+        /// For a replica with journal_slot_count=8 and lsm_batch_multiple=2:
+        ///
+        ///   checkpoint() call      0   1   2   3
+        ///   op_checkpoint          0   5  11  17
+        ///   op_checkpoint_next     5  11  17  23
+        ///   op_checkpoint_trigger  7  13  19  25
+        ///
+        ///     commit log (ops)             │ write-ahead log (slots)
+        ///     0   4   8   2   6   0   4    │ 0---4---
+        ///   0 ─────✓·%                     │ 01234✓6%   initial log fill
+        ///   1 ───────────✓·%               │ 890✓2%45   first wrap of log
+        ///   2 ─────────────────✓·%         │ 6✓8%0123   second wrap of log
+        ///   3 ───────────────────────✓·%   │ 4%67890✓   third wrap of log
+        ///
+        ///   Legend:
+        ///
+        ///   ─/✓  op on disk at checkpoint
+        ///   ·/%  op in memory at checkpoint
+        ///     ✓  op_checkpoint
+        ///     %  op_checkpoint_trigger
+        ///
+        fn op_checkpoint_next(self: *const Self) u64 {
+            assert(self.op_checkpoint <= self.commit_min);
+            assert(self.op_checkpoint <= self.op);
+            assert(self.op_checkpoint == 0 or
+                (self.op_checkpoint + 1) % config.lsm_batch_multiple == 0);
 
-
-
-
-
-
+            const op = if (self.op_checkpoint == 0)
+                // First wrap: op_checkpoint_next = 8-2-1 = 5
+                config.journal_slot_count - config.lsm_batch_multiple - 1
+            else
+                // Second wrap: op_checkpoint_next = 5+8-2 = 11
+                // Third wrap: op_checkpoint_next = 11+8-2 = 17
+                self.op_checkpoint + config.journal_slot_count - config.lsm_batch_multiple;
+            assert((op + 1) % config.lsm_batch_multiple == 0);
+            // The checkpoint always advances.
+            assert(op > self.op_checkpoint);
+
+            return op;
         }
 
-        /// Returns the
-
-
+        /// Returns the next op that will trigger a checkpoint.
+        ///
+        /// Receiving and storing an op higher than `op_checkpoint_trigger()` is forbidden; doing so
+        /// would overwrite a message (or the slot of a message) that has not yet been committed and
+        /// checkpointed.
+        ///
+        /// See `op_checkpoint_next` for more detail.
+        fn op_checkpoint_trigger(self: *const Self) u64 {
+            return self.op_checkpoint_next() + config.lsm_batch_multiple;
         }
 
-        ///
-
-
-            assert(
-            assert(header.view == self.view);
-            assert(header.op > self.op + 1);
-            // We may have learned of a higher `commit_max` through a commit message before jumping
-            // to a newer op that is less than `commit_max` but greater than `commit_min`:
-            assert(header.op > self.commit_min);
-            // Never overwrite an op that still needs to be checkpointed.
-            assert(header.op - self.op_checkpoint < config.journal_slot_count);
+        /// Finds the header with the highest op number in a slice of headers from a replica.
+        /// The headers must be continuous, in reverse order, all connected, and with no gaps.
+        fn op_highest(headers: []const Header) u64 {
+            assert(headers.len > 0);
 
-
-
-
-            header.
-            self.journal.header_with_op(self.op).?.checksum,
-            header.parent,
-            });
+            for (headers) |header, index| {
+                assert(header.valid_checksum());
+                assert(header.invalid() == null);
+                assert(header.command == .prepare);
 
-
-
-
-
+                if (index > 0) {
+                    assert(header.op + 1 == headers[index - 1].op);
+                    assert(header.checksum == headers[index - 1].parent);
+                }
+            }
 
-
-            // TODO Assert message commands that we expect this to be called for.
-            assert(message.header.size > @sizeOf(Header)); // Body must contain at least one header.
-            return std.mem.bytesAsSlice(
-                Header,
-                message.buffer[@sizeOf(Header)..message.header.size],
-            );
+            return headers[0].op;
         }
 
         /// Panics if immediate neighbors in the same view would have a broken hash chain.
         /// Assumes gaps and does not require that a preceeds b.
         fn panic_if_hash_chain_would_break_in_the_same_view(
-            self: *Self,
+            self: *const Self,
             a: *const Header,
             b: *const Header,
         ) void {
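
The table in the `op_checkpoint_next` doc comment above follows directly from the two formulas in `op_checkpoint_next` and `op_checkpoint_trigger`. A sketch that recomputes it for `journal_slot_count = 8` and `lsm_batch_multiple = 2` (the values assumed by the comment, not necessarily the shipped config):

    const std = @import("std");
    const assert = std.debug.assert;

    const journal_slot_count: u64 = 8;
    const lsm_batch_multiple: u64 = 2;

    fn op_checkpoint_next(op_checkpoint: u64) u64 {
        return if (op_checkpoint == 0)
            journal_slot_count - lsm_batch_multiple - 1 // first wrap: 8-2-1 = 5
        else
            op_checkpoint + journal_slot_count - lsm_batch_multiple; // 5+6=11, 11+6=17, ...
    }

    fn op_checkpoint_trigger(op_checkpoint: u64) u64 {
        return op_checkpoint_next(op_checkpoint) + lsm_batch_multiple;
    }

    pub fn main() void {
        // Reproduces the table: op_checkpoint 0, 5, 11, 17 -> next 5, 11, 17, 23.
        var op_checkpoint: u64 = 0;
        const expect_next = [_]u64{ 5, 11, 17, 23 };
        const expect_trigger = [_]u64{ 7, 13, 19, 25 };
        for (expect_next) |next, i| {
            assert(op_checkpoint_next(op_checkpoint) == next);
            assert(op_checkpoint_trigger(op_checkpoint) == expect_trigger[i]);
            op_checkpoint = next;
        }
        std.debug.print("checkpoint table verified\n", .{});
    }

The `lsm_batch_multiple` slack between `op_checkpoint_next` and `op_checkpoint_trigger` is exactly the one bar of ops still sitting in the in-memory immutable table when the checkpoint fires.
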
@@ -3279,7 +3757,7 @@ pub fn Replica(
|
|
|
3279
3757
|
|
|
3280
3758
|
var op = self.commit_max + 1;
|
|
3281
3759
|
var parent = self.journal.header_with_op(self.commit_max).?.checksum;
|
|
3282
|
-
var iterator = self.pipeline.
|
|
3760
|
+
var iterator = self.pipeline.iterator_mutable();
|
|
3283
3761
|
while (iterator.next_ptr()) |prepare| {
|
|
3284
3762
|
assert(prepare.message.header.command == .prepare);
|
|
3285
3763
|
assert(prepare.message.header.op == op);
|
|
@@ -3380,10 +3858,7 @@ pub fn Replica(
|
|
|
3380
3858
|
|
|
3381
3859
|
// The replica repairs backwards from `commit_max`. But if `commit_max` is too high
|
|
3382
3860
|
// (>1 WAL ahead), then bound it such that uncommitted WAL entries are not overwritten.
|
|
3383
|
-
const commit_max_limit = std.math.min(
|
|
3384
|
-
self.commit_max,
|
|
3385
|
-
self.op_checkpoint + config.journal_slot_count,
|
|
3386
|
-
);
|
|
3861
|
+
const commit_max_limit = std.math.min(self.commit_max, self.op_checkpoint_trigger());
|
|
3387
3862
|
|
|
3388
3863
|
// Request outstanding committed prepares to advance our op number:
|
|
3389
3864
|
// This handles the case of an idle cluster, where a follower will not otherwise advance.
|
|
@@ -3460,13 +3935,12 @@ pub fn Replica(
|
|
|
3460
3935
|
// Commit ops, which may in turn discover faulty prepares and drive more repairs:
|
|
3461
3936
|
if (self.commit_min < self.commit_max) {
|
|
3462
3937
|
assert(self.replica_count > 1);
|
|
3463
|
-
self.
|
|
3938
|
+
self.commit_journal(self.commit_max);
|
|
3464
3939
|
return;
|
|
3465
3940
|
}
|
|
3466
3941
|
|
|
3467
3942
|
if (self.status == .view_change and self.leader_index(self.view) == self.replica) {
|
|
3468
3943
|
if (self.repair_pipeline_op() != null) return self.repair_pipeline();
|
|
3469
|
-
|
|
3470
3944
|
// Start the view as the new leader:
|
|
3471
3945
|
self.start_view_as_the_new_leader();
|
|
3472
3946
|
}
|
|
@@ -3505,6 +3979,9 @@ pub fn Replica(
|
|
|
3505
3979
|
/// with an older view number may be committed instead of an op with a newer view number:
|
|
3506
3980
|
/// http://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf.
|
|
3507
3981
|
///
|
|
3982
|
+
/// * Do not replace an op belonging to the current WAL wrap with an op belonging to a
|
|
3983
|
+
/// previous wrap. In other words, don't repair checkpointed ops.
|
|
3984
|
+
///
|
|
3508
3985
|
fn repair_header(self: *Self, header: *const Header) bool {
|
|
3509
3986
|
assert(header.valid_checksum());
|
|
3510
3987
|
assert(header.invalid() == null);
|
|
@@ -3517,145 +3994,121 @@ pub fn Replica(
|
|
|
3517
3994
|
}
|
|
3518
3995
|
|
|
3519
3996
|
if (header.op > self.op) {
|
|
3520
|
-
log.debug("{}: repair_header:
|
|
3997
|
+
log.debug("{}: repair_header: op={} checksum={} (advances hash chain head)", .{
|
|
3521
3998
|
self.replica,
|
|
3522
|
-
|
|
3999
|
+
header.op,
|
|
4000
|
+
header.checksum,
|
|
4001
|
+
});
|
|
4002
|
+
return false;
|
|
4003
|
+
} else if (header.op == self.op and !self.journal.has(header)) {
|
|
4004
|
+
assert(self.journal.header_with_op(self.op) != null);
|
|
4005
|
+
log.debug("{}: repair_header: op={} checksum={} (changes hash chain head)", .{
|
|
4006
|
+
self.replica,
|
|
4007
|
+
header.op,
|
|
4008
|
+
header.checksum,
|
|
3523
4009
|
});
|
|
3524
4010
|
return false;
|
|
3525
|
-
}
|
|
3526
|
-
|
|
3527
|
-
|
|
3528
|
-
|
|
3529
|
-
//
|
|
3530
|
-
// avoid overwriting any overlapping op.
|
|
4011
|
+
}
|
|
4012
|
+
|
|
4013
|
+
if (header.op <= self.op_checkpoint) {
|
|
4014
|
+
if (header.op == 0 and self.op_checkpoint == 0) {
|
|
4015
|
+
// Repairing the root op is allowed until the first checkpoint.
|
|
3531
4016
|
} else {
|
|
3532
|
-
|
|
4017
|
+
// Otherwise don't repair checkpointed ops, since their slots now belong to
|
|
4018
|
+
// the next wrap of the WAL.
|
|
4019
|
+
log.debug("{}: repair_header: false (precedes self.op_checkpoint={})", .{
|
|
3533
4020
|
self.replica,
|
|
3534
|
-
self.
|
|
4021
|
+
self.op_checkpoint,
|
|
3535
4022
|
});
|
|
3536
4023
|
return false;
|
|
3537
4024
|
}
|
|
3538
4025
|
}
|
|
3539
4026
|
|
|
3540
|
-
if (self.journal.
|
|
3541
|
-
assert(existing.op == header.op);
|
|
3542
|
-
|
|
3543
|
-
// Do not replace any existing op lightly as doing so may impair durability and even
|
|
3544
|
-
// violate correctness by undoing a prepare already acknowledged to the leader:
|
|
4027
|
+
if (self.journal.header_for_prepare(header)) |existing| {
|
|
3545
4028
|
if (existing.checksum == header.checksum) {
|
|
3546
|
-
|
|
3547
|
-
|
|
3548
|
-
log.debug("{}: repair_header: op={} false (checksum clean)", .{
|
|
4029
|
+
if (self.journal.has_clean(header)) {
|
|
4030
|
+
log.debug("{}: repair_header: op={} checksum={} (checksum clean)", .{
|
|
3549
4031
|
self.replica,
|
|
3550
4032
|
header.op,
|
|
4033
|
+
header.checksum,
|
|
3551
4034
|
});
|
|
3552
4035
|
return false;
|
|
4036
|
+
} else {
|
|
4037
|
+
log.debug("{}: repair_header: op={} checksum={} (checksum dirty)", .{
|
|
4038
|
+
self.replica,
|
|
4039
|
+
header.op,
|
|
4040
|
+
header.checksum,
|
|
4041
|
+
});
|
|
3553
4042
|
}
|
|
3554
|
-
|
|
3555
|
-
log.debug("{}: repair_header: op={} exists, checksum dirty", .{
|
|
3556
|
-
self.replica,
|
|
3557
|
-
header.op,
|
|
3558
|
-
});
|
|
3559
4043
|
} else if (existing.view == header.view) {
|
|
3560
4044
|
// The journal must have wrapped:
|
|
3561
|
-
// We expect that the same view and op
|
|
4045
|
+
// We expect that the same view and op would have had the same checksum.
|
|
3562
4046
|
assert(existing.op != header.op);
|
|
3563
|
-
|
|
3564
4047
|
if (existing.op > header.op) {
|
|
3565
|
-
log.debug("{}: repair_header: op={}
|
|
4048
|
+
log.debug("{}: repair_header: op={} checksum={} (same view, newer op)", .{
|
|
3566
4049
|
self.replica,
|
|
3567
4050
|
header.op,
|
|
4051
|
+
header.checksum,
|
|
3568
4052
|
});
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
log.debug("{}: repair_header: op={} exists, view has older op", .{
|
|
3573
|
-
self.replica,
|
|
3574
|
-
header.op,
|
|
3575
|
-
});
|
|
3576
|
-
} else {
|
|
3577
|
-
assert(existing.view != header.view);
|
|
3578
|
-
assert(existing.op == header.op or existing.op != header.op);
|
|
3579
|
-
|
|
3580
|
-
if (!self.repair_header_would_connect_hash_chain(header)) {
|
|
3581
|
-
// We cannot replace this op until we are sure that doing so would not
|
|
3582
|
-
// violate any prior commitments made to the leader.
|
|
3583
|
-
log.debug("{}: repair_header: op={} false (exists)", .{
|
|
4053
|
+
} else {
|
|
4054
|
+
log.debug("{}: repair_header: op={} checksum={} (same view, older op)", .{
|
|
3584
4055
|
self.replica,
|
|
3585
4056
|
header.op,
|
|
4057
|
+
header.checksum,
|
|
3586
4058
|
});
|
|
3587
|
-
return false;
|
|
3588
4059
|
}
|
|
4060
|
+
} else {
|
|
4061
|
+
assert(existing.view != header.view);
|
|
4062
|
+
assert(existing.op == header.op or existing.op != header.op);
|
|
3589
4063
|
|
|
3590
|
-
log.debug("{}: repair_header: op={}
|
|
4064
|
+
log.debug("{}: repair_header: op={} checksum={} (different view)", .{
|
|
3591
4065
|
self.replica,
|
|
3592
4066
|
header.op,
|
|
4067
|
+
header.checksum,
|
|
3593
4068
|
});
|
|
3594
4069
|
}
|
|
3595
4070
|
} else {
|
|
3596
|
-
log.debug("{}: repair_header: op={} gap", .{
|
|
3597
|
-
}
|
|
3598
|
-
|
|
3599
|
-
// Caveat: Do not repair an existing op or gap if doing so would break the hash chain:
|
|
3600
|
-
if (self.repair_header_would_break_hash_chain_with_next_entry(header)) {
|
|
3601
|
-
log.debug("{}: repair_header: op={} false (breaks hash chain)", .{
|
|
4071
|
+
log.debug("{}: repair_header: op={} checksum={} (gap)", .{
|
|
3602
4072
|
self.replica,
|
|
3603
4073
|
header.op,
|
|
4074
|
+
header.checksum,
|
|
3604
4075
|
});
|
|
3605
|
-
return false;
|
|
3606
4076
|
}
|
|
3607
4077
|
|
|
3608
|
-
// TODO Snapshots: Skip if this header is already snapshotted.
|
|
3609
|
-
|
|
3610
4078
|
assert(header.op < self.op or
|
|
3611
4079
|
self.journal.header_with_op(self.op).?.checksum == header.checksum);
|
|
3612
4080
|
|
|
3613
|
-
self.
|
|
3614
|
-
|
|
3615
|
-
|
|
3616
|
-
|
|
3617
|
-
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
|
|
3623
|
-
) bool {
|
|
3624
|
-
if (self.journal.previous_entry(header)) |previous| {
|
|
3625
|
-
self.panic_if_hash_chain_would_break_in_the_same_view(previous, header);
|
|
4081
|
+
if (!self.repair_header_would_connect_hash_chain(header)) {
|
|
4082
|
+
// We cannot replace this op until we are sure that this would not:
|
|
4083
|
+
// 1. undermine any prior prepare_ok guarantee made to the primary, and
|
|
4084
|
+
// 2. leak stale ops back into our in-memory headers (and so into a view change).
|
|
4085
|
+
log.debug("{}: repair_header: op={} checksum={} (disconnected from hash chain)", .{
|
|
4086
|
+
self.replica,
|
|
4087
|
+
header.op,
|
|
4088
|
+
header.checksum,
|
|
4089
|
+
});
|
|
4090
|
+
return false;
|
|
3626
4091
|
}
|
|
3627
4092
|
|
|
3628
|
-
if (self.
|
|
3629
|
-
self.
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
assert(header.view <= next.view);
|
|
3633
|
-
assert(header.op + 1 == next.op);
|
|
3634
|
-
// We don't break with `next` but this is no guarantee that `next` does not
|
|
3635
|
-
// break.
|
|
3636
|
-
return false;
|
|
3637
|
-
} else {
|
|
3638
|
-
// If the journal has wrapped, then err in favor of a break regardless of op
|
|
3639
|
-
// order:
|
|
3640
|
-
return true;
|
|
4093
|
+
if (header.op <= self.commit_min) {
|
|
4094
|
+
if (self.journal.header_with_op(header.op)) |existing| {
|
|
4095
|
+
// If we already committed this op, the repair must be the identical message.
|
|
4096
|
+
assert(header.checksum == existing.checksum);
|
|
3641
4097
|
}
|
|
3642
4098
|
}
|
|
3643
4099
|
|
|
3644
|
-
|
|
3645
|
-
return
|
|
4100
|
+
self.journal.set_header_as_dirty(header);
|
|
4101
|
+
return true;
|
|
3646
4102
|
}
|
|
3647
4103
|
|
|
3648
|
-
/// If we repair this header,
|
|
3649
|
-
///
|
|
3650
|
-
/// op.
|
|
4104
|
+
/// If we repair this header, would this connect the hash chain through to the latest op?
|
|
4105
|
+
/// This offers a strong guarantee that may be used to replace an existing op.
|
|
3651
4106
|
///
|
|
3652
4107
|
/// Here is an example of what could go wrong if we did not check for complete connection:
|
|
3653
4108
|
///
|
|
3654
4109
|
/// 1. We do a prepare that's going to be committed.
|
|
3655
|
-
/// 2. We do a stale prepare to the right
|
|
3656
|
-
///
|
|
3657
|
-
/// 3. We do another stale prepare that replaces the first op because it connects to the
|
|
3658
|
-
/// second.
|
|
4110
|
+
/// 2. We do a stale prepare to the right, ignoring the hash chain break to the left.
|
|
4111
|
+
/// 3. We do another stale prepare that replaces the first since it connects to the second.
|
|
3659
4112
|
///
|
|
3660
4113
|
/// This would violate our quorum replication commitment to the leader.
|
|
3661
4114
|
/// The mistake in this example was not that we ignored the break to the left, which we must
|
|
@@ -4086,6 +4539,55 @@ pub fn Replica(
             }
         }
 
+        /// The caller must ensure that the headers are trustworthy.
+        ///
+        /// Asserts that sequential ops are hash-chained. (Gaps are permitted.)
+        fn replace_headers(self: *Self, headers: []const Header) void {
+            for (headers) |*header, i| {
+                if (i > 0) {
+                    const next = &headers[i - 1];
+                    assert(next.view >= header.view);
+                    if (next.op == header.op + 1) {
+                        assert(next.parent == header.checksum);
+                    } else {
+                        assert(next.op > header.op);
+                    }
+                }
+
+                self.replace_header(header);
+            }
+        }
+
+        /// Replaces the header if the header is different and not already committed.
+        /// The caller must ensure that the header is trustworthy.
+        fn replace_header(self: *Self, header: *const Header) void {
+            assert(self.op_checkpoint <= self.commit_min);
+            assert(header.command == .prepare);
+            assert(header.op <= self.op); // Never advance the op.
+            assert(header.op <= self.op_checkpoint_trigger());
+
+            if (header.op <= self.commit_min) {
+                if (self.journal.header_with_op(header.op)) |existing_header| {
+                    assert(existing_header.checksum == header.checksum);
+                    return;
+                } else {
+                    if (header.op <= self.op_checkpoint) {
+                        // Never replace a checkpointed op — those slots are needed by the following
+                        // WAL wrap.
+                        return;
+                    } else {
+                        // If an op is committed but not checkpointed, we must still have the header.
+                        @panic("missing committed, uncheckpointed header");
+                    }
+                }
+            }
+
+            // Do not set an op as dirty if we already have it exactly, because that would:
+            // 1. trigger a repair and delay the view change, or worse,
+            // 2. prevent repairs to another replica when we have the op.
+            if (!self.journal.has(header)) self.journal.set_header_as_dirty(header);
+        }
+
         /// Replicates to the next replica in the configuration (until we get back to the leader):
         /// Replication starts and ends with the leader, we never forward back to the leader.
         /// Does not flood the network with prepares that have already committed.
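
The hash-chain precondition that the new `replace_headers` asserts (headers in descending op order, adjacent ops parent-linked, gaps allowed) can be exercised in isolation. A minimal sketch, again with a hypothetical `HeaderLite` standing in for the real `Header`:

    const std = @import("std");
    const assert = std.debug.assert;

    const HeaderLite = struct { op: u64, checksum: u64, parent: u64 };

    // Headers are in descending op order: headers[i - 1] is newer than headers[i].
    fn assert_sequential_ops_chained(headers: []const HeaderLite) void {
        for (headers) |header, i| {
            if (i > 0) {
                const next = headers[i - 1];
                if (next.op == header.op + 1) {
                    assert(next.parent == header.checksum); // Adjacent ops must chain.
                } else {
                    assert(next.op > header.op); // A gap is permitted.
                }
            }
        }
    }

    test "gaps are permitted, breaks are not" {
        assert_sequential_ops_chained(&[_]HeaderLite{
            .{ .op = 9, .checksum = 99, .parent = 77 }, // Gap between op 9 and op 7: allowed.
            .{ .op = 7, .checksum = 77, .parent = 66 },
            .{ .op = 6, .checksum = 66, .parent = 55 },
        });
    }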
@@ -4149,7 +4651,7 @@ pub fn Replica(
                 assert(replica < self.replica_count);
             }
 
-            counter
+            counter.* = quorum_counter_null;
             assert(counter.count() == 0);
 
             var replica: usize = 0;
@@ -4168,6 +4670,20 @@ pub fn Replica(
             self.nack_prepare_op = null;
         }
 
+        fn reset_quorum_prepare_ok(self: *Self) void {
+            // "prepare_ok"s from previous views are not valid, even if the pipeline entry is reused
+            // after a cycle of view changes. In other words, when a view change cycles around, so
+            // that the original primary becomes a primary of a new view, pipeline entries may be
+            // reused. However, the pipeline's prepare_ok quorums must not be reused, since the
+            // replicas that sent them may have swapped them out during a previous view change.
+            var iterator = self.pipeline.iterator_mutable();
+            while (iterator.next_ptr()) |prepare| {
+                prepare.ok_quorum_received = false;
+                prepare.ok_from_all_replicas = quorum_counter_null;
+                assert(prepare.ok_from_all_replicas.count() == 0);
+            }
+        }
+
         fn reset_quorum_start_view_change(self: *Self) void {
             self.reset_quorum_counter(&self.start_view_change_from_other_replicas);
             self.start_view_change_quorum = false;
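
For context on `quorum_counter_null` in the hunks above and below: the quorum counters appear to be bitsets keyed by replica index, though the actual type is defined elsewhere in this file. A self-contained sketch of the reset pattern using `std.StaticBitSet` (the `replica_count_max` constant is illustrative, not the real config value):

    const std = @import("std");

    const replica_count_max = 6;
    const QuorumCounter = std.StaticBitSet(replica_count_max);
    const quorum_counter_null = QuorumCounter.initEmpty();

    test "a reset counter records no prepare_oks" {
        var counter = QuorumCounter.initEmpty();
        counter.set(0); // prepare_ok from replica 0.
        counter.set(2); // prepare_ok from replica 2.
        try std.testing.expectEqual(@as(usize, 2), counter.count());

        counter = quorum_counter_null; // Reset, e.g. across a view change.
        try std.testing.expectEqual(@as(usize, 0), counter.count());
    }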
@@ -4296,8 +4812,15 @@ pub fn Replica(
             assert(message.header.command == .do_view_change);
             assert(message.header.view == self.view);
             assert(message.header.op == self.op);
-            assert(message.header.op ==
-
+            assert(message.header.op == message_body_as_headers(message)[0].op);
+            // Each replica must advertise its own commit number, so that the new primary can know
+            // which headers must be replaced in its log. Otherwise, a gap in the log may prevent
+            // the new primary from repairing its log, resulting in the log being forked if the new
+            // primary also discards uncommitted operations.
+            // It is also safe not to use `commit_max` here because the new primary will assume that
+            // operations after the highest `commit_min` may yet have been committed before the old
+            // primary crashed. The new primary will use the NACK protocol to be sure of a discard.
+            assert(message.header.commit == self.commit_min);
 
             self.send_message_to_replica(self.leader_index(self.view), message);
         }
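
The new comment above explains why each DVC advertises the sender's `commit_min`. On the receiving side, the new primary aggregates these without ever rewinding its own commit number (the larger hunk below does this via `set_op_and_commit_max`). A tiny sketch of that aggregation; the function name here is hypothetical:

    const std = @import("std");

    // Each DVC carries the sender's commit_min in its commit field; the new primary
    // takes the highest value it sees, but never rewinds below its own commit_max.
    fn commit_max_from_dvc_quorum(own_commit_max: u64, dvc_commit_mins: []const u64) u64 {
        var result = own_commit_max;
        for (dvc_commit_mins) |commit_min| {
            result = std.math.max(result, commit_min);
        }
        return result;
    }

    test "the commit number never backtracks" {
        try std.testing.expectEqual(@as(u64, 8), commit_max_from_dvc_quorum(8, &[_]u64{ 4, 6 }));
        try std.testing.expectEqual(@as(u64, 9), commit_max_from_dvc_quorum(8, &[_]u64{ 9, 6 }));
    }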
@@ -4389,6 +4912,7 @@ pub fn Replica(
                 .prepare_ok => {
                     assert(self.status == .normal);
                     assert(message.header.view == self.view);
+                    assert(message.header.op <= self.op_checkpoint_trigger());
                     // We must only ever send a prepare_ok to the latest leader of the active view:
                     // We must never straddle views by sending to a leader in an older view.
                     // Otherwise, we would be enabling a partitioned leader to commit.
@@ -4407,6 +4931,7 @@ pub fn Replica(
                     assert(message.header.view == self.view);
                     assert(message.header.replica == self.replica);
                     assert(message.header.op == self.op);
+                    assert(message.header.commit == self.commit_min);
                     assert(replica == self.leader_index(self.view));
                 },
                 .start_view => switch (self.status) {
@@ -4479,46 +5004,13 @@ pub fn Replica(
             }
         }
 
-
-        /// Searches only by op number to find the highest `self.op` for the replica.
-        fn set_latest_op(headers: []const Header, latest: *Header) void {
-            switch (latest.command) {
-                .reserved, .prepare => assert(latest.valid_checksum()),
-                else => unreachable,
-            }
-
-            for (headers) |header| {
-                assert(header.valid_checksum());
-                assert(header.invalid() == null);
-                assert(header.command == .prepare);
-
-                if (latest.command == .reserved or header.op > latest.op) {
-                    // We are simply trying to find the latest `self.op` in the replica's log.
-                    // We therefore do not compare views here.
-                    latest.* = header;
-                }
-            }
-        }
-
-        fn set_latest_op_and_k(
-            self: *Self,
-            latest: *const Header,
-            k: u64,
-            method: []const u8,
-        ) void {
+        fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
             assert(self.status == .view_change or self.status == .recovering);
             assert(self.journal.recovered);
-            assert(latest.valid_checksum());
-            assert(latest.invalid() == null);
-            assert(latest.command == .prepare);
-            assert(latest.cluster == self.cluster);
 
             switch (self.status) {
                 .normal => unreachable,
-                .view_change => {
-                    // The view may have started already, so we can have a prepare in the same view:
-                    assert(latest.view <= self.view);
-                },
+                .view_change => {},
                 .recovering => {
                     // The replica's view hasn't been set yet.
                     // It will be set shortly, when we transition to normal status.
@@ -4526,73 +5018,406 @@ pub fn Replica(
                 },
             }
 
-
-                self.replica,
-                method,
-                self.view,
-                self.op,
-                latest.op,
-                self.commit_max,
-                k,
-                latest.checksum,
-            });
-
-            // Uncommitted ops may not survive a view change so we must assert `latest.op` against
+            // Uncommitted ops may not survive a view change so we must assert `op` against
             // `commit_max` and not `self.op`. However, committed ops (`commit_max`) must survive:
-            assert(
-            assert(
-            assert(
-
-            //
-            //
-            //
-            //
-            // The intersection property only requires that all
-            // survive into the new view so that they can then be committed by the new leader.
-            // guarantees that if the old leader
-            // leader will also commit the operation.
-            if (
+            assert(op >= self.commit_max);
+            assert(op >= commit_max);
+            assert(op <= self.op_checkpoint_trigger());
+
+            // We expect that our commit numbers may also be greater even than `commit_max` because
+            // we may be the old leader joining towards the end of the view change and we may have
+            // committed `op` already.
+            // However, this is bounded by pipelining.
+            // The intersection property only requires that all possibly committed operations must
+            // survive into the new view so that they can then be committed by the new leader.
+            // This guarantees that if the old leader possibly committed the operation, then the
+            // new leader will also commit the operation.
+            if (commit_max < self.commit_max and self.commit_min == self.commit_max) {
                 log.debug("{}: {s}: k={} < commit_max={} and commit_min == commit_max", .{
                     self.replica,
                     method,
-
+                    commit_max,
                     self.commit_max,
                 });
             }
-
-            assert(
+
+            assert(commit_max >=
+                self.commit_max - std.math.min(config.pipeline_max, self.commit_max));
 
             assert(self.commit_min <= self.commit_max);
             assert(self.op >= self.commit_max or self.op < self.commit_max);
 
-
+            const previous_op = self.op;
+            const previous_commit_max = self.commit_max;
+
+            self.op = op;
+            self.journal.remove_entries_from(self.op + 1);
+
             // Crucially, we must never rewind `commit_max` (and then `commit_min`) because
             // `commit_min` represents what we have already applied to our state machine:
-            self.commit_max = std.math.max(self.commit_max,
+            self.commit_max = std.math.max(self.commit_max, commit_max);
 
             assert(self.commit_min <= self.commit_max);
-            assert(self.
+            assert(self.commit_max <= self.op);
 
-
-
-
-
-
-
-
+            log.debug("{}: {s}: view={} op={}..{} commit={}..{}", .{
+                self.replica,
+                method,
+                self.view,
+                previous_op,
+                self.op,
+                previous_commit_max,
+                self.commit_max,
+            });
+        }
+
+        /// Load the new view's headers from the DVC quorum.
+        ///
+        /// The iteration order of DVCs for repair does not impact the final result.
+        /// In other words, you can't end up in a situation with a DVC quorum like:
+        ///
+        ///   replica   headers       commit_min
+        ///   0         4 5 _ _ 8     4  (new leader; handling DVC quorum)
+        ///   1         4 _ 6 _ 8     4
+        ///   2         4 _ _ 7 8     4
+        ///   3         (4 5 6 7 8)   8  (didn't participate in view change)
+        ///   4         (4 5 6 7 8)   8  (didn't participate in view change)
+        ///
+        /// where the new leader's headers depend on which of replica 1's and 2's DVC is used
+        /// for repair before the other (i.e. whether they repair op 6 or 7 first).
+        ///
+        /// For the above case to occur, replicas 0, 1, and 2 must all share the highest `view_normal`.
+        /// And since they share the latest `view_normal`, ops 5, 6, and 7 were just installed by
+        /// `replace_header`, which is order-independent (it doesn't use the hash chain).
+        ///
+        /// (If replica 0's view_normal was greater than 1/2's, then replica 0 must have all
+        /// headers from previous views, which means 6 and 7 are from the current view. But since
+        /// replica 0 doesn't have 6/7, replicas 1/2 must share the latest view_normal. ∎)
+        fn set_log_from_do_view_change_messages(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
+            assert(self.do_view_change_quorum);
+
+            const do_view_change_head = self.do_view_change_quorum_head();
+            assert(do_view_change_head.view_normal >= self.view_normal);
+            assert(do_view_change_head.op >= self.commit_min);
+            assert(do_view_change_head.op >= do_view_change_head.commit_min_max);
+            assert(do_view_change_head.commit_min_max >= self.commit_min);
+
+            // The `prepare_timestamp` prevents a primary's own clock from running backwards.
+            // Therefore, `prepare_timestamp`:
+            // 1. is advanced if behind the cluster, but never reset if ahead of the cluster, i.e.
+            // 2. may not always reflect the timestamp of the latest prepared op, and
+            // 3. should be advanced before discarding the timestamps of any uncommitted headers.
+            if (self.state_machine.prepare_timestamp < do_view_change_head.timestamp) {
+                self.state_machine.prepare_timestamp = do_view_change_head.timestamp;
             }
 
-
-
-
+            const view_normal_canonical = do_view_change_head.view_normal;
+            // `op_canonical` must be computed before calling `set_op_and_commit_max()`, since
+            // that may change `replica.op`.
+            //
+            // Don't remove the uncanonical headers yet — even though the removed headers are
+            // a subset of the DVC headers, removing and then adding them back would cause clean
+            // headers to become dirty.
+            const op_canonical = self.op_canonical_max(view_normal_canonical);
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.op -| config.pipeline_max);
+            assert(op_canonical >= self.commit_min);
+
+            if (do_view_change_head.op > self.op_checkpoint_trigger()) {
+                // This replica is too far behind, i.e. the new `self.op` is too far ahead of the
+                // last checkpoint. If we wrap now, we overwrite un-checkpointed transfers in the WAL,
+                // precluding recovery.
+                //
+                // TODO State transfer. Currently this is unreachable because the
+                // leader won't checkpoint until all replicas are caught up.
+                unreachable;
+            }
+
+            self.set_op_and_commit_max(
+                do_view_change_head.op,
+                // `set_op_and_commit_max()` expects the highest commit_max that we know of.
+                // But DVCs include the replica's `commit_min`, not `commit_max`.
+                std.math.max(
+                    self.commit_max,
+                    do_view_change_head.commit_min_max,
+                ),
+                "on_do_view_change",
+            );
+            // The "`replica.op` exists" invariant may be broken until after the canonical DVC
+            // headers are installed.
+
+            // First, set all the canonical headers from the replica(s) with highest `view_normal`:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    // The view in which this replica's status was normal must be before this view.
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal < view_normal_canonical) continue;
+                    assert(view_normal == view_normal_canonical);
+
+                    const message_headers = message_body_as_headers(message);
+                    for (message_headers) |*header| {
+                        log.debug(
+                            "{}: on_do_view_change: canonical: replica={} op={} checksum={}",
+                            .{
+                                self.replica,
+                                message.header.replica,
+                                header.op,
+                                header.checksum,
+                            },
+                        );
+                    }
+                    self.replace_headers(message_headers);
+                }
+            }
+
+            // Since we used do_view_change_head to set the replica.op, it must have been loaded
+            // into the headers (if it wasn't present already).
+            assert(self.journal.header_with_op(self.op) != null);
+
+            // Now that the canonical headers are all in place, repair any other headers:
+            for (self.do_view_change_from_all_replicas) |received| {
+                if (received) |message| {
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (view_normal == view_normal_canonical) continue;
+                    assert(view_normal < view_normal_canonical);
+
+                    for (message_body_as_headers(message)) |*header| {
+                        // We must trust headers that other replicas have committed, because
+                        // repair_header() will not repair a header if the hash chain has a gap.
+                        if (header.op <= message.header.commit) {
+                            log.debug(
+                                "{}: on_do_view_change: committed: replica={} op={} checksum={}",
+                                .{
+                                    self.replica,
+                                    message.header.replica,
+                                    header.op,
+                                    header.checksum,
+                                },
+                            );
+                            self.replace_header(header);
+                        } else {
+                            _ = self.repair_header(header);
+                        }
+                    }
+                }
+            }
+
+            const op_max = self.do_view_change_op_max(op_canonical);
+            assert(op_max <= self.op);
+            assert(op_max >= self.commit_min);
+            if (op_max != self.op) {
+                log.debug("{}: set_log_from_do_view_change_messages: discard op={}..{}", .{
+                    self.replica,
+                    op_max + 1,
+                    self.op,
+                });
+                self.journal.remove_entries_from(op_max + 1);
+                self.op = op_max;
+            }
+            assert(self.journal.header_with_op(self.op) != null);
         }
 
-        fn
+        fn do_view_change_quorum_head(self: *const Self) struct {
+            /// The highest `view_normal` of any DVC.
+            ///
+            /// The headers bundled with DVCs with the highest `view_normal` are canonical, since
+            /// the replica has knowledge of previous view changes in which headers were replaced.
+            view_normal: u32,
+            /// The highest `commit_min` from any DVC (this is not a `commit_max`).
+            commit_min_max: u64,
+            /// The highest `op` from a DVC with the highest `view_normal`.
+            op: u64,
+            /// The highest timestamp from any DVC.
+            timestamp: u64,
+        } {
             assert(self.status == .view_change);
             assert(self.leader_index(self.view) == self.replica);
+            assert(self.replica_count > 1);
+            assert(self.start_view_change_quorum);
             assert(self.do_view_change_quorum);
+            assert(self.do_view_change_from_all_replicas[self.replica] != null);
 
-
+            var v: ?u32 = null; // The highest `view_normal` from any replica.
+            var n: ?u64 = null; // The highest `op` for the highest `view_normal` from any replica.
+            var k: ?u64 = null; // The highest `commit_min` from any replica.
+            var t: ?u64 = null; // The highest `timestamp` from any replica.
+
+            for (self.do_view_change_from_all_replicas) |received, replica| {
+                if (received) |message| {
+                    assert(message.header.command == .do_view_change);
+                    assert(message.header.cluster == self.cluster);
+                    assert(message.header.replica == replica);
+                    assert(message.header.view == self.view);
+                    assert(message.header.op >= message.header.commit);
+                    assert(message.header.op - message.header.commit <= config.journal_slot_count);
+
+                    // The view when this replica was last in normal status, which:
+                    // * may be higher than the view in any of the prepare headers.
+                    // * must be lower than the view of this view change.
+                    const view_normal = @intCast(u32, message.header.timestamp);
+                    assert(view_normal < message.header.view);
+
+                    if (replica == self.replica) {
+                        assert(view_normal == self.view_normal);
+                        assert(message.header.op == self.op);
+                        // We may have a newer commit than our DVC due to async commits (see below).
+                        assert(message.header.commit <= self.commit_min);
+                    }
+
+                    log.debug(
+                        "{}: on_do_view_change: " ++
+                            "replica={} view_normal={} op={} commit_min={}",
+                        .{
+                            self.replica,
+                            message.header.replica,
+                            view_normal,
+                            message.header.op,
+                            message.header.commit, // The `commit_min` of the replica.
+                        },
+                    );
+
+                    if (v == null or view_normal > v.?) {
+                        v = view_normal;
+                        n = message.header.op;
+                    } else if (view_normal == v.? and message.header.op > n.?) {
+                        n = message.header.op;
+                    }
+
+                    if (k == null or message.header.commit > k.?) k = message.header.commit;
+
+                    const message_headers = message_body_as_headers(message);
+                    if (t == null or t.? < message_headers[0].timestamp) {
+                        t = message_headers[0].timestamp;
+                    }
+                }
+            }
+
+            // Consider the case:
+            // 1. Start committing op=N…M.
+            // 2. Send `do_view_change` to self.
+            // 3. Finish committing op=N…M.
+            // 4. Remaining `do_view_change` messages arrive, completing the quorum.
+            // In this scenario, our own DVC's commit is `N-1`, but `commit_min=M`.
+            // Don't let the commit backtrack.
+            if (k.? < self.commit_min) {
+                assert(self.commit_min >
+                    self.do_view_change_from_all_replicas[self.replica].?.header.commit);
+                log.debug("{}: on_do_view_change: bump commit_min view={} commit={}..{}", .{
+                    self.replica,
+                    self.view,
+                    k.?,
+                    self.commit_min,
+                });
+                k = self.commit_min;
+            }
+
+            assert(v.? >= self.view_normal);
+            assert(k.? >= self.commit_min);
+
+            return .{
+                .view_normal = v.?,
+                .commit_min_max = k.?,
+                .op = n.?,
+                .timestamp = t.?,
+            };
+        }
+
+        /// Identify headers to discard during a view change before the primary starts the view.
+        /// This is required to maximize availability in the presence of storage faults.
+        /// Refer to the CTRL protocol from Protocol-Aware Recovery for Consensus-Based Storage.
+        ///
+        /// Returns the highest op that:
+        /// - precedes any hash chain breaks in the uncanonical headers, and
+        /// - precedes any gaps in the uncommitted headers.
+        ///
+        /// Breaks:
+        ///
+        /// If there is a hash chain break, none of the headers from the canonical DVCs replaced
+        /// the broken (leftover uncanonical) op.
+        /// Removing these is necessary for correctness and liveness, to ensure that
+        /// disconnected headers do not remain in place in lieu of gaps.
+        ///
+        /// Gaps:
+        ///
+        /// It is possible for the new primary to have done an op jump in a previous view, and
+        /// introduced a header gap for an op, which may have then been discarded by another primary
+        /// during a view change, before surviving into this view as a gap because our latest op was
+        /// set as the latest op for the quorum.
+        ///
+        /// In this case, it may be impossible for the new primary to repair the missing header as
+        /// the rest of the cluster may have already discarded it. We therefore iterate over our
+        /// uncommitted header gaps to discard any that may be impossible to repair.
+        ///
+        /// For example, if the old primary replicates ops=7,8,9 (all uncommitted) but only op=9 is
+        /// prepared on another replica before the old primary crashes, then this function finds a
+        /// gap for ops=7,8 and will attempt to discard ops 7,8,9.
+        fn do_view_change_op_max(self: *const Self, op_canonical: u64) u64 {
+            assert(self.replica_count > 1);
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
+            assert(!self.repair_timeout.ticking);
+            assert(self.op >= self.commit_max);
+            // At least one replica in the new quorum committed in the new replica.op's WAL wrap —
+            // wrapping implies a checkpoint (which implies a commit).
+            assert(self.op - self.commit_max <= config.journal_slot_count);
+            assert(self.op - self.commit_min <= config.journal_slot_count);
+
+            assert(op_canonical <= self.op);
+            assert(op_canonical >= self.commit_min);
+
+            // Any uncanonical ops remaining either:
+            // * connect to the hash chain on the right, or
+            // * do not connect on the right (hash chain break).
+            //
+            // If there is a hash chain break, none of the headers from the canonical DVCs replaced
+            // the broken op. It is truncated like a gap.
+            //
+            // Removing these is necessary for correctness and liveness, to ensure that
+            // disconnected headers do not remain in place in lieu of gaps.
+            const op_before_break = blk: {
+                var op: u64 = op_canonical;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op)) |header| {
+                        if (self.journal.header_with_op(op + 1)) |next| {
+                            // Broken hash chain.
+                            if (header.checksum != next.parent) break :blk op;
+                        }
+                    }
+                } else break :blk self.op;
+            };
+
+            // Find the beginning of the lowest gap.
+            //
+            // While iterating > commit_max does not in itself guarantee that an op is uncommitted
+            // (the old primary may have committed the op shortly before crashing), nevertheless,
+            // if it was committed it would have survived into the new view as a header, not a gap.
+            const op_before_gap = blk: {
+                // An op cannot be uncommitted if it is definitely outside the pipeline.
+                const op_committed = std.math.max(self.commit_max, self.op -| config.pipeline_max);
+                assert(op_committed <= self.op);
+
+                var op = op_committed;
+                while (op < self.op) : (op += 1) {
+                    if (self.journal.header_with_op(op + 1) == null) break :blk op;
+                } else break :blk self.op;
+            };
+
+            return std.math.min(op_before_break, op_before_gap);
+        }
+
+        fn start_view_as_the_new_leader(self: *Self) void {
+            assert(self.status == .view_change);
+            assert(self.leader_index(self.view) == self.replica);
+            assert(self.do_view_change_quorum);
             assert(!self.repairing_pipeline);
 
             assert(self.commit_min == self.commit_max);
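
The two truncation scans in `do_view_change_op_max` above can be condensed into a single pass for illustration. This standalone sketch models the journal as a slice of optional headers rather than the real `Journal` API, and returns the highest op that precedes the first gap or hash-chain break:

    const std = @import("std");

    const Slot = struct { checksum: u64, parent: u64 };

    fn op_before_break_or_gap(journal: []const ?Slot, op_start: usize, op_head: usize) usize {
        var op = op_start;
        while (op < op_head) : (op += 1) {
            const header = journal[op] orelse return op; // Gap: keep nothing beyond it.
            const next = journal[op + 1] orelse return op; // Gap.
            if (next.parent != header.checksum) return op; // Hash chain break.
        }
        return op_head;
    }

    test "ops beyond a gap are discarded" {
        // Ops 4 and 5 are chained; op 6 is missing; op 7 is present but unreachable.
        var journal = [_]?Slot{null} ** 8;
        journal[4] = .{ .checksum = 44, .parent = 33 };
        journal[5] = .{ .checksum = 55, .parent = 44 };
        journal[7] = .{ .checksum = 77, .parent = 66 };
        try std.testing.expectEqual(@as(usize, 5), op_before_break_or_gap(&journal, 4, 7));
    }

The real function keeps the two scans separate because they start from different ops (`op_canonical` for breaks, the committed boundary for gaps) and then takes the minimum of the two results.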
@@ -4630,6 +5455,9 @@ pub fn Replica(
         fn transition_to_normal_from_recovering_status(self: *Self, new_view: u32) void {
             assert(self.status == .recovering);
             assert(self.view == 0);
+            assert(!self.committing);
+            assert(self.replica_count > 1 or new_view == 0);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4679,6 +5507,7 @@ pub fn Replica(
             // For example, this could happen after a state transfer triggered by an op jump.
             assert(self.status == .view_change);
             assert(new_view >= self.view);
+            assert(self.journal.header_with_op(self.op) != null);
             self.view = new_view;
             self.view_normal = new_view;
             self.status = .normal;
@@ -4724,6 +5553,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();
 
             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4763,6 +5593,7 @@ pub fn Replica(
             self.reset_quorum_start_view_change();
             self.reset_quorum_do_view_change();
             self.reset_quorum_nack_prepare();
+            self.reset_quorum_prepare_ok();
 
             assert(self.start_view_change_quorum == false);
             assert(self.do_view_change_quorum == false);
@@ -4780,7 +5611,7 @@ pub fn Replica(
             assert(reply.header.commit > 0);
             assert(reply.header.request > 0);
 
-            if (self.client_table.
+            if (self.client_table().get(reply.header.client)) |entry| {
                 assert(entry.reply.header.command == .reply);
                 assert(entry.reply.header.context == 0);
                 assert(entry.reply.header.op == entry.reply.header.commit);
@@ -4868,12 +5699,16 @@ pub fn Replica(
         }
 
         fn verify_pipeline(self: *Self) void {
+            assert(self.status == .view_change);
+
             var op = self.commit_max + 1;
             var parent = self.journal.header_with_op(self.commit_max).?.checksum;
 
             var iterator = self.pipeline.iterator();
             while (iterator.next_ptr()) |prepare| {
                 assert(prepare.message.header.command == .prepare);
+                assert(!prepare.ok_quorum_received);
+                assert(prepare.ok_from_all_replicas.count() == 0);
 
                 log.debug("{}: verify_pipeline: op={} checksum={x} parent={x}", .{
                     self.replica,
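
A standalone sketch of the chain walk `verify_pipeline` performs, with a hypothetical `PrepareLite` array in place of the real pipeline FIFO: every queued prepare must carry the next contiguous op and link to its predecessor's checksum, starting from the prepare at `commit_max`:

    const std = @import("std");
    const assert = std.debug.assert;

    const PrepareLite = struct { op: u64, checksum: u64, parent: u64 };

    fn verify_pipeline_chain(
        commit_max: u64,
        commit_max_checksum: u64,
        pipeline: []const PrepareLite,
    ) void {
        var op = commit_max + 1;
        var parent = commit_max_checksum;
        for (pipeline) |prepare| {
            assert(prepare.op == op); // Ops are contiguous after commit_max...
            assert(prepare.parent == parent); // ...and parent-linked.
            op += 1;
            parent = prepare.checksum;
        }
    }

    test "the pipeline chains from commit_max" {
        verify_pipeline_chain(3, 33, &[_]PrepareLite{
            .{ .op = 4, .checksum = 44, .parent = 33 },
            .{ .op = 5, .checksum = 55, .parent = 44 },
        });
    }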
@@ -4971,6 +5806,12 @@ pub fn Replica(
             assert(message.header.view <= self.view);
             assert(message.header.op <= self.op);
 
+            if (message.header.op == self.op_checkpoint) {
+                assert(message.header.op == 0);
+            } else {
+                assert(message.header.op > self.op_checkpoint);
+            }
+
             if (!self.journal.has(message.header)) {
                 log.debug("{}: write_prepare: ignoring op={} checksum={} (header changed)", .{
                     self.replica,
@@ -5013,3 +5854,113 @@ pub fn Replica(
         }
     };
 }
+
+/// Initialize the TigerBeetle replica's data file.
+pub fn format(
+    comptime Storage: type,
+    allocator: std.mem.Allocator,
+    cluster: u32,
+    replica: u8,
+    storage: *Storage,
+    superblock: *vsr.SuperBlockType(Storage),
+) !void {
+    const ReplicaFormat = ReplicaFormatType(Storage);
+    var replica_format = ReplicaFormat{};
+
+    try replica_format.format_wal(allocator, cluster, storage);
+    assert(!replica_format.formatting);
+
+    superblock.format(
+        ReplicaFormat.format_superblock_callback,
+        &replica_format.superblock_context,
+        .{
+            .cluster = cluster,
+            .replica = replica,
+            .size_max = config.size_max, // This can later become a runtime arg, to cap storage.
+        },
+    );
+
+    replica_format.formatting = true;
+    while (replica_format.formatting) storage.tick();
+}
+
+fn ReplicaFormatType(comptime Storage: type) type {
+    const SuperBlock = vsr.SuperBlockType(Storage);
+    return struct {
+        const Self = @This();
+
+        formatting: bool = false,
+        superblock_context: SuperBlock.Context = undefined,
+        wal_write: Storage.Write = undefined,
+
+        fn format_wal(
+            self: *Self,
+            allocator: std.mem.Allocator,
+            cluster: u32,
+            storage: *Storage,
+        ) !void {
+            const header_zeroes = [_]u8{0} ** @sizeOf(Header);
+            const wal_write_size_max = 4 * 1024 * 1024;
+            assert(wal_write_size_max % config.sector_size == 0);
+
+            // Direct I/O requires the buffer to be sector-aligned.
+            var wal_buffer = try allocator.allocAdvanced(
+                u8,
+                config.sector_size,
+                wal_write_size_max,
+                .exact,
+            );
+            errdefer allocator.free(wal_buffer);
+
+            // The logical offset *within the WAL*.
+            var wal_offset: u64 = 0;
+            while (wal_offset < config.journal_size_max) {
+                const size = format_journal(cluster, wal_offset, wal_buffer);
+                assert(size % config.sector_size == 0);
+                assert(size > 0);
+
+                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
+                    if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
+                        // This is the (empty) body of a reserved or root Prepare.
+                    } else {
+                        // This is either a Prepare's header or a redundant header.
+                        assert(header.valid_checksum());
+                        if (header.op == 0) {
+                            assert(header.command == .prepare);
+                            assert(header.operation == .root);
+                        } else {
+                            assert(header.command == .reserved);
+                            assert(header.operation == .reserved);
+                        }
+                    }
+                }
+
+                storage.write_sectors(
+                    format_wal_sectors_callback,
+                    &self.wal_write,
+                    wal_buffer[0..size],
+                    .wal,
+                    wal_offset,
+                );
+                self.formatting = true;
+                while (self.formatting) storage.tick();
+                wal_offset += size;
+            }
+
+            // There is nothing left to write.
+            assert(format_journal(cluster, wal_offset, wal_buffer) == 0);
+        }
+
+        fn format_wal_sectors_callback(write: *Storage.Write) void {
+            const self = @fieldParentPtr(Self, "wal_write", write);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+
+        fn format_superblock_callback(superblock_context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+    };
+}
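
A small sketch of the offset arithmetic `format_wal` relies on, with illustrative constants in place of the real `config` values: writes cover the journal in sector-aligned chunks of at most `wal_write_size_max` bytes and terminate exactly at the journal size:

    const std = @import("std");
    const assert = std.debug.assert;

    const sector_size = 4096;
    const wal_write_size_max = 4 * 1024 * 1024;
    const journal_size_max = 64 * 1024 * 1024;

    test "the format loop covers the WAL exactly, in aligned writes" {
        comptime assert(wal_write_size_max % sector_size == 0);

        var wal_offset: u64 = 0;
        while (wal_offset < journal_size_max) {
            const size = std.math.min(wal_write_size_max, journal_size_max - wal_offset);
            assert(size > 0);
            assert(size % sector_size == 0);
            wal_offset += size;
        }
        try std.testing.expectEqual(@as(u64, journal_size_max), wal_offset);
    }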