tigerbeetle-node 0.8.1 → 0.9.143

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +584 -184
  2. package/dist/benchmark.js +59 -51
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  5. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  6. package/dist/bin/aarch64-macos/client.node +0 -0
  7. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  8. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  9. package/dist/bin/x86_64-macos/client.node +0 -0
  10. package/dist/bin/x86_64-windows/client.node +0 -0
  11. package/dist/bindings.d.ts +141 -0
  12. package/dist/bindings.js +112 -0
  13. package/dist/bindings.js.map +1 -0
  14. package/dist/index.d.ts +2 -125
  15. package/dist/index.js +51 -101
  16. package/dist/index.js.map +1 -1
  17. package/dist/test.js +69 -55
  18. package/dist/test.js.map +1 -1
  19. package/package-lock.json +26 -0
  20. package/package.json +17 -28
  21. package/src/benchmark.ts +58 -49
  22. package/src/bindings.ts +631 -0
  23. package/src/index.ts +71 -163
  24. package/src/node.zig +169 -148
  25. package/src/test.ts +71 -57
  26. package/src/translate.zig +19 -36
  27. package/.yarn/releases/yarn-berry.cjs +0 -55
  28. package/.yarnrc.yml +0 -1
  29. package/scripts/download_node_headers.sh +0 -25
  30. package/scripts/postinstall.sh +0 -6
  31. package/src/tigerbeetle/scripts/benchmark.bat +0 -46
  32. package/src/tigerbeetle/scripts/benchmark.sh +0 -55
  33. package/src/tigerbeetle/scripts/install.sh +0 -6
  34. package/src/tigerbeetle/scripts/install_zig.bat +0 -109
  35. package/src/tigerbeetle/scripts/install_zig.sh +0 -84
  36. package/src/tigerbeetle/scripts/lint.zig +0 -199
  37. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -39
  38. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  39. package/src/tigerbeetle/scripts/vopr.sh +0 -33
  40. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  41. package/src/tigerbeetle/src/benchmark.zig +0 -290
  42. package/src/tigerbeetle/src/cli.zig +0 -244
  43. package/src/tigerbeetle/src/config.zig +0 -239
  44. package/src/tigerbeetle/src/demo.zig +0 -125
  45. package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
  46. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
  47. package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -24
  48. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
  49. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
  50. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
  51. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
  52. package/src/tigerbeetle/src/fifo.zig +0 -104
  53. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  54. package/src/tigerbeetle/src/io/darwin.zig +0 -793
  55. package/src/tigerbeetle/src/io/linux.zig +0 -1038
  56. package/src/tigerbeetle/src/io/test.zig +0 -643
  57. package/src/tigerbeetle/src/io/windows.zig +0 -1161
  58. package/src/tigerbeetle/src/io.zig +0 -34
  59. package/src/tigerbeetle/src/main.zig +0 -144
  60. package/src/tigerbeetle/src/message_bus.zig +0 -1000
  61. package/src/tigerbeetle/src/message_pool.zig +0 -142
  62. package/src/tigerbeetle/src/ring_buffer.zig +0 -289
  63. package/src/tigerbeetle/src/simulator.zig +0 -417
  64. package/src/tigerbeetle/src/state_machine.zig +0 -2470
  65. package/src/tigerbeetle/src/storage.zig +0 -308
  66. package/src/tigerbeetle/src/test/cluster.zig +0 -351
  67. package/src/tigerbeetle/src/test/message_bus.zig +0 -93
  68. package/src/tigerbeetle/src/test/network.zig +0 -179
  69. package/src/tigerbeetle/src/test/packet_simulator.zig +0 -387
  70. package/src/tigerbeetle/src/test/state_checker.zig +0 -145
  71. package/src/tigerbeetle/src/test/state_machine.zig +0 -76
  72. package/src/tigerbeetle/src/test/storage.zig +0 -438
  73. package/src/tigerbeetle/src/test/time.zig +0 -84
  74. package/src/tigerbeetle/src/tigerbeetle.zig +0 -222
  75. package/src/tigerbeetle/src/time.zig +0 -113
  76. package/src/tigerbeetle/src/unit_tests.zig +0 -14
  77. package/src/tigerbeetle/src/vsr/client.zig +0 -505
  78. package/src/tigerbeetle/src/vsr/clock.zig +0 -812
  79. package/src/tigerbeetle/src/vsr/journal.zig +0 -2293
  80. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  81. package/src/tigerbeetle/src/vsr/replica.zig +0 -5015
  82. package/src/tigerbeetle/src/vsr.zig +0 -1017
  83. package/yarn.lock +0 -42
@@ -1,308 +0,0 @@
1
- const std = @import("std");
2
- const builtin = @import("builtin");
3
- const os = std.os;
4
- const Allocator = std.mem.Allocator;
5
- const assert = std.debug.assert;
6
- const log = std.log.scoped(.storage);
7
-
8
- const IO = @import("io.zig").IO;
9
- const config = @import("config.zig");
10
- const vsr = @import("vsr.zig");
11
-
12
- pub const Storage = struct {
13
- /// See usage in Journal.write_sectors() for details.
14
- pub const synchronicity: enum {
15
- always_synchronous,
16
- always_asynchronous,
17
- } = .always_asynchronous;
18
-
19
- pub const Read = struct {
20
- completion: IO.Completion,
21
- callback: fn (read: *Storage.Read) void,
22
-
23
- /// The buffer to read into, re-sliced and re-assigned as we go, e.g. after partial reads.
24
- buffer: []u8,
25
-
26
- /// The position into the file descriptor from where we should read, also adjusted as we go.
27
- offset: u64,
28
-
29
- /// The maximum number of bytes to read per syscall. We use this to subdivide troublesome
30
- /// reads into smaller reads to work around latent sector errors (LSEs).
31
- target_max: u64,
32
-
33
- /// Returns a target slice into `buffer` to read into, capped by `target_max`.
34
- /// If the previous read was a partial read of physical sectors (e.g. 512 bytes) less than
35
- /// our logical sector size (e.g. 4 KiB), so that the remainder of the buffer is no longer
36
- /// aligned to a logical sector, then we further cap the slice to get back onto a logical
37
- /// sector boundary.
38
- fn target(read: *Read) []u8 {
39
- // A worked example of a partial read that leaves the rest of the buffer unaligned:
40
- // This could happen for non-Advanced Format disks with a physical sector of 512 bytes.
41
- // We want to read 8 KiB:
42
- // buffer.ptr = 0
43
- // buffer.len = 8192
44
- // ... and then experience a partial read of only 512 bytes:
45
- // buffer.ptr = 512
46
- // buffer.len = 7680
47
- // We can now see that `buffer.len` is no longer a sector multiple of 4 KiB and further
48
- // that we have 3584 bytes left of the partial sector read. If we subtract this amount
49
- // from our logical sector size of 4 KiB we get 512 bytes, which is the alignment error
50
- // that we need to subtract from `target_max` to get back onto the boundary.
51
- var max = read.target_max;
52
-
53
- const partial_sector_read_remainder = read.buffer.len % config.sector_size;
54
- if (partial_sector_read_remainder != 0) {
55
- // TODO log.debug() because this is interesting, and to ensure fuzz test coverage.
56
- const partial_sector_read = config.sector_size - partial_sector_read_remainder;
57
- max -= partial_sector_read;
58
- }
59
-
60
- return read.buffer[0..std.math.min(read.buffer.len, max)];
61
- }
62
- };
63
-
64
- pub const Write = struct {
65
- completion: IO.Completion,
66
- callback: fn (write: *Storage.Write) void,
67
- buffer: []const u8,
68
- offset: u64,
69
- };
70
-
71
- size: u64,
72
- fd: os.fd_t,
73
- io: *IO,
74
-
75
- pub fn init(size: u64, fd: os.fd_t, io: *IO) !Storage {
76
- return Storage{
77
- .size = size,
78
- .fd = fd,
79
- .io = io,
80
- };
81
- }
82
-
83
- pub fn deinit() void {}
84
-
85
- pub fn read_sectors(
86
- self: *Storage,
87
- callback: fn (read: *Storage.Read) void,
88
- read: *Storage.Read,
89
- buffer: []u8,
90
- offset: u64,
91
- ) void {
92
- assert_alignment(buffer, offset);
93
-
94
- read.* = .{
95
- .completion = undefined,
96
- .callback = callback,
97
- .buffer = buffer,
98
- .offset = offset,
99
- .target_max = buffer.len,
100
- };
101
-
102
- self.start_read(read, 0);
103
- }
104
-
105
- fn start_read(self: *Storage, read: *Storage.Read, bytes_read: usize) void {
106
- assert(bytes_read <= read.target().len);
107
-
108
- read.offset += bytes_read;
109
- read.buffer = read.buffer[bytes_read..];
110
-
111
- const target = read.target();
112
- if (target.len == 0) {
113
- read.callback(read);
114
- return;
115
- }
116
-
117
- self.assert_bounds(target, read.offset);
118
- self.io.read(
119
- *Storage,
120
- self,
121
- on_read,
122
- &read.completion,
123
- self.fd,
124
- target,
125
- read.offset,
126
- );
127
- }
128
-
129
- fn on_read(self: *Storage, completion: *IO.Completion, result: IO.ReadError!usize) void {
130
- const read = @fieldParentPtr(Storage.Read, "completion", completion);
131
-
132
- const bytes_read = result catch |err| switch (err) {
133
- error.InputOutput => {
134
- // The disk was unable to read some sectors (an internal CRC or hardware failure):
135
- // We may also have already experienced a partial unaligned read, reading less
136
- // physical sectors than the logical sector size, so we cannot expect `target.len`
137
- // to be an exact logical sector multiple.
138
- const target = read.target();
139
- if (target.len > config.sector_size) {
140
- // We tried to read more than a logical sector and failed.
141
- log.err("latent sector error: offset={}, subdividing read...", .{read.offset});
142
-
143
- // Divide the buffer in half and try to read each half separately:
144
- // This creates a recursive binary search for the sector(s) causing the error.
145
- // This is considerably slower than doing a single bulk read and by now we might
146
- // also have experienced the disk's read retry timeout (in seconds).
147
- // TODO Our docs must instruct on why and how to reduce disk firmware timeouts.
148
-
149
- // These lines both implement ceiling division e.g. `((3 - 1) / 2) + 1 == 2` and
150
- // require that the numerator is always greater than zero:
151
- assert(target.len > 0);
152
- const target_sectors = @divFloor(target.len - 1, config.sector_size) + 1;
153
- assert(target_sectors > 0);
154
- read.target_max = (@divFloor(target_sectors - 1, 2) + 1) * config.sector_size;
155
- assert(read.target_max >= config.sector_size);
156
-
157
- // Pass 0 for `bytes_read`, we want to retry the read with smaller `target_max`:
158
- self.start_read(read, 0);
159
- return;
160
- } else {
161
- // We tried to read at (or less than) logical sector granularity and failed.
162
- log.err("latent sector error: offset={}, zeroing sector...", .{read.offset});
163
-
164
- // Zero this logical sector which can't be read:
165
- // We will treat these EIO errors the same as a checksum failure.
166
- // TODO This could be an interesting avenue to explore further, whether
167
- // temporary or permanent EIO errors should be conflated with checksum failures.
168
- assert(target.len > 0);
169
- std.mem.set(u8, target, 0);
170
-
171
- // We could set `read.target_max` to `vsr.sector_ceil(read.buffer.len)` here
172
- // in order to restart our pseudo-binary search on the rest of the sectors to be
173
- // read, optimistically assuming that this is the last failing sector.
174
- // However, data corruption that causes EIO errors often has spatial locality.
175
- // Therefore, restarting our pseudo-binary search here might give us abysmal
176
- // performance in the (not uncommon) case of many successive failing sectors.
177
- self.start_read(read, target.len);
178
- return;
179
- }
180
- },
181
-
182
- error.WouldBlock,
183
- error.NotOpenForReading,
184
- error.ConnectionResetByPeer,
185
- error.Alignment,
186
- error.IsDir,
187
- error.SystemResources,
188
- error.Unseekable,
189
- error.Unexpected,
190
- => {
191
- log.err(
192
- "impossible read: offset={} buffer.len={} error={s}",
193
- .{ read.offset, read.buffer.len, @errorName(err) },
194
- );
195
- @panic("impossible read");
196
- },
197
- };
198
-
199
- if (bytes_read == 0) {
200
- // We tried to read more than there really is available to read.
201
- // In other words, we thought we could read beyond the end of the file descriptor.
202
- // This can happen if the data file inode `size` was truncated or corrupted.
203
- log.err(
204
- "short read: buffer.len={} offset={} bytes_read={}",
205
- .{ read.offset, read.buffer.len, bytes_read },
206
- );
207
- @panic("data file inode size was truncated or corrupted");
208
- }
209
-
210
- // If our target was limited to a single sector, perhaps because of a latent sector error,
211
- // then increase `target_max` according to AIMD now that we have read successfully and
212
- // hopefully cleared the faulty zone.
213
- // We assume that `target_max` may exceed `read.buffer.len` at any time.
214
- if (read.target_max == config.sector_size) {
215
- // TODO Add log.debug because this is interesting.
216
- read.target_max += config.sector_size;
217
- }
218
-
219
- self.start_read(read, bytes_read);
220
- }
221
-
222
- pub fn write_sectors(
223
- self: *Storage,
224
- callback: fn (write: *Storage.Write) void,
225
- write: *Storage.Write,
226
- buffer: []const u8,
227
- offset: u64,
228
- ) void {
229
- assert_alignment(buffer, offset);
230
-
231
- write.* = .{
232
- .completion = undefined,
233
- .callback = callback,
234
- .buffer = buffer,
235
- .offset = offset,
236
- };
237
-
238
- self.start_write(write);
239
- }
240
-
241
- fn start_write(self: *Storage, write: *Storage.Write) void {
242
- self.assert_bounds(write.buffer, write.offset);
243
- self.io.write(
244
- *Storage,
245
- self,
246
- on_write,
247
- &write.completion,
248
- self.fd,
249
- write.buffer,
250
- write.offset,
251
- );
252
- }
253
-
254
- fn on_write(self: *Storage, completion: *IO.Completion, result: IO.WriteError!usize) void {
255
- const write = @fieldParentPtr(Storage.Write, "completion", completion);
256
-
257
- const bytes_written = result catch |err| switch (err) {
258
- // We assume that the disk will attempt to reallocate a spare sector for any LSE.
259
- // TODO What if we receive a temporary EIO error because of a faulty cable?
260
- error.InputOutput => @panic("latent sector error: no spare sectors to reallocate"),
261
- // TODO: It seems like it might be possible for some filesystems to return ETIMEDOUT
262
- // here. Consider handling this without panicking.
263
- else => {
264
- log.err(
265
- "impossible write: offset={} buffer.len={} error={s}",
266
- .{ write.offset, write.buffer.len, @errorName(err) },
267
- );
268
- @panic("impossible write");
269
- },
270
- };
271
-
272
- if (bytes_written == 0) {
273
- // This should never happen if the kernel and filesystem are well behaved.
274
- // However, block devices are known to exhibit this behavior in the wild.
275
- // TODO: Consider retrying with a timeout if this panic proves problematic, and be
276
- // careful to avoid logging in a busy loop. Perhaps a better approach might be to
277
- // return wrote = null here and let the protocol retry at a higher layer where there is
278
- // more context available to decide on how important this is or whether to cancel.
279
- @panic("write operation returned 0 bytes written");
280
- }
281
-
282
- write.offset += bytes_written;
283
- write.buffer = write.buffer[bytes_written..];
284
-
285
- if (write.buffer.len == 0) {
286
- write.callback(write);
287
- return;
288
- }
289
-
290
- self.start_write(write);
291
- }
292
-
293
- /// Ensures that the read or write is aligned correctly for Direct I/O.
294
- /// If this is not the case, then the underlying syscall will return EINVAL.
295
- /// We check this only at the start of a read or write because the physical sector size may be
296
- /// less than our logical sector size so that partial IOs then leave us no longer aligned.
297
- fn assert_alignment(buffer: []const u8, offset: u64) void {
298
- assert(@ptrToInt(buffer.ptr) % config.sector_size == 0);
299
- assert(buffer.len % config.sector_size == 0);
300
- assert(offset % config.sector_size == 0);
301
- }
302
-
303
- /// Ensures that the read or write is within bounds and intends to read or write some bytes.
304
- fn assert_bounds(self: *Storage, buffer: []const u8, offset: u64) void {
305
- assert(buffer.len > 0);
306
- assert(offset + buffer.len <= self.size);
307
- }
308
- };
@@ -1,351 +0,0 @@
1
- const std = @import("std");
2
- const assert = std.debug.assert;
3
- const mem = std.mem;
4
-
5
- const config = @import("../config.zig");
6
-
7
- const StateChecker = @import("state_checker.zig").StateChecker;
8
-
9
- const message_pool = @import("../message_pool.zig");
10
- const MessagePool = message_pool.MessagePool;
11
- const Message = MessagePool.Message;
12
-
13
- const Network = @import("network.zig").Network;
14
- const NetworkOptions = @import("network.zig").NetworkOptions;
15
-
16
- pub const StateMachine = @import("state_machine.zig").StateMachine;
17
- const MessageBus = @import("message_bus.zig").MessageBus;
18
- const Storage = @import("storage.zig").Storage;
19
- const Time = @import("time.zig").Time;
20
-
21
- const vsr = @import("../vsr.zig");
22
- pub const Replica = vsr.Replica(StateMachine, MessageBus, Storage, Time);
23
- pub const Client = vsr.Client(StateMachine, MessageBus);
24
-
25
- pub const ClusterOptions = struct {
26
- cluster: u32,
27
-
28
- replica_count: u8,
29
- client_count: u8,
30
-
31
- seed: u64,
32
-
33
- network_options: NetworkOptions,
34
- storage_options: Storage.Options,
35
- health_options: HealthOptions,
36
- };
37
-
38
- pub const HealthOptions = struct {
39
- /// Probability per tick that a crash will occur.
40
- crash_probability: f64,
41
- /// Minimum duration of a crash.
42
- crash_stability: u32,
43
- /// Probability per tick that a crashed replica will recovery.
44
- restart_probability: f64,
45
- /// Minimum time a replica is up until it is crashed again.
46
- restart_stability: u32,
47
- };
48
-
49
- pub const ReplicaHealth = union(enum) {
50
- /// When >0, the replica cannot crash.
51
- /// When =0, the replica may crash.
52
- up: u32,
53
- /// When >0, this is the ticks remaining until recovery is possible.
54
- /// When =0, the replica may recover.
55
- down: u32,
56
- };
57
-
58
- pub const Cluster = struct {
59
- allocator: mem.Allocator,
60
- options: ClusterOptions,
61
-
62
- state_machines: []StateMachine,
63
- storages: []Storage,
64
- times: []Time,
65
- replicas: []Replica,
66
- health: []ReplicaHealth,
67
-
68
- clients: []Client,
69
- network: Network,
70
-
71
- // TODO: Initializing these fields in main() is a bit ugly
72
- state_checker: StateChecker = undefined,
73
- on_change_state: fn (replica: *Replica) void = undefined,
74
-
75
- pub fn create(allocator: mem.Allocator, prng: std.rand.Random, options: ClusterOptions) !*Cluster {
76
- assert(options.replica_count > 0);
77
- assert(options.health_options.crash_probability < 1.0);
78
- assert(options.health_options.crash_probability >= 0.0);
79
- assert(options.health_options.restart_probability < 1.0);
80
- assert(options.health_options.restart_probability >= 0.0);
81
-
82
- const cluster = try allocator.create(Cluster);
83
- errdefer allocator.destroy(cluster);
84
-
85
- const state_machines = try allocator.alloc(StateMachine, options.replica_count);
86
- errdefer allocator.free(state_machines);
87
-
88
- const storages = try allocator.alloc(Storage, options.replica_count);
89
- errdefer allocator.free(storages);
90
-
91
- const times = try allocator.alloc(Time, options.replica_count);
92
- errdefer allocator.free(times);
93
-
94
- const replicas = try allocator.alloc(Replica, options.replica_count);
95
- errdefer allocator.free(replicas);
96
-
97
- const health = try allocator.alloc(ReplicaHealth, options.replica_count);
98
- errdefer allocator.free(health);
99
- mem.set(ReplicaHealth, health, .{ .up = 0 });
100
-
101
- const clients = try allocator.alloc(Client, options.client_count);
102
- errdefer allocator.free(clients);
103
-
104
- var network = try Network.init(
105
- allocator,
106
- options.replica_count,
107
- options.client_count,
108
- options.network_options,
109
- );
110
- errdefer network.deinit();
111
-
112
- cluster.* = .{
113
- .allocator = allocator,
114
- .options = options,
115
- .state_machines = state_machines,
116
- .storages = storages,
117
- .times = times,
118
- .replicas = replicas,
119
- .health = health,
120
- .clients = clients,
121
- .network = network,
122
- };
123
-
124
- var buffer: [config.replicas_max]Storage.FaultyAreas = undefined;
125
- const faulty_areas = Storage.generate_faulty_areas(prng, config.journal_size_max, options.replica_count, &buffer);
126
- assert(faulty_areas.len == options.replica_count);
127
-
128
- for (cluster.replicas) |*replica, replica_index| {
129
- cluster.times[replica_index] = .{
130
- .resolution = config.tick_ms * std.time.ns_per_ms,
131
- .offset_type = .linear,
132
- .offset_coefficient_A = 0,
133
- .offset_coefficient_B = 0,
134
- };
135
- cluster.state_machines[replica_index] = StateMachine.init(options.seed);
136
- cluster.storages[replica_index] = try Storage.init(
137
- allocator,
138
- config.journal_size_max,
139
- options.storage_options,
140
- @intCast(u8, replica_index),
141
- faulty_areas[replica_index],
142
- );
143
- const message_bus = try cluster.network.init_message_bus(
144
- options.cluster,
145
- .{ .replica = @intCast(u8, replica_index) },
146
- );
147
-
148
- replica.* = try Replica.init(
149
- allocator,
150
- options.cluster,
151
- options.replica_count,
152
- @intCast(u8, replica_index),
153
- &cluster.times[replica_index],
154
- &cluster.storages[replica_index],
155
- message_bus,
156
- &cluster.state_machines[replica_index],
157
- );
158
- message_bus.set_on_message(*Replica, replica, Replica.on_message);
159
- }
160
-
161
- {
162
- // Format the WAL (equivalent to "tigerbeetle init ...").
163
- for (cluster.storages) |storage| {
164
- const write_size = vsr.format_journal(options.cluster, 0, storage.memory);
165
- assert(write_size == storage.memory.len);
166
- assert(write_size == config.journal_size_max);
167
- }
168
- }
169
-
170
- for (cluster.clients) |*client| {
171
- const client_id = prng.int(u128);
172
- const client_message_bus = try cluster.network.init_message_bus(
173
- options.cluster,
174
- .{ .client = client_id },
175
- );
176
- client.* = try Client.init(
177
- allocator,
178
- client_id,
179
- options.cluster,
180
- options.replica_count,
181
- client_message_bus,
182
- );
183
- client_message_bus.set_on_message(*Client, client, Client.on_message);
184
- }
185
-
186
- return cluster;
187
- }
188
-
189
- pub fn destroy(cluster: *Cluster) void {
190
- for (cluster.clients) |*client| client.deinit();
191
- cluster.allocator.free(cluster.clients);
192
-
193
- for (cluster.replicas) |*replica| replica.deinit(cluster.allocator);
194
- cluster.allocator.free(cluster.replicas);
195
- cluster.allocator.free(cluster.health);
196
-
197
- for (cluster.storages) |*storage| storage.deinit(cluster.allocator);
198
- cluster.allocator.free(cluster.storages);
199
-
200
- cluster.network.deinit();
201
-
202
- cluster.allocator.destroy(cluster);
203
- }
204
-
205
- /// Reset a replica to its initial state, simulating a random crash/panic.
206
- /// Leave the persistent storage untouched, and leave any currently
207
- /// inflight messages to/from the replica in the network.
208
- ///
209
- /// Returns whether the replica was crashed.
210
- pub fn crash_replica(cluster: *Cluster, replica_index: u8) !bool {
211
- const replica = &cluster.replicas[replica_index];
212
- if (replica.op == 0) {
213
- // Only crash when `replica.op > 0` — an empty WAL would skip recovery after a crash.
214
- return false;
215
- }
216
-
217
- // Ensure that the replica can eventually recover without this replica.
218
- // Verify that each op is recoverable by the current healthy cluster (minus the replica we
219
- // are trying to crash).
220
- // TODO Remove this workaround when VSR recovery protocol is disabled.
221
- if (cluster.options.replica_count != 1) {
222
- var parent: u128 = undefined;
223
- const cluster_op_max = op_max: {
224
- var v: ?u32 = null;
225
- var op_max: ?u64 = null;
226
- for (cluster.replicas) |other_replica, i| {
227
- if (cluster.health[i] == .down) continue;
228
- if (other_replica.status == .recovering) continue;
229
-
230
- if (v == null or other_replica.view_normal > v.? or
231
- (other_replica.view_normal == v.? and other_replica.op > op_max.?))
232
- {
233
- v = other_replica.view_normal;
234
- op_max = other_replica.op;
235
- parent = other_replica.journal.header_with_op(op_max.?).?.checksum;
236
- }
237
- }
238
- break :op_max op_max.?;
239
- };
240
-
241
- // TODO This workaround doesn't handle log wrapping correctly.
242
- assert(cluster_op_max < config.journal_slot_count);
243
-
244
- var op: u64 = cluster_op_max + 1;
245
- while (op > 0) {
246
- op -= 1;
247
-
248
- var cluster_op_known: bool = false;
249
- for (cluster.replicas) |other_replica, i| {
250
- // Ignore replicas that are ineligible to assist recovery.
251
- if (replica_index == i) continue;
252
- if (cluster.health[i] == .down) continue;
253
- if (other_replica.status == .recovering) continue;
254
-
255
- if (other_replica.journal.header_with_op_and_checksum(op, parent)) |header| {
256
- parent = header.parent;
257
- if (!other_replica.journal.dirty.bit(.{ .index = op })) {
258
- // The op is recoverable if this replica crashes.
259
- break;
260
- }
261
- cluster_op_known = true;
262
- }
263
- } else {
264
- if (op == cluster_op_max and !cluster_op_known) {
265
- // The replica can crash; it will be able to truncate the last op.
266
- } else {
267
- // The op isn't recoverable if this replica is crashed.
268
- return false;
269
- }
270
- }
271
- }
272
-
273
- // We can't crash this replica because without it we won't be able to repair a broken
274
- // hash chain.
275
- if (parent != 0) return false;
276
- }
277
-
278
- cluster.health[replica_index] = .{ .down = cluster.options.health_options.crash_stability };
279
-
280
- // Reset the storage before the replica so that pending writes can (partially) finish.
281
- cluster.storages[replica_index].reset();
282
- replica.deinit(cluster.allocator);
283
- cluster.state_machines[replica_index] = StateMachine.init(cluster.options.seed);
284
-
285
- // The message bus and network should be left alone, as messages
286
- // may still be inflight to/from this replica. However, we should
287
- // do a check to ensure that we aren't leaking any messages when
288
- // deinitializing the replica above.
289
- const packet_simulator = &cluster.network.packet_simulator;
290
- // The same message may be used for multiple network packets, so simply counting how
291
- // many packets are inflight from the replica is insufficient, we need to dedup them.
292
- var messages_in_network_set = std.AutoHashMap(*Message, void).init(cluster.allocator);
293
- defer messages_in_network_set.deinit();
294
-
295
- var target: u8 = 0;
296
- while (target < packet_simulator.options.node_count) : (target += 1) {
297
- const path = .{ .source = replica_index, .target = target };
298
- const queue = packet_simulator.path_queue(path);
299
- var it = queue.iterator();
300
- while (it.next()) |data| {
301
- try messages_in_network_set.put(data.packet.message, {});
302
- }
303
- }
304
-
305
- const messages_in_network = messages_in_network_set.count();
306
-
307
- var messages_in_pool: usize = 0;
308
- const message_bus = cluster.network.get_message_bus(.{ .replica = replica_index });
309
- {
310
- var it = message_bus.pool.free_list;
311
- while (it) |message| : (it = message.next) messages_in_pool += 1;
312
- }
313
-
314
- const total_messages = message_pool.messages_max_replica;
315
- assert(messages_in_network + messages_in_pool == total_messages);
316
-
317
- replica.* = try Replica.init(
318
- cluster.allocator,
319
- cluster.options.cluster,
320
- cluster.options.replica_count,
321
- @intCast(u8, replica_index),
322
- &cluster.times[replica_index],
323
- &cluster.storages[replica_index],
324
- message_bus,
325
- &cluster.state_machines[replica_index],
326
- );
327
- message_bus.set_on_message(*Replica, replica, Replica.on_message);
328
- replica.on_change_state = cluster.on_change_state;
329
- return true;
330
- }
331
-
332
- /// Returns the number of replicas capable of helping a crashed node recover (i.e. with
333
- /// replica.status=normal).
334
- pub fn replica_normal_count(cluster: *Cluster) u8 {
335
- var count: u8 = 0;
336
- for (cluster.replicas) |*replica| {
337
- if (replica.status == .normal) count += 1;
338
- }
339
- return count;
340
- }
341
-
342
- pub fn replica_up_count(cluster: *const Cluster) u8 {
343
- var count: u8 = 0;
344
- for (cluster.health) |health| {
345
- if (health == .up) {
346
- count += 1;
347
- }
348
- }
349
- return count;
350
- }
351
- };