tigerbeetle-node 0.8.1 → 0.9.143

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +584 -184
  2. package/dist/benchmark.js +59 -51
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/bin/aarch64-linux-gnu/client.node +0 -0
  5. package/dist/bin/aarch64-linux-musl/client.node +0 -0
  6. package/dist/bin/aarch64-macos/client.node +0 -0
  7. package/dist/bin/x86_64-linux-gnu/client.node +0 -0
  8. package/dist/bin/x86_64-linux-musl/client.node +0 -0
  9. package/dist/bin/x86_64-macos/client.node +0 -0
  10. package/dist/bin/x86_64-windows/client.node +0 -0
  11. package/dist/bindings.d.ts +141 -0
  12. package/dist/bindings.js +112 -0
  13. package/dist/bindings.js.map +1 -0
  14. package/dist/index.d.ts +2 -125
  15. package/dist/index.js +51 -101
  16. package/dist/index.js.map +1 -1
  17. package/dist/test.js +69 -55
  18. package/dist/test.js.map +1 -1
  19. package/package-lock.json +26 -0
  20. package/package.json +17 -28
  21. package/src/benchmark.ts +58 -49
  22. package/src/bindings.ts +631 -0
  23. package/src/index.ts +71 -163
  24. package/src/node.zig +169 -148
  25. package/src/test.ts +71 -57
  26. package/src/translate.zig +19 -36
  27. package/.yarn/releases/yarn-berry.cjs +0 -55
  28. package/.yarnrc.yml +0 -1
  29. package/scripts/download_node_headers.sh +0 -25
  30. package/scripts/postinstall.sh +0 -6
  31. package/src/tigerbeetle/scripts/benchmark.bat +0 -46
  32. package/src/tigerbeetle/scripts/benchmark.sh +0 -55
  33. package/src/tigerbeetle/scripts/install.sh +0 -6
  34. package/src/tigerbeetle/scripts/install_zig.bat +0 -109
  35. package/src/tigerbeetle/scripts/install_zig.sh +0 -84
  36. package/src/tigerbeetle/scripts/lint.zig +0 -199
  37. package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -39
  38. package/src/tigerbeetle/scripts/vopr.bat +0 -48
  39. package/src/tigerbeetle/scripts/vopr.sh +0 -33
  40. package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
  41. package/src/tigerbeetle/src/benchmark.zig +0 -290
  42. package/src/tigerbeetle/src/cli.zig +0 -244
  43. package/src/tigerbeetle/src/config.zig +0 -239
  44. package/src/tigerbeetle/src/demo.zig +0 -125
  45. package/src/tigerbeetle/src/demo_01_create_accounts.zig +0 -35
  46. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +0 -7
  47. package/src/tigerbeetle/src/demo_03_create_transfers.zig +0 -24
  48. package/src/tigerbeetle/src/demo_04_create_pending_transfers.zig +0 -61
  49. package/src/tigerbeetle/src/demo_05_post_pending_transfers.zig +0 -37
  50. package/src/tigerbeetle/src/demo_06_void_pending_transfers.zig +0 -24
  51. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +0 -7
  52. package/src/tigerbeetle/src/fifo.zig +0 -104
  53. package/src/tigerbeetle/src/io/benchmark.zig +0 -213
  54. package/src/tigerbeetle/src/io/darwin.zig +0 -793
  55. package/src/tigerbeetle/src/io/linux.zig +0 -1038
  56. package/src/tigerbeetle/src/io/test.zig +0 -643
  57. package/src/tigerbeetle/src/io/windows.zig +0 -1161
  58. package/src/tigerbeetle/src/io.zig +0 -34
  59. package/src/tigerbeetle/src/main.zig +0 -144
  60. package/src/tigerbeetle/src/message_bus.zig +0 -1000
  61. package/src/tigerbeetle/src/message_pool.zig +0 -142
  62. package/src/tigerbeetle/src/ring_buffer.zig +0 -289
  63. package/src/tigerbeetle/src/simulator.zig +0 -417
  64. package/src/tigerbeetle/src/state_machine.zig +0 -2470
  65. package/src/tigerbeetle/src/storage.zig +0 -308
  66. package/src/tigerbeetle/src/test/cluster.zig +0 -351
  67. package/src/tigerbeetle/src/test/message_bus.zig +0 -93
  68. package/src/tigerbeetle/src/test/network.zig +0 -179
  69. package/src/tigerbeetle/src/test/packet_simulator.zig +0 -387
  70. package/src/tigerbeetle/src/test/state_checker.zig +0 -145
  71. package/src/tigerbeetle/src/test/state_machine.zig +0 -76
  72. package/src/tigerbeetle/src/test/storage.zig +0 -438
  73. package/src/tigerbeetle/src/test/time.zig +0 -84
  74. package/src/tigerbeetle/src/tigerbeetle.zig +0 -222
  75. package/src/tigerbeetle/src/time.zig +0 -113
  76. package/src/tigerbeetle/src/unit_tests.zig +0 -14
  77. package/src/tigerbeetle/src/vsr/client.zig +0 -505
  78. package/src/tigerbeetle/src/vsr/clock.zig +0 -812
  79. package/src/tigerbeetle/src/vsr/journal.zig +0 -2293
  80. package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
  81. package/src/tigerbeetle/src/vsr/replica.zig +0 -5015
  82. package/src/tigerbeetle/src/vsr.zig +0 -1017
  83. package/yarn.lock +0 -42
@@ -1,1000 +0,0 @@
1
- const std = @import("std");
2
- const builtin = @import("builtin");
3
- const assert = std.debug.assert;
4
- const mem = std.mem;
5
- const os = std.os;
6
-
7
- const is_linux = builtin.target.os.tag == .linux;
8
-
9
- const config = @import("config.zig");
10
- const log = std.log.scoped(.message_bus);
11
-
12
- const vsr = @import("vsr.zig");
13
- const Header = vsr.Header;
14
-
15
- const RingBuffer = @import("ring_buffer.zig").RingBuffer;
16
- const IO = @import("io.zig").IO;
17
- const MessagePool = @import("message_pool.zig").MessagePool;
18
- const Message = MessagePool.Message;
19
-
20
- pub const MessageBusReplica = MessageBusImpl(.replica);
21
- pub const MessageBusClient = MessageBusImpl(.client);
22
-
23
- fn MessageBusImpl(comptime process_type: vsr.ProcessType) type {
24
- const SendQueue = RingBuffer(*Message, switch (process_type) {
25
- .replica => config.connection_send_queue_max_replica,
26
- // A client has at most 1 in-flight request, plus pings.
27
- .client => config.connection_send_queue_max_client,
28
- });
29
-
30
- const tcp_sndbuf = switch (process_type) {
31
- .replica => config.tcp_sndbuf_replica,
32
- .client => config.tcp_sndbuf_client,
33
- };
34
-
35
- return struct {
36
- const Self = @This();
37
-
38
- pool: MessagePool,
39
- io: *IO,
40
-
41
- cluster: u32,
42
- configuration: []std.net.Address,
43
-
44
- process: switch (process_type) {
45
- .replica => struct {
46
- replica: u8,
47
- /// The file descriptor for the process on which to accept connections.
48
- accept_fd: os.socket_t,
49
- accept_completion: IO.Completion = undefined,
50
- /// The connection reserved for the currently in progress accept operation.
51
- /// This is non-null exactly when an accept operation is submitted.
52
- accept_connection: ?*Connection = null,
53
- /// Map from client id to the currently active connection for that client.
54
- /// This is used to make lookup of client connections when sending messages
55
- /// efficient and to ensure old client connections are dropped if a new one
56
- /// is established.
57
- clients: std.AutoHashMapUnmanaged(u128, *Connection) = .{},
58
- },
59
- .client => void,
60
- },
61
-
62
- /// The callback to be called when a message is received. Use set_on_message() to set
63
- /// with type safety for the context pointer.
64
- on_message_callback: ?fn (context: ?*anyopaque, message: *Message) void = null,
65
- on_message_context: ?*anyopaque = null,
66
-
67
- /// This slice is allocated with a fixed size in the init function and never reallocated.
68
- connections: []Connection,
69
- /// Number of connections currently in use (i.e. connection.peer != .none).
70
- connections_used: usize = 0,
71
-
72
- /// Map from replica index to the currently active connection for that replica, if any.
73
- /// The entry for this process's own replica index (if the process is a replica) will always be null.
74
- replicas: []?*Connection,
75
- /// The number of outgoing `connect()` attempts for a given replica:
76
- /// Reset to zero after a successful `on_connect()`.
77
- replicas_connect_attempts: []u64,
78
-
79
- /// Used to apply jitter when calculating exponential backoff:
80
- /// Seeded with the process' replica index or client ID.
81
- prng: std.rand.DefaultPrng,
82
-
83
- /// Initialize the MessageBus for the given cluster, configuration and replica/client process.
84
- pub fn init(
85
- allocator: mem.Allocator,
86
- cluster: u32,
87
- configuration: []std.net.Address,
88
- process: switch (process_type) {
89
- .replica => u8,
90
- .client => u128,
91
- },
92
- io: *IO,
93
- ) !Self {
94
- // There must be enough connections for all replicas and at least one client.
95
- assert(config.connections_max > configuration.len);
96
-
97
- const connections = try allocator.alloc(Connection, config.connections_max);
98
- errdefer allocator.free(connections);
99
- mem.set(Connection, connections, .{});
100
-
101
- const replicas = try allocator.alloc(?*Connection, configuration.len);
102
- errdefer allocator.free(replicas);
103
- mem.set(?*Connection, replicas, null);
104
-
105
- const replicas_connect_attempts = try allocator.alloc(u64, configuration.len);
106
- errdefer allocator.free(replicas_connect_attempts);
107
- mem.set(u64, replicas_connect_attempts, 0);
108
-
109
- const prng_seed = switch (process_type) {
110
- .replica => process,
111
- .client => @truncate(u64, process),
112
- };
113
-
114
- var bus: Self = .{
115
- .pool = try MessagePool.init(allocator, process_type),
116
- .io = io,
117
- .cluster = cluster,
118
- .configuration = configuration,
119
- .process = switch (process_type) {
120
- .replica => .{
121
- .replica = process,
122
- .accept_fd = try init_tcp(io, configuration[process]),
123
- },
124
- .client => {},
125
- },
126
- .connections = connections,
127
- .replicas = replicas,
128
- .replicas_connect_attempts = replicas_connect_attempts,
129
- .prng = std.rand.DefaultPrng.init(prng_seed),
130
- };
131
-
132
- // Pre-allocate enough memory to hold all possible connections in the client map.
133
- if (process_type == .replica) {
134
- try bus.process.clients.ensureTotalCapacity(allocator, config.connections_max);
135
- }
136
-
137
- return bus;
138
- }
139
-
140
- pub fn set_on_message(
141
- bus: *Self,
142
- comptime Context: type,
143
- context: Context,
144
- comptime on_message: fn (context: Context, message: *Message) void,
145
- ) void {
146
- assert(bus.on_message_callback == null);
147
- assert(bus.on_message_context == null);
148
-
149
- bus.on_message_callback = struct {
150
- fn wrapper(_context: ?*anyopaque, message: *Message) void {
151
- on_message(@intToPtr(Context, @ptrToInt(_context)), message);
152
- }
153
- }.wrapper;
154
- bus.on_message_context = context;
155
- }
156
-
157
- /// TODO This is required by the Client.
158
- pub fn deinit(_: *Self) void {}
159
-
160
- fn init_tcp(io: *IO, address: std.net.Address) !os.socket_t {
161
- const fd = try io.open_socket(
162
- address.any.family,
163
- os.SOCK.STREAM,
164
- os.IPPROTO.TCP,
165
- );
166
- errdefer os.closeSocket(fd);
167
-
168
- const set = struct {
169
- fn set(_fd: os.socket_t, level: u32, option: u32, value: c_int) !void {
170
- try os.setsockopt(_fd, level, option, &mem.toBytes(value));
171
- }
172
- }.set;
173
-
174
- if (config.tcp_rcvbuf > 0) rcvbuf: {
175
- if (is_linux) {
176
- // Requires CAP_NET_ADMIN privilege (settle for SO_RCVBUF in case of an EPERM):
177
- if (set(fd, os.SOL.SOCKET, os.SO.RCVBUFFORCE, config.tcp_rcvbuf)) |_| {
178
- break :rcvbuf;
179
- } else |err| switch (err) {
180
- error.PermissionDenied => {},
181
- else => |e| return e,
182
- }
183
- }
184
- try set(fd, os.SOL.SOCKET, os.SO.RCVBUF, config.tcp_rcvbuf);
185
- }
186
-
187
- if (tcp_sndbuf > 0) sndbuf: {
188
- if (is_linux) {
189
- // Requires CAP_NET_ADMIN privilege (settle for SO_SNDBUF in case of an EPERM):
190
- if (set(fd, os.SOL.SOCKET, os.SO.SNDBUFFORCE, tcp_sndbuf)) |_| {
191
- break :sndbuf;
192
- } else |err| switch (err) {
193
- error.PermissionDenied => {},
194
- else => |e| return e,
195
- }
196
- }
197
- try set(fd, os.SOL.SOCKET, os.SO.SNDBUF, tcp_sndbuf);
198
- }
199
-
200
- if (config.tcp_keepalive) {
201
- try set(fd, os.SOL.SOCKET, os.SO.KEEPALIVE, 1);
202
- if (is_linux) {
203
- try set(fd, os.IPPROTO.TCP, os.TCP.KEEPIDLE, config.tcp_keepidle);
204
- try set(fd, os.IPPROTO.TCP, os.TCP.KEEPINTVL, config.tcp_keepintvl);
205
- try set(fd, os.IPPROTO.TCP, os.TCP.KEEPCNT, config.tcp_keepcnt);
206
- }
207
- }
208
-
209
- if (config.tcp_user_timeout > 0) {
210
- if (is_linux) {
211
- try set(fd, os.IPPROTO.TCP, os.TCP.USER_TIMEOUT, config.tcp_user_timeout);
212
- }
213
- }
214
-
215
- // Set tcp no-delay
216
- if (config.tcp_nodelay) {
217
- if (is_linux) {
218
- try set(fd, os.IPPROTO.TCP, os.TCP.NODELAY, 1);
219
- }
220
- }
221
-
222
- try set(fd, os.SOL.SOCKET, os.SO.REUSEADDR, 1);
223
- try os.bind(fd, &address.any, address.getOsSockLen());
224
- try os.listen(fd, config.tcp_backlog);
225
-
226
- return fd;
227
- }
228
-
229
- pub fn tick(bus: *Self) void {
230
- switch (process_type) {
231
- .replica => {
232
- // Each replica is responsible for connecting to replicas that come
233
- // after it in the configuration. This ensures that replicas never try
234
- // to connect to each other at the same time.
235
- var replica: u8 = bus.process.replica + 1;
236
- while (replica < bus.replicas.len) : (replica += 1) {
237
- bus.maybe_connect_to_replica(replica);
238
- }
239
-
240
- // Only replicas accept connections from other replicas and clients:
241
- bus.maybe_accept();
242
- },
243
- .client => {
244
- // The client connects to all replicas.
245
- var replica: u8 = 0;
246
- while (replica < bus.replicas.len) : (replica += 1) {
247
- bus.maybe_connect_to_replica(replica);
248
- }
249
- },
250
- }
251
- }
252
-
253
- fn maybe_connect_to_replica(bus: *Self, replica: u8) void {
254
- // We already have a connection to the given replica.
255
- if (bus.replicas[replica] != null) {
256
- assert(bus.connections_used > 0);
257
- return;
258
- }
259
-
260
- // Obtain a connection struct for our new replica connection.
261
- // If there is a free connection, use that. Otherwise drop
262
- // a client or unknown connection to make space. Prefer dropping
263
- // a client connection to an unknown one as the unknown peer may
264
- // be a replica. Since shutting a connection down does not happen
265
- // instantly, simply return after starting the shutdown and try again
266
- // on the next tick().
267
- for (bus.connections) |*connection| {
268
- if (connection.state == .free) {
269
- assert(connection.peer == .none);
270
- // This will immediately add the connection to bus.replicas,
271
- // or else will return early if a socket file descriptor cannot be obtained:
272
- // TODO See if we can clean this up to remove/expose the early return branch.
273
- connection.connect_to_replica(bus, replica);
274
- return;
275
- }
276
- }
277
-
278
- // If there is already a connection being shut down, no need to kill another.
279
- for (bus.connections) |*connection| {
280
- if (connection.state == .terminating) return;
281
- }
282
-
283
- log.info("all connections in use but not all replicas are connected, " ++
284
- "attempting to disconnect a client", .{});
285
- for (bus.connections) |*connection| {
286
- if (connection.peer == .client) {
287
- connection.terminate(bus, .shutdown);
288
- return;
289
- }
290
- }
291
-
292
- log.info("failed to disconnect a client as no peer was a known client, " ++
293
- "attempting to disconnect an unknown peer.", .{});
294
- for (bus.connections) |*connection| {
295
- if (connection.peer == .unknown) {
296
- connection.terminate(bus, .shutdown);
297
- return;
298
- }
299
- }
300
-
301
- // We assert that the max number of connections is greater
302
- // than the number of replicas in init().
303
- unreachable;
304
- }
305
-
306
- fn maybe_accept(bus: *Self) void {
307
- comptime assert(process_type == .replica);
308
-
309
- if (bus.process.accept_connection != null) return;
310
- // All connections are currently in use, do nothing.
311
- if (bus.connections_used == bus.connections.len) return;
312
- assert(bus.connections_used < bus.connections.len);
313
- bus.process.accept_connection = for (bus.connections) |*connection| {
314
- if (connection.state == .free) {
315
- assert(connection.peer == .none);
316
- connection.state = .accepting;
317
- break connection;
318
- }
319
- } else unreachable;
320
- bus.io.accept(
321
- *Self,
322
- bus,
323
- on_accept,
324
- &bus.process.accept_completion,
325
- bus.process.accept_fd,
326
- );
327
- }
328
-
329
- fn on_accept(
330
- bus: *Self,
331
- completion: *IO.Completion,
332
- result: IO.AcceptError!os.socket_t,
333
- ) void {
334
- _ = completion;
335
-
336
- comptime assert(process_type == .replica);
337
- assert(bus.process.accept_connection != null);
338
- defer bus.process.accept_connection = null;
339
- const fd = result catch |err| {
340
- bus.process.accept_connection.?.state = .free;
341
- // TODO: some errors should probably be fatal
342
- log.err("accept failed: {}", .{err});
343
- return;
344
- };
345
- bus.process.accept_connection.?.on_accept(bus, fd);
346
- }
347
-
348
- pub fn get_message(bus: *Self) *Message {
349
- return bus.pool.get_message();
350
- }
351
-
352
- pub fn unref(bus: *Self, message: *Message) void {
353
- bus.pool.unref(message);
354
- }
355
-
356
- pub fn send_message_to_replica(bus: *Self, replica: u8, message: *Message) void {
357
- // Messages sent by a replica to itself should never be passed to the message bus.
358
- if (process_type == .replica) assert(replica != bus.process.replica);
359
-
360
- if (bus.replicas[replica]) |connection| {
361
- connection.send_message(bus, message);
362
- } else {
363
- log.debug("no active connection to replica {}, " ++
364
- "dropping message with header {}", .{ replica, message.header });
365
- }
366
- }
367
-
368
- /// Try to send the message to the client with the given id.
369
- /// If the client is not currently connected, the message is silently dropped.
370
- pub fn send_message_to_client(bus: *Self, client_id: u128, message: *Message) void {
371
- comptime assert(process_type == .replica);
372
-
373
- if (bus.process.clients.get(client_id)) |connection| {
374
- connection.send_message(bus, message);
375
- } else {
376
- log.debug("no connection to client {x}", .{client_id});
377
- }
378
- }
379
-
380
- /// Used to send/receive messages to/from a client or fellow replica.
381
- const Connection = struct {
382
- /// The peer is determined by inspecting the first message header
383
- /// received.
384
- peer: union(enum) {
385
- /// No peer is currently connected.
386
- none: void,
387
- /// A connection is established but an unambiguous header has not yet been received.
388
- unknown: void,
389
- /// The peer is a client with the given id.
390
- client: u128,
391
- /// The peer is a replica with the given id.
392
- replica: u8,
393
- } = .none,
394
- state: enum {
395
- /// The connection is not in use, with peer set to `.none`.
396
- free,
397
- /// The connection has been reserved for an in progress accept operation,
398
- /// with peer set to `.none`.
399
- accepting,
400
- /// The peer is a replica and a connect operation has been started
401
- /// but not yet completed.
402
- connecting,
403
- /// The peer is fully connected and may be a client, replica, or unknown.
404
- connected,
405
- /// The connection is being terminated but cleanup has not yet finished.
406
- terminating,
407
- } = .free,
408
- /// This is guaranteed to be valid only while state is connected.
409
- /// It will be reset to IO.INVALID_SOCKET during the shutdown process and is always IO.INVALID_SOCKET if the
410
- /// connection is unused (i.e. peer == .none). We use IO.INVALID_SOCKET instead of undefined here
411
- /// for safety to ensure an error if the invalid value is ever used, instead of
412
- /// potentially performing an action on an active fd.
413
- fd: os.socket_t = IO.INVALID_SOCKET,
414
-
415
- /// This completion is used for all recv operations.
416
- /// It is also used for the initial connect when establishing a replica connection.
417
- recv_completion: IO.Completion = undefined,
418
- /// True exactly when the recv_completion has been submitted to the IO abstraction
419
- /// but the callback has not yet been run.
420
- recv_submitted: bool = false,
421
- /// The Message with the buffer passed to the kernel for recv operations.
422
- recv_message: ?*Message = null,
423
- /// The number of bytes in `recv_message` that have been received and need parsing.
424
- recv_progress: usize = 0,
425
- /// The number of bytes in `recv_message` that have been parsed.
426
- recv_parsed: usize = 0,
427
- /// True if we have already checked the header checksum of the message we
428
- /// are currently receiving/parsing.
429
- recv_checked_header: bool = false,
430
-
431
- /// This completion is used for all send operations.
432
- send_completion: IO.Completion = undefined,
433
- /// True exactly when the send_completion has been submitted to the IO abstraction
434
- /// but the callback has not yet been run.
435
- send_submitted: bool = false,
436
- /// Number of bytes of the current message that have already been sent.
437
- send_progress: usize = 0,
438
- /// The queue of messages to send to the client or replica peer.
439
- send_queue: SendQueue = .{},
440
-
441
- /// Attempt to connect to a replica.
442
- /// The slot in the `bus.replicas` slice is immediately reserved.
443
- /// Failure is silent and returns the connection to an unused state.
444
- pub fn connect_to_replica(connection: *Connection, bus: *Self, replica: u8) void {
445
- if (process_type == .replica) assert(replica != bus.process.replica);
446
-
447
- assert(connection.peer == .none);
448
- assert(connection.state == .free);
449
- assert(connection.fd == IO.INVALID_SOCKET);
450
-
451
- // The first replica's network address family determines the
452
- // family for all other replicas:
453
- const family = bus.configuration[0].any.family;
454
- connection.fd = bus.io.open_socket(family, os.SOCK.STREAM, os.IPPROTO.TCP) catch return;
455
- connection.peer = .{ .replica = replica };
456
- connection.state = .connecting;
457
- bus.connections_used += 1;
458
-
459
- assert(bus.replicas[replica] == null);
460
- bus.replicas[replica] = connection;
461
-
462
- var attempts = &bus.replicas_connect_attempts[replica];
463
- const ms = vsr.exponential_backoff_with_jitter(
464
- bus.prng.random(),
465
- config.connection_delay_min_ms,
466
- config.connection_delay_max_ms,
467
- attempts.*,
468
- );
469
- attempts.* += 1;
470
-
471
- log.debug("connecting to replica {} in {}ms...", .{ connection.peer.replica, ms });
472
-
473
- assert(!connection.recv_submitted);
474
- connection.recv_submitted = true;
475
-
476
- bus.io.timeout(
477
- *Self,
478
- bus,
479
- on_connect_with_exponential_backoff,
480
- // We use `recv_completion` for the connection `timeout()` and `connect()` calls
481
- &connection.recv_completion,
482
- @intCast(u63, ms * std.time.ns_per_ms),
483
- );
484
- }
485
-
486
- fn on_connect_with_exponential_backoff(
487
- bus: *Self,
488
- completion: *IO.Completion,
489
- result: IO.TimeoutError!void,
490
- ) void {
491
- const connection = @fieldParentPtr(Connection, "recv_completion", completion);
492
- assert(connection.recv_submitted);
493
- connection.recv_submitted = false;
494
- if (connection.state == .terminating) {
495
- connection.maybe_close(bus);
496
- return;
497
- }
498
- assert(connection.state == .connecting);
499
- result catch unreachable;
500
-
501
- log.debug("connecting to replica {}...", .{connection.peer.replica});
502
-
503
- assert(!connection.recv_submitted);
504
- connection.recv_submitted = true;
505
-
506
- bus.io.connect(
507
- *Self,
508
- bus,
509
- on_connect,
510
- // We use `recv_completion` for the connection `timeout()` and `connect()` calls
511
- &connection.recv_completion,
512
- connection.fd,
513
- bus.configuration[connection.peer.replica],
514
- );
515
- }
516
-
517
- fn on_connect(
518
- bus: *Self,
519
- completion: *IO.Completion,
520
- result: IO.ConnectError!void,
521
- ) void {
522
- const connection = @fieldParentPtr(Connection, "recv_completion", completion);
523
- assert(connection.recv_submitted);
524
- connection.recv_submitted = false;
525
-
526
- if (connection.state == .terminating) {
527
- connection.maybe_close(bus);
528
- return;
529
- }
530
- assert(connection.state == .connecting);
531
- connection.state = .connected;
532
-
533
- result catch |err| {
534
- log.err("error connecting to replica {}: {}", .{ connection.peer.replica, err });
535
- connection.terminate(bus, .close);
536
- return;
537
- };
538
-
539
- log.info("connected to replica {}", .{connection.peer.replica});
540
- bus.replicas_connect_attempts[connection.peer.replica] = 0;
541
-
542
- connection.assert_recv_send_initial_state(bus);
543
- // This will terminate the connection if there are no messages available:
544
- connection.get_recv_message_and_recv(bus);
545
- // A message may have been queued for sending while we were connecting:
546
- // TODO Should we relax recv() and send() to return if `connection.state != .connected`?
547
- if (connection.state == .connected) connection.send(bus);
548
- }
549
-
550
- /// Given a newly accepted fd, start receiving messages on it.
551
- /// Callbacks will be continuously re-registered until terminate() is called.
552
- pub fn on_accept(connection: *Connection, bus: *Self, fd: os.socket_t) void {
553
- assert(connection.peer == .none);
554
- assert(connection.state == .accepting);
555
- assert(connection.fd == IO.INVALID_SOCKET);
556
-
557
- connection.peer = .unknown;
558
- connection.state = .connected;
559
- connection.fd = fd;
560
- bus.connections_used += 1;
561
-
562
- connection.assert_recv_send_initial_state(bus);
563
- connection.get_recv_message_and_recv(bus);
564
- assert(connection.send_queue.empty());
565
- }
566
-
567
- fn assert_recv_send_initial_state(connection: *Connection, bus: *Self) void {
568
- assert(bus.connections_used > 0);
569
-
570
- assert(connection.peer == .unknown or connection.peer == .replica);
571
- assert(connection.state == .connected);
572
- assert(connection.fd != IO.INVALID_SOCKET);
573
-
574
- assert(connection.recv_submitted == false);
575
- assert(connection.recv_message == null);
576
- assert(connection.recv_progress == 0);
577
- assert(connection.recv_parsed == 0);
578
-
579
- assert(connection.send_submitted == false);
580
- assert(connection.send_progress == 0);
581
- }
582
-
583
- /// Add a message to the connection's send queue, starting a send operation
584
- /// if the queue was previously empty.
585
- pub fn send_message(connection: *Connection, bus: *Self, message: *Message) void {
586
- assert(connection.peer == .client or connection.peer == .replica);
587
- switch (connection.state) {
588
- .connected, .connecting => {},
589
- .terminating => return,
590
- .free, .accepting => unreachable,
591
- }
592
- if (connection.send_queue.full()) {
593
- log.info("message queue for peer {} full, dropping {s} message", .{
594
- connection.peer,
595
- @tagName(message.header.command),
596
- });
597
- return;
598
- }
599
- connection.send_queue.push_assume_capacity(message.ref());
600
- // If the connection has not yet been established we can't send yet.
601
- // Instead on_connect() will call send().
602
- if (connection.state == .connecting) {
603
- assert(connection.peer == .replica);
604
- return;
605
- }
606
- // If there is no send operation currently in progress, start one.
607
- if (!connection.send_submitted) connection.send(bus);
608
- }
609
-
610
- /// Clean up an active connection and reset it to its initial, unused, state.
611
- /// This reset does not happen instantly as currently in progress operations
612
- /// must first be stopped. The `how` arg allows the caller to specify if a
613
- /// shutdown syscall should be made or not before proceeding to wait for
614
- /// currently in progress operations to complete and close the socket.
615
- /// I'll be back! (when the Connection is reused after being fully closed)
616
- pub fn terminate(connection: *Connection, bus: *Self, how: enum { shutdown, close }) void {
617
- assert(connection.peer != .none);
618
- assert(connection.state != .free);
619
- assert(connection.fd != IO.INVALID_SOCKET);
620
- switch (how) {
621
- .shutdown => {
622
- // The shutdown syscall will cause currently in progress send/recv
623
- // operations to be gracefully closed while keeping the fd open.
624
- //
625
- // TODO: Investigate differences between shutdown() on Linux vs Darwin.
626
- // Especially how this interacts with our assumptions around pending I/O.
627
- os.shutdown(connection.fd, .both) catch |err| switch (err) {
628
- error.SocketNotConnected => {
629
- // This should only happen if we for some reason decide to terminate
630
- // a connection while a connect operation is in progress.
631
- // This is fine though, we simply continue with the logic below and
632
- // wait for the connect operation to finish.
633
-
634
- // TODO: This currently happens in other cases if the
635
- // connection was closed due to an error. We need to intelligently
636
- // decide whether to shutdown or close directly based on the error
637
- // before these assertions may be re-enabled.
638
-
639
- //assert(connection.state == .connecting);
640
- //assert(connection.recv_submitted);
641
- //assert(!connection.send_submitted);
642
- },
643
- // Ignore all the remaining errors for now
644
- error.ConnectionAborted, error.ConnectionResetByPeer, error.BlockingOperationInProgress, error.NetworkSubsystemFailed, error.SystemResources, error.Unexpected => {},
645
- };
646
- },
647
- .close => {},
648
- }
649
- assert(connection.state != .terminating);
650
- connection.state = .terminating;
651
- connection.maybe_close(bus);
652
- }
653
-
654
- fn parse_messages(connection: *Connection, bus: *Self) void {
655
- assert(connection.peer != .none);
656
- assert(connection.state == .connected);
657
- assert(connection.fd != IO.INVALID_SOCKET);
658
-
659
- while (connection.parse_message(bus)) |message| {
660
- defer bus.unref(message);
661
-
662
- connection.on_message(bus, message);
663
- }
664
- }
665
-
666
/// Try to parse one complete message from the unparsed part of the receive buffer.
/// Returns the message once its header and body are fully buffered and both checksums
/// are valid. Returns null if more bytes are needed (another receive is started) or if
/// the connection was terminated because the message was invalid.
fn parse_message(connection: *Connection, bus: *Self) ?*Message {
    const header_size = @sizeOf(Header);
    const unparsed = connection.recv_message.?.buffer[connection.recv_parsed..connection.recv_progress];

    // Not even a full header yet: keep receiving.
    if (unparsed.len < header_size) {
        connection.get_recv_message_and_recv(bus);
        return null;
    }

    const header = mem.bytesAsValue(Header, unparsed[0..header_size]);

    // The header is validated only once per message, even if the body arrives over
    // several receives.
    if (!connection.recv_checked_header) {
        if (!header.valid_checksum()) {
            log.err("invalid header checksum received from {}", .{connection.peer});
            connection.terminate(bus, .shutdown);
            return null;
        }

        const size_too_small = header.size < header_size;
        const size_too_large = header.size > config.message_size_max;
        if (size_too_small or size_too_large) {
            log.err("header with invalid size {d} received from peer {}", .{ header.size, connection.peer });
            connection.terminate(bus, .shutdown);
            return null;
        }

        if (header.cluster != bus.cluster) {
            log.err("message addressed to the wrong cluster: {}", .{header.cluster});
            connection.terminate(bus, .shutdown);
            return null;
        }

        switch (process_type) {
            // A replica can hear from a peer before it knows who that peer is, because
            // messages from clients and other replicas may be forwarded. So the peer is
            // learned from the header here.
            .replica => connection.maybe_set_peer(bus, header),
            // A client only connects to replicas, so the peer is already set.
            .client => assert(connection.peer == .replica),
        }

        connection.recv_checked_header = true;
    }

    // Header is good but the body is still incomplete: keep receiving.
    if (unparsed.len < header.size) {
        connection.get_recv_message_and_recv(bus);
        return null;
    }

    // The whole message is buffered. From here it is either delivered or the connection
    // is terminated, so the header flag is cleared for the next message.
    assert(connection.recv_checked_header);
    connection.recv_checked_header = false;

    const body = unparsed[header_size..header.size];
    if (!header.valid_checksum_body(body)) {
        log.err("invalid body checksum received from {}", .{connection.peer});
        connection.terminate(bus, .shutdown);
        return null;
    }

    connection.recv_parsed += header.size;

    // Zero-copy when this message is the only content of the receive buffer. Otherwise
    // copy it into its own message: later sector padding must not overwrite pipelined
    // messages, and every message needs its own `references` and `header` metadata.
    const buffer_holds_only_this_message = connection.recv_progress == header.size;
    if (buffer_holds_only_this_message) return connection.recv_message.?.ref();

    const copy = bus.get_message();
    mem.copy(u8, copy.buffer, unparsed[0..header.size]);
    return copy;
}
743
-
744
/// Hand a parsed message to the registered `on_message_callback`.
/// For `.request` and `.prepare` messages, first zero the bytes between the end of the
/// message and the next sector boundary.
fn on_message(connection: *Connection, bus: *Self, message: *Message) void {
    const size = message.header.size;
    const parsed = connection.recv_parsed;
    const progress = connection.recv_progress;

    // Check the parse cursor against the three ways a message can reach us.
    if (message == connection.recv_message.?) {
        // Zero-copy delivery: the message is the whole receive buffer.
        assert(parsed == size);
        assert(parsed == progress);
    } else if (parsed == size) {
        // Copied first message with more bytes buffered behind it.
        assert(parsed < progress);
    } else {
        // Copied message that was not first in the buffer.
        assert(parsed > size);
        assert(parsed <= progress);
    }

    switch (message.header.command) {
        .request, .prepare => {
            const sector_ceil = vsr.sector_ceil(size);
            if (size != sector_ceil) {
                assert(size < sector_ceil);
                assert(message.buffer.len == config.message_size_max + config.sector_size);
                mem.set(u8, message.buffer[size..sector_ceil], 0);
            }
        },
        else => {},
    }

    bus.on_message_callback.?(bus.on_message_context, message);
}
768
-
769
/// Replica-only: if this connection's peer is still `.unknown`, learn it from `header`.
/// Any existing connection registered for the same replica or client is terminated
/// (unless it is already terminating) and replaced by this one.
fn maybe_set_peer(connection: *Connection, bus: *Self, header: *const Header) void {
    comptime assert(process_type == .replica);

    assert(bus.cluster == header.cluster);
    assert(bus.connections_used > 0);

    assert(connection.peer != .none);
    assert(connection.state == .connected);
    assert(connection.fd != IO.INVALID_SOCKET);

    // The peer is already known: nothing to do.
    if (connection.peer != .unknown) return;

    switch (header.peer_type()) {
        .unknown => return,
        .replica => {
            connection.peer = .{ .replica = header.replica };
            const replica = connection.peer.replica;

            // Terminate any existing connection to this replica before taking its slot.
            if (bus.replicas[replica]) |previous| {
                assert(previous.peer == .replica);
                assert(previous.peer.replica == replica);
                assert(previous.state != .free);
                if (previous.state != .terminating) previous.terminate(bus, .shutdown);
            }
            bus.replicas[replica] = connection;
            log.info("connection from replica {}", .{connection.peer.replica});
        },
        .client => {
            connection.peer = .{ .client = header.client };
            const entry = bus.process.clients.getOrPutAssumeCapacity(header.client);

            // Terminate any existing connection to this client before taking its entry.
            if (entry.found_existing) {
                const previous = entry.value_ptr.*;
                assert(previous.peer == .client);
                assert(previous.peer.client == connection.peer.client);
                assert(previous.state == .connected or previous.state == .terminating);
                if (previous.state != .terminating) previous.terminate(bus, .shutdown);
            }
            entry.value_ptr.* = connection;
            log.info("connection from client {}", .{connection.peer.client});
        },
    }
}
811
-
812
/// Make sure the connection has a receive buffer whose unparsed data starts at offset
/// zero, then call `recv()`.
/// If there is already a `recv_message` and nothing in it has been parsed, `recv()` is
/// called directly. Otherwise a new message is obtained, any unparsed bytes are copied
/// to its front, the old message is released, and the new one becomes `recv_message`.
fn get_recv_message_and_recv(connection: *Connection, bus: *Self) void {
    const can_reuse_buffer = connection.recv_message != null and connection.recv_parsed == 0;
    if (can_reuse_buffer) {
        connection.recv(bus);
        return;
    }

    const fresh = bus.get_message();
    // Drop the local reference on exit; the connection takes its own via `ref()` below.
    defer bus.unref(fresh);

    if (connection.recv_message) |stale| {
        assert(connection.recv_progress > 0);
        assert(connection.recv_parsed > 0);

        // Move the unparsed bytes to the start of the new buffer.
        const leftover = stale.buffer[connection.recv_parsed..connection.recv_progress];
        mem.copy(u8, fresh.buffer, leftover);
        connection.recv_progress = leftover.len;
        connection.recv_parsed = 0;

        bus.unref(stale);
    } else {
        assert(connection.recv_progress == 0);
        assert(connection.recv_parsed == 0);
    }

    connection.recv_message = fresh.ref();
    connection.recv(bus);
}
844
-
845
/// Submit an asynchronous receive into the free part of `recv_message`'s buffer, from
/// `recv_progress` up to `config.message_size_max`. `on_recv` is the completion callback.
fn recv(connection: *Connection, bus: *Self) void {
    assert(connection.peer != .none);
    assert(connection.state == .connected);
    assert(connection.fd != IO.INVALID_SOCKET);

    // Only one receive may be in flight per connection.
    assert(!connection.recv_submitted);
    connection.recv_submitted = true;

    assert(connection.recv_progress < config.message_size_max);

    const free_space = connection.recv_message.?.buffer[connection.recv_progress..config.message_size_max];
    bus.io.recv(*Self, bus, on_recv, &connection.recv_completion, connection.fd, free_space);
}
864
-
865
/// Completion callback for `recv()`.
/// If the connection is terminating, continue closing it. Otherwise, terminate on a
/// receive error or a zero-byte read; on success, advance `recv_progress` and parse
/// whatever is buffered.
fn on_recv(bus: *Self, completion: *IO.Completion, result: IO.RecvError!usize) void {
    const connection = @fieldParentPtr(Connection, "recv_completion", completion);
    assert(connection.recv_submitted);
    connection.recv_submitted = false;

    if (connection.state == .terminating) {
        connection.maybe_close(bus);
        return;
    }
    assert(connection.state == .connected);

    if (result) |byte_count| {
        // A zero-byte read means the peer closed its side of the connection.
        if (byte_count == 0) {
            log.info("peer performed an orderly shutdown: {}", .{connection.peer});
            connection.terminate(bus, .close);
            return;
        }
        connection.recv_progress += byte_count;
        connection.parse_messages(bus);
    } else |err| {
        // TODO: maybe don't need to close on *every* error
        log.err("error receiving from {}: {}", .{ connection.peer, err });
        connection.terminate(bus, .shutdown);
    }
}
889
-
890
/// Submit an asynchronous send of the unsent remainder of the message at the head of
/// the send queue. Does nothing if the queue is empty. `on_send` is the completion
/// callback.
fn send(connection: *Connection, bus: *Self) void {
    assert(connection.peer == .client or connection.peer == .replica);
    assert(connection.state == .connected);
    assert(connection.fd != IO.INVALID_SOCKET);

    const head = connection.send_queue.head() orelse return;

    // Only one send may be in flight per connection.
    assert(!connection.send_submitted);
    connection.send_submitted = true;

    const unsent = head.buffer[connection.send_progress..head.header.size];
    bus.io.send(*Self, bus, on_send, &connection.send_completion, connection.fd, unsent);
}
906
-
907
/// Completion callback for `send()`.
/// If the connection is terminating, continue closing it. Otherwise, terminate on a
/// send error; on success, advance `send_progress`. Once the head message is fully
/// sent it is popped and released. `send()` is then called again, to either finish a
/// partially sent message or start the next one.
fn on_send(bus: *Self, completion: *IO.Completion, result: IO.SendError!usize) void {
    const connection = @fieldParentPtr(Connection, "send_completion", completion);
    assert(connection.send_submitted);
    connection.send_submitted = false;
    assert(connection.peer == .client or connection.peer == .replica);

    if (connection.state == .terminating) {
        connection.maybe_close(bus);
        return;
    }
    assert(connection.state == .connected);

    connection.send_progress += result catch |err| {
        // TODO: maybe don't need to close on *every* error
        // The peer may be a client or a replica (see the assert above), so the message
        // does not name a peer kind.
        log.err("error sending message to {}: {}", .{ connection.peer, err });
        connection.terminate(bus, .shutdown);
        return;
    };
    assert(connection.send_progress <= connection.send_queue.head().?.header.size);

    // If the message has been fully sent, move on to the next one.
    if (connection.send_progress == connection.send_queue.head().?.header.size) {
        connection.send_progress = 0;
        const message = connection.send_queue.pop().?;
        bus.unref(message);
    }
    connection.send(bus);
}
932
-
933
/// Close a terminating connection once no recv or send is in flight.
/// Returns without doing anything while either operation is still submitted. Otherwise
/// releases all queued send messages and the receive message, then submits an
/// asynchronous close with `on_close` as the completion callback.
fn maybe_close(connection: *Connection, bus: *Self) void {
    assert(connection.peer != .none);
    assert(connection.state == .terminating);

    // Closing while a recv or send is submitted to the kernel would race, so wait for
    // those completions first.
    const io_in_flight = connection.recv_submitted or connection.send_submitted;
    if (io_in_flight) return;

    // Set both flags; `on_close` asserts them.
    connection.send_submitted = true;
    connection.recv_submitted = true;

    // No I/O is in progress any more, so the messages can be released.
    while (connection.send_queue.pop()) |queued| {
        bus.unref(queued);
    }
    if (connection.recv_message) |received| {
        bus.unref(received);
        connection.recv_message = null;
    }

    assert(connection.fd != IO.INVALID_SOCKET);
    const fd = connection.fd;
    // The send completion can be reused because no send is in progress.
    bus.io.close(*Self, bus, on_close, &connection.send_completion, fd);
    connection.fd = IO.INVALID_SOCKET;
}
956
-
957
/// Completion callback for the close submitted by `maybe_close()`.
/// Logs any close error, then unregisters the connection from the bus and resets it to
/// its initial state. The connection is unregistered only if the bus entry for its peer
/// still points at this connection, since a newer connection may have replaced it.
fn on_close(bus: *Self, completion: *IO.Completion, result: IO.CloseError!void) void {
    const connection = @fieldParentPtr(Connection, "send_completion", completion);
    assert(connection.send_submitted);
    assert(connection.recv_submitted);

    assert(connection.peer != .none);
    assert(connection.state == .terminating);

    // Reset the connection to its initial state. This is deferred so that the error log
    // below can still read `connection.peer`.
    defer {
        assert(connection.recv_message == null);
        assert(connection.send_queue.empty());

        switch (connection.peer) {
            .none => unreachable,
            .unknown => {},
            .client => switch (process_type) {
                .replica => {
                    // `maybe_set_peer` may have overwritten this client's entry with a
                    // newer connection. Remove the entry only if it is still ours, so
                    // that the newer connection stays registered.
                    if (bus.process.clients.get(connection.peer.client)) |existing| {
                        if (existing == connection) {
                            assert(bus.process.clients.remove(connection.peer.client));
                        }
                    }
                    // Otherwise a newer connection leapfrogged this one and was already
                    // removed before we got here: nothing to do.
                },
                .client => unreachable,
            },
            .replica => {
                // A newer replica connection may have replaced this one:
                if (bus.replicas[connection.peer.replica] == connection) {
                    bus.replicas[connection.peer.replica] = null;
                }
                // Otherwise the slot holds a newer connection, or a newer connection
                // leapfrogged this one and was already cleared: nothing to do.
            },
        }
        bus.connections_used -= 1;
        connection.* = .{};
    }

    result catch |err| {
        log.err("error closing connection to {}: {}", .{ connection.peer, err });
        return;
    };
}
998
- };
999
- };
1000
- }