tigerbeetle-node 0.11.13 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/aarch64-linux-gnu/client.node +0 -0
- package/dist/bin/aarch64-linux-musl/client.node +0 -0
- package/dist/bin/aarch64-macos/client.node +0 -0
- package/dist/bin/x86_64-linux-gnu/client.node +0 -0
- package/dist/bin/x86_64-linux-musl/client.node +0 -0
- package/dist/bin/x86_64-macos/client.node +0 -0
- package/dist/index.js +33 -1
- package/dist/index.js.map +1 -1
- package/package-lock.json +66 -0
- package/package.json +6 -16
- package/src/index.ts +56 -1
- package/src/node.zig +9 -9
- package/dist/.client.node.sha256 +0 -1
- package/scripts/build_lib.sh +0 -61
- package/scripts/download_node_headers.sh +0 -32
- package/src/tigerbeetle/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/benchmark.bat +0 -55
- package/src/tigerbeetle/scripts/scripts/benchmark.sh +0 -66
- package/src/tigerbeetle/scripts/scripts/confirm_image.sh +0 -44
- package/src/tigerbeetle/scripts/scripts/fail_on_diff.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/fuzz_loop.sh +0 -15
- package/src/tigerbeetle/scripts/scripts/fuzz_loop_hash_log.sh +0 -12
- package/src/tigerbeetle/scripts/scripts/fuzz_unique_errors.sh +0 -7
- package/src/tigerbeetle/scripts/scripts/install.bat +0 -7
- package/src/tigerbeetle/scripts/scripts/install.sh +0 -21
- package/src/tigerbeetle/scripts/scripts/install_zig.bat +0 -113
- package/src/tigerbeetle/scripts/scripts/install_zig.sh +0 -90
- package/src/tigerbeetle/scripts/scripts/lint.zig +0 -199
- package/src/tigerbeetle/scripts/scripts/pre-commit.sh +0 -9
- package/src/tigerbeetle/scripts/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/scripts/shellcheck.sh +0 -5
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +0 -10
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +0 -14
- package/src/tigerbeetle/scripts/upgrade_ubuntu_kernel.sh +0 -48
- package/src/tigerbeetle/scripts/validate_docs.sh +0 -23
- package/src/tigerbeetle/scripts/vr_state_enumerate +0 -46
- package/src/tigerbeetle/src/benchmark.zig +0 -336
- package/src/tigerbeetle/src/config.zig +0 -233
- package/src/tigerbeetle/src/constants.zig +0 -428
- package/src/tigerbeetle/src/ewah.zig +0 -286
- package/src/tigerbeetle/src/ewah_benchmark.zig +0 -120
- package/src/tigerbeetle/src/ewah_fuzz.zig +0 -130
- package/src/tigerbeetle/src/fifo.zig +0 -120
- package/src/tigerbeetle/src/io/benchmark.zig +0 -213
- package/src/tigerbeetle/src/io/darwin.zig +0 -814
- package/src/tigerbeetle/src/io/linux.zig +0 -1071
- package/src/tigerbeetle/src/io/test.zig +0 -643
- package/src/tigerbeetle/src/io/windows.zig +0 -1183
- package/src/tigerbeetle/src/io.zig +0 -34
- package/src/tigerbeetle/src/iops.zig +0 -107
- package/src/tigerbeetle/src/lsm/README.md +0 -308
- package/src/tigerbeetle/src/lsm/binary_search.zig +0 -341
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +0 -125
- package/src/tigerbeetle/src/lsm/compaction.zig +0 -603
- package/src/tigerbeetle/src/lsm/composite_key.zig +0 -77
- package/src/tigerbeetle/src/lsm/direction.zig +0 -11
- package/src/tigerbeetle/src/lsm/eytzinger.zig +0 -587
- package/src/tigerbeetle/src/lsm/eytzinger_benchmark.zig +0 -330
- package/src/tigerbeetle/src/lsm/forest.zig +0 -205
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +0 -450
- package/src/tigerbeetle/src/lsm/grid.zig +0 -573
- package/src/tigerbeetle/src/lsm/groove.zig +0 -1036
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +0 -474
- package/src/tigerbeetle/src/lsm/level_iterator.zig +0 -332
- package/src/tigerbeetle/src/lsm/manifest.zig +0 -617
- package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -878
- package/src/tigerbeetle/src/lsm/manifest_log.zig +0 -789
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +0 -691
- package/src/tigerbeetle/src/lsm/merge_iterator.zig +0 -106
- package/src/tigerbeetle/src/lsm/node_pool.zig +0 -235
- package/src/tigerbeetle/src/lsm/posted_groove.zig +0 -381
- package/src/tigerbeetle/src/lsm/segmented_array.zig +0 -1329
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +0 -148
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +0 -9
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +0 -850
- package/src/tigerbeetle/src/lsm/table.zig +0 -1009
- package/src/tigerbeetle/src/lsm/table_immutable.zig +0 -192
- package/src/tigerbeetle/src/lsm/table_iterator.zig +0 -340
- package/src/tigerbeetle/src/lsm/table_mutable.zig +0 -203
- package/src/tigerbeetle/src/lsm/test.zig +0 -439
- package/src/tigerbeetle/src/lsm/tree.zig +0 -1169
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +0 -479
- package/src/tigerbeetle/src/message_bus.zig +0 -1013
- package/src/tigerbeetle/src/message_pool.zig +0 -156
- package/src/tigerbeetle/src/ring_buffer.zig +0 -399
- package/src/tigerbeetle/src/simulator.zig +0 -580
- package/src/tigerbeetle/src/state_machine/auditor.zig +0 -578
- package/src/tigerbeetle/src/state_machine/workload.zig +0 -883
- package/src/tigerbeetle/src/state_machine.zig +0 -2099
- package/src/tigerbeetle/src/static_allocator.zig +0 -65
- package/src/tigerbeetle/src/stdx.zig +0 -171
- package/src/tigerbeetle/src/storage.zig +0 -393
- package/src/tigerbeetle/src/testing/cluster/message_bus.zig +0 -82
- package/src/tigerbeetle/src/testing/cluster/network.zig +0 -237
- package/src/tigerbeetle/src/testing/cluster/state_checker.zig +0 -169
- package/src/tigerbeetle/src/testing/cluster/storage_checker.zig +0 -202
- package/src/tigerbeetle/src/testing/cluster.zig +0 -444
- package/src/tigerbeetle/src/testing/fuzz.zig +0 -140
- package/src/tigerbeetle/src/testing/hash_log.zig +0 -66
- package/src/tigerbeetle/src/testing/id.zig +0 -99
- package/src/tigerbeetle/src/testing/packet_simulator.zig +0 -374
- package/src/tigerbeetle/src/testing/priority_queue.zig +0 -645
- package/src/tigerbeetle/src/testing/reply_sequence.zig +0 -139
- package/src/tigerbeetle/src/testing/state_machine.zig +0 -250
- package/src/tigerbeetle/src/testing/storage.zig +0 -757
- package/src/tigerbeetle/src/testing/table.zig +0 -247
- package/src/tigerbeetle/src/testing/time.zig +0 -84
- package/src/tigerbeetle/src/tigerbeetle.zig +0 -227
- package/src/tigerbeetle/src/time.zig +0 -112
- package/src/tigerbeetle/src/tracer.zig +0 -529
- package/src/tigerbeetle/src/unit_tests.zig +0 -40
- package/src/tigerbeetle/src/vopr.zig +0 -495
- package/src/tigerbeetle/src/vsr/README.md +0 -209
- package/src/tigerbeetle/src/vsr/client.zig +0 -544
- package/src/tigerbeetle/src/vsr/clock.zig +0 -855
- package/src/tigerbeetle/src/vsr/journal.zig +0 -2415
- package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +0 -111
- package/src/tigerbeetle/src/vsr/marzullo.zig +0 -309
- package/src/tigerbeetle/src/vsr/replica.zig +0 -6616
- package/src/tigerbeetle/src/vsr/replica_format.zig +0 -219
- package/src/tigerbeetle/src/vsr/superblock.zig +0 -1631
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +0 -256
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +0 -929
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +0 -334
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +0 -390
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +0 -615
- package/src/tigerbeetle/src/vsr/superblock_quorums.zig +0 -394
- package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +0 -314
- package/src/tigerbeetle/src/vsr.zig +0 -1425
|
@@ -1,1013 +0,0 @@
|
|
|
1
|
-
const std = @import("std");
|
|
2
|
-
const builtin = @import("builtin");
|
|
3
|
-
const assert = std.debug.assert;
|
|
4
|
-
const mem = std.mem;
|
|
5
|
-
const os = std.os;
|
|
6
|
-
|
|
7
|
-
const is_linux = builtin.target.os.tag == .linux;
|
|
8
|
-
|
|
9
|
-
const constants = @import("constants.zig");
|
|
10
|
-
const log = std.log.scoped(.message_bus);
|
|
11
|
-
|
|
12
|
-
const vsr = @import("vsr.zig");
|
|
13
|
-
const Header = vsr.Header;
|
|
14
|
-
|
|
15
|
-
const stdx = @import("stdx.zig");
|
|
16
|
-
const RingBuffer = @import("ring_buffer.zig").RingBuffer;
|
|
17
|
-
const IO = @import("io.zig").IO;
|
|
18
|
-
const MessagePool = @import("message_pool.zig").MessagePool;
|
|
19
|
-
const Message = MessagePool.Message;
|
|
20
|
-
|
|
21
|
-
pub const MessageBusReplica = MessageBusType(.replica);
|
|
22
|
-
pub const MessageBusClient = MessageBusType(.client);
|
|
23
|
-
|
|
24
|
-
fn MessageBusType(comptime process_type: vsr.ProcessType) type {
|
|
25
|
-
const SendQueue = RingBuffer(*Message, switch (process_type) {
|
|
26
|
-
.replica => constants.connection_send_queue_max_replica,
|
|
27
|
-
// A client has at most 1 in-flight request, plus pings.
|
|
28
|
-
.client => constants.connection_send_queue_max_client,
|
|
29
|
-
}, .array);
|
|
30
|
-
|
|
31
|
-
const tcp_sndbuf = switch (process_type) {
|
|
32
|
-
.replica => constants.tcp_sndbuf_replica,
|
|
33
|
-
.client => constants.tcp_sndbuf_client,
|
|
34
|
-
};
|
|
35
|
-
|
|
36
|
-
const Process = union(vsr.ProcessType) {
|
|
37
|
-
replica: u8,
|
|
38
|
-
client: u128,
|
|
39
|
-
};
|
|
40
|
-
|
|
41
|
-
return struct {
|
|
42
|
-
const Self = @This();
|
|
43
|
-
|
|
44
|
-
pool: *MessagePool,
|
|
45
|
-
io: *IO,
|
|
46
|
-
|
|
47
|
-
cluster: u32,
|
|
48
|
-
configuration: []const std.net.Address,
|
|
49
|
-
|
|
50
|
-
process: switch (process_type) {
|
|
51
|
-
.replica => struct {
|
|
52
|
-
replica: u8,
|
|
53
|
-
/// The listening socket on which this process accepts incoming connections.
|
|
54
|
-
accept_fd: os.socket_t,
|
|
55
|
-
accept_completion: IO.Completion = undefined,
|
|
56
|
-
/// The connection reserved for the currently in progress accept operation.
|
|
57
|
-
/// This is non-null exactly when an accept operation is submitted.
|
|
58
|
-
accept_connection: ?*Connection = null,
|
|
59
|
-
/// Map from client id to the currently active connection for that client.
|
|
60
|
-
/// This is used to make lookup of client connections when sending messages
|
|
61
|
-
/// efficient and to ensure old client connections are dropped if a new one
|
|
62
|
-
/// is established.
|
|
63
|
-
clients: std.AutoHashMapUnmanaged(u128, *Connection) = .{},
|
|
64
|
-
},
|
|
65
|
-
.client => void,
|
|
66
|
-
},
|
|
67
|
-
|
|
68
|
-
/// The callback to be called when a message is received.
|
|
69
|
-
on_message_callback: fn (message_bus: *Self, message: *Message) void,
|
|
70
|
-
|
|
71
|
-
/// This slice is allocated with a fixed size in the init function and never reallocated.
|
|
72
|
-
connections: []Connection,
|
|
73
|
-
/// Number of connections currently in use (i.e. connection.peer != .none).
|
|
74
|
-
connections_used: usize = 0,
|
|
75
|
-
|
|
76
|
-
/// Map from replica index to the currently active connection for that replica, if any.
|
|
77
|
-
/// The entry for this process's own replica index, if any, will always be null.
|
|
78
|
-
replicas: []?*Connection,
|
|
79
|
-
/// The number of outgoing `connect()` attempts for a given replica:
|
|
80
|
-
/// Reset to zero after a successful `on_connect()`.
|
|
81
|
-
replicas_connect_attempts: []u64,
|
|
82
|
-
|
|
83
|
-
/// Used to apply jitter when calculating exponential backoff:
|
|
84
|
-
/// Seeded with the process' replica index or client ID.
|
|
85
|
-
prng: std.rand.DefaultPrng,
|
|
86
|
-
|
|
87
|
-
pub const Options = struct {
|
|
88
|
-
configuration: []const std.net.Address,
|
|
89
|
-
io: *IO,
|
|
90
|
-
};
|
|
91
|
-
|
|
92
|
-
/// Initialize the MessageBus for the given cluster, configuration and replica/client process.
|
|
93
|
-
pub fn init(
|
|
94
|
-
allocator: mem.Allocator,
|
|
95
|
-
cluster: u32,
|
|
96
|
-
process: Process,
|
|
97
|
-
message_pool: *MessagePool,
|
|
98
|
-
on_message_callback: fn (message_bus: *Self, message: *Message) void,
|
|
99
|
-
options: Options,
|
|
100
|
-
) !Self {
|
|
101
|
-
// There must be enough connections for all replicas and at least one client.
|
|
102
|
-
assert(constants.connections_max > options.configuration.len);
|
|
103
|
-
assert(@as(vsr.ProcessType, process) == process_type);
|
|
104
|
-
|
|
105
|
-
const connections = try allocator.alloc(Connection, constants.connections_max);
|
|
106
|
-
errdefer allocator.free(connections);
|
|
107
|
-
mem.set(Connection, connections, .{});
|
|
108
|
-
|
|
109
|
-
const replicas = try allocator.alloc(?*Connection, options.configuration.len);
|
|
110
|
-
errdefer allocator.free(replicas);
|
|
111
|
-
mem.set(?*Connection, replicas, null);
|
|
112
|
-
|
|
113
|
-
const replicas_connect_attempts = try allocator.alloc(u64, options.configuration.len);
|
|
114
|
-
errdefer allocator.free(replicas_connect_attempts);
|
|
115
|
-
mem.set(u64, replicas_connect_attempts, 0);
|
|
116
|
-
|
|
117
|
-
const prng_seed = switch (process_type) {
|
|
118
|
-
.replica => process.replica,
|
|
119
|
-
.client => @truncate(u64, process.client),
|
|
120
|
-
};
|
|
121
|
-
|
|
122
|
-
var bus: Self = .{
|
|
123
|
-
.pool = message_pool,
|
|
124
|
-
.io = options.io,
|
|
125
|
-
.cluster = cluster,
|
|
126
|
-
.configuration = options.configuration,
|
|
127
|
-
.process = switch (process_type) {
|
|
128
|
-
.replica => .{
|
|
129
|
-
.replica = process.replica,
|
|
130
|
-
.accept_fd = try init_tcp(options.io, options.configuration[process.replica]),
|
|
131
|
-
},
|
|
132
|
-
.client => {},
|
|
133
|
-
},
|
|
134
|
-
.on_message_callback = on_message_callback,
|
|
135
|
-
.connections = connections,
|
|
136
|
-
.replicas = replicas,
|
|
137
|
-
.replicas_connect_attempts = replicas_connect_attempts,
|
|
138
|
-
.prng = std.rand.DefaultPrng.init(prng_seed),
|
|
139
|
-
};
|
|
140
|
-
|
|
141
|
-
// Pre-allocate enough memory to hold all possible connections in the client map.
|
|
142
|
-
if (process_type == .replica) {
|
|
143
|
-
try bus.process.clients.ensureTotalCapacity(allocator, constants.connections_max);
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
return bus;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
pub fn deinit(bus: *Self, allocator: std.mem.Allocator) void {
|
|
150
|
-
if (process_type == .replica) {
|
|
151
|
-
bus.process.clients.deinit(allocator);
|
|
152
|
-
os.closeSocket(bus.process.accept_fd);
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
for (bus.connections) |*connection| {
|
|
156
|
-
if (connection.recv_message) |message| bus.unref(message);
|
|
157
|
-
while (connection.send_queue.pop()) |message| bus.unref(message);
|
|
158
|
-
}
|
|
159
|
-
allocator.free(bus.connections);
|
|
160
|
-
allocator.free(bus.replicas);
|
|
161
|
-
allocator.free(bus.replicas_connect_attempts);
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
fn init_tcp(io: *IO, address: std.net.Address) !os.socket_t {
|
|
165
|
-
const fd = try io.open_socket(
|
|
166
|
-
address.any.family,
|
|
167
|
-
os.SOCK.STREAM,
|
|
168
|
-
os.IPPROTO.TCP,
|
|
169
|
-
);
|
|
170
|
-
errdefer os.closeSocket(fd);
|
|
171
|
-
|
|
172
|
-
const set = struct {
|
|
173
|
-
fn set(_fd: os.socket_t, level: u32, option: u32, value: c_int) !void {
|
|
174
|
-
try os.setsockopt(_fd, level, option, &mem.toBytes(value));
|
|
175
|
-
}
|
|
176
|
-
}.set;
|
|
177
|
-
|
|
178
|
-
if (constants.tcp_rcvbuf > 0) rcvbuf: {
|
|
179
|
-
if (is_linux) {
|
|
180
|
-
// Requires CAP_NET_ADMIN privilege (settle for SO_RCVBUF in case of an EPERM):
|
|
181
|
-
if (set(fd, os.SOL.SOCKET, os.SO.RCVBUFFORCE, constants.tcp_rcvbuf)) |_| {
|
|
182
|
-
break :rcvbuf;
|
|
183
|
-
} else |err| switch (err) {
|
|
184
|
-
error.PermissionDenied => {},
|
|
185
|
-
else => |e| return e,
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
try set(fd, os.SOL.SOCKET, os.SO.RCVBUF, constants.tcp_rcvbuf);
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
if (tcp_sndbuf > 0) sndbuf: {
|
|
192
|
-
if (is_linux) {
|
|
193
|
-
// Requires CAP_NET_ADMIN privilege (settle for SO_SNDBUF in case of an EPERM):
|
|
194
|
-
if (set(fd, os.SOL.SOCKET, os.SO.SNDBUFFORCE, tcp_sndbuf)) |_| {
|
|
195
|
-
break :sndbuf;
|
|
196
|
-
} else |err| switch (err) {
|
|
197
|
-
error.PermissionDenied => {},
|
|
198
|
-
else => |e| return e,
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
try set(fd, os.SOL.SOCKET, os.SO.SNDBUF, tcp_sndbuf);
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
if (constants.tcp_keepalive) {
|
|
205
|
-
try set(fd, os.SOL.SOCKET, os.SO.KEEPALIVE, 1);
|
|
206
|
-
if (is_linux) {
|
|
207
|
-
try set(fd, os.IPPROTO.TCP, os.TCP.KEEPIDLE, constants.tcp_keepidle);
|
|
208
|
-
try set(fd, os.IPPROTO.TCP, os.TCP.KEEPINTVL, constants.tcp_keepintvl);
|
|
209
|
-
try set(fd, os.IPPROTO.TCP, os.TCP.KEEPCNT, constants.tcp_keepcnt);
|
|
210
|
-
}
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
if (constants.tcp_user_timeout_ms > 0) {
|
|
214
|
-
if (is_linux) {
|
|
215
|
-
try set(fd, os.IPPROTO.TCP, os.TCP.USER_TIMEOUT, constants.tcp_user_timeout_ms);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
// Set tcp no-delay
|
|
220
|
-
if (constants.tcp_nodelay) {
|
|
221
|
-
if (is_linux) {
|
|
222
|
-
try set(fd, os.IPPROTO.TCP, os.TCP.NODELAY, 1);
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
try set(fd, os.SOL.SOCKET, os.SO.REUSEADDR, 1);
|
|
227
|
-
try os.bind(fd, &address.any, address.getOsSockLen());
|
|
228
|
-
try os.listen(fd, constants.tcp_backlog);
|
|
229
|
-
|
|
230
|
-
return fd;
|
|
231
|
-
}
|
|
232
|
-
|
|
233
|
-
pub fn tick(bus: *Self) void {
|
|
234
|
-
switch (process_type) {
|
|
235
|
-
.replica => {
|
|
236
|
-
// Each replica is responsible for connecting to replicas that come
|
|
237
|
-
// after it in the configuration. This ensures that replicas never try
|
|
238
|
-
// to connect to each other at the same time.
|
|
239
|
-
var replica: u8 = bus.process.replica + 1;
|
|
240
|
-
while (replica < bus.replicas.len) : (replica += 1) {
|
|
241
|
-
bus.maybe_connect_to_replica(replica);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
// Only replicas accept connections from other replicas and clients:
|
|
245
|
-
bus.maybe_accept();
|
|
246
|
-
},
|
|
247
|
-
.client => {
|
|
248
|
-
// The client connects to all replicas.
|
|
249
|
-
var replica: u8 = 0;
|
|
250
|
-
while (replica < bus.replicas.len) : (replica += 1) {
|
|
251
|
-
bus.maybe_connect_to_replica(replica);
|
|
252
|
-
}
|
|
253
|
-
},
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
fn maybe_connect_to_replica(bus: *Self, replica: u8) void {
|
|
258
|
-
// We already have a connection to the given replica.
|
|
259
|
-
if (bus.replicas[replica] != null) {
|
|
260
|
-
assert(bus.connections_used > 0);
|
|
261
|
-
return;
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
// Obtain a connection struct for our new replica connection.
|
|
265
|
-
// If there is a free connection, use that. Otherwise drop
|
|
266
|
-
// a client or unknown connection to make space. Prefer dropping
|
|
267
|
-
// a client connection to an unknown one as the unknown peer may
|
|
268
|
-
// be a replica. Since shutting a connection down does not happen
|
|
269
|
-
// instantly, simply return after starting the shutdown and try again
|
|
270
|
-
// on the next tick().
|
|
271
|
-
for (bus.connections) |*connection| {
|
|
272
|
-
if (connection.state == .free) {
|
|
273
|
-
assert(connection.peer == .none);
|
|
274
|
-
// This will immediately add the connection to bus.replicas,
|
|
275
|
-
// or else will return early if a socket file descriptor cannot be obtained:
|
|
276
|
-
// TODO See if we can clean this up to remove/expose the early return branch.
|
|
277
|
-
connection.connect_to_replica(bus, replica);
|
|
278
|
-
return;
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
// If there is already a connection being shut down, no need to kill another.
|
|
283
|
-
for (bus.connections) |*connection| {
|
|
284
|
-
if (connection.state == .terminating) return;
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
log.info("all connections in use but not all replicas are connected, " ++
|
|
288
|
-
"attempting to disconnect a client", .{});
|
|
289
|
-
for (bus.connections) |*connection| {
|
|
290
|
-
if (connection.peer == .client) {
|
|
291
|
-
connection.terminate(bus, .shutdown);
|
|
292
|
-
return;
|
|
293
|
-
}
|
|
294
|
-
}
|
|
295
|
-
|
|
296
|
-
log.info("failed to disconnect a client as no peer was a known client, " ++
|
|
297
|
-
"attempting to disconnect an unknown peer.", .{});
|
|
298
|
-
for (bus.connections) |*connection| {
|
|
299
|
-
if (connection.peer == .unknown) {
|
|
300
|
-
connection.terminate(bus, .shutdown);
|
|
301
|
-
return;
|
|
302
|
-
}
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
// We assert that the max number of connections is greater
|
|
306
|
-
// than the number of replicas in init().
|
|
307
|
-
unreachable;
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
fn maybe_accept(bus: *Self) void {
|
|
311
|
-
comptime assert(process_type == .replica);
|
|
312
|
-
|
|
313
|
-
if (bus.process.accept_connection != null) return;
|
|
314
|
-
// All connections are currently in use, do nothing.
|
|
315
|
-
if (bus.connections_used == bus.connections.len) return;
|
|
316
|
-
assert(bus.connections_used < bus.connections.len);
|
|
317
|
-
bus.process.accept_connection = for (bus.connections) |*connection| {
|
|
318
|
-
if (connection.state == .free) {
|
|
319
|
-
assert(connection.peer == .none);
|
|
320
|
-
connection.state = .accepting;
|
|
321
|
-
break connection;
|
|
322
|
-
}
|
|
323
|
-
} else unreachable;
|
|
324
|
-
bus.io.accept(
|
|
325
|
-
*Self,
|
|
326
|
-
bus,
|
|
327
|
-
on_accept,
|
|
328
|
-
&bus.process.accept_completion,
|
|
329
|
-
bus.process.accept_fd,
|
|
330
|
-
);
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
fn on_accept(
|
|
334
|
-
bus: *Self,
|
|
335
|
-
completion: *IO.Completion,
|
|
336
|
-
result: IO.AcceptError!os.socket_t,
|
|
337
|
-
) void {
|
|
338
|
-
_ = completion;
|
|
339
|
-
|
|
340
|
-
comptime assert(process_type == .replica);
|
|
341
|
-
assert(bus.process.accept_connection != null);
|
|
342
|
-
defer bus.process.accept_connection = null;
|
|
343
|
-
const fd = result catch |err| {
|
|
344
|
-
bus.process.accept_connection.?.state = .free;
|
|
345
|
-
// TODO: some errors should probably be fatal
|
|
346
|
-
log.err("accept failed: {}", .{err});
|
|
347
|
-
return;
|
|
348
|
-
};
|
|
349
|
-
bus.process.accept_connection.?.on_accept(bus, fd);
|
|
350
|
-
}
|
|
351
|
-
|
|
352
|
-
pub fn get_message(bus: *Self) *Message {
|
|
353
|
-
return bus.pool.get_message();
|
|
354
|
-
}
|
|
355
|
-
|
|
356
|
-
pub fn unref(bus: *Self, message: *Message) void {
|
|
357
|
-
bus.pool.unref(message);
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
pub fn send_message_to_replica(bus: *Self, replica: u8, message: *Message) void {
|
|
361
|
-
// Messages sent by a replica to itself should never be passed to the message bus.
|
|
362
|
-
if (process_type == .replica) assert(replica != bus.process.replica);
|
|
363
|
-
|
|
364
|
-
if (bus.replicas[replica]) |connection| {
|
|
365
|
-
connection.send_message(bus, message);
|
|
366
|
-
} else {
|
|
367
|
-
log.debug("no active connection to replica {}, " ++
|
|
368
|
-
"dropping message with header {}", .{ replica, message.header });
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
/// Try to send the message to the client with the given id.
|
|
373
|
-
/// If the client is not currently connected, the message is silently dropped.
|
|
374
|
-
pub fn send_message_to_client(bus: *Self, client_id: u128, message: *Message) void {
|
|
375
|
-
comptime assert(process_type == .replica);
|
|
376
|
-
|
|
377
|
-
if (bus.process.clients.get(client_id)) |connection| {
|
|
378
|
-
connection.send_message(bus, message);
|
|
379
|
-
} else {
|
|
380
|
-
log.debug("no connection to client {x}", .{client_id});
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
/// Used to send/receive messages to/from a client or fellow replica.
|
|
385
|
-
const Connection = struct {
|
|
386
|
-
/// The peer is determined by inspecting the first message header
|
|
387
|
-
/// received.
|
|
388
|
-
peer: union(enum) {
|
|
389
|
-
/// No peer is currently connected.
|
|
390
|
-
none: void,
|
|
391
|
-
/// A connection is established but an unambiguous header has not yet been received.
|
|
392
|
-
unknown: void,
|
|
393
|
-
/// The peer is a client with the given id.
|
|
394
|
-
client: u128,
|
|
395
|
-
/// The peer is a replica with the given id.
|
|
396
|
-
replica: u8,
|
|
397
|
-
} = .none,
|
|
398
|
-
state: enum {
|
|
399
|
-
/// The connection is not in use, with peer set to `.none`.
|
|
400
|
-
free,
|
|
401
|
-
/// The connection has been reserved for an in progress accept operation,
|
|
402
|
-
/// with peer set to `.none`.
|
|
403
|
-
accepting,
|
|
404
|
-
/// The peer is a replica and a connect operation has been started
|
|
405
|
-
/// but not yet completed.
|
|
406
|
-
connecting,
|
|
407
|
-
/// The peer is fully connected and may be a client, replica, or unknown.
|
|
408
|
-
connected,
|
|
409
|
-
/// The connection is being terminated but cleanup has not yet finished.
|
|
410
|
-
terminating,
|
|
411
|
-
} = .free,
|
|
412
|
-
/// This is guaranteed to be valid only while state is connected.
|
|
413
|
-
/// It will be reset to IO.INVALID_SOCKET during the shutdown process and is always IO.INVALID_SOCKET if the
|
|
414
|
-
/// connection is unused (i.e. peer == .none). We use IO.INVALID_SOCKET instead of undefined here
|
|
415
|
-
/// for safety to ensure an error if the invalid value is ever used, instead of
|
|
416
|
-
/// potentially performing an action on an active fd.
|
|
417
|
-
fd: os.socket_t = IO.INVALID_SOCKET,
|
|
418
|
-
|
|
419
|
-
/// This completion is used for all recv operations.
|
|
420
|
-
/// It is also used for the initial connect when establishing a replica connection.
|
|
421
|
-
recv_completion: IO.Completion = undefined,
|
|
422
|
-
/// True exactly when the recv_completion has been submitted to the IO abstraction
|
|
423
|
-
/// but the callback has not yet been run.
|
|
424
|
-
recv_submitted: bool = false,
|
|
425
|
-
/// The Message with the buffer passed to the kernel for recv operations.
|
|
426
|
-
recv_message: ?*Message = null,
|
|
427
|
-
/// The number of bytes in `recv_message` that have been received and need parsing.
|
|
428
|
-
recv_progress: usize = 0,
|
|
429
|
-
/// The number of bytes in `recv_message` that have been parsed.
|
|
430
|
-
recv_parsed: usize = 0,
|
|
431
|
-
/// True if we have already checked the header checksum of the message we
|
|
432
|
-
/// are currently receiving/parsing.
|
|
433
|
-
recv_checked_header: bool = false,
|
|
434
|
-
|
|
435
|
-
/// This completion is used for all send operations.
|
|
436
|
-
send_completion: IO.Completion = undefined,
|
|
437
|
-
/// True exactly when the send_completion has been submitted to the IO abstraction
|
|
438
|
-
/// but the callback has not yet been run.
|
|
439
|
-
send_submitted: bool = false,
|
|
440
|
-
/// Number of bytes of the current message that have already been sent.
|
|
441
|
-
send_progress: usize = 0,
|
|
442
|
-
/// The queue of messages to send to the client or replica peer.
|
|
443
|
-
send_queue: SendQueue = .{},
|
|
444
|
-
|
|
445
|
-
/// Attempt to connect to a replica.
|
|
446
|
-
/// The slot in the `bus.replicas` slice is immediately reserved.
|
|
447
|
-
/// Failure is silent and returns the connection to an unused state.
|
|
448
|
-
pub fn connect_to_replica(connection: *Connection, bus: *Self, replica: u8) void {
|
|
449
|
-
if (process_type == .replica) assert(replica != bus.process.replica);
|
|
450
|
-
|
|
451
|
-
assert(connection.peer == .none);
|
|
452
|
-
assert(connection.state == .free);
|
|
453
|
-
assert(connection.fd == IO.INVALID_SOCKET);
|
|
454
|
-
|
|
455
|
-
// The first replica's network address family determines the
|
|
456
|
-
// family for all other replicas:
|
|
457
|
-
const family = bus.configuration[0].any.family;
|
|
458
|
-
connection.fd = bus.io.open_socket(family, os.SOCK.STREAM, os.IPPROTO.TCP) catch return;
|
|
459
|
-
connection.peer = .{ .replica = replica };
|
|
460
|
-
connection.state = .connecting;
|
|
461
|
-
bus.connections_used += 1;
|
|
462
|
-
|
|
463
|
-
assert(bus.replicas[replica] == null);
|
|
464
|
-
bus.replicas[replica] = connection;
|
|
465
|
-
|
|
466
|
-
var attempts = &bus.replicas_connect_attempts[replica];
|
|
467
|
-
const ms = vsr.exponential_backoff_with_jitter(
|
|
468
|
-
bus.prng.random(),
|
|
469
|
-
constants.connection_delay_min_ms,
|
|
470
|
-
constants.connection_delay_max_ms,
|
|
471
|
-
attempts.*,
|
|
472
|
-
);
|
|
473
|
-
attempts.* += 1;
|
|
474
|
-
|
|
475
|
-
log.debug("connecting to replica {} in {}ms...", .{ connection.peer.replica, ms });
|
|
476
|
-
|
|
477
|
-
assert(!connection.recv_submitted);
|
|
478
|
-
connection.recv_submitted = true;
|
|
479
|
-
|
|
480
|
-
bus.io.timeout(
|
|
481
|
-
*Self,
|
|
482
|
-
bus,
|
|
483
|
-
on_connect_with_exponential_backoff,
|
|
484
|
-
// We use `recv_completion` for the connection `timeout()` and `connect()` calls
|
|
485
|
-
&connection.recv_completion,
|
|
486
|
-
@intCast(u63, ms * std.time.ns_per_ms),
|
|
487
|
-
);
|
|
488
|
-
}
|
|
489
|
-
|
|
490
|
-
fn on_connect_with_exponential_backoff(
|
|
491
|
-
bus: *Self,
|
|
492
|
-
completion: *IO.Completion,
|
|
493
|
-
result: IO.TimeoutError!void,
|
|
494
|
-
) void {
|
|
495
|
-
const connection = @fieldParentPtr(Connection, "recv_completion", completion);
|
|
496
|
-
assert(connection.recv_submitted);
|
|
497
|
-
connection.recv_submitted = false;
|
|
498
|
-
if (connection.state == .terminating) {
|
|
499
|
-
connection.maybe_close(bus);
|
|
500
|
-
return;
|
|
501
|
-
}
|
|
502
|
-
assert(connection.state == .connecting);
|
|
503
|
-
result catch unreachable;
|
|
504
|
-
|
|
505
|
-
log.debug("connecting to replica {}...", .{connection.peer.replica});
|
|
506
|
-
|
|
507
|
-
assert(!connection.recv_submitted);
|
|
508
|
-
connection.recv_submitted = true;
|
|
509
|
-
|
|
510
|
-
bus.io.connect(
|
|
511
|
-
*Self,
|
|
512
|
-
bus,
|
|
513
|
-
on_connect,
|
|
514
|
-
// We use `recv_completion` for the connection `timeout()` and `connect()` calls
|
|
515
|
-
&connection.recv_completion,
|
|
516
|
-
connection.fd,
|
|
517
|
-
bus.configuration[connection.peer.replica],
|
|
518
|
-
);
|
|
519
|
-
}
|
|
520
|
-
|
|
521
|
-
/// Completion callback for the `bus.io.connect()` call issued on `recv_completion`.
/// On success the connection becomes `.connected`, the per-replica attempt counter is
/// reset, receiving starts, and any message queued while connecting is sent.
/// On failure the connection is terminated with `.close`.
fn on_connect(
    bus: *Self,
    completion: *IO.Completion,
    result: IO.ConnectError!void,
) void {
    const connection = @fieldParentPtr(Connection, "recv_completion", completion);

    // The connect operation borrowed the recv completion; hand it back.
    assert(connection.recv_submitted);
    connection.recv_submitted = false;

    // The connection was terminated while the connect was in flight.
    if (connection.state == .terminating) {
        connection.maybe_close(bus);
        return;
    }

    assert(connection.state == .connecting);
    connection.state = .connected;

    if (result) |_| {
        log.info("connected to replica {}", .{connection.peer.replica});
        bus.replicas_connect_attempts[connection.peer.replica] = 0;
    } else |err| {
        log.err("error connecting to replica {}: {}", .{ connection.peer.replica, err });
        connection.terminate(bus, .close);
        return;
    }

    connection.assert_recv_send_initial_state(bus);
    connection.get_recv_message_and_recv(bus);

    // A message may have been queued for sending while we were connecting:
    // TODO Should we relax recv() and send() to return if `connection.state != .connected`?
    if (connection.state == .connected) connection.send(bus);
}
|
|
552
|
-
|
|
553
|
-
/// Adopt the newly accepted socket `fd` and start receiving messages on it.
/// The peer is `.unknown` until a received header identifies it.
/// Callbacks will be continuously re-registered until terminate() is called.
pub fn on_accept(connection: *Connection, bus: *Self, fd: os.socket_t) void {
    // The slot must have been reserved for accepting and not yet own a socket.
    assert(connection.fd == IO.INVALID_SOCKET);
    assert(connection.state == .accepting);
    assert(connection.peer == .none);

    connection.fd = fd;
    connection.state = .connected;
    connection.peer = .unknown;
    bus.connections_used += 1;

    connection.assert_recv_send_initial_state(bus);
    connection.get_recv_message_and_recv(bus);

    // Nothing can have been queued for a peer we have not identified yet.
    assert(connection.send_queue.empty());
}
|
|
569
|
-
|
|
570
|
-
/// Assert that a just-connected connection has pristine receive and send bookkeeping:
/// no operation submitted, no receive buffer, and all progress counters at zero.
fn assert_recv_send_initial_state(connection: *Connection, bus: *Self) void {
    assert(bus.connections_used > 0);

    // Identity and socket.
    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state == .connected);
    assert(connection.peer == .unknown or connection.peer == .replica);

    // Receive side.
    assert(!connection.recv_submitted);
    assert(connection.recv_message == null);
    assert(connection.recv_parsed == 0);
    assert(connection.recv_progress == 0);

    // Send side.
    assert(!connection.send_submitted);
    assert(connection.send_progress == 0);
}
|
|
585
|
-
|
|
586
|
-
/// Queue `message` for sending to this connection's peer, taking a reference to it.
/// The message is silently ignored if the connection is terminating, and dropped (with
/// a log line) if the send queue is full. A send operation is started only when the
/// connection is established and no send is already in flight.
pub fn send_message(connection: *Connection, bus: *Self, message: *Message) void {
    assert(connection.peer == .client or connection.peer == .replica);

    switch (connection.state) {
        .free, .accepting => unreachable,
        .terminating => return,
        .connecting, .connected => {},
    }

    if (connection.send_queue.full()) {
        log.info("message queue for peer {} full, dropping {s} message", .{
            connection.peer,
            @tagName(message.header.command),
        });
        return;
    }
    connection.send_queue.push_assume_capacity(message.ref());

    if (connection.state == .connecting) {
        // If the connection has not yet been established we can't send yet.
        // Instead on_connect() will call send().
        assert(connection.peer == .replica);
    } else if (!connection.send_submitted) {
        // No send operation is currently in progress, so start one.
        connection.send(bus);
    }
}
|
|
612
|
-
|
|
613
|
-
/// Begin tearing down an active connection so that it can eventually be reused.
/// The reset is not immediate: in-progress operations must complete first, after which
/// `maybe_close()` closes the socket. `how` selects whether a shutdown syscall is made
/// (`.shutdown`) or skipped (`.close`) before waiting for those operations.
/// I'll be back! (when the Connection is reused after being fully closed)
pub fn terminate(connection: *Connection, bus: *Self, how: enum { shutdown, close }) void {
    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state != .free);
    assert(connection.peer != .none);

    if (how == .shutdown) {
        // The shutdown syscall will cause currently in progress send/recv
        // operations to be gracefully closed while keeping the fd open.
        //
        // TODO: Investigate differences between shutdown() on Linux vs Darwin.
        // Especially how this interacts with our assumptions around pending I/O.
        os.shutdown(connection.fd, .both) catch |err| switch (err) {
            // This should only happen if we for some reason decide to terminate
            // a connection while a connect operation is in progress.
            // This is fine though, we simply continue with the logic below and
            // wait for the connect operation to finish.
            //
            // TODO: This currently happens in other cases if the
            // connection was closed due to an error. We need to intelligently
            // decide whether to shutdown or close directly based on the error
            // before these assertions may be re-enabled:
            //
            //   assert(connection.state == .connecting);
            //   assert(connection.recv_submitted);
            //   assert(!connection.send_submitted);
            error.SocketNotConnected => {},

            // Ignore all the remaining errors for now
            error.ConnectionAborted,
            error.ConnectionResetByPeer,
            error.BlockingOperationInProgress,
            error.NetworkSubsystemFailed,
            error.SystemResources,
            error.Unexpected,
            => {},
        };
    }

    // Terminating twice is a programming error.
    assert(connection.state != .terminating);
    connection.state = .terminating;
    connection.maybe_close(bus);
}
|
|
662
|
-
|
|
663
|
-
/// Deliver every complete message currently available in the receive buffer.
/// Each message returned by `parse_message()` carries a reference that is released
/// here once `on_message()` has returned.
fn parse_messages(connection: *Connection, bus: *Self) void {
    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state == .connected);
    assert(connection.peer != .none);

    while (connection.parse_message(bus)) |parsed| {
        connection.on_message(bus, parsed);
        bus.unref(parsed);
    }
}
|
|
674
|
-
|
|
675
|
-
/// Try to extract one complete message from the unparsed part of the receive buffer.
/// Returns null either when more bytes are needed (after scheduling another receive)
/// or when validation failed (after terminating the connection with `.shutdown`).
/// The returned message carries a reference owned by the caller.
fn parse_message(connection: *Connection, bus: *Self) ?*Message {
    const header_size = @sizeOf(Header);
    const data = connection.recv_message.?.buffer[connection.recv_parsed..connection.recv_progress];

    // Not even a complete header yet: keep receiving.
    if (data.len < header_size) {
        connection.get_recv_message_and_recv(bus);
        return null;
    }

    const header = mem.bytesAsValue(
        Header,
        @alignCast(@alignOf(Header), data[0..header_size]),
    );

    // The header is validated once per message, even if the body arrives over
    // several receives; `recv_checked_header` remembers that this was done.
    if (!connection.recv_checked_header) {
        if (!header.valid_checksum()) {
            log.err("invalid header checksum received from {}", .{connection.peer});
            connection.terminate(bus, .shutdown);
            return null;
        }

        if (header.size < @sizeOf(Header) or header.size > constants.message_size_max) {
            log.err("header with invalid size {d} received from peer {}", .{
                header.size,
                connection.peer,
            });
            connection.terminate(bus, .shutdown);
            return null;
        }

        if (header.cluster != bus.cluster) {
            log.err("message addressed to the wrong cluster: {}", .{header.cluster});
            connection.terminate(bus, .shutdown);
            return null;
        }

        switch (process_type) {
            // Replicas may forward messages from clients or from other replicas so we
            // may receive messages from a peer before we know who they are:
            // This has the same effect as an asymmetric network where, for a short time
            // bounded by the time it takes to ping, we can hear from a peer before we
            // can send back to them.
            .replica => connection.maybe_set_peer(bus, header),
            // The client connects only to replicas and should set peer when connecting:
            .client => assert(connection.peer == .replica),
        }

        connection.recv_checked_header = true;
    }

    // The body has not fully arrived yet: keep receiving.
    if (data.len < header.size) {
        connection.get_recv_message_and_recv(bus);
        return null;
    }

    // At this point we know that we have the full message in our buffer.
    // We will now either deliver this message or terminate the connection
    // due to an error, so reset recv_checked_header for the next message.
    assert(connection.recv_checked_header);
    connection.recv_checked_header = false;

    if (!header.valid_checksum_body(data[header_size..header.size])) {
        log.err("invalid body checksum received from {}", .{connection.peer});
        connection.terminate(bus, .shutdown);
        return null;
    }

    connection.recv_parsed += header.size;

    // Return the parsed message using zero-copy if we can, or copy if the client is
    // pipelining:
    // If this is the first message but there are messages in the pipeline then we
    // copy the message so that its sector padding (if any) will not overwrite the
    // front of the pipeline. If this is not the first message then we must copy
    // the message to a new message as each message needs to have its own unique
    // `references` and `header` metadata.
    if (connection.recv_progress == header.size) return connection.recv_message.?.ref();

    const copy = bus.get_message();
    stdx.copy_disjoint(.inexact, u8, copy.buffer, data[0..header.size]);
    return copy;
}
|
|
756
|
-
|
|
757
|
-
/// Hand a parsed message to `bus.on_message_callback`.
/// For `.request` and `.prepare` messages, first zero the bytes between the end of the
/// message and the next sector boundary.
fn on_message(connection: *Connection, bus: *Self, message: *Message) void {
    const size = message.header.size;
    const parsed = connection.recv_parsed;
    const progress = connection.recv_progress;

    if (message == connection.recv_message.?) {
        // Zero-copy delivery: the message is the whole receive buffer.
        assert(parsed == size);
        assert(parsed == progress);
    } else if (parsed == size) {
        // Copied first message: more bytes follow it in the buffer.
        assert(parsed < progress);
    } else {
        // Copied message that was not the first in the buffer.
        assert(parsed > size);
        assert(parsed <= progress);
    }

    switch (message.header.command) {
        .request, .prepare => {
            const sector_ceil = vsr.sector_ceil(size);
            if (size != sector_ceil) {
                assert(size < sector_ceil);
                assert(message.buffer.len == constants.message_size_max);
                mem.set(u8, message.buffer[size..sector_ceil], 0);
            }
        },
        else => {},
    }

    bus.on_message_callback(bus, message);
}
|
|
781
|
-
|
|
782
|
-
/// Replica-only: if this connection's peer is still `.unknown`, identify it from a
/// validated `header`. When another connection is already registered for the same
/// replica or client, that older connection is terminated and this one replaces it.
fn maybe_set_peer(connection: *Connection, bus: *Self, header: *const Header) void {
    comptime assert(process_type == .replica);

    assert(bus.connections_used > 0);
    assert(bus.cluster == header.cluster);

    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state == .connected);
    assert(connection.peer != .none);

    // The peer has already been identified; nothing to do.
    if (connection.peer != .unknown) return;

    switch (header.peer_type()) {
        .unknown => return,
        .replica => {
            const replica = header.replica;
            connection.peer = .{ .replica = replica };

            // If there is a connection to this replica, terminate and replace it:
            if (bus.replicas[replica]) |previous| {
                assert(previous.peer == .replica);
                assert(previous.peer.replica == replica);
                assert(previous.state != .free);
                if (previous.state != .terminating) previous.terminate(bus, .shutdown);
            }
            bus.replicas[replica] = connection;
            log.info("connection from replica {}", .{connection.peer.replica});
        },
        .client => {
            assert(header.client != 0);
            connection.peer = .{ .client = header.client };

            const entry = bus.process.clients.getOrPutAssumeCapacity(header.client);
            // If there is a connection to this client, terminate and replace it:
            if (entry.found_existing) {
                const previous = entry.value_ptr.*;
                assert(previous.peer == .client);
                assert(previous.peer.client == connection.peer.client);
                assert(previous.state == .connected or previous.state == .terminating);
                if (previous.state != .terminating) previous.terminate(bus, .shutdown);
            }
            entry.value_ptr.* = connection;
            log.info("connection from client {}", .{connection.peer.client});
        },
    }
}
|
|
825
|
-
|
|
826
|
-
/// Make sure the connection has a receive buffer whose unparsed bytes start at
/// offset zero, then call `recv()`.
/// If a `recv_message` exists and nothing in it has been parsed yet, it is reused
/// as-is. Otherwise a new message is acquired, any unparsed remainder is copied to
/// its front, and the old message (if any) is released.
fn get_recv_message_and_recv(connection: *Connection, bus: *Self) void {
    // Fast path: the current buffer already starts with the message being received.
    if (connection.recv_message != null and connection.recv_parsed == 0) {
        connection.recv(bus);
        return;
    }

    const fresh = bus.get_message();
    // Drop the reference from get_message(); the connection takes its own below.
    defer bus.unref(fresh);

    if (connection.recv_message) |stale| {
        assert(connection.recv_parsed > 0);
        assert(connection.recv_progress > 0);

        // Move the unparsed tail to the front of the new buffer.
        const remainder = stale.buffer[connection.recv_parsed..connection.recv_progress];
        stdx.copy_disjoint(.inexact, u8, fresh.buffer, remainder);
        connection.recv_progress = remainder.len;
        connection.recv_parsed = 0;

        bus.unref(stale);
    } else {
        assert(connection.recv_parsed == 0);
        assert(connection.recv_progress == 0);
    }

    connection.recv_message = fresh.ref();
    connection.recv(bus);
}
|
|
857
|
-
|
|
858
|
-
/// Submit a receive into the unused tail of `recv_message`; `on_recv` is the callback.
/// At most one receive may be outstanding, tracked by `recv_submitted`.
fn recv(connection: *Connection, bus: *Self) void {
    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state == .connected);
    assert(connection.peer != .none);

    assert(!connection.recv_submitted);
    connection.recv_submitted = true;

    // There must be room left in the buffer to receive into.
    assert(connection.recv_progress < constants.message_size_max);
    const free_space =
        connection.recv_message.?.buffer[connection.recv_progress..constants.message_size_max];

    bus.io.recv(
        *Self,
        bus,
        on_recv,
        &connection.recv_completion,
        connection.fd,
        free_space,
    );
}
|
|
877
|
-
|
|
878
|
-
/// Completion callback for `recv()`.
/// Terminates the connection on a receive error (`.shutdown`) or when zero bytes were
/// received (`.close`); otherwise advances `recv_progress` and parses what arrived.
fn on_recv(bus: *Self, completion: *IO.Completion, result: IO.RecvError!usize) void {
    const connection = @fieldParentPtr(Connection, "recv_completion", completion);

    assert(connection.recv_submitted);
    connection.recv_submitted = false;

    // The connection was terminated while the receive was in flight.
    if (connection.state == .terminating) {
        connection.maybe_close(bus);
        return;
    }
    assert(connection.state == .connected);

    if (result) |bytes_received| {
        // No bytes received means that the peer closed its side of the connection.
        if (bytes_received == 0) {
            log.info("peer performed an orderly shutdown: {}", .{connection.peer});
            connection.terminate(bus, .close);
            return;
        }
        connection.recv_progress += bytes_received;
        connection.parse_messages(bus);
    } else |err| {
        // TODO: maybe don't need to close on *every* error
        log.err("error receiving from {}: {}", .{ connection.peer, err });
        connection.terminate(bus, .shutdown);
    }
}
|
|
902
|
-
|
|
903
|
-
/// Submit a send for the not-yet-sent remainder of the message at the head of the
/// send queue; `on_send` is the callback. Does nothing when the queue is empty.
fn send(connection: *Connection, bus: *Self) void {
    assert(connection.fd != IO.INVALID_SOCKET);
    assert(connection.state == .connected);
    assert(connection.peer == .client or connection.peer == .replica);

    const head = connection.send_queue.head() orelse return;

    assert(!connection.send_submitted);
    connection.send_submitted = true;

    const unsent = head.buffer[connection.send_progress..head.header.size];
    bus.io.send(
        *Self,
        bus,
        on_send,
        &connection.send_completion,
        connection.fd,
        unsent,
    );
}
|
|
919
|
-
|
|
920
|
-
/// Completion callback for `send()`.
/// On error the connection is terminated with `.shutdown`. Otherwise `send_progress`
/// advances by the number of bytes sent; once the head message has been fully sent it
/// is popped and released. In either non-error case `send()` is called again to
/// continue with whatever is at the head of the queue.
fn on_send(bus: *Self, completion: *IO.Completion, result: IO.SendError!usize) void {
    const connection = @fieldParentPtr(Connection, "send_completion", completion);

    assert(connection.send_submitted);
    connection.send_submitted = false;
    assert(connection.peer == .client or connection.peer == .replica);

    // The connection was terminated while the send was in flight.
    if (connection.state == .terminating) {
        connection.maybe_close(bus);
        return;
    }
    assert(connection.state == .connected);

    const bytes_sent = result catch |err| {
        // TODO: maybe don't need to close on *every* error
        // The peer may be a client or a replica (see the assertion above), so the
        // message does not claim either.
        log.err("error sending message to {}: {}", .{ connection.peer, err });
        connection.terminate(bus, .shutdown);
        return;
    };
    connection.send_progress += bytes_sent;

    const head_size = connection.send_queue.head().?.header.size;
    assert(connection.send_progress <= head_size);

    // If the message has been fully sent, move on to the next one.
    if (connection.send_progress == head_size) {
        connection.send_progress = 0;
        const sent = connection.send_queue.pop().?;
        bus.unref(sent);
    }

    connection.send(bus);
}
|
|
945
|
-
|
|
946
|
-
/// Close the socket of a terminating connection once no recv or send is outstanding.
/// Returns without doing anything while an operation is still submitted; the
/// corresponding completion callback calls this again.
fn maybe_close(connection: *Connection, bus: *Self) void {
    assert(connection.state == .terminating);
    assert(connection.peer != .none);

    // If a recv or send operation is currently submitted to the kernel,
    // submitting a close would cause a race. Therefore we must wait for
    // any currently submitted operation to complete.
    if (connection.send_submitted or connection.recv_submitted) return;

    // Mark both as submitted for the duration of the close; on_close() asserts this.
    connection.recv_submitted = true;
    connection.send_submitted = true;

    // We can free resources now that there is no longer any I/O in progress.
    while (connection.send_queue.pop()) |queued| bus.unref(queued);

    if (connection.recv_message) |buffered| {
        bus.unref(buffered);
        connection.recv_message = null;
    }

    assert(connection.fd != IO.INVALID_SOCKET);

    // It's OK to use the send completion here as we know that no send
    // operation is currently in progress.
    bus.io.close(*Self, bus, on_close, &connection.send_completion, connection.fd);

    // Forget the descriptor only after it has been handed to close().
    connection.fd = IO.INVALID_SOCKET;
}
|
|
969
|
-
|
|
970
|
-
/// Completion callback for the close submitted by `maybe_close()`.
/// Logs a close error if there was one, unregisters the connection from the bus (only
/// if the bus entry still refers to this connection), decrements `connections_used`,
/// and resets the connection to its default state.
fn on_close(bus: *Self, completion: *IO.Completion, result: IO.CloseError!void) void {
    const connection = @fieldParentPtr(Connection, "send_completion", completion);

    // maybe_close() marked both operations as submitted for the duration of the close.
    assert(connection.send_submitted);
    assert(connection.recv_submitted);

    assert(connection.peer != .none);
    assert(connection.state == .terminating);

    // Log before the reset below, while `connection.peer` is still meaningful.
    result catch |err| {
        log.err("error closing connection to {}: {}", .{ connection.peer, err });
    };

    // maybe_close() already released these.
    assert(connection.recv_message == null);
    assert(connection.send_queue.empty());

    switch (connection.peer) {
        .none => unreachable,
        .unknown => {},
        .client => switch (process_type) {
            .replica => {
                // maybe_set_peer() overwrites the map entry when a newer connection
                // from the same client replaces this one. Only remove the entry if
                // it still refers to this connection; otherwise we would unregister
                // the newer connection. The entry may also be absent if a newer
                // connection replaced this one and was itself closed first.
                if (bus.process.clients.get(connection.peer.client)) |existing| {
                    if (existing == connection) {
                        assert(bus.process.clients.remove(connection.peer.client));
                    }
                }
            },
            .client => unreachable,
        },
        .replica => {
            // A newer replica connection may have replaced this one, and may even
            // have been terminated and cleared before we get here. Only clear the
            // slot if it still refers to this connection.
            if (bus.replicas[connection.peer.replica] == connection) {
                bus.replicas[connection.peer.replica] = null;
            }
        },
    }

    // Reset the connection to its initial state.
    bus.connections_used -= 1;
    connection.* = .{};
}
|
|
1011
|
-
};
|
|
1012
|
-
};
|
|
1013
|
-
}
|