tigerbeetle-node 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -4
- package/package.json +2 -2
- package/scripts/postinstall.sh +2 -2
- package/src/node.zig +19 -27
- package/src/tigerbeetle/scripts/benchmark.bat +46 -0
- package/src/tigerbeetle/scripts/install.sh +1 -1
- package/src/tigerbeetle/scripts/install_zig.bat +4 -4
- package/src/tigerbeetle/scripts/install_zig.sh +4 -2
- package/src/tigerbeetle/scripts/lint.zig +8 -2
- package/src/tigerbeetle/scripts/vopr.sh +2 -2
- package/src/tigerbeetle/src/benchmark.zig +10 -12
- package/src/tigerbeetle/src/cli.zig +43 -20
- package/src/tigerbeetle/src/config.zig +26 -11
- package/src/tigerbeetle/src/demo.zig +119 -97
- package/src/tigerbeetle/src/demo_01_create_accounts.zig +5 -3
- package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -3
- package/src/tigerbeetle/src/demo_03_create_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +5 -3
- package/src/tigerbeetle/src/demo_05_accept_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_06_reject_transfers.zig +5 -3
- package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +2 -3
- package/src/tigerbeetle/src/io/benchmark.zig +213 -0
- package/src/tigerbeetle/src/{io_darwin.zig → io/darwin.zig} +259 -167
- package/src/tigerbeetle/src/io/linux.zig +1038 -0
- package/src/tigerbeetle/src/io/test.zig +643 -0
- package/src/tigerbeetle/src/io/windows.zig +1161 -0
- package/src/tigerbeetle/src/io.zig +9 -1328
- package/src/tigerbeetle/src/main.zig +28 -15
- package/src/tigerbeetle/src/message_bus.zig +78 -107
- package/src/tigerbeetle/src/message_pool.zig +65 -58
- package/src/tigerbeetle/src/ring_buffer.zig +7 -0
- package/src/tigerbeetle/src/simulator.zig +44 -40
- package/src/tigerbeetle/src/state_machine.zig +58 -27
- package/src/tigerbeetle/src/storage.zig +7 -234
- package/src/tigerbeetle/src/test/cluster.zig +5 -8
- package/src/tigerbeetle/src/test/message_bus.zig +10 -9
- package/src/tigerbeetle/src/test/network.zig +16 -19
- package/src/tigerbeetle/src/test/packet_simulator.zig +32 -29
- package/src/tigerbeetle/src/test/state_checker.zig +4 -3
- package/src/tigerbeetle/src/test/state_machine.zig +4 -0
- package/src/tigerbeetle/src/test/storage.zig +23 -19
- package/src/tigerbeetle/src/test/time.zig +2 -2
- package/src/tigerbeetle/src/tigerbeetle.zig +8 -128
- package/src/tigerbeetle/src/time.zig +61 -13
- package/src/tigerbeetle/src/vsr/client.zig +23 -37
- package/src/tigerbeetle/src/vsr/clock.zig +27 -44
- package/src/tigerbeetle/src/vsr/journal.zig +9 -12
- package/src/tigerbeetle/src/vsr/marzullo.zig +6 -3
- package/src/tigerbeetle/src/vsr/replica.zig +184 -204
- package/src/tigerbeetle/src/vsr.zig +287 -25
- package/src/translate.zig +55 -55
|
@@ -0,0 +1,1038 @@
|
|
|
1
|
+
const std = @import("std");
|
|
2
|
+
const assert = std.debug.assert;
|
|
3
|
+
const os = std.os;
|
|
4
|
+
const linux = os.linux;
|
|
5
|
+
const IO_Uring = linux.IO_Uring;
|
|
6
|
+
const io_uring_cqe = linux.io_uring_cqe;
|
|
7
|
+
const io_uring_sqe = linux.io_uring_sqe;
|
|
8
|
+
const log = std.log.scoped(.io);
|
|
9
|
+
|
|
10
|
+
const config = @import("../config.zig");
|
|
11
|
+
const FIFO = @import("../fifo.zig").FIFO;
|
|
12
|
+
const buffer_limit = @import("../io.zig").buffer_limit;
|
|
13
|
+
|
|
14
|
+
pub const IO = struct {
|
|
15
|
+
ring: IO_Uring,
|
|
16
|
+
|
|
17
|
+
/// Operations not yet submitted to the kernel and waiting on available space in the
|
|
18
|
+
/// submission queue.
|
|
19
|
+
unqueued: FIFO(Completion) = .{},
|
|
20
|
+
|
|
21
|
+
/// Completions that are ready to have their callbacks run.
|
|
22
|
+
completed: FIFO(Completion) = .{},
|
|
23
|
+
|
|
24
|
+
/// Create an io_uring instance with `entries` SQ slots and the given setup flags.
pub fn init(entries: u12, flags: u32) !IO {
    const ring = try IO_Uring.init(entries, flags);
    return IO{ .ring = ring };
}
|
|
27
|
+
|
|
28
|
+
/// Tear down the underlying io_uring instance.
pub fn deinit(self: *IO) void {
    self.ring.deinit();
}
|
|
31
|
+
|
|
32
|
+
/// Flush all queued submissions to the kernel and poll (without blocking) for completions.
pub fn tick(self: *IO) !void {
    // Timeouts are submitted and reaped exclusively by `run_for_ns()`, and `tick()` is
    // never run concurrently with it, so no timeout CQE can appear here: `timeouts`
    // will never be decremented and `etime` must stay false.
    var timeouts: usize = 0;
    var etime = false;

    try self.flush(0, &timeouts, &etime);
    assert(etime == false);

    // Completion callbacks run inside `flush()` may have queued fresh SQEs; submit them
    // now rather than delaying them to the next tick. We deliberately do not flush CQEs
    // here, since those SQEs may complete synchronously. We also guard against a wasted
    // io_uring_enter() syscall when nothing is queued. Note that `self.ring.sq_ready()`
    // cannot be used for this check because it counts flushed as well as unflushed SQEs.
    const queued = self.ring.sq.sqe_tail -% self.ring.sq.sqe_head;
    if (queued > 0) {
        try self.flush_submissions(0, &timeouts, &etime);
        assert(etime == false);
    }
}
|
|
54
|
+
|
|
55
|
+
/// Flush all queued submissions to the kernel and run the event loop until at least
/// `nanoseconds` have elapsed. The argument is a u63 so that it coerces losslessly
/// into the i64 fields of a kernel_timespec.
pub fn run_for_ns(self: *IO, nanoseconds: u63) !void {
    // io_uring measures timeouts against CLOCK_MONOTONIC and we submit the deadline
    // below as an absolute value, so we must read that same clock here; mixing clock
    // sources could otherwise deadlock us. Any kernel new enough for io_uring supports
    // CLOCK_MONOTONIC.
    var current_ts: os.timespec = undefined;
    os.clock_gettime(os.CLOCK.MONOTONIC, &current_ts) catch unreachable;
    // The absolute monotonic deadline after which this function may return:
    const timeout_ts: os.linux.kernel_timespec = .{
        .tv_sec = current_ts.tv_sec,
        .tv_nsec = current_ts.tv_nsec + nanoseconds,
    };
    var timeouts: usize = 0;
    var etime = false;
    while (!etime) {
        const timeout_sqe = self.ring.get_sqe() catch blk: {
            // The submission queue is full; flush it to free a slot for our timeout:
            try self.flush_submissions(0, &timeouts, &etime);
            break :blk self.ring.get_sqe() catch unreachable;
        };
        // Queue an absolute timeout; the kernel cancels it if any other SQE completes first:
        linux.io_uring_prep_timeout(timeout_sqe, &timeout_ts, 1, os.linux.IORING_TIMEOUT_ABS);
        timeout_sqe.user_data = 0;
        timeouts += 1;
        // This call blocks for at most the duration of the timeout submitted just above:
        try self.flush(1, &timeouts, &etime);
    }
    // Before returning, reap every outstanding timeout: each one references `timeout_ts`
    // in this stack frame. A busy loop is required to avoid a potential deadlock, since
    // only the kernel decides when the timeout CQEs are posted.
    while (timeouts > 0) _ = try self.flush_completions(0, &timeouts, &etime);
}
|
|
89
|
+
|
|
90
|
+
/// Submit queued SQEs (optionally waiting for `wait_nr` completions in the same
/// syscall), peek for CQEs, then run any callbacks that became ready.
fn flush(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void {
    // Flush queued SQEs, reusing the same io_uring_enter() syscall to wait if asked to:
    try self.flush_submissions(wait_nr, timeouts, etime);
    // A plain peek now suffices — no further syscall is needed to collect CQEs:
    try self.flush_completions(0, timeouts, etime);
    // Callbacks run only after every ready CQE has been flushed. We drain a detached
    // copy of each intrusive list (resetting the live list first) so that anything a
    // callback appends synchronously is handled on the next pass round the event loop
    // rather than feeding an infinite loop here.
    {
        var pending = self.completed;
        self.completed = .{};
        while (pending.pop()) |completion| completion.complete();
    }
    // The same detached-copy trick guards the retry of previously unqueued operations:
    {
        var pending = self.unqueued;
        self.unqueued = .{};
        while (pending.pop()) |completion| self.enqueue(completion);
    }
}
|
|
111
|
+
|
|
112
|
+
/// Copy ready CQEs out of the ring (waiting for at most `wait_nr` of them), account for
/// run_for_ns() timeouts, and queue every other completion for its callback.
fn flush_completions(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void {
    var cqes: [256]io_uring_cqe = undefined;
    var wait_remaining = wait_nr;
    while (true) {
        // Bound how many completions we still wait for, so we cannot block indefinitely
        // when few requests are inflight — especially on later passes round this loop:
        const completed = self.ring.copy_cqes(&cqes, wait_remaining) catch |err| switch (err) {
            error.SignalInterrupt => continue,
            else => return err,
        };
        if (completed > wait_remaining) wait_remaining = 0 else wait_remaining -= completed;
        for (cqes[0..completed]) |cqe| {
            if (cqe.user_data == 0) {
                // A timeout submitted by `run_for_ns()` (these carry user_data of 0).
                timeouts.* -= 1;
                // It only counts as expiry when the kernel reports ETIME; a `cqe.res`
                // of 0 means it was canceled because another event completed first.
                // Multiple timeouts may complete together when the nanoseconds value
                // passed to `run_for_ns()` is very short.
                if (-cqe.res == @enumToInt(os.E.TIME)) etime.* = true;
                continue;
            }
            const completion = @intToPtr(*Completion, @intCast(usize, cqe.user_data));
            completion.result = cqe.res;
            // Defer the callback onto a linked list instead of invoking it here, to avoid:
            // * recursion through `flush_submissions()` and `flush_completions()`,
            // * unbounded stack usage, and
            // * confusing stack traces.
            self.completed.push(completion);
        }
        if (completed < cqes.len) break;
    }
}
|
|
144
|
+
|
|
145
|
+
/// Push all queued SQEs into the kernel, retrying on EINTR and easing back-pressure by
/// draining completions when the completion queue is overcommitted.
fn flush_submissions(self: *IO, wait_nr: u32, timeouts: *usize, etime: *bool) !void {
    while (true) {
        _ = self.ring.submit_and_wait(wait_nr) catch |err| switch (err) {
            error.SignalInterrupt => continue,
            // Drain some completions and then retry the submission:
            // See https://github.com/axboe/liburing/issues/281 re: error.SystemResources.
            // Be careful also that copy_cqes() will flush before entering to wait (it does):
            // https://github.com/axboe/liburing/commit/35c199c48dfd54ad46b96e386882e7ac341314c5
            error.CompletionQueueOvercommitted, error.SystemResources => {
                try self.flush_completions(1, timeouts, etime);
                continue;
            },
            else => return err,
        };
        break;
    }
}
|
|
162
|
+
|
|
163
|
+
/// Stage an operation on the submission queue, or park it on the `unqueued` list for a
/// later flush when the queue is full.
fn enqueue(self: *IO, completion: *Completion) void {
    const sqe = self.ring.get_sqe() catch |err| switch (err) {
        error.SubmissionQueueFull => {
            self.unqueued.push(completion);
            return;
        },
    };
    completion.prep(sqe);
}
|
|
172
|
+
|
|
173
|
+
/// Holds all state for a single in-flight io_uring operation: the operation's arguments,
/// the raw kernel result, a type-erased user callback, and an intrusive `next` link so a
/// completion can sit on the IO instance's FIFOs.
pub const Completion = struct {
    io: *IO,
    // Raw CQE result; negative values are negated errno codes.
    result: i32 = undefined,
    next: ?*Completion = null,
    operation: Operation,
    // Type-erased pointer back to the caller's context; re-typed by the comptime wrapper.
    context: ?*anyopaque,
    callback: fn (context: ?*anyopaque, completion: *Completion, result: *const anyopaque) void,

    /// Fill in the SQE for this operation and tag it with our own address so that
    /// `flush_completions()` can recover the Completion from the CQE's user_data.
    fn prep(completion: *Completion, sqe: *io_uring_sqe) void {
        switch (completion.operation) {
            .accept => |*op| {
                linux.io_uring_prep_accept(
                    sqe,
                    op.socket,
                    &op.address,
                    &op.address_size,
                    os.SOCK.CLOEXEC,
                );
            },
            .close => |op| {
                linux.io_uring_prep_close(sqe, op.fd);
            },
            .connect => |*op| {
                linux.io_uring_prep_connect(
                    sqe,
                    op.socket,
                    &op.address.any,
                    op.address.getOsSockLen(),
                );
            },
            .read => |op| {
                linux.io_uring_prep_read(
                    sqe,
                    op.fd,
                    op.buffer[0..buffer_limit(op.buffer.len)],
                    op.offset,
                );
            },
            .recv => |op| {
                linux.io_uring_prep_recv(sqe, op.socket, op.buffer, os.MSG.NOSIGNAL);
            },
            .send => |op| {
                linux.io_uring_prep_send(sqe, op.socket, op.buffer, os.MSG.NOSIGNAL);
            },
            .timeout => |*op| {
                linux.io_uring_prep_timeout(sqe, &op.timespec, 0, 0);
            },
            .write => |op| {
                linux.io_uring_prep_write(
                    sqe,
                    op.fd,
                    op.buffer[0..buffer_limit(op.buffer.len)],
                    op.offset,
                );
            },
        }
        sqe.user_data = @ptrToInt(completion);
    }

    /// Translate the raw CQE result into the operation's typed error set (or payload)
    /// and invoke the user callback. An EINTR result resubmits the operation instead of
    /// being surfaced to the caller.
    fn complete(completion: *Completion) void {
        switch (completion.operation) {
            .accept => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .AGAIN => error.WouldBlock,
                            .BADF => error.FileDescriptorInvalid,
                            .CONNABORTED => error.ConnectionAborted,
                            .FAULT => unreachable,
                            .INVAL => error.SocketNotListening,
                            .MFILE => error.ProcessFdQuotaExceeded,
                            .NFILE => error.SystemFdQuotaExceeded,
                            .NOBUFS => error.SystemResources,
                            .NOMEM => error.SystemResources,
                            .NOTSOCK => error.FileDescriptorNotASocket,
                            .OPNOTSUPP => error.OperationNotSupported,
                            .PERM => error.PermissionDenied,
                            .PROTO => error.ProtocolFailure,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        break :blk @intCast(os.socket_t, completion.result);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .close => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {}, // A success, see https://github.com/ziglang/zig/issues/2425
                            .BADF => error.FileDescriptorInvalid,
                            .DQUOT => error.DiskQuota,
                            .IO => error.InputOutput,
                            .NOSPC => error.NoSpaceLeft,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        assert(completion.result == 0);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .connect => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .ACCES => error.AccessDenied,
                            .ADDRINUSE => error.AddressInUse,
                            .ADDRNOTAVAIL => error.AddressNotAvailable,
                            .AFNOSUPPORT => error.AddressFamilyNotSupported,
                            .AGAIN, .INPROGRESS => error.WouldBlock,
                            .ALREADY => error.OpenAlreadyInProgress,
                            .BADF => error.FileDescriptorInvalid,
                            .CONNREFUSED => error.ConnectionRefused,
                            .CONNRESET => error.ConnectionResetByPeer,
                            .FAULT => unreachable,
                            .ISCONN => error.AlreadyConnected,
                            .NETUNREACH => error.NetworkUnreachable,
                            .NOENT => error.FileNotFound,
                            .NOTSOCK => error.FileDescriptorNotASocket,
                            .PERM => error.PermissionDenied,
                            .PROTOTYPE => error.ProtocolNotSupported,
                            .TIMEDOUT => error.ConnectionTimedOut,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        assert(completion.result == 0);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .read => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .AGAIN => error.WouldBlock,
                            .BADF => error.NotOpenForReading,
                            .CONNRESET => error.ConnectionResetByPeer,
                            .FAULT => unreachable,
                            .INVAL => error.Alignment,
                            .IO => error.InputOutput,
                            .ISDIR => error.IsDir,
                            .NOBUFS => error.SystemResources,
                            .NOMEM => error.SystemResources,
                            .NXIO => error.Unseekable,
                            .OVERFLOW => error.Unseekable,
                            .SPIPE => error.Unseekable,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        break :blk @intCast(usize, completion.result);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .recv => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .AGAIN => error.WouldBlock,
                            .BADF => error.FileDescriptorInvalid,
                            .CONNREFUSED => error.ConnectionRefused,
                            .FAULT => unreachable,
                            .INVAL => unreachable,
                            .NOMEM => error.SystemResources,
                            .NOTCONN => error.SocketNotConnected,
                            .NOTSOCK => error.FileDescriptorNotASocket,
                            .CONNRESET => error.ConnectionResetByPeer,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        break :blk @intCast(usize, completion.result);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .send => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .ACCES => error.AccessDenied,
                            .AGAIN => error.WouldBlock,
                            .ALREADY => error.FastOpenAlreadyInProgress,
                            .AFNOSUPPORT => error.AddressFamilyNotSupported,
                            .BADF => error.FileDescriptorInvalid,
                            .CONNRESET => error.ConnectionResetByPeer,
                            .DESTADDRREQ => unreachable,
                            .FAULT => unreachable,
                            .INVAL => unreachable,
                            .ISCONN => unreachable,
                            .MSGSIZE => error.MessageTooBig,
                            .NOBUFS => error.SystemResources,
                            .NOMEM => error.SystemResources,
                            .NOTCONN => error.SocketNotConnected,
                            .NOTSOCK => error.FileDescriptorNotASocket,
                            .OPNOTSUPP => error.OperationNotSupported,
                            .PIPE => error.BrokenPipe,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        break :blk @intCast(usize, completion.result);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
            .timeout => {
                // A timeout always completes with a (negated) errno; 0 is impossible.
                assert(completion.result < 0);
                const result = switch (@intToEnum(os.E, -completion.result)) {
                    .INTR => {
                        completion.io.enqueue(completion);
                        return;
                    },
                    .CANCELED => error.Canceled,
                    .TIME => {}, // A success.
                    else => |errno| os.unexpectedErrno(errno),
                };
                completion.callback(completion.context, completion, &result);
            },
            .write => {
                const result = blk: {
                    if (completion.result < 0) {
                        const err = switch (@intToEnum(os.E, -completion.result)) {
                            .INTR => {
                                completion.io.enqueue(completion);
                                return;
                            },
                            .AGAIN => error.WouldBlock,
                            .BADF => error.NotOpenForWriting,
                            .DESTADDRREQ => error.NotConnected,
                            .DQUOT => error.DiskQuota,
                            .FAULT => unreachable,
                            .FBIG => error.FileTooBig,
                            .INVAL => error.Alignment,
                            .IO => error.InputOutput,
                            .NOSPC => error.NoSpaceLeft,
                            .NXIO => error.Unseekable,
                            .OVERFLOW => error.Unseekable,
                            .PERM => error.AccessDenied,
                            .PIPE => error.BrokenPipe,
                            .SPIPE => error.Unseekable,
                            else => |errno| os.unexpectedErrno(errno),
                        };
                        break :blk err;
                    } else {
                        break :blk @intCast(usize, completion.result);
                    }
                };
                completion.callback(completion.context, completion, &result);
            },
        }
    }
};
|
|
453
|
+
|
|
454
|
+
/// Encodes every supported operation together with its arguments, tagged so that
/// `prep()` and `complete()` can dispatch on the active variant.
const Operation = union(enum) {
    accept: struct {
        socket: os.socket_t,
        // Filled in by the kernel with the peer's address on completion.
        address: os.sockaddr = undefined,
        address_size: os.socklen_t = @sizeOf(os.sockaddr),
    },
    close: struct {
        fd: os.fd_t,
    },
    connect: struct {
        socket: os.socket_t,
        address: std.net.Address,
    },
    read: struct {
        fd: os.fd_t,
        buffer: []u8,
        offset: u64,
    },
    recv: struct {
        socket: os.socket_t,
        buffer: []u8,
    },
    send: struct {
        socket: os.socket_t,
        buffer: []const u8,
    },
    timeout: struct {
        timespec: os.linux.kernel_timespec,
    },
    write: struct {
        fd: os.fd_t,
        buffer: []const u8,
        offset: u64,
    },
};
|
|
490
|
+
|
|
491
|
+
pub const AcceptError = error{
    WouldBlock,
    FileDescriptorInvalid,
    ConnectionAborted,
    SocketNotListening,
    ProcessFdQuotaExceeded,
    SystemFdQuotaExceeded,
    SystemResources,
    FileDescriptorNotASocket,
    OperationNotSupported,
    PermissionDenied,
    ProtocolFailure,
} || os.UnexpectedError;

/// Asynchronously accept a connection on `socket`, invoking `callback` with the new
/// socket (opened with CLOEXEC) or an AcceptError. `completion` must stay alive until
/// the callback fires.
pub fn accept(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: AcceptError!os.socket_t,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const AcceptError!os.socket_t, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .accept = .{
                .socket = socket,
                .address = undefined,
                .address_size = @sizeOf(os.sockaddr),
            },
        },
    };
    self.enqueue(completion);
}
|
|
539
|
+
|
|
540
|
+
pub const CloseError = error{
    FileDescriptorInvalid,
    DiskQuota,
    InputOutput,
    NoSpaceLeft,
} || os.UnexpectedError;

/// Asynchronously close `fd`, invoking `callback` when the kernel has released it.
/// `completion` must stay alive until the callback fires.
pub fn close(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: CloseError!void,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const CloseError!void, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .close = .{ .fd = fd },
        },
    };
    self.enqueue(completion);
}
|
|
577
|
+
|
|
578
|
+
pub const ConnectError = error{
    AccessDenied,
    AddressInUse,
    AddressNotAvailable,
    AddressFamilyNotSupported,
    WouldBlock,
    OpenAlreadyInProgress,
    FileDescriptorInvalid,
    ConnectionRefused,
    AlreadyConnected,
    NetworkUnreachable,
    FileNotFound,
    FileDescriptorNotASocket,
    PermissionDenied,
    ProtocolNotSupported,
    ConnectionTimedOut,
} || os.UnexpectedError;

/// Asynchronously connect `socket` to `address`, invoking `callback` once the
/// connection is established or fails. `completion` must stay alive until then.
pub fn connect(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: ConnectError!void,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    address: std.net.Address,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const ConnectError!void, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .connect = .{
                .socket = socket,
                .address = address,
            },
        },
    };
    self.enqueue(completion);
}
|
|
630
|
+
|
|
631
|
+
pub const ReadError = error{
    WouldBlock,
    NotOpenForReading,
    ConnectionResetByPeer,
    Alignment,
    InputOutput,
    IsDir,
    SystemResources,
    Unseekable,
} || os.UnexpectedError;

/// Asynchronously read from `fd` at `offset` into `buffer`, invoking `callback` with
/// the number of bytes read. `completion` and `buffer` must stay alive until then.
pub fn read(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: ReadError!usize,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
    buffer: []u8,
    offset: u64,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const ReadError!usize, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .read = .{
                .fd = fd,
                .buffer = buffer,
                .offset = offset,
            },
        },
    };
    self.enqueue(completion);
}
|
|
678
|
+
|
|
679
|
+
pub const RecvError = error{
    WouldBlock,
    FileDescriptorInvalid,
    ConnectionRefused,
    SystemResources,
    SocketNotConnected,
    FileDescriptorNotASocket,
} || os.UnexpectedError;

/// Asynchronously receive from `socket` into `buffer`, invoking `callback` with the
/// number of bytes received. `completion` and `buffer` must stay alive until then.
pub fn recv(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: RecvError!usize,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    buffer: []u8,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const RecvError!usize, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .recv = .{
                .socket = socket,
                .buffer = buffer,
            },
        },
    };
    self.enqueue(completion);
}
|
|
722
|
+
|
|
723
|
+
pub const SendError = error{
    AccessDenied,
    WouldBlock,
    FastOpenAlreadyInProgress,
    AddressFamilyNotSupported,
    FileDescriptorInvalid,
    ConnectionResetByPeer,
    MessageTooBig,
    SystemResources,
    SocketNotConnected,
    FileDescriptorNotASocket,
    OperationNotSupported,
    BrokenPipe,
} || os.UnexpectedError;

/// Asynchronously send `buffer` on `socket`, invoking `callback` with the number of
/// bytes sent. `completion` and `buffer` must stay alive until then.
pub fn send(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: SendError!usize,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    buffer: []const u8,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const SendError!usize, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .send = .{
                .socket = socket,
                .buffer = buffer,
            },
        },
    };
    self.enqueue(completion);
}
|
|
772
|
+
|
|
773
|
+
pub const TimeoutError = error{Canceled} || os.UnexpectedError;

/// Asynchronously wait `nanoseconds` (relative), invoking `callback` when the timeout
/// expires or is canceled. `completion` must stay alive until then.
pub fn timeout(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: TimeoutError!void,
    ) void,
    completion: *Completion,
    nanoseconds: u63,
) void {
    completion.* = .{
        .io = self,
        .context = context,
        // A comptime-generated thunk that restores the erased context/result types:
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const TimeoutError!void, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .timeout = .{
                .timespec = .{ .tv_sec = 0, .tv_nsec = nanoseconds },
            },
        },
    };
    self.enqueue(completion);
}
|
|
807
|
+
|
|
808
|
+
/// Errors a write operation can complete with, plus any unexpected errno.
pub const WriteError = error{
    WouldBlock,
    NotOpenForWriting,
    NotConnected,
    DiskQuota,
    FileTooBig,
    Alignment,
    InputOutput,
    NoSpaceLeft,
    Unseekable,
    AccessDenied,
    BrokenPipe,
} || os.UnexpectedError;
|
|
821
|
+
|
|
822
|
+
/// Queues an asynchronous positional write of `buffer` to `fd` at `offset`.
/// The callback receives the number of bytes written (which may be fewer
/// than `buffer.len`). `buffer` must remain valid until the callback fires.
pub fn write(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: WriteError!usize,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
    buffer: []const u8,
    offset: u64,
) void {
    // NOTE: the `_ = callback;` discard that was here was dead code —
    // `callback` is referenced by the wrapper below — and inconsistent with
    // send()/timeout(), so it has been removed.
    completion.* = .{
        .io = self,
        .context = context,
        // Type-erasing wrapper: recovers the Context pointer and the typed
        // result before invoking the user's comptime-known callback.
        .callback = struct {
            fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
                callback(
                    @intToPtr(Context, @ptrToInt(ctx)),
                    comp,
                    @intToPtr(*const WriteError!usize, @ptrToInt(res)).*,
                );
            }
        }.wrapper,
        .operation = .{
            .write = .{
                .fd = fd,
                .buffer = buffer,
                .offset = offset,
            },
        },
    };
    self.enqueue(completion);
}
|
|
860
|
+
|
|
861
|
+
/// Sentinel for an invalid socket descriptor (POSIX file descriptors are
/// non-negative, so -1 can never be a valid socket).
pub const INVALID_SOCKET = -1;
|
|
862
|
+
|
|
863
|
+
/// Creates a socket that can be used for async operations with the IO instance.
pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t {
    // No per-instance state is needed: the socket is created via a direct
    // socket(2) call; only subsequent operations go through the event loop.
    _ = self;
    return os.socket(family, sock_type, protocol);
}
|
|
868
|
+
|
|
869
|
+
/// Opens a directory with read only access.
/// The returned fd is suitable as the `dir_fd` argument of open_file().
pub fn open_dir(dir_path: [:0]const u8) !os.fd_t {
    // CLOEXEC: do not leak the directory fd into child processes.
    return os.openZ(dir_path, os.O.CLOEXEC | os.O.RDONLY, 0);
}
|
|
873
|
+
|
|
874
|
+
/// Opens or creates a journal file:
/// - For reading and writing.
/// - For Direct I/O (if possible in development mode, but required in production mode).
/// - Obtains an advisory exclusive lock to the file descriptor.
/// - Allocates the file contiguously on disk if this is supported by the file system.
/// - Ensures that the file data (and file inode in the parent directory) is durable on disk.
///   The caller is responsible for ensuring that the parent directory inode is durable.
/// - Verifies that the file size matches the expected file size before returning.
///
/// `dir_fd` must be the parent directory of `relative_path` (used by openat(2)
/// and fsynced for inode durability). `size` must be a positive multiple of
/// `config.sector_size` (asserted). When `must_create` is set, the open fails
/// if the file already exists (O_EXCL).
pub fn open_file(
    self: *IO,
    dir_fd: os.fd_t,
    relative_path: [:0]const u8,
    size: u64,
    must_create: bool,
) !os.fd_t {
    // No IO-instance state is used; all work here is synchronous.
    _ = self;

    assert(relative_path.len > 0);
    assert(size >= config.sector_size);
    assert(size % config.sector_size == 0);

    // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
    // This is much stronger than an advisory exclusive lock, and is required on some platforms.

    var flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC;
    var mode: os.mode_t = 0;

    // TODO Document this and investigate whether this is in fact correct to set here.
    if (@hasDecl(os.O, "LARGEFILE")) flags |= os.O.LARGEFILE;

    var direct_io_supported = false;
    if (config.direct_io) {
        direct_io_supported = try fs_supports_direct_io(dir_fd);
        if (direct_io_supported) {
            flags |= os.O.DIRECT;
        } else if (config.deployment_environment == .development) {
            log.warn("file system does not support Direct I/O", .{});
        } else {
            // We require Direct I/O for safety to handle fsync failure correctly, and therefore
            // panic in production if it is not supported.
            @panic("file system does not support Direct I/O");
        }
    }

    if (must_create) {
        log.info("creating \"{s}\"...", .{relative_path});
        flags |= os.O.CREAT;
        flags |= os.O.EXCL;
        mode = 0o666;
    } else {
        log.info("opening \"{s}\"...", .{relative_path});
    }

    // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
    assert((flags & os.O.DSYNC) > 0);

    // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
    assert(!std.fs.path.isAbsolute(relative_path));
    const fd = try os.openatZ(dir_fd, relative_path, flags, mode);
    // TODO Return a proper error message when the path exists or does not exist (init/start).
    errdefer os.close(fd);

    // TODO Check that the file is actually a file.

    // Obtain an advisory exclusive lock that works only if all processes actually use flock().
    // LOCK_NB means that we want to fail the lock without waiting if another process has it.
    os.flock(fd, os.LOCK.EX | os.LOCK.NB) catch |err| switch (err) {
        error.WouldBlock => @panic("another process holds the data file lock"),
        else => return err,
    };

    // Ask the file system to allocate contiguous sectors for the file (if possible):
    // If the file system does not support `fallocate()`, then this could mean more seeks or a
    // panic if we run out of disk space (ENOSPC).
    if (must_create) {
        log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
        fs_allocate(fd, size) catch |err| switch (err) {
            error.OperationNotSupported => {
                log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
                log.info("allocating by writing to the last sector of the file instead...", .{});

                const sector_size = config.sector_size;
                const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;

                // Handle partial writes where the physical sector is less than a logical sector:
                const write_offset = size - sector.len;
                var written: usize = 0;
                while (written < sector.len) {
                    written += try os.pwrite(fd, sector[written..], write_offset + written);
                }
            },
            else => |e| return e,
        };
    }

    // The best fsync strategy is always to fsync before reading because this prevents us from
    // making decisions on data that was never durably written by a previously crashed process.
    // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
    // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
    try os.fsync(fd);

    // We fsync the parent directory to ensure that the file inode is durably written.
    // The caller is responsible for the parent directory inode stored under the grandparent.
    // We always do this when opening because we don't know if this was done before crashing.
    try os.fsync(dir_fd);

    // Detect truncation/corruption of a pre-existing data file before use.
    const stat = try os.fstat(fd);
    if (stat.size != size) @panic("data file inode size was truncated or corrupted");

    return fd;
}
|
|
985
|
+
|
|
986
|
+
/// Detects whether the underlying file system for a given directory fd supports Direct I/O.
/// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
/// Creates (and deletes) a scratch file named "fs_supports_direct_io" in the directory.
fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
    // Probe the namespace that this function (and open_file) actually uses:
    // the flag lives at `os.O.DIRECT`, not `std.os.O_DIRECT`. Probing the
    // wrong namespace made this check always false, so Direct I/O was never
    // detected and production startup would panic.
    if (!@hasDecl(os.O, "DIRECT")) return false;

    // Create a scratch file, then try to reopen it with O_DIRECT:
    // EINVAL from openat(2) means the file system rejects Direct I/O.
    const path = "fs_supports_direct_io";
    const dir = std.fs.Dir{ .fd = dir_fd };
    const fd = try os.openatZ(dir_fd, path, os.O.CLOEXEC | os.O.CREAT | os.O.TRUNC, 0o666);
    defer os.close(fd);
    defer dir.deleteFile(path) catch {};

    while (true) {
        const res = os.system.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0);
        // Use the errno enum form, consistent with fs_allocate() below.
        switch (os.linux.getErrno(res)) {
            .SUCCESS => {
                os.close(@intCast(os.fd_t, res));
                return true;
            },
            // Interrupted by a signal: retry the syscall.
            .INTR => continue,
            .INVAL => return false,
            else => |err| return os.unexpectedErrno(err),
        }
    }
}
|
|
1010
|
+
|
|
1011
|
+
/// Allocates a file contiguously using fallocate() if supported.
/// Alternatively, writes to the last sector so that at least the file size is correct.
/// Returns error.OperationNotSupported when the file system lacks fallocate()
/// support (open_file() falls back to writing the last sector in that case).
fn fs_allocate(fd: os.fd_t, size: u64) !void {
    // mode = 0: default behavior — allocate and extend the file size,
    // no FALLOC_FL_* flags.
    const mode: i32 = 0;
    const offset: i64 = 0;
    const length = @intCast(i64, size);

    // Retry loop: only EINTR loops; every other errno returns.
    while (true) {
        const rc = os.linux.fallocate(fd, mode, offset, length);
        switch (os.linux.getErrno(rc)) {
            .SUCCESS => return,
            .BADF => return error.FileDescriptorInvalid,
            .FBIG => return error.FileTooBig,
            // Interrupted by a signal: retry the syscall.
            .INTR => continue,
            .INVAL => return error.ArgumentsInvalid,
            .IO => return error.InputOutput,
            .NODEV => return error.NoDevice,
            .NOSPC => return error.NoSpaceLeft,
            // Kernel does not implement fallocate() at all.
            .NOSYS => return error.SystemOutdated,
            // File system does not support fallocate() for this file.
            .OPNOTSUPP => return error.OperationNotSupported,
            .PERM => return error.PermissionDenied,
            .SPIPE => return error.Unseekable,
            .TXTBSY => return error.FileBusy,
            else => |errno| return os.unexpectedErrno(errno),
        }
    }
}
|
|
1038
|
+
};
|