tigerbeetle-node 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +3 -4
  2. package/package.json +1 -1
  3. package/src/node.zig +2 -12
  4. package/src/tigerbeetle/scripts/benchmark.bat +46 -0
  5. package/src/tigerbeetle/scripts/install_zig.bat +2 -2
  6. package/src/tigerbeetle/scripts/install_zig.sh +1 -1
  7. package/src/tigerbeetle/scripts/vopr.sh +2 -2
  8. package/src/tigerbeetle/src/benchmark.zig +2 -6
  9. package/src/tigerbeetle/src/cli.zig +39 -18
  10. package/src/tigerbeetle/src/config.zig +24 -9
  11. package/src/tigerbeetle/src/demo.zig +1 -1
  12. package/src/tigerbeetle/src/io/benchmark.zig +24 -49
  13. package/src/tigerbeetle/src/io/darwin.zig +175 -44
  14. package/src/tigerbeetle/src/io/linux.zig +177 -72
  15. package/src/tigerbeetle/src/io/test.zig +61 -39
  16. package/src/tigerbeetle/src/io/windows.zig +1161 -0
  17. package/src/tigerbeetle/src/io.zig +2 -0
  18. package/src/tigerbeetle/src/main.zig +13 -8
  19. package/src/tigerbeetle/src/message_bus.zig +49 -61
  20. package/src/tigerbeetle/src/message_pool.zig +63 -57
  21. package/src/tigerbeetle/src/ring_buffer.zig +7 -0
  22. package/src/tigerbeetle/src/simulator.zig +4 -4
  23. package/src/tigerbeetle/src/storage.zig +0 -230
  24. package/src/tigerbeetle/src/test/cluster.zig +3 -6
  25. package/src/tigerbeetle/src/test/message_bus.zig +4 -3
  26. package/src/tigerbeetle/src/test/network.zig +13 -16
  27. package/src/tigerbeetle/src/test/state_checker.zig +3 -2
  28. package/src/tigerbeetle/src/tigerbeetle.zig +5 -3
  29. package/src/tigerbeetle/src/time.zig +58 -11
  30. package/src/tigerbeetle/src/vsr/client.zig +18 -32
  31. package/src/tigerbeetle/src/vsr/clock.zig +1 -1
  32. package/src/tigerbeetle/src/vsr/journal.zig +2 -6
  33. package/src/tigerbeetle/src/vsr/replica.zig +146 -169
  34. package/src/tigerbeetle/src/vsr.zig +263 -5
@@ -0,0 +1,1161 @@
1
+ const std = @import("std");
2
+ const os = std.os;
3
+ const assert = std.debug.assert;
4
+ const log = std.log.scoped(.io);
5
+ const config = @import("../config.zig");
6
+
7
+ const FIFO = @import("../fifo.zig").FIFO;
8
+ const Time = @import("../time.zig").Time;
9
+ const buffer_limit = @import("../io.zig").buffer_limit;
10
+
11
+ pub const IO = struct {
12
+ iocp: os.windows.HANDLE,
13
+ timer: Time = .{},
14
+ io_pending: usize = 0,
15
+ timeouts: FIFO(Completion) = .{},
16
+ completed: FIFO(Completion) = .{},
17
+
18
/// Starts Winsock (requesting version 2.2) and creates a fresh I/O completion port.
/// `entries` and `flags` are accepted but ignored by this implementation.
/// If the completion port cannot be created, Winsock is cleaned up again
/// before the error is returned.
pub fn init(entries: u12, flags: u32) !IO {
    _ = flags;
    _ = entries;

    _ = try os.windows.WSAStartup(2, 2);
    errdefer os.windows.WSACleanup() catch unreachable;

    return IO{
        .iocp = try os.windows.CreateIoCompletionPort(
            os.windows.INVALID_HANDLE_VALUE,
            null,
            0,
            0,
        ),
    };
}
28
+
29
/// Closes the completion port handle, marks it as invalid, and shuts Winsock down.
/// Asserts that the handle has not already been invalidated.
pub fn deinit(self: *IO) void {
    const invalid = os.windows.INVALID_HANDLE_VALUE;
    assert(self.iocp != invalid);

    os.windows.CloseHandle(self.iocp);
    self.iocp = invalid;

    os.windows.WSACleanup() catch unreachable;
}
36
+
37
/// Performs a single non-blocking `flush` pass.
pub fn tick(self: *IO) !void {
    try self.flush(.non_blocking);
}
40
+
41
/// Submits a timeout of `nanoseconds` on a stack-allocated completion and keeps
/// flushing in blocking mode until that timeout's callback has run.
pub fn run_for_ns(self: *IO, nanoseconds: u63) !void {
    const Expiry = struct {
        fn fired(flag: *bool, completion: *Completion, result: TimeoutError!void) void {
            _ = completion;
            // An error result is treated as impossible here.
            _ = result catch unreachable;
            flag.* = true;
        }
    };

    var expired = false;
    var timer_completion: Completion = undefined;
    self.timeout(*bool, &expired, Expiry.fired, &timer_completion, nanoseconds);

    while (!expired) try self.flush(.blocking);
}
58
+
59
/// Selects how `flush` polls the completion port: `blocking` waits up to the
/// nearest timeout deadline, `non_blocking` polls with a zero timeout.
const FlushMode = enum { blocking, non_blocking };
63
+
64
/// Runs one pass of the event loop.
///
/// When nothing is queued as ready yet, expired timeouts are moved to the ready
/// queue and, if overlapped IO is outstanding and still nothing is ready, the
/// completion port is polled for up to 64 events. In `.blocking` mode the poll
/// waits until the nearest timeout deadline (and panics if there is no timeout
/// to bound the wait); in `.non_blocking` mode it returns immediately.
/// Afterwards the callback of every ready completion is invoked.
///
/// Errors from `GetQueuedCompletionStatusEx` other than a timeout are returned.
fn flush(self: *IO, mode: FlushMode) !void {
    if (self.completed.peek() == null) {
        // Compute how long to poll by flushing timeout completions.
        // NOTE: this may push to the completed queue.
        var timeout_ms: ?os.windows.DWORD = null;
        if (self.flush_timeouts()) |expires_ns| {
            // 0ns expires should have been completed, not returned.
            assert(expires_ns != 0);
            // Round up to whole milliseconds (ceiling division). Rounding to the
            // nearest millisecond would turn an expiry below 0.5ms into a 0ms poll,
            // making a blocking caller spin until the deadline passes.
            const expires_ms = (expires_ns + (std.time.ns_per_ms - 1)) / std.time.ns_per_ms;
            // Saturating cast to DWORD milliseconds.
            const expires = std.math.cast(os.windows.DWORD, expires_ms) catch std.math.maxInt(os.windows.DWORD);
            // max DWORD is reserved for INFINITE so cap the cast at max - 1.
            timeout_ms = if (expires == os.windows.INFINITE) expires - 1 else expires;
        }

        // Poll for IO iff there's IO pending and flush_timeouts() found no ready completions.
        if (self.io_pending > 0 and self.completed.peek() == null) {
            // In blocking mode, we're always waiting at least until the timeout by run_for_ns.
            // In non-blocking mode, we shouldn't wait at all.
            const io_timeout = switch (mode) {
                .blocking => timeout_ms orelse @panic("IO.flush blocking unbounded"),
                .non_blocking => 0,
            };

            var events: [64]os.windows.OVERLAPPED_ENTRY = undefined;
            const num_events = os.windows.GetQueuedCompletionStatusEx(
                self.iocp,
                &events,
                io_timeout,
                false, // non-alertable wait
            ) catch |err| switch (err) {
                error.Timeout => 0,
                error.Aborted => unreachable,
                else => |e| return e,
            };

            // Every dequeued event corresponds to one operation counted in io_pending.
            assert(self.io_pending >= num_events);
            self.io_pending -= num_events;

            for (events[0..num_events]) |event| {
                // Recover the Completion from the OVERLAPPED embedded in Completion.Overlapped.
                const raw_overlapped = event.lpOverlapped;
                const overlapped = @fieldParentPtr(Completion.Overlapped, "raw", raw_overlapped);
                const completion = overlapped.completion;
                completion.next = null;
                self.completed.push(completion);
            }
        }
    }

    // Dequeue and invoke all the completions currently ready.
    // Must read all `completions` before invoking the callbacks
    // as the callbacks could potentially submit more completions.
    var completed = self.completed;
    self.completed = .{};
    while (completed.pop()) |completion| {
        (completion.callback)(Completion.Context{
            .io = self,
            .completion = completion,
        });
    }
}
126
+
127
/// Walks the timeouts queue, moving every timeout whose deadline has been
/// reached onto the completed queue. Returns the smallest remaining time (in
/// the timer's units) among the timeouts still waiting, or null if none remain.
/// The clock is read at most once per call, and only if the queue is non-empty.
fn flush_timeouts(self: *IO) ?u64 {
    var nearest: ?u64 = null;
    var cached_now: ?u64 = null;
    var cursor: ?*Completion = self.timeouts.peek();

    while (cursor) |completion| {
        // Advance first: the completion may be unlinked from this queue below.
        cursor = completion.next;

        // Read the clock lazily, then reuse the same reading for the rest of the walk.
        if (cached_now == null) cached_now = self.timer.monotonic();
        const now = cached_now.?;

        const deadline = completion.operation.timeout.deadline;
        if (now >= deadline) {
            // Expired: hand it over to the completed queue.
            self.timeouts.remove(completion);
            self.completed.push(completion);
            continue;
        }

        // Still waiting: track the nearest expiry.
        const remaining = deadline - now;
        nearest = if (nearest) |best| std.math.min(remaining, best) else remaining;
    }

    return nearest;
}
158
+
159
/// This struct holds the data needed for a single IO operation.
/// The caller provides the storage; `submit` fills in every field.
pub const Completion = struct {
    /// Link to the next completion in whichever queue currently holds this one.
    next: ?*Completion,
    /// Type-erased user context; `submit` casts it back to the caller's type
    /// before invoking the user callback.
    context: ?*anyopaque,
    /// Wrapper generated by `submit`; invoked by `flush` to perform (or poll) the
    /// operation and then call the user callback.
    callback: fn (Context) void,
    /// Per-operation state, tagged by the kind of operation.
    operation: Operation,

    /// Argument handed to `callback` by `flush`.
    const Context = struct {
        io: *IO,
        completion: *Completion,
    };

    /// An OVERLAPPED paired with a back-pointer to its owning Completion, so that
    /// `flush` can recover the Completion from the OVERLAPPED pointer reported by
    /// the completion port.
    const Overlapped = struct {
        raw: os.windows.OVERLAPPED,
        completion: *Completion,
    };

    /// State shared by `send` and `recv`.
    const Transfer = struct {
        socket: os.socket_t,
        buf: os.windows.ws2_32.WSABUF,
        overlapped: Overlapped,
        /// False until the overlapped operation has been started; once true, the
        /// operation polls for the overlapped result instead of starting again.
        pending: bool,
    };

    const Operation = union(enum) {
        accept: struct {
            overlapped: Overlapped,
            listen_socket: os.socket_t,
            /// INVALID_SOCKET until the accept has been started.
            client_socket: os.socket_t,
            /// Address buffer passed to AcceptEx; sized for two address slots of
            /// `@sizeOf(std.net.Address) + 16` bytes each.
            addr_buffer: [(@sizeOf(std.net.Address) + 16) * 2]u8 align(4),
        },
        connect: struct {
            socket: os.socket_t,
            address: std.net.Address,
            overlapped: Overlapped,
            /// False until the connect has been started.
            pending: bool,
        },
        send: Transfer,
        recv: Transfer,
        read: struct {
            fd: os.fd_t,
            buf: [*]u8,
            len: u32,
            offset: u64,
        },
        write: struct {
            fd: os.fd_t,
            buf: [*]const u8,
            len: u32,
            offset: u64,
        },
        close: struct {
            fd: os.fd_t,
        },
        timeout: struct {
            /// Absolute reading of the IO timer at which the timeout expires.
            deadline: u64,
        },
    };
};
218
+
219
/// Initializes `completion` for operation `op_tag` with payload `op_data` and
/// queues it: timeouts go onto the timeouts queue, everything else onto the
/// completed queue so that `flush` runs it on its next pass.
///
/// The stored callback is a generated wrapper which runs
/// `OperationImpl.do_operation`, and then either:
/// - counts the operation as pending and returns without calling `callback`
///   when an overlapped-capable operation reports `error.WouldBlock` (the
///   completion port is expected to deliver it later), or
/// - calls the user `callback` with the original context, the completion, and
///   the operation's result.
fn submit(
    self: *IO,
    context: anytype,
    comptime callback: anytype,
    completion: *Completion,
    comptime op_tag: std.meta.Tag(Completion.Operation),
    op_data: anytype,
    comptime OperationImpl: type,
) void {
    const Context = @TypeOf(context);
    const Trampoline = struct {
        fn onComplete(ctx: Completion.Context) void {
            // Run (or poll) the operation against its own payload.
            const payload = &@field(ctx.completion.operation, @tagName(op_tag));
            const outcome = OperationImpl.do_operation(ctx, payload);

            // Only these operations have WouldBlock in their error sets, so the
            // check is selected at compile time by the operation tag.
            switch (op_tag) {
                .accept, .read, .recv, .connect, .write, .send => {
                    if (outcome) |_| {} else |err| switch (err) {
                        error.WouldBlock => {
                            ctx.io.io_pending += 1;
                            return;
                        },
                        else => {},
                    }
                },
                else => {},
            }

            // The result is final: restore the typed context and notify the user.
            const user_context = @intToPtr(Context, @ptrToInt(ctx.completion.context));
            callback(user_context, ctx.completion, outcome);
        }
    };

    completion.* = .{
        .next = null,
        .context = @ptrCast(?*anyopaque, context),
        .callback = Trampoline.onComplete,
        .operation = @unionInit(Completion.Operation, @tagName(op_tag), op_data),
    };

    if (op_tag == .timeout) {
        self.timeouts.push(completion);
    } else {
        self.completed.push(completion);
    }
}
272
+
273
/// Error set reported to `accept` callbacks.
pub const AcceptError = os.AcceptError || os.SetSockOptError;

/// Queues an overlapped accept on the listening `socket`.
///
/// On the first run a new TCP/IPv4 client socket is created through
/// `open_socket` and `AcceptEx` is started with it; on later runs (after the
/// completion port reported the operation) the overlapped result is fetched.
/// `callback` receives the accepted client socket on success. If the accept
/// fails with anything other than `error.WouldBlock`, the client socket that
/// was created for it is closed before the error is reported.
pub fn accept(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: AcceptError!os.socket_t,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
) void {
    self.submit(
        context,
        callback,
        completion,
        .accept,
        .{
            .overlapped = undefined,
            .listen_socket = socket,
            .client_socket = INVALID_SOCKET,
            .addr_buffer = undefined,
        },
        struct {
            fn do_operation(ctx: Completion.Context, op: anytype) AcceptError!os.socket_t {
                var flags: os.windows.DWORD = undefined;
                var transferred: os.windows.DWORD = undefined;

                const rc = if (op.client_socket == INVALID_SOCKET) blk: {
                    // When first called, the client_socket is invalid so we start the op.
                    // Create the socket that will be used for accept.
                    op.client_socket = ctx.io.open_socket(
                        os.AF.INET,
                        os.SOCK.STREAM,
                        os.IPPROTO.TCP,
                    ) catch |err| switch (err) {
                        error.AddressFamilyNotSupported, error.ProtocolNotSupported => unreachable,
                        else => |e| return e,
                    };

                    var sync_bytes_read: os.windows.DWORD = undefined;
                    op.overlapped = .{
                        .raw = std.mem.zeroes(os.windows.OVERLAPPED),
                        .completion = ctx.completion,
                    };

                    // Start the asynchronous accept with the created socket.
                    break :blk os.windows.ws2_32.AcceptEx(
                        op.listen_socket,
                        op.client_socket,
                        &op.addr_buffer,
                        0,
                        @sizeOf(std.net.Address) + 16,
                        @sizeOf(std.net.Address) + 16,
                        &sync_bytes_read,
                        &op.overlapped.raw,
                    );
                } else
                // Called after accept was started, so get the result.
                os.windows.ws2_32.WSAGetOverlappedResult(
                    op.listen_socket,
                    &op.overlapped.raw,
                    &transferred,
                    os.windows.FALSE, // dont wait
                    &flags,
                );

                // Return the socket if we succeed in accepting.
                if (rc != os.windows.FALSE) {
                    // enables getsockopt, setsockopt, getsockname, getpeername
                    _ = os.windows.ws2_32.setsockopt(
                        op.client_socket,
                        os.windows.ws2_32.SOL.SOCKET,
                        os.windows.ws2_32.SO.UPDATE_ACCEPT_CONTEXT,
                        null,
                        0,
                    );

                    return op.client_socket;
                }

                // Destroy the client_socket we created if we get a non WouldBlock error.
                // WouldBlock means the accept is still in flight and needs the socket.
                errdefer |err| {
                    if (err != error.WouldBlock) {
                        os.closeSocket(op.client_socket);
                        op.client_socket = INVALID_SOCKET;
                    }
                }

                return switch (os.windows.ws2_32.WSAGetLastError()) {
                    .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock,
                    .WSANOTINITIALISED => unreachable, // WSAStartup() was called
                    .WSAENETDOWN => unreachable, // WinSock error
                    .WSAENOTSOCK => error.FileDescriptorNotASocket,
                    .WSAEOPNOTSUPP => error.OperationNotSupported,
                    .WSA_INVALID_HANDLE => unreachable, // we dont use hEvent in OVERLAPPED
                    .WSAEFAULT, .WSA_INVALID_PARAMETER => unreachable, // params should be ok
                    .WSAECONNRESET => error.ConnectionAborted,
                    .WSAEMFILE => unreachable, // we create our own descriptor so its available
                    .WSAENOBUFS => error.SystemResources,
                    .WSAEINTR, .WSAEINPROGRESS => unreachable, // no blocking calls
                    else => |err| os.windows.unexpectedWSAError(err),
                };
            }
        },
    );
}
387
+
388
/// Error set reported to `close` callbacks.
pub const CloseError = error{ FileDescriptorInvalid, DiskQuota, InputOutput, NoSpaceLeft } || os.UnexpectedError;
394
+
395
/// Error set reported to `connect` callbacks.
pub const ConnectError = os.ConnectError || error{FileDescriptorNotASocket};

/// Queues an overlapped connect of `socket` to `address`.
///
/// On the first run the socket is bound to the IPv4 wildcard address with port
/// 0, the `ConnectEx` extension function is looked up on the socket, and the
/// connect is started. On later runs the overlapped result is fetched instead.
/// `callback` is invoked once the connect has succeeded or failed.
pub fn connect(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: ConnectError!void,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    address: std.net.Address,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) ConnectError!void {
            var result_flags: os.windows.DWORD = undefined;
            var bytes_sent: os.windows.DWORD = undefined;

            const connected: os.windows.BOOL = if (op.pending)
                // The connect was already started: poll for its result.
                os.windows.ws2_32.WSAGetOverlappedResult(
                    op.socket,
                    &op.overlapped.raw,
                    &bytes_sent,
                    os.windows.FALSE, // dont wait
                    &result_flags,
                )
            else started: {
                // ConnectEx requires the socket to be initially bound (INADDR_ANY).
                const wildcard = std.net.Address.initIp4(std.mem.zeroes([4]u8), 0);
                os.bind(op.socket, &wildcard.any, wildcard.getOsSockLen()) catch |err| switch (err) {
                    error.AccessDenied,
                    error.SymLinkLoop,
                    error.NameTooLong,
                    error.NotDir,
                    error.ReadOnlyFileSystem,
                    error.NetworkSubsystemFailed,
                    error.AlreadyBound,
                    => unreachable,
                    else => |e| return e,
                };

                const LPFN_CONNECTEX = fn (
                    Socket: os.windows.ws2_32.SOCKET,
                    SockAddr: *const os.windows.ws2_32.sockaddr,
                    SockLen: os.socklen_t,
                    SendBuf: ?*const anyopaque,
                    SendBufLen: os.windows.DWORD,
                    BytesSent: *os.windows.DWORD,
                    Overlapped: *os.windows.OVERLAPPED,
                ) callconv(os.windows.WINAPI) os.windows.BOOL;

                // ConnectEx has to be looked up dynamically on the socket.
                const connect_ex = os.windows.loadWinsockExtensionFunction(
                    LPFN_CONNECTEX,
                    op.socket,
                    os.windows.ws2_32.WSAID_CONNECTEX,
                ) catch |err| switch (err) {
                    error.OperationNotSupported, error.ShortRead => unreachable,
                    else => |e| return e,
                };

                // Only mark the operation as started once bind and lookup succeeded.
                op.pending = true;
                op.overlapped = .{
                    .raw = std.mem.zeroes(os.windows.OVERLAPPED),
                    .completion = ctx.completion,
                };

                // Start the connect operation; no data is sent along with it.
                break :started (connect_ex)(
                    op.socket,
                    &op.address.any,
                    op.address.getOsSockLen(),
                    null,
                    0,
                    &bytes_sent,
                    &op.overlapped.raw,
                );
            };

            if (connected == os.windows.FALSE) {
                return switch (os.windows.ws2_32.WSAGetLastError()) {
                    .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE, .WSAEALREADY => error.WouldBlock,
                    .WSAEADDRNOTAVAIL => error.AddressNotAvailable,
                    .WSAEAFNOSUPPORT => error.AddressFamilyNotSupported,
                    .WSAECONNREFUSED => error.ConnectionRefused,
                    .WSAEHOSTUNREACH, .WSAENETUNREACH => error.NetworkUnreachable,
                    .WSAENOBUFS => error.SystemResources,
                    .WSAETIMEDOUT => error.ConnectionTimedOut,
                    .WSANOTINITIALISED => unreachable, // WSAStartup() was called
                    .WSAENETDOWN => unreachable, // network subsystem is down
                    .WSAEFAULT => unreachable, // all addresses should be valid
                    .WSAEINVAL => unreachable, // invalid socket type
                    .WSAENOTSOCK => unreachable, // socket is not bound or is listening
                    .WSA_INVALID_HANDLE => unreachable, // we dont use hEvent in OVERLAPPED
                    else => |err| os.windows.unexpectedWSAError(err),
                };
            }

            // Connected: enables getsockopt, setsockopt, getsockname, getpeername.
            _ = os.windows.ws2_32.setsockopt(
                op.socket,
                os.windows.ws2_32.SOL.SOCKET,
                os.windows.ws2_32.SO.UPDATE_CONNECT_CONTEXT,
                null,
                0,
            );
        }
    };

    self.submit(
        context,
        callback,
        completion,
        .connect,
        .{
            .socket = socket,
            .address = address,
            .overlapped = undefined,
            .pending = false,
        },
        Impl,
    );
}
529
+
530
/// Error set reported to `send` callbacks.
pub const SendError = os.SendError;

/// Queues an overlapped send of `buffer` on `socket`.
///
/// The length handed to Winsock is `buffer_limit(buffer.len)`. On the first run
/// `WSASend` is started; on later runs the overlapped result is fetched.
/// `callback` receives the number of bytes transferred on success.
pub fn send(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: SendError!usize,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    buffer: []const u8,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) SendError!usize {
            var result_flags: os.windows.DWORD = undefined;
            var bytes: os.windows.DWORD = undefined;

            const succeeded: os.windows.BOOL = if (op.pending)
                // The send was already started: poll for its result.
                os.windows.ws2_32.WSAGetOverlappedResult(
                    op.socket,
                    &op.overlapped.raw,
                    &bytes,
                    os.windows.FALSE, // dont wait
                    &result_flags,
                )
            else started: {
                op.pending = true;
                op.overlapped = .{
                    .raw = std.mem.zeroes(os.windows.OVERLAPPED),
                    .completion = ctx.completion,
                };

                // Start the send operation.
                const status = os.windows.ws2_32.WSASend(
                    op.socket,
                    @ptrCast([*]os.windows.ws2_32.WSABUF, &op.buf),
                    1, // one buffer
                    &bytes,
                    0, // no flags
                    &op.overlapped.raw,
                    null,
                );
                break :started switch (status) {
                    0 => @as(os.windows.BOOL, os.windows.TRUE),
                    os.windows.ws2_32.SOCKET_ERROR => os.windows.FALSE,
                    else => unreachable,
                };
            };

            if (succeeded == os.windows.FALSE) {
                return switch (os.windows.ws2_32.WSAGetLastError()) {
                    .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock,
                    .WSAECONNABORTED, .WSAECONNRESET, .WSAENETRESET => error.ConnectionResetByPeer,
                    .WSAEMSGSIZE => error.MessageTooBig,
                    .WSAENETDOWN => error.NetworkSubsystemFailed,
                    .WSAENOBUFS => error.SystemResources,
                    .WSAENOTCONN => error.FileDescriptorNotASocket,
                    .WSAESHUTDOWN => error.BrokenPipe,
                    .WSANOTINITIALISED => unreachable, // WSAStartup() was called
                    .WSA_INVALID_HANDLE => unreachable, // we dont use OVERLAPPED.hEvent
                    .WSA_INVALID_PARAMETER => unreachable, // parameters are fine
                    .WSAEFAULT => unreachable, // invalid buffer
                    .WSAEINTR => unreachable, // this is non blocking
                    .WSAEINPROGRESS => unreachable, // this is non blocking
                    .WSAEINVAL => unreachable, // invalid socket type
                    .WSAEOPNOTSUPP => unreachable, // we dont use MSG_OOB or MSG_PARTIAL
                    .WSA_OPERATION_ABORTED => unreachable, // operation was cancelled
                    else => |err| os.windows.unexpectedWSAError(err),
                };
            }

            // Success: report the bytes transferred.
            return bytes;
        }
    };

    self.submit(
        context,
        callback,
        completion,
        .send,
        Completion.Transfer{
            .socket = socket,
            .buf = os.windows.ws2_32.WSABUF{
                .len = @intCast(u32, buffer_limit(buffer.len)),
                // WSABUF wants a mutable pointer even though a send only reads.
                .buf = @intToPtr([*]u8, @ptrToInt(buffer.ptr)),
            },
            .overlapped = undefined,
            .pending = false,
        },
        Impl,
    );
}
629
+
630
/// Error set reported to `recv` callbacks.
pub const RecvError = os.RecvFromError;

/// Queues an overlapped receive into `buffer` on `socket`.
///
/// The length handed to Winsock is `buffer_limit(buffer.len)`. On the first run
/// `WSARecv` is started; on later runs the overlapped result is fetched.
/// `callback` receives the number of bytes received on success.
pub fn recv(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: RecvError!usize,
    ) void,
    completion: *Completion,
    socket: os.socket_t,
    buffer: []u8,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) RecvError!usize {
            var io_flags: os.windows.DWORD = 0; // used both as input and output
            var bytes: os.windows.DWORD = undefined;

            const succeeded: os.windows.BOOL = if (op.pending)
                // The recv was already started: poll for its result.
                os.windows.ws2_32.WSAGetOverlappedResult(
                    op.socket,
                    &op.overlapped.raw,
                    &bytes,
                    os.windows.FALSE, // dont wait
                    &io_flags,
                )
            else started: {
                op.pending = true;
                op.overlapped = .{
                    .raw = std.mem.zeroes(os.windows.OVERLAPPED),
                    .completion = ctx.completion,
                };

                // Start the recv operation.
                const status = os.windows.ws2_32.WSARecv(
                    op.socket,
                    @ptrCast([*]os.windows.ws2_32.WSABUF, &op.buf),
                    1, // one buffer
                    &bytes,
                    &io_flags,
                    &op.overlapped.raw,
                    null,
                );
                break :started switch (status) {
                    0 => @as(os.windows.BOOL, os.windows.TRUE),
                    os.windows.ws2_32.SOCKET_ERROR => os.windows.FALSE,
                    else => unreachable,
                };
            };

            if (succeeded == os.windows.FALSE) {
                return switch (os.windows.ws2_32.WSAGetLastError()) {
                    .WSA_IO_PENDING, .WSAEWOULDBLOCK, .WSA_IO_INCOMPLETE => error.WouldBlock,
                    .WSAECONNABORTED, .WSAETIMEDOUT => error.ConnectionRefused,
                    .WSAECONNRESET, .WSAENETRESET => error.ConnectionResetByPeer,
                    .WSAEMSGSIZE => error.MessageTooBig,
                    .WSAENETDOWN => error.NetworkSubsystemFailed,
                    .WSAENOTCONN, .WSAESHUTDOWN => error.SocketNotConnected,
                    .WSANOTINITIALISED => unreachable, // WSAStartup() was called
                    .WSA_INVALID_HANDLE => unreachable, // we dont use OVERLAPPED.hEvent
                    .WSA_INVALID_PARAMETER => unreachable, // parameters are fine
                    .WSAEDISCON => unreachable, // we only stream sockets
                    .WSAEFAULT => unreachable, // invalid buffer
                    .WSAEINTR => unreachable, // this is non blocking
                    .WSAEINPROGRESS => unreachable, // this is non blocking
                    .WSAEINVAL => unreachable, // invalid socket type
                    .WSAEOPNOTSUPP => unreachable, // we dont use MSG_OOB or MSG_PARTIAL
                    .WSA_OPERATION_ABORTED => unreachable, // operation was cancelled
                    else => |err| os.windows.unexpectedWSAError(err),
                };
            }

            // Success: report the bytes received.
            return bytes;
        }
    };

    self.submit(
        context,
        callback,
        completion,
        .recv,
        Completion.Transfer{
            .socket = socket,
            .buf = os.windows.ws2_32.WSABUF{
                .len = @intCast(u32, buffer_limit(buffer.len)),
                .buf = buffer.ptr,
            },
            .overlapped = undefined,
            .pending = false,
        },
        Impl,
    );
}
730
+
731
/// Error set reported to `read` callbacks.
pub const ReadError = error{
    WouldBlock,
    NotOpenForReading,
    ConnectionResetByPeer,
    Alignment,
    InputOutput,
    IsDir,
    SystemResources,
    Unseekable,
} || os.UnexpectedError;

/// Queues a positional read of up to `buffer_limit(buffer.len)` bytes from `fd`
/// at `offset` into `buffer`. The read itself is currently performed
/// synchronously with `os.pread` when the completion is flushed; `callback`
/// receives the number of bytes read on success.
pub fn read(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: ReadError!usize,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
    buffer: []u8,
    offset: u64,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) ReadError!usize {
            _ = ctx;

            // Do a synchronous read for now.
            const target = op.buf[0..op.len];
            return os.pread(op.fd, target, op.offset) catch |err| switch (err) {
                error.OperationAborted,
                error.BrokenPipe,
                error.ConnectionTimedOut,
                => unreachable,
                // AccessDenied is reported to the callback as an IO failure.
                error.AccessDenied => error.InputOutput,
                else => |e| e,
            };
        }
    };

    self.submit(
        context,
        callback,
        completion,
        .read,
        .{
            .fd = fd,
            .buf = buffer.ptr,
            .len = @intCast(u32, buffer_limit(buffer.len)),
            .offset = offset,
        },
        Impl,
    );
}
782
+
783
/// Error set reported to `write` callbacks.
pub const WriteError = os.PWriteError;

/// Queues a positional write of up to `buffer_limit(buffer.len)` bytes from
/// `buffer` to `fd` at `offset`. The write itself is currently performed
/// synchronously with `os.pwrite` when the completion is flushed; `callback`
/// receives the number of bytes written on success.
pub fn write(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: WriteError!usize,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
    buffer: []const u8,
    offset: u64,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) WriteError!usize {
            _ = ctx;

            // Do a synchronous write for now.
            const source = op.buf[0..op.len];
            return os.pwrite(op.fd, source, op.offset);
        }
    };

    self.submit(
        context,
        callback,
        completion,
        .write,
        .{
            .fd = fd,
            .buf = buffer.ptr,
            .len = @intCast(u32, buffer_limit(buffer.len)),
            .offset = offset,
        },
        Impl,
    );
}
819
+
820
/// Queues the closing of `fd`, which may be either a socket or a regular handle.
/// When `getsockoptError` reports that the descriptor is not a socket it is
/// closed with `CloseHandle`; otherwise it is closed as a socket.
pub fn close(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: CloseError!void,
    ) void,
    completion: *Completion,
    fd: os.fd_t,
) void {
    const Impl = struct {
        fn do_operation(ctx: Completion.Context, op: anytype) CloseError!void {
            _ = ctx;

            // Check if the fd is a SOCKET by seeing if getsockopt() returns ENOTSOCK
            // https://stackoverflow.com/a/50981652
            const as_socket = @ptrCast(os.socket_t, op.fd);
            getsockoptError(as_socket) catch |err| switch (err) {
                error.FileDescriptorNotASocket => return os.windows.CloseHandle(op.fd),
                // Any other outcome still means the descriptor is a socket.
                else => {},
            };

            os.closeSocket(as_socket);
        }
    };

    self.submit(context, callback, completion, .close, .{ .fd = fd }, Impl);
}
855
+
856
/// Error set reported to `timeout` callbacks.
pub const TimeoutError = error{Canceled} || os.UnexpectedError;

/// Queues a timeout that expires `nanoseconds` after the current reading of the
/// IO's monotonic timer. `callback` is invoked by a later `flush` once the
/// deadline has been reached.
pub fn timeout(
    self: *IO,
    comptime Context: type,
    context: Context,
    comptime callback: fn (
        context: Context,
        completion: *Completion,
        result: TimeoutError!void,
    ) void,
    completion: *Completion,
    nanoseconds: u63,
) void {
    const Impl = struct {
        // Nothing to perform: expiry is detected by `flush_timeouts`.
        fn do_operation(ctx: Completion.Context, op: anytype) TimeoutError!void {
            _ = op;
            _ = ctx;
        }
    };

    const deadline = self.timer.monotonic() + nanoseconds;
    self.submit(context, callback, completion, .timeout, .{ .deadline = deadline }, Impl);
}
885
+
886
pub const INVALID_SOCKET = os.windows.ws2_32.INVALID_SOCKET;

/// Creates a socket that can be used for async operations with the IO instance.
/// The socket is created for overlapped IO and without handle inheritance, and is
/// associated with this instance's completion port. If any step after creation
/// fails, the socket is closed before the error is returned.
pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t {
    // SOCK_NONBLOCK | SOCK_CLOEXEC
    const socket_flags: os.windows.DWORD =
        os.windows.ws2_32.WSA_FLAG_OVERLAPPED |
        os.windows.ws2_32.WSA_FLAG_NO_HANDLE_INHERIT;

    const socket = try os.windows.WSASocketW(
        @bitCast(i32, family),
        @bitCast(i32, sock_type),
        @bitCast(i32, protocol),
        null,
        0,
        socket_flags,
    );
    errdefer os.closeSocket(socket);

    // Associating with an existing port hands back that same port.
    const port = try os.windows.CreateIoCompletionPort(socket, self.iocp, 0, 0);
    assert(port == self.iocp);

    // Ensure that synchronous IO completion doesn't queue an unneeded overlapped
    // and that the event for the socket (WaitForSingleObject) doesn't need to be set.
    const notification_modes: os.windows.BYTE =
        os.windows.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS |
        os.windows.FILE_SKIP_SET_EVENT_ON_HANDLE;

    try os.windows.SetFileCompletionNotificationModes(
        @ptrCast(os.windows.HANDLE, socket),
        notification_modes,
    );

    return socket;
}
919
+
920
/// Opens the directory at `dir_path` (resolved against the current working
/// directory) with default open options and returns its handle.
pub fn open_dir(dir_path: [:0]const u8) !os.fd_t {
    return (try std.fs.cwd().openDirZ(dir_path, .{})).fd;
}
925
+
926
/// Opens or creates a journal file:
/// - For reading and writing.
/// - For Direct I/O (required on Windows).
/// - Obtains an advisory exclusive lock to the file descriptor.
/// - Allocates the file contiguously on disk if this is supported by the file system.
/// - Ensures that the file data is durable on disk.
///   The caller is responsible for ensuring that the parent directory inode is durable.
/// - Verifies that the file size matches the expected file size before returning.
///
/// `size` must be a non-zero multiple of `config.sector_size`.
/// `dir_handle` is currently unused: the path is handed to CreateFileW() as is.
///
/// When `must_create` is true the file must not exist yet (`error.PathAlreadyExists`),
/// otherwise it must already exist (`error.FileNotFound`).
/// Panics if another process holds the lock or if the final file size is not `size`.
pub fn open_file(
    self: *IO,
    dir_handle: os.fd_t,
    relative_path: [:0]const u8,
    size: u64,
    must_create: bool,
) !os.fd_t {
    _ = self;

    assert(relative_path.len > 0);
    assert(size >= config.sector_size);
    assert(size % config.sector_size == 0);

    const path_w = try os.windows.sliceToPrefixedFileW(relative_path);

    // CREATE_NEW = O_CREAT | O_EXCL
    // NOTE: CreateFileW() expects the Win32 dispositions (CREATE_NEW, OPEN_EXISTING, ...),
    // not the NT ones. FILE_CREATE has the same numeric value as CREATE_ALWAYS, which
    // would truncate an existing data file instead of failing.
    var creation_disposition: os.windows.DWORD = 0;
    if (must_create) {
        log.info("creating \"{s}\"...", .{relative_path});
        creation_disposition = os.windows.CREATE_NEW;
    } else {
        log.info("opening \"{s}\"...", .{relative_path});
        creation_disposition = os.windows.OPEN_EXISTING;
    }

    // O_EXCL: no sharing with other handles while ours is open.
    const shared_mode: os.windows.DWORD = 0;

    // O_RDWR
    const access_mask: os.windows.DWORD =
        os.windows.GENERIC_READ | os.windows.GENERIC_WRITE;

    // O_DIRECT | O_DSYNC
    const attributes: os.windows.DWORD =
        os.windows.FILE_FLAG_NO_BUFFERING | os.windows.FILE_FLAG_WRITE_THROUGH;

    // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
    assert((attributes & os.windows.FILE_FLAG_WRITE_THROUGH) > 0);

    // TODO: Add ReadFileEx/WriteFileEx support.
    // Not currently needed for O_DIRECT disk IO.
    // attributes |= os.windows.FILE_FLAG_OVERLAPPED;

    const handle = os.windows.kernel32.CreateFileW(
        path_w.span(),
        access_mask,
        shared_mode,
        null, // no security attributes required
        creation_disposition,
        attributes,
        null, // no existing template file
    );

    if (handle == os.windows.INVALID_HANDLE_VALUE) {
        return switch (os.windows.kernel32.GetLastError()) {
            .ACCESS_DENIED => error.AccessDenied,
            // CREATE_NEW on a path that already exists:
            .FILE_EXISTS => error.PathAlreadyExists,
            // OPEN_EXISTING on a missing file, or a missing parent directory:
            .FILE_NOT_FOUND, .PATH_NOT_FOUND => error.FileNotFound,
            else => |err| os.windows.unexpectedError(err),
        };
    }

    // From here on, any error return must not leak the handle.
    errdefer os.windows.CloseHandle(handle);

    // Obtain an advisory exclusive lock
    // even when we haven't given shared access to other processes.
    fs_lock(handle, size) catch |err| switch (err) {
        error.WouldBlock => @panic("another process holds the data file lock"),
        else => return err,
    };

    // Ask the file system to allocate contiguous sectors for the file (if possible):
    if (must_create) {
        log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
        fs_allocate(handle, size) catch {
            log.warn("file system failed to preallocate the file memory", .{});
            log.info("allocating by writing to the last sector of the file instead...", .{});

            const sector_size = config.sector_size;
            const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;

            // Handle partial writes where the physical sector is less than a logical sector:
            const write_offset = size - sector.len;
            var written: usize = 0;
            while (written < sector.len) {
                written += try os.pwrite(handle, sector[written..], write_offset + written);
            }
        };
    }

    // The best fsync strategy is always to fsync before reading because this prevents us from
    // making decisions on data that was never durably written by a previously crashed process.
    // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
    // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
    try os.fsync(handle);

    // We cannot fsync the directory handle on Windows.
    // We have no way to open a directory with write access.
    //
    // try os.fsync(dir_handle);
    _ = dir_handle;

    const file_size = try os.windows.GetFileSizeEx(handle);
    if (file_size != size) @panic("data file inode size was truncated or corrupted");

    return handle;
}
1041
+
1042
/// Requests an exclusive, non-blocking byte-range lock on `handle` via LockFileEx().
/// The range starts at offset 0 and spans `size` bytes.
/// Returns `error.WouldBlock` if the lock cannot be granted immediately.
fn fs_lock(handle: os.fd_t, size: u64) !void {
    // TODO: Look into using SetFileIoOverlappedRange() for better unbuffered async IO perf
    // NOTE: Requires SeLockMemoryPrivilege.

    const kernel32 = struct {
        const LOCKFILE_FAIL_IMMEDIATELY = 0x1;
        const LOCKFILE_EXCLUSIVE_LOCK = 0x2;

        extern "kernel32" fn LockFileEx(
            hFile: os.windows.HANDLE,
            dwFlags: os.windows.DWORD,
            dwReserved: os.windows.DWORD,
            nNumberOfBytesToLockLow: os.windows.DWORD,
            nNumberOfBytesToLockHigh: os.windows.DWORD,
            lpOverlapped: ?*os.windows.OVERLAPPED,
        ) callconv(os.windows.WINAPI) os.windows.BOOL;
    };

    // hEvent = null
    // Offset & OffsetHigh = 0
    var lock_overlapped = std.mem.zeroes(os.windows.OVERLAPPED);

    // LOCK_EX | LOCK_NB
    const lock_flags: os.windows.DWORD =
        kernel32.LOCKFILE_EXCLUSIVE_LOCK | kernel32.LOCKFILE_FAIL_IMMEDIATELY;

    const locked = kernel32.LockFileEx(
        handle,
        lock_flags,
        0, // reserved param is always zero
        @truncate(u32, size), // low bits of size
        @truncate(u32, size >> 32), // high bits of size
        &lock_overlapped,
    );

    if (locked == os.windows.FALSE) {
        return switch (os.windows.kernel32.GetLastError()) {
            // A conflicting lock is reported as ERROR_LOCK_VIOLATION when the handle was
            // opened without FILE_FLAG_OVERLAPPED. ERROR_IO_PENDING is kept because the
            // LockFileEx() documentation names it for LOCKFILE_FAIL_IMMEDIATELY.
            .LOCK_VIOLATION, .IO_PENDING => error.WouldBlock,
            else => |err| os.windows.unexpectedError(err),
        };
    }
}
1085
+
1086
/// Sets the end of file for `handle` to `size` bytes:
/// seeks to `size` from the beginning of the file, then calls SetEndOfFile().
/// Any failure other than the asserted-impossible ones is returned via unexpectedError().
fn fs_allocate(handle: os.fd_t, size: u64) !void {
    // TODO: Look into using SetFileValidData() instead
    // NOTE: Requires SE_MANAGE_VOLUME_NAME privilege

    const windows = os.windows;

    // Step 1: position the file pointer at (start + size).
    const end_offset = @intCast(i64, size);
    const seek_ok = windows.kernel32.SetFilePointerEx(
        handle,
        end_offset,
        null, // the resulting position is not needed
        windows.FILE_BEGIN,
    ) != windows.FALSE;

    if (!seek_ok) {
        switch (windows.kernel32.GetLastError()) {
            .INVALID_HANDLE, .INVALID_PARAMETER => unreachable,
            else => |err| return windows.unexpectedError(err),
        }
    }

    // Step 2: declare the current file pointer to be the physical EOF.
    if (windows.kernel32.SetEndOfFile(handle) == windows.FALSE) {
        return windows.unexpectedError(windows.kernel32.GetLastError());
    }
}
1113
+ };
1114
+
1115
// TODO: use os.getsockoptError when fixed for windows in stdlib
/// Reads the SO_ERROR option of `socket` and translates it into an `IO.ConnectError`.
/// Returns normally when the option value is zero.
fn getsockoptError(socket: os.socket_t) IO.ConnectError!void {
    const ws2_32 = os.windows.ws2_32;

    var pending_error: u32 = undefined;
    var option_len: i32 = @sizeOf(u32);

    const rc = ws2_32.getsockopt(
        socket,
        os.SOL.SOCKET,
        os.SO.ERROR,
        std.mem.asBytes(&pending_error),
        &option_len,
    );

    // The getsockopt() call itself failed.
    if (rc != 0) {
        return switch (ws2_32.WSAGetLastError()) {
            .WSAENETDOWN => error.NetworkUnreachable,
            .WSANOTINITIALISED => unreachable, // WSAStartup() was never called
            .WSAEFAULT => unreachable, // The address pointed to by optval or optlen is not in a valid part of the process address space.
            .WSAEINVAL => unreachable, // The level parameter is unknown or invalid
            .WSAENOPROTOOPT => unreachable, // The option is unknown at the level indicated.
            .WSAENOTSOCK => error.FileDescriptorNotASocket,
            else => |err| os.windows.unexpectedWSAError(err),
        };
    }

    assert(option_len == 4);

    // Zero means there is nothing to report.
    if (err_is_clear: {
        break :err_is_clear pending_error == 0;
    }) return;

    const code = @intCast(u16, pending_error);
    switch (@intToEnum(ws2_32.WinsockError, code)) {
        .WSAEACCES => return error.PermissionDenied,
        .WSAEADDRINUSE => return error.AddressInUse,
        .WSAEADDRNOTAVAIL => return error.AddressNotAvailable,
        .WSAEAFNOSUPPORT => return error.AddressFamilyNotSupported,
        .WSAEALREADY => return error.ConnectionPending,
        .WSAECONNREFUSED => return error.ConnectionRefused,
        .WSAENETUNREACH => return error.NetworkUnreachable,
        .WSAENOTSOCK => return error.FileDescriptorNotASocket,
        .WSAETIMEDOUT => return error.ConnectionTimedOut,
        .WSAECONNRESET => return error.ConnectionResetByPeer,
        // Asserted impossible here. WSAEISCONN would otherwise be error.AlreadyConnected.
        .WSAEBADF, .WSAEFAULT, .WSAEISCONN, .WSAEPROTOTYPE => unreachable,
        else => |e| return os.windows.unexpectedWSAError(e),
    }
}