tigerbeetle-node 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +3 -4
  2. package/package.json +1 -1
  3. package/src/node.zig +2 -12
  4. package/src/tigerbeetle/scripts/benchmark.bat +46 -0
  5. package/src/tigerbeetle/scripts/install_zig.bat +2 -2
  6. package/src/tigerbeetle/scripts/install_zig.sh +1 -1
  7. package/src/tigerbeetle/scripts/vopr.sh +2 -2
  8. package/src/tigerbeetle/src/benchmark.zig +2 -6
  9. package/src/tigerbeetle/src/cli.zig +39 -18
  10. package/src/tigerbeetle/src/config.zig +24 -9
  11. package/src/tigerbeetle/src/demo.zig +1 -1
  12. package/src/tigerbeetle/src/io/benchmark.zig +24 -49
  13. package/src/tigerbeetle/src/io/darwin.zig +175 -44
  14. package/src/tigerbeetle/src/io/linux.zig +177 -72
  15. package/src/tigerbeetle/src/io/test.zig +61 -39
  16. package/src/tigerbeetle/src/io/windows.zig +1161 -0
  17. package/src/tigerbeetle/src/io.zig +2 -0
  18. package/src/tigerbeetle/src/main.zig +13 -8
  19. package/src/tigerbeetle/src/message_bus.zig +49 -61
  20. package/src/tigerbeetle/src/message_pool.zig +63 -57
  21. package/src/tigerbeetle/src/ring_buffer.zig +7 -0
  22. package/src/tigerbeetle/src/simulator.zig +4 -4
  23. package/src/tigerbeetle/src/storage.zig +0 -230
  24. package/src/tigerbeetle/src/test/cluster.zig +3 -6
  25. package/src/tigerbeetle/src/test/message_bus.zig +4 -3
  26. package/src/tigerbeetle/src/test/network.zig +13 -16
  27. package/src/tigerbeetle/src/test/state_checker.zig +3 -2
  28. package/src/tigerbeetle/src/tigerbeetle.zig +5 -3
  29. package/src/tigerbeetle/src/time.zig +58 -11
  30. package/src/tigerbeetle/src/vsr/client.zig +18 -32
  31. package/src/tigerbeetle/src/vsr/clock.zig +1 -1
  32. package/src/tigerbeetle/src/vsr/journal.zig +2 -6
  33. package/src/tigerbeetle/src/vsr/replica.zig +146 -169
  34. package/src/tigerbeetle/src/vsr.zig +263 -5
@@ -2,7 +2,9 @@ const std = @import("std");
2
2
  const os = std.os;
3
3
  const mem = std.mem;
4
4
  const assert = std.debug.assert;
5
+ const log = std.log.scoped(.io);
5
6
 
7
+ const config = @import("../config.zig");
6
8
  const FIFO = @import("../fifo.zig").FIFO;
7
9
  const Time = @import("../time.zig").Time;
8
10
  const buffer_limit = @import("../io.zig").buffer_limit;
@@ -204,9 +206,6 @@ pub const IO = struct {
204
206
  address: std.net.Address,
205
207
  initiated: bool,
206
208
  },
207
- fsync: struct {
208
- fd: os.fd_t,
209
- },
210
209
  read: struct {
211
210
  fd: os.fd_t,
212
211
  buf: [*]u8,
@@ -248,7 +247,7 @@ pub const IO = struct {
248
247
  fn onComplete(io: *IO, _completion: *Completion) void {
249
248
  // Perform the actual operation
250
249
  const op_data = &@field(_completion.operation, @tagName(operation_tag));
251
- const result = OperationImpl.doOperation(op_data);
250
+ const result = OperationImpl.do_operation(op_data);
252
251
 
253
252
  // Requeue onto io_pending if error.WouldBlock
254
253
  switch (operation_tag) {
@@ -310,7 +309,7 @@ pub const IO = struct {
310
309
  .socket = socket,
311
310
  },
312
311
  struct {
313
- fn doOperation(op: anytype) AcceptError!os.socket_t {
312
+ fn do_operation(op: anytype) AcceptError!os.socket_t {
314
313
  const fd = try os.accept(
315
314
  op.socket,
316
315
  null,
@@ -368,7 +367,7 @@ pub const IO = struct {
368
367
  .fd = fd,
369
368
  },
370
369
  struct {
371
- fn doOperation(op: anytype) CloseError!void {
370
+ fn do_operation(op: anytype) CloseError!void {
372
371
  return switch (os.errno(os.system.close(op.fd))) {
373
372
  .SUCCESS => {},
374
373
  .BADF => error.FileDescriptorInvalid,
@@ -407,7 +406,7 @@ pub const IO = struct {
407
406
  .initiated = false,
408
407
  },
409
408
  struct {
410
- fn doOperation(op: anytype) ConnectError!void {
409
+ fn do_operation(op: anytype) ConnectError!void {
411
410
  // Don't call connect after being rescheduled by io_pending as it gives EISCONN.
412
411
  // Instead, check the socket error to see if has been connected successfully.
413
412
  const result = switch (op.initiated) {
@@ -422,36 +421,6 @@ pub const IO = struct {
422
421
  );
423
422
  }
424
423
 
425
- pub const FsyncError = os.SyncError;
426
-
427
- pub fn fsync(
428
- self: *IO,
429
- comptime Context: type,
430
- context: Context,
431
- comptime callback: fn (
432
- context: Context,
433
- completion: *Completion,
434
- result: FsyncError!void,
435
- ) void,
436
- completion: *Completion,
437
- fd: os.fd_t,
438
- ) void {
439
- self.submit(
440
- context,
441
- callback,
442
- completion,
443
- .fsync,
444
- .{
445
- .fd = fd,
446
- },
447
- struct {
448
- fn doOperation(op: anytype) FsyncError!void {
449
- _ = os.fcntl(op.fd, os.F.FULLFSYNC, 1) catch return os.fsync(op.fd);
450
- }
451
- },
452
- );
453
- }
454
-
455
424
  pub const ReadError = error{
456
425
  WouldBlock,
457
426
  NotOpenForReading,
@@ -489,7 +458,7 @@ pub const IO = struct {
489
458
  .offset = offset,
490
459
  },
491
460
  struct {
492
- fn doOperation(op: anytype) ReadError!usize {
461
+ fn do_operation(op: anytype) ReadError!usize {
493
462
  while (true) {
494
463
  const rc = os.system.pread(
495
464
  op.fd,
@@ -546,7 +515,7 @@ pub const IO = struct {
546
515
  .len = @intCast(u32, buffer_limit(buffer.len)),
547
516
  },
548
517
  struct {
549
- fn doOperation(op: anytype) RecvError!usize {
518
+ fn do_operation(op: anytype) RecvError!usize {
550
519
  return os.recv(op.socket, op.buf[0..op.len], 0);
551
520
  }
552
521
  },
@@ -579,7 +548,7 @@ pub const IO = struct {
579
548
  .len = @intCast(u32, buffer_limit(buffer.len)),
580
549
  },
581
550
  struct {
582
- fn doOperation(op: anytype) SendError!usize {
551
+ fn do_operation(op: anytype) SendError!usize {
583
552
  return os.send(op.socket, op.buf[0..op.len], 0);
584
553
  }
585
554
  },
@@ -609,7 +578,7 @@ pub const IO = struct {
609
578
  .expires = self.time.monotonic() + nanoseconds,
610
579
  },
611
580
  struct {
612
- fn doOperation(_: anytype) TimeoutError!void {
581
+ fn do_operation(_: anytype) TimeoutError!void {
613
582
  return; // timeouts don't have errors for now
614
583
  }
615
584
  },
@@ -644,19 +613,181 @@ pub const IO = struct {
644
613
  .offset = offset,
645
614
  },
646
615
  struct {
647
- fn doOperation(op: anytype) WriteError!usize {
616
+ fn do_operation(op: anytype) WriteError!usize {
648
617
  return os.pwrite(op.fd, op.buf[0..op.len], op.offset);
649
618
  }
650
619
  },
651
620
  );
652
621
  }
653
622
 
654
- pub fn openSocket(family: u32, sock_type: u32, protocol: u32) !os.socket_t {
623
+ pub const INVALID_SOCKET = -1;
624
+
625
/// Creates a non-blocking socket that can be used for async operations with the IO instance.
/// The socket is closed again if configuring it fails.
pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t {
    _ = self;

    const nonblocking_type = sock_type | os.SOCK.NONBLOCK;
    const socket = try os.socket(family, nonblocking_type, protocol);
    errdefer os.closeSocket(socket);

    // darwin has no os.MSG_NOSIGNAL; SIGPIPE is suppressed with a socket option instead.
    const enable: c_int = 1;
    try os.setsockopt(socket, os.SOL.SOCKET, os.SO.NOSIGPIPE, &mem.toBytes(enable));
    return socket;
}
636
+
637
/// Opens the directory at `dir_path` read-only (with O_CLOEXEC) and returns its descriptor.
pub fn open_dir(dir_path: [:0]const u8) !os.fd_t {
    const flags: u32 = os.O.RDONLY | os.O.CLOEXEC;
    const mode: os.mode_t = 0;
    return os.openZ(dir_path, flags, mode);
}
641
+
642
/// Opens (or, when `must_create` is set, exclusively creates) the journal file at
/// `relative_path` under `dir_fd` and returns its file descriptor:
/// - Opened read/write with O_CLOEXEC and O_DSYNC.
/// - With `config.direct_io`, F_NOCACHE is set because darwin has no O_DIRECT.
/// - Takes a non-blocking advisory exclusive flock(); panics if another process holds it.
/// - On creation, preallocates `size` bytes through `fs_allocate()`.
/// - Syncs the file and then `dir_fd` through `fs_sync()`.
///   The caller is responsible for ensuring that the parent directory inode is durable.
/// - Panics if the size reported by fstat() differs from `size`.
/// `size` must be a non-zero multiple of `config.sector_size`, and `relative_path` must be
/// non-empty and not absolute.
pub fn open_file(
    self: *IO,
    dir_fd: os.fd_t,
    relative_path: [:0]const u8,
    size: u64,
    must_create: bool,
) !os.fd_t {
    _ = self;

    assert(relative_path.len > 0);
    assert(size >= config.sector_size);
    assert(size % config.sector_size == 0);

    // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
    // This is much stronger than an advisory exclusive lock, and is required on some platforms.

    // O_DSYNC is essential for durability and correctness: every write is synced to disk, so
    // the data plane can omit fsync() calls.
    const base_flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC;
    // TODO Document this and investigate whether this is in fact correct to set here.
    const large_file_flag: u32 = if (@hasDecl(os.O, "LARGEFILE")) os.O.LARGEFILE else 0;
    const create_flags: u32 = if (must_create) os.O.CREAT | os.O.EXCL else 0;
    const flags = base_flags | large_file_flag | create_flags;
    const mode: os.mode_t = if (must_create) 0o666 else 0;

    if (must_create) {
        log.info("creating \"{s}\"...", .{relative_path});
    } else {
        log.info("opening \"{s}\"...", .{relative_path});
    }

    // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
    assert((flags & os.O.DSYNC) != 0);

    // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
    assert(!std.fs.path.isAbsolute(relative_path));
    const file = try os.openatZ(dir_fd, relative_path, flags, mode);
    // TODO Return a proper error message when the path exists or does not exist (init/start).
    errdefer os.close(file);

    // TODO Check that the file is actually a file.

    // On darwin, Direct I/O is assumed to always be available: F_NOCACHE disables the page
    // cache in place of the missing O_DIRECT.
    if (config.direct_io) {
        _ = try os.fcntl(file, os.F.NOCACHE, 1);
    }

    // The advisory lock only works if all processes actually use flock().
    // LOCK_NB makes the lock attempt fail immediately if another process holds it.
    os.flock(file, os.LOCK.EX | os.LOCK.NB) catch |lock_err| switch (lock_err) {
        error.WouldBlock => @panic("another process holds the data file lock"),
        else => return lock_err,
    };

    // Ask the file system for contiguous sectors (if possible). Without preallocation we risk
    // more seeks, or a panic if we run out of disk space (ENOSPC).
    if (must_create) {
        try fs_allocate(file, size);
    }

    // Always sync before reading, so that we never make decisions on data that a previously
    // crashed process did not durably write. This also waits for any pending O_DSYNC.
    // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
    try fs_sync(file);

    // Sync the parent directory so that the file inode is durably written. We do this on every
    // open because we cannot know whether it was done before a crash.
    // The caller is responsible for the parent directory inode stored under the grandparent.
    try fs_sync(dir_fd);

    const file_stat = try os.fstat(file);
    if (file_stat.size != size) @panic("data file inode size was truncated or corrupted");

    return file;
}
728
+
729
/// Flushes `fd` with F_FULLFSYNC, falling back to fsync() if that fcntl() fails.
/// Darwin's fsync() syscall does not flush past the disk cache, so F_FULLFSYNC is preferred.
/// https://twitter.com/TigerBeetleDB/status/1422491736224436225
fn fs_sync(fd: os.fd_t) !void {
    if (os.fcntl(fd, os.F.FULLFSYNC, 1)) |_| {
        return;
    } else |_| {
        // F_FULLFSYNC failed: settle for a plain fsync().
        return os.fsync(fd);
    }
}
734
+
735
/// Preallocates `size` bytes for `fd` with fcntl(F_PREALLOCATE), then sets the file length
/// with ftruncate(). Contiguous allocation is attempted first, with a single retry that drops
/// the contiguity requirement.
fn fs_allocate(fd: os.fd_t, size: u64) !void {
    log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});

    // Darwin doesn't have fallocate() but we can simulate it using fcntl()s.
    //
    // https://stackoverflow.com/a/11497568
    // https://api.kde.org/frameworks/kcoreaddons/html/posix__fallocate__mac_8h_source.html
    // http://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61

    const F_ALLOCATECONTIG = 0x2; // Allocate contiguous space.
    const F_ALLOCATEALL = 0x4; // Allocate all or nothing.
    const F_PEOFPOSMODE = 3; // Use relative offset from the seek pos mode.
    const fstore_t = extern struct {
        fst_flags: c_uint,
        fst_posmode: c_int,
        fst_offset: os.off_t,
        fst_length: os.off_t,
        fst_bytesalloc: os.off_t,
    };

    var request = fstore_t{
        .fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL,
        .fst_posmode = F_PEOFPOSMODE,
        .fst_offset = 0,
        .fst_length = @intCast(os.off_t, size),
        .fst_bytesalloc = 0,
    };

    // Try contiguous preallocation first and fall back to default non-contiguous.
    const rc = blk: {
        const contiguous_rc = os.system.fcntl(fd, os.F.PREALLOCATE, @ptrToInt(&request));
        if (os.errno(contiguous_rc) == .SUCCESS) break :blk contiguous_rc;

        request.fst_flags = F_ALLOCATEALL;
        break :blk os.system.fcntl(fd, os.F.PREALLOCATE, @ptrToInt(&request));
    };

    switch (os.errno(rc)) {
        .SUCCESS => {},
        // These errnos belong to other fcntl() commands:
        // ACCES: F_SETLK, F_SETSIZE or F_WRITEBOOTSTRAP. DEADLK, INTR: F_SETLKW.
        // MFILE: F_DUPFD or F_DUPED. NOLCK: F_SETLK or F_SETLKW. SRCH: F_SETOWN.
        .ACCES, .DEADLK, .INTR, .MFILE, .NOLCK, .SRCH => unreachable,
        .BADF => return error.FileDescriptorInvalid,
        .INVAL => return error.ArgumentsInvalid, // for F_PREALLOCATE (offset invalid)
        .OVERFLOW => return error.FileTooBig,
        .OPNOTSUPP => return error.OperationNotSupported, // not reported but need same error union
        else => |unexpected| return os.unexpectedErrno(unexpected),
    }

    // Now actually perform the allocation.
    os.ftruncate(fd, size) catch |err| switch (err) {
        error.AccessDenied => return error.PermissionDenied,
        else => |other| return other,
    };
}
662
793
  };
@@ -5,7 +5,9 @@ const linux = os.linux;
5
5
  const IO_Uring = linux.IO_Uring;
6
6
  const io_uring_cqe = linux.io_uring_cqe;
7
7
  const io_uring_sqe = linux.io_uring_sqe;
8
+ const log = std.log.scoped(.io);
8
9
 
10
+ const config = @import("../config.zig");
9
11
  const FIFO = @import("../fifo.zig").FIFO;
10
12
  const buffer_limit = @import("../io.zig").buffer_limit;
11
13
 
@@ -199,9 +201,6 @@ pub const IO = struct {
199
201
  op.address.getOsSockLen(),
200
202
  );
201
203
  },
202
- .fsync => |op| {
203
- linux.io_uring_prep_fsync(sqe, op.fd, 0);
204
- },
205
204
  .read => |op| {
206
205
  linux.io_uring_prep_read(
207
206
  sqe,
@@ -315,29 +314,6 @@ pub const IO = struct {
315
314
  };
316
315
  completion.callback(completion.context, completion, &result);
317
316
  },
318
- .fsync => {
319
- const result = blk: {
320
- if (completion.result < 0) {
321
- const err = switch (@intToEnum(os.E, -completion.result)) {
322
- .INTR => {
323
- completion.io.enqueue(completion);
324
- return;
325
- },
326
- .BADF => error.FileDescriptorInvalid,
327
- .DQUOT => error.DiskQuota,
328
- .INVAL => error.ArgumentsInvalid,
329
- .IO => error.InputOutput,
330
- .NOSPC => error.NoSpaceLeft,
331
- .ROFS => error.ReadOnlyFileSystem,
332
- else => |errno| os.unexpectedErrno(errno),
333
- };
334
- break :blk err;
335
- } else {
336
- assert(completion.result == 0);
337
- }
338
- };
339
- completion.callback(completion.context, completion, &result);
340
- },
341
317
  .read => {
342
318
  const result = blk: {
343
319
  if (completion.result < 0) {
@@ -489,9 +465,6 @@ pub const IO = struct {
489
465
  socket: os.socket_t,
490
466
  address: std.net.Address,
491
467
  },
492
- fsync: struct {
493
- fd: os.fd_t,
494
- },
495
468
  read: struct {
496
469
  fd: os.fd_t,
497
470
  buffer: []u8,
@@ -655,48 +628,6 @@ pub const IO = struct {
655
628
  self.enqueue(completion);
656
629
  }
657
630
 
658
- pub const FsyncError = error{
659
- FileDescriptorInvalid,
660
- DiskQuota,
661
- ArgumentsInvalid,
662
- InputOutput,
663
- NoSpaceLeft,
664
- ReadOnlyFileSystem,
665
- } || os.UnexpectedError;
666
-
667
- pub fn fsync(
668
- self: *IO,
669
- comptime Context: type,
670
- context: Context,
671
- comptime callback: fn (
672
- context: Context,
673
- completion: *Completion,
674
- result: FsyncError!void,
675
- ) void,
676
- completion: *Completion,
677
- fd: os.fd_t,
678
- ) void {
679
- completion.* = .{
680
- .io = self,
681
- .context = context,
682
- .callback = struct {
683
- fn wrapper(ctx: ?*anyopaque, comp: *Completion, res: *const anyopaque) void {
684
- callback(
685
- @intToPtr(Context, @ptrToInt(ctx)),
686
- comp,
687
- @intToPtr(*const FsyncError!void, @ptrToInt(res)).*,
688
- );
689
- }
690
- }.wrapper,
691
- .operation = .{
692
- .fsync = .{
693
- .fd = fd,
694
- },
695
- },
696
- };
697
- self.enqueue(completion);
698
- }
699
-
700
631
  pub const ReadError = error{
701
632
  WouldBlock,
702
633
  NotOpenForReading,
@@ -927,7 +858,181 @@ pub const IO = struct {
927
858
  self.enqueue(completion);
928
859
  }
929
860
 
930
- pub fn openSocket(family: u32, sock_type: u32, protocol: u32) !os.socket_t {
861
+ pub const INVALID_SOCKET = -1;
862
+
863
/// Creates a socket that can be used for async operations with the IO instance.
/// The arguments are forwarded to `os.socket()` unchanged; `self` is unused.
pub fn open_socket(self: *IO, family: u32, sock_type: u32, protocol: u32) !os.socket_t {
    _ = self;
    const socket = try os.socket(family, sock_type, protocol);
    return socket;
}
868
+
869
/// Opens the directory at `dir_path` read-only (with O_CLOEXEC) and returns its descriptor.
pub fn open_dir(dir_path: [:0]const u8) !os.fd_t {
    const read_only_flags: u32 = os.O.RDONLY | os.O.CLOEXEC;
    const no_mode: os.mode_t = 0;
    return os.openZ(dir_path, read_only_flags, no_mode);
}
873
+
874
/// Opens (or, when `must_create` is set, exclusively creates) the journal file at
/// `relative_path` under `dir_fd` and returns its file descriptor:
/// - Opened read/write with O_CLOEXEC and O_DSYNC.
/// - With `config.direct_io`, O_DIRECT is added if `fs_supports_direct_io()` reports support.
///   Without support this only warns in the development environment and panics otherwise.
/// - Takes a non-blocking advisory exclusive flock(); panics if another process holds it.
/// - On creation, allocates `size` bytes with `fs_allocate()`, or by writing the last sector
///   when fallocate() is not supported.
/// - Calls fsync() on the file and then on `dir_fd`.
///   The caller is responsible for ensuring that the parent directory inode is durable.
/// - Panics if the size reported by fstat() differs from `size`.
/// `size` must be a non-zero multiple of `config.sector_size`, and `relative_path` must be
/// non-empty and not absolute.
pub fn open_file(
    self: *IO,
    dir_fd: os.fd_t,
    relative_path: [:0]const u8,
    size: u64,
    must_create: bool,
) !os.fd_t {
    _ = self;

    assert(relative_path.len > 0);
    assert(size >= config.sector_size);
    assert(size % config.sector_size == 0);

    // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
    // This is much stronger than an advisory exclusive lock, and is required on some platforms.

    var flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC;

    // TODO Document this and investigate whether this is in fact correct to set here.
    if (@hasDecl(os.O, "LARGEFILE")) flags |= os.O.LARGEFILE;

    if (config.direct_io) {
        if (try fs_supports_direct_io(dir_fd)) {
            flags |= os.O.DIRECT;
        } else if (config.deployment_environment != .development) {
            // We require Direct I/O for safety to handle fsync failure correctly, and therefore
            // panic in production if it is not supported.
            @panic("file system does not support Direct I/O");
        } else {
            log.warn("file system does not support Direct I/O", .{});
        }
    }

    if (must_create) {
        log.info("creating \"{s}\"...", .{relative_path});
        flags |= os.O.CREAT | os.O.EXCL;
    } else {
        log.info("opening \"{s}\"...", .{relative_path});
    }
    const mode: os.mode_t = if (must_create) 0o666 else 0;

    // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
    assert((flags & os.O.DSYNC) != 0);

    // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
    assert(!std.fs.path.isAbsolute(relative_path));
    const file = try os.openatZ(dir_fd, relative_path, flags, mode);
    // TODO Return a proper error message when the path exists or does not exist (init/start).
    errdefer os.close(file);

    // TODO Check that the file is actually a file.

    // The advisory lock only works if all processes actually use flock().
    // LOCK_NB makes the lock attempt fail immediately if another process holds it.
    os.flock(file, os.LOCK.EX | os.LOCK.NB) catch |lock_err| switch (lock_err) {
        error.WouldBlock => @panic("another process holds the data file lock"),
        else => return lock_err,
    };

    // Ask the file system for contiguous sectors (if possible). Without `fallocate()` support
    // we risk more seeks, or a panic if we run out of disk space (ENOSPC).
    if (must_create) {
        log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
        fs_allocate(file, size) catch |alloc_err| switch (alloc_err) {
            error.OperationNotSupported => {
                log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
                log.info("allocating by writing to the last sector of the file instead...", .{});

                const zeroes: [config.sector_size]u8 align(config.sector_size) =
                    [_]u8{0} ** config.sector_size;
                const last_sector_offset = size - zeroes.len;

                // Loop to handle partial writes, where the physical sector is less than a
                // logical sector:
                var remaining: []const u8 = zeroes[0..];
                while (remaining.len > 0) {
                    const done = zeroes.len - remaining.len;
                    const n = try os.pwrite(file, remaining, last_sector_offset + done);
                    remaining = remaining[n..];
                }
            },
            else => |other| return other,
        };
    }

    // Always fsync before reading, so that we never make decisions on data that a previously
    // crashed process did not durably write. This also waits for any pending O_DSYNC.
    // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
    try os.fsync(file);

    // Fsync the parent directory so that the file inode is durably written. We do this on
    // every open because we cannot know whether it was done before a crash.
    // The caller is responsible for the parent directory inode stored under the grandparent.
    try os.fsync(dir_fd);

    const file_stat = try os.fstat(file);
    if (file_stat.size != size) @panic("data file inode size was truncated or corrupted");

    return file;
}
985
+
986
/// Detects whether the underlying file system for a given directory fd supports Direct I/O.
/// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
///
/// Creates a probe file named "fs_supports_direct_io" in `dir_fd`, tries to reopen it with
/// `O_DIRECT`, and deletes it again before returning. Returns `true` if the `O_DIRECT` open
/// succeeds and `false` if it fails with EINVAL (or if the target has no `O.DIRECT` at all).
/// Any other errno is returned as an unexpected error.
fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
    // Open flags live in the `os.O` namespace (this file uses `os.O.DIRECT` throughout), so the
    // declaration to look for is `O.DIRECT`. Checking for a flat `std.os.O_DIRECT` is always
    // false, which would report every file system as lacking Direct I/O.
    if (!@hasDecl(std.os.O, "DIRECT")) return false;

    const path = "fs_supports_direct_io";
    const dir = std.fs.Dir{ .fd = dir_fd };
    const fd = try os.openatZ(dir_fd, path, os.O.CLOEXEC | os.O.CREAT | os.O.TRUNC, 0o666);
    defer os.close(fd);
    // Best-effort cleanup of the probe file: a failed delete must not mask the probe result.
    defer dir.deleteFile(path) catch {};

    while (true) {
        const res = os.system.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0);
        // `getErrno()` yields the errno enum, matched by tag as in `fs_allocate()`.
        switch (os.linux.getErrno(res)) {
            .SUCCESS => {
                os.close(@intCast(os.fd_t, res));
                return true;
            },
            .INTR => continue, // Interrupted by a signal: retry the open.
            .INVAL => return false, // The file system rejected O_DIRECT.
            else => |err| return os.unexpectedErrno(err),
        }
    }
}
1010
+
1011
/// Allocates `size` bytes for `fd` from offset 0 using fallocate() with mode 0, retrying when
/// the call is interrupted. Returns `error.OperationNotSupported` when the file system does
/// not support fallocate(), so that the caller can fall back to another strategy.
fn fs_allocate(fd: os.fd_t, size: u64) !void {
    const length = @intCast(i64, size);

    while (true) {
        const status = os.linux.getErrno(os.linux.fallocate(fd, 0, 0, length));
        switch (status) {
            .INTR => continue,
            .SUCCESS => return,
            .BADF => return error.FileDescriptorInvalid,
            .FBIG => return error.FileTooBig,
            .INVAL => return error.ArgumentsInvalid,
            .IO => return error.InputOutput,
            .NODEV => return error.NoDevice,
            .NOSPC => return error.NoSpaceLeft,
            .NOSYS => return error.SystemOutdated,
            .OPNOTSUPP => return error.OperationNotSupported,
            .PERM => return error.PermissionDenied,
            .SPIPE => return error.Unseekable,
            .TXTBSY => return error.FileBusy,
            else => |unexpected| return os.unexpectedErrno(unexpected),
        }
    }
}
933
1038
  };