tigerbeetle-node 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +21 -7
  2. package/dist/benchmark.js +1 -1
  3. package/dist/benchmark.js.map +1 -1
  4. package/dist/index.d.ts +22 -20
  5. package/dist/index.js +40 -18
  6. package/dist/index.js.map +1 -1
  7. package/dist/test.js +13 -1
  8. package/dist/test.js.map +1 -1
  9. package/package.json +12 -12
  10. package/scripts/postinstall.sh +2 -2
  11. package/src/benchmark.ts +4 -4
  12. package/src/index.ts +35 -9
  13. package/src/node.zig +139 -28
  14. package/src/test.ts +19 -5
  15. package/src/tigerbeetle/scripts/benchmark.sh +10 -3
  16. package/src/tigerbeetle/scripts/install.sh +2 -2
  17. package/src/tigerbeetle/scripts/install_zig.bat +109 -0
  18. package/src/tigerbeetle/scripts/install_zig.sh +21 -4
  19. package/src/tigerbeetle/scripts/vopr.bat +48 -0
  20. package/src/tigerbeetle/scripts/vopr.sh +33 -0
  21. package/src/tigerbeetle/src/benchmark.zig +74 -42
  22. package/src/tigerbeetle/src/cli.zig +136 -83
  23. package/src/tigerbeetle/src/config.zig +80 -26
  24. package/src/tigerbeetle/src/demo.zig +101 -78
  25. package/src/tigerbeetle/src/demo_01_create_accounts.zig +2 -7
  26. package/src/tigerbeetle/src/demo_02_lookup_accounts.zig +2 -7
  27. package/src/tigerbeetle/src/demo_03_create_transfers.zig +2 -7
  28. package/src/tigerbeetle/src/demo_04_create_transfers_two_phase_commit.zig +2 -5
  29. package/src/tigerbeetle/src/demo_05_accept_transfers.zig +2 -7
  30. package/src/tigerbeetle/src/demo_06_reject_transfers.zig +2 -7
  31. package/src/tigerbeetle/src/demo_07_lookup_transfers.zig +8 -0
  32. package/src/tigerbeetle/src/fifo.zig +20 -11
  33. package/src/tigerbeetle/src/io.zig +35 -22
  34. package/src/tigerbeetle/src/io_darwin.zig +701 -0
  35. package/src/tigerbeetle/src/main.zig +72 -25
  36. package/src/tigerbeetle/src/message_bus.zig +379 -456
  37. package/src/tigerbeetle/src/message_pool.zig +3 -3
  38. package/src/tigerbeetle/src/ring_buffer.zig +192 -37
  39. package/src/tigerbeetle/src/simulator.zig +317 -0
  40. package/src/tigerbeetle/src/state_machine.zig +846 -38
  41. package/src/tigerbeetle/src/storage.zig +488 -90
  42. package/src/tigerbeetle/src/test/cluster.zig +221 -0
  43. package/src/tigerbeetle/src/test/message_bus.zig +92 -0
  44. package/src/tigerbeetle/src/test/network.zig +182 -0
  45. package/src/tigerbeetle/src/test/packet_simulator.zig +371 -0
  46. package/src/tigerbeetle/src/test/state_checker.zig +142 -0
  47. package/src/tigerbeetle/src/test/state_machine.zig +71 -0
  48. package/src/tigerbeetle/src/test/storage.zig +375 -0
  49. package/src/tigerbeetle/src/test/time.zig +84 -0
  50. package/src/tigerbeetle/src/tigerbeetle.zig +6 -3
  51. package/src/tigerbeetle/src/time.zig +65 -0
  52. package/src/tigerbeetle/src/unit_tests.zig +14 -0
  53. package/src/tigerbeetle/src/vsr/client.zig +519 -0
  54. package/src/tigerbeetle/src/vsr/clock.zig +829 -0
  55. package/src/tigerbeetle/src/vsr/journal.zig +1368 -0
  56. package/src/tigerbeetle/src/vsr/marzullo.zig +306 -0
  57. package/src/tigerbeetle/src/vsr/replica.zig +4248 -0
  58. package/src/tigerbeetle/src/vsr.zig +601 -0
  59. package/src/tigerbeetle/LICENSE +0 -177
  60. package/src/tigerbeetle/README.md +0 -116
  61. package/src/tigerbeetle/src/client.zig +0 -319
  62. package/src/tigerbeetle/src/concurrent_ranges.zig +0 -162
  63. package/src/tigerbeetle/src/fixed_array_list.zig +0 -53
  64. package/src/tigerbeetle/src/io_async.zig +0 -600
  65. package/src/tigerbeetle/src/journal.zig +0 -567
  66. package/src/tigerbeetle/src/test_client.zig +0 -41
  67. package/src/tigerbeetle/src/test_main.zig +0 -118
  68. package/src/tigerbeetle/src/test_message_bus.zig +0 -132
  69. package/src/tigerbeetle/src/vr/journal.zig +0 -672
  70. package/src/tigerbeetle/src/vr/replica.zig +0 -3061
  71. package/src/tigerbeetle/src/vr.zig +0 -374
@@ -1,137 +1,535 @@
1
1
  const std = @import("std");
2
+ const os = std.os;
2
3
  const Allocator = std.mem.Allocator;
3
4
  const assert = std.debug.assert;
4
- const log = std.log.scoped(.vr);
5
+ const log = std.log.scoped(.storage);
6
+
7
+ const IO = @import("io.zig").IO;
8
+ const is_darwin = std.Target.current.isDarwin();
5
9
 
6
10
  const config = @import("config.zig");
11
+ const vsr = @import("vsr.zig");
7
12
 
8
- /// TODO Use IO and callbacks:
9
13
  pub const Storage = struct {
10
- allocator: *Allocator,
11
- memory: []u8 align(config.sector_size),
12
- size: u64,
14
+ /// See usage in Journal.write_sectors() for details.
15
+ pub const synchronicity: enum {
16
+ always_synchronous,
17
+ always_asynchronous,
18
+ } = .always_asynchronous;
19
+
20
+ pub const Read = struct {
21
+ completion: IO.Completion,
22
+ callback: fn (read: *Storage.Read) void,
23
+
24
+ /// The buffer to read into, re-sliced and re-assigned as we go, e.g. after partial reads.
25
+ buffer: []u8,
26
+
27
+ /// The position into the file descriptor from where we should read, also adjusted as we go.
28
+ offset: u64,
29
+
30
+ /// The maximum amount of bytes to read per syscall. We use this to subdivide troublesome
31
+ /// reads into smaller reads to work around latent sector errors (LSEs).
32
+ target_max: u64,
33
+
34
+ /// Returns a target slice into `buffer` to read into, capped by `target_max`.
35
+ /// If the previous read was a partial read of physical sectors (e.g. 512 bytes) less than
36
+ /// our logical sector size (e.g. 4 KiB), so that the remainder of the buffer is no longer
37
+ /// aligned to a logical sector, then we further cap the slice to get back onto a logical
38
+ /// sector boundary.
39
+ fn target(read: *Read) []u8 {
40
+ // A worked example of a partial read that leaves the rest of the buffer unaligned:
41
+ // This could happen for non-Advanced Format disks with a physical sector of 512 bytes.
42
+ // We want to read 8 KiB:
43
+ // buffer.ptr = 0
44
+ // buffer.len = 8192
45
+ // ... and then experience a partial read of only 512 bytes:
46
+ // buffer.ptr = 512
47
+ // buffer.len = 7680
48
+ // We can now see that `buffer.len` is no longer a sector multiple of 4 KiB and further
49
+ // that we have 3584 bytes left of the partial sector read. If we subtract this amount
50
+ // from our logical sector size of 4 KiB we get 512 bytes, which is the alignment error
51
+ // that we need to subtract from `target_max` to get back onto the boundary.
52
+ var max = read.target_max;
53
+
54
+ const partial_sector_read_remainder = read.buffer.len % config.sector_size;
55
+ if (partial_sector_read_remainder != 0) {
56
+ // TODO log.debug() because this is interesting, and to ensure fuzz test coverage.
57
+ const partial_sector_read = config.sector_size - partial_sector_read_remainder;
58
+ max -= partial_sector_read;
59
+ }
60
+
61
+ return read.buffer[0..std.math.min(read.buffer.len, max)];
62
+ }
63
+ };
64
+
65
+ pub const Write = struct {
66
+ completion: IO.Completion,
67
+ callback: fn (write: *Storage.Write) void,
68
+ buffer: []const u8,
69
+ offset: u64,
70
+ };
13
71
 
14
- pub fn init(allocator: *Allocator, size: u64) !Storage {
15
- var memory = try allocator.allocAdvanced(u8, config.sector_size, size, .exact);
16
- errdefer allocator.free(memory);
17
- std.mem.set(u8, memory, 0);
72
+ size: u64,
73
+ fd: os.fd_t,
74
+ io: *IO,
18
75
 
76
+ pub fn init(size: u64, fd: os.fd_t, io: *IO) !Storage {
19
77
  return Storage{
20
- .allocator = allocator,
21
- .memory = memory,
22
78
  .size = size,
79
+ .fd = fd,
80
+ .io = io,
23
81
  };
24
82
  }
25
83
 
26
- pub fn deinit() void {
27
- self.allocator.free(self.memory);
84
+ pub fn deinit() void {}
85
+
86
+ pub fn read_sectors(
87
+ self: *Storage,
88
+ callback: fn (read: *Storage.Read) void,
89
+ read: *Storage.Read,
90
+ buffer: []u8,
91
+ offset: u64,
92
+ ) void {
93
+ self.assert_alignment(buffer, offset);
94
+
95
+ read.* = .{
96
+ .completion = undefined,
97
+ .callback = callback,
98
+ .buffer = buffer,
99
+ .offset = offset,
100
+ .target_max = buffer.len,
101
+ };
102
+
103
+ self.start_read(read, 0);
28
104
  }
29
105
 
30
- /// Detects whether the underlying file system for a given directory fd supports Direct I/O.
31
- /// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
32
- pub fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
33
- if (!@hasDecl(std.os, "O_DIRECT")) return false;
106
+ fn start_read(self: *Storage, read: *Storage.Read, bytes_read: usize) void {
107
+ assert(bytes_read <= read.target().len);
34
108
 
35
- const os = std.os;
36
- const path = "fs_supports_direct_io";
37
- const dir = fs.Dir{ .fd = dir_fd };
38
- const fd = try os.openatZ(dir_fd, path, os.O_CLOEXEC | os.O_CREAT | os.O_TRUNC, 0o666);
39
- defer os.close(fd);
40
- defer dir.deleteFile(path) catch {};
109
+ read.offset += bytes_read;
110
+ read.buffer = read.buffer[bytes_read..];
41
111
 
42
- while (true) {
43
- const res = os.system.openat(dir_fd, path, os.O_CLOEXEC | os.O_RDONLY | os.O_DIRECT, 0);
44
- switch (linux.getErrno(res)) {
45
- 0 => {
46
- os.close(@intCast(os.fd_t, res));
47
- return true;
48
- },
49
- linux.EINTR => continue,
50
- linux.EINVAL => return false,
51
- else => |err| return os.unexpectedErrno(err),
52
- }
112
+ const target = read.target();
113
+ if (target.len == 0) {
114
+ read.callback(read);
115
+ return;
53
116
  }
117
+
118
+ self.assert_bounds(target, read.offset);
119
+ self.io.read(
120
+ *Storage,
121
+ self,
122
+ on_read,
123
+ &read.completion,
124
+ self.fd,
125
+ target,
126
+ read.offset,
127
+ );
54
128
  }
55
129
 
56
- pub fn read(self: *Storage, buffer: []u8, offset: u64) void {
57
- self.assert_bounds_and_alignment(buffer, offset);
58
-
59
- if (self.read_all(buffer, offset)) |bytes_read| {
60
- if (bytes_read != buffer.len) {
61
- assert(bytes_read < buffer.len);
62
- log.emerg("short read: bytes_read={} buffer_len={} offset={}", .{
63
- bytes_read,
64
- buffer.len,
65
- offset,
66
- });
67
- @panic("fs corruption: file inode size truncated");
68
- }
69
- } else |err| switch (err) {
130
+ fn on_read(self: *Storage, completion: *IO.Completion, result: IO.ReadError!usize) void {
131
+ const read = @fieldParentPtr(Storage.Read, "completion", completion);
132
+
133
+ const bytes_read = result catch |err| switch (err) {
70
134
  error.InputOutput => {
71
135
  // The disk was unable to read some sectors (an internal CRC or hardware failure):
72
- if (buffer.len > config.sector_size) {
73
- log.err("latent sector error: offset={}, subdividing read...", .{offset});
74
- // Subdivide the read into sectors to read around the faulty sector(s):
75
- // This is considerably slower than doing a bulk read.
76
- // By now we might have also experienced the disk's read timeout (in seconds).
77
- // TODO Docs should instruct on why and how to reduce disk firmware timeouts.
78
- var buffer_offset = 0;
79
- while (buffer_offset < buffer.len) : (buffer_offset += config.sector_size) {
80
- self.read(
81
- buffer[buffer_offset..][0..config.sector_size],
82
- offset + buffer_offset,
83
- );
84
- }
85
- assert(buffer_offset == buffer.len);
136
+ // We may also have already experienced a partial unaligned read, reading less
137
+ // physical sectors than the logical sector size, so we cannot expect `target.len`
138
+ // to be an exact logical sector multiple.
139
+ const target = read.target();
140
+ if (target.len > config.sector_size) {
141
+ // We tried to read more than a logical sector and failed.
142
+ log.err("latent sector error: offset={}, subdividing read...", .{read.offset});
143
+
144
+ // Divide the buffer in half and try to read each half separately:
145
+ // This creates a recursive binary search for the sector(s) causing the error.
146
+ // This is considerably slower than doing a single bulk read and by now we might
147
+ // also have experienced the disk's read retry timeout (in seconds).
148
+ // TODO Our docs must instruct on why and how to reduce disk firmware timeouts.
149
+
150
+ // These lines both implement ceiling division e.g. `((3 - 1) / 2) + 1 == 2` and
151
+ // require that the numerator is always greater than zero:
152
+ assert(target.len > 0);
153
+ const target_sectors = @divFloor(target.len - 1, config.sector_size) + 1;
154
+ assert(target_sectors > 0);
155
+ read.target_max = (@divFloor(target_sectors - 1, 2) + 1) * config.sector_size;
156
+ assert(read.target_max >= config.sector_size);
157
+
158
+ // Pass 0 for `bytes_read`, we want to retry the read with smaller `target_max`:
159
+ self.start_read(read, 0);
160
+ return;
86
161
  } else {
87
- // Zero any remaining sectors that cannot be read:
88
- // We treat these EIO errors the same as a checksum failure.
89
- log.err("latent sector error: offset={}, zeroing buffer sector...", .{offset});
90
- assert(buffer.len == config.sector_size);
91
- mem.set(u8, buffer, 0);
162
+ // We tried to read at (or less than) logical sector granularity and failed.
163
+ log.err("latent sector error: offset={}, zeroing sector...", .{read.offset});
164
+
165
+ // Zero this logical sector which can't be read:
166
+ // We will treat these EIO errors the same as a checksum failure.
167
+ // TODO This could be an interesting avenue to explore further, whether
168
+ // temporary or permanent EIO errors should be conflated with checksum failures.
169
+ assert(target.len > 0);
170
+ std.mem.set(u8, target, 0);
171
+
172
+ // We could set `read.target_max` to `vsr.sector_ceil(read.buffer.len)` here
173
+ // in order to restart our pseudo-binary search on the rest of the sectors to be
174
+ // read, optimistically assuming that this is the last failing sector.
175
+ // However, data corruption that causes EIO errors often has spatial locality.
176
+ // Therefore, restarting our pseudo-binary search here might give us abysmal
177
+ // performance in the (not uncommon) case of many successive failing sectors.
178
+ self.start_read(read, target.len);
179
+ return;
92
180
  }
93
181
  },
94
- else => {
95
- log.emerg("impossible read: buffer_len={} offset={} error={}", .{
96
- buffer_len,
97
- offset,
98
- err,
99
- });
182
+
183
+ error.WouldBlock,
184
+ error.NotOpenForReading,
185
+ error.ConnectionResetByPeer,
186
+ error.Alignment,
187
+ error.IsDir,
188
+ error.SystemResources,
189
+ error.Unseekable,
190
+ error.Unexpected,
191
+ => {
192
+ log.emerg(
193
+ "impossible read: offset={} buffer.len={} error={s}",
194
+ .{ read.offset, read.buffer.len, @errorName(err) },
195
+ );
100
196
  @panic("impossible read");
101
197
  },
198
+ };
199
+
200
+ if (bytes_read == 0) {
201
+ // We tried to read more than there really is available to read.
202
+ // In other words, we thought we could read beyond the end of the file descriptor.
203
+ // This can happen if the data file inode `size` was truncated or corrupted.
204
+ log.emerg(
205
+ "short read: buffer.len={} offset={} bytes_read={}",
206
+ .{ read.offset, read.buffer.len, bytes_read },
207
+ );
208
+ @panic("data file inode size was truncated or corrupted");
102
209
  }
210
+
211
+ // If our target was limited to a single sector, perhaps because of a latent sector error,
212
+ // then increase `target_max` according to AIMD now that we have read successfully and
213
+ // hopefully cleared the faulty zone.
214
+ // We assume that `target_max` may exceed `read.buffer.len` at any time.
215
+ if (read.target_max == config.sector_size) {
216
+ // TODO Add log.debug because this is interesting.
217
+ read.target_max += config.sector_size;
218
+ }
219
+
220
+ self.start_read(read, bytes_read);
103
221
  }
104
222
 
105
- pub fn write(self: *Storage, buffer: []const u8, offset: u64) void {
106
- self.assert_bounds_and_alignment(buffer, offset);
107
- self.write_all(buffer, offset) catch |err| switch (err) {
223
+ pub fn write_sectors(
224
+ self: *Storage,
225
+ callback: fn (write: *Storage.Write) void,
226
+ write: *Storage.Write,
227
+ buffer: []const u8,
228
+ offset: u64,
229
+ ) void {
230
+ self.assert_alignment(buffer, offset);
231
+
232
+ write.* = .{
233
+ .completion = undefined,
234
+ .callback = callback,
235
+ .buffer = buffer,
236
+ .offset = offset,
237
+ };
238
+
239
+ self.start_write(write);
240
+ }
241
+
242
+ fn start_write(self: *Storage, write: *Storage.Write) void {
243
+ self.assert_bounds(write.buffer, write.offset);
244
+ self.io.write(
245
+ *Storage,
246
+ self,
247
+ on_write,
248
+ &write.completion,
249
+ self.fd,
250
+ write.buffer,
251
+ write.offset,
252
+ );
253
+ }
254
+
255
+ fn on_write(self: *Storage, completion: *IO.Completion, result: IO.WriteError!usize) void {
256
+ const write = @fieldParentPtr(Storage.Write, "completion", completion);
257
+
258
+ const bytes_written = result catch |err| switch (err) {
108
259
  // We assume that the disk will attempt to reallocate a spare sector for any LSE.
109
- // TODO What if we receive an EIO error because of a faulty cable?
260
+ // TODO What if we receive a temporary EIO error because of a faulty cable?
110
261
  error.InputOutput => @panic("latent sector error: no spare sectors to reallocate"),
262
+ // TODO: It seems like it might be possible for some filesystems to return ETIMEDOUT
263
+ // here. Consider handling this without panicking.
111
264
  else => {
112
- log.emerg("write: buffer.len={} offset={} error={}", .{ buffer.len, offset, err });
113
- @panic("unrecoverable disk error");
265
+ log.emerg(
266
+ "impossible write: offset={} buffer.len={} error={s}",
267
+ .{ write.offset, write.buffer.len, @errorName(err) },
268
+ );
269
+ @panic("impossible write");
114
270
  },
115
271
  };
272
+
273
+ if (bytes_written == 0) {
274
+ // This should never happen if the kernel and filesystem are well behaved.
275
+ // However, block devices are known to exhibit this behavior in the wild.
276
+ // TODO: Consider retrying with a timeout if this panic proves problematic, and be
277
+ // careful to avoid logging in a busy loop. Perhaps a better approach might be to
278
+ // return wrote = null here and let the protocol retry at a higher layer where there is
279
+ // more context available to decide on how important this is or whether to cancel.
280
+ @panic("write operation returned 0 bytes written");
281
+ }
282
+
283
+ write.offset += bytes_written;
284
+ write.buffer = write.buffer[bytes_written..];
285
+
286
+ if (write.buffer.len == 0) {
287
+ write.callback(write);
288
+ return;
289
+ }
290
+
291
+ self.start_write(write);
116
292
  }
117
293
 
118
- fn assert_bounds_and_alignment(self: *Storage, buffer: []const u8, offset: u64) void {
294
+ /// Ensures that the read or write is aligned correctly for Direct I/O.
295
+ /// If this is not the case, then the underlying syscall will return EINVAL.
296
+ /// We check this only at the start of a read or write because the physical sector size may be
297
+ /// less than our logical sector size so that partial IOs then leave us no longer aligned.
298
+ fn assert_alignment(self: *Storage, buffer: []const u8, offset: u64) void {
299
+ assert(@ptrToInt(buffer.ptr) % config.sector_size == 0);
300
+ assert(buffer.len % config.sector_size == 0);
301
+ assert(offset % config.sector_size == 0);
302
+ }
303
+
304
+ /// Ensures that the read or write is within bounds and intends to read or write some bytes.
305
+ fn assert_bounds(self: *Storage, buffer: []const u8, offset: u64) void {
119
306
  assert(buffer.len > 0);
120
307
  assert(offset + buffer.len <= self.size);
308
+ }
309
+
310
+ // Static helper functions to handle data file creation/opening/allocation:
311
+
312
+ /// Opens or creates a journal file:
313
+ /// - For reading and writing.
314
+ /// - For Direct I/O (if possible in development mode, but required in production mode).
315
+ /// - Obtains an advisory exclusive lock to the file descriptor.
316
+ /// - Allocates the file contiguously on disk if this is supported by the file system.
317
+ /// - Ensures that the file data (and file inode in the parent directory) is durable on disk.
318
+ /// The caller is responsible for ensuring that the parent directory inode is durable.
319
+ /// - Verifies that the file size matches the expected file size before returning.
320
+ pub fn open(
321
+ dir_fd: os.fd_t,
322
+ relative_path: [:0]const u8,
323
+ size: u64,
324
+ must_create: bool,
325
+ ) !os.fd_t {
326
+ assert(relative_path.len > 0);
327
+ assert(size >= config.sector_size);
328
+ assert(size % config.sector_size == 0);
329
+
330
+ // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
331
+ // This is much stronger than an advisory exclusive lock, and is required on some platforms.
332
+
333
+ var flags: u32 = os.O_CLOEXEC | os.O_RDWR | os.O_DSYNC;
334
+ var mode: os.mode_t = 0;
335
+
336
+ // TODO Document this and investigate whether this is in fact correct to set here.
337
+ if (@hasDecl(os, "O_LARGEFILE")) flags |= os.O_LARGEFILE;
338
+
339
+ var direct_io_supported = false;
340
+ if (config.direct_io) {
341
+ direct_io_supported = try Storage.fs_supports_direct_io(dir_fd);
342
+ if (direct_io_supported) {
343
+ if (!is_darwin) flags |= os.O_DIRECT;
344
+ } else if (config.deployment_environment == .development) {
345
+ log.warn("file system does not support Direct I/O", .{});
346
+ } else {
347
+ // We require Direct I/O for safety to handle fsync failure correctly, and therefore
348
+ // panic in production if it is not supported.
349
+ @panic("file system does not support Direct I/O");
350
+ }
351
+ }
352
+
353
+ if (must_create) {
354
+ log.info("creating \"{s}\"...", .{relative_path});
355
+ flags |= os.O_CREAT;
356
+ flags |= os.O_EXCL;
357
+ mode = 0o666;
358
+ } else {
359
+ log.info("opening \"{s}\"...", .{relative_path});
360
+ }
361
+
362
+ // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
363
+ assert((flags & os.O_DSYNC) > 0);
364
+
365
+ // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
366
+ assert(!std.fs.path.isAbsolute(relative_path));
367
+ const fd = try os.openatZ(dir_fd, relative_path, flags, mode);
368
+ // TODO Return a proper error message when the path exists or does not exist (init/start).
369
+ errdefer os.close(fd);
370
+
371
+ // TODO Check that the file is actually a file.
121
372
 
122
- // Ensure that the read or write is aligned correctly for Direct I/O:
123
- // If this is not the case, the underlying syscall will return EINVAL.
124
- assert(@mod(@ptrToInt(buffer.ptr), config.sector_size) == 0);
125
- assert(@mod(buffer.len, config.sector_size) == 0);
126
- assert(@mod(offset, config.sector_size) == 0);
373
+ // On darwin, use F_NOCACHE on direct_io to disable the page cache as O_DIRECT doesn't exist.
374
+ if (is_darwin and config.direct_io and direct_io_supported) {
375
+ _ = try os.fcntl(fd, os.F_NOCACHE, 1);
376
+ }
377
+
378
+ // Obtain an advisory exclusive lock that works only if all processes actually use flock().
379
+ // LOCK_NB means that we want to fail the lock without waiting if another process has it.
380
+ os.flock(fd, os.LOCK_EX | os.LOCK_NB) catch |err| switch (err) {
381
+ error.WouldBlock => @panic("another process holds the data file lock"),
382
+ else => return err,
383
+ };
384
+
385
+ // Ask the file system to allocate contiguous sectors for the file (if possible):
386
+ // If the file system does not support `fallocate()`, then this could mean more seeks or a
387
+ // panic if we run out of disk space (ENOSPC).
388
+ if (must_create) try Storage.allocate(fd, size);
389
+
390
+ // The best fsync strategy is always to fsync before reading because this prevents us from
391
+ // making decisions on data that was never durably written by a previously crashed process.
392
+ // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
393
+ // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
394
+ try os.fsync(fd);
395
+
396
+ // We fsync the parent directory to ensure that the file inode is durably written.
397
+ // The caller is responsible for the parent directory inode stored under the grandparent.
398
+ // We always do this when opening because we don't know if this was done before crashing.
399
+ try os.fsync(dir_fd);
400
+
401
+ const stat = try os.fstat(fd);
402
+ if (stat.size != size) @panic("data file inode size was truncated or corrupted");
403
+
404
+ return fd;
405
+ }
406
+
407
+ /// Allocates a file contiguously using fallocate() if supported.
408
+ /// Alternatively, writes to the last sector so that at least the file size is correct.
409
+ pub fn allocate(fd: os.fd_t, size: u64) !void {
410
+ log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
411
+ Storage.fallocate(fd, 0, 0, @intCast(i64, size)) catch |err| switch (err) {
412
+ error.OperationNotSupported => {
413
+ log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
414
+ log.notice("allocating by writing to the last sector of the file instead...", .{});
415
+
416
+ const sector_size = config.sector_size;
417
+ const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;
418
+
419
+ // Handle partial writes where the physical sector is less than a logical sector:
420
+ const offset = size - sector.len;
421
+ var written: usize = 0;
422
+ while (written < sector.len) {
423
+ written += try os.pwrite(fd, sector[written..], offset + written);
424
+ }
425
+ },
426
+ else => return err,
427
+ };
127
428
  }
128
429
 
129
- fn read_all(self: *Storage, buffer: []u8, offset: u64) !u64 {
130
- std.mem.copy(u8, buffer, self.memory[offset .. offset + buffer.len]);
131
- return buffer.len;
430
+ fn fallocate(fd: i32, mode: i32, offset: i64, length: i64) !void {
431
+ // https://stackoverflow.com/a/11497568
432
+ // https://api.kde.org/frameworks/kcoreaddons/html/posix__fallocate__mac_8h_source.html
433
+ // http://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61
434
+ if (is_darwin) {
435
+ const F_ALLOCATECONTIG = 0x2; // allocate contiguous space
436
+ const F_ALLOCATEALL = 0x4; // allocate all or nothing
437
+ const F_PEOFPOSMODE = 3; // use relative offset from the seek pos mode
438
+ const F_VOLPOSMODE = 4; // use the specified volume offset
439
+ const fstore_t = extern struct {
440
+ fst_flags: c_uint,
441
+ fst_posmode: c_int,
442
+ fst_offset: os.off_t,
443
+ fst_length: os.off_t,
444
+ fst_bytesalloc: os.off_t,
445
+ };
446
+
447
+ var store = fstore_t{
448
+ .fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL,
449
+ .fst_posmode = F_PEOFPOSMODE,
450
+ .fst_offset = 0,
451
+ .fst_length = offset + length,
452
+ .fst_bytesalloc = 0,
453
+ };
454
+
455
+ // try to pre-allocate contiguous space and fall back to default non-contiguous
456
+ var res = os.system.fcntl(fd, os.F_PREALLOCATE, @ptrToInt(&store));
457
+ if (os.errno(res) != 0) {
458
+ store.fst_flags = F_ALLOCATEALL;
459
+ res = os.system.fcntl(fd, os.F_PREALLOCATE, @ptrToInt(&store));
460
+ }
461
+
462
+ switch (os.errno(res)) {
463
+ 0 => {},
464
+ os.EACCES => unreachable, // F_SETLK or F_SETSIZE of F_WRITEBOOTSTRAP
465
+ os.EBADF => return error.FileDescriptorInvalid,
466
+ os.EDEADLK => unreachable, // F_SETLKW
467
+ os.EINTR => unreachable, // F_SETLKW
468
+ os.EINVAL => return error.ArgumentsInvalid, // for F_PREALLOCATE (offset invalid)
469
+ os.EMFILE => unreachable, // F_DUPFD or F_DUPED
470
+ os.ENOLCK => unreachable, // F_SETLK or F_SETLKW
471
+ os.EOVERFLOW => return error.FileTooBig,
472
+ os.ESRCH => unreachable, // F_SETOWN
473
+ os.EOPNOTSUPP => return error.OperationNotSupported, // not reported but need same error union
474
+ else => |errno| return os.unexpectedErrno(errno),
475
+ }
476
+
477
+ // now actually perform the allocation
478
+ return os.ftruncate(fd, @intCast(u64, length)) catch |err| switch (err) {
479
+ error.AccessDenied => error.PermissionDenied,
480
+ else => |e| e,
481
+ };
482
+ }
483
+
484
+ while (true) {
485
+ const rc = os.linux.fallocate(fd, mode, offset, length);
486
+ switch (os.linux.getErrno(rc)) {
487
+ 0 => return,
488
+ os.linux.EBADF => return error.FileDescriptorInvalid,
489
+ os.linux.EFBIG => return error.FileTooBig,
490
+ os.linux.EINTR => continue,
491
+ os.linux.EINVAL => return error.ArgumentsInvalid,
492
+ os.linux.EIO => return error.InputOutput,
493
+ os.linux.ENODEV => return error.NoDevice,
494
+ os.linux.ENOSPC => return error.NoSpaceLeft,
495
+ os.linux.ENOSYS => return error.SystemOutdated,
496
+ os.linux.EOPNOTSUPP => return error.OperationNotSupported,
497
+ os.linux.EPERM => return error.PermissionDenied,
498
+ os.linux.ESPIPE => return error.Unseekable,
499
+ os.linux.ETXTBSY => return error.FileBusy,
500
+ else => |errno| return os.unexpectedErrno(errno),
501
+ }
502
+ }
132
503
  }
133
504
 
134
- fn write_all(self: *Storage, buffer: []const u8, offset: u64) !void {
135
- std.mem.copy(u8, self.memory[offset .. offset + buffer.len], buffer);
505
+ /// Detects whether the underlying file system for a given directory fd supports Direct I/O.
506
+ /// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
507
+ fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
508
+ if (!@hasDecl(std.os, "O_DIRECT") and !is_darwin) return false;
509
+
510
+ const path = "fs_supports_direct_io";
511
+ const dir = std.fs.Dir{ .fd = dir_fd };
512
+ const fd = try os.openatZ(dir_fd, path, os.O_CLOEXEC | os.O_CREAT | os.O_TRUNC, 0o666);
513
+ defer os.close(fd);
514
+ defer dir.deleteFile(path) catch {};
515
+
516
+ // F_NOCACHE on darwin is the most similar option to O_DIRECT on linux.
517
+ if (is_darwin) {
518
+ _ = os.fcntl(fd, os.F_NOCACHE, 1) catch return false;
519
+ return true;
520
+ }
521
+
522
+ while (true) {
523
+ const res = os.system.openat(dir_fd, path, os.O_CLOEXEC | os.O_RDONLY | os.O_DIRECT, 0);
524
+ switch (os.linux.getErrno(res)) {
525
+ 0 => {
526
+ os.close(@intCast(os.fd_t, res));
527
+ return true;
528
+ },
529
+ os.linux.EINTR => continue,
530
+ os.linux.EINVAL => return false,
531
+ else => |err| return os.unexpectedErrno(err),
532
+ }
533
+ }
136
534
  }
137
535
  };