tigerbeetle-node 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -4
- package/package.json +1 -1
- package/src/node.zig +2 -12
- package/src/tigerbeetle/scripts/benchmark.bat +46 -0
- package/src/tigerbeetle/scripts/install_zig.bat +2 -2
- package/src/tigerbeetle/scripts/install_zig.sh +1 -1
- package/src/tigerbeetle/scripts/vopr.sh +2 -2
- package/src/tigerbeetle/src/benchmark.zig +2 -6
- package/src/tigerbeetle/src/cli.zig +39 -18
- package/src/tigerbeetle/src/config.zig +24 -9
- package/src/tigerbeetle/src/demo.zig +1 -1
- package/src/tigerbeetle/src/io/benchmark.zig +24 -49
- package/src/tigerbeetle/src/io/darwin.zig +175 -44
- package/src/tigerbeetle/src/io/linux.zig +177 -72
- package/src/tigerbeetle/src/io/test.zig +61 -39
- package/src/tigerbeetle/src/io/windows.zig +1161 -0
- package/src/tigerbeetle/src/io.zig +2 -0
- package/src/tigerbeetle/src/main.zig +13 -8
- package/src/tigerbeetle/src/message_bus.zig +49 -61
- package/src/tigerbeetle/src/message_pool.zig +63 -57
- package/src/tigerbeetle/src/ring_buffer.zig +7 -0
- package/src/tigerbeetle/src/simulator.zig +4 -4
- package/src/tigerbeetle/src/storage.zig +0 -230
- package/src/tigerbeetle/src/test/cluster.zig +3 -6
- package/src/tigerbeetle/src/test/message_bus.zig +4 -3
- package/src/tigerbeetle/src/test/network.zig +13 -16
- package/src/tigerbeetle/src/test/state_checker.zig +3 -2
- package/src/tigerbeetle/src/tigerbeetle.zig +5 -3
- package/src/tigerbeetle/src/time.zig +58 -11
- package/src/tigerbeetle/src/vsr/client.zig +18 -32
- package/src/tigerbeetle/src/vsr/clock.zig +1 -1
- package/src/tigerbeetle/src/vsr/journal.zig +2 -6
- package/src/tigerbeetle/src/vsr/replica.zig +146 -169
- package/src/tigerbeetle/src/vsr.zig +263 -5
|
@@ -6,8 +6,6 @@ const assert = std.debug.assert;
|
|
|
6
6
|
const log = std.log.scoped(.storage);
|
|
7
7
|
|
|
8
8
|
const IO = @import("io.zig").IO;
|
|
9
|
-
const is_darwin = builtin.target.isDarwin();
|
|
10
|
-
|
|
11
9
|
const config = @import("config.zig");
|
|
12
10
|
const vsr = @import("vsr.zig");
|
|
13
11
|
|
|
@@ -307,232 +305,4 @@ pub const Storage = struct {
|
|
|
307
305
|
assert(buffer.len > 0);
|
|
308
306
|
assert(offset + buffer.len <= self.size);
|
|
309
307
|
}
|
|
310
|
-
|
|
311
|
-
// Static helper functions to handle data file creation/opening/allocation:
|
|
312
|
-
|
|
313
|
-
/// Opens or creates a journal file:
|
|
314
|
-
/// - For reading and writing.
|
|
315
|
-
/// - For Direct I/O (if possible in development mode, but required in production mode).
|
|
316
|
-
/// - Obtains an advisory exclusive lock to the file descriptor.
|
|
317
|
-
/// - Allocates the file contiguously on disk if this is supported by the file system.
|
|
318
|
-
/// - Ensures that the file data (and file inode in the parent directory) is durable on disk.
|
|
319
|
-
/// The caller is responsible for ensuring that the parent directory inode is durable.
|
|
320
|
-
/// - Verifies that the file size matches the expected file size before returning.
|
|
321
|
-
pub fn open(
|
|
322
|
-
dir_fd: os.fd_t,
|
|
323
|
-
relative_path: [:0]const u8,
|
|
324
|
-
size: u64,
|
|
325
|
-
must_create: bool,
|
|
326
|
-
) !os.fd_t {
|
|
327
|
-
assert(relative_path.len > 0);
|
|
328
|
-
assert(size >= config.sector_size);
|
|
329
|
-
assert(size % config.sector_size == 0);
|
|
330
|
-
|
|
331
|
-
// TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
|
|
332
|
-
// This is much stronger than an advisory exclusive lock, and is required on some platforms.
|
|
333
|
-
|
|
334
|
-
var flags: u32 = os.O.CLOEXEC | os.O.RDWR | os.O.DSYNC;
|
|
335
|
-
var mode: os.mode_t = 0;
|
|
336
|
-
|
|
337
|
-
// TODO Document this and investigate whether this is in fact correct to set here.
|
|
338
|
-
if (@hasDecl(os, "O_LARGEFILE")) flags |= os.O.LARGEFILE;
|
|
339
|
-
|
|
340
|
-
var direct_io_supported = false;
|
|
341
|
-
if (config.direct_io) {
|
|
342
|
-
direct_io_supported = try Storage.fs_supports_direct_io(dir_fd);
|
|
343
|
-
if (direct_io_supported) {
|
|
344
|
-
if (!is_darwin) flags |= os.O.DIRECT;
|
|
345
|
-
} else if (config.deployment_environment == .development) {
|
|
346
|
-
log.warn("file system does not support Direct I/O", .{});
|
|
347
|
-
} else {
|
|
348
|
-
// We require Direct I/O for safety to handle fsync failure correctly, and therefore
|
|
349
|
-
// panic in production if it is not supported.
|
|
350
|
-
@panic("file system does not support Direct I/O");
|
|
351
|
-
}
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
if (must_create) {
|
|
355
|
-
log.info("creating \"{s}\"...", .{relative_path});
|
|
356
|
-
flags |= os.O.CREAT;
|
|
357
|
-
flags |= os.O.EXCL;
|
|
358
|
-
mode = 0o666;
|
|
359
|
-
} else {
|
|
360
|
-
log.info("opening \"{s}\"...", .{relative_path});
|
|
361
|
-
}
|
|
362
|
-
|
|
363
|
-
// This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
|
|
364
|
-
assert((flags & os.O.DSYNC) > 0);
|
|
365
|
-
|
|
366
|
-
// Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
|
|
367
|
-
assert(!std.fs.path.isAbsolute(relative_path));
|
|
368
|
-
const fd = try os.openatZ(dir_fd, relative_path, flags, mode);
|
|
369
|
-
// TODO Return a proper error message when the path exists or does not exist (init/start).
|
|
370
|
-
errdefer os.close(fd);
|
|
371
|
-
|
|
372
|
-
// TODO Check that the file is actually a file.
|
|
373
|
-
|
|
374
|
-
// On darwin, use F_NOCACHE on direct_io to disable the page cache as O_DIRECT doesn't exist.
|
|
375
|
-
if (is_darwin and config.direct_io and direct_io_supported) {
|
|
376
|
-
_ = try os.fcntl(fd, os.F.NOCACHE, 1);
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
// Obtain an advisory exclusive lock that works only if all processes actually use flock().
|
|
380
|
-
// LOCK_NB means that we want to fail the lock without waiting if another process has it.
|
|
381
|
-
os.flock(fd, os.LOCK.EX | os.LOCK.NB) catch |err| switch (err) {
|
|
382
|
-
error.WouldBlock => @panic("another process holds the data file lock"),
|
|
383
|
-
else => return err,
|
|
384
|
-
};
|
|
385
|
-
|
|
386
|
-
// Ask the file system to allocate contiguous sectors for the file (if possible):
|
|
387
|
-
// If the file system does not support `fallocate()`, then this could mean more seeks or a
|
|
388
|
-
// panic if we run out of disk space (ENOSPC).
|
|
389
|
-
if (must_create) try Storage.allocate(fd, size);
|
|
390
|
-
|
|
391
|
-
// The best fsync strategy is always to fsync before reading because this prevents us from
|
|
392
|
-
// making decisions on data that was never durably written by a previously crashed process.
|
|
393
|
-
// We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
|
|
394
|
-
// Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
|
|
395
|
-
try os.fsync(fd);
|
|
396
|
-
|
|
397
|
-
// We fsync the parent directory to ensure that the file inode is durably written.
|
|
398
|
-
// The caller is responsible for the parent directory inode stored under the grandparent.
|
|
399
|
-
// We always do this when opening because we don't know if this was done before crashing.
|
|
400
|
-
try os.fsync(dir_fd);
|
|
401
|
-
|
|
402
|
-
const stat = try os.fstat(fd);
|
|
403
|
-
if (stat.size != size) @panic("data file inode size was truncated or corrupted");
|
|
404
|
-
|
|
405
|
-
return fd;
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
/// Allocates a file contiguously using fallocate() if supported.
|
|
409
|
-
/// Alternatively, writes to the last sector so that at least the file size is correct.
|
|
410
|
-
pub fn allocate(fd: os.fd_t, size: u64) !void {
|
|
411
|
-
log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
|
|
412
|
-
Storage.fallocate(fd, 0, 0, @intCast(i64, size)) catch |err| switch (err) {
|
|
413
|
-
error.OperationNotSupported => {
|
|
414
|
-
log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
|
|
415
|
-
log.info("allocating by writing to the last sector of the file instead...", .{});
|
|
416
|
-
|
|
417
|
-
const sector_size = config.sector_size;
|
|
418
|
-
const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;
|
|
419
|
-
|
|
420
|
-
// Handle partial writes where the physical sector is less than a logical sector:
|
|
421
|
-
const offset = size - sector.len;
|
|
422
|
-
var written: usize = 0;
|
|
423
|
-
while (written < sector.len) {
|
|
424
|
-
written += try os.pwrite(fd, sector[written..], offset + written);
|
|
425
|
-
}
|
|
426
|
-
},
|
|
427
|
-
else => return err,
|
|
428
|
-
};
|
|
429
|
-
}
|
|
430
|
-
|
|
431
|
-
fn fallocate(fd: i32, mode: i32, offset: i64, length: i64) !void {
|
|
432
|
-
// https://stackoverflow.com/a/11497568
|
|
433
|
-
// https://api.kde.org/frameworks/kcoreaddons/html/posix__fallocate__mac_8h_source.html
|
|
434
|
-
// http://hg.mozilla.org/mozilla-central/file/3d846420a907/xpcom/glue/FileUtils.cpp#l61
|
|
435
|
-
if (is_darwin) {
|
|
436
|
-
const F_ALLOCATECONTIG = 0x2; // allocate contiguous space
|
|
437
|
-
const F_ALLOCATEALL = 0x4; // allocate all or nothing
|
|
438
|
-
const F_PEOFPOSMODE = 3; // use relative offset from the seek pos mode
|
|
439
|
-
const F_VOLPOSMODE = 4; // use the specified volume offset
|
|
440
|
-
_ = F_VOLPOSMODE;
|
|
441
|
-
|
|
442
|
-
const fstore_t = extern struct {
|
|
443
|
-
fst_flags: c_uint,
|
|
444
|
-
fst_posmode: c_int,
|
|
445
|
-
fst_offset: os.off_t,
|
|
446
|
-
fst_length: os.off_t,
|
|
447
|
-
fst_bytesalloc: os.off_t,
|
|
448
|
-
};
|
|
449
|
-
|
|
450
|
-
var store = fstore_t{
|
|
451
|
-
.fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL,
|
|
452
|
-
.fst_posmode = F_PEOFPOSMODE,
|
|
453
|
-
.fst_offset = 0,
|
|
454
|
-
.fst_length = offset + length,
|
|
455
|
-
.fst_bytesalloc = 0,
|
|
456
|
-
};
|
|
457
|
-
|
|
458
|
-
// try to pre-allocate contiguous space and fall back to default non-contiguous
|
|
459
|
-
var res = os.system.fcntl(fd, os.F.PREALLOCATE, @ptrToInt(&store));
|
|
460
|
-
if (os.errno(res) != .SUCCESS) {
|
|
461
|
-
store.fst_flags = F_ALLOCATEALL;
|
|
462
|
-
res = os.system.fcntl(fd, os.F.PREALLOCATE, @ptrToInt(&store));
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
switch (os.errno(res)) {
|
|
466
|
-
.SUCCESS => {},
|
|
467
|
-
.ACCES => unreachable, // F_SETLK or F_SETSIZE of F_WRITEBOOTSTRAP
|
|
468
|
-
.BADF => return error.FileDescriptorInvalid,
|
|
469
|
-
.DEADLK => unreachable, // F_SETLKW
|
|
470
|
-
.INTR => unreachable, // F_SETLKW
|
|
471
|
-
.INVAL => return error.ArgumentsInvalid, // for F_PREALLOCATE (offset invalid)
|
|
472
|
-
.MFILE => unreachable, // F_DUPFD or F_DUPED
|
|
473
|
-
.NOLCK => unreachable, // F_SETLK or F_SETLKW
|
|
474
|
-
.OVERFLOW => return error.FileTooBig,
|
|
475
|
-
.SRCH => unreachable, // F_SETOWN
|
|
476
|
-
.OPNOTSUPP => return error.OperationNotSupported, // not reported but need same error union
|
|
477
|
-
else => |errno| return os.unexpectedErrno(errno),
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
// now actually perform the allocation
|
|
481
|
-
return os.ftruncate(fd, @intCast(u64, length)) catch |err| switch (err) {
|
|
482
|
-
error.AccessDenied => error.PermissionDenied,
|
|
483
|
-
else => |e| e,
|
|
484
|
-
};
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
while (true) {
|
|
488
|
-
const rc = os.linux.fallocate(fd, mode, offset, length);
|
|
489
|
-
switch (os.linux.getErrno(rc)) {
|
|
490
|
-
.SUCCESS => return,
|
|
491
|
-
.BADF => return error.FileDescriptorInvalid,
|
|
492
|
-
.FBIG => return error.FileTooBig,
|
|
493
|
-
.INTR => continue,
|
|
494
|
-
.INVAL => return error.ArgumentsInvalid,
|
|
495
|
-
.IO => return error.InputOutput,
|
|
496
|
-
.NODEV => return error.NoDevice,
|
|
497
|
-
.NOSPC => return error.NoSpaceLeft,
|
|
498
|
-
.NOSYS => return error.SystemOutdated,
|
|
499
|
-
.OPNOTSUPP => return error.OperationNotSupported,
|
|
500
|
-
.PERM => return error.PermissionDenied,
|
|
501
|
-
.SPIPE => return error.Unseekable,
|
|
502
|
-
.TXTBSY => return error.FileBusy,
|
|
503
|
-
else => |errno| return os.unexpectedErrno(errno),
|
|
504
|
-
}
|
|
505
|
-
}
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
/// Detects whether the underlying file system for a given directory fd supports Direct I/O.
|
|
509
|
-
/// Not all Linux file systems support `O_DIRECT`, e.g. a shared macOS volume.
|
|
510
|
-
fn fs_supports_direct_io(dir_fd: std.os.fd_t) !bool {
|
|
511
|
-
if (!@hasDecl(std.os, "O_DIRECT") and !is_darwin) return false;
|
|
512
|
-
|
|
513
|
-
const path = "fs_supports_direct_io";
|
|
514
|
-
const dir = std.fs.Dir{ .fd = dir_fd };
|
|
515
|
-
const fd = try os.openatZ(dir_fd, path, os.O.CLOEXEC | os.O.CREAT | os.O.TRUNC, 0o666);
|
|
516
|
-
defer os.close(fd);
|
|
517
|
-
defer dir.deleteFile(path) catch {};
|
|
518
|
-
|
|
519
|
-
// F_NOCACHE on darwin is the most similar option to O_DIRECT on linux.
|
|
520
|
-
if (is_darwin) {
|
|
521
|
-
_ = os.fcntl(fd, os.F.NOCACHE, 1) catch return false;
|
|
522
|
-
return true;
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
while (true) {
|
|
526
|
-
const res = os.system.openat(dir_fd, path, os.O.CLOEXEC | os.O.RDONLY | os.O.DIRECT, 0);
|
|
527
|
-
switch (os.linux.getErrno(res)) {
|
|
528
|
-
0 => {
|
|
529
|
-
os.close(@intCast(os.fd_t, res));
|
|
530
|
-
return true;
|
|
531
|
-
},
|
|
532
|
-
os.linux.EINTR => continue,
|
|
533
|
-
os.linux.EINVAL => return false,
|
|
534
|
-
else => |err| return os.unexpectedErrno(err),
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
}
|
|
538
308
|
};
|
|
@@ -6,7 +6,8 @@ const config = @import("../config.zig");
|
|
|
6
6
|
|
|
7
7
|
const StateChecker = @import("state_checker.zig").StateChecker;
|
|
8
8
|
|
|
9
|
-
const
|
|
9
|
+
const message_pool = @import("../message_pool.zig");
|
|
10
|
+
const MessagePool = message_pool.MessagePool;
|
|
10
11
|
const Message = MessagePool.Message;
|
|
11
12
|
|
|
12
13
|
const Network = @import("network.zig").Network;
|
|
@@ -197,12 +198,8 @@ pub const Cluster = struct {
|
|
|
197
198
|
var it = message_bus.pool.free_list;
|
|
198
199
|
while (it) |message| : (it = message.next) messages_in_pool += 1;
|
|
199
200
|
}
|
|
200
|
-
{
|
|
201
|
-
var it = message_bus.pool.header_only_free_list;
|
|
202
|
-
while (it) |message| : (it = message.next) messages_in_pool += 1;
|
|
203
|
-
}
|
|
204
201
|
|
|
205
|
-
const total_messages =
|
|
202
|
+
const total_messages = message_pool.messages_max_replica;
|
|
206
203
|
assert(messages_in_network + messages_in_pool == total_messages);
|
|
207
204
|
|
|
208
205
|
replica.* = try Replica.init(
|
|
@@ -6,12 +6,13 @@ const config = @import("../config.zig");
|
|
|
6
6
|
const MessagePool = @import("../message_pool.zig").MessagePool;
|
|
7
7
|
const Message = MessagePool.Message;
|
|
8
8
|
const Header = @import("../vsr.zig").Header;
|
|
9
|
+
const ProcessType = @import("../vsr.zig").ProcessType;
|
|
9
10
|
|
|
10
11
|
const Network = @import("network.zig").Network;
|
|
11
12
|
|
|
12
13
|
const log = std.log.scoped(.message_bus);
|
|
13
14
|
|
|
14
|
-
pub const Process = union(
|
|
15
|
+
pub const Process = union(ProcessType) {
|
|
15
16
|
replica: u8,
|
|
16
17
|
client: u128,
|
|
17
18
|
};
|
|
@@ -35,7 +36,7 @@ pub const MessageBus = struct {
|
|
|
35
36
|
network: *Network,
|
|
36
37
|
) !MessageBus {
|
|
37
38
|
return MessageBus{
|
|
38
|
-
.pool = try MessagePool.init(allocator),
|
|
39
|
+
.pool = try MessagePool.init(allocator, @as(ProcessType, process)),
|
|
39
40
|
.network = network,
|
|
40
41
|
.cluster = cluster,
|
|
41
42
|
.process = process,
|
|
@@ -61,7 +62,7 @@ pub const MessageBus = struct {
|
|
|
61
62
|
|
|
62
63
|
pub fn tick(_: *MessageBus) void {}
|
|
63
64
|
|
|
64
|
-
pub fn get_message(bus: *MessageBus)
|
|
65
|
+
pub fn get_message(bus: *MessageBus) *Message {
|
|
65
66
|
return bus.pool.get_message();
|
|
66
67
|
}
|
|
67
68
|
|
|
@@ -28,7 +28,7 @@ pub const Network = struct {
|
|
|
28
28
|
message: *Message,
|
|
29
29
|
|
|
30
30
|
pub fn deinit(packet: *const Packet, path: PacketSimulatorPath) void {
|
|
31
|
-
const source_bus = &packet.network.
|
|
31
|
+
const source_bus = &packet.network.buses.items[path.source];
|
|
32
32
|
source_bus.unref(packet.message);
|
|
33
33
|
}
|
|
34
34
|
};
|
|
@@ -43,7 +43,7 @@ pub const Network = struct {
|
|
|
43
43
|
options: NetworkOptions,
|
|
44
44
|
packet_simulator: PacketSimulator(Packet),
|
|
45
45
|
|
|
46
|
-
|
|
46
|
+
buses: std.ArrayListUnmanaged(MessageBus),
|
|
47
47
|
processes: std.ArrayListUnmanaged(u128),
|
|
48
48
|
|
|
49
49
|
pub fn init(
|
|
@@ -55,8 +55,8 @@ pub const Network = struct {
|
|
|
55
55
|
const process_count = client_count + replica_count;
|
|
56
56
|
assert(process_count <= std.math.maxInt(u8));
|
|
57
57
|
|
|
58
|
-
var
|
|
59
|
-
errdefer
|
|
58
|
+
var buses = try std.ArrayListUnmanaged(MessageBus).initCapacity(allocator, process_count);
|
|
59
|
+
errdefer buses.deinit(allocator);
|
|
60
60
|
|
|
61
61
|
var processes = try std.ArrayListUnmanaged(u128).initCapacity(allocator, process_count);
|
|
62
62
|
errdefer processes.deinit(allocator);
|
|
@@ -71,18 +71,18 @@ pub const Network = struct {
|
|
|
71
71
|
.allocator = allocator,
|
|
72
72
|
.options = options,
|
|
73
73
|
.packet_simulator = packet_simulator,
|
|
74
|
-
.
|
|
74
|
+
.buses = buses,
|
|
75
75
|
.processes = processes,
|
|
76
76
|
};
|
|
77
77
|
}
|
|
78
78
|
|
|
79
79
|
pub fn deinit(network: *Network) void {
|
|
80
|
-
// TODO: deinit the
|
|
81
|
-
network.
|
|
80
|
+
// TODO: deinit the buses themselves when they gain a deinit()
|
|
81
|
+
network.buses.deinit(network.allocator);
|
|
82
82
|
network.processes.deinit(network.allocator);
|
|
83
83
|
}
|
|
84
84
|
|
|
85
|
-
/// Returns the address (index into Network.
|
|
85
|
+
/// Returns the address (index into Network.buses)
|
|
86
86
|
pub fn init_message_bus(network: *Network, cluster: u32, process: Process) !*MessageBus {
|
|
87
87
|
const raw_process = switch (process) {
|
|
88
88
|
.replica => |replica| replica,
|
|
@@ -97,9 +97,9 @@ pub const Network = struct {
|
|
|
97
97
|
const bus = try MessageBus.init(network.allocator, cluster, process, network);
|
|
98
98
|
|
|
99
99
|
network.processes.appendAssumeCapacity(raw_process);
|
|
100
|
-
network.
|
|
100
|
+
network.buses.appendAssumeCapacity(bus);
|
|
101
101
|
|
|
102
|
-
return &network.
|
|
102
|
+
return &network.buses.items[network.buses.items.len - 1];
|
|
103
103
|
}
|
|
104
104
|
|
|
105
105
|
pub fn send_message(network: *Network, message: *Message, path: Path) void {
|
|
@@ -131,18 +131,15 @@ pub const Network = struct {
|
|
|
131
131
|
}
|
|
132
132
|
|
|
133
133
|
pub fn get_message_bus(network: *Network, process: Process) *MessageBus {
|
|
134
|
-
return &network.
|
|
134
|
+
return &network.buses.items[network.process_to_address(process)];
|
|
135
135
|
}
|
|
136
136
|
|
|
137
137
|
fn deliver_message(packet: Packet, path: PacketSimulatorPath) void {
|
|
138
138
|
const network = packet.network;
|
|
139
139
|
|
|
140
|
-
const target_bus = &network.
|
|
140
|
+
const target_bus = &network.buses.items[path.target];
|
|
141
141
|
|
|
142
|
-
const message = target_bus.get_message()
|
|
143
|
-
log.debug("deliver_message: target message bus has no free messages, dropping", .{});
|
|
144
|
-
return;
|
|
145
|
-
};
|
|
142
|
+
const message = target_bus.get_message();
|
|
146
143
|
defer target_bus.unref(message);
|
|
147
144
|
|
|
148
145
|
std.mem.copy(u8, message.buffer, packet.message.buffer);
|
|
@@ -8,12 +8,13 @@ const Cluster = @import("cluster.zig").Cluster;
|
|
|
8
8
|
const Network = @import("network.zig").Network;
|
|
9
9
|
const StateMachine = @import("state_machine.zig").StateMachine;
|
|
10
10
|
|
|
11
|
-
const
|
|
11
|
+
const message_pool = @import("../message_pool.zig");
|
|
12
|
+
const MessagePool = message_pool.MessagePool;
|
|
12
13
|
const Message = MessagePool.Message;
|
|
13
14
|
|
|
14
15
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
15
16
|
|
|
16
|
-
const RequestQueue = RingBuffer(u128, config.
|
|
17
|
+
const RequestQueue = RingBuffer(u128, config.client_request_queue_max);
|
|
17
18
|
const StateTransitions = std.AutoHashMap(u128, u64);
|
|
18
19
|
|
|
19
20
|
const log = std.log.scoped(.state_checker);
|
|
@@ -204,12 +204,14 @@ pub const CommitTransfersResult = packed struct {
|
|
|
204
204
|
};
|
|
205
205
|
|
|
206
206
|
comptime {
|
|
207
|
-
|
|
208
|
-
|
|
207
|
+
const target = builtin.target;
|
|
208
|
+
|
|
209
|
+
if (target.os.tag != .linux and !target.isDarwin() and target.os.tag != .windows) {
|
|
210
|
+
@compileError("linux, windows or macos is required for io");
|
|
209
211
|
}
|
|
210
212
|
|
|
211
213
|
// We require little-endian architectures everywhere for efficient network deserialization:
|
|
212
|
-
if (
|
|
214
|
+
if (target.cpu.arch.endian() != .Little) {
|
|
213
215
|
@compileError("big-endian systems not supported");
|
|
214
216
|
}
|
|
215
217
|
}
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
const std = @import("std");
|
|
2
2
|
const builtin = @import("builtin");
|
|
3
|
-
const assert = std.debug.assert;
|
|
4
|
-
const is_darwin = builtin.target.isDarwin();
|
|
5
3
|
const config = @import("./config.zig");
|
|
6
4
|
|
|
5
|
+
const os = std.os;
|
|
6
|
+
const assert = std.debug.assert;
|
|
7
|
+
const is_darwin = builtin.target.os.tag.isDarwin();
|
|
8
|
+
const is_windows = builtin.target.os.tag == .windows;
|
|
9
|
+
|
|
7
10
|
pub const Time = struct {
|
|
8
11
|
const Self = @This();
|
|
9
12
|
|
|
@@ -19,19 +22,44 @@ pub const Time = struct {
|
|
|
19
22
|
/// system administrator manually changes the clock.
|
|
20
23
|
pub fn monotonic(self: *Self) u64 {
|
|
21
24
|
const m = blk: {
|
|
25
|
+
// Uses QueryPerformanceCounter() on windows due to it being the highest precision timer
|
|
26
|
+
// available while also accounting for time spent suspended by default:
|
|
27
|
+
// https://docs.microsoft.com/en-us/windows/win32/api/realtimeapiset/nf-realtimeapiset-queryunbiasedinterrupttime#remarks
|
|
28
|
+
if (is_windows) {
|
|
29
|
+
// QPF need not be globally cached either as it ends up being a load from read-only
|
|
30
|
+
// memory mapped to all processes by the kernel called KUSER_SHARED_DATA (See "QpcFrequency")
|
|
31
|
+
// https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/ntddk/ns-ntddk-kuser_shared_data
|
|
32
|
+
// https://www.geoffchappell.com/studies/windows/km/ntoskrnl/inc/api/ntexapi_x/kuser_shared_data/index.htm
|
|
33
|
+
const qpc = os.windows.QueryPerformanceCounter();
|
|
34
|
+
const qpf = os.windows.QueryPerformanceFrequency();
|
|
35
|
+
|
|
36
|
+
// 10MHz (1 qpc tick every 100ns) is a common QPF on modern systems.
|
|
37
|
+
// We can optimize towards this by converting to ns via a single multiply.
|
|
38
|
+
// https://github.com/microsoft/STL/blob/785143a0c73f030238ef618890fd4d6ae2b3a3a0/stl/inc/chrono#L694-L701
|
|
39
|
+
const common_qpf = 10_000_000;
|
|
40
|
+
if (qpf == common_qpf) break :blk qpc * (std.time.ns_per_s / common_qpf);
|
|
41
|
+
|
|
42
|
+
// Convert qpc to nanos using fixed point to avoid expensive extra divs and overflow.
|
|
43
|
+
const scale = (std.time.ns_per_s << 32) / qpf;
|
|
44
|
+
break :blk @truncate(u64, (@as(u96, qpc) * scale) >> 32);
|
|
45
|
+
}
|
|
46
|
+
|
|
22
47
|
// Uses mach_continuous_time() instead of mach_absolute_time() as it counts while suspended.
|
|
23
48
|
// https://developer.apple.com/documentation/kernel/1646199-mach_continuous_time
|
|
24
49
|
// https://opensource.apple.com/source/Libc/Libc-1158.1.2/gen/clock_gettime.c.auto.html
|
|
25
50
|
if (is_darwin) {
|
|
26
51
|
const darwin = struct {
|
|
27
|
-
const mach_timebase_info_t =
|
|
28
|
-
extern "c" fn mach_timebase_info(info: *mach_timebase_info_t)
|
|
52
|
+
const mach_timebase_info_t = os.darwin.mach_timebase_info_data;
|
|
53
|
+
extern "c" fn mach_timebase_info(info: *mach_timebase_info_t) os.darwin.kern_return_t;
|
|
29
54
|
extern "c" fn mach_continuous_time() u64;
|
|
30
55
|
};
|
|
31
56
|
|
|
32
|
-
|
|
57
|
+
// mach_timebase_info() called through libc already does global caching for us
|
|
58
|
+
// https://opensource.apple.com/source/xnu/xnu-7195.81.3/libsyscall/wrappers/mach_timebase_info.c.auto.html
|
|
33
59
|
var info: darwin.mach_timebase_info_t = undefined;
|
|
34
60
|
if (darwin.mach_timebase_info(&info) != 0) @panic("mach_timebase_info() failed");
|
|
61
|
+
|
|
62
|
+
const now = darwin.mach_continuous_time();
|
|
35
63
|
return (now * info.numer) / info.denom;
|
|
36
64
|
}
|
|
37
65
|
|
|
@@ -40,8 +68,8 @@ pub const Time = struct {
|
|
|
40
68
|
// CLOCK_BOOTTIME is the same as CLOCK_MONOTONIC but includes elapsed time during a suspend.
|
|
41
69
|
// For more detail and why CLOCK_MONOTONIC_RAW is even worse than CLOCK_MONOTONIC,
|
|
42
70
|
// see https://github.com/ziglang/zig/pull/933#discussion_r656021295.
|
|
43
|
-
var ts:
|
|
44
|
-
|
|
71
|
+
var ts: os.timespec = undefined;
|
|
72
|
+
os.clock_gettime(os.CLOCK.BOOTTIME, &ts) catch @panic("CLOCK_BOOTTIME required");
|
|
45
73
|
break :blk @intCast(u64, ts.tv_sec) * std.time.ns_per_s + @intCast(u64, ts.tv_nsec);
|
|
46
74
|
};
|
|
47
75
|
|
|
@@ -54,11 +82,30 @@ pub const Time = struct {
|
|
|
54
82
|
/// A timestamp to measure real (i.e. wall clock) time, meaningful across systems, and reboots.
|
|
55
83
|
/// This clock is affected by discontinuous jumps in the system time.
|
|
56
84
|
pub fn realtime(_: *Self) i64 {
|
|
57
|
-
|
|
58
|
-
|
|
85
|
+
if (is_windows) {
|
|
86
|
+
const kernel32 = struct {
|
|
87
|
+
extern "kernel32" fn GetSystemTimePreciseAsFileTime(
|
|
88
|
+
lpFileTime: *os.windows.FILETIME,
|
|
89
|
+
) callconv(os.windows.WINAPI) void;
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
var ft: os.windows.FILETIME = undefined;
|
|
93
|
+
kernel32.GetSystemTimePreciseAsFileTime(&ft);
|
|
94
|
+
const ft64 = (@as(u64, ft.dwHighDateTime) << 32) | ft.dwLowDateTime;
|
|
95
|
+
|
|
96
|
+
// FileTime is in units of 100 nanoseconds
|
|
97
|
+
// and uses the NTFS/Windows epoch of 1601-01-01 instead of Unix Epoch 1970-01-01.
|
|
98
|
+
const epoch_adjust = std.time.epoch.windows * (std.time.ns_per_s / 100);
|
|
99
|
+
return (@bitCast(i64, ft64) + epoch_adjust) * 100;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
if (is_darwin) {
|
|
103
|
+
// macos has supported clock_gettime() since 10.12:
|
|
104
|
+
// https://opensource.apple.com/source/Libc/Libc-1158.1.2/gen/clock_gettime.3.auto.html
|
|
105
|
+
}
|
|
59
106
|
|
|
60
|
-
var ts:
|
|
61
|
-
|
|
107
|
+
var ts: os.timespec = undefined;
|
|
108
|
+
os.clock_gettime(os.CLOCK.REALTIME, &ts) catch unreachable;
|
|
62
109
|
return @as(i64, ts.tv_sec) * std.time.ns_per_s + ts.tv_nsec;
|
|
63
110
|
}
|
|
64
111
|
|
|
@@ -7,7 +7,8 @@ const vsr = @import("../vsr.zig");
|
|
|
7
7
|
const Header = vsr.Header;
|
|
8
8
|
|
|
9
9
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
10
|
-
const
|
|
10
|
+
const message_pool = @import("../message_pool.zig");
|
|
11
|
+
const Message = message_pool.MessagePool.Message;
|
|
11
12
|
|
|
12
13
|
const log = std.log.scoped(.client);
|
|
13
14
|
|
|
@@ -66,8 +67,7 @@ pub fn Client(comptime StateMachine: type, comptime MessageBus: type) type {
|
|
|
66
67
|
|
|
67
68
|
/// A client is allowed at most one inflight request at a time at the protocol layer.
|
|
68
69
|
/// We therefore queue any further concurrent requests made by the application layer.
|
|
69
|
-
|
|
70
|
-
request_queue: RingBuffer(Request, config.message_bus_messages_max - 1) = .{},
|
|
70
|
+
request_queue: RingBuffer(Request, config.client_request_queue_max) = .{},
|
|
71
71
|
|
|
72
72
|
/// The number of ticks without a reply before the client resends the inflight request.
|
|
73
73
|
/// Dynamically adjusted as a function of recent request round-trip time.
|
|
@@ -188,25 +188,25 @@ pub fn Client(comptime StateMachine: type, comptime MessageBus: type) type {
|
|
|
188
188
|
@tagName(operation),
|
|
189
189
|
});
|
|
190
190
|
|
|
191
|
+
if (self.request_queue.full()) {
|
|
192
|
+
callback(user_data, operation, error.TooManyOutstandingRequests);
|
|
193
|
+
return;
|
|
194
|
+
}
|
|
195
|
+
|
|
191
196
|
const was_empty = self.request_queue.empty();
|
|
192
197
|
|
|
193
|
-
self.request_queue.
|
|
198
|
+
self.request_queue.push_assume_capacity(.{
|
|
194
199
|
.user_data = user_data,
|
|
195
200
|
.callback = callback,
|
|
196
201
|
.message = message.ref(),
|
|
197
|
-
})
|
|
198
|
-
error.NoSpaceLeft => {
|
|
199
|
-
callback(user_data, operation, error.TooManyOutstandingRequests);
|
|
200
|
-
return;
|
|
201
|
-
},
|
|
202
|
-
};
|
|
202
|
+
});
|
|
203
203
|
|
|
204
204
|
// If the queue was empty, then there is no request inflight and we must send this one:
|
|
205
205
|
if (was_empty) self.send_request_for_the_first_time(message);
|
|
206
206
|
}
|
|
207
207
|
|
|
208
208
|
/// Acquires a message from the message bus if one is available.
|
|
209
|
-
pub fn get_message(self: *Self)
|
|
209
|
+
pub fn get_message(self: *Self) *Message {
|
|
210
210
|
return self.message_bus.get_message();
|
|
211
211
|
}
|
|
212
212
|
|
|
@@ -383,12 +383,12 @@ pub fn Client(comptime StateMachine: type, comptime MessageBus: type) type {
|
|
|
383
383
|
}
|
|
384
384
|
|
|
385
385
|
/// The caller owns the returned message, if any, which has exactly 1 reference.
|
|
386
|
-
fn create_message_from_header(self: *Self, header: Header)
|
|
386
|
+
fn create_message_from_header(self: *Self, header: Header) *Message {
|
|
387
387
|
assert(header.client == self.id);
|
|
388
388
|
assert(header.cluster == self.cluster);
|
|
389
389
|
assert(header.size == @sizeOf(Header));
|
|
390
390
|
|
|
391
|
-
const message = self.message_bus.pool.
|
|
391
|
+
const message = self.message_bus.pool.get_message();
|
|
392
392
|
defer self.message_bus.unref(message);
|
|
393
393
|
|
|
394
394
|
message.header.* = header;
|
|
@@ -402,8 +402,7 @@ pub fn Client(comptime StateMachine: type, comptime MessageBus: type) type {
|
|
|
402
402
|
fn register(self: *Self) void {
|
|
403
403
|
if (self.request_number > 0) return;
|
|
404
404
|
|
|
405
|
-
|
|
406
|
-
@panic("register: no message available to register a session with the cluster");
|
|
405
|
+
const message = self.message_bus.get_message();
|
|
407
406
|
defer self.message_bus.unref(message);
|
|
408
407
|
|
|
409
408
|
// We will set parent, context, view and checksums only when sending for the first time:
|
|
@@ -422,37 +421,24 @@ pub fn Client(comptime StateMachine: type, comptime MessageBus: type) type {
|
|
|
422
421
|
|
|
423
422
|
assert(self.request_queue.empty());
|
|
424
423
|
|
|
425
|
-
self.request_queue.
|
|
424
|
+
self.request_queue.push_assume_capacity(.{
|
|
426
425
|
.user_data = 0,
|
|
427
426
|
.callback = undefined,
|
|
428
427
|
.message = message.ref(),
|
|
429
|
-
})
|
|
430
|
-
error.NoSpaceLeft => unreachable, // This is the first request.
|
|
431
|
-
};
|
|
428
|
+
});
|
|
432
429
|
|
|
433
430
|
self.send_request_for_the_first_time(message);
|
|
434
431
|
}
|
|
435
432
|
|
|
436
433
|
fn send_header_to_replica(self: *Self, replica: u8, header: Header) void {
|
|
437
|
-
const message = self.create_message_from_header(header)
|
|
438
|
-
log.err("{}: no header-only message available, dropping message to replica {}", .{
|
|
439
|
-
self.id,
|
|
440
|
-
replica,
|
|
441
|
-
});
|
|
442
|
-
return;
|
|
443
|
-
};
|
|
434
|
+
const message = self.create_message_from_header(header);
|
|
444
435
|
defer self.message_bus.unref(message);
|
|
445
436
|
|
|
446
437
|
self.send_message_to_replica(replica, message);
|
|
447
438
|
}
|
|
448
439
|
|
|
449
440
|
fn send_header_to_replicas(self: *Self, header: Header) void {
|
|
450
|
-
const message = self.create_message_from_header(header)
|
|
451
|
-
log.err("{}: no header-only message available, dropping message to replicas", .{
|
|
452
|
-
self.id,
|
|
453
|
-
});
|
|
454
|
-
return;
|
|
455
|
-
};
|
|
441
|
+
const message = self.create_message_from_header(header);
|
|
456
442
|
defer self.message_bus.unref(message);
|
|
457
443
|
|
|
458
444
|
var replica: u8 = 0;
|
|
@@ -744,7 +744,7 @@ test "fuzz test" {
|
|
|
744
744
|
const allocator = &arena_allocator.allocator;
|
|
745
745
|
const ticks_max: u64 = 1_000_000;
|
|
746
746
|
const clock_count: u8 = 3;
|
|
747
|
-
const SystemTime = @import("../time.zig").Time;
|
|
747
|
+
const SystemTime = @import("../test/time.zig").Time;
|
|
748
748
|
var system_time = SystemTime{};
|
|
749
749
|
var seed = @intCast(u64, system_time.realtime());
|
|
750
750
|
var min_sync_error: u64 = 1_000_000_000;
|