tigerbeetle-node 0.11.1 → 0.11.3

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
--- src/vsr/replica.zig
+++ src/vsr/replica.zig
@@ -10,8 +10,6 @@ const MessagePool = @import("../message_pool.zig").MessagePool;
 const Message = @import("../message_pool.zig").MessagePool.Message;
 const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
 const ClientTable = @import("superblock_client_table.zig").ClientTable;
-const format_wal_headers = @import("./journal.zig").format_wal_headers;
-const format_wal_prepares = @import("./journal.zig").format_wal_prepares;
 
 const vsr = @import("../vsr.zig");
 const Header = vsr.Header;
@@ -21,13 +19,14 @@ const Version = vsr.Version;
 const VSRState = vsr.VSRState;
 
 const log = std.log.scoped(.replica);
+const tracer = @import("../tracer.zig");
 
 pub const Status = enum {
     normal,
     view_change,
     // Recovery (for replica_count > 1):
     //
-    // 1. At replica start: `status=recovering` and `journal.recovered=false`
+    // 1. At replica start: `status=recovering` and `journal.status=recovering`
     // 2. Load the WAL. Mark questionable entries as faulty.
     // 3. If the WAL has no entries (besides the initial commit), skip to step 5 with view 0.
     // 4. Run VSR recovery protocol:
@@ -286,6 +285,9 @@ pub fn ReplicaType(
         /// The prepare message being committed.
         commit_prepare: ?*Message = null,
 
+        tracer_slot_commit: ?tracer.SpanStart = null,
+        tracer_slot_checkpoint: ?tracer.SpanStart = null,
+
         const OpenOptions = struct {
             replica_count: u8,
             storage: *Storage,
@@ -544,6 +546,9 @@ pub fn ReplicaType(
         /// Free all memory and unref all messages held by the replica
         /// This does not deinitialize the StateMachine, MessageBus, Storage, or Time
         pub fn deinit(self: *Self, allocator: Allocator) void {
+            assert(self.tracer_slot_checkpoint == null);
+            assert(self.tracer_slot_commit == null);
+
             self.static_allocator.transition_from_static_to_deinit();
 
             self.journal.deinit(allocator);
@@ -602,11 +607,10 @@ pub fn ReplicaType(
             self.grid.tick();
             self.message_bus.tick();
 
-            if (!self.journal.recovered) {
-                if (!self.journal.recovering) self.journal.recover();
-                return;
-            } else {
-                assert(!self.journal.recovering);
+            switch (self.journal.status) {
+                .init => return self.journal.recover(),
+                .recovering => return,
+                .recovered => {},
             }
 
             if (self.status == .recovering) {
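
Note: this hunk replaces the journal's two booleans (`recovered`, `recovering`) with a single three-state `status`, so the impossible combination "recovered and still recovering" is unrepresentable and tick() can gate on one exhaustive switch. The enum itself lives in src/vsr/journal.zig, which this diff does not include; a minimal sketch of the shape implied by these call sites (the doc comments are assumptions):

pub const Status = enum {
    /// WAL recovery has not started; tick() calls journal.recover() exactly once.
    init,
    /// recover() was called and WAL reads/classification are still in flight.
    recovering,
    /// Every WAL slot has been classified as valid or faulty.
    recovered,
};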
@@ -700,11 +704,9 @@ pub fn ReplicaType(
                 return;
             }
 
-            if (!self.journal.recovered) {
+            if (self.journal.status != .recovered) {
                 log.debug("{}: on_message: waiting for journal to recover", .{self.replica});
                 return;
-            } else {
-                assert(!self.journal.recovering);
             }
 
             assert(message.header.replica < self.replica_count);
@@ -748,6 +750,9 @@ pub fn ReplicaType(
 
             // Any message handlers that loopback must take responsibility for the flush.
             assert(self.loopback_queue == null);
+
+            // We have to regularly flush the tracer to get output from short benchmarks.
+            tracer.flush();
         }
 
         fn on_ping(self: *Self, message: *const Message) void {
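
Note: src/tracer.zig is new in 0.11.3 but its source is not part of this diff. Below is a minimal sketch of the surface implied by the call sites in this file (tracer.SpanStart, tracer.start, tracer.end, tracer.flush); the Track/Event shapes and the timestamp bookkeeping are assumptions, not the actual implementation:

const std = @import("std");
const assert = std.debug.assert;

pub const Track = enum { main };

pub const Event = union(enum) {
    commit: struct { op: u64 },
    checkpoint,
};

/// The state stashed in a replica's tracer_slot_* field while a span is open.
pub const SpanStart = struct { timestamp_ns: i128 };

/// Open a span. Asserting the slot is empty forces start/end calls to pair up.
pub fn start(slot: *?SpanStart, track: Track, event: Event) void {
    _ = track;
    _ = event;
    assert(slot.* == null);
    slot.* = .{ .timestamp_ns = std.time.nanoTimestamp() };
}

/// Close the span opened in `slot` and emit its duration.
pub fn end(slot: *?SpanStart, track: Track, event: Event) void {
    const started = slot.*.?;
    slot.* = null;
    std.log.debug("{s}/{s}: {d}ns", .{
        @tagName(track),
        @tagName(std.meta.activeTag(event)),
        std.time.nanoTimestamp() - started.timestamp_ns,
    });
}

/// Write out buffered events; called after every message (see the hunk above)
/// so that even short benchmark runs produce output.
pub fn flush() void {}

The same pairing discipline explains the new asserts in deinit() above: both tracer slots must be null at shutdown, i.e. every span that was started has been ended.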
@@ -1443,7 +1448,7 @@ pub fn ReplicaType(
             }
 
             // Recovery messages with our nonce are not sent until after the journal is recovered.
-            assert(self.journal.recovered);
+            assert(self.journal.status == .recovered);
 
             var responses: *QuorumMessages = &self.recovery_response_from_other_replicas;
             if (responses[message.header.replica]) |existing| {
@@ -2515,6 +2520,12 @@ pub fn ReplicaType(
             assert(prepare.header.op == self.commit_min + 1);
             assert(prepare.header.op <= self.op);
 
+            tracer.start(
+                &self.tracer_slot_commit,
+                .main,
+                .{ .commit = .{ .op = prepare.header.op } },
+            );
+
             self.commit_prepare = prepare.ref();
             self.commit_callback = callback;
             self.state_machine.prefetch(
@@ -2593,6 +2604,11 @@ pub fn ReplicaType(
                     self.op_checkpoint,
                     self.op_checkpoint_next(),
                 });
+                tracer.start(
+                    &self.tracer_slot_checkpoint,
+                    .main,
+                    .checkpoint,
+                );
                 self.state_machine.checkpoint(commit_op_checkpoint_state_machine_callback);
             } else {
                 self.commit_op_done();
@@ -2651,6 +2667,11 @@ pub fn ReplicaType(
                 self.op,
                 self.op_checkpoint,
             });
+            tracer.end(
+                &self.tracer_slot_checkpoint,
+                .main,
+                .checkpoint,
+            );
 
             if (self.on_checkpoint) |on_checkpoint| on_checkpoint(self);
             self.commit_op_done();
@@ -2662,9 +2683,18 @@ pub fn ReplicaType(
             assert(self.commit_prepare.?.header.op == self.commit_min);
             assert(self.commit_prepare.?.header.op < self.op_checkpoint_trigger());
 
+            const op = self.commit_prepare.?.header.op;
+
             self.message_bus.unref(self.commit_prepare.?);
             self.commit_prepare = null;
             self.commit_callback = null;
+
+            tracer.end(
+                &self.tracer_slot_commit,
+                .main,
+                .{ .commit = .{ .op = op } },
+            );
+
             callback(self);
         }
 
@@ -3634,7 +3664,7 @@ pub fn ReplicaType(
        // learn the op.
        fn op_certain(self: *const Self) bool {
            assert(self.status == .recovering);
-            assert(self.journal.recovered);
+            assert(self.journal.status == .recovered);
            assert(self.op_checkpoint <= self.op);
 
            const slot_op_checkpoint = self.journal.slot_for_op(self.op_checkpoint).index;
@@ -3849,7 +3879,7 @@ pub fn ReplicaType(
        fn recover(self: *Self) void {
            assert(self.status == .recovering);
            assert(self.replica_count > 1);
-            assert(self.journal.recovered);
+            assert(self.journal.status == .recovered);
 
            log.debug("{}: recover: sending recovery messages nonce={}", .{
                self.replica,
@@ -5089,7 +5119,7 @@ pub fn ReplicaType(
 
        fn set_op_and_commit_max(self: *Self, op: u64, commit_max: u64, method: []const u8) void {
            assert(self.status == .view_change or self.status == .recovering);
-            assert(self.journal.recovered);
+            assert(self.journal.status == .recovered);
 
            switch (self.status) {
                .normal => unreachable,
@@ -5961,144 +5991,3 @@ pub fn ReplicaType(
         }
     };
 }
-
-/// Initialize the TigerBeetle replica's data file.
-pub fn format(
-    comptime Storage: type,
-    allocator: std.mem.Allocator,
-    cluster: u32,
-    replica: u8,
-    storage: *Storage,
-    superblock: *vsr.SuperBlockType(Storage),
-) !void {
-    const ReplicaFormat = ReplicaFormatType(Storage);
-    var replica_format = ReplicaFormat{};
-
-    try replica_format.format_wal(allocator, cluster, storage);
-    assert(!replica_format.formatting);
-
-    superblock.format(
-        ReplicaFormat.format_superblock_callback,
-        &replica_format.superblock_context,
-        .{
-            .cluster = cluster,
-            .replica = replica,
-            .size_max = config.size_max, // This can later become a runtime arg, to cap storage.
-        },
-    );
-
-    replica_format.formatting = true;
-    while (replica_format.formatting) storage.tick();
-}
-
-fn ReplicaFormatType(comptime Storage: type) type {
-    const SuperBlock = vsr.SuperBlockType(Storage);
-    return struct {
-        const Self = @This();
-
-        formatting: bool = false,
-        superblock_context: SuperBlock.Context = undefined,
-        wal_write: Storage.Write = undefined,
-
-        fn format_wal(
-            self: *Self,
-            allocator: std.mem.Allocator,
-            cluster: u32,
-            storage: *Storage,
-        ) !void {
-            const header_zeroes = [_]u8{0} ** @sizeOf(Header);
-            const wal_write_size_max = 4 * 1024 * 1024;
-            assert(wal_write_size_max % config.sector_size == 0);
-
-            // Direct I/O requires the buffer to be sector-aligned.
-            var wal_buffer = try allocator.allocAdvanced(
-                u8,
-                config.sector_size,
-                wal_write_size_max,
-                .exact,
-            );
-            errdefer allocator.free(wal_buffer);
-
-            // The logical offset *within the Zone*.
-            // Even though the prepare zone follows the redundant header zone, write the prepares
-            // first. This allows the test Storage to check the invariant "never write the redundant
-            // header before the prepare".
-            var wal_offset: u64 = 0;
-            while (wal_offset < config.journal_size_prepares) {
-                const size = format_wal_prepares(cluster, wal_offset, wal_buffer);
-                assert(size > 0);
-
-                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
-                    if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
-                        // This is the (empty) body of a reserved or root Prepare.
-                    } else {
-                        // This is a Prepare's header.
-                        assert(header.valid_checksum());
-                        if (header.op == 0) {
-                            assert(header.command == .prepare);
-                            assert(header.operation == .root);
-                        } else {
-                            assert(header.command == .reserved);
-                            assert(header.operation == .reserved);
-                        }
-                    }
-                }
-
-                storage.write_sectors(
-                    format_wal_sectors_callback,
-                    &self.wal_write,
-                    wal_buffer[0..size],
-                    .wal_prepares,
-                    wal_offset,
-                );
-                self.formatting = true;
-                while (self.formatting) storage.tick();
-                wal_offset += size;
-            }
-            // There are no prepares left to write.
-            assert(format_wal_prepares(cluster, wal_offset, wal_buffer) == 0);
-
-            wal_offset = 0;
-            while (wal_offset < config.journal_size_headers) {
-                const size = format_wal_headers(cluster, wal_offset, wal_buffer);
-                assert(size > 0);
-
-                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
-                    assert(header.valid_checksum());
-                    if (header.op == 0) {
-                        assert(header.command == .prepare);
-                        assert(header.operation == .root);
-                    } else {
-                        assert(header.command == .reserved);
-                        assert(header.operation == .reserved);
-                    }
-                }
-
-                storage.write_sectors(
-                    format_wal_sectors_callback,
-                    &self.wal_write,
-                    wal_buffer[0..size],
-                    .wal_headers,
-                    wal_offset,
-                );
-                self.formatting = true;
-                while (self.formatting) storage.tick();
-                wal_offset += size;
-            }
-            // There are no headers left to write.
-            assert(format_wal_headers(cluster, wal_offset, wal_buffer) == 0);
-        }
-
-        fn format_wal_sectors_callback(write: *Storage.Write) void {
-            const self = @fieldParentPtr(Self, "wal_write", write);
-            assert(self.formatting);
-            self.formatting = false;
-        }
-
-        fn format_superblock_callback(superblock_context: *SuperBlock.Context) void {
-            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
-            assert(self.formatting);
-            self.formatting = false;
-        }
-    };
-}
--- /dev/null
+++ src/vsr/replica_format.zig
@@ -0,0 +1,216 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+const config = @import("../config.zig");
+const vsr = @import("../vsr.zig");
+const Header = vsr.Header;
+const format_wal_headers = @import("./journal.zig").format_wal_headers;
+const format_wal_prepares = @import("./journal.zig").format_wal_prepares;
+
+/// Initialize the TigerBeetle replica's data file.
+pub fn format(
+    comptime Storage: type,
+    allocator: std.mem.Allocator,
+    cluster: u32,
+    replica: u8,
+    storage: *Storage,
+    superblock: *vsr.SuperBlockType(Storage),
+) !void {
+    const ReplicaFormat = ReplicaFormatType(Storage);
+    var replica_format = ReplicaFormat{};
+
+    try replica_format.format_wal(allocator, cluster, storage);
+    assert(!replica_format.formatting);
+
+    superblock.format(
+        ReplicaFormat.format_superblock_callback,
+        &replica_format.superblock_context,
+        .{
+            .cluster = cluster,
+            .replica = replica,
+            // TODO Convert this to a runtime arg, to cap storage.
+            .size_max = config.size_max,
+        },
+    );
+
+    replica_format.formatting = true;
+    while (replica_format.formatting) storage.tick();
+}
+
+fn ReplicaFormatType(comptime Storage: type) type {
+    const SuperBlock = vsr.SuperBlockType(Storage);
+    return struct {
+        const Self = @This();
+
+        formatting: bool = false,
+        superblock_context: SuperBlock.Context = undefined,
+        wal_write: Storage.Write = undefined,
+
+        fn format_wal(
+            self: *Self,
+            allocator: std.mem.Allocator,
+            cluster: u32,
+            storage: *Storage,
+        ) !void {
+            const header_zeroes = [_]u8{0} ** @sizeOf(Header);
+            const wal_write_size_max = 4 * 1024 * 1024;
+            assert(wal_write_size_max % config.sector_size == 0);
+
+            // Direct I/O requires the buffer to be sector-aligned.
+            var wal_buffer = try allocator.allocAdvanced(
+                u8,
+                config.sector_size,
+                wal_write_size_max,
+                .exact,
+            );
+            defer allocator.free(wal_buffer);
+
+            // The logical offset *within the Zone*.
+            // Even though the prepare zone follows the redundant header zone, write the prepares
+            // first. This allows the test Storage to check the invariant "never write the redundant
+            // header before the prepare".
+            var wal_offset: u64 = 0;
+            while (wal_offset < config.journal_size_prepares) {
+                const size = format_wal_prepares(cluster, wal_offset, wal_buffer);
+                assert(size > 0);
+
+                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
+                    if (std.mem.eql(u8, std.mem.asBytes(header), &header_zeroes)) {
+                        // This is the (empty) body of a reserved or root Prepare.
+                    } else {
+                        // This is a Prepare's header.
+                        assert(header.valid_checksum());
+                        if (header.op == 0) {
+                            assert(header.command == .prepare);
+                            assert(header.operation == .root);
+                        } else {
+                            assert(header.command == .reserved);
+                            assert(header.operation == .reserved);
+                        }
+                    }
+                }
+
+                storage.write_sectors(
+                    format_wal_sectors_callback,
+                    &self.wal_write,
+                    wal_buffer[0..size],
+                    .wal_prepares,
+                    wal_offset,
+                );
+                self.formatting = true;
+                while (self.formatting) storage.tick();
+                wal_offset += size;
+            }
+            // There are no prepares left to write.
+            assert(format_wal_prepares(cluster, wal_offset, wal_buffer) == 0);
+
+            wal_offset = 0;
+            while (wal_offset < config.journal_size_headers) {
+                const size = format_wal_headers(cluster, wal_offset, wal_buffer);
+                assert(size > 0);
+
+                for (std.mem.bytesAsSlice(Header, wal_buffer[0..size])) |*header| {
+                    assert(header.valid_checksum());
+                    if (header.op == 0) {
+                        assert(header.command == .prepare);
+                        assert(header.operation == .root);
+                    } else {
+                        assert(header.command == .reserved);
+                        assert(header.operation == .reserved);
+                    }
+                }
+
+                storage.write_sectors(
+                    format_wal_sectors_callback,
+                    &self.wal_write,
+                    wal_buffer[0..size],
+                    .wal_headers,
+                    wal_offset,
+                );
+                self.formatting = true;
+                while (self.formatting) storage.tick();
+                wal_offset += size;
+            }
+            // There are no headers left to write.
+            assert(format_wal_headers(cluster, wal_offset, wal_buffer) == 0);
+        }
+
+        fn format_wal_sectors_callback(write: *Storage.Write) void {
+            const self = @fieldParentPtr(Self, "wal_write", write);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+
+        fn format_superblock_callback(superblock_context: *SuperBlock.Context) void {
+            const self = @fieldParentPtr(Self, "superblock_context", superblock_context);
+            assert(self.formatting);
+            self.formatting = false;
+        }
+    };
+}
+
+test "format" {
+    const superblock_zone_size = @import("./superblock.zig").superblock_zone_size;
+    const MessagePool = @import("../message_pool.zig").MessagePool;
+    const Storage = @import("../test/storage.zig").Storage;
+    const SuperBlock = vsr.SuperBlockType(Storage);
+    const allocator = std.testing.allocator;
+    const cluster = 0;
+    const replica = 1;
+
+    var storage = try Storage.init(
+        allocator,
+        superblock_zone_size + config.journal_size_headers + config.journal_size_prepares,
+        .{
+            .read_latency_min = 0,
+            .read_latency_mean = 0,
+            .write_latency_min = 0,
+            .write_latency_mean = 0,
+        },
+    );
+    defer storage.deinit(allocator);
+
+    var message_pool = try MessagePool.init(allocator, .replica);
+    defer message_pool.deinit(allocator);
+
+    var superblock = try SuperBlock.init(allocator, &storage, &message_pool);
+    defer superblock.deinit(allocator);
+
+    try format(Storage, allocator, cluster, replica, &storage, &superblock);
+
+    // Verify the superblock sectors.
+    var copy: u8 = 0;
+    while (copy < config.superblock_copies) : (copy += 1) {
+        const sector = storage.superblock_sector(copy);
+
+        try std.testing.expectEqual(sector.copy, copy);
+        try std.testing.expectEqual(sector.replica, replica);
+        try std.testing.expectEqual(sector.cluster, cluster);
+        try std.testing.expectEqual(sector.size, storage.size);
+        try std.testing.expectEqual(sector.sequence, 1);
+        try std.testing.expectEqual(sector.vsr_state.commit_min, 0);
+        try std.testing.expectEqual(sector.vsr_state.commit_max, 0);
+        try std.testing.expectEqual(sector.vsr_state.view, 0);
+        try std.testing.expectEqual(sector.vsr_state.view_normal, 0);
+    }
+
+    // Verify the WAL headers and prepares zones.
+    assert(storage.wal_headers().len == storage.wal_prepares().len);
+    for (storage.wal_headers()) |header, slot| {
+        const message = storage.wal_prepares()[slot];
+        try std.testing.expect(std.meta.eql(header, message.header));
+
+        try std.testing.expect(header.valid_checksum());
+        try std.testing.expect(header.valid_checksum_body(&[0]u8{}));
+        try std.testing.expectEqual(header.invalid(), null);
+        try std.testing.expectEqual(header.cluster, cluster);
+        try std.testing.expectEqual(header.op, slot);
+        try std.testing.expectEqual(header.size, @sizeOf(vsr.Header));
+        if (slot == 0) {
+            try std.testing.expectEqual(header.command, .prepare);
+            try std.testing.expectEqual(header.operation, .root);
+        } else {
+            try std.testing.expectEqual(header.command, .reserved);
+        }
+    }
+}
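
Note: besides moving out of replica.zig, format_wal changes in transit: wal_buffer is now released with defer instead of errdefer, so the 4 MiB buffer is freed on the success path as well, not only when formatting fails:

-            errdefer allocator.free(wal_buffer);
+            defer allocator.free(wal_buffer);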
--- src/vsr.zig
+++ src/vsr.zig
@@ -11,7 +11,7 @@ const config = @import("config.zig");
 pub const Version: u8 = 0;
 
 pub const ReplicaType = @import("vsr/replica.zig").ReplicaType;
-pub const format = @import("vsr/replica.zig").format;
+pub const format = @import("vsr/replica_format.zig").format;
 pub const Status = @import("vsr/replica.zig").Status;
 pub const Client = @import("vsr/client.zig").Client;
 pub const Clock = @import("vsr/clock.zig").Clock;
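
Note: because vsr.zig re-exports the moved function under the same name, call sites keep working unchanged. A sketch of typical usage, assuming a storage/superblock pair initialized as in the "format" test above:

const vsr = @import("vsr.zig");

// cluster, replica, storage, and superblock set up as in the "format" test.
try vsr.format(Storage, allocator, cluster, replica, &storage, &superblock);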