tigerbeetle-node 0.11.3 → 0.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/dist/.client.node.sha256 +1 -1
  2. package/package.json +1 -1
  3. package/src/node.zig +10 -5
  4. package/src/tigerbeetle/src/benchmark.zig +4 -4
  5. package/src/tigerbeetle/src/c/tb_client/context.zig +6 -6
  6. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +2 -2
  7. package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -1
  8. package/src/tigerbeetle/src/c/tb_client.h +97 -111
  9. package/src/tigerbeetle/src/c/tb_client.zig +30 -19
  10. package/src/tigerbeetle/src/c/tb_client_header.zig +218 -0
  11. package/src/tigerbeetle/src/c/test.zig +14 -14
  12. package/src/tigerbeetle/src/cli.zig +12 -12
  13. package/src/tigerbeetle/src/config.zig +183 -379
  14. package/src/tigerbeetle/src/constants.zig +394 -0
  15. package/src/tigerbeetle/src/demo.zig +4 -4
  16. package/src/tigerbeetle/src/ewah_fuzz.zig +2 -0
  17. package/src/tigerbeetle/src/io/darwin.zig +4 -4
  18. package/src/tigerbeetle/src/io/linux.zig +6 -6
  19. package/src/tigerbeetle/src/io/windows.zig +4 -4
  20. package/src/tigerbeetle/src/lsm/bloom_filter.zig +1 -1
  21. package/src/tigerbeetle/src/lsm/compaction.zig +15 -10
  22. package/src/tigerbeetle/src/lsm/forest.zig +2 -2
  23. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +18 -15
  24. package/src/tigerbeetle/src/lsm/grid.zig +5 -5
  25. package/src/tigerbeetle/src/lsm/groove.zig +8 -42
  26. package/src/tigerbeetle/src/lsm/level_iterator.zig +2 -2
  27. package/src/tigerbeetle/src/lsm/manifest.zig +19 -23
  28. package/src/tigerbeetle/src/lsm/manifest_level.zig +2 -2
  29. package/src/tigerbeetle/src/lsm/manifest_log.zig +8 -8
  30. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +25 -12
  31. package/src/tigerbeetle/src/lsm/posted_groove.zig +4 -15
  32. package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +13 -13
  33. package/src/tigerbeetle/src/lsm/set_associative_cache.zig +2 -2
  34. package/src/tigerbeetle/src/lsm/table.zig +43 -35
  35. package/src/tigerbeetle/src/lsm/table_immutable.zig +4 -4
  36. package/src/tigerbeetle/src/lsm/table_iterator.zig +17 -9
  37. package/src/tigerbeetle/src/lsm/table_mutable.zig +3 -3
  38. package/src/tigerbeetle/src/lsm/test.zig +6 -6
  39. package/src/tigerbeetle/src/lsm/tree.zig +75 -47
  40. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +27 -28
  41. package/src/tigerbeetle/src/main.zig +32 -23
  42. package/src/tigerbeetle/src/message_bus.zig +25 -25
  43. package/src/tigerbeetle/src/message_pool.zig +17 -17
  44. package/src/tigerbeetle/src/simulator.zig +7 -12
  45. package/src/tigerbeetle/src/state_machine.zig +582 -1806
  46. package/src/tigerbeetle/src/storage.zig +12 -12
  47. package/src/tigerbeetle/src/test/accounting/auditor.zig +2 -2
  48. package/src/tigerbeetle/src/test/accounting/workload.zig +5 -5
  49. package/src/tigerbeetle/src/test/cluster.zig +8 -8
  50. package/src/tigerbeetle/src/test/conductor.zig +6 -5
  51. package/src/tigerbeetle/src/test/fuzz.zig +19 -0
  52. package/src/tigerbeetle/src/test/message_bus.zig +0 -2
  53. package/src/tigerbeetle/src/test/network.zig +5 -5
  54. package/src/tigerbeetle/src/test/state_checker.zig +2 -2
  55. package/src/tigerbeetle/src/test/storage.zig +54 -51
  56. package/src/tigerbeetle/src/test/storage_checker.zig +3 -3
  57. package/src/tigerbeetle/src/test/table.zig +226 -0
  58. package/src/tigerbeetle/src/time.zig +0 -1
  59. package/src/tigerbeetle/src/tracer.zig +402 -214
  60. package/src/tigerbeetle/src/unit_tests.zig +1 -0
  61. package/src/tigerbeetle/src/vsr/client.zig +5 -5
  62. package/src/tigerbeetle/src/vsr/clock.zig +9 -8
  63. package/src/tigerbeetle/src/vsr/journal.zig +47 -47
  64. package/src/tigerbeetle/src/vsr/journal_format_fuzz.zig +13 -11
  65. package/src/tigerbeetle/src/vsr/replica.zig +56 -54
  66. package/src/tigerbeetle/src/vsr/replica_format.zig +8 -8
  67. package/src/tigerbeetle/src/vsr/superblock.zig +55 -55
  68. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +9 -9
  69. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +4 -3
  70. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +2 -0
  71. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +9 -6
  72. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +5 -5
  73. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +2 -0
  74. package/src/tigerbeetle/src/vsr.zig +20 -20
@@ -0,0 +1,394 @@
1
+ const std = @import("std");
2
+ const assert = std.debug.assert;
3
+ const vsr = @import("vsr.zig");
4
+ const tracer = @import("tracer.zig");
5
+ const Config = @import("config.zig").Config;
6
+ const config = @import("config.zig").configs.current;
7
+
8
+ /// The maximum log level.
9
+ /// One of: .err, .warn, .info, .debug
10
+ pub const log_level: std.log.Level = config.process.log_level;
11
+
12
+ pub const log = if (tracer_backend == .tracy)
13
+ tracer.log_fn
14
+ else
15
+ std.log.defaultLog;
16
+
17
+ // Which backend to use for ./tracer.zig.
18
+ // Default is `.none`.
19
+ pub const tracer_backend = config.process.tracer_backend;
20
+
21
+ /// The maximum number of replicas allowed in a cluster.
22
+ pub const replicas_max = 6;
23
+
24
+ /// The maximum number of clients allowed per cluster, where each client has a unique 128-bit ID.
25
+ /// This impacts the amount of memory allocated at initialization by the server.
26
+ /// This determines the size of the VR client table used to cache replies to clients by client ID.
27
+ /// Each client has one entry in the VR client table to store the latest `message_size_max` reply.
28
+ pub const clients_max = config.cluster.clients_max;
29
+
30
+ comptime {
31
+ assert(clients_max >= Config.Cluster.clients_max_min);
32
+ }
33
+
34
+ /// The minimum number of nodes required to form a quorum for replication:
35
+ /// Majority quorums are only required across view change and replication phases (not within).
36
+ /// As per Flexible Paxos, provided `quorum_replication + quorum_view_change > replicas`:
37
+ /// 1. you may increase `quorum_view_change` above a majority, so that
38
+ /// 2. you can decrease `quorum_replication` below a majority, to optimize the common case.
39
+ /// This improves latency by reducing the number of nodes required for synchronous replication.
40
+ /// This reduces redundancy only in the short term, asynchronous replication will still continue.
41
+ /// The size of the replication quorum is limited to the minimum of this value and actual majority.
42
+ /// The size of the view change quorum will then be automatically inferred from quorum_replication.
43
+ pub const quorum_replication_max = config.cluster.quorum_replication_max;
44
+
45
+ /// The default server port to listen on if not specified in `--addresses`:
46
+ pub const port = config.process.port;
47
+
48
+ /// The default network interface address to listen on if not specified in `--addresses`:
49
+ /// WARNING: Binding to all interfaces with "0.0.0.0" is dangerous and opens the server to anyone.
50
+ /// Bind to the "127.0.0.1" loopback address to accept local connections as a safe default only.
51
+ pub const address = config.process.address;
52
+
53
+ comptime {
54
+ // vsr.parse_address assumes that config.address/config.port are valid.
55
+ _ = std.net.Address.parseIp4(address, 0) catch unreachable;
56
+ _ = @as(u16, port);
57
+ }
58
+
59
+ /// The default maximum amount of memory to use.
60
+ pub const memory_size_max_default = config.process.memory_size_max_default;
61
+
62
+ /// At a high level, priority for object caching is (in descending order):
63
+ ///
64
+ /// 1. Accounts.
65
+ /// - 2 lookups per created transfer
66
+ /// - high temporal locality
67
+ /// - positive expected result
68
+ /// 2. Posted transfers.
69
+ /// - high temporal locality
70
+ /// - positive expected result
71
+ /// 3. Transfers. Generally don't cache these because of:
72
+ /// - low temporal locality
73
+ /// - negative expected result
74
+ ///
75
+ /// The maximum number of accounts to store in memory:
76
+ /// This impacts the amount of memory allocated at initialization by the server.
77
+ pub const cache_accounts_max = config.process.cache_accounts_max;
78
+
79
+ /// The maximum number of transfers to store in memory:
80
+ /// This impacts the amount of memory allocated at initialization by the server.
81
+ /// We allocate more capacity than the number of transfers for a safe hash table load factor.
82
+ pub const cache_transfers_max = config.process.cache_transfers_max;
83
+
84
+ /// The maximum number of two-phase transfers to store in memory:
85
+ /// This impacts the amount of memory allocated at initialization by the server.
86
+ pub const cache_transfers_posted_max = config.process.cache_transfers_posted_max;
87
+
88
+ comptime {
89
+ // SetAssociativeCache requires a power-of-two cardinality.
90
+ assert(cache_accounts_max == 0 or std.math.isPowerOfTwo(cache_accounts_max));
91
+ assert(cache_transfers_max == 0 or std.math.isPowerOfTwo(cache_transfers_max));
92
+ assert(cache_transfers_posted_max == 0 or std.math.isPowerOfTwo(cache_transfers_posted_max));
93
+ }
94
+
95
+ /// The maximum number of batch entries in the journal file:
96
+ /// A batch entry may contain many transfers, so this is not a limit on the number of transfers.
97
+ /// We need this limit to allocate space for copies of batch headers at the start of the journal.
98
+ /// These header copies enable us to disentangle corruption from crashes and recover accordingly.
99
+ pub const journal_slot_count = config.cluster.journal_slot_count;
100
+
101
+ /// The maximum size of the journal file:
102
+ /// This is pre-allocated and zeroed for performance when initialized.
103
+ /// Writes within this file never extend the filesystem inode size reducing the cost of fdatasync().
104
+ /// This enables static allocation of disk space so that appends cannot fail with ENOSPC.
105
+ /// This also enables us to detect filesystem inode corruption that would change the journal size.
106
+ // TODO remove this; just allocate a part of the total storage for the journal
107
+ pub const journal_size_max = journal_size_headers + journal_size_prepares;
108
+ pub const journal_size_headers = journal_slot_count * @sizeOf(vsr.Header);
109
+ pub const journal_size_prepares = journal_slot_count * message_size_max;
110
+
111
+ comptime {
112
+ // For the given WAL (lsm_batch_multiple=4):
113
+ //
114
+ // A B C D E
115
+ // |····|····|····|····|
116
+ //
117
+ // - ("|" delineates measures, where a measure is a multiple of prepare batches.)
118
+ // - ("·" is a prepare in the WAL.)
119
+ // - The Replica triggers a checkpoint at "E".
120
+ // - The entries between "A" and "D" are on-disk in level 0.
121
+ // - The entries between "D" and "E" are in-memory in the immutable table.
122
+ // - So the checkpoint only includes "A…D".
123
+ //
124
+ // The journal must have at least two measures (batches) to ensure at least one is checkpointed.
125
+ assert(journal_slot_count >= Config.Cluster.journal_slot_count_min);
126
+ assert(journal_slot_count >= lsm_batch_multiple * 2);
127
+ assert(journal_slot_count % lsm_batch_multiple == 0);
128
+ assert(journal_size_max == journal_size_headers + journal_size_prepares);
129
+
130
+ assert(journal_size_max == journal_size_headers + journal_size_prepares);
131
+ }
132
+
133
+ /// The maximum number of connections that can be held open by the server at any time:
134
+ pub const connections_max = replicas_max + clients_max;
135
+
136
+ /// The maximum size of a message in bytes:
137
+ /// This is also the limit of all inflight data across multiple pipelined requests per connection.
138
+ /// We may have one request of up to 2 MiB inflight or 2 pipelined requests of up to 1 MiB inflight.
139
+ /// This impacts sequential disk write throughput, the larger the buffer the better.
140
+ /// 2 MiB is 16,384 transfers, and a reasonable choice for sequential disk write throughput.
141
+ /// However, this impacts bufferbloat and head-of-line blocking latency for pipelined requests.
142
+ /// For a 1 Gbps NIC = 125 MiB/s throughput: 2 MiB / 125 * 1000ms = 16ms for the next request.
143
+ /// This impacts the amount of memory allocated at initialization by the server.
144
+ pub const message_size_max = config.cluster.message_size_max;
145
+ pub const message_body_size_max = message_size_max - @sizeOf(vsr.Header);
146
+
147
+ comptime {
148
+ // The WAL format requires messages to be a multiple of the sector size.
149
+ assert(message_size_max % sector_size == 0);
150
+ assert(message_size_max >= @sizeOf(vsr.Header));
151
+ assert(message_size_max >= sector_size);
152
+ assert(message_size_max >= Config.Cluster.message_size_max_min(clients_max));
153
+ }
154
+
155
+ /// The maximum number of Viewstamped Replication prepare messages that can be inflight at a time.
156
+ /// This is immutable once assigned per cluster, as replicas need to know how many operations might
157
+ /// possibly be uncommitted during a view change, and this must be constant for all replicas.
158
+ pub const pipeline_max = clients_max;
159
+
160
+ /// The minimum and maximum amount of time in milliseconds to wait before initiating a connection.
161
+ /// Exponential backoff and jitter are applied within this range.
162
+ pub const connection_delay_min_ms = config.process.connection_delay_min_ms;
163
+ pub const connection_delay_max_ms = config.process.connection_delay_max_ms;
164
+
165
+ /// The maximum number of outgoing messages that may be queued on a replica connection.
166
+ pub const connection_send_queue_max_replica = std.math.max(std.math.min(clients_max, 4), 2);
167
+
168
+ /// The maximum number of outgoing messages that may be queued on a client connection.
169
+ /// The client has one in-flight request, and occasionally a ping.
170
+ pub const connection_send_queue_max_client = 2;
171
+
172
+ /// The maximum number of outgoing requests that may be queued on a client (including the in-flight request).
173
+ pub const client_request_queue_max = config.process.client_request_queue_max;
174
+
175
+ /// The maximum number of connections in the kernel's complete connection queue pending an accept():
176
+ /// If the backlog argument is greater than the value in `/proc/sys/net/core/somaxconn`, then it is
177
+ /// silently truncated to that value. Since Linux 5.4, the default in this file is 4096.
178
+ pub const tcp_backlog = config.process.tcp_backlog;
179
+
180
+ /// The maximum size of a kernel socket receive buffer in bytes (or 0 to use the system default):
181
+ /// This sets SO_RCVBUF as an alternative to the auto-tuning range in /proc/sys/net/ipv4/tcp_rmem.
182
+ /// The value is limited by /proc/sys/net/core/rmem_max, unless the CAP_NET_ADMIN privilege exists.
183
+ /// The kernel doubles this value to allow space for packet bookkeeping overhead.
184
+ /// The receive buffer should ideally exceed the Bandwidth-Delay Product for maximum throughput.
185
+ /// At the same time, be careful going beyond 4 MiB as the kernel may merge many small TCP packets,
186
+ /// causing considerable latency spikes for large buffer sizes:
187
+ /// https://blog.cloudflare.com/the-story-of-one-latency-spike/
188
+ pub const tcp_rcvbuf = config.process.tcp_rcvbuf;
189
+
190
+ /// The maximum size of a kernel socket send buffer in bytes (or 0 to use the system default):
191
+ /// This sets SO_SNDBUF as an alternative to the auto-tuning range in /proc/sys/net/ipv4/tcp_wmem.
192
+ /// The value is limited by /proc/sys/net/core/wmem_max, unless the CAP_NET_ADMIN privilege exists.
193
+ /// The kernel doubles this value to allow space for packet bookkeeping overhead.
194
+ pub const tcp_sndbuf_replica = connection_send_queue_max_replica * message_size_max;
195
+ pub const tcp_sndbuf_client = connection_send_queue_max_client * message_size_max;
196
+
197
+ comptime {
198
+ // Avoid latency issues from setting sndbuf too high:
199
+ assert(tcp_sndbuf_replica <= 16 * 1024 * 1024);
200
+ assert(tcp_sndbuf_client <= 16 * 1024 * 1024);
201
+ }
202
+
203
+ /// Whether to enable TCP keepalive:
204
+ pub const tcp_keepalive = config.process.tcp_keepalive;
205
+
206
+ /// The time (in seconds) the connection needs to be idle before sending TCP keepalive probes:
207
+ /// Probes are not sent when the send buffer has data or the congestion window size is zero,
208
+ /// for these cases we also need tcp_user_timeout_ms below.
209
+ pub const tcp_keepidle = config.process.tcp_keepidle;
210
+
211
+ /// The time (in seconds) between individual keepalive probes:
212
+ pub const tcp_keepintvl = config.process.tcp_keepintvl;
213
+
214
+ /// The maximum number of keepalive probes to send before dropping the connection:
215
+ pub const tcp_keepcnt = config.process.tcp_keepcnt;
216
+
217
+ /// The time (in milliseconds) to timeout an idle connection or unacknowledged send:
218
+ /// This timer rides on the granularity of the keepalive or retransmission timers.
219
+ /// For example, if keepalive will only send a probe after 10s then this becomes the lower bound
220
+ /// for tcp_user_timeout_ms to fire, even if tcp_user_timeout_ms is 2s. Nevertheless, this would timeout
221
+ /// the connection at 10s rather than wait for tcp_keepcnt probes to be sent. At the same time, if
222
+ /// tcp_user_timeout_ms is larger than the max keepalive time then tcp_keepcnt will be ignored and
223
+ /// more keepalive probes will be sent until tcp_user_timeout_ms fires.
224
+ /// For a thorough overview of how these settings interact:
225
+ /// https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/
226
+ pub const tcp_user_timeout_ms = (tcp_keepidle + tcp_keepintvl * tcp_keepcnt) * 1000;
227
+
228
+ /// Whether to disable Nagle's algorithm to eliminate send buffering delays:
229
+ pub const tcp_nodelay = config.process.tcp_nodelay;
230
+
231
+ /// Size of a CPU cache line in bytes
232
+ pub const cache_line_size = config.cluster.cache_line_size;
233
+
234
+ /// The minimum size of an aligned kernel page and an Advanced Format disk sector:
235
+ /// This is necessary for direct I/O without the kernel having to fix unaligned pages with a copy.
236
+ /// The new Advanced Format sector size is backwards compatible with the old 512 byte sector size.
237
+ /// This should therefore never be less than 4 KiB to be future-proof when server disks are swapped.
238
+ pub const sector_size = 4096;
239
+
240
+ /// Whether to perform direct I/O to the underlying disk device:
241
+ /// This enables several performance optimizations:
242
+ /// * A memory copy to the kernel's page cache can be eliminated for reduced CPU utilization.
243
+ /// * I/O can be issued immediately to the disk device without buffering delay for improved latency.
244
+ /// This also enables several safety features:
245
+ /// * Disk data can be scrubbed to repair latent sector errors and checksum errors proactively.
246
+ /// * Fsync failures can be recovered from correctly.
247
+ /// WARNING: Disabling direct I/O is unsafe; the page cache cannot be trusted after an fsync error,
248
+ /// even after an application panic, since the kernel will mark dirty pages as clean, even
249
+ /// when they were never written to disk.
250
+ pub const direct_io = config.process.direct_io;
251
+ pub const direct_io_required = config.process.direct_io_required;
252
+
253
+ // TODO Add in the Grid's IOPS and the upper-bound that the Superblock will use.
254
+ pub const iops_read_max = journal_iops_read_max;
255
+ pub const iops_write_max = journal_iops_write_max;
256
+
257
+ /// The maximum number of concurrent WAL read I/O operations to allow at once.
258
+ pub const journal_iops_read_max = config.process.journal_iops_read_max;
259
+ /// The maximum number of concurrent WAL write I/O operations to allow at once.
260
+ /// Ideally this is at least as high as pipeline_max, but it is safe to be lower.
261
+ pub const journal_iops_write_max = config.process.journal_iops_write_max;
262
+
263
+ /// The number of redundant copies of the superblock in the superblock storage zone.
264
+ /// This must be either { 4, 6, 8 }, i.e. an even number, for more efficient flexible quorums.
265
+ ///
266
+ /// The superblock contains local state for the replica and therefore cannot be replicated remotely.
267
+ /// Loss of the superblock would represent loss of the replica and so it must be protected.
268
+ /// Since each superblock copy also copies the superblock trailer (around 33 MiB), setting this
269
+ /// beyond 4 copies (or decreasing block_size < 64 KiB) can result in a superblock zone > 264 MiB.
270
+ ///
271
+ /// This can mean checkpointing latencies in the rare extreme worst-case of at most 264ms, although
272
+ /// this would require EWAH compression of our block free set to have zero effective compression.
273
+ /// In practice, checkpointing latency should be an order of magnitude better due to compression,
274
+ /// because our block free set will fill holes when allocating.
275
+ ///
276
+ /// The superblock only needs to be checkpointed every now and then, before the WAL wraps around,
277
+ /// or when a view change needs to take place to elect a new primary.
278
+ pub const superblock_copies = config.cluster.superblock_copies;
279
+
280
+ comptime {
281
+ assert(superblock_copies % 2 == 0);
282
+ assert(superblock_copies >= 4);
283
+ assert(superblock_copies <= 8);
284
+ }
285
+
286
+ /// The maximum size of a local data file.
287
+ /// This should not be much larger than several TiB to limit:
288
+ /// * blast radius and recovery time when a whole replica is lost,
289
+ /// * replicated storage overhead, since all data files are mirrored,
290
+ /// * the size of the superblock storage zone, and
291
+ /// * the static memory allocation required for tracking LSM forest metadata in memory.
292
+ // TODO Remove, now that we have block_count_max.
293
+ pub const size_max = config.cluster.size_max;
294
+
295
+ /// The unit of read/write access to LSM manifest and LSM table blocks in the block storage zone.
296
+ ///
297
+ /// - A lower block size increases the memory overhead of table metadata, due to smaller/more tables.
298
+ /// - A higher block size increases space amplification due to partially-filled blocks.
299
+ pub const block_size = config.cluster.block_size;
300
+
301
+ pub const block_count_max = @divExact(16 * 1024 * 1024 * 1024 * 1024, block_size);
302
+
303
+ comptime {
304
+ assert(block_size % sector_size == 0);
305
+ assert(lsm_table_size_max % sector_size == 0);
306
+ assert(lsm_table_size_max % block_size == 0);
307
+ }
308
+
309
+ /// The number of levels in an LSM tree.
310
+ /// A higher number of levels increases read amplification, as well as total storage capacity.
311
+ pub const lsm_levels = config.cluster.lsm_levels;
312
+
313
+ comptime {
314
+ // ManifestLog serializes the level as a u7.
315
+ assert(lsm_levels > 0);
316
+ assert(lsm_levels <= std.math.maxInt(u7));
317
+ }
318
+
319
+ /// The number of tables at level i (0 ≤ i < lsm_levels) is `pow(lsm_growth_factor, i+1)`.
320
+ /// A higher growth factor increases write amplification (by increasing the number of tables in
321
+ /// level B that overlap a table in level A in a compaction), but decreases read amplification (by
322
+ /// reducing the height of the tree and thus the number of levels that must be probed). Since read
323
+ /// amplification can be optimized more easily (with filters and caching), we target a growth
324
+ /// factor of 8 for lower write amplification rather than the more typical growth factor of 10.
325
+ pub const lsm_growth_factor = config.cluster.lsm_growth_factor;
326
+
327
+ /// The maximum cumulative size of a table — computed as the sum of the size of the index block,
328
+ /// filter blocks, and data blocks.
329
+ pub const lsm_table_size_max = config.cluster.lsm_table_size_max;
330
+
331
+ /// Size of nodes used by the LSM tree manifest implementation.
332
+ /// TODO Double-check this with our "LSM Manifest" spreadsheet.
333
+ pub const lsm_manifest_node_size = config.process.lsm_manifest_node_size;
334
+
335
+ /// A multiple of batch inserts that a mutable table can definitely accommodate before flushing.
336
+ /// For example, if a message_size_max batch can contain at most 8181 transfers then a multiple of 4
337
+ /// means that the transfer tree's mutable table will be sized to 8191 * 4 = 32764 transfers.
338
+ /// TODO Assert this relative to lsm_table_size_max.
339
+ /// We want to ensure that a mutable table can be converted to an immutable table without overflow.
340
+ pub const lsm_batch_multiple = config.cluster.lsm_batch_multiple;
341
+
342
+ comptime {
343
+ // The LSM tree uses half-measures to balance compaction.
344
+ assert(lsm_batch_multiple % 2 == 0);
345
+ }
346
+
347
+ pub const lsm_snapshots_max = config.cluster.lsm_snapshots_max;
348
+
349
+ pub const lsm_value_to_key_layout_ratio_min = config.cluster.lsm_value_to_key_layout_ratio_min;
350
+
351
+ /// The number of milliseconds between each replica tick, the basic unit of time in TigerBeetle.
352
+ /// Used to regulate heartbeats, retries and timeouts, all specified as multiples of a tick.
353
+ pub const tick_ms = config.process.tick_ms;
354
+
355
+ /// The conservative round-trip time at startup when there is no network knowledge.
356
+ /// Adjusted dynamically thereafter for RTT-sensitive timeouts according to network congestion.
357
+ /// This should be set higher rather than lower to avoid flooding the network at startup.
358
+ pub const rtt_ticks = config.process.rtt_ms / tick_ms;
359
+
360
+ /// The multiple of round-trip time for RTT-sensitive timeouts.
361
+ pub const rtt_multiple = 2;
362
+
363
+ /// The min/max bounds of exponential backoff (and jitter) to add to RTT-sensitive timeouts.
364
+ pub const backoff_min_ticks = config.process.backoff_min_ms / tick_ms;
365
+ pub const backoff_max_ticks = config.process.backoff_max_ms / tick_ms;
366
+
367
+ /// The maximum skew between two clocks to allow when considering them to be in agreement.
368
+ /// The principle is that no two clocks tick exactly alike but some clocks more or less agree.
369
+ /// The maximum skew across the cluster as a whole is this value times the total number of clocks.
370
+ /// The cluster will be unavailable if the majority of clocks are all further than this value apart.
371
+ /// Decreasing this reduces the probability of reaching agreement on synchronized time.
372
+ /// Increasing this reduces the accuracy of synchronized time.
373
+ pub const clock_offset_tolerance_max_ms = config.process.clock_offset_tolerance_max_ms;
374
+
375
+ /// The amount of time before the clock's synchronized epoch is expired.
376
+ /// If the epoch is expired before it can be replaced with a new synchronized epoch, then this most
377
+ /// likely indicates either a network partition or else too many clock faults across the cluster.
378
+ /// A new synchronized epoch will be installed as soon as these conditions resolve.
379
+ pub const clock_epoch_max_ms = config.process.clock_epoch_max_ms;
380
+
381
+ /// The amount of time to wait for enough accurate samples before synchronizing the clock.
382
+ /// The more samples we can take per remote clock source, the more accurate our estimation becomes.
383
+ /// This impacts cluster startup time as the leader must first wait for synchronization to complete.
384
+ pub const clock_synchronization_window_min_ms = config.process.clock_synchronization_window_min_ms;
385
+
386
+ /// The amount of time without agreement before the clock window is expired and a new window opened.
387
+ /// This happens where some samples have been collected but not enough to reach agreement.
388
+ /// The quality of samples degrades as they age so at some point we throw them away and start over.
389
+ /// This eliminates the impact of gradual clock drift on our clock offset (clock skew) measurements.
390
+ /// If a window expires because of this then it is likely that the clock epoch will also be expired.
391
+ pub const clock_synchronization_window_max_ms = config.process.clock_synchronization_window_max_ms;
392
+
393
+ /// Whether to perform intensive online verification.
394
+ pub const verify = config.process.verify;
@@ -1,7 +1,7 @@
1
1
  const std = @import("std");
2
2
  const assert = std.debug.assert;
3
3
 
4
- const config = @import("config.zig");
4
+ const constants = @import("constants.zig");
5
5
 
6
6
  const tb = @import("tigerbeetle.zig");
7
7
  const Account = tb.Account;
@@ -16,7 +16,7 @@ const Storage = @import("storage.zig").Storage;
16
16
  const MessagePool = @import("message_pool.zig").MessagePool;
17
17
  const MessageBus = @import("message_bus.zig").MessageBusClient;
18
18
  const StateMachine = @import("state_machine.zig").StateMachineType(Storage, .{
19
- .message_body_size_max = config.message_body_size_max,
19
+ .message_body_size_max = constants.message_body_size_max,
20
20
  });
21
21
 
22
22
  const vsr = @import("vsr.zig");
@@ -37,7 +37,7 @@ pub fn request(
37
37
  const allocator = std.heap.page_allocator;
38
38
  const client_id = std.crypto.random.int(u128);
39
39
  const cluster_id: u32 = 0;
40
- var addresses = [_]std.net.Address{try std.net.Address.parseIp4("127.0.0.1", config.port)};
40
+ var addresses = [_]std.net.Address{try std.net.Address.parseIp4("127.0.0.1", constants.port)};
41
41
 
42
42
  var io = try IO.init(32, 0);
43
43
  defer io.deinit();
@@ -74,7 +74,7 @@ pub fn request(
74
74
 
75
75
  while (client.request_queue.count > 0) {
76
76
  client.tick();
77
- try io.run_for_ns(config.tick_ms * std.time.ns_per_ms);
77
+ try io.run_for_ns(constants.tick_ms * std.time.ns_per_ms);
78
78
  }
79
79
  }
80
80
 
@@ -6,6 +6,8 @@ const log = std.log.scoped(.fuzz_ewah);
6
6
  const ewah = @import("./ewah.zig");
7
7
  const fuzz = @import("./test/fuzz.zig");
8
8
 
9
+ pub const tigerbeetle_config = @import("config.zig").configs.test_min;
10
+
9
11
  pub fn main() !void {
10
12
  const allocator = std.testing.allocator;
11
13
  const args = try fuzz.parse_fuzz_args(allocator);
@@ -4,7 +4,7 @@ const mem = std.mem;
4
4
  const assert = std.debug.assert;
5
5
  const log = std.log.scoped(.io);
6
6
 
7
- const config = @import("../config.zig");
7
+ const constants = @import("../constants.zig");
8
8
  const FIFO = @import("../fifo.zig").FIFO;
9
9
  const Time = @import("../time.zig").Time;
10
10
  const buffer_limit = @import("../io.zig").buffer_limit;
@@ -658,8 +658,8 @@ pub const IO = struct {
658
658
  must_create: bool,
659
659
  ) !os.fd_t {
660
660
  assert(relative_path.len > 0);
661
- assert(size >= config.sector_size);
662
- assert(size % config.sector_size == 0);
661
+ assert(size >= constants.sector_size);
662
+ assert(size % constants.sector_size == 0);
663
663
 
664
664
  // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
665
665
  // This is much stronger than an advisory exclusive lock, and is required on some platforms.
@@ -694,7 +694,7 @@ pub const IO = struct {
694
694
 
695
695
  // On darwin assume that Direct I/O is always supported.
696
696
  // Use F_NOCACHE to disable the page cache as O_DIRECT doesn't exist.
697
- if (config.direct_io) {
697
+ if (constants.direct_io) {
698
698
  _ = try os.fcntl(fd, os.F.NOCACHE, 1);
699
699
  }
700
700
 
@@ -7,7 +7,7 @@ const io_uring_cqe = linux.io_uring_cqe;
7
7
  const io_uring_sqe = linux.io_uring_sqe;
8
8
  const log = std.log.scoped(.io);
9
9
 
10
- const config = @import("../config.zig");
10
+ const constants = @import("../constants.zig");
11
11
  const FIFO = @import("../fifo.zig").FIFO;
12
12
  const buffer_limit = @import("../io.zig").buffer_limit;
13
13
 
@@ -905,8 +905,8 @@ pub const IO = struct {
905
905
  must_create: bool,
906
906
  ) !os.fd_t {
907
907
  assert(relative_path.len > 0);
908
- assert(size >= config.sector_size);
909
- assert(size % config.sector_size == 0);
908
+ assert(size >= constants.sector_size);
909
+ assert(size % constants.sector_size == 0);
910
910
 
911
911
  // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
912
912
  // This is much stronger than an advisory exclusive lock, and is required on some platforms.
@@ -918,11 +918,11 @@ pub const IO = struct {
918
918
  if (@hasDecl(os.O, "LARGEFILE")) flags |= os.O.LARGEFILE;
919
919
 
920
920
  var direct_io_supported = false;
921
- if (config.direct_io) {
921
+ if (constants.direct_io) {
922
922
  direct_io_supported = try fs_supports_direct_io(dir_fd);
923
923
  if (direct_io_supported) {
924
924
  flags |= os.O.DIRECT;
925
- } else if (config.deployment_environment == .development) {
925
+ } else if (!constants.direct_io_required) {
926
926
  log.warn("file system does not support Direct I/O", .{});
927
927
  } else {
928
928
  // We require Direct I/O for safety to handle fsync failure correctly, and therefore
@@ -968,7 +968,7 @@ pub const IO = struct {
968
968
  log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
969
969
  log.info("allocating by writing to the last sector of the file instead...", .{});
970
970
 
971
- const sector_size = config.sector_size;
971
+ const sector_size = constants.sector_size;
972
972
  const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;
973
973
 
974
974
  // Handle partial writes where the physical sector is less than a logical sector:
@@ -2,7 +2,7 @@ const std = @import("std");
2
2
  const os = std.os;
3
3
  const assert = std.debug.assert;
4
4
  const log = std.log.scoped(.io);
5
- const config = @import("../config.zig");
5
+ const constants = @import("../constants.zig");
6
6
 
7
7
  const FIFO = @import("../fifo.zig").FIFO;
8
8
  const Time = @import("../time.zig").Time;
@@ -941,8 +941,8 @@ pub const IO = struct {
941
941
  must_create: bool,
942
942
  ) !os.fd_t {
943
943
  assert(relative_path.len > 0);
944
- assert(size >= config.sector_size);
945
- assert(size % config.sector_size == 0);
944
+ assert(size >= constants.sector_size);
945
+ assert(size % constants.sector_size == 0);
946
946
 
947
947
  const path_w = try os.windows.sliceToPrefixedFileW(relative_path);
948
948
 
@@ -1013,7 +1013,7 @@ pub const IO = struct {
1013
1013
  log.warn("file system failed to preallocate the file memory", .{});
1014
1014
  log.info("allocating by writing to the last sector of the file instead...", .{});
1015
1015
 
1016
- const sector_size = config.sector_size;
1016
+ const sector_size = constants.sector_size;
1017
1017
  const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;
1018
1018
 
1019
1019
  // Handle partial writes where the physical sector is less than a logical sector:
@@ -83,7 +83,7 @@ test {
83
83
 
84
84
  const test_bloom_filter = struct {
85
85
  const fuzz = @import("../test/fuzz.zig");
86
- const block_size = @import("../config.zig").block_size;
86
+ const block_size = @import("../constants.zig").block_size;
87
87
 
88
88
  fn random_keys(random: std.rand.Random, iter: usize) !void {
89
89
  const keys_count = @minimum(
@@ -38,7 +38,7 @@ const assert = std.debug.assert;
38
38
  const log = std.log.scoped(.compaction);
39
39
  const tracer = @import("../tracer.zig");
40
40
 
41
- const config = @import("../config.zig");
41
+ const constants = @import("../constants.zig");
42
42
 
43
43
  const GridType = @import("grid.zig").GridType;
44
44
  const ManifestType = @import("manifest.zig").ManifestType;
@@ -122,7 +122,7 @@ pub fn CompactionType(
122
122
  done,
123
123
  };
124
124
 
125
- tree_name: []const u8,
125
+ tree_name: [:0]const u8,
126
126
 
127
127
  grid: *Grid,
128
128
  grid_reservation: Grid.Reservation,
@@ -161,7 +161,7 @@ pub fn CompactionType(
161
161
 
162
162
  tracer_slot: ?tracer.SpanStart = null,
163
163
 
164
- pub fn init(allocator: mem.Allocator, tree_name: []const u8) !Compaction {
164
+ pub fn init(allocator: mem.Allocator, tree_name: [:0]const u8) !Compaction {
165
165
  var iterator_a = try IteratorA.init(allocator);
166
166
  errdefer iterator_a.deinit(allocator);
167
167
 
@@ -226,17 +226,17 @@ pub fn CompactionType(
226
226
  assert(!compaction.merge_done and compaction.merge_iterator == null);
227
227
  assert(compaction.tracer_slot == null);
228
228
 
229
- assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
229
+ assert(op_min % @divExact(constants.lsm_batch_multiple, 2) == 0);
230
230
  assert(range.table_count > 0);
231
231
  if (table_a) |t| assert(t.visible(op_min));
232
232
 
233
- assert(level_b < config.lsm_levels);
233
+ assert(level_b < constants.lsm_levels);
234
234
  assert((level_b == 0) == (table_a == null));
235
235
 
236
236
  // Levels may choose to drop tombstones if keys aren't included in the lower levels.
237
237
  // This invariant is always true for the last level as it doesn't have any lower ones.
238
238
  const drop_tombstones = manifest.compaction_must_drop_tombstones(level_b, range);
239
- assert(drop_tombstones or level_b < config.lsm_levels - 1);
239
+ assert(drop_tombstones or level_b < constants.lsm_levels - 1);
240
240
 
241
241
  compaction.* = .{
242
242
  .tree_name = compaction.tree_name,
@@ -355,6 +355,7 @@ pub fn CompactionType(
355
355
  .tree_name = compaction.tree_name,
356
356
  .level_b = compaction.level_b,
357
357
  } },
358
+ @src(),
358
359
  );
359
360
 
360
361
  // Generate fake IO to make sure io_pending doesn't reach zero multiple times from
@@ -433,6 +434,7 @@ pub fn CompactionType(
433
434
  .tree_name = compaction.tree_name,
434
435
  .level_b = compaction.level_b,
435
436
  } },
437
+ @src(),
436
438
  );
437
439
 
438
440
  // Create the merge iterator only when we can peek() from the read iterators.
@@ -502,6 +504,8 @@ pub fn CompactionType(
502
504
  // Finalize the data block if it's full or if it contains pending values when there's
503
505
  // no more left to merge.
504
506
  if (compaction.table_builder.data_block_full() or
507
+ compaction.table_builder.filter_block_full() or
508
+ compaction.table_builder.index_block_full() or
505
509
  (merge_iterator.empty() and !compaction.table_builder.data_block_empty()))
506
510
  {
507
511
  compaction.table_builder.data_block_finish(.{
@@ -518,6 +522,7 @@ pub fn CompactionType(
518
522
  // Finalize the filter block if it's full or if it contains pending data blocks
519
523
  // when there's no more merged values to fill them.
520
524
  if (compaction.table_builder.filter_block_full() or
525
+ compaction.table_builder.index_block_full() or
521
526
  (merge_iterator.empty() and !compaction.table_builder.filter_block_empty()))
522
527
  {
523
528
  compaction.table_builder.filter_block_finish(.{
@@ -620,11 +625,11 @@ pub fn CompactionType(
620
625
  }
621
626
 
622
627
  fn snapshot_max_for_table_input(op_min: u64) u64 {
623
- assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
624
- return op_min + @divExact(config.lsm_batch_multiple, 2) - 1;
628
+ assert(op_min % @divExact(constants.lsm_batch_multiple, 2) == 0);
629
+ return op_min + @divExact(constants.lsm_batch_multiple, 2) - 1;
625
630
  }
626
631
 
627
632
  fn snapshot_min_for_table_output(op_min: u64) u64 {
628
- assert(op_min % @divExact(config.lsm_batch_multiple, 2) == 0);
629
- return op_min + @divExact(config.lsm_batch_multiple, 2);
633
+ assert(op_min % @divExact(constants.lsm_batch_multiple, 2) == 0);
634
+ return op_min + @divExact(constants.lsm_batch_multiple, 2);
630
635
  }
@@ -4,11 +4,11 @@ const assert = std.debug.assert;
4
4
  const math = std.math;
5
5
  const mem = std.mem;
6
6
 
7
- const config = @import("../config.zig");
7
+ const constants = @import("../constants.zig");
8
8
  const vsr = @import("../vsr.zig");
9
9
 
10
10
  const GridType = @import("grid.zig").GridType;
11
- const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
11
+ const NodePool = @import("node_pool.zig").NodePool(constants.lsm_manifest_node_size, 16);
12
12
 
13
13
  pub fn ForestType(comptime Storage: type, comptime groove_config: anytype) type {
14
14
  var groove_fields: []const std.builtin.TypeInfo.StructField = &.{};