tigerbeetle-node 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +302 -101
- package/dist/index.d.ts +70 -72
- package/dist/index.js +70 -72
- package/dist/index.js.map +1 -1
- package/package.json +6 -6
- package/scripts/download_node_headers.sh +14 -7
- package/src/index.ts +6 -10
- package/src/node.zig +6 -3
- package/src/tigerbeetle/scripts/benchmark.sh +4 -4
- package/src/tigerbeetle/scripts/confirm_image.sh +44 -0
- package/src/tigerbeetle/scripts/fuzz_loop.sh +15 -0
- package/src/tigerbeetle/scripts/fuzz_unique_errors.sh +7 -0
- package/src/tigerbeetle/scripts/install.sh +19 -4
- package/src/tigerbeetle/scripts/install_zig.bat +5 -1
- package/src/tigerbeetle/scripts/install_zig.sh +24 -14
- package/src/tigerbeetle/scripts/pre-commit.sh +9 -0
- package/src/tigerbeetle/scripts/shellcheck.sh +5 -0
- package/src/tigerbeetle/scripts/tests_on_alpine.sh +10 -0
- package/src/tigerbeetle/scripts/tests_on_ubuntu.sh +14 -0
- package/src/tigerbeetle/src/benchmark.zig +4 -2
- package/src/tigerbeetle/src/benchmark_array_search.zig +3 -3
- package/src/tigerbeetle/src/c/tb_client/thread.zig +8 -9
- package/src/tigerbeetle/src/c/tb_client.h +100 -80
- package/src/tigerbeetle/src/c/tb_client.zig +4 -1
- package/src/tigerbeetle/src/cli.zig +1 -1
- package/src/tigerbeetle/src/config.zig +48 -16
- package/src/tigerbeetle/src/demo.zig +3 -1
- package/src/tigerbeetle/src/eytzinger_benchmark.zig +3 -3
- package/src/tigerbeetle/src/io/linux.zig +1 -1
- package/src/tigerbeetle/src/lsm/README.md +214 -0
- package/src/tigerbeetle/src/lsm/binary_search.zig +137 -10
- package/src/tigerbeetle/src/lsm/bloom_filter.zig +43 -0
- package/src/tigerbeetle/src/lsm/compaction.zig +352 -398
- package/src/tigerbeetle/src/lsm/composite_key.zig +2 -0
- package/src/tigerbeetle/src/lsm/eytzinger.zig +1 -1
- package/src/tigerbeetle/src/lsm/forest.zig +21 -447
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +412 -0
- package/src/tigerbeetle/src/lsm/grid.zig +145 -69
- package/src/tigerbeetle/src/lsm/groove.zig +196 -133
- package/src/tigerbeetle/src/lsm/k_way_merge.zig +40 -18
- package/src/tigerbeetle/src/lsm/level_iterator.zig +28 -9
- package/src/tigerbeetle/src/lsm/manifest.zig +81 -181
- package/src/tigerbeetle/src/lsm/manifest_level.zig +210 -454
- package/src/tigerbeetle/src/lsm/manifest_log.zig +77 -28
- package/src/tigerbeetle/src/lsm/posted_groove.zig +64 -76
- package/src/tigerbeetle/src/lsm/segmented_array.zig +561 -241
- package/src/tigerbeetle/src/lsm/segmented_array_benchmark.zig +148 -0
- package/src/tigerbeetle/src/lsm/segmented_array_fuzz.zig +9 -0
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +62 -12
- package/src/tigerbeetle/src/lsm/table.zig +83 -48
- package/src/tigerbeetle/src/lsm/table_immutable.zig +30 -23
- package/src/tigerbeetle/src/lsm/table_iterator.zig +25 -14
- package/src/tigerbeetle/src/lsm/table_mutable.zig +63 -12
- package/src/tigerbeetle/src/lsm/test.zig +49 -55
- package/src/tigerbeetle/src/lsm/tree.zig +407 -402
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +457 -0
- package/src/tigerbeetle/src/main.zig +28 -6
- package/src/tigerbeetle/src/message_bus.zig +2 -2
- package/src/tigerbeetle/src/message_pool.zig +14 -17
- package/src/tigerbeetle/src/simulator.zig +145 -112
- package/src/tigerbeetle/src/state_machine.zig +338 -228
- package/src/tigerbeetle/src/static_allocator.zig +65 -0
- package/src/tigerbeetle/src/storage.zig +3 -7
- package/src/tigerbeetle/src/test/accounting/auditor.zig +577 -0
- package/src/tigerbeetle/src/test/accounting/workload.zig +819 -0
- package/src/tigerbeetle/src/test/cluster.zig +18 -48
- package/src/tigerbeetle/src/test/conductor.zig +365 -0
- package/src/tigerbeetle/src/test/fuzz.zig +121 -0
- package/src/tigerbeetle/src/test/id.zig +89 -0
- package/src/tigerbeetle/src/test/priority_queue.zig +645 -0
- package/src/tigerbeetle/src/test/state_checker.zig +93 -69
- package/src/tigerbeetle/src/test/state_machine.zig +11 -35
- package/src/tigerbeetle/src/test/storage.zig +29 -8
- package/src/tigerbeetle/src/tigerbeetle.zig +14 -16
- package/src/tigerbeetle/src/unit_tests.zig +7 -0
- package/src/tigerbeetle/src/vopr.zig +494 -0
- package/src/tigerbeetle/src/vopr_hub/README.md +58 -0
- package/src/tigerbeetle/src/vopr_hub/SETUP.md +199 -0
- package/src/tigerbeetle/src/vopr_hub/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/main.go +1022 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +3 -0
- package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +403 -0
- package/src/tigerbeetle/src/vsr/client.zig +13 -0
- package/src/tigerbeetle/src/vsr/journal.zig +16 -13
- package/src/tigerbeetle/src/vsr/replica.zig +924 -491
- package/src/tigerbeetle/src/vsr/superblock.zig +55 -37
- package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -10
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +2 -2
- package/src/tigerbeetle/src/vsr/superblock_manifest.zig +18 -3
- package/src/tigerbeetle/src/vsr.zig +75 -55
- package/src/tigerbeetle/scripts/vopr.bat +0 -48
- package/src/tigerbeetle/scripts/vopr.sh +0 -33
|
@@ -31,8 +31,15 @@ const vsr = @import("../vsr.zig");
|
|
|
31
31
|
|
|
32
32
|
const SuperBlockType = vsr.SuperBlockType;
|
|
33
33
|
const GridType = @import("grid.zig").GridType;
|
|
34
|
+
const BlockType = @import("grid.zig").BlockType;
|
|
34
35
|
const RingBuffer = @import("../ring_buffer.zig").RingBuffer;
|
|
35
36
|
|
|
37
|
+
/// ManifestLog block schema:
|
|
38
|
+
/// │ vsr.Header │ operation=BlockType.manifest
|
|
39
|
+
/// │ [entry_count_max]Label │ level index, insert|remove
|
|
40
|
+
/// │ [≤entry_count_max]TableInfo │
|
|
41
|
+
/// │ […]u8{0} │ padding (to end of block)
|
|
42
|
+
/// Label and TableInfo entries correspond.
|
|
36
43
|
pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
37
44
|
return struct {
|
|
38
45
|
const ManifestLog = @This();
|
|
@@ -40,8 +47,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
40
47
|
const SuperBlock = SuperBlockType(Storage);
|
|
41
48
|
const Grid = GridType(Storage);
|
|
42
49
|
|
|
43
|
-
const BlockPtr =
|
|
44
|
-
const BlockPtrConst =
|
|
50
|
+
const BlockPtr = Grid.BlockPtr;
|
|
51
|
+
const BlockPtrConst = Grid.BlockPtrConst;
|
|
45
52
|
|
|
46
53
|
pub const Callback = fn (manifest_log: *ManifestLog) void;
|
|
47
54
|
|
|
@@ -99,20 +106,27 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
99
106
|
blocks_closed: u8 = 0,
|
|
100
107
|
|
|
101
108
|
/// The number of entries in the open block.
|
|
109
|
+
///
|
|
110
|
+
/// Invariants:
|
|
111
|
+
/// - When `entry_count = 0`, there is no open block.
|
|
112
|
+
/// - `entry_count < entry_count_max`. When `entry_count` reaches the maximum, the open
|
|
113
|
+
/// block is closed, and `entry_count` resets to 0.
|
|
102
114
|
entry_count: u32 = 0,
|
|
103
115
|
|
|
104
116
|
opened: bool = false,
|
|
105
117
|
open_event: OpenEvent = undefined,
|
|
106
118
|
open_iterator: SuperBlock.Manifest.IteratorReverse = undefined,
|
|
107
119
|
|
|
120
|
+
/// Set for the duration of `compact`.
|
|
108
121
|
reading: bool = false,
|
|
109
122
|
read: Grid.Read = undefined,
|
|
110
|
-
read_callback: Callback =
|
|
123
|
+
read_callback: ?Callback = null,
|
|
111
124
|
read_block_reference: ?SuperBlock.Manifest.BlockReference = null,
|
|
112
125
|
|
|
126
|
+
/// Set for the duration of `flush` and `checkpoint`.
|
|
113
127
|
writing: bool = false,
|
|
114
128
|
write: Grid.Write = undefined,
|
|
115
|
-
write_callback: Callback =
|
|
129
|
+
write_callback: ?Callback = null,
|
|
116
130
|
|
|
117
131
|
pub fn init(allocator: mem.Allocator, grid: *Grid, tree_hash: u128) !ManifestLog {
|
|
118
132
|
// TODO RingBuffer for .pointer should be extended to take care of alignment:
|
|
@@ -154,6 +168,11 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
154
168
|
assert(!manifest_log.opened);
|
|
155
169
|
assert(!manifest_log.reading);
|
|
156
170
|
assert(!manifest_log.writing);
|
|
171
|
+
assert(manifest_log.read_callback == null);
|
|
172
|
+
|
|
173
|
+
assert(manifest_log.blocks.count == 0);
|
|
174
|
+
assert(manifest_log.blocks_closed == 0);
|
|
175
|
+
assert(manifest_log.entry_count == 0);
|
|
157
176
|
|
|
158
177
|
manifest_log.open_event = event;
|
|
159
178
|
manifest_log.open_iterator = manifest_log.superblock.manifest.iterator_reverse(
|
|
@@ -171,6 +190,10 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
171
190
|
assert(manifest_log.reading);
|
|
172
191
|
assert(!manifest_log.writing);
|
|
173
192
|
|
|
193
|
+
assert(manifest_log.blocks.count == 0);
|
|
194
|
+
assert(manifest_log.blocks_closed == 0);
|
|
195
|
+
assert(manifest_log.entry_count == 0);
|
|
196
|
+
|
|
174
197
|
manifest_log.read_block_reference = manifest_log.open_iterator.next();
|
|
175
198
|
|
|
176
199
|
if (manifest_log.read_block_reference) |block| {
|
|
@@ -182,15 +205,16 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
182
205
|
&manifest_log.read,
|
|
183
206
|
block.address,
|
|
184
207
|
block.checksum,
|
|
208
|
+
.manifest,
|
|
185
209
|
);
|
|
186
210
|
} else {
|
|
187
211
|
manifest_log.opened = true;
|
|
188
212
|
manifest_log.open_event = undefined;
|
|
189
213
|
manifest_log.open_iterator = undefined;
|
|
190
214
|
|
|
191
|
-
const callback = manifest_log.read_callback
|
|
215
|
+
const callback = manifest_log.read_callback.?;
|
|
192
216
|
manifest_log.reading = false;
|
|
193
|
-
manifest_log.read_callback =
|
|
217
|
+
manifest_log.read_callback = null;
|
|
194
218
|
assert(manifest_log.read_block_reference == null);
|
|
195
219
|
|
|
196
220
|
callback(manifest_log);
|
|
@@ -229,6 +253,10 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
229
253
|
}
|
|
230
254
|
}
|
|
231
255
|
|
|
256
|
+
if (block_entry_count(block) < entry_count_max) {
|
|
257
|
+
manifest.queue_for_compaction(block_reference.address);
|
|
258
|
+
}
|
|
259
|
+
|
|
232
260
|
log.debug("{}: opened: checksum={} address={} entries={}", .{
|
|
233
261
|
manifest_log.tree_hash,
|
|
234
262
|
block_reference.checksum,
|
|
@@ -259,11 +287,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
259
287
|
assert(table.snapshot_min > 0);
|
|
260
288
|
assert(table.snapshot_max > table.snapshot_min);
|
|
261
289
|
|
|
262
|
-
if (manifest_log.
|
|
263
|
-
manifest_log.
|
|
264
|
-
} else if (manifest_log.entry_count == entry_count_max) {
|
|
265
|
-
assert(manifest_log.blocks.count > 0);
|
|
266
|
-
manifest_log.close_block();
|
|
290
|
+
if (manifest_log.entry_count == 0) {
|
|
291
|
+
assert(manifest_log.blocks.count == manifest_log.blocks_closed);
|
|
267
292
|
manifest_log.acquire_block();
|
|
268
293
|
} else if (manifest_log.entry_count > 0) {
|
|
269
294
|
assert(manifest_log.blocks.count > 0);
|
|
@@ -302,12 +327,18 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
302
327
|
}
|
|
303
328
|
|
|
304
329
|
manifest_log.entry_count += 1;
|
|
330
|
+
if (manifest_log.entry_count == entry_count_max) {
|
|
331
|
+
manifest_log.close_block();
|
|
332
|
+
assert(manifest_log.entry_count == 0);
|
|
333
|
+
}
|
|
305
334
|
}
|
|
306
335
|
|
|
307
|
-
|
|
336
|
+
/// `flush` does not close a partial block; that is only necessary during `checkpoint`.
|
|
337
|
+
fn flush(manifest_log: *ManifestLog, callback: Callback) void {
|
|
308
338
|
assert(manifest_log.opened);
|
|
309
339
|
assert(!manifest_log.reading);
|
|
310
340
|
assert(!manifest_log.writing);
|
|
341
|
+
assert(manifest_log.write_callback == null);
|
|
311
342
|
|
|
312
343
|
manifest_log.writing = true;
|
|
313
344
|
manifest_log.write_callback = callback;
|
|
@@ -332,8 +363,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
332
363
|
assert(manifest_log.entry_count < entry_count_max);
|
|
333
364
|
}
|
|
334
365
|
|
|
335
|
-
const callback = manifest_log.write_callback
|
|
336
|
-
manifest_log.write_callback =
|
|
366
|
+
const callback = manifest_log.write_callback.?;
|
|
367
|
+
manifest_log.write_callback = null;
|
|
337
368
|
manifest_log.writing = false;
|
|
338
369
|
|
|
339
370
|
callback(manifest_log);
|
|
@@ -350,6 +381,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
350
381
|
const entry_count = block_entry_count(block);
|
|
351
382
|
|
|
352
383
|
if (manifest_log.blocks_closed == 1 and manifest_log.blocks.count == 1) {
|
|
384
|
+
// This might be the last block of a checkpoint, which can be a partial block.
|
|
353
385
|
assert(entry_count > 0);
|
|
354
386
|
} else {
|
|
355
387
|
assert(entry_count == entry_count_max);
|
|
@@ -395,41 +427,48 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
395
427
|
}
|
|
396
428
|
|
|
397
429
|
pub fn compact(manifest_log: *ManifestLog, callback: Callback) void {
|
|
430
|
+
assert(manifest_log.opened);
|
|
398
431
|
assert(!manifest_log.reading);
|
|
432
|
+
assert(!manifest_log.writing);
|
|
433
|
+
assert(manifest_log.read_callback == null);
|
|
399
434
|
manifest_log.read_callback = callback;
|
|
400
|
-
manifest_log.flush(
|
|
435
|
+
manifest_log.flush(compact_flush_callback);
|
|
401
436
|
}
|
|
402
437
|
|
|
403
|
-
fn
|
|
404
|
-
const callback = manifest_log.read_callback
|
|
405
|
-
manifest_log.read_callback = undefined;
|
|
438
|
+
fn compact_flush_callback(manifest_log: *ManifestLog) void {
|
|
439
|
+
const callback = manifest_log.read_callback.?;
|
|
406
440
|
|
|
407
441
|
assert(manifest_log.opened);
|
|
408
442
|
assert(!manifest_log.reading);
|
|
409
443
|
assert(!manifest_log.writing);
|
|
444
|
+
assert(manifest_log.blocks_closed == 0);
|
|
410
445
|
|
|
411
446
|
const manifest: *SuperBlock.Manifest = &manifest_log.superblock.manifest;
|
|
412
447
|
|
|
448
|
+
// Compact a single manifest block — to minimize latency spikes, we want to do the bare
|
|
449
|
+
// minimum of compaction work required.
|
|
450
|
+
// TODO Compact more than 1 block if fragmentation is outstripping the compaction rate.
|
|
413
451
|
if (manifest.oldest_block_queued_for_compaction(manifest_log.tree_hash)) |block| {
|
|
414
452
|
assert(block.tree == manifest_log.tree_hash);
|
|
415
453
|
assert(block.address > 0);
|
|
416
454
|
|
|
417
455
|
manifest_log.reading = true;
|
|
418
|
-
manifest_log.read_callback = callback;
|
|
419
456
|
manifest_log.read_block_reference = block;
|
|
420
457
|
|
|
421
458
|
manifest_log.grid.read_block(
|
|
422
|
-
|
|
459
|
+
compact_read_block_callback,
|
|
423
460
|
&manifest_log.read,
|
|
424
461
|
block.address,
|
|
425
462
|
block.checksum,
|
|
463
|
+
.manifest,
|
|
426
464
|
);
|
|
427
465
|
} else {
|
|
466
|
+
manifest_log.read_callback = null;
|
|
428
467
|
callback(manifest_log);
|
|
429
468
|
}
|
|
430
469
|
}
|
|
431
470
|
|
|
432
|
-
fn
|
|
471
|
+
fn compact_read_block_callback(read: *Grid.Read, block: BlockPtrConst) void {
|
|
433
472
|
const manifest_log = @fieldParentPtr(ManifestLog, "read", read);
|
|
434
473
|
assert(manifest_log.opened);
|
|
435
474
|
assert(manifest_log.reading);
|
|
@@ -489,11 +528,11 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
489
528
|
);
|
|
490
529
|
assert(!manifest.queued_for_compaction(block_reference.address));
|
|
491
530
|
|
|
492
|
-
manifest_log.
|
|
531
|
+
manifest_log.grid.release_at_checkpoint(block_reference.address);
|
|
493
532
|
|
|
494
|
-
const callback = manifest_log.read_callback
|
|
533
|
+
const callback = manifest_log.read_callback.?;
|
|
495
534
|
manifest_log.reading = false;
|
|
496
|
-
manifest_log.read_callback =
|
|
535
|
+
manifest_log.read_callback = null;
|
|
497
536
|
manifest_log.read_block_reference = null;
|
|
498
537
|
|
|
499
538
|
callback(manifest_log);
|
|
@@ -503,6 +542,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
503
542
|
assert(manifest_log.opened);
|
|
504
543
|
assert(!manifest_log.reading);
|
|
505
544
|
assert(!manifest_log.writing);
|
|
545
|
+
assert(manifest_log.write_callback == null);
|
|
506
546
|
|
|
507
547
|
manifest_log.writing = true;
|
|
508
548
|
manifest_log.write_callback = callback;
|
|
@@ -519,7 +559,9 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
519
559
|
}
|
|
520
560
|
|
|
521
561
|
fn acquire_block(manifest_log: *ManifestLog) void {
|
|
562
|
+
assert(manifest_log.opened);
|
|
522
563
|
assert(manifest_log.entry_count == 0);
|
|
564
|
+
assert(manifest_log.blocks.count == manifest_log.blocks_closed);
|
|
523
565
|
assert(!manifest_log.blocks.full());
|
|
524
566
|
|
|
525
567
|
manifest_log.blocks.advance_tail();
|
|
@@ -529,15 +571,16 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
529
571
|
const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
|
|
530
572
|
header.* = .{
|
|
531
573
|
.cluster = manifest_log.superblock.working.cluster,
|
|
532
|
-
.op = manifest_log.
|
|
574
|
+
.op = manifest_log.grid.acquire(),
|
|
533
575
|
.size = undefined,
|
|
534
576
|
.command = .block,
|
|
535
577
|
};
|
|
536
578
|
}
|
|
537
579
|
|
|
538
580
|
fn close_block(manifest_log: *ManifestLog) void {
|
|
539
|
-
|
|
581
|
+
assert(manifest_log.blocks.count == manifest_log.blocks_closed + 1);
|
|
540
582
|
|
|
583
|
+
const block: BlockPtr = manifest_log.blocks.tail().?;
|
|
541
584
|
const entry_count = manifest_log.entry_count;
|
|
542
585
|
assert(entry_count > 0);
|
|
543
586
|
assert(entry_count <= entry_count_max);
|
|
@@ -554,6 +597,7 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
554
597
|
// Zero unused tables, and padding:
|
|
555
598
|
mem.set(u8, block[header.size..], 0);
|
|
556
599
|
|
|
600
|
+
header.operation = BlockType.manifest.operation();
|
|
557
601
|
header.set_checksum_body(block[@sizeOf(vsr.Header)..header.size]);
|
|
558
602
|
header.set_checksum();
|
|
559
603
|
|
|
@@ -569,10 +613,12 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
569
613
|
|
|
570
614
|
manifest_log.blocks_closed += 1;
|
|
571
615
|
manifest_log.entry_count = 0;
|
|
616
|
+
assert(manifest_log.blocks.count == manifest_log.blocks_closed);
|
|
572
617
|
}
|
|
573
618
|
|
|
574
619
|
fn verify_block(block: BlockPtrConst, checksum: ?u128, address: ?u64) void {
|
|
575
620
|
const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
|
|
621
|
+
assert(BlockType.from(header.operation) == .manifest);
|
|
576
622
|
|
|
577
623
|
if (config.verify) {
|
|
578
624
|
assert(header.valid_checksum());
|
|
@@ -623,6 +669,8 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
623
669
|
|
|
624
670
|
// Encode the smaller type first because this will be multiplied by entry_count_max.
|
|
625
671
|
const labels_size = entry_count_max * @sizeOf(Label);
|
|
672
|
+
assert(labels_size == labels_size_max);
|
|
673
|
+
assert((@sizeOf(vsr.Header) + labels_size) % @alignOf(TableInfo) == 0);
|
|
626
674
|
const tables_size = entry_count * @sizeOf(TableInfo);
|
|
627
675
|
|
|
628
676
|
return @sizeOf(vsr.Header) + labels_size + tables_size;
|
|
@@ -649,14 +697,14 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
|
|
|
649
697
|
fn tables(block: BlockPtr) *[entry_count_max]TableInfo {
|
|
650
698
|
return mem.bytesAsSlice(
|
|
651
699
|
TableInfo,
|
|
652
|
-
block[@sizeOf(vsr.Header) +
|
|
700
|
+
block[@sizeOf(vsr.Header) + labels_size_max ..][0..tables_size_max],
|
|
653
701
|
)[0..entry_count_max];
|
|
654
702
|
}
|
|
655
703
|
|
|
656
704
|
fn tables_const(block: BlockPtrConst) *const [entry_count_max]TableInfo {
|
|
657
705
|
return mem.bytesAsSlice(
|
|
658
706
|
TableInfo,
|
|
659
|
-
block[@sizeOf(vsr.Header) +
|
|
707
|
+
block[@sizeOf(vsr.Header) + labels_size_max ..][0..tables_size_max],
|
|
660
708
|
)[0..entry_count_max];
|
|
661
709
|
}
|
|
662
710
|
};
|
|
@@ -893,6 +941,7 @@ pub fn main() !void {
|
|
|
893
941
|
};
|
|
894
942
|
assert(@sizeOf(TableInfo) == 48 + 16 * 2);
|
|
895
943
|
assert(@alignOf(TableInfo) == 16);
|
|
944
|
+
assert(@bitSizeOf(TableInfo) == @sizeOf(TableInfo) * 8);
|
|
896
945
|
|
|
897
946
|
const ManifestLogTest = ManifestLogTestType(Storage, TableInfo);
|
|
898
947
|
|
|
@@ -12,6 +12,7 @@ const GridType = @import("grid.zig").GridType;
|
|
|
12
12
|
const NodePool = @import("node_pool.zig").NodePool(config.lsm_manifest_node_size, 16);
|
|
13
13
|
|
|
14
14
|
const snapshot_latest = @import("tree.zig").snapshot_latest;
|
|
15
|
+
const compaction_snapshot_for_op = @import("tree.zig").compaction_snapshot_for_op;
|
|
15
16
|
|
|
16
17
|
/// This type wraps a single LSM tree in the API needed to integrate it with the Forest.
|
|
17
18
|
/// TigerBeetle's state machine requires a map from u128 ID to posted boolean for transfers
|
|
@@ -44,7 +45,6 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
44
45
|
return value.id;
|
|
45
46
|
}
|
|
46
47
|
|
|
47
|
-
// TODO(ifreund): disallow this id in the state machine.
|
|
48
48
|
const sentinel_key = math.maxInt(u128);
|
|
49
49
|
|
|
50
50
|
inline fn tombstone(value: *const Value) bool {
|
|
@@ -69,13 +69,13 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
69
69
|
Value.tombstone_from_key,
|
|
70
70
|
);
|
|
71
71
|
|
|
72
|
-
const Tree = TreeType(Table, Storage, "
|
|
72
|
+
const Tree = TreeType(Table, Storage, "posted_groove");
|
|
73
73
|
const Grid = GridType(Storage);
|
|
74
74
|
|
|
75
75
|
const PrefetchIDs = std.AutoHashMapUnmanaged(u128, void);
|
|
76
|
-
const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool);
|
|
76
|
+
const PrefetchObjects = std.AutoHashMapUnmanaged(u128, bool); // true:posted, false:voided
|
|
77
77
|
|
|
78
|
-
cache: *Tree.
|
|
78
|
+
cache: *Tree.TableMutable.ValuesCache,
|
|
79
79
|
tree: Tree,
|
|
80
80
|
|
|
81
81
|
/// Object IDs enqueued to be prefetched.
|
|
@@ -90,42 +90,31 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
90
90
|
/// sufficient to query this hashmap alone to know the state of the LSM trees.
|
|
91
91
|
prefetch_objects: PrefetchObjects,
|
|
92
92
|
|
|
93
|
-
///
|
|
93
|
+
/// The snapshot to prefetch from.
|
|
94
|
+
prefetch_snapshot: ?u64,
|
|
95
|
+
|
|
96
|
+
/// This field is necessary to expose the same open()/compact()/checkpoint() function
|
|
94
97
|
/// signatures as the real Groove type.
|
|
95
98
|
callback: ?fn (*PostedGroove) void = null,
|
|
96
99
|
|
|
100
|
+
/// See comments for Groove.Options.
|
|
101
|
+
pub const Options = struct {
|
|
102
|
+
cache_entries_max: u32,
|
|
103
|
+
prefetch_entries_max: u32,
|
|
104
|
+
commit_entries_max: u32,
|
|
105
|
+
};
|
|
106
|
+
|
|
97
107
|
pub fn init(
|
|
98
108
|
allocator: mem.Allocator,
|
|
99
109
|
node_pool: *NodePool,
|
|
100
110
|
grid: *Grid,
|
|
101
|
-
|
|
102
|
-
// that tigerbeetle was given to allocate from CLI arguments.
|
|
103
|
-
cache_size: u32,
|
|
104
|
-
// In general, the commit count max for a field, depends on the field's object,
|
|
105
|
-
// how many objects might be changed by a batch:
|
|
106
|
-
// (config.message_size_max - sizeOf(vsr.header))
|
|
107
|
-
// For example, there are at most 8191 transfers in a batch.
|
|
108
|
-
// So commit_count_max=8191 for transfer objects and indexes.
|
|
109
|
-
//
|
|
110
|
-
// However, if a transfer is ever mutated, then this will double commit_count_max
|
|
111
|
-
// since the old index might need to be removed, and the new index inserted.
|
|
112
|
-
//
|
|
113
|
-
// A way to see this is by looking at the state machine. If a transfer is inserted,
|
|
114
|
-
// how many accounts and transfer put/removes will be generated?
|
|
115
|
-
//
|
|
116
|
-
// This also means looking at the state machine operation that will generate the
|
|
117
|
-
// most put/removes in the worst case.
|
|
118
|
-
// For example, create_accounts will put at most 8191 accounts.
|
|
119
|
-
// However, create_transfers will put 2 accounts (8191 * 2) for every transfer, and
|
|
120
|
-
// some of these accounts may exist, requiring a remove/put to update the index.
|
|
121
|
-
commit_count_max: u32,
|
|
111
|
+
options: Options,
|
|
122
112
|
) !PostedGroove {
|
|
123
|
-
// Cache is
|
|
124
|
-
const cache = try allocator.create(Tree.
|
|
113
|
+
// Cache is heap-allocated to pass a pointer into the Object tree.
|
|
114
|
+
const cache = try allocator.create(Tree.TableMutable.ValuesCache);
|
|
125
115
|
errdefer allocator.destroy(cache);
|
|
126
116
|
|
|
127
|
-
cache.* = .
|
|
128
|
-
try cache.ensureTotalCapacity(allocator, cache_size);
|
|
117
|
+
cache.* = try Tree.TableMutable.ValuesCache.init(allocator, options.cache_entries_max);
|
|
129
118
|
errdefer cache.deinit(allocator);
|
|
130
119
|
|
|
131
120
|
var tree = try Tree.init(
|
|
@@ -134,20 +123,17 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
134
123
|
grid,
|
|
135
124
|
cache,
|
|
136
125
|
.{
|
|
137
|
-
.
|
|
126
|
+
.commit_entries_max = options.commit_entries_max,
|
|
138
127
|
},
|
|
139
128
|
);
|
|
140
129
|
errdefer tree.deinit(allocator);
|
|
141
130
|
|
|
142
|
-
// TODO: document why this is twice the commit count max.
|
|
143
|
-
const prefetch_count_max = commit_count_max * 2;
|
|
144
|
-
|
|
145
131
|
var prefetch_ids = PrefetchIDs{};
|
|
146
|
-
try prefetch_ids.ensureTotalCapacity(allocator,
|
|
132
|
+
try prefetch_ids.ensureTotalCapacity(allocator, options.prefetch_entries_max);
|
|
147
133
|
errdefer prefetch_ids.deinit(allocator);
|
|
148
134
|
|
|
149
135
|
var prefetch_objects = PrefetchObjects{};
|
|
150
|
-
try prefetch_objects.ensureTotalCapacity(allocator,
|
|
136
|
+
try prefetch_objects.ensureTotalCapacity(allocator, options.prefetch_entries_max);
|
|
151
137
|
errdefer prefetch_objects.deinit(allocator);
|
|
152
138
|
|
|
153
139
|
return PostedGroove{
|
|
@@ -156,12 +142,11 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
156
142
|
|
|
157
143
|
.prefetch_ids = prefetch_ids,
|
|
158
144
|
.prefetch_objects = prefetch_objects,
|
|
145
|
+
.prefetch_snapshot = null,
|
|
159
146
|
};
|
|
160
147
|
}
|
|
161
148
|
|
|
162
149
|
pub fn deinit(groove: *PostedGroove, allocator: mem.Allocator) void {
|
|
163
|
-
assert(groove.callback == null);
|
|
164
|
-
|
|
165
150
|
groove.tree.deinit(allocator);
|
|
166
151
|
groove.cache.deinit(allocator);
|
|
167
152
|
allocator.destroy(groove.cache);
|
|
@@ -176,10 +161,24 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
176
161
|
return groove.prefetch_objects.get(id);
|
|
177
162
|
}
|
|
178
163
|
|
|
179
|
-
/// Must be called directly
|
|
180
|
-
///
|
|
181
|
-
pub fn
|
|
182
|
-
|
|
164
|
+
/// Must be called directly before the state machine begins queuing ids for prefetch.
|
|
165
|
+
/// When `snapshot` is null, prefetch from the current snapshot.
|
|
166
|
+
pub fn prefetch_setup(groove: *PostedGroove, snapshot: ?u64) void {
|
|
167
|
+
// We may query the input tables of an ongoing compaction, but must not query the
|
|
168
|
+
// output tables until the compaction is complete. (Until then, the output tables may
|
|
169
|
+
// be in the manifest but not yet on disk).
|
|
170
|
+
const snapshot_max = groove.tree.lookup_snapshot_max;
|
|
171
|
+
const snapshot_target = snapshot orelse snapshot_max;
|
|
172
|
+
assert(snapshot_target <= snapshot_max);
|
|
173
|
+
|
|
174
|
+
if (groove.prefetch_snapshot == null) {
|
|
175
|
+
groove.prefetch_objects.clearRetainingCapacity();
|
|
176
|
+
} else {
|
|
177
|
+
// If there is a snapshot already set from the previous prefetch_setup(), then its
|
|
178
|
+
// prefetch() was never called, so there must already be no queued objects or ids.
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
groove.prefetch_snapshot = snapshot_target;
|
|
183
182
|
assert(groove.prefetch_objects.count() == 0);
|
|
184
183
|
assert(groove.prefetch_ids.count() == 0);
|
|
185
184
|
}
|
|
@@ -188,7 +187,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
188
187
|
/// We tolerate duplicate IDs enqueued by the state machine.
|
|
189
188
|
/// For example, if all unique operations require the same two dependencies.
|
|
190
189
|
pub fn prefetch_enqueue(groove: *PostedGroove, id: u128) void {
|
|
191
|
-
if (groove.tree.
|
|
190
|
+
if (groove.tree.lookup_from_memory(groove.prefetch_snapshot.?, id)) |value| {
|
|
192
191
|
switch (value.data) {
|
|
193
192
|
.posted => groove.prefetch_objects.putAssumeCapacity(value.id, true),
|
|
194
193
|
.voided => groove.prefetch_objects.putAssumeCapacity(value.id, false),
|
|
@@ -200,8 +199,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
200
199
|
}
|
|
201
200
|
|
|
202
201
|
/// Ensure the objects corresponding to all ids enqueued with prefetch_enqueue() are
|
|
203
|
-
///
|
|
204
|
-
/// backup hash map.
|
|
202
|
+
/// available in `prefetch_objects`.
|
|
205
203
|
pub fn prefetch(
|
|
206
204
|
groove: *PostedGroove,
|
|
207
205
|
callback: fn (*PrefetchContext) void,
|
|
@@ -210,14 +208,17 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
210
208
|
context.* = .{
|
|
211
209
|
.groove = groove,
|
|
212
210
|
.callback = callback,
|
|
211
|
+
.snapshot = groove.prefetch_snapshot.?,
|
|
213
212
|
.id_iterator = groove.prefetch_ids.keyIterator(),
|
|
214
213
|
};
|
|
214
|
+
groove.prefetch_snapshot = null;
|
|
215
215
|
context.start_workers();
|
|
216
216
|
}
|
|
217
217
|
|
|
218
218
|
pub const PrefetchContext = struct {
|
|
219
219
|
groove: *PostedGroove,
|
|
220
220
|
callback: fn (*PrefetchContext) void,
|
|
221
|
+
snapshot: u64,
|
|
221
222
|
|
|
222
223
|
id_iterator: PrefetchIDs.KeyIterator,
|
|
223
224
|
|
|
@@ -233,17 +234,16 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
233
234
|
|
|
234
235
|
// Track an extra "worker" that will finish after the loop.
|
|
235
236
|
//
|
|
236
|
-
// This prevents `context.finish()` from being called within the loop body when
|
|
237
|
-
// worker finishes synchronously. `context.finish()`
|
|
238
|
-
//
|
|
237
|
+
// This prevents `context.finish()` from being called within the loop body when
|
|
238
|
+
// every worker finishes synchronously. `context.finish()` calls the user-provided
|
|
239
|
+
// callback which may re-use the memory of this `PrefetchContext`. However, we
|
|
240
|
+
// rely on `context` being well-defined for the loop condition.
|
|
239
241
|
context.workers_busy += 1;
|
|
240
242
|
|
|
241
|
-
|
|
242
|
-
while (context.workers_busy - 1 < context.workers.len) {
|
|
243
|
-
const worker = &context.workers[context.workers_busy - 1];
|
|
243
|
+
for (context.workers) |*worker| {
|
|
244
244
|
worker.* = .{ .context = context };
|
|
245
245
|
context.workers_busy += 1;
|
|
246
|
-
|
|
246
|
+
worker.lookup_start_next();
|
|
247
247
|
}
|
|
248
248
|
|
|
249
249
|
assert(context.workers_busy >= 1);
|
|
@@ -257,12 +257,12 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
257
257
|
|
|
258
258
|
fn finish(context: *PrefetchContext) void {
|
|
259
259
|
assert(context.workers_busy == 0);
|
|
260
|
-
|
|
260
|
+
|
|
261
261
|
assert(context.id_iterator.next() == null);
|
|
262
|
+
context.groove.prefetch_ids.clearRetainingCapacity();
|
|
263
|
+
assert(context.groove.prefetch_ids.count() == 0);
|
|
262
264
|
|
|
263
|
-
|
|
264
|
-
context.* = undefined;
|
|
265
|
-
callback(context);
|
|
265
|
+
context.callback(context);
|
|
266
266
|
}
|
|
267
267
|
};
|
|
268
268
|
|
|
@@ -272,33 +272,27 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
272
272
|
|
|
273
273
|
/// Returns true if asynchronous I/O has been started.
|
|
274
274
|
/// Returns false if there are no more IDs to prefetch.
|
|
275
|
-
fn
|
|
276
|
-
const groove = worker.context.groove;
|
|
277
|
-
|
|
275
|
+
fn lookup_start_next(worker: *PrefetchWorker) void {
|
|
278
276
|
const id = worker.context.id_iterator.next() orelse {
|
|
279
|
-
groove.prefetch_ids.clearRetainingCapacity();
|
|
280
|
-
assert(groove.prefetch_ids.count() == 0);
|
|
281
277
|
worker.context.worker_finished();
|
|
282
|
-
return
|
|
278
|
+
return;
|
|
283
279
|
};
|
|
284
280
|
|
|
285
281
|
if (config.verify) {
|
|
286
|
-
// This
|
|
287
|
-
assert(groove.tree.
|
|
282
|
+
// This was checked in prefetch_enqueue().
|
|
283
|
+
assert(worker.context.groove.tree.lookup_from_memory(worker.context.snapshot, id.*) == null);
|
|
288
284
|
}
|
|
289
285
|
|
|
290
286
|
// If not in the LSM tree's cache, the object must be read from disk and added
|
|
291
287
|
// to the auxillary prefetch_objects hash map.
|
|
292
288
|
// TODO: this LSM tree function needlessly checks the LSM tree's cache a
|
|
293
289
|
// second time. Adding API to the LSM tree to avoid this may be worthwhile.
|
|
294
|
-
groove.tree.
|
|
290
|
+
worker.context.groove.tree.lookup_from_levels(
|
|
295
291
|
lookup_id_callback,
|
|
296
292
|
&worker.lookup_id,
|
|
297
|
-
|
|
293
|
+
worker.context.snapshot,
|
|
298
294
|
id.*,
|
|
299
295
|
);
|
|
300
|
-
|
|
301
|
-
return true;
|
|
302
296
|
}
|
|
303
297
|
|
|
304
298
|
fn lookup_id_callback(
|
|
@@ -321,13 +315,7 @@ pub fn PostedGrooveType(comptime Storage: type) type {
|
|
|
321
315
|
},
|
|
322
316
|
}
|
|
323
317
|
}
|
|
324
|
-
worker.
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
fn lookup_finish(worker: *PrefetchWorker) void {
|
|
328
|
-
if (!worker.lookup_start()) {
|
|
329
|
-
worker.* = undefined;
|
|
330
|
-
}
|
|
318
|
+
worker.lookup_start_next();
|
|
331
319
|
}
|
|
332
320
|
};
|
|
333
321
|
|
|
@@ -392,7 +380,7 @@ test "PostedGroove" {
|
|
|
392
380
|
|
|
393
381
|
_ = PostedGroove.prefetch_enqueue;
|
|
394
382
|
_ = PostedGroove.prefetch;
|
|
395
|
-
_ = PostedGroove.
|
|
383
|
+
_ = PostedGroove.prefetch_setup;
|
|
396
384
|
|
|
397
385
|
std.testing.refAllDecls(PostedGroove.PrefetchWorker);
|
|
398
386
|
std.testing.refAllDecls(PostedGroove.PrefetchContext);
|