tigerbeetle-node 0.11.6 → 0.11.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/.client.node.sha256 +1 -1
- package/package.json +1 -1
- package/src/tigerbeetle/scripts/benchmark.bat +1 -2
- package/src/tigerbeetle/scripts/benchmark.sh +1 -2
- package/src/tigerbeetle/scripts/install.bat +7 -0
- package/src/tigerbeetle/scripts/install.sh +2 -3
- package/src/tigerbeetle/src/benchmark.zig +3 -3
- package/src/tigerbeetle/src/ewah.zig +6 -5
- package/src/tigerbeetle/src/ewah_fuzz.zig +1 -1
- package/src/tigerbeetle/src/io/darwin.zig +19 -0
- package/src/tigerbeetle/src/io/linux.zig +8 -0
- package/src/tigerbeetle/src/io/windows.zig +20 -2
- package/src/tigerbeetle/src/iops.zig +7 -1
- package/src/tigerbeetle/src/lsm/compaction.zig +18 -29
- package/src/tigerbeetle/src/lsm/forest_fuzz.zig +9 -5
- package/src/tigerbeetle/src/lsm/grid.zig +267 -267
- package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1
- package/src/tigerbeetle/src/lsm/manifest.zig +29 -1
- package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5
- package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2
- package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70
- package/src/tigerbeetle/src/lsm/table.zig +42 -0
- package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0
- package/src/tigerbeetle/src/lsm/table_mutable.zig +1 -1
- package/src/tigerbeetle/src/lsm/test.zig +2 -3
- package/src/tigerbeetle/src/lsm/tree.zig +27 -6
- package/src/tigerbeetle/src/lsm/tree_fuzz.zig +1 -1
- package/src/tigerbeetle/src/simulator.zig +0 -5
- package/src/tigerbeetle/src/storage.zig +58 -6
- package/src/tigerbeetle/src/test/cluster.zig +3 -0
- package/src/tigerbeetle/src/test/state_checker.zig +1 -1
- package/src/tigerbeetle/src/test/storage.zig +22 -1
- package/src/tigerbeetle/src/tracer.zig +50 -28
- package/src/tigerbeetle/src/unit_tests.zig +9 -4
- package/src/tigerbeetle/src/vopr.zig +4 -4
- package/src/tigerbeetle/src/vsr/journal.zig +153 -93
- package/src/tigerbeetle/src/vsr/replica.zig +10 -20
- package/src/tigerbeetle/src/vsr/superblock.zig +19 -16
- package/src/tigerbeetle/src/vsr/superblock_free_set.zig +114 -93
- package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +1 -1
- package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +1 -3
- package/src/tigerbeetle/src/vsr.zig +55 -8
- package/src/tigerbeetle/src/c/tb_client/context.zig +0 -304
- package/src/tigerbeetle/src/c/tb_client/echo_client.zig +0 -108
- package/src/tigerbeetle/src/c/tb_client/packet.zig +0 -80
- package/src/tigerbeetle/src/c/tb_client/signal.zig +0 -286
- package/src/tigerbeetle/src/c/tb_client/thread.zig +0 -88
- package/src/tigerbeetle/src/c/tb_client.h +0 -220
- package/src/tigerbeetle/src/c/tb_client.zig +0 -177
- package/src/tigerbeetle/src/c/tb_client_header.zig +0 -218
- package/src/tigerbeetle/src/c/tb_client_header_test.zig +0 -135
- package/src/tigerbeetle/src/c/test.zig +0 -371
- package/src/tigerbeetle/src/cli.zig +0 -399
- package/src/tigerbeetle/src/main.zig +0 -242
package/src/tigerbeetle/src/lsm/level_iterator.zig +18 -1

@@ -68,6 +68,9 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
         values: ValuesRingBuffer,
         tables: TablesRingBuffer,
 
+        // Used for verifying key order when constants.verify == true.
+        key_prev: ?Key,
+
         pub fn init(allocator: mem.Allocator) !LevelIterator {
             var values = try ValuesRingBuffer.init(allocator);
             errdefer values.deinit(allocator);
@@ -96,6 +99,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
                         .{ .table_iterator = table_b },
                     },
                 },
+                .key_prev = null,
             };
         }
 
@@ -138,6 +142,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
                 .direction = context.direction,
                 .values = .{ .buffer = it.values.buffer },
                 .tables = .{ .buffer = it.tables.buffer },
+                .key_prev = null,
             };
 
             assert(it.key_exclusive == null);
@@ -281,7 +286,7 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
         /// Returns either:
         /// - the next Key, if available.
         /// - error.Empty when there are no values remaining to iterate.
-        /// - error.Drained when the iterator isn't empty, but the values
+        /// - error.Drained when the iterator isn't empty, but the values
         ///   still need to be buffered into memory via tick().
         pub fn peek(it: LevelIterator) error{ Empty, Drained }!Key {
             if (it.values.head_ptr_const()) |value| return key_from_value(value);
@@ -299,6 +304,18 @@ pub fn LevelIteratorType(comptime Table: type, comptime Storage: type) type {
 
         /// This may only be called after peek() returns a Key (and not Empty or Drained)
        pub fn pop(it: *LevelIterator) Value {
+            const value = it.pop_internal();
+
+            if (constants.verify) {
+                const key = Table.key_from_value(&value);
+                if (it.key_prev) |k| assert(Table.compare_keys(k, key) == .lt);
+                it.key_prev = key;
+            }
+
+            return value;
+        }
+
+        fn pop_internal(it: *LevelIterator) Value {
            if (it.values.pop()) |value| return value;
 
            const table_iterator = &it.tables.head_ptr().?.table_iterator;
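The pop()/pop_internal() split above is a wrap-and-verify pattern: the public method delegates to the real implementation and, when constants.verify is set, asserts that popped keys are strictly ascending. A minimal standalone sketch of the same pattern over a toy u64 iterator (SliceIterator and its test are illustrative, not part of the package):

const std = @import("std");
const assert = std.debug.assert;

// Toy iterator over a fixed slice of u64 keys.
const SliceIterator = struct {
    keys: []const u64,
    index: usize = 0,
    key_prev: ?u64 = null,

    // Public pop(): delegates to pop_internal(), then checks that the keys
    // come out strictly ascending, mirroring the LevelIterator change.
    fn pop(it: *SliceIterator) u64 {
        const key = it.pop_internal();
        if (it.key_prev) |prev| assert(prev < key);
        it.key_prev = key;
        return key;
    }

    fn pop_internal(it: *SliceIterator) u64 {
        const key = it.keys[it.index];
        it.index += 1;
        return key;
    }
};

test "pop verifies ascending key order" {
    var it = SliceIterator{ .keys = &[_]u64{ 1, 2, 5 } };
    try std.testing.expectEqual(@as(u64, 1), it.pop());
    try std.testing.expectEqual(@as(u64, 2), it.pop());
    try std.testing.expectEqual(@as(u64, 5), it.pop());
}

Since the real check is gated on a compile-time constant, release configurations can compile it away entirely.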
package/src/tigerbeetle/src/lsm/manifest.zig +29 -1

@@ -531,7 +531,7 @@ pub fn ManifestType(comptime Table: type, comptime Storage: type) type {
                 .ascending,
                 KeyRange{ .key_min = range.key_min, .key_max = range.key_max },
             );
-            if (it.next()
+            if (it.next() != null) {
                 // If the range is being compacted into the last level then this is unreachable,
                 // as the last level has no subsequent levels and must always drop tombstones.
                 assert(level_b != constants.lsm_levels - 1);
@@ -585,5 +585,33 @@ pub fn ManifestType(comptime Table: type, comptime Storage: type) type {
             manifest.checkpoint_callback = null;
             callback(manifest);
         }
+
+        pub fn verify(manifest: *Manifest, snapshot: u64) void {
+            for (manifest.levels) |*level| {
+                var key_max_prev: ?Key = null;
+                var table_info_iter = level.iterator(
+                    .visible,
+                    &.{snapshot},
+                    .ascending,
+                    null,
+                );
+                while (table_info_iter.next()) |table_info| {
+                    if (key_max_prev) |k| {
+                        assert(compare_keys(k, table_info.key_min) == .lt);
+                    }
+                    // We could have key_min == key_max if there is only one value.
+                    assert(compare_keys(table_info.key_min, table_info.key_max) != .gt);
+                    key_max_prev = table_info.key_max;
+
+                    Table.verify(
+                        Storage,
+                        manifest.manifest_log.grid.superblock.storage,
+                        table_info.address,
+                        table_info.key_min,
+                        table_info.key_max,
+                    );
+                }
+            }
+        }
     };
 }
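The new Manifest.verify() walks each level and asserts two invariants: tables visible to the snapshot appear in ascending key order with disjoint ranges, and each table's own key_min/key_max bounds are consistent (it then re-checks each table's blocks via Table.verify). A reduced sketch of the per-level range check, using plain u64 keys and illustrative names (TableRange, verify_level):

const std = @import("std");
const assert = std.debug.assert;

// A table's key range, as in TableInfo.
const TableRange = struct { key_min: u64, key_max: u64 };

// The invariant Manifest.verify() asserts per level: tables are sorted by
// key and their ranges are disjoint (each table's key_max is strictly below
// the next table's key_min). key_min == key_max is allowed, since a table
// may hold a single value.
fn verify_level(tables: []const TableRange) void {
    var key_max_prev: ?u64 = null;
    for (tables) |table| {
        if (key_max_prev) |prev| assert(prev < table.key_min);
        assert(table.key_min <= table.key_max);
        key_max_prev = table.key_max;
    }
}

test "disjoint, ascending table ranges pass verification" {
    verify_level(&[_]TableRange{
        .{ .key_min = 0, .key_max = 10 },
        .{ .key_min = 11, .key_max = 11 }, // Single-value table.
        .{ .key_min = 20, .key_max = 30 },
    });
}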
package/src/tigerbeetle/src/lsm/manifest_log.zig +5 -5

@@ -404,14 +404,14 @@ pub fn ManifestLogType(comptime Storage: type, comptime TableInfo: type) type {
                 return;
             }
 
-            const block = manifest_log.blocks.
-            verify_block(block
+            const block = manifest_log.blocks.head_ptr().?;
+            verify_block(block.*, null, null);
 
-            const header = mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
-            const address = Block.address(block);
+            const header = mem.bytesAsValue(vsr.Header, block.*[0..@sizeOf(vsr.Header)]);
+            const address = Block.address(block.*);
             assert(address > 0);
 
-            const entry_count = Block.entry_count(block);
+            const entry_count = Block.entry_count(block.*);
 
             if (manifest_log.blocks_closed == 1 and manifest_log.blocks.count == 1) {
                 // This might be the last block of a checkpoint, which can be a partial block.
package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +2 -2

@@ -319,7 +319,7 @@ const Environment = struct {
 
     fn wait(env: *Environment, manifest_log: *ManifestLog) void {
         while (env.pending > 0) {
-            manifest_log.grid.tick();
+            // manifest_log.grid.tick();
             manifest_log.superblock.storage.tick();
         }
     }
@@ -635,7 +635,7 @@ fn verify_manifest_compaction_set(
     var compact_blocks_checked: u32 = 0;
 
     // This test doesn't include any actual table blocks, so all blocks are manifest blocks.
-    var blocks = superblock.free_set.blocks.iterator(.{ .kind = .
+    var blocks = superblock.free_set.blocks.iterator(.{ .kind = .set });
     while (blocks.next()) |block_index| {
         const block_address = block_index + 1;
         const block = superblock.storage.grid_block(block_address);
package/src/tigerbeetle/src/lsm/set_associative_cache.zig +26 -70

@@ -188,21 +188,26 @@ pub fn SetAssociativeCache(
             mem.set(u64, self.clocks.words, 0);
         }
 
-        /// Returns whether an entry with the given key is cached,
+        /// Returns whether an entry with the given key is cached,
         /// without modifying the entry's counter.
         pub fn exists(self: *Self, key: Key) bool {
             const set = self.associate(key);
             return self.search(set, key) != null;
         }
 
-        pub fn
+        pub fn get_index(self: *Self, key: Key) ?usize {
             const set = self.associate(key);
             const way = self.search(set, key) orelse return null;
 
             const count = self.counts.get(set.offset + way);
             self.counts.set(set.offset + way, count +| 1);
 
-            return
+            return set.offset + way;
+        }
+
+        pub fn get(self: *Self, key: Key) ?*align(value_alignment) Value {
+            const index = self.get_index(key) orelse return null;
+            return @alignCast(value_alignment, &self.values[index]);
         }
 
         /// Remove a key from the set associative cache if present.
@@ -211,6 +216,7 @@ pub fn SetAssociativeCache(
             const way = self.search(set, key) orelse return;
 
             self.counts.set(set.offset + way, 0);
+            set.values[way] = undefined;
         }
 
         /// Hint that the key is less likely to be accessed in the future, without actually removing
@@ -248,39 +254,21 @@ pub fn SetAssociativeCache(
             return @ptrCast(*const Ways, &result).*;
         }
 
-
-
-
-            struct {
-                inline fn locked(_: void, _: *const Value) bool {
-                    return false;
-                }
-            }.locked,
-            {},
-            key,
-        );
+        /// Insert a value, evicting an older entry if needed.
+        pub fn insert(self: *Self, value: *const Value) void {
+            _ = self.insert_index(value);
         }
 
-        ///
-        ///
-
-
-        pub fn insert_preserve_locked(
-            self: *Self,
-            comptime Context: type,
-            comptime locked: fn (
-                Context,
-                *align(value_alignment) const Value,
-            ) callconv(.Inline) bool,
-            context: Context,
-            key: Key,
-        ) *align(value_alignment) Value {
+        /// Insert a value, evicting an older entry if needed.
+        /// Return the index at which the value was inserted.
+        pub fn insert_index(self: *Self, value: *const Value) usize {
+            const key = key_from_value(value);
             const set = self.associate(key);
             if (self.search(set, key)) |way| {
-                //
-
-
-
+                // Overwrite the old entry for this key.
+                self.counts.set(set.offset + way, 1);
+                set.values[way] = value.*;
+                return set.offset + way;
             }
 
             const clock_index = @divExact(set.offset, layout.ways);
@@ -299,11 +287,6 @@ pub fn SetAssociativeCache(
                 safety_count += 1;
                 way +%= 1;
             }) {
-                // We pass a value pointer to the callback here so that a cache miss
-                // can be avoided if the caller is able to determine if the value is
-                // locked by comparing pointers directly.
-                if (locked(context, @alignCast(value_alignment, &set.values[way]))) continue;
-
                 var count = self.counts.get(set.offset + way);
                 if (count == 0) break; // Way is already free.
 
@@ -316,10 +299,11 @@ pub fn SetAssociativeCache(
             assert(self.counts.get(set.offset + way) == 0);
 
             set.tags[way] = set.tag;
+            set.values[way] = value.*;
             self.counts.set(set.offset + way, 1);
             self.clocks.set(clock_index, way +% 1);
 
-            return
+            return set.offset + way;
         }
 
         const Set = struct {
@@ -430,7 +414,7 @@ fn set_associative_cache_test(
         try expectEqual(i, sac.clocks.get(0));
 
         const key = i * sac.sets;
-        sac.insert(key)
+        sac.insert(&key);
         try expect(sac.counts.get(i) == 1);
         try expectEqual(key, sac.get(key).?.*);
         try expect(sac.counts.get(i) == 2);
@@ -443,7 +427,7 @@ fn set_associative_cache_test(
     // Insert another element into the first set, causing key 0 to be evicted.
     {
         const key = layout.ways * sac.sets;
-        sac.insert(key)
+        sac.insert(&key);
         try expect(sac.counts.get(0) == 1);
         try expectEqual(key, sac.get(key).?.*);
         try expect(sac.counts.get(0) == 2);
@@ -460,34 +444,6 @@ fn set_associative_cache_test(
 
     if (log) sac.associate(0).inspect(sac);
 
-    // Lock all other slots, causing key layout.ways * sac.sets to be evicted despite having the
-    // highest count.
-    {
-        {
-            assert(sac.counts.get(0) == 2);
-            var i: usize = 1;
-            while (i < layout.ways) : (i += 1) assert(sac.counts.get(i) == 1);
-        }
-
-        const key = (layout.ways + 1) * sac.sets;
-        const expect_evicted = layout.ways * sac.sets;
-
-        sac.insert_preserve_locked(
-            u64,
-            struct {
-                inline fn locked(only_unlocked: u64, value: *const Value) bool {
-                    return value.* != only_unlocked;
-                }
-            }.locked,
-            expect_evicted,
-            key,
-        ).* = key;
-
-        try expectEqual(@as(?*Value, null), sac.get(expect_evicted));
-    }
-
-    if (log) sac.associate(0).inspect(sac);
-
     // Ensure removal works.
     {
         const key = 5 * sac.sets;
@@ -512,7 +468,7 @@ fn set_associative_cache_test(
         try expectEqual(i, sac.clocks.get(0));
 
         const key = i * sac.sets;
-        sac.insert(key)
+        sac.insert(&key);
         try expect(sac.counts.get(i) == 1);
         var j: usize = 2;
         while (j <= math.maxInt(SAC.Count)) : (j += 1) {
@@ -530,7 +486,7 @@ fn set_associative_cache_test(
     // Insert another element into the first set, causing key 0 to be evicted.
    {
         const key = layout.ways * sac.sets;
-        sac.insert(key)
+        sac.insert(&key);
         try expect(sac.counts.get(0) == 1);
         try expectEqual(key, sac.get(key).?.*);
         try expect(sac.counts.get(0) == 2);
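The rewritten insert path now stores whole values (set.values[way] = value.*) and returns the slot index, and with insert_preserve_locked gone, eviction is a plain CLOCK sweep: advance from the clock hand, decaying counts, and claim the first way whose count reaches zero. A deliberately simplified single-set sketch of that sweep (TinySet is illustrative; the real cache packs counts and clock hands into bit arrays and matches tags rather than full keys):

const std = @import("std");

const ways = 4;

// One 4-way set with CLOCK (second-chance) eviction.
const TinySet = struct {
    values: [ways]u64 = [_]u64{0} ** ways,
    counts: [ways]u8 = [_]u8{0} ** ways,
    clock: usize = 0,

    // Sweep from the clock hand, decrementing counts, and claim the first
    // way whose count is zero. Returns the claimed way, as insert_index()
    // returns the claimed cache index.
    fn insert(set: *TinySet, value: u64) usize {
        var way = set.clock;
        while (true) : (way = (way + 1) % ways) {
            if (set.counts[way] == 0) break; // Way is free (or decayed to free).
            set.counts[way] -= 1;
        }
        set.values[way] = value;
        set.counts[way] = 1;
        set.clock = (way + 1) % ways;
        return way;
    }
};

test "clock eviction claims the first zero-count way" {
    var set = TinySet{};
    // Fill all four ways.
    var i: u64 = 0;
    while (i < ways) : (i += 1) _ = set.insert(i);
    // All counts are now 1; the next insert decays them and reclaims a way.
    const way = set.insert(100);
    try std.testing.expectEqual(@as(usize, 0), way);
    try std.testing.expectEqual(@as(u64, 100), set.values[way]);
}

Removing the locked() callback from the sweep is what allows the loop body to shrink to the count-decay shown here.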
package/src/tigerbeetle/src/lsm/table.zig +42 -0

@@ -629,6 +629,11 @@ pub fn TableType(
                 assert(compare_keys(builder.key_min, builder.key_max) == .lt);
             }
 
+            if (current > 0) {
+                const key_max_prev = index_data_keys(builder.index_block)[current - 1];
+                assert(compare_keys(key_max_prev, key_from_value(&values[0])) == .lt);
+            }
+
             builder.data_block_count += 1;
             builder.value = 0;
 
@@ -979,6 +984,43 @@ pub fn TableType(
 
             return null;
         }
+
+        pub fn verify(
+            comptime Storage: type,
+            storage: *Storage,
+            index_address: u64,
+            key_min: ?Key,
+            key_max: ?Key,
+        ) void {
+            if (Storage != @import("../test/storage.zig").Storage)
+                // Too complicated to do async verification
+                return;
+
+            const index_block = storage.grid_block(index_address);
+            const addresses = index_data_addresses(index_block);
+            const data_blocks_used = index_data_blocks_used(index_block);
+            var data_block_index: usize = 0;
+            while (data_block_index < data_blocks_used) : (data_block_index += 1) {
+                const address = addresses[data_block_index];
+                const data_block = storage.grid_block(address);
+                const values = data_block_values_used(data_block);
+                if (values.len > 0) {
+                    if (data_block_index == 0) {
+                        assert(key_min == null or
+                            compare_keys(key_min.?, key_from_value(&values[0])) == .eq);
+                    }
+                    if (data_block_index == data_blocks_used - 1) {
+                        assert(key_max == null or
+                            compare_keys(key_from_value(&values[values.len - 1]), key_max.?) == .eq);
+                    }
+                    var a = &values[0];
+                    for (values[1..]) |*b| {
+                        assert(compare_keys(key_from_value(a), key_from_value(b)) == .lt);
+                        a = b;
+                    }
+                }
+            }
+        }
     };
 }
 
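Table.verify() re-reads every data block from (test) storage and checks that the first key matches key_min and the last matches key_max when bounds are given, and that within each block the keys are strictly ascending. The pairwise walk with a trailing pointer is the core of it; a minimal sketch over u64 values (assert_strictly_ascending is an illustrative name):

const std = @import("std");
const assert = std.debug.assert;

// Walk a block's values with a trailing pointer and assert strictly
// ascending order, as the inner loop of Table.verify() does.
fn assert_strictly_ascending(values: []const u64) void {
    if (values.len == 0) return;
    var a = &values[0];
    for (values[1..]) |*b| {
        assert(a.* < b.*);
        a = b;
    }
}

test "strictly ascending values pass" {
    assert_strictly_ascending(&[_]u64{ 3, 7, 9 });
    assert_strictly_ascending(&[_]u64{}); // Empty blocks are skipped entirely.
}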
package/src/tigerbeetle/src/lsm/table_iterator.zig +27 -0

@@ -52,6 +52,9 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
         /// This field is only used for safety checks, it does not affect the behavior.
         read_pending: bool = false,
 
+        // Used for verifying key order when constants.verify == true.
+        key_prev: ?Table.Key,
+
         pub fn init(allocator: mem.Allocator) !TableIterator {
             const index_block = try allocator.alignedAlloc(
                 u8,
@@ -95,6 +98,7 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
                     },
                 },
                 .value = undefined,
+                .key_prev = null,
             };
         }
 
@@ -132,10 +136,21 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
                 .values = .{ .buffer = it.values.buffer },
                 .data_blocks = .{ .buffer = it.data_blocks.buffer },
                 .value = 0,
+                .key_prev = null,
             };
 
             assert(it.values.empty());
             assert(it.data_blocks.empty());
+
+            if (constants.verify) {
+                Table.verify(
+                    Storage,
+                    context.grid.superblock.storage,
+                    context.address,
+                    null,
+                    null,
+                );
+            }
         }
 
         /// Try to buffer at least a full block of values to be peek()'d.
@@ -291,6 +306,18 @@ pub fn TableIteratorType(comptime Table: type, comptime Storage: type) type {
 
         /// This may only be called after peek() returns a Key (and not Empty or Drained)
         pub fn pop(it: *TableIterator) Table.Value {
+            const value = it.pop_internal();
+
+            if (constants.verify) {
+                const key = Table.key_from_value(&value);
+                if (it.key_prev) |k| assert(Table.compare_keys(k, key) == .lt);
+                it.key_prev = key;
+            }
+
+            return value;
+        }
+
+        fn pop_internal(it: *TableIterator) Table.Value {
             assert(!it.read_pending);
             assert(!it.read_table_index);
 
package/src/tigerbeetle/src/lsm/test.zig +2 -3

@@ -35,8 +35,7 @@ const Environment = struct {
     const node_count = 1024;
     const cache_entries_max = 2 * 1024 * 1024;
     const forest_options = StateMachine.forest_options(.{
-
-        .lsm_forest_node_count = undefined,
+        .lsm_forest_node_count = node_count,
         .cache_entries_accounts = cache_entries_max,
         .cache_entries_transfers = cache_entries_max,
         .cache_entries_posted = cache_entries_max,
@@ -121,7 +120,7 @@ const Environment = struct {
     }
 
     fn tick(env: *Environment) !void {
-        env.grid.tick();
+        // env.grid.tick();
         try env.io.tick();
     }
 
package/src/tigerbeetle/src/lsm/tree.zig +27 -6

@@ -212,13 +212,26 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
            var manifest = try Manifest.init(allocator, node_pool, grid, tree_hash);
            errdefer manifest.deinit(allocator);
 
-            var compaction_table_immutable = try CompactionTableImmutable.init(
+            var compaction_table_immutable = try CompactionTableImmutable.init(
+                allocator,
+                std.fmt.comptimePrint("{s}(immutable->0)", .{tree_name}),
+            );
            errdefer compaction_table_immutable.deinit(allocator);
 
            var compaction_table: [@divFloor(constants.lsm_levels, 2)]CompactionTable = undefined;
-
-
-
+            {
+                comptime var i: usize = 0;
+                inline while (i < compaction_table.len) : (i += 1) {
+                    errdefer for (compaction_table[0..i]) |*c| c.deinit(allocator);
+                    const compaction_name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{
+                        tree_name,
+                        2 * i,
+                        2 * i + 1,
+                        2 * i + 1,
+                        2 * i + 2,
+                    });
+                    compaction_table[i] = try CompactionTable.init(allocator, compaction_name);
+                }
            }
            errdefer for (compaction_table) |*c| c.deinit(allocator);
 
@@ -580,10 +593,14 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
            assert(tree.compaction_io_pending == 0);
            assert(tree.compaction_callback == null);
 
+            if (constants.verify) {
+                tree.manifest.verify(tree.compaction_op);
+            }
+
            tracer.start(
                &tree.tracer_slot,
                .{ .tree = .{ .tree_name = tree_name } },
-                .
+                .tree_compaction_beat,
                @src(),
            );
 
@@ -959,9 +976,13 @@ pub fn TreeType(comptime TreeTable: type, comptime Storage: type, comptime tree_
            tracer.end(
                &tree.tracer_slot,
                .{ .tree = .{ .tree_name = tree_name } },
-                .
+                .tree_compaction_beat,
            );
 
+            if (constants.verify) {
+                tree.manifest.verify(tree.compaction_op);
+            }
+
            // Invoke the compact() callback after the manifest compacts at the end of the beat.
            const callback = tree.compaction_callback.?;
            tree.compaction_callback = null;
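The compaction names above are built with std.fmt.comptimePrint inside an inline while, so every CompactionTable receives a static, compile-time string and the tracer never formats at runtime. A small check of the same format string (the tree name "Accounts" is only an example):

const std = @import("std");

test "comptime-generated compaction name" {
    // comptimePrint formats at compile time and yields a static string.
    const name = std.fmt.comptimePrint("{s}({}->{}/{}->{})", .{ "Accounts", 0, 1, 1, 2 });
    try std.testing.expectEqualStrings("Accounts(0->1/1->2)", name);
}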
package/src/tigerbeetle/src/simulator.zig +0 -5

@@ -269,11 +269,6 @@ pub fn main() !void {
         }
     };
 
-    // Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
-    for (cluster.storages) |*storage, i| {
-        storage.faulty = replica_normal_min <= i;
-    }
-
     var tick: u64 = 0;
     while (tick < ticks_max) : (tick += 1) {
         const health_options = &cluster.options.health_options;
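Note that this gating is not simply deleted: an equivalent assignment reappears in test/cluster.zig below, inside the Cluster storage setup, now derived from the view-change quorum rather than from replica_normal_min.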
package/src/tigerbeetle/src/storage.zig +58 -6

@@ -5,8 +5,8 @@ const assert = std.debug.assert;
 const log = std.log.scoped(.storage);
 
 const IO = @import("io.zig").IO;
+const FIFO = @import("fifo.zig").FIFO;
 const constants = @import("constants.zig");
-const fatal = @import("cli.zig").fatal;
 const vsr = @import("vsr.zig");
 
 pub const Storage = struct {
@@ -68,9 +68,17 @@ pub const Storage = struct {
         offset: u64,
     };
 
+    pub const NextTick = struct {
+        next: ?*NextTick = null,
+        callback: fn (next_tick: *NextTick) void,
+    };
+
     io: *IO,
     fd: os.fd_t,
 
+    next_tick_queue: FIFO(NextTick) = .{},
+    next_tick_completion: IO.Completion = undefined,
+
     pub fn init(io: *IO, fd: os.fd_t) !Storage {
         return Storage{
             .io = io,
@@ -79,6 +87,7 @@ pub const Storage = struct {
     }
 
     pub fn deinit(storage: *Storage) void {
+        assert(storage.next_tick_queue.empty());
         assert(storage.fd != IO.INVALID_FILE);
         storage.fd = IO.INVALID_FILE;
     }
@@ -90,6 +99,43 @@ pub const Storage = struct {
         };
     }
 
+    pub fn on_next_tick(
+        storage: *Storage,
+        callback: fn (next_tick: *Storage.NextTick) void,
+        next_tick: *Storage.NextTick,
+    ) void {
+        next_tick.* = .{ .callback = callback };
+
+        const was_empty = storage.next_tick_queue.empty();
+        storage.next_tick_queue.push(next_tick);
+
+        if (was_empty) {
+            storage.io.timeout(
+                *Storage,
+                storage,
+                timeout_callback,
+                &storage.next_tick_completion,
+                0, // 0ns timeout means to resolve as soon as possible - like a yield
+            );
+        }
+    }
+
+    fn timeout_callback(
+        storage: *Storage,
+        completion: *IO.Completion,
+        result: IO.TimeoutError!void,
+    ) void {
+        assert(completion == &storage.next_tick_completion);
+        _ = result catch |e| switch (e) {
+            error.Canceled => unreachable,
+            error.Unexpected => unreachable,
+        };
+
+        var queue = storage.next_tick_queue;
+        storage.next_tick_queue = .{};
+        while (queue.pop()) |next_tick| next_tick.callback(next_tick);
+    }
+
     pub fn read_sectors(
         self: *Storage,
         callback: fn (read: *Storage.Read) void,
@@ -113,18 +159,24 @@ pub const Storage = struct {
             .target_max = buffer.len,
         };
 
-        self.start_read(read,
+        self.start_read(read, null);
         assert(read.target().len > 0);
     }
 
-    fn start_read(self: *Storage, read: *Storage.Read, bytes_read: usize) void {
-
+    fn start_read(self: *Storage, read: *Storage.Read, bytes_read: ?usize) void {
+        const bytes = bytes_read orelse 0;
+        assert(bytes <= read.target().len);
 
-        read.offset +=
-        read.buffer = read.buffer[
+        read.offset += bytes;
+        read.buffer = read.buffer[bytes..];
 
         const target = read.target();
         if (target.len == 0) {
+            // Resolving the read inline means start_read() must not have been called from
+            // read_sectors(). If it was, this is a synchronous callback resolution and should
+            // be reported.
+            assert(bytes_read != null);
+
             read.callback(read);
             return;
        }
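on_next_tick() gives Storage a yield primitive: callers pass an intrusive NextTick node, nodes are queued FIFO, and a single zero-duration IO timeout later drains the whole batch. A reduced sketch of the queue mechanics with the timeout replaced by an explicit flush() (Scheduler, flush, and count_tick are illustrative names, not from the package):

const std = @import("std");

const NextTick = struct {
    next: ?*NextTick = null,
    callback: fn (next_tick: *NextTick) void,
};

const Scheduler = struct {
    head: ?*NextTick = null,
    tail: ?*NextTick = null,

    // Queue an intrusive node FIFO; the caller owns the node's memory.
    fn on_next_tick(s: *Scheduler, callback: fn (*NextTick) void, node: *NextTick) void {
        node.* = .{ .callback = callback };
        if (s.tail) |tail| tail.next = node else s.head = node;
        s.tail = node;
    }

    // Stands in for the zero-duration IO timeout: drain a detached copy of
    // the queue in one batch.
    fn flush(s: *Scheduler) void {
        var head = s.head;
        s.* = .{};
        while (head) |node| {
            head = node.next;
            node.callback(node);
        }
    }
};

var ticks: u32 = 0;

fn count_tick(next_tick: *NextTick) void {
    _ = next_tick;
    ticks += 1;
}

test "queued callbacks run once per flush" {
    var scheduler = Scheduler{};
    var a: NextTick = undefined;
    var b: NextTick = undefined;
    scheduler.on_next_tick(count_tick, &a);
    scheduler.on_next_tick(count_tick, &b);
    scheduler.flush();
    try std.testing.expectEqual(@as(u32, 2), ticks);
}

Detaching the queue before draining, as the real timeout_callback does with next_tick_queue, means a callback that re-schedules itself lands in a fresh queue and runs on the following flush rather than the current one.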
package/src/tigerbeetle/src/test/cluster.zig +3 -0

@@ -137,6 +137,8 @@ pub const Cluster = struct {
             storage_options.replica_index = @intCast(u8, replica_index);
             storage_options.faulty_wal_areas = faulty_wal_areas[replica_index];
             storage.* = try Storage.init(allocator, options.storage_size_limit, storage_options);
+            // Disable most faults at startup, so that the replicas don't get stuck in recovery mode.
+            storage.faulty = replica_index >= vsr.quorums(options.replica_count).view_change;
         }
         errdefer for (cluster.storages) |*storage| storage.deinit(allocator);
 
@@ -330,6 +332,7 @@ pub const Cluster = struct {
             .{
                 .replica_count = @intCast(u8, cluster.replicas.len),
                 .storage = &cluster.storages[replica_index],
+                // TODO Test restarting with a higher storage limit.
                 .storage_size_limit = cluster.options.storage_size_limit,
                 .message_pool = &cluster.pools[replica_index],
                 .time = time,