tigerbeetle-node 0.11.0 → 0.11.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/package.json +4 -3
  2. package/src/tigerbeetle/scripts/fuzz_loop.sh +1 -1
  3. package/src/tigerbeetle/scripts/pre-commit.sh +2 -2
  4. package/src/tigerbeetle/scripts/validate_docs.sh +17 -0
  5. package/src/tigerbeetle/src/benchmark.zig +25 -11
  6. package/src/tigerbeetle/src/c/tb_client/context.zig +248 -47
  7. package/src/tigerbeetle/src/c/tb_client/echo_client.zig +108 -0
  8. package/src/tigerbeetle/src/c/tb_client/packet.zig +2 -2
  9. package/src/tigerbeetle/src/c/tb_client/signal.zig +2 -4
  10. package/src/tigerbeetle/src/c/tb_client/thread.zig +17 -256
  11. package/src/tigerbeetle/src/c/tb_client.h +18 -4
  12. package/src/tigerbeetle/src/c/tb_client.zig +88 -26
  13. package/src/tigerbeetle/src/c/tb_client_header_test.zig +135 -0
  14. package/src/tigerbeetle/src/c/test.zig +371 -1
  15. package/src/tigerbeetle/src/cli.zig +36 -6
  16. package/src/tigerbeetle/src/config.zig +10 -1
  17. package/src/tigerbeetle/src/demo.zig +2 -1
  18. package/src/tigerbeetle/src/demo_01_create_accounts.zig +1 -1
  19. package/src/tigerbeetle/src/demo_03_create_transfers.zig +13 -0
  20. package/src/tigerbeetle/src/ewah.zig +11 -33
  21. package/src/tigerbeetle/src/ewah_benchmark.zig +8 -9
  22. package/src/tigerbeetle/src/lsm/README.md +97 -3
  23. package/src/tigerbeetle/src/lsm/compaction.zig +32 -7
  24. package/src/tigerbeetle/src/{eytzinger_benchmark.zig → lsm/eytzinger_benchmark.zig} +34 -21
  25. package/src/tigerbeetle/src/lsm/forest_fuzz.zig +34 -32
  26. package/src/tigerbeetle/src/lsm/grid.zig +39 -21
  27. package/src/tigerbeetle/src/lsm/groove.zig +1 -0
  28. package/src/tigerbeetle/src/lsm/k_way_merge.zig +3 -3
  29. package/src/tigerbeetle/src/lsm/level_iterator.zig +1 -1
  30. package/src/tigerbeetle/src/lsm/manifest.zig +13 -0
  31. package/src/tigerbeetle/src/lsm/manifest_level.zig +0 -49
  32. package/src/tigerbeetle/src/lsm/manifest_log.zig +173 -335
  33. package/src/tigerbeetle/src/lsm/manifest_log_fuzz.zig +665 -0
  34. package/src/tigerbeetle/src/lsm/node_pool.zig +4 -0
  35. package/src/tigerbeetle/src/lsm/posted_groove.zig +1 -0
  36. package/src/tigerbeetle/src/lsm/segmented_array.zig +24 -15
  37. package/src/tigerbeetle/src/lsm/table.zig +32 -20
  38. package/src/tigerbeetle/src/lsm/table_immutable.zig +1 -1
  39. package/src/tigerbeetle/src/lsm/table_iterator.zig +4 -5
  40. package/src/tigerbeetle/src/lsm/test.zig +13 -2
  41. package/src/tigerbeetle/src/lsm/tree.zig +45 -7
  42. package/src/tigerbeetle/src/lsm/tree_fuzz.zig +36 -32
  43. package/src/tigerbeetle/src/main.zig +55 -2
  44. package/src/tigerbeetle/src/message_bus.zig +18 -7
  45. package/src/tigerbeetle/src/message_pool.zig +8 -2
  46. package/src/tigerbeetle/src/ring_buffer.zig +7 -3
  47. package/src/tigerbeetle/src/simulator.zig +38 -11
  48. package/src/tigerbeetle/src/state_machine.zig +47 -22
  49. package/src/tigerbeetle/src/test/accounting/workload.zig +9 -5
  50. package/src/tigerbeetle/src/test/cluster.zig +15 -33
  51. package/src/tigerbeetle/src/test/conductor.zig +2 -1
  52. package/src/tigerbeetle/src/test/network.zig +45 -19
  53. package/src/tigerbeetle/src/test/packet_simulator.zig +40 -29
  54. package/src/tigerbeetle/src/test/state_checker.zig +5 -7
  55. package/src/tigerbeetle/src/test/storage.zig +453 -110
  56. package/src/tigerbeetle/src/test/storage_checker.zig +204 -0
  57. package/src/tigerbeetle/src/tigerbeetle.zig +1 -0
  58. package/src/tigerbeetle/src/unit_tests.zig +6 -1
  59. package/src/tigerbeetle/src/util.zig +97 -11
  60. package/src/tigerbeetle/src/vopr.zig +2 -1
  61. package/src/tigerbeetle/src/vsr/client.zig +8 -3
  62. package/src/tigerbeetle/src/vsr/journal.zig +280 -202
  63. package/src/tigerbeetle/src/vsr/replica.zig +169 -31
  64. package/src/tigerbeetle/src/vsr/superblock.zig +356 -629
  65. package/src/tigerbeetle/src/vsr/superblock_client_table.zig +7 -6
  66. package/src/tigerbeetle/src/vsr/superblock_free_set.zig +414 -151
  67. package/src/tigerbeetle/src/vsr/superblock_free_set_fuzz.zig +332 -0
  68. package/src/tigerbeetle/src/vsr/superblock_fuzz.zig +349 -0
  69. package/src/tigerbeetle/src/vsr/superblock_manifest.zig +44 -9
  70. package/src/tigerbeetle/src/vsr/superblock_quorums.zig +394 -0
  71. package/src/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +312 -0
  72. package/src/tigerbeetle/src/vsr.zig +19 -5
  73. package/src/tigerbeetle/src/benchmark_array_search.zig +0 -317
  74. package/src/tigerbeetle/src/benchmarks/perf.zig +0 -299
  75. package/src/tigerbeetle/src/vopr_hub/README.md +0 -58
  76. package/src/tigerbeetle/src/vopr_hub/SETUP.md +0 -199
  77. package/src/tigerbeetle/src/vopr_hub/go.mod +0 -3
  78. package/src/tigerbeetle/src/vopr_hub/main.go +0 -1022
  79. package/src/tigerbeetle/src/vopr_hub/scheduler/go.mod +0 -3
  80. package/src/tigerbeetle/src/vopr_hub/scheduler/main.go +0 -403
@@ -35,15 +35,16 @@ pub fn main() !void {
35
35
  var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
36
36
  defer arena.deinit();
37
37
 
38
+ const allocator = arena.allocator();
38
39
  var i: usize = 0;
39
40
  var bitsets: [samples][]usize = undefined;
40
41
  var bitsets_encoded: [samples][]align(@alignOf(usize)) u8 = undefined;
41
42
  var bitsets_decoded: [samples][]usize = undefined;
42
43
  var bitset_lengths: [samples]usize = undefined;
43
44
  while (i < samples) : (i += 1) {
44
- bitsets[i] = try make_bitset(&arena.allocator, config);
45
- bitsets_encoded[i] = try arena.allocator.alignedAlloc(u8, @alignOf(usize), ewah.encode_size_max(bitsets[0].len));
46
- bitsets_decoded[i] = try arena.allocator.alloc(usize, config.words);
45
+ bitsets[i] = try make_bitset(allocator, config);
46
+ bitsets_encoded[i] = try allocator.alignedAlloc(u8, @alignOf(usize), ewah.encode_size_max(bitsets[0].len));
47
+ bitsets_decoded[i] = try allocator.alloc(usize, config.words);
47
48
  }
48
49
 
49
50
  // Benchmark encoding.
@@ -96,16 +97,14 @@ pub fn main() !void {
96
97
  }
97
98
  }
98
99
 
99
- fn make_bitset(allocator: *std.mem.Allocator, config: BitSetConfig) ![]usize {
100
+ fn make_bitset(allocator: std.mem.Allocator, config: BitSetConfig) ![]usize {
100
101
  var words = try allocator.alloc(usize, config.words);
101
102
  var w: usize = 0;
102
- var run: bool = true;
103
103
  var literal: usize = 1;
104
104
  while (w < words.len) : (w += 1) {
105
- const start = w;
106
- const run_length = prng.random.uintLessThan(usize, 2 * config.run_length_e);
107
- const literals_length = prng.random.uintLessThan(usize, 2 * config.literals_length_e);
108
- const run_bit = prng.random.boolean();
105
+ const run_length = prng.random().uintLessThan(usize, 2 * config.run_length_e);
106
+ const literals_length = prng.random().uintLessThan(usize, 2 * config.literals_length_e);
107
+ const run_bit = prng.random().boolean();
109
108
 
110
109
  const run_end = std.math.min(w + run_length, words.len);
111
110
  while (w < run_end) : (w += 1) {
@@ -25,7 +25,7 @@ A tree is a hierarchy of in-memory and on-disk tables. There are three categorie
25
25
  - The mutable table's contents are periodically moved to the immutable table,
26
26
  where they are stored while being flushed to level `0`.
27
27
  - Level `0` … level `config.lsm_levels - 1` each contain an exponentially increasing number of
28
- on-disk tables.
28
+ immutable on-disk tables.
29
29
  - Each tree has as many as `config.lsm_growth_factor ^ (level + 1)` tables per level.
30
30
  (`config.lsm_growth_factor` is typically 8).
31
31
  - Within a given level and snapshot, the tables' key ranges are [disjoint](manifest_level.zig).
@@ -47,7 +47,7 @@ The first half of the bar compacts even levels while the latter compacts odd lev
47
47
  Mutable table changes are sorted and compacted into the immutable table.
48
48
  The immutable table is compacted into level 0 during the odd level half of the bar.
49
49
 
50
- At any given point, there are at most `levels/2` compactions running concurrently.
50
+ At any given point, there are at most `⌈levels/2⌉` compactions running concurrently.
51
51
  The source level is denoted as `level_a` and the target level as `level_b`.
52
52
  The last level in the LSM tree has no target level so it is never a source level.
53
53
  Each compaction compacts a [single table](#table-selection) from `level_a` into all tables in
@@ -55,7 +55,7 @@ Each compaction compacts a [single table](#table-selection) from `level_a` into
55
55
 
56
56
  Invariants:
57
57
  * At the end of every beat, there is space in the mutable table for the next beat.
58
- * The manifest is compacted at the end of every beat.
58
+ * The manifest log is compacted at the end of every beat.
59
59
  * The compactions' output tables are not [visible](#snapshots-and-compaction) until the compaction has finished.
60
60
 
61
61
  1. First half-bar, first beat ("first beat"):
@@ -63,15 +63,20 @@ Invariants:
63
63
  * Allow the per-level table limits to overflow if needed (for example, if we may compact a table
64
64
  from level `A` to level `B`, where level `B` is already full).
65
65
  * Start compactions from even levels that have reached their table limit.
66
+ * Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
67
+ during this half-bar.
66
68
 
67
69
  2. First half-bar, last beat:
68
70
  * Finish ticking any incomplete even-level compactions.
69
71
  * Assert on callback completion that all compactions are complete.
72
+ * Release reservations from the Free Set.
70
73
 
71
74
  3. Second half-bar, first beat ("middle beat"):
72
75
  * Assert no compactions are currently running.
73
76
  * Start compactions from odd levels that have reached their table limit.
74
77
  * Compact the immutable table if it contains any sorted values (it might be empty).
78
+ * Acquire reservations from the Free Set for all blocks (upper-bound) that will be written
79
+ during this half-bar.
75
80
 
76
81
  4. Second half-bar, last beat:
77
82
  * Finish ticking any incomplete odd-level and immutable table compactions.
@@ -79,6 +84,7 @@ Invariants:
79
84
  * Assert on callback completion that no level's table count overflows.
80
85
  * Flush, clear, and sort mutable table values into immutable table for next bar.
81
86
  * Remove input tables that are invisible to all current and persisted snapshots.
87
+ * Release reservations from the Free Set.
82
88
 
83
89
  ### Compaction Selection Policy
84
90
 
@@ -212,3 +218,91 @@ At the end of the last beat of the compaction bar (`23`):
212
218
  - Updates from ops `0→B` (`0…15`) are on disk.
213
219
  - Updates from ops `B→C` (`16…23`) are moved from the mutable table to the immutable table.
214
220
  - `tree.lookup_snapshot_max` is `x` when committing op `x` (for `x ∈ {24,25,…}`).
221
+
222
+
223
+ ## Manifest
224
+
225
+ The manifest is a tree's index of table locations and metadata.
226
+ (Not to be confused with the [SuperBlock Manifest](../vsr/README.md#manifest)).
227
+
228
+ Each manifest has two components:
229
+ - a single [`ManifestLog`](#manifest-log) shared by all levels, and
230
+ - one [`ManifestLevel`](#manifest-level) for each on-disk level.
231
+
232
+ ### Manifest Log
233
+
234
+ The manifest log is an on-disk log of all updates to the tree's table index.
235
+
236
+ The manifest log tracks:
237
+
238
+ - tables created as compaction output
239
+ - tables updated as compaction input (modifying their `snapshot_max`)
240
+ - tables moved between levels by compaction
241
+ - tables deleted after compaction
242
+
243
+ Updates are accumulated in-memory before being flushed:
244
+
245
+ - incrementally during compaction, or
246
+ - in their entirety during checkpoint.
247
+
248
+ The manifest log is periodically compacted to remove older entries that have been superseded by
249
+ newer entries. For example, if a table is created and later deleted, manifest log compaction
250
+ will eventually remove any reference to the table from the log blocks.
251
+
252
+ All manifest log blocks are tracked in the superblock manifest.
253
+
254
+ ### Manifest Level
255
+
256
+ A `ManifestLevel` is an in-memory collection of the table metadata for a single level of a tree.
257
+
258
+ For a given level and snapshot, there may be gaps in the key ranges of the visible tables,
259
+ but the key ranges are disjoint.
260
+
261
+ Manifest levels are queried for tables at a target snapshot and within a key range.
262
+
263
+ #### Example
264
+
265
+ Given the `ManifestLevel` tables (with values chosen for visualization, not realism):
266
+
267
+        label   A   B   C   D   E   F   G   H   I   J   K   L   M
268
+      key_min   0   4  12  16   4   8  12  26   4  25   4  16  24
269
+      key_max   3  11  15  19   7  11  15  27   7  27  11  19  27
270
+ snapshot_min   1   1   1   1   3   3   3   3   5   5   7   7   7
271
+ snapshot_max   9   3   3   7   5   7   9   5   7   7   9   9   9
272
+
273
+ A level's tables can be visualized in 2D as a partitioned rectangle:
274
+
275
+  0           1       2
276
+  0   4   8   2   6   0   4   8
277
+ 9┌───┬───────┬───┬───┬───┬───┐
278
+  │   │   K   │   │ L │###│ M │
279
+ 7│   ├───┬───┤   ├───┤###└┬──┤
280
+  │   │ I │   │ G │   │####│ J│
281
+ 5│ A ├───┤ F │   │   │####└┬─┤
282
+  │   │ E │   │   │ D │#####│H│
283
+ 3│   ├───┴───┼───┤   │#####└─┤
284
+  │   │   B   │ C │   │#######│
285
+ 1└───┴───────┴───┴───┴───────┘
286
+
287
+ Example iterations:
288
+
289
+ visibility  snapshots  direction   key_min  key_max  tables
290
+ visible     2          ascending   0        28       A, B, C, D
291
+ visible     4          ascending   0        28       A, E, F, G, D, H
292
+ visible     6          descending  12       28       J, D, G
293
+ visible     8          ascending   0        28       A, K, G, L, M
294
+ invisible   2, 4, 6    ascending   0        28       K, L, M
295
+
296
+ Legend:
297
+
298
+ - `#` represents a gap — no tables cover these keys during the snapshot.
299
+ - The horizontal axis represents the key range.
300
+ - The vertical axis represents the snapshot range.
301
+ - Each rectangle is a table within the manifest level.
302
+ - The sides of each rectangle depict:
303
+ - left: `table.key_min` (the diagram is inclusive, and the `table.key_min` is inclusive)
304
+ - right: `table.key_max` (the diagram is EXCLUSIVE, but the `table.key_max` is INCLUSIVE)
305
+ - bottom: `table.snapshot_min` (inclusive)
306
+ - top: `table.snapshot_max` (inclusive)
307
+ - (Not depicted: tables may have `table.key_min == table.key_max`.)
308
+ - (Not depicted: the newest set of tables would have `table.snapshot_max == maxInt(u64)`.)
@@ -85,7 +85,7 @@ pub fn CompactionType(
85
85
  );
86
86
 
87
87
  const MergeStreamSelector = struct {
88
- fn peek(compaction: *const Compaction, stream_id: u32) error{Empty, Drained}!Key {
88
+ fn peek(compaction: *const Compaction, stream_id: u32) error{ Empty, Drained }!Key {
89
89
  return switch (stream_id) {
90
90
  0 => compaction.iterator_a.peek(),
91
91
  1 => compaction.iterator_b.peek(),
@@ -121,6 +121,7 @@ pub fn CompactionType(
121
121
  };
122
122
 
123
123
  grid: *Grid,
124
+ grid_reservation: Grid.Reservation,
124
125
  range: Manifest.CompactionRange,
125
126
 
126
127
  /// `op_min` is the first op/beat of this compaction's half-bar.
@@ -152,6 +153,8 @@ pub fn CompactionType(
152
153
  level_b: u8,
153
154
  level_a_input: ?TableInfo,
154
155
 
156
+ tables_output_count: usize = 0,
157
+
155
158
  pub fn init(allocator: mem.Allocator) !Compaction {
156
159
  var iterator_a = try IteratorA.init(allocator);
157
160
  errdefer iterator_a.deinit(allocator);
@@ -165,6 +168,7 @@ pub fn CompactionType(
165
168
  return Compaction{
166
169
  // Assigned by start()
167
170
  .grid = undefined,
171
+ .grid_reservation = undefined,
168
172
  .range = undefined,
169
173
  .op_min = undefined,
170
174
  .drop_tombstones = undefined,
@@ -227,6 +231,20 @@ pub fn CompactionType(
227
231
 
228
232
  compaction.* = .{
229
233
  .grid = grid,
234
+ // Reserve enough blocks to write our output tables in the worst case, where:
235
+ // - no tombstones are dropped,
236
+ // - no values are overwritten,
237
+ // - and all tables are full.
238
+ //
239
+ // We must reserve before doing any async work so that the block acquisition order
240
+ // is deterministic (relative to other concurrent compactions).
241
+ // TODO The replica must stop accepting requests if it runs out of blocks/capacity,
242
+ // rather than panicking here.
243
+ // TODO(Compaction Pacing): Reserve smaller increments, at the start of each beat.
244
+ // (And likewise release the reservation at the end of each beat, instead of at the
245
+ // end of each half-bar).
246
+ // TODO(Move Table) Don't reserve these when we just move the table to the next level.
247
+ .grid_reservation = grid.reserve(range.table_count * Table.block_count_max).?,
230
248
  .range = range,
231
249
  .op_min = op_min,
232
250
  .drop_tombstones = drop_tombstones,
@@ -303,12 +321,12 @@ pub fn CompactionType(
303
321
  // This is safe; iterator_b makes a copy of the block before calling us.
304
322
  const grid = compaction.grid;
305
323
  for (Table.index_data_addresses_used(index_block)) |address| {
306
- grid.release_at_checkpoint(address);
324
+ grid.release(address);
307
325
  }
308
326
  for (Table.index_filter_addresses_used(index_block)) |address| {
309
- grid.release_at_checkpoint(address);
327
+ grid.release(address);
310
328
  }
311
- grid.release_at_checkpoint(Table.index_block_address(index_block));
329
+ grid.release(Table.index_block_address(index_block));
312
330
  }
313
331
 
314
332
  pub fn compact_tick(compaction: *Compaction, callback: Callback) void {
@@ -441,7 +459,7 @@ pub fn CompactionType(
441
459
  {
442
460
  compaction.table_builder.data_block_finish(.{
443
461
  .cluster = compaction.grid.superblock.working.cluster,
444
- .address = compaction.grid.acquire(),
462
+ .address = compaction.grid.acquire(compaction.grid_reservation),
445
463
  });
446
464
 
447
465
  // Mark the finished data block as writable for the next compact_tick() call.
@@ -457,7 +475,7 @@ pub fn CompactionType(
457
475
  {
458
476
  compaction.table_builder.filter_block_finish(.{
459
477
  .cluster = compaction.grid.superblock.working.cluster,
460
- .address = compaction.grid.acquire(),
478
+ .address = compaction.grid.acquire(compaction.grid_reservation),
461
479
  });
462
480
 
463
481
  // Mark the finished filter block as writable for the next compact_tick() call.
@@ -473,7 +491,7 @@ pub fn CompactionType(
473
491
  {
474
492
  const table = compaction.table_builder.index_block_finish(.{
475
493
  .cluster = compaction.grid.superblock.working.cluster,
476
- .address = compaction.grid.acquire(),
494
+ .address = compaction.grid.acquire(compaction.grid_reservation),
477
495
  .snapshot_min = snapshot_min_for_table_output(compaction.op_min),
478
496
  // TODO(Persistent Snapshots) set snapshot_max to the minimum snapshot_max of
479
497
  // all the (original) input tables.
@@ -484,6 +502,9 @@ pub fn CompactionType(
484
502
  compaction.index.block = compaction.table_builder.index_block;
485
503
  assert(!compaction.index.writable);
486
504
  compaction.index.writable = true;
505
+
506
+ compaction.tables_output_count += 1;
507
+ assert(compaction.tables_output_count <= compaction.range.table_count);
487
508
  }
488
509
  }
489
510
 
@@ -540,6 +561,10 @@ pub fn CompactionType(
540
561
  assert(compaction.io_pending == 0);
541
562
  assert(compaction.merge_done);
542
563
 
564
+ // TODO(Beat Pacing) This should really be where the compaction callback is invoked,
565
+ // but currently that can occur multiple times per beat.
566
+ compaction.grid.forfeit(compaction.grid_reservation);
567
+
543
568
  compaction.status = .idle;
544
569
  compaction.merge_done = false;
545
570
  }
@@ -2,9 +2,9 @@ const std = @import("std");
2
2
  const assert = std.debug.assert;
3
3
  const math = std.math;
4
4
 
5
- const binary_search = @import("./binary_search.zig").binary_search;
5
+ const binary_search_keys_raw = @import("./binary_search.zig").binary_search_keys_raw;
6
+ const binary_search_values_raw = @import("./binary_search.zig").binary_search_values_raw;
6
7
  const eytzinger = @import("./eytzinger.zig").eytzinger;
7
- const perf = @import("./benchmarks/perf.zig");
8
8
 
9
9
  const GiB = 1 << 30;
10
10
  const searches = 500_000;
@@ -53,7 +53,7 @@ pub fn main() !void {
53
53
  defer arena.deinit();
54
54
 
55
55
  const blob_size = GiB;
56
- var blob = try arena.allocator.alloc(u8, blob_size);
56
+ var blob = try arena.allocator().alloc(u8, blob_size);
57
57
 
58
58
  inline for (kv_types) |kv| {
59
59
  inline for (values_per_page) |values_count, v| {
@@ -65,13 +65,13 @@ pub fn main() !void {
65
65
  .keys_count = keys_count,
66
66
  .values_count = values_count,
67
67
  .searches = searches,
68
- }, blob, &prng.random);
68
+ }, blob, prng.random());
69
69
  }
70
70
  }
71
71
  }
72
72
  }
73
73
 
74
- fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random) !void {
74
+ fn run_benchmark(comptime layout: Layout, blob: []u8, random: std.rand.Random) !void {
75
75
  assert(blob.len == layout.blob_size);
76
76
  const Eytzinger = eytzinger(layout.keys_count - 1, layout.values_count);
77
77
  const V = Value(layout);
@@ -88,7 +88,7 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
88
88
 
89
89
  // Generate 1GiB worth of 24KiB pages.
90
90
  var blob_alloc = std.heap.FixedBufferAllocator.init(blob);
91
- var pages = try blob_alloc.allocator.alloc(Page, page_count);
91
+ var pages = try blob_alloc.allocator().alloc(Page, page_count);
92
92
  random.bytes(std.mem.sliceAsBytes(pages));
93
93
  for (pages) |*page| {
94
94
  for (page.values) |*value, i| value.key = i;
@@ -105,7 +105,17 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
105
105
  const target = value_picker[v % value_picker.len];
106
106
  const page = &pages[page_index];
107
107
  const bounds = Eytzinger.search_values(K, V, V.key_compare, &page.keys, &page.values, target);
108
- const hit = bounds[binary_search(K, V, V.key_from_value, V.key_compare, bounds, target, .{})];
108
+ const hit = bounds[
109
+ binary_search_values_raw(
110
+ K,
111
+ V,
112
+ V.key_from_value,
113
+ V.key_compare,
114
+ bounds,
115
+ target,
116
+ .{},
117
+ )
118
+ ];
109
119
 
110
120
  assert(hit.key == target);
111
121
  if (i % pages.len == 0) v += 1;
@@ -136,7 +146,17 @@ fn run_benchmark(comptime layout: Layout, blob: []u8, random: *std.rand.Random)
136
146
  while (i < layout.searches) : (i += 1) {
137
147
  const target = value_picker[v % value_picker.len];
138
148
  const page = &pages[page_picker[i % page_picker.len]];
139
- const hit = page.values[binary_search(K, V, V.key_from_value, V.key_compare, page.values[0..], target, .{})];
149
+ const hit = page.values[
150
+ binary_search_values_raw(
151
+ K,
152
+ V,
153
+ V.key_from_value,
154
+ V.key_compare,
155
+ page.values[0..],
156
+ target,
157
+ .{},
158
+ )
159
+ ];
140
160
 
141
161
  assert(hit.key == target);
142
162
  if (i % pages.len == 0) v += 1;
@@ -182,14 +202,10 @@ fn Value(comptime layout: Layout) type {
182
202
  assert(@sizeOf(Self) == layout.value_size);
183
203
  }
184
204
 
185
- inline fn key_from_value(self: Self) Key {
205
+ inline fn key_from_value(self: *const Self) Key {
186
206
  return self.key;
187
207
  }
188
208
 
189
- inline fn key_from_key(x: Key) Key {
190
- return x;
191
- }
192
-
193
209
  inline fn key_compare(a: Key, b: Key) math.Order {
194
210
  return math.order(a, b);
195
211
  }
@@ -206,9 +222,7 @@ const BenchmarkResult = struct {
206
222
  branch_misses: usize,
207
223
  };
208
224
 
209
- const PERF = perf.PERF;
210
- const perf_event_attr = perf.perf_event_attr;
211
- const perf_event_open = perf.perf_event_open;
225
+ const PERF = std.os.linux.PERF;
212
226
  const perf_counters = [_]PERF.COUNT.HW{
213
227
  PERF.COUNT.HW.CPU_CYCLES,
214
228
  PERF.COUNT.HW.INSTRUCTIONS,
@@ -223,10 +237,9 @@ const Benchmark = struct {
223
237
  perf_fds: [perf_counters.len]std.os.fd_t,
224
238
 
225
239
  fn begin() !Benchmark {
226
- const flags = PERF.FLAG.FD_NO_GROUP;
227
240
  var perf_fds = [1]std.os.fd_t{-1} ** perf_counters.len;
228
241
  for (perf_counters) |counter, i| {
229
- var attr: perf_event_attr = .{
242
+ var attr: std.os.linux.perf_event_attr = .{
230
243
  .type = PERF.TYPE.HARDWARE,
231
244
  .config = @enumToInt(counter),
232
245
  .flags = .{
@@ -235,7 +248,7 @@ const Benchmark = struct {
235
248
  .exclude_hv = true,
236
249
  },
237
250
  };
238
- perf_fds[i] = try perf_event_open(&attr, 0, -1, perf_fds[0], PERF.FLAG.FD_CLOEXEC);
251
+ perf_fds[i] = try std.os.perf_event_open(&attr, 0, -1, perf_fds[0], PERF.FLAG.FD_CLOEXEC);
239
252
  }
240
253
  const err = std.os.linux.ioctl(perf_fds[0], PERF.EVENT_IOC.ENABLE, PERF.IOC_FLAG_GROUP);
241
254
  if (err == -1) return error.Unexpected;
@@ -274,7 +287,7 @@ const Benchmark = struct {
274
287
  };
275
288
 
276
289
  // shuffle([0,1,…,n-1])
277
- fn shuffled_index(comptime n: usize, rand: *std.rand.Random) [n]usize {
290
+ fn shuffled_index(comptime n: usize, rand: std.rand.Random) [n]usize {
278
291
  var indices: [n]usize = undefined;
279
292
  for (indices) |*i, j| i.* = j;
280
293
  rand.shuffle(usize, indices[0..]);
@@ -307,7 +320,7 @@ fn binary_search_keys(
307
320
  assert(keys.len == layout.keys_count);
308
321
  assert(values.len == layout.values_count);
309
322
 
310
- const key_index = binary_search(Key, Key, V.key_from_key, compare_keys, keys, key, .{});
323
+ const key_index = binary_search_keys_raw(Key, compare_keys, keys, key, .{});
311
324
  const key_stride = layout.values_count / layout.keys_count;
312
325
  const high = key_index * key_stride;
313
326
  if (key_index < keys.len and keys[key_index] == key) {
@@ -38,7 +38,10 @@ const Environment = struct {
38
38
  const cluster = 32;
39
39
  const replica = 4;
40
40
  // TODO Is this appropriate for the number of fuzz_ops we want to run?
41
- const size_max = vsr.Zone.superblock.size().? + vsr.Zone.wal.size().? + 1024 * 1024 * 1024;
41
+ const size_max = vsr.Zone.superblock.size().? +
42
+ vsr.Zone.wal_headers.size().? +
43
+ vsr.Zone.wal_prepares.size().? +
44
+ 1024 * 1024 * 1024;
42
45
 
43
46
  const node_count = 1024;
44
47
  // This is the smallest size that set_associative_cache will allow us.
@@ -80,6 +83,7 @@ const Environment = struct {
80
83
  forest: Forest,
81
84
  // We need @fieldParentPtr() of forest, so we can't use an optional Forest.
82
85
  forest_exists: bool,
86
+ checkpoint_op: ?u64 = null,
83
87
 
84
88
  fn init(env: *Environment, storage: *Storage) !void {
85
89
  env.state = .uninit;
@@ -184,7 +188,8 @@ const Environment = struct {
184
188
  env.change_state(.forest_compacting, .forest_open);
185
189
  }
186
190
 
187
- pub fn checkpoint(env: *Environment) void {
191
+ pub fn checkpoint(env: *Environment, op: u64) void {
192
+ env.checkpoint_op = op - config.lsm_batch_multiple;
188
193
  env.change_state(.forest_open, .forest_checkpointing);
189
194
  env.forest.checkpoint(forest_checkpoint_callback);
190
195
  env.tick_until_state_change(.forest_checkpointing, .superblock_checkpointing);
@@ -194,7 +199,14 @@ const Environment = struct {
194
199
  fn forest_checkpoint_callback(forest: *Forest) void {
195
200
  const env = @fieldParentPtr(@This(), "forest", forest);
196
201
  env.change_state(.forest_checkpointing, .superblock_checkpointing);
197
- env.superblock.checkpoint(superblock_checkpoint_callback, &env.superblock_context);
202
+ env.superblock.checkpoint(superblock_checkpoint_callback, &env.superblock_context, .{
203
+ .commit_min_checksum = env.superblock.working.vsr_state.commit_min_checksum + 1,
204
+ .commit_min = env.checkpoint_op.?,
205
+ .commit_max = env.checkpoint_op.? + 1,
206
+ .view_normal = 0,
207
+ .view = 0,
208
+ });
209
+ env.checkpoint_op = null;
198
210
  }
199
211
 
200
212
  fn superblock_checkpoint_callback(superblock_context: *SuperBlock.Context) void {
@@ -236,17 +248,17 @@ const Environment = struct {
236
248
 
237
249
  for (fuzz_ops) |fuzz_op, fuzz_op_index| {
238
250
  log.debug("Running fuzz_ops[{}/{}] == {}", .{ fuzz_op_index, fuzz_ops.len, fuzz_op });
239
- //TODO(@djg) Restore these when dj-vopr-workload merges.
240
- //const storage_size_used = storage.size_used();
241
- //log.debug("storage.size_used = {}/{}", .{ storage_size_used, storage.size });
242
- //const model_size = model.count() * @sizeOf(Account);
243
- //log.debug("space_amplification = {d:.2}", .{@intToFloat(f64, storage_size_used) / @intToFloat(f64, model_size)});
251
+ const storage_size_used = storage.size_used();
252
+ log.debug("storage.size_used = {}/{}", .{ storage_size_used, storage.size });
253
+ const model_size = model.count() * @sizeOf(Account);
254
+ log.debug("space_amplification = {d:.2}", .{
255
+ @intToFloat(f64, storage_size_used) / @intToFloat(f64, model_size),
256
+ });
244
257
  // Apply fuzz_op to the forest and the model.
245
258
  switch (fuzz_op) {
246
259
  .compact => |compact| {
247
260
  env.compact(compact.op);
248
- if (compact.checkpoint)
249
- env.checkpoint();
261
+ if (compact.checkpoint) env.checkpoint(compact.op);
250
262
  },
251
263
  .put_account => |account| {
252
264
  env.forest.grooves.accounts.put(&account);
@@ -274,27 +286,9 @@ const Environment = struct {
274
286
  }
275
287
  };
276
288
 
277
- pub fn run_fuzz_ops(fuzz_ops: []const FuzzOp) !void {
289
+ pub fn run_fuzz_ops(storage_options: Storage.Options, fuzz_ops: []const FuzzOp) !void {
278
290
  // Init mocked storage.
279
- var storage = try Storage.init(
280
- allocator,
281
- Environment.size_max,
282
- Storage.Options{
283
- // We don't apply storage faults yet, so this seed doesn't matter.
284
- .seed = 0xdeadbeef,
285
- .read_latency_min = 0,
286
- .read_latency_mean = 0,
287
- .write_latency_min = 0,
288
- .write_latency_mean = 0,
289
- .read_fault_probability = 0,
290
- .write_fault_probability = 0,
291
- },
292
- 0,
293
- .{
294
- .first_offset = 0,
295
- .period = 0,
296
- },
297
- );
291
+ var storage = try Storage.init(allocator, Environment.size_max, storage_options);
298
292
  defer storage.deinit(allocator);
299
293
 
300
294
  try Environment.format(&storage);
@@ -354,6 +348,7 @@ pub fn generate_fuzz_ops(random: std.rand.Random) ![]const FuzzOp {
354
348
  const checkpoint =
355
349
  // Can only checkpoint on the last beat of the bar.
356
350
  compact_op % config.lsm_batch_multiple == config.lsm_batch_multiple - 1 and
351
+ compact_op > config.lsm_batch_multiple and
357
352
  // Checkpoint at roughly the same rate as log wraparound.
358
353
  random.uintLessThan(usize, Environment.compacts_per_checkpoint) == 0;
359
354
  break :compact FuzzOp{
@@ -402,11 +397,18 @@ pub fn generate_fuzz_ops(random: std.rand.Random) ![]const FuzzOp {
402
397
  pub fn main() !void {
403
398
  const fuzz_args = try fuzz.parse_fuzz_args(allocator);
404
399
  var rng = std.rand.DefaultPrng.init(fuzz_args.seed);
400
+ const random = rng.random();
405
401
 
406
- const fuzz_ops = try generate_fuzz_ops(rng.random());
402
+ const fuzz_ops = try generate_fuzz_ops(random);
407
403
  defer allocator.free(fuzz_ops);
408
404
 
409
- try run_fuzz_ops(fuzz_ops);
405
+ try run_fuzz_ops(Storage.Options{
406
+ .seed = random.int(u64),
407
+ .read_latency_min = 0,
408
+ .read_latency_mean = 0 + fuzz.random_int_exponential(random, u64, 20),
409
+ .write_latency_min = 0,
410
+ .write_latency_mean = 0 + fuzz.random_int_exponential(random, u64, 20),
411
+ }, fuzz_ops);
410
412
 
411
413
  log.info("Passed!", .{});
412
414
  }