tigerbeetle 0.0.34 → 0.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/tb_client/extconf.rb +13 -13
- data/ext/tb_client/tigerbeetle/LICENSE +177 -0
- data/ext/tb_client/tigerbeetle/build.zig +2327 -0
- data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
- data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
- data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
- data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
- data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
- data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
- data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
- data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
- data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
- data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
- data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
- data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
- data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
- data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
- data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
- data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
- data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
- data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
- data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
- data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
- data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
- data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
- data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
- data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
- data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
- data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
- data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
- data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
- data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
- data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
- data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
- data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
- data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
- data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
- data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
- data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
- data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
- data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
- data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
- data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
- data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
- data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
- data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
- data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
- data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
- data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
- data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
- data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
- data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
- data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
- data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
- data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
- data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
- data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
- data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
- data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
- data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
- data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
- data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
- data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
- data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
- data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
- data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
- data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
- data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
- data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
- data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
- data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
- data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
- data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
- data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
- data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
- data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
- data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
- data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
- data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
- data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
- data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
- data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
- data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
- data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
- data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
- data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
- data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
- data/lib/tb_client/shared_lib.rb +12 -5
- data/lib/tigerbeetle/client.rb +1 -1
- data/lib/tigerbeetle/platforms.rb +9 -0
- data/lib/tigerbeetle/version.rb +2 -2
- data/tigerbeetle.gemspec +22 -5
- metadata +242 -3
- data/ext/tb_client/pkg.tar.gz +0 -0
|
@@ -0,0 +1,2586 @@
|
|
|
1
|
+
const std = @import("std");
|
|
2
|
+
const Allocator = std.mem.Allocator;
|
|
3
|
+
const assert = std.debug.assert;
|
|
4
|
+
const maybe = stdx.maybe;
|
|
5
|
+
|
|
6
|
+
const constants = @import("../constants.zig");
|
|
7
|
+
|
|
8
|
+
const Message = @import("../message_pool.zig").MessagePool.Message;
|
|
9
|
+
const stdx = @import("stdx");
|
|
10
|
+
const vsr = @import("../vsr.zig");
|
|
11
|
+
const Header = vsr.Header;
|
|
12
|
+
const IOPSType = @import("../iops.zig").IOPSType;
|
|
13
|
+
|
|
14
|
+
const log = std.log.scoped(.journal);
|
|
15
|
+
|
|
16
|
+
/// The WAL consists of two contiguous circular buffers on disk:
|
|
17
|
+
/// - `vsr.Zone.wal_headers`
|
|
18
|
+
/// - `vsr.Zone.wal_prepares`
|
|
19
|
+
///
|
|
20
|
+
/// In each ring, the `op` for reserved headers is set to the corresponding slot index.
|
|
21
|
+
/// This helps WAL recovery detect misdirected reads/writes.
|
|
22
|
+
const Ring = enum {
    /// The ring holding the (redundant) headers of prepare messages.
    headers,
    /// The ring holding whole prepare messages; every slot is `constants.message_size_max` bytes.
    prepares,

    /// Computes the byte position of `slot`, measured from the first byte of `ring`.
    ///
    /// For `.headers` the position is passed through `vsr.sector_floor`, so several
    /// consecutive slots share one result (presumably the start of the sector that holds
    /// the slot's header -- confirm against `vsr.sector_floor`).
    /// For `.prepares` the position is `slot.index` whole messages into the ring.
    ///
    /// Asserts that `slot.index` is below `slot_count` and that the result lies inside the ring.
    inline fn offset(comptime ring: Ring, slot: Slot) u64 {
        assert(slot.index < slot_count);

        return switch (ring) {
            .headers => headers_position: {
                // A sector must hold a whole number of headers.
                comptime assert(constants.sector_size % @sizeOf(Header) == 0);

                const position = vsr.sector_floor(slot.index * @sizeOf(Header));
                assert(position < headers_size);
                break :headers_position position;
            },
            .prepares => prepares_position: {
                const position = constants.message_size_max * slot.index;
                assert(position < prepares_size);
                break :prepares_position position;
            },
        };
    }
};
|
|
46
|
+
|
|
47
|
+
const headers_per_sector = @divExact(constants.sector_size, @sizeOf(Header));
|
|
48
|
+
const headers_per_message = @divExact(constants.message_size_max, @sizeOf(Header));
|
|
49
|
+
comptime {
|
|
50
|
+
assert(headers_per_sector > 0);
|
|
51
|
+
assert(headers_per_message > 0);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// A slot is an index within:
|
|
55
|
+
///
|
|
56
|
+
/// - the on-disk headers ring
|
|
57
|
+
/// - the on-disk prepares ring
|
|
58
|
+
/// - `journal.headers`
|
|
59
|
+
/// - `journal.headers_redundant`
|
|
60
|
+
/// - `journal.dirty`
|
|
61
|
+
/// - `journal.faulty`
|
|
62
|
+
///
|
|
63
|
+
/// A header's slot is `header.op % constants.journal_slot_count`.
|
|
64
|
+
const Slot = struct {
    index: usize,
};

/// A non-empty span of slots that includes both of its endpoints.
pub const SlotRange = struct {
    head: Slot,
    tail: Slot,

    /// Reports whether `slot` lies inside this inclusive range.
    ///
    /// * `head < tail`: the range is contiguous and `slot` must lie in `head..=tail`.
    /// * `head > tail`: the range wraps around, so `slot` is included when it is at or
    ///   after `head`, or at or before `tail`.
    /// * `head == tail`: not handled here; this asserts. The caller must deal with that
    ///   case before calling.
    pub fn contains(range: *const SlotRange, slot: Slot) bool {
        const head = range.head.index;
        const tail = range.tail.index;
        const index = slot.index;

        // A range whose endpoints coincide is rejected rather than interpreted, to avoid
        // confusion about what it would mean.
        assert(head != tail);

        const at_or_after_head = head <= index;
        const at_or_before_tail = index <= tail;

        // With `head != tail` established, the only alternative to `head < tail` is the
        // wrapped case `head > tail`.
        return if (head < tail)
            at_or_after_head and at_or_before_tail
        else
            at_or_before_tail or at_or_after_head;
    }
};
|
|
91
|
+
|
|
92
|
+
const slot_count = constants.journal_slot_count;
|
|
93
|
+
const headers_size = constants.journal_size_headers;
|
|
94
|
+
const prepares_size = constants.journal_size_prepares;
|
|
95
|
+
|
|
96
|
+
pub const write_ahead_log_zone_size = headers_size + prepares_size;
|
|
97
|
+
|
|
98
|
+
/// Limit on the number of repair reads.
|
|
99
|
+
/// This keeps some reads available for commit path, so that an asymmetrically
|
|
100
|
+
/// partitioned replica cannot starve the cluster with request_prepare messages.
|
|
101
|
+
const reads_repair_count_max: u6 = constants.journal_iops_read_max - reads_commit_count_max;
|
|
102
|
+
/// We need at most two reads on commit path: one for commit_journal, and one for
|
|
103
|
+
/// primary_repair_pipeline_read.
|
|
104
|
+
const reads_commit_count_max: u6 = 2;
|
|
105
|
+
|
|
106
|
+
comptime {
    // Slot count: non-zero, even, and a whole number of header sectors.
    assert(slot_count > 0);
    assert(@rem(slot_count, 2) == 0);
    assert(@rem(slot_count, headers_per_sector) == 0);
    assert(slot_count >= headers_per_sector);
    // The prepare pipeline length bounds how many ops can be reordered during a view
    // change, so the journal must have more slots than that.
    // See `recover_prepares_callback()` for more detail.
    assert(slot_count > constants.pipeline_prepare_queue_max);

    // Headers ring: non-empty and sector-aligned in size.
    assert(headers_size > 0);
    assert(@rem(headers_size, constants.sector_size) == 0);
    // The replica must never write every redundant header sector at the same time:
    // a crash could then tear all of those writes and leave the whole journal faulty.
    // Normally this holds simply because there are fewer journal writes available than
    // there are header sectors. The simulator has only two sectors worth of headers, so
    // it does not satisfy that; instead of adding simulator-only locking to the journal,
    // the simulator prevents correlated torn writes at runtime, and the check below is
    // applied to production configurations only.
    if (constants.config.is_production()) {
        const header_sector_count = @divExact(headers_size, constants.sector_size);
        assert(header_sector_count > constants.journal_iops_write_max);
    }

    // Prepares ring: non-empty, sector-aligned, and a whole number of messages.
    assert(prepares_size > 0);
    assert(@rem(prepares_size, constants.sector_size) == 0);
    assert(@rem(prepares_size, constants.message_size_max) == 0);

    // The read IOPS are split exactly between the repair path and the commit path.
    assert(reads_repair_count_max > 0);
    assert(constants.journal_iops_read_max == reads_repair_count_max + reads_commit_count_max);
}
|
|
136
|
+
|
|
137
|
+
pub fn JournalType(comptime Replica: type, comptime Storage: type) type {
|
|
138
|
+
return struct {
|
|
139
|
+
const Journal = @This();
|
|
140
|
+
const Sector = *align(constants.sector_size) [constants.sector_size]u8;
|
|
141
|
+
|
|
142
|
+
const Status = union(enum) {
|
|
143
|
+
init: void,
|
|
144
|
+
recovering: *const fn (journal: *Journal) void,
|
|
145
|
+
recovered: void,
|
|
146
|
+
};
|
|
147
|
+
|
|
148
|
+
pub const Read = struct {
|
|
149
|
+
journal: *Journal,
|
|
150
|
+
completion: Storage.Read,
|
|
151
|
+
message: *Message.Prepare,
|
|
152
|
+
options: Options,
|
|
153
|
+
callback: Callback,
|
|
154
|
+
|
|
155
|
+
pub const Options = struct {
|
|
156
|
+
op: u64,
|
|
157
|
+
checksum: u128,
|
|
158
|
+
destination_replica: ?u8 = null,
|
|
159
|
+
};
|
|
160
|
+
|
|
161
|
+
const Callback = *const fn (
|
|
162
|
+
replica: *Replica,
|
|
163
|
+
prepare: ?*Message.Prepare,
|
|
164
|
+
options: Options,
|
|
165
|
+
) void;
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
pub const Write = struct {
|
|
169
|
+
journal: *Journal,
|
|
170
|
+
callback: *const fn (
|
|
171
|
+
replica: *Replica,
|
|
172
|
+
wrote: ?*Message.Prepare,
|
|
173
|
+
) void,
|
|
174
|
+
|
|
175
|
+
message: *Message.Prepare,
|
|
176
|
+
|
|
177
|
+
/// This is reset to undefined and reused for each Storage.write_sectors() call.
|
|
178
|
+
range: Range,
|
|
179
|
+
};
|
|
180
|
+
|
|
181
|
+
/// State that needs to be persisted while waiting for an overlapping
|
|
182
|
+
/// concurrent write to complete. This is a range on the physical disk.
|
|
183
|
+
const Range = struct {
|
|
184
|
+
completion: Storage.Write,
|
|
185
|
+
callback: *const fn (write: *Journal.Write) void,
|
|
186
|
+
buffer: []const u8,
|
|
187
|
+
ring: Ring,
|
|
188
|
+
/// Offset within the ring.
|
|
189
|
+
offset: u64,
|
|
190
|
+
|
|
191
|
+
/// If other writes are waiting on this write to proceed, they will
|
|
192
|
+
/// be queued up in this linked list.
|
|
193
|
+
next: ?*Range = null,
|
|
194
|
+
/// True if a Storage.write_sectors() operation is in progress for this buffer/offset.
|
|
195
|
+
locked: bool,
|
|
196
|
+
|
|
197
|
+
/// Reports whether `range` and `other` collide on disk.
///
/// Ranges on different rings never overlap. Otherwise, the range with the smaller
/// `offset` (or `other`, when the offsets are equal) overlaps the remaining one when
/// its end, `offset + buffer.len`, lies beyond the remaining range's `offset`.
fn overlaps(range: *const Range, other: *const Range) bool {
    if (range.ring != other.ring) return false;

    // Order the pair by starting offset, then test whether the earlier range extends
    // past the point where the later one begins.
    const range_starts_first = range.offset < other.offset;
    const first = if (range_starts_first) range else other;
    const second = if (range_starts_first) other else range;

    return first.offset + first.buffer.len > second.offset;
}
|
|
206
|
+
};
|
|
207
|
+
|
|
208
|
+
const HeaderChunks = stdx.BitSetType(stdx.div_ceil(slot_count, headers_per_message));
|
|
209
|
+
|
|
210
|
+
storage: *Storage,
|
|
211
|
+
replica: u8,
|
|
212
|
+
|
|
213
|
+
/// A header is located at `slot == header.op % headers.len`.
|
|
214
|
+
///
|
|
215
|
+
/// Each slot's `header.command` is either `prepare` or `reserved`.
|
|
216
|
+
/// When the slot's header is `reserved`, the header's `op` is the slot index.
|
|
217
|
+
///
|
|
218
|
+
/// During recovery, store the (unvalidated) headers of the prepare ring.
|
|
219
|
+
headers: []align(constants.sector_size) Header.Prepare,
|
|
220
|
+
|
|
221
|
+
/// Store headers whose prepares are on disk.
|
|
222
|
+
/// Redundant headers are updated after the corresponding prepare(s) are written,
|
|
223
|
+
/// whereas `headers` are updated beforehand.
|
|
224
|
+
///
|
|
225
|
+
/// Consider this example:
|
|
226
|
+
/// 1. Ops 6 and 7 arrive.
|
|
227
|
+
/// 2. The write of prepare 7 finishes (before prepare 6).
|
|
228
|
+
/// 3. Op 7 continues on to write the redundant headers.
|
|
229
|
+
/// Because prepare 6 is not yet written, header 6 is written as reserved.
|
|
230
|
+
/// 4. If at this point the replica crashes & restarts, slot 6 is in case `@L`
|
|
231
|
+
/// (decision=nil) which can be locally repaired.
|
|
232
|
+
/// In contrast, if op 6's prepare header was written in step 3, it would be case `@K`,
|
|
233
|
+
/// which requires remote repair.
|
|
234
|
+
///
|
|
235
|
+
/// During recovery, store the redundant (unvalidated) headers.
|
|
236
|
+
headers_redundant: []align(constants.sector_size) Header.Prepare,
|
|
237
|
+
|
|
238
|
+
/// We copy-on-write to these buffers, as the in-memory headers may change while writing.
|
|
239
|
+
/// The buffers belong to the IOP at the corresponding index in IOPS.
|
|
240
|
+
write_headers_sectors: *align(constants.sector_size) [
|
|
241
|
+
constants.journal_iops_write_max
|
|
242
|
+
][constants.sector_size]u8,
|
|
243
|
+
|
|
244
|
+
/// A set bit indicates a chunk of redundant headers for which a read has been issued.
|
|
245
|
+
header_chunks_requested: HeaderChunks = .{},
|
|
246
|
+
/// A set bit indicates a chunk of redundant headers that has been recovered.
|
|
247
|
+
header_chunks_recovered: HeaderChunks = .{},
|
|
248
|
+
|
|
249
|
+
/// Statically allocated read IO operation context data.
|
|
250
|
+
reads: IOPSType(Read, constants.journal_iops_read_max) = .{},
|
|
251
|
+
/// Count of reads currently acquired on the repair path.
|
|
252
|
+
reads_repair_count: u6 = 0,
|
|
253
|
+
/// Count of reads currently acquired on the commit path.
|
|
254
|
+
reads_commit_count: u6 = 0,
|
|
255
|
+
|
|
256
|
+
/// Statically allocated write IO operation context data.
|
|
257
|
+
///
|
|
258
|
+
/// Each acquired write in this list is either:
|
|
259
|
+
/// - executing (`write.range.locked`), or
|
|
260
|
+
/// - queued (`!write.range.locked`).
|
|
261
|
+
///
|
|
262
|
+
/// Invariants:
|
|
263
|
+
/// - When there are multiple Writes to the same location, only one of them is executing at
|
|
264
|
+
/// any time -- the others are queued behind it.
|
|
265
|
+
/// - There is at most one Write to a given slot at any time.
|
|
266
|
+
writes: IOPSType(Write, constants.journal_iops_write_max) = .{},
|
|
267
|
+
|
|
268
|
+
/// Whether an entry is in memory only and needs to be written or is being written:
|
|
269
|
+
/// We use this in the same sense as a dirty bit in the kernel page cache.
|
|
270
|
+
/// A dirty bit means that we have not prepared the entry, or need to repair a faulty entry.
|
|
271
|
+
dirty: BitSet,
|
|
272
|
+
|
|
273
|
+
/// Whether an entry was written to disk and this write was subsequently lost due to:
|
|
274
|
+
/// * corruption,
|
|
275
|
+
/// * a misdirected write (or a misdirected read, we do not distinguish), or else
|
|
276
|
+
/// * a latent sector error, where the sector can no longer be read.
|
|
277
|
+
/// A faulty bit means that we prepared and then lost the entry.
|
|
278
|
+
/// A faulty bit requires the dirty bit to also be set so that callers need not check both.
|
|
279
|
+
/// A faulty bit is used then only to qualify the severity of the dirty bit.
|
|
280
|
+
faulty: BitSet,
|
|
281
|
+
|
|
282
|
+
/// The checksum of the prepare in the corresponding slot.
|
|
283
|
+
/// This is used to respond to `request_prepare` messages even when the slot is faulty.
|
|
284
|
+
/// For example, the slot may be faulty because the redundant header is faulty.
|
|
285
|
+
///
|
|
286
|
+
/// The checksum will be missing (`prepare_checksums[i]=0`, `prepare_inhabited[i]=false`) when:
|
|
287
|
+
/// * the message in the slot is reserved,
|
|
288
|
+
/// * the message in the slot is being written, or when
|
|
289
|
+
/// * the message in the slot is corrupt.
|
|
290
|
+
// TODO: `prepare_checksums` and `prepare_inhabited` should be combined into a []?u128,
|
|
291
|
+
// but that type is currently unusable (as of Zig 0.9.1).
|
|
292
|
+
// See: https://github.com/ziglang/zig/issues/9871
|
|
293
|
+
prepare_checksums: []u128,
|
|
294
|
+
/// When prepare_inhabited[i]==false, prepare_checksums[i]==0.
|
|
295
|
+
/// (`undefined` would make more sense than `0`, but `0` allows it to be asserted).
|
|
296
|
+
prepare_inhabited: []bool,
|
|
297
|
+
|
|
298
|
+
status: Status = .init,
|
|
299
|
+
|
|
300
|
+
/// Allocates the in-memory state for a journal with `slot_count` slots.
///
/// On return every slot is marked both dirty and faulty, every entry of `prepare_checksums`
/// is `0`, every entry of `prepare_inhabited` is `false`, and the contents of `headers` and
/// `headers_redundant` are `undefined`.
/// If any allocation fails, the error is returned and the allocations made so far are
/// released by the `errdefer`s.
pub fn init(allocator: Allocator, storage: *Storage, replica: u8) !Journal {
    // TODO Fix this assertion:
    // assert(write_ahead_log_zone_size <= storage.size);

    // Both header arrays are sector-aligned and start out poisoned.
    const headers = try allocator.alignedAlloc(
        Header.Prepare,
        constants.sector_size,
        slot_count,
    );
    errdefer allocator.free(headers);
    @memset(headers, undefined);

    const headers_redundant = try allocator.alignedAlloc(
        Header.Prepare,
        constants.sector_size,
        slot_count,
    );
    errdefer allocator.free(headers_redundant);
    @memset(headers_redundant, undefined);

    // Every slot begins as dirty and faulty.
    var dirty = try BitSet.init_full(allocator, slot_count);
    errdefer dirty.deinit(allocator);

    var faulty = try BitSet.init_full(allocator, slot_count);
    errdefer faulty.deinit(allocator);

    // No prepare checksum is known yet: zero checksum, not inhabited.
    const prepare_checksums = try allocator.alloc(u128, slot_count);
    errdefer allocator.free(prepare_checksums);
    @memset(prepare_checksums, 0);

    const prepare_inhabited = try allocator.alloc(bool, slot_count);
    errdefer allocator.free(prepare_inhabited);
    @memset(prepare_inhabited, false);

    // One sector-sized, sector-aligned buffer per `journal_iops_write_max`.
    const sectors_allocation = try allocator.alignedAlloc(
        [constants.sector_size]u8,
        constants.sector_size,
        constants.journal_iops_write_max,
    );
    const write_headers_sectors = sectors_allocation[0..constants.journal_iops_write_max];
    errdefer allocator.free(write_headers_sectors);

    log.debug("{}: slot_count={} size={} headers_size={} prepares_size={}", .{
        replica,
        slot_count,
        std.fmt.fmtIntSizeBin(write_ahead_log_zone_size),
        std.fmt.fmtIntSizeBin(headers_size),
        std.fmt.fmtIntSizeBin(prepares_size),
    });

    var journal: Journal = .{
        .storage = storage,
        .replica = replica,
        .headers = headers,
        .headers_redundant = headers_redundant,
        .dirty = dirty,
        .faulty = faulty,
        .prepare_checksums = prepare_checksums,
        .prepare_inhabited = prepare_inhabited,
        .write_headers_sectors = write_headers_sectors,
    };

    // The headers array must begin on a sector boundary.
    assert(@mod(@intFromPtr(&journal.headers[0]), constants.sector_size) == 0);

    assert(journal.dirty.bits.bit_length == slot_count);
    assert(journal.dirty.count == slot_count);
    assert(journal.faulty.bits.bit_length == slot_count);
    assert(journal.faulty.count == slot_count);

    assert(journal.prepare_checksums.len == slot_count);
    assert(journal.prepare_inhabited.len == slot_count);

    // Poisoned headers must never pass for valid ones.
    for (journal.headers) |*h| assert(!h.valid_checksum());
    for (journal.headers_redundant) |*h| assert(!h.valid_checksum());

    return journal;
}
|
|
374
|
+
|
|
375
|
+
/// Releases everything allocated by `init()` using the same `allocator`, and drops the
/// message reference held by every entry still present in `journal.reads` and
/// `journal.writes`.
pub fn deinit(journal: *Journal, allocator: Allocator) void {
    // The journal is embedded in the replica as its `journal` field.
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    journal.dirty.deinit(allocator);
    journal.faulty.deinit(allocator);
    allocator.free(journal.headers);
    allocator.free(journal.headers_redundant);
    allocator.free(journal.write_headers_sectors);
    allocator.free(journal.prepare_checksums);
    allocator.free(journal.prepare_inhabited);

    var reads_iterator = journal.reads.iterate();
    while (reads_iterator.next()) |read| {
        replica.message_bus.unref(read.message);
    }

    var writes_iterator = journal.writes.iterate();
    while (writes_iterator.next()) |write| {
        replica.message_bus.unref(write.message);
    }
}
|
|
395
|
+
|
|
396
|
+
/// Maps `op` to its slot: the slot index is `op` modulo `slot_count`.
/// Says nothing about whether that slot currently holds `op`.
pub fn slot_for_op(_: *const Journal, op: u64) Slot {
    const index = op % slot_count;
    return .{ .index = index };
}
|
|
399
|
+
|
|
400
|
+
/// Returns the slot for `op`, but only if the in-memory header in that slot has exactly this
/// op (as per `header_with_op()`), else `null`.
pub fn slot_with_op(journal: *const Journal, op: u64) ?Slot {
    if (journal.header_with_op(op) == null) return null;
    return journal.slot_for_op(op);
}
|
|
407
|
+
|
|
408
|
+
/// Returns the slot for `op`, but only if the in-memory header in that slot matches both
/// `op` and `checksum` (as per `header_with_op_and_checksum()`), else `null`.
pub fn slot_with_op_and_checksum(journal: *const Journal, op: u64, checksum: u128) ?Slot {
    if (journal.header_with_op_and_checksum(op, checksum) == null) return null;
    return journal.slot_for_op(op);
}
|
|
415
|
+
|
|
416
|
+
/// Returns the slot that `header.op` maps to.
/// `header` must be a non-reserved prepare (asserted).
pub fn slot_for_header(journal: *const Journal, header: *const Header.Prepare) Slot {
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    const op = header.op;
    return journal.slot_for_op(op);
}
|
|
421
|
+
|
|
422
|
+
/// Returns the slot for `header`, but only if the journal holds an in-memory header with the
/// same op and checksum, else `null`.
/// `header` must be a non-reserved prepare (asserted).
pub fn slot_with_header(
    journal: *const Journal,
    header: *const Header.Prepare,
) ?Slot {
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    const op = header.op;
    const checksum = header.checksum;
    return journal.slot_with_op_and_checksum(op, checksum);
}
|
|
430
|
+
|
|
431
|
+
/// Returns whichever non-reserved header currently occupies the slot that `header.op` maps
/// to, or `null` if that slot is reserved.
/// The occupant is not required to have the same op as `header`: it may be older or newer.
/// `header` must be a non-reserved prepare (asserted).
pub fn header_for_prepare(
    journal: *const Journal,
    header: *const Header.Prepare,
) ?*const Header.Prepare {
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    const op = header.op;
    return journal.header_for_op(op);
}
|
|
441
|
+
|
|
442
|
+
/// Looks up the header occupying the slot that `op` maps to, indexing `journal.headers`
/// directly by slot so that no scan is needed.
/// Returns `null` if the slot holds a reserved header.
/// The returned header is not required to have the same op: it may be older or newer.
pub fn header_for_op(journal: *const Journal, op: u64) ?*const Header.Prepare {
    const slot = journal.slot_for_op(op);
    const occupant = &journal.headers[slot.index];
    assert(occupant.command == .prepare);

    if (occupant.operation != .reserved) {
        // Whatever op the occupant has, it must map to this same slot.
        assert(journal.slot_for_op(occupant.op).index == slot.index);
        return occupant;
    }

    // A reserved header carries its own slot index as its op.
    assert(occupant.op == slot.index);
    return null;
}
|
|
457
|
+
|
|
458
|
+
/// Returns the header in the slot that `op` maps to, but only if that header's op is exactly
/// `op`, else `null`.
/// Be careful: a `null` result does not mean the slot is empty, since a header for a
/// different op may still occupy it.
pub fn header_with_op(journal: *const Journal, op: u64) ?*const Header.Prepare {
    const occupant = journal.header_for_op(op) orelse return null;
    return if (occupant.op == op) occupant else null;
}
|
|
466
|
+
|
|
467
|
+
/// Same as `header_with_op()`, with the additional requirement that the header's checksum
/// equals `checksum`; returns `null` otherwise.
pub fn header_with_op_and_checksum(
    journal: *const Journal,
    op: u64,
    checksum: u128,
) ?*const Header.Prepare {
    const occupant = journal.header_with_op(op) orelse return null;
    assert(occupant.op == op);

    return if (occupant.checksum == checksum) occupant else null;
}
|
|
479
|
+
|
|
480
|
+
/// Returns the header occupying the slot of the op immediately before `header.op`
/// (as per `header_for_op()`), or `null` when `header.op` is 0.
pub fn previous_entry(
    journal: *const Journal,
    header: *const Header.Prepare,
) ?*const Header.Prepare {
    // Op 0 has no predecessor; this guard also keeps the subtraction from underflowing.
    if (header.op == 0) return null;

    const op_previous = header.op - 1;
    return journal.header_for_op(op_previous);
}
|
|
490
|
+
|
|
491
|
+
/// Returns the header occupying the slot of the op immediately after `header.op`
/// (as per `header_for_op()`).
pub fn next_entry(
    journal: *const Journal,
    header: *const Header.Prepare,
) ?*const Header.Prepare {
    const op_next = header.op + 1;
    return journal.header_for_op(op_next);
}
|
|
497
|
+
|
|
498
|
+
/// Returns the highest op among all non-reserved in-memory headers, without reference to the
/// checkpoint. Returns 0 if every header is reserved.
/// The journal must be recovered (asserted).
pub fn op_maximum(journal: *const Journal) u64 {
    assert(journal.status == .recovered);

    var op_highest: u64 = 0;
    for (journal.headers) |*header| {
        if (header.operation == .reserved) continue;
        op_highest = @max(op_highest, header.op);
    }
    return op_highest;
}
|
|
510
|
+
|
|
511
|
+
/// Returns the highest op among the untrusted headers, considering only headers that
/// `header_ok()` accepts for their slot and that are not reserved.
/// Returns 0 if there is no such header.
fn op_maximum_headers_untrusted(
    cluster: u128,
    headers_untrusted: []const Header.Prepare,
) u64 {
    var op_highest: u64 = 0;
    for (headers_untrusted, 0..) |*header_untrusted, slot_index| {
        const header = header_ok(
            cluster,
            Slot{ .index = slot_index },
            header_untrusted,
        ) orelse continue;

        if (header.operation == .reserved) continue;
        op_highest = @max(op_highest, header.op);
    }
    return op_highest;
}
|
|
527
|
+
|
|
528
|
+
/// Returns whether the journal holds an in-memory header with the same op and checksum as
/// `header`.
/// The journal must be recovered and `header` must be a non-reserved prepare (asserted).
pub fn has_header(journal: *const Journal, header: *const Header.Prepare) bool {
    assert(journal.status == .recovered);
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    return journal.header_with_op_and_checksum(header.op, header.checksum) != null;
}
|
|
539
|
+
|
|
540
|
+
/// Returns whether the journal holds an in-memory header matching `header`'s op and checksum
/// in a slot that is not dirty.
/// For such a slot, the prepare checksum must be inhabited and equal to `header.checksum`
/// (asserted).
pub fn has_prepare(journal: *const Journal, header: *const Header.Prepare) bool {
    const slot = journal.slot_with_op_and_checksum(
        header.op,
        header.checksum,
    ) orelse return false;

    if (journal.dirty.bit(slot)) return false;

    assert(journal.prepare_inhabited[slot.index]);
    assert(journal.prepare_checksums[slot.index] == header.checksum);
    return true;
}
|
|
550
|
+
|
|
551
|
+
/// Returns whether the journal holds an in-memory header matching `header` (as per
/// `has_header()`) whose slot is marked dirty.
pub fn has_dirty(journal: *const Journal, header: *const Header.Prepare) bool {
    if (!journal.has_header(header)) return false;

    // `has_header()` returned true, so the slot lookup cannot fail.
    const slot = journal.slot_with_header(header).?;
    return journal.dirty.bit(slot);
}
|
|
556
|
+
|
|
557
|
+
/// Copies the latest headers between `op_min` and `op_max` (both inclusive) into `dest`,
/// stopping as soon as `dest` is full.
/// Headers are copied in descending op order so that the latest headers come first, which
/// protects against the callsite slicing the buffer the wrong way, and which is required by
/// message handlers that use the hash chain for repairs.
/// Ops for which `header_with_op()` returns `null` (reserved slots, or slots holding another
/// op) are skipped, leaving no gap in `dest`.
/// `dest` is poisoned with `undefined` (not zeroed) before copying: only `dest[0..copied]`
/// is meaningful afterwards and the remainder must not be read.
/// Returns the number of headers actually copied.
/// The journal must be recovered, `op_min <= op_max`, and `dest` must be non-empty (asserted).
pub fn copy_latest_headers_between(
    journal: *const Journal,
    op_min: u64,
    op_max: u64,
    dest: []Header.Prepare,
) usize {
    assert(journal.status == .recovered);
    assert(op_min <= op_max);
    assert(dest.len > 0);

    var copied: usize = 0;
    // Poison all slots; only slots less than `copied` are used.
    @memset(dest, undefined);

    // Start at op_max + 1 and do the decrement upfront, so that the loop condition can be
    // checked before decrementing and `op` never underflows when op_min == 0:
    var op = op_max + 1;
    while (op > op_min) {
        op -= 1;

        if (journal.header_with_op(op)) |header| {
            dest[copied] = header.*;
            assert(dest[copied].invalid() == null);
            copied += 1;
            if (copied == dest.len) break;
        }
    }

    log.debug(
        "{}: copy_latest_headers_between: op_min={} op_max={} dest.len={} copied={}",
        .{
            journal.replica,
            op_min,
            op_max,
            dest.len,
            copied,
        },
    );

    return copied;
}
|
|
604
|
+
|
|
605
|
+
/// A range of ops, as returned by `find_latest_headers_break_between()`.
const HeaderRange = struct {
    op_min: u64,
    op_max: u64,
};
|
|
606
|
+
|
|
607
|
+
/// Finds the latest break in headers between `op_min` and `op_max` (both inclusive).
/// A break is a missing header or a header not connected to the next header by hash chain.
/// Once the highest break is found, the range is extended downwards as far as possible.
///
/// `op_max` (`replica.op`) is expected to exist (asserted); `op_min` may or may not exist.
///
/// The returned range never includes `op_max`, because that must be up to date as the
/// latest op, but it may include `op_min`.
/// Any op uncertainty must therefore be resolved first so that `op_max` can be trusted here.
///
/// For example: If ops 3, 9 and 10 are missing, returns: `{ .op_min = 9, .op_max = 10 }`.
///
/// Another example: If op 17 is disconnected from op 18, 16 is connected to 17, and 12-15
/// are missing, returns: `{ .op_min = 12, .op_max = 17 }`.
///
/// Returns `null` when no break is found.
pub fn find_latest_headers_break_between(
    journal: *const Journal,
    op_min: u64,
    op_max: u64,
) ?HeaderRange {
    assert(journal.status == .recovered);
    assert(journal.header_with_op(op_max) != null);
    assert(op_max >= op_min);
    assert(op_max - op_min + 1 <= slot_count);

    var range: ?HeaderRange = null;

    // `above` is the header found for `op + 1` on the previous iteration, if any.
    // It starts out null because only breaks below op_max are examined.
    var above: ?*const Header.Prepare = null;

    // Walk from op_max down to op_min, decrementing upfront so that `op` cannot underflow.
    var op = op_max + 1;
    while (op > op_min) {
        op -= 1;

        // The header in this op's slot, but only if its op is exactly `op`, else null:
        const here = journal.header_with_op(op);

        if (here) |a| {
            if (above) |b| {
                // Both headers came from header_with_op(), so their ops are adjacent, even
                // if a reordered header may carry an older view than its successor.
                assert(a.op + 1 == b.op);

                // a.view <= b.view is only asserted where the chain is intact, because
                // repair_header() may put a newer view to the left of an older view.
                const connected = a.checksum == b.parent;

                if (range) |*r| {
                    // A range is already open and currently ends at the header above.
                    assert(b.op == r.op_min);
                    if (a.op == op_min) {
                        // Callers pass `commit_min` as `op_min`, so this header is committed
                        // and cannot be a break: leave it out of the range.
                        break;
                    } else if (connected) {
                        // This header chains to the (disconnected) header above: include it.
                        assert(a.view <= b.view);
                        r.op_min = a.op;
                    } else if (a.view < b.view) {
                        // Not chained, and older than the header above: include it.
                        r.op_min = a.op;
                    } else if (a.view > b.view) {
                        // Not chained, but newer than the header above: close the range.
                        break;
                    } else {
                        // Op numbers in the same view must be connected.
                        unreachable;
                    }
                } else if (connected) {
                    // Chain intact and no range open: nothing to repair here.
                    assert(a.view <= b.view);
                } else if (a.view != b.view) {
                    // This header does not chain to the header above: open a range at it.
                    assert(b.op <= op_max);
                    range = .{ .op_min = a.op, .op_max = a.op };
                } else {
                    // Op numbers in the same view must be connected.
                    unreachable;
                }
            } else {
                // This header exists but the op above has no header (or another op's header):
                if (range) |r| {
                    // Nothing to compare this header against, so close the range.
                    assert(r.op_min == op + 1);
                    break;
                } else {
                    // Without an open range, this can only be the very first iteration.
                    assert(a.op == op_max);
                }
            }
        } else {
            assert(op < op_max);

            // No header for this op (the slot is reserved or holds a different op):
            if (range) |*r| {
                // Extend the open range down over the missing op.
                assert(r.op_min == op + 1);
                r.op_min = op;
            } else {
                // Open a range at the missing op.
                assert(above != null);
                range = .{ .op_min = op, .op_max = op };
            }
        }

        above = here;
    }

    if (range) |r| {
        assert(r.op_min >= op_min);
        // op_max (replica.op) is the latest op and is never repaired: any existing view jump
        // barrier must have been resolved before calling this function.
        assert(r.op_max < op_max);
    }

    return range;
}
|
|
721
|
+
|
|
722
|
+
/// Read a prepare from disk. There must be a matching in-memory header.
///
/// The callback is invoked immediately with `null` (after logging the reason) when:
/// * `options.op` is beyond `replica.op`,
/// * no in-memory header matches `options.op` and `options.checksum` exactly, or
/// * the slot's prepare checksum is not inhabited or does not equal `options.checksum`.
/// Otherwise the read is handed to `read_prepare_with_op_and_checksum()`.
pub fn read_prepare(
    journal: *Journal,
    callback: Read.Callback,
    options: Read.Options,
) void {
    assert(journal.status == .recovered);
    assert(options.checksum != 0);
    assert(journal.reads.available() > 0);

    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    // Work out why the read cannot proceed, if at all.
    const refusal: ?[]const u8 = refusal: {
        if (options.op > replica.op) break :refusal "beyond replica.op";

        const slot = journal.slot_with_op_and_checksum(
            options.op,
            options.checksum,
        ) orelse break :refusal "no entry exactly";

        if (!journal.prepare_inhabited[slot.index]) break :refusal "no matching prepare";
        if (journal.prepare_checksums[slot.index] != options.checksum) {
            break :refusal "no matching prepare";
        }
        break :refusal null;
    };

    if (refusal) |notice| {
        journal.read_prepare_log(options.op, options.checksum, notice);
        callback(replica, null, options);
        return;
    }

    journal.read_prepare_with_op_and_checksum(callback, options);
}
|
|
754
|
+
|
|
755
|
+
/// Read a prepare from disk. There may or may not be an in-memory header.
///
/// The slot's prepare checksum must be inhabited and equal to `options.checksum` (asserted).
///
/// The callback is invoked immediately, without reading from disk, in two cases:
/// * An exactly matching in-memory header has `size == @sizeOf(Header)`: the callback
///   receives a message built from that header alone.
/// * `options.destination_replica` is set and `reads_repair_count` has already reached
///   `reads_repair_count_max`: the callback receives `null`.
/// Otherwise a read is acquired and submitted to storage, completing in
/// `read_prepare_with_op_and_checksum_callback()`.
pub fn read_prepare_with_op_and_checksum(
    journal: *Journal,
    callback: Read.Callback,
    options: Read.Options,
) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    const slot = journal.slot_for_op(options.op);
    const repair = options.destination_replica != null;

    assert(journal.status == .recovered);
    assert(journal.prepare_inhabited[slot.index]);
    assert(journal.prepare_checksums[slot.index] == options.checksum);
    if (!repair) assert(journal.reads.available() > 0);

    const message = replica.message_bus.get_message(.prepare);
    // The read takes its own reference below; this one is dropped when we return.
    defer replica.message_bus.unref(message);

    const message_size: usize = size: {
        const exact = journal.header_with_op_and_checksum(
            options.op,
            options.checksum,
        ) orelse break :size constants.message_size_max;

        if (exact.size == @sizeOf(Header)) {
            // The in-memory header is the whole message, so the disk read can be skipped.
            message.header.* = exact.*;
            // Normally the message's padding would have been zeroed by the MessageBus,
            // but here (only) a message header is copied into a new buffer.
            @memset(message.buffer[@sizeOf(Header)..constants.sector_size], 0);
            callback(replica, message, options);
            return;
        }

        // As an optimization, read fewer than `message_size_max` bytes, since the
        // message's exact size is known.
        const exact_size: usize = vsr.sector_ceil(exact.size);
        assert(exact_size <= constants.message_size_max);
        break :size exact_size;
    };

    if (repair) {
        if (journal.reads_repair_count == reads_repair_count_max) {
            journal.read_prepare_log(options.op, options.checksum, "waiting for IOP");
            callback(replica, null, options);
            return;
        }
        journal.reads_repair_count += 1;
    } else {
        journal.reads_commit_count += 1;
    }

    assert(journal.reads_repair_count <= reads_repair_count_max);
    assert(journal.reads_commit_count <= reads_commit_count_max);

    const read = journal.reads.acquire().?;
    read.* = .{
        .journal = journal,
        .completion = undefined,
        .message = message.ref(),
        .options = options,
        .callback = callback,
    };

    const buffer: []u8 = message.buffer[0..message_size];

    // Memory must not be owned by `journal.headers` as these may be modified concurrently:
    assert(stdx.disjoint_slices(u8, vsr.Header.Prepare, buffer, journal.headers));

    journal.storage.read_sectors(
        read_prepare_with_op_and_checksum_callback,
        &read.completion,
        buffer,
        .wal_prepares,
        Ring.prepares.offset(slot),
    );
}
|
|
831
|
+
|
|
832
|
+
/// Completion handler for the storage read submitted by
/// `read_prepare_with_op_and_checksum()`.
///
/// Releases the read and its commit/repair accounting, then validates the message that was
/// read before handing it to the read's callback. The callback receives `null` instead
/// (after logging the reason) when the op is beyond `replica.op`, when the slot's prepare
/// checksum no longer matches the request, or when the message fails validation. In the last
/// case the slot is also marked faulty and dirty, provided the in-memory header still
/// matches the requested op and checksum.
fn read_prepare_with_op_and_checksum_callback(completion: *Storage.Read) void {
    const read: *Journal.Read = @alignCast(@fieldParentPtr("completion", completion));
    const journal = read.journal;
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    // Take what is needed out of `read` before it is released below.
    const options = read.options;
    const message = read.message;
    const callback = read.callback;

    // Drop the reference held by the read once this handler returns.
    defer replica.message_bus.unref(message);

    assert(journal.status == .recovered);

    if (options.destination_replica != null) {
        journal.reads_repair_count -= 1;
    } else {
        journal.reads_commit_count -= 1;
    }
    journal.reads.release(read);

    if (options.op > replica.op) {
        journal.read_prepare_log(options.op, options.checksum, "beyond replica.op");
        callback(replica, null, options);
        return;
    }

    const slot = journal.slot_for_op(options.op);
    const inhabited = journal.prepare_inhabited[slot.index];
    const matches = journal.prepare_checksums[slot.index] == options.checksum;
    if (!(inhabited and matches)) {
        journal.read_prepare_log(
            options.op,
            options.checksum,
            "prepare changed during read",
        );
        callback(replica, null, options);
        return;
    }

    // Validate what was read; the first failed check names the reason.
    const failure: ?[]const u8 = validate: {
        const header = message.header;

        if (!header.valid_checksum()) break :validate "corrupt header after read";
        assert(header.invalid() == null);

        if (header.cluster != replica.cluster) {
            // This could be caused by a misdirected read or write.
            // Though when a prepare spans multiple sectors, a misdirected read/write will
            // likely manifest as a checksum failure instead.
            break :validate "wrong cluster";
        }

        if (header.op != options.op) {
            // Possible causes:
            // * The prepare was rewritten since the read began.
            // * Misdirected read/write.
            // * The combination of:
            //   * The primary is responding to a `request_prepare`.
            //   * The `request_prepare` did not include a checksum.
            //   * The requested op's slot is faulty, but the prepare is valid. Since the
            //     prepare is valid, WAL recovery set `prepare_checksums[slot]`. But on
            //     reading this entry it turns out not to have the right op.
            //   (This case (and the accompanying unnecessary read) could be prevented by
            //   storing the op along with the checksum in `prepare_checksums`.)
            break :validate "op changed during read";
        }

        if (header.checksum != options.checksum) {
            // This can also be caused by a misdirected read/write.
            break :validate "checksum changed during read";
        }

        if (!header.valid_checksum_body(message.body_used())) {
            break :validate "corrupt body after read";
        }

        // The bytes between the end of the message and the end of its last sector.
        const padding_end = vsr.sector_ceil(header.size);
        const padding = message.buffer[header.size..padding_end];
        if (!stdx.zeroed(padding)) break :validate "corrupt sector padding";

        break :validate null;
    };

    const reason = failure orelse {
        assert(message.header.checksum == options.checksum);
        callback(replica, message, options);
        return;
    };

    // Only mark the slot if the `headers` slot still belongs to the same op that it did when
    // the read began. The slot may not match the read's op/checksum due to either:
    // * The in-memory header changed since the read began.
    // * The in-memory header is reserved+faulty; the read was via `prepare_checksums`.
    if (journal.slot_with_op_and_checksum(options.op, options.checksum)) |slot_exact| {
        journal.faulty.set(slot_exact);
        journal.dirty.set(slot_exact);
    }

    journal.read_prepare_log(options.op, options.checksum, reason);
    callback(replica, null, options);
}
|
|
933
|
+
|
|
934
|
+
/// Logs, at info level, the outcome `notice` of a prepare read for `op`.
/// A `null` checksum is logged as 0.
fn read_prepare_log(journal: *Journal, op: u64, checksum: ?u128, notice: []const u8) void {
    const checksum_logged: u128 = checksum orelse 0;
    log.info("{}: read_prepare: op={} checksum={x:0>32}: {s}", .{
        journal.replica,
        op,
        checksum_logged,
        notice,
    });
}
|
|
940
|
+
|
|
941
|
+
/// Begins recovery of a freshly initialized journal.
///
/// Stores `callback` in the `.recovering` status and calls `recover_headers()` once for each
/// read that is available at the time of the call.
/// The journal must still be in its `.init` state: all slots dirty and faulty, no reads or
/// writes executing, and no header chunks requested or recovered (asserted).
pub fn recover(journal: *Journal, callback: *const fn (journal: *Journal) void) void {
    assert(journal.status == .init);
    assert(journal.dirty.count == slot_count);
    assert(journal.faulty.count == slot_count);
    assert(journal.reads.executing() == 0);
    assert(journal.writes.executing() == 0);
    assert(journal.header_chunks_requested.empty());
    assert(journal.header_chunks_recovered.empty());

    journal.status = .{ .recovering = callback };
    log.debug("{}: recover: recovering", .{journal.replica});

    // The number of calls is fixed upfront, exactly as many as there are reads available now.
    const reads_available: usize = journal.reads.available();
    for (0..reads_available) |_| {
        journal.recover_headers();
    }

    assert(journal.header_chunks_recovered.empty());
    assert(journal.header_chunks_requested.count() == journal.reads.executing());
}
|
|
959
|
+
|
|
960
|
+
/// Advances recovery of the headers ring by one step.
///
/// * If every header chunk has been recovered, moves on to `recover_prepares()`.
/// * Else, if every chunk has already been requested, does nothing.
/// * Else, acquires a read and submits a storage read for the first chunk not yet requested;
///   the read completes in `recover_headers_callback()`.
fn recover_headers(journal: *Journal) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    assert(journal.status == .recovering);
    assert(journal.reads.available() > 0);
    assert(
        journal.header_chunks_recovered.count() <= journal.header_chunks_requested.count(),
    );

    if (journal.header_chunks_recovered.full()) {
        log.debug("{}: recover_headers: complete", .{journal.replica});
        journal.recover_prepares();
        return;
    }

    const chunk_index = journal.header_chunks_requested.first_unset() orelse {
        // Every chunk is already in flight.
        return;
    };
    assert(!journal.header_chunks_recovered.is_set(chunk_index));

    const message = replica.message_bus.get_message(.prepare);
    // The read holds its own reference (taken below); this one is dropped on return.
    defer replica.message_bus.unref(message);

    const chunk_read = journal.reads.acquire().?;
    chunk_read.* = .{
        .journal = journal,
        .completion = undefined,
        .message = message.ref(),
        // The chunk index travels to the completion handler in the `op` field.
        .options = .{ .op = chunk_index, .checksum = undefined },
        .callback = undefined,
    };

    // Each chunk covers (at most) `message_size_max` bytes of the headers ring.
    const offset = constants.message_size_max * chunk_index;
    assert(offset < headers_size);

    const buffer = recover_headers_buffer(message, offset);
    assert(buffer.len > 0);
    assert(buffer.len <= constants.message_size_max);
    assert(buffer.len + offset <= headers_size);

    log.debug("{}: recover_headers: offset={} size={} recovering", .{
        journal.replica,
        offset,
        buffer.len,
    });

    journal.header_chunks_requested.set(chunk_index);
    journal.storage.read_sectors(
        recover_headers_callback,
        &chunk_read.completion,
        buffer,
        .wal_headers,
        offset,
    );
}
|
|
1012
|
+
|
|
1013
|
+
/// Completion handler for a header chunk read submitted by `recover_headers()`.
///
/// Copies the chunk's headers verbatim into `journal.headers_redundant`, releases the read
/// and its message, marks the chunk as recovered, and then calls `recover_headers()` again.
fn recover_headers_callback(completion: *Storage.Read) void {
    const chunk_read: *Journal.Read = @alignCast(@fieldParentPtr("completion", completion));
    const journal = chunk_read.journal;
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    assert(journal.status == .recovering);
    assert(chunk_read.options.destination_replica == null);

    // `recover_headers()` stored the chunk index in the read's `op` field.
    const chunk_index = chunk_read.options.op;
    assert(journal.header_chunks_requested.is_set(chunk_index));
    assert(!journal.header_chunks_recovered.is_set(chunk_index));

    const chunk_offset = chunk_index * constants.message_size_max;
    const chunk_buffer = recover_headers_buffer(chunk_read.message, chunk_offset);
    assert(chunk_buffer.len >= @sizeOf(Header));
    assert(chunk_buffer.len % @sizeOf(Header) == 0);

    log.debug("{}: recover_headers: offset={} size={} recovered", .{
        journal.replica,
        chunk_offset,
        chunk_buffer.len,
    });

    // Directly store all the redundant headers in `journal.headers_redundant` (including
    // any that are invalid or corrupt). As the prepares are recovered, these will be
    // replaced or removed as necessary.
    const chunk_headers = std.mem.bytesAsSlice(Header.Prepare, chunk_buffer);
    const headers_first = chunk_index * headers_per_message;
    stdx.copy_disjoint(
        .exact,
        Header.Prepare,
        journal.headers_redundant[headers_first..][0..chunk_headers.len],
        chunk_headers,
    );

    // Release before calling `recover_headers()` in case Storage is synchronous.
    // Otherwise, we would run out of messages and reads.
    replica.message_bus.unref(chunk_read.message);
    journal.reads.release(chunk_read);

    journal.header_chunks_recovered.set(chunk_index);
    journal.recover_headers();
}
|
|
1057
|
+
|
|
1058
|
+
/// Returns the prefix of `message.buffer` used to read the header chunk that starts at byte
/// `offset` of the headers ring: `message_size_max` bytes, or fewer if less than that
/// remains of `headers_size` after `offset`.
/// The length is a multiple of both the sector size and `@sizeOf(Header)` (asserted).
fn recover_headers_buffer(
    message: *Message.Prepare,
    offset: u64,
) []align(@alignOf(Header)) u8 {
    const size = @min(constants.message_size_max, headers_size - offset);
    assert(size % constants.sector_size == 0);
    assert(size % @sizeOf(Header) == 0);

    return message.buffer[0..size];
}
|
|
1067
|
+
|
|
1068
|
+
/// Recover the prepares ring. Reads are issued concurrently.
/// - `dirty` is initially full.
///   A slot's bit is cleared when a read is issued to the slot.
///   All bits are set again before recover_slots() is called.
/// - `faulty` is initially full.
///   A slot's bit is cleared when the slot's read finishes.
///   All bits are set again before recover_slots() is called.
/// - The prepare's headers are loaded into `journal.headers`.
fn recover_prepares(journal: *Journal) void {
    assert(journal.status == .recovering);
    assert(journal.dirty.count == slot_count);
    assert(journal.faulty.count == slot_count);
    assert(journal.reads.executing() == 0);
    assert(journal.writes.executing() == 0);

    // Call recover_prepare() once for each read that is available right now.
    const reads_available: usize = journal.reads.available();
    for (0..reads_available) |_| {
        journal.recover_prepare();
    }

    assert(journal.writes.executing() == 0);
    assert(journal.reads.executing() > 0);
    // Every slot is either still dirty or has a read in flight.
    assert(journal.reads.executing() + journal.dirty.count == slot_count);
    assert(journal.faulty.count == slot_count);
}
|
|
1091
|
+
|
|
1092
|
+
/// Issues the read for the first slot that is still marked dirty.
///
/// - If `faulty` is already empty, every bit of `dirty` and `faulty` is set again and
///   control passes to `recover_slots()`.
/// - If no dirty slot remains (but `faulty` is not yet empty), this is a no-op.
/// - Otherwise the slot's dirty bit is cleared and a full `message_size_max` read of that
///   slot in the prepares ring is started, completing in `recover_prepare_callback`.
fn recover_prepare(journal: *Journal) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    assert(journal.status == .recovering);
    assert(journal.reads.available() > 0);
    assert(journal.dirty.count <= journal.faulty.count);

    if (journal.faulty.count == 0) {
        // Restore both bitsets to "full" before handing over to `recover_slots()`.
        for (0..journal.headers.len) |index| journal.dirty.set(Slot{ .index = index });
        for (0..journal.headers.len) |index| journal.faulty.set(Slot{ .index = index });
        return journal.recover_slots();
    }

    const slot = if (journal.dirty.bits.findFirstSet()) |first_dirty|
        Slot{ .index = first_dirty }
    else
        return;

    // The local reference is dropped when this function returns;
    // `read.message` holds its own reference (taken via `ref()` below).
    const message = replica.message_bus.get_message(.prepare);
    defer replica.message_bus.unref(message);

    const read = journal.reads.acquire().?;
    read.* = .{
        .journal = journal,
        .message = message.ref(),
        // The slot index is stored in `op`; the callback recovers the slot from it.
        .options = .{ .op = slot.index, .checksum = undefined },
        .completion = undefined,
        .callback = undefined,
    };

    log.debug("{}: recover_prepare: recovering slot={}", .{
        journal.replica,
        slot.index,
    });

    journal.dirty.clear(slot);
    journal.storage.read_sectors(
        recover_prepare_callback,
        &read.completion,
        // The message's size is not known yet, so read the entire buffer. The whole message
        // is needed to verify that it is neither torn nor corrupt.
        message.buffer[0..constants.message_size_max],
        .wal_prepares,
        Ring.prepares.offset(slot),
    );
}
|
|
1134
|
+
|
|
1135
|
+
/// Completion of a read issued by `recover_prepare()`.
///
/// If the prepare read from the slot has a valid header checksum, a valid body checksum, and
/// zeroed padding up to the next sector boundary, its header is stored in `journal.headers`.
/// In every case the read and its message are released, the slot's faulty bit is cleared, and
/// the next read is attempted.
fn recover_prepare_callback(completion: *Storage.Read) void {
    const read: *Journal.Read = @alignCast(@fieldParentPtr("completion", completion));
    const journal = read.journal;
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    assert(journal.status == .recovering);
    assert(journal.dirty.count <= journal.faulty.count);
    assert(read.options.destination_replica == null);

    const slot = Slot{ .index = @intCast(read.options.op) };
    assert(slot.index < slot_count);
    assert(!journal.dirty.bit(slot));
    assert(journal.faulty.bit(slot));

    // `valid_checksum_body` is checked here rather than in `recover_done`, so that only the
    // header (not the whole message) needs to be retained.
    const prepare_intact = intact: {
        const header = read.message.header;
        if (!header.valid_checksum()) break :intact false;
        if (!header.valid_checksum_body(read.message.body_used())) break :intact false;

        // `header.size` is only read after the checksums have been verified.
        const size = header.size;
        const padding = read.message.buffer[size..vsr.sector_ceil(size)];
        break :intact stdx.zeroed(padding);
    };
    if (prepare_intact) {
        journal.headers[slot.index] = read.message.header.*;
    }

    replica.message_bus.unref(read.message);
    journal.reads.release(read);

    journal.faulty.clear(slot);
    journal.recover_prepare();
}
|
|
1169
|
+
|
|
1170
|
+
/// When in doubt about whether a particular message was received, it must be marked as
/// faulty to avoid nacking a prepare which was received then lost/misdirected/corrupted.
///
///
/// There are two special cases where faulty slots must be carefully handled:
///
/// A) Redundant headers are written in batches. Slots that are marked faulty are written
/// as invalid (zeroed). This ensures that if the replica crashes and recovers, the
/// entries are still faulty rather than reserved.
/// The recovery process must be conservative about which headers are stored in
/// `journal.headers`. To understand why this is important, consider what happens if it did
/// load the faulty header into `journal.headers`, and then reads it back after a restart:
///
/// 1. Suppose slot 8 is in case @D. Per the table below, mark slot 8 faulty.
/// 2. Suppose slot 9 is also loaded as faulty.
/// 3. Journal recovery finishes. The replica begins to repair its missing/broken messages.
/// 4. VSR recovery protocol fetches the true prepare for slot 9.
/// 5. The message from step 4 is written to slot 9 of the prepares.
/// 6. The header from step 4 is written to slot 9 of the redundant headers.
///    But writes to the redundant headers are done in batches of `headers_per_sector`!
///    So if step 1 loaded slot 8's prepare header into `journal.headers`, slot 8's
///    redundant header would be updated at the same time (in the same write) as slot 9.
/// 7! Immediately after step 6's write finishes, suppose the replica crashes (e.g. due to
///    power failure).
/// 8! Journal recovery again — but now slot 8 is loaded *without* being marked faulty.
///    So we may incorrectly nack slot 8's message.
///
/// Therefore, recovery will never load a header into a slot *and* mark that slot faulty.
///
///
/// B) When replica_count=1, repairing broken/lost prepares over VSR is not an option,
/// so if a message is faulty the replica will abort.
///
///
/// Recovery decision table:
///
///   label                  @A  @B  @C  @D  @E  @F  @G  @H  @I  @J  @K  @L  @M  @N  @O  @P
///   header valid            0   1   1   0   0   0   1   _   1   1   1   1   1   1   1   1
///   header reserved         _   1   0   _   _   _   1   _   0   0   0   1   0   0   0   0
///   prepare valid           0   0   0   1   1   1   1   1   1   1   1   1   1   1   1   1
///   prepare reserved        _   _   _   1   0   0   0   0   0   1   1   1   0   0   0   0
///   prepare.op is maximum   _   _   _   _   0   1   _   _   _   _   _   _   _   _   _   _
///   prepare.op > prep_max  !0  !0  !0   _   0   0   0   1   0   _   _   _   0   0   0   0
///   header.op > prep_max   !0  !0  !0   _   0   0   0   1   1   1   0   _   0   0   0   0
///   match checksum          _   _   _   _   _   _   _   _   _   _   _  !1   0   0   0   1
///   match op                _   _   _   _   _   _   _   _  !0  !0   _  !1   <   >   1  !1
///   match view              _   _   _   _   _   _   _   _   _   _   _  !1   _   _  !0  !1
///   decision (replicas>1) vsr vsr vsr vsr vsr fix fix cut cut cut vsr nil fix vsr vsr eql
///   decision (replicas=1)             fix fix
///
/// Legend:
///
///    0  false
///    1  true
///   !0  assert false
///   !1  assert true
///    _  ignore
///    <  header.op < prepare.op
///    >  header.op > prepare.op
///  eql  The header and prepare are identical; no repair necessary.
///  nil  Reserved; dirty/faulty are clear, no repair necessary.
///  fix  Repair header using local intact prepare.
///  vsr  Repair with VSR `request_prepare`.
///
/// A "valid" header/prepare:
/// 1. has a valid checksum
/// 2. has the correct cluster
/// 3. is in the correct slot (op % slot_count)
/// 4. has command=prepare
/// 5. may or may not have operation=reserved
fn recover_slots(journal: *Journal) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    const log_view = replica.superblock.working.vsr_state.log_view;
    const view_headers = replica.superblock.working.view_headers();

    assert(journal.status == .recovering);
    assert(journal.reads.executing() == 0);
    assert(journal.writes.executing() == 0);
    assert(journal.dirty.count == slot_count);
    assert(journal.faulty.count == slot_count);

    // These inputs to `recovery_case()` are the same for every slot: the loop below only
    // writes `prepare_inhabited`/`prepare_checksums`, never the headers. Compute them once
    // instead of rescanning both header arrays for each slot.
    const op_prepare_max = replica.op_prepare_max();
    const op_max = @max(
        op_maximum_headers_untrusted(replica.cluster, journal.headers_redundant),
        op_maximum_headers_untrusted(replica.cluster, journal.headers),
    );
    const op_checkpoint = replica.op_checkpoint();

    var cases: [slot_count]*const Case = undefined;

    for (journal.headers, 0..) |_, index| {
        const slot = Slot{ .index = index };
        const header = header_ok(replica.cluster, slot, &journal.headers_redundant[index]);
        const prepare = header_ok(replica.cluster, slot, &journal.headers[index]);

        cases[index] = recovery_case(header, prepare, .{
            .op_prepare_max = op_prepare_max,
            .op_max = op_max,
            .op_checkpoint = op_checkpoint,
        });

        // `prepare_checksums` improves the availability of `request_prepare` by being more
        // flexible than `headers` regarding the prepares it references. It may hold a
        // prepare whose redundant header is broken, as long as the prepare itself is valid.
        if (prepare != null and prepare.?.operation != .reserved) {
            assert(!journal.prepare_inhabited[index]);
            journal.prepare_inhabited[index] = true;
            journal.prepare_checksums[index] = prepare.?.checksum;
        }
    }
    assert(journal.headers.len == cases.len);

    const torn_prepares_ = journal.torn_prepares(&cases);
    // Refine cases @B and @C: Repair (truncate) a prepare if it was torn during a crash.
    for (torn_prepares_.const_slice()) |torn_prepare| {
        assert(cases[torn_prepare.index].decision(replica.solo()) == .vsr);
        cases[torn_prepare.index] = &case_cut_torn;
        log.warn("{}: recover_slots: torn prepare in slot={}", .{
            journal.replica,
            torn_prepare.index,
        });
    }

    for (cases, 0..) |case, index| journal.recover_slot(Slot{ .index = index }, case);
    assert(cases.len == slot_count);

    stdx.copy_disjoint(
        .exact,
        Header.Prepare,
        journal.headers_redundant,
        journal.headers,
    );

    // Discard headers which we are certain do not belong in the current log_view.
    // - This ensures that we don't accidentally set our new head op to be a message
    //   which was truncated but not yet overwritten.
    // - This is also necessary to ensure that generated DVC's headers are complete.
    //
    // It is essential that this is performed:
    // - after prepare_op_max is computed,
    // - after the case decisions are made (to avoid @K:vsr arising from an
    //   artificially reserved prepare),
    // - after torn_prepares(), which computes its own max ops.
    // - before we repair the 'fix' cases.
    //
    // (These headers can originate if we join a view, write some prepares from the new
    // view, and then crash before the view_durable_update() finished.)
    for (journal.headers, 0..) |*header_untrusted, index| {
        const slot = Slot{ .index = index };
        if (header_ok(replica.cluster, slot, header_untrusted)) |header| {
            const view_range = view_headers.view_for_op(header.op, log_view);
            assert(view_range.max <= log_view);

            if (header.operation != .reserved and !view_range.contains(header.view)) {
                log.warn("{}: recover_slots: drop header " ++
                    "view_range={}..{} view={} op={} checksum={x:0>32}", .{
                    journal.replica,
                    view_range.min,
                    view_range.max,
                    header.view,
                    header.op,
                    header.checksum,
                });
                journal.remove_entry(slot);
            }
        }
    }

    log.debug("{}: recover_slots: dirty={} faulty={}", .{
        journal.replica,
        journal.dirty.count,
        journal.faulty.count,
    });

    journal.recover_fix();
}
|
|
1342
|
+
|
|
1343
|
+
/// Returns the slots that are safe to truncate.
///
/// The goal of this function is to identify all prepares that were torn while being
/// appended to the log before a crash. These torn prepares must be truncated to ensure
/// that the replica doesn't start up in recovering_head.
///
/// Conditions for torn prepares to be truncated:
/// * op_max, computed as the max of the prepare headers and redundant headers must be
///   certain.
/// * for certainty of op_max, there must be no faults between (op_max, op_prepare_max]
///   other than "torn prepares", which manifest as:
///   - the redundant header is valid,
///   - the redundant header's op is at least a log cycle behind,
///   - the prepare is corrupt
/// * faults may exist outside of (op_max, op_prepare_max]. They have no bearing on the
///   certainty of op_max as they lie between (op_checkpoint, op_max].
///
/// An empty array is returned whenever truncation is not (or cannot be shown to be) safe.
fn torn_prepares(
    journal: *const Journal,
    cases: []const *const Case,
) stdx.BoundedArrayType(Slot, constants.journal_iops_write_max) {
    const replica: *const Replica = @alignCast(@fieldParentPtr("journal", journal));

    assert(journal.status == .recovering);
    assert(journal.dirty.count == slot_count);
    assert(journal.faulty.count == slot_count);

    const op_max = @max(
        op_maximum_headers_untrusted(replica.cluster, journal.headers_redundant),
        op_maximum_headers_untrusted(replica.cluster, journal.headers),
    );

    const op_checkpoint = replica.op_checkpoint();
    const op_prepare_max = replica.op_prepare_max();

    // Nothing to truncate - head op is not certain as it must be >= op_checkpoint.
    if (op_max < op_checkpoint) return .{};

    // Nothing to truncate - prepares beyond prepare_max are truncated via the cut decision.
    if (op_max >= op_prepare_max) return .{};

    const op_prepare_max_slot = journal.slot_for_op(op_prepare_max);
    const op_checkpoint_slot = journal.slot_for_op(op_checkpoint);

    assert(op_max < op_prepare_max);

    // The search range is constructed such that the op of every *valid* prepare or header
    // inside it should be less than op_max. A corrupt prepare/header inside the range
    // therefore makes op_max uncertain.
    const range_head = journal.slot_for_op(op_max + 1);
    const range_tail = if (op_checkpoint > 0 and op_max == op_checkpoint) tail: {
        assert(op_prepare_max_slot.index == op_checkpoint_slot.index);
        assert(op_prepare_max > 0);

        break :tail journal.slot_for_op(op_prepare_max - 1);
    } else op_prepare_max_slot;
    const search_range = SlotRange{ .head = range_head, .tail = range_tail };

    // Checked separately as SlotRange.contains doesn't handle empty ranges.
    const range_empty = search_range.head.index == search_range.tail.index;

    // At most journal_iops_write_max slots are considered torn, as that is the maximum
    // number of prepare writes that could be concurrently underway. If more are found (due
    // to corruptions), err on the side of caution and don't truncate any prepares.
    var torn_slots: stdx.BoundedArrayType(Slot, constants.journal_iops_write_max) = .{};

    // Search the range for torn prepares. A torn prepare manifests as a prepare with an
    // *invalid checksum* and a *valid* (non-reserved) redundant header from a previous wrap.
    //
    // Any other kind of faulty (`vsr`) slot inside the range means that the slot could hold
    // the true op_max. Then it is impossible to tell a torn prepare (safe to truncate) from
    // a corrupted one (not safe to truncate), so nothing is truncated.
    for (cases, 0..) |case, index| {
        // Do not use `faulty.bit()` because the decisions have not been processed yet.
        if (case.decision(replica.solo()) != .vsr) continue;

        const slot = Slot{ .index = index };
        const slot_in_range = if (range_empty)
            index == op_prepare_max_slot.index
        else
            search_range.contains(slot);
        if (!slot_in_range) continue;

        const header_prepare_untrusted = &journal.headers[index];

        // We need our head op to be certain to reliably truncate torn prepares.
        // Head op is uncertain if we encounter one of the below faults:

        // 1. Corrupt redundant header or a misdirected read to a redundant header.
        const header_redundant = header_ok(
            replica.cluster,
            slot,
            &journal.headers_redundant[index],
        ) orelse return .{};

        // 2. Redundant header is set to .reserved. Could happen if:
        //    i.  Slot was found corrupt on a previous startup, which set the header
        //        to reserved in memory.
        //    ii. Replica crashes *before* the corrupt slot was repaired, but
        //        *after* the reserved header was written to disk with a write to
        //        a nearby header (there are multiple headers in a single sector)
        if (header_redundant.operation == .reserved) return .{};

        // 3. Prepare must be invalid for the slot to be eligible for truncation. A
        //    valid prepare could be faulty due to a misdirected read/write.
        if (header_prepare_untrusted.valid_checksum()) return .{};

        // Header is valid and from a previous wrap.
        assert(header_redundant.op < op_max);
        assert(header_redundant.op <= op_checkpoint);

        assert(!header_prepare_untrusted.valid_checksum());
        assert(!journal.prepare_inhabited[index]);

        if (torn_slots.count() >= constants.journal_iops_write_max) {
            log.warn("{}: torn_prepares: not truncating, found >{} " ++
                "torn prepares!", .{
                journal.replica,
                constants.journal_iops_write_max,
            });
            return .{};
        }
        torn_slots.push(slot);
    }
    return torn_slots;
}
|
|
1484
|
+
|
|
1485
|
+
/// Applies the recovery decision of `case` to a single slot.
///
/// On entry the slot is both dirty and faulty. Depending on the decision:
/// - `eql`, `nil`: keep the redundant header; clear dirty and faulty.
/// - `fix`: adopt the prepare's header; clear faulty, leave dirty set.
/// - `vsr`: reserve the slot; leave both dirty and faulty set.
/// - `cut`, `cut_torn`: reserve the slot; clear dirty and faulty.
///
/// Afterwards the in-memory redundant header mirrors `journal.headers`, except that the
/// checksum is zeroed (invalidated) for a slot which is still faulty.
fn recover_slot(journal: *Journal, slot: Slot, case: *const Case) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    const cluster = replica.cluster;
    const index = slot.index;

    assert(journal.status == .recovering);
    assert(journal.dirty.bit(slot));
    assert(journal.faulty.bit(slot));

    const header = header_ok(cluster, slot, &journal.headers_redundant[index]);
    const prepare = header_ok(cluster, slot, &journal.headers[index]);
    const decision = case.decision(replica.solo());
    switch (decision) {
        .eql => {
            assert(header.?.command == .prepare);
            assert(prepare.?.command == .prepare);
            assert(header.?.operation != .reserved);
            assert(prepare.?.operation != .reserved);
            assert(header.?.checksum == prepare.?.checksum);
            assert(journal.prepare_inhabited[index]);
            assert(journal.prepare_checksums[index] == prepare.?.checksum);
            journal.headers[index] = header.?;
            journal.dirty.clear(slot);
            journal.faulty.clear(slot);
        },
        .nil => {
            assert(header.?.command == .prepare);
            assert(prepare.?.command == .prepare);
            assert(header.?.operation == .reserved);
            assert(prepare.?.operation == .reserved);
            assert(header.?.checksum == prepare.?.checksum);
            assert(
                header.?.checksum == Header.Prepare.reserve(cluster, index).checksum,
            );
            assert(!journal.prepare_inhabited[index]);
            assert(journal.prepare_checksums[index] == 0);
            journal.headers[index] = header.?;
            journal.dirty.clear(slot);
            journal.faulty.clear(slot);
        },
        .fix => {
            assert(prepare.?.command == .prepare);
            journal.headers[index] = prepare.?;
            journal.faulty.clear(slot);
            assert(journal.dirty.bit(slot));
            // Solo replica: @D, @E, @F, @G, @M (nothing further to assert).
            if (!replica.solo()) {
                // @F, @G, @M
                assert(prepare.?.operation != .reserved);
                assert(journal.prepare_inhabited[index]);
                assert(journal.prepare_checksums[index] == prepare.?.checksum);
            }
        },
        .vsr => {
            journal.headers[index] = Header.Prepare.reserve(cluster, index);
            assert(journal.dirty.bit(slot));
            assert(journal.faulty.bit(slot));
        },
        .cut_torn => {
            assert(header != null);
            assert(prepare == null);
            assert(!journal.prepare_inhabited[index]);
            assert(journal.prepare_checksums[index] == 0);
            journal.headers[index] = Header.Prepare.reserve(cluster, index);
            journal.dirty.clear(slot);
            journal.faulty.clear(slot);
        },
        .cut => {
            assert(prepare != null);

            if (prepare.?.op > replica.op_prepare_max()) {
                assert(prepare.?.operation != .reserved);
                assert(journal.prepare_inhabited[index]);
                assert(journal.prepare_checksums[index] == prepare.?.checksum);
            } else {
                assert(header != null);
                assert(header.?.operation != .reserved);
                assert(header.?.op > replica.op_prepare_max());
            }

            journal.headers[index] = Header.Prepare.reserve(cluster, index);
            journal.dirty.clear(slot);
            journal.faulty.clear(slot);
        },
        .unr => unreachable,
    }

    journal.headers_redundant[index] = journal.headers[index];
    if (journal.faulty.bit(slot)) {
        journal.headers_redundant[index].checksum = 0; // Invalidate the checksum.
    }
    assert(journal.faulty.bit(slot) !=
        journal.headers_redundant[index].valid_checksum());

    // The same message is logged for every decision; only the level differs:
    // debug when no repair is needed, warn otherwise.
    const format = "{}: recover_slot: recovered " ++
        "slot={:0>4} label={s} decision={s} operation={} op={} view={}";
    const recovered = &journal.headers[index];
    const arguments = .{
        journal.replica,
        index,
        case.label,
        @tagName(decision),
        recovered.operation,
        recovered.op,
        recovered.view,
    };
    switch (decision) {
        .eql, .nil => log.debug(format, arguments),
        .fix, .vsr, .cut, .cut_torn => log.warn(format, arguments),
        .unr => unreachable,
    }
}
|
|
1607
|
+
|
|
1608
|
+
/// Repair the redundant headers for slots with decision=fix, one sector at a time.
///
/// Each call clears the dirty bit of every dirty-but-not-faulty slot belonging to the first
/// header sector that contains one, then writes that sector. The write's completion calls
/// back into this function. Once no such slot remains, recovery finishes via
/// `recover_done()`.
fn recover_fix(journal: *Journal) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    assert(journal.status == .recovering);
    assert(journal.writes.executing() == 0);
    assert(journal.dirty.count >= journal.faulty.count);
    assert(journal.dirty.count <= slot_count);

    var sector_to_fix: ?usize = null;
    var dirty_slots = journal.dirty.bits.iterator(.{ .kind = .set });
    while (dirty_slots.next()) |index| {
        const slot = Slot{ .index = index };
        if (journal.faulty.bit(slot)) continue;

        if (!journal.prepare_inhabited[index]) {
            // Case @D for R=1.
            assert(replica.solo());
        } else {
            assert(journal.prepare_checksums[index] ==
                journal.headers[index].checksum);
            assert(journal.prepare_checksums[index] ==
                journal.headers_redundant[index].checksum);
        }

        // Stop at the first slot that lies in a different sector than the one chosen.
        const sector = @divFloor(index, headers_per_sector);
        if (sector_to_fix) |chosen| {
            if (chosen != sector) break;
        } else {
            sector_to_fix = sector;
        }
        journal.dirty.clear(slot);
    }

    const sector = sector_to_fix orelse return journal.recover_done();

    const write = journal.writes.acquire().?;
    write.* = .{
        .journal = journal,
        .callback = undefined,
        .message = undefined,
        .range = undefined,
    };

    const buffer: []u8 = journal.header_sector(sector, write);
    const buffer_headers = std.mem.bytesAsSlice(Header, buffer);
    assert(buffer_headers.len == headers_per_sector);

    const offset = Ring.headers.offset(Slot{ .index = sector * headers_per_sector });
    journal.write_sectors(recover_fix_callback, write, buffer, .headers, offset);
}
|
|
1656
|
+
|
|
1657
|
+
/// Completion of a header-sector write started by `recover_fix()`: releases the write and
/// continues with the next sector (if any).
fn recover_fix_callback(write: *Journal.Write) void {
    // Read the journal pointer out of `write` before the write is released.
    const journal: *Journal = write.journal;
    assert(journal.status == .recovering);

    journal.writes.release(write);
    journal.recover_fix();
}
|
|
1664
|
+
|
|
1665
|
+
/// Final step of recovery: checks the recovered state, switches the journal's status to
/// `.recovered`, and invokes the callback that was stored in the `.recovering` status.
fn recover_done(journal: *Journal) void {
    assert(journal.status == .recovering);
    assert(journal.reads.executing() == 0);
    assert(journal.writes.executing() == 0);
    assert(journal.dirty.count <= slot_count);
    assert(journal.faulty.count <= slot_count);
    assert(journal.faulty.count == journal.dirty.count);
    assert(journal.header_chunks_requested.full());
    assert(journal.header_chunks_recovered.full());

    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    // Take the callback out of the status payload before the status is overwritten.
    const callback = journal.status.recovering;
    journal.status = .recovered;

    // If slot 0 holds a non-reserved header with op 0, it must be the cluster's root
    // prepare, and it must not be faulty.
    const first_header = &journal.headers[0];
    if (first_header.op == 0 and first_header.operation != .reserved) {
        assert(
            first_header.checksum == Header.Prepare.root(replica.cluster).checksum,
        );
        assert(!journal.faulty.bit(Slot{ .index = 0 }));
    }

    for (journal.headers, 0..) |*header, index| {
        assert(header.valid_checksum());
        assert(header.cluster == replica.cluster);
        assert(header.command == .prepare);
        assert(std.meta.eql(header.*, journal.headers_redundant[index]));
        if (header.operation != .reserved) {
            assert(header.op % slot_count == index);
            assert(journal.prepare_inhabited[index]);
            assert(journal.prepare_checksums[index] == header.checksum);
            maybe(journal.faulty.bit(Slot{ .index = index }));
        } else {
            assert(header.op == index);
        }
    }
    callback(journal);
}
|
|
1702
|
+
|
|
1703
|
+
/// Removes entries from `op_min` (inclusive) onwards.
/// Used after a view change to remove uncommitted entries discarded by the new primary.
///
/// Requires a recovered journal and `op_min > 0`.
pub fn remove_entries_from(journal: *Journal, op_min: u64) void {
    assert(journal.status == .recovered);
    assert(op_min > 0);

    log.debug("{}: remove_entries_from: op_min={}", .{ journal.replica, op_min });

    for (journal.headers, 0..) |*header, index| {
        // The header is removed regardless of whether it is a prepare or reserved:
        // - a reserved header may have been marked faulty for case @K, and
        // - the caller expects the WAL to be truncated, with clean slots.
        if (header.op < op_min) continue;

        // TODO Explore scenarios where the data on disk may resurface after a crash.
        const slot = journal.slot_for_op(header.op);
        assert(slot.index == index);
        journal.remove_entry(slot);
    }
}
|
|
1723
|
+
|
|
1724
|
+
/// Resets `slot` to a reserved header, in both `headers` and `headers_redundant`, and clears
/// the slot's dirty and faulty bits.
///
/// `prepare_inhabited`/`prepare_checksums` are deliberately left untouched (see below).
pub fn remove_entry(journal: *Journal, slot: Slot) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    const index = slot.index;

    const header_reserved = Header.Prepare.reserve(replica.cluster, index);
    journal.headers[index] = header_reserved;
    journal.headers_redundant[index] = header_reserved;

    journal.dirty.clear(slot);
    journal.faulty.clear(slot);

    // Do not clear `prepare_inhabited`/`prepare_checksums`. The prepare is
    // untouched on disk, and may be useful later. Consider this scenario:
    //
    // 1. Op 4 is received; start writing it.
    // 2. Op 4's prepare is written (setting `prepare_checksums`), start writing
    //    the headers.
    // 3. View change. Op 4 is discarded by `remove_entries_from`.
    // 4. View change. Op 4 (the same one from before) is back, marked as dirty. But
    //    we don't start a write, because `journal.writing()` says it is already in
    //    progress.
    // 5. Op 4's header write finishes (`write_prepare_on_write_header`).
    //
    // If `remove_entries_from` cleared `prepare_checksums`,
    // `write_prepare_on_write_header` would clear `dirty`/`faulty` for a slot with
    // `prepare_inhabited=false`.
}
|
|
1748
|
+
|
|
1749
|
+
/// Installs `header` in its slot of `journal.headers` and marks the slot dirty.
/// If the journal already holds this exact header, only the invariants are checked.
pub fn set_header_as_dirty(journal: *Journal, header: *const Header.Prepare) void {
    assert(journal.status == .recovered);
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    log.debug("{}: set_header_as_dirty: op={} checksum={x:0>32}", .{
        journal.replica,
        header.op,
        header.checksum,
    });

    const slot = journal.slot_for_header(header);
    const index = slot.index;

    if (journal.has_header(header)) {
        // Same entry as before: it must already be dirty, and any faulty bit is kept as is.
        assert(journal.dirty.bit(slot));
        maybe(journal.faulty.bit(slot));
        return;
    }

    // Overwriting a new op with an old op would be a correctness bug; it could cause a
    // message to be uncommitted.
    assert(journal.headers[index].op <= header.op);

    if (journal.headers[index].operation == .reserved) {
        // The WAL might have written/prepared this exact header before crashing —
        // leave the entry marked faulty because we cannot safely nack it.
        maybe(journal.faulty.bit(slot));
    } else {
        // The WAL definitely did not hold this exact header, so it is safe to reset the
        // faulty bit + nack this header.
        journal.faulty.clear(slot);
        journal.headers_redundant[index] = Header.Prepare.reserve(header.cluster, index);
    }

    journal.headers[index] = header.*;
    journal.dirty.set(slot);
}
|
|
1787
|
+
|
|
1788
|
+
/// Starts writing `message` to its WAL slot.
/// `write_prepare` uses `write_sectors` to prevent concurrent disk writes.
///
/// `callback` is invoked immediately with:
/// - `message`, when the slot is not dirty (nothing needs to be written), or
/// - `null`, when no write IOP could be acquired.
/// Otherwise the prepare is written first, and `write_prepare_header` continues from there.
pub fn write_prepare(
    journal: *Journal,
    callback: *const fn (journal: *Replica, wrote: ?*Message.Prepare) void,
    message: *Message.Prepare,
) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));
    const header = message.header;

    assert(journal.status == .recovered);
    assert(header.command == .prepare);
    assert(header.operation != .reserved);
    assert(header.size >= @sizeOf(Header));
    assert(header.size <= message.buffer.len);
    assert(journal.has_header(header));
    assert(journal.writing(header) == .none);
    if (replica.solo()) assert(journal.writes.executing() == 0);

    // The underlying header memory must be owned by the buffer and not by journal.headers:
    // Otherwise, concurrent writes may modify the memory of the pointer while we write.
    assert(@intFromPtr(header) == @intFromPtr(message.buffer));

    const slot = journal.slot_with_header(header).?;
    const index = slot.index;

    if (!journal.dirty.bit(slot)) {
        // Any function that sets the faulty bit should also set the dirty bit:
        assert(!journal.faulty.bit(slot));
        assert(journal.prepare_inhabited[index]);
        assert(journal.prepare_checksums[index] == header.checksum);
        assert(journal.headers_redundant[index].checksum == header.checksum);
        journal.write_prepare_debug(header, "skipping (clean)");
        callback(replica, message);
        return;
    }

    assert(journal.has_dirty(header));

    const write = journal.writes.acquire() orelse {
        assert(!replica.solo());

        journal.write_prepare_warn(header, "waiting for IOP");
        callback(replica, null);
        return;
    };

    journal.write_prepare_debug(header, "starting");

    write.* = .{
        .journal = journal,
        .callback = callback,
        .message = message.ref(),
        // Filled in by `write_sectors`.
        .range = undefined,
    };

    // Slice the message to the nearest sector, we don't want to write the whole buffer:
    const buffer = message.buffer[0..vsr.sector_ceil(header.size)];
    const offset = Ring.prepares.offset(slot);

    // Assert that any sector padding has already been zeroed:
    assert(stdx.zeroed(buffer[header.size..]));

    // The slot's prepare is about to be overwritten; `write_prepare_header` records the new
    // checksum once the prepare write has finished.
    journal.prepare_inhabited[index] = false;
    journal.prepare_checksums[index] = 0;

    journal.write_sectors(write_prepare_header, write, buffer, .prepares, offset);
}
|
|
1853
|
+
|
|
1854
|
+
/// Continuation of `write_prepare`, run once the prepare itself has been written: records the
/// prepare in `prepare_inhabited`/`prepare_checksums`, then writes the sector of redundant
/// headers that contains this slot.
///
/// Attempt to lock the in-memory sector containing the header being written.
/// If the sector is already locked, add this write to the wait queue.
fn write_prepare_header(write: *Journal.Write) void {
    const journal = write.journal;
    const header = write.message.header;
    assert(journal.status == .recovered);
    assert(journal.writing(header) == .exact);

    // `prepare_inhabited[slot.index]` is usually false here, but may be true if two
    // (or more) writes to the same slot were queued concurrently and this is not the
    // first to finish writing its prepare.
    const slot = journal.slot_for_header(header);
    const index = slot.index;
    journal.prepare_inhabited[index] = true;
    journal.prepare_checksums[index] = header.checksum;

    if (!journal.has_header(header)) {
        journal.write_prepare_debug(header, "entry changed while writing sectors");
        journal.write_prepare_release(write, null);
        // We just overwrote a (potentially-clean) prepare with the "wrong" header.
        // NOTE(review): the dirty bit is set only after the release callback has run —
        // confirm that this ordering is intentional.
        journal.dirty.set(slot);
        return;
    }

    const header_redundant = &journal.headers_redundant[index];
    if (header_redundant.operation == .reserved and header_redundant.checksum == 0) {
        assert(journal.faulty.bit(slot));
    }
    header_redundant.* = header.*;

    // TODO It's possible within this section that the header has since been replaced but we
    // continue writing, even when the dirty bit is no longer set. This is not a problem
    // but it would be good to stop writing as soon as we see we no longer need to.
    // For this, we'll need to have a way to tweak write_prepare_release() to release locks.
    // At present, we don't return early here simply because it doesn't yet do that.

    const offset = Ring.headers.offset(slot);
    assert(offset % constants.sector_size == 0);

    const sector_index = @divFloor(index, headers_per_sector);
    const buffer: []u8 = journal.header_sector(sector_index, write);

    log.debug("{}: write_header: op={} sectors[{}..{}]", .{
        journal.replica,
        header.op,
        offset,
        offset + constants.sector_size,
    });

    // Memory must not be owned by journal.headers as these may be modified concurrently:
    const buffer_address = @intFromPtr(buffer.ptr);
    const headers_address = @intFromPtr(journal.headers.ptr);
    assert(buffer_address < headers_address or
        buffer_address > headers_address + headers_size);

    journal.write_sectors(write_prepare_on_write_header, write, buffer, .headers, offset);
}
|
|
1910
|
+
|
|
1911
|
+
/// Final step of `write_prepare`, run once the redundant header sector has been written.
/// The slot is marked clean (dirty and faulty cleared) only if the journal still holds this
/// exact header, the redundant header matches it, and the slot's prepare still carries its
/// checksum; otherwise the write is released with `null`.
fn write_prepare_on_write_header(write: *Journal.Write) void {
    const journal = write.journal;
    const message = write.message;
    const header = message.header;

    if (!journal.has_header(header)) {
        journal.write_prepare_debug(header, "entry changed while writing headers");
        journal.write_prepare_release(write, null);
        return;
    }

    const slot = journal.slot_with_header(header).?;
    const index = slot.index;

    if (journal.headers_redundant[index].checksum != header.checksum) {
        assert(journal.dirty.bit(slot));
        // Scenario:
        // 1. write_prepare(h₁)
        // 2. write_prepare_header(h₁)
        // 3. remove_entry(h₁)
        // 4. set_header_as_dirty(h₁)
        // 5. write_prepare_on_write_header(h₁)
        // `prepare_checksums` is still correct, but `remove_entry()` cleared the
        // `headers_redundant`.
        journal.write_prepare_debug(
            header,
            "entry removed then added while writing headers",
        );
        journal.write_prepare_release(write, null);
        return;
    }

    const prepare_matches = journal.prepare_inhabited[index] and
        journal.prepare_checksums[index] == header.checksum;
    if (!prepare_matches) {
        journal.write_prepare_debug(
            header,
            "entry changed twice while writing headers",
        );
        journal.write_prepare_release(write, null);
        return;
    }

    journal.write_prepare_debug(header, "complete, marking clean");

    journal.dirty.clear(slot);
    journal.faulty.clear(slot);

    journal.write_prepare_release(write, message);
}
|
|
1958
|
+
|
|
1959
|
+
/// Returns `write` to the pool, invokes the write's callback with `wrote`, and then drops
/// the message reference that the write was holding.
fn write_prepare_release(
    journal: *Journal,
    write: *Journal.Write,
    wrote: ?*Message.Prepare,
) void {
    const replica: *Replica = @alignCast(@fieldParentPtr("journal", journal));

    // Copy out what is still needed before the write is handed back to the pool.
    const held_message = write.message;
    const on_done = write.callback;

    // Release the write prior to returning control to the caller.
    // This allows us to enforce journal.writes.len≤1 when replica_count=1, because the
    // callback may immediately start the next write.
    journal.writes.release(write);
    assert(journal.writing(held_message.header) == .none);

    on_done(replica, wrote);
    replica.message_bus.unref(held_message);
}
|
|
1977
|
+
|
|
1978
|
+
/// Logs the progress (`status`) of writing `header` at debug level.
fn write_prepare_debug(
    journal: *const Journal,
    header: *const Header.Prepare,
    status: []const u8,
) void {
    Journal.write_prepare_fn(journal, header, status, log.debug);
}
|
|
1985
|
+
|
|
1986
|
+
/// Logs the progress (`status`) of writing `header` at warn level.
fn write_prepare_warn(
    journal: *const Journal,
    header: *const Header.Prepare,
    status: []const u8,
) void {
    Journal.write_prepare_fn(journal, header, status, log.warn);
}
|
|
1993
|
+
|
|
1994
|
+
/// Shared implementation of `write_prepare_debug`/`write_prepare_warn`: formats one log line
/// describing `header` and `status`, and emits it through `log_fn`.
fn write_prepare_fn(
    journal: *const Journal,
    header: *const Header.Prepare,
    status: []const u8,
    comptime log_fn: anytype,
) void {
    assert(journal.status == .recovered);
    assert(header.command == .prepare);
    assert(header.operation != .reserved);

    const slot = journal.slot_for_header(header);

    log_fn("{}: write: view={} slot={} op={} len={}: {x:0>32} {s}", .{
        journal.replica,
        header.view,
        slot.index,
        header.op,
        header.size,
        header.checksum,
        status,
    });
}
|
|
2014
|
+
|
|
2015
|
+
/// Describes the write of `buffer` at `offset` within `ring` in `write.range`, then hands the
/// write to `lock_sectors`. `callback` is stored on the range for when the write finishes.
fn write_sectors(
    journal: *Journal,
    callback: *const fn (write: *Journal.Write) void,
    write: *Journal.Write,
    buffer: []const u8,
    ring: Ring,
    offset: u64, // Offset within the Ring.
) void {
    write.range = .{
        .ring = ring,
        .offset = offset,
        .buffer = buffer,
        .callback = callback,
        .completion = undefined,
        // Not locked yet: `lock_sectors` sets this when the write is actually started.
        .locked = false,
    };
    journal.lock_sectors(write);
}
|
|
2033
|
+
|
|
2034
|
+
/// Start the write on the current range or add it to the proper queue
/// if an overlapping range is currently being written.
fn lock_sectors(journal: *Journal, write: *Journal.Write) void {
    const range = &write.range;
    assert(!range.locked);
    assert(range.next == null);

    var others = journal.writes.iterate();
    while (others.next()) |other| {
        if (other == write) continue;

        // No two in-flight writes may target the same slot.
        assert(journal.slot_for_header(write.message.header).index !=
            journal.slot_for_header(other.message.header).index);

        if (!other.range.locked) continue;
        if (!other.range.overlaps(range)) continue;

        assert(other.range.offset == range.offset);
        assert(other.range.buffer.len == range.buffer.len);
        assert(other.range.ring == range.ring);
        assert(other.range.ring == .headers);

        // Queue behind the last range already waiting on `other`.
        var last = &other.range;
        while (last.next) |successor| last = successor;
        last.next = range;
        return;
    }

    log.debug("{}: write_sectors: ring={} offset={} len={} locked", .{
        journal.replica,
        range.ring,
        range.offset,
        range.buffer.len,
    });

    range.locked = true;
    journal.storage.write_sectors(
        write_sectors_on_write,
        &range.completion,
        range.buffer,
        switch (range.ring) {
            .headers => .wal_headers,
            .prepares => .wal_prepares,
        },
        range.offset,
    );
    // We rely on the Storage.write_sectors() implementation being always synchronous,
    // in which case writes never actually need to be queued, or always asynchronous,
    // in which case write_sectors_on_write() doesn't have to handle lock_sectors()
    // synchronously completing a write and making a nested write_sectors_on_write() call.
    //
    // We don't currently allow Storage implementations that are sometimes synchronous and
    // sometimes asynchronous as we don't have a use case for such a Storage implementation
    // and doing so would require a significant complexity increase.
    switch (Storage.synchronicity) {
        .always_synchronous => assert(!write.range.locked),
        .always_asynchronous => assert(write.range.locked),
    }
}
|
|
2092
|
+
|
|
2093
|
+
/// Completion handler for the storage write started by `lock_sectors`: unlocks the range,
/// restarts every range that was queued behind it, then invokes the range's callback.
fn write_sectors_on_write(completion: *Storage.Write) void {
    const range: *Range = @fieldParentPtr("completion", completion);
    const write: *Journal.Write = @fieldParentPtr("range", range);
    const journal = write.journal;

    assert(write.range.locked);
    write.range.locked = false;

    log.debug("{}: write_sectors: ring={} offset={} len={} unlocked", .{
        journal.replica,
        write.range.ring,
        write.range.offset,
        write.range.buffer.len,
    });

    // Drain the list of ranges that were waiting on this range to complete.
    // Each range is unlinked before it is handed back to `lock_sectors`.
    var pending = range.next;
    range.next = null;
    while (pending) |queued| {
        assert(queued.locked == false);
        pending = queued.next;
        queued.next = null;
        journal.lock_sectors(@as(*Journal.Write, @fieldParentPtr("range", queued)));
    }

    range.callback(write);
}
|
|
2120
|
+
|
|
2121
|
+
/// Returns a sector of redundant headers, ready to be written to the specified sector.
/// `sector_index` is relative to the start of the redundant header zone.
/// The sector buffer is the entry of `write_headers_sectors` that corresponds to `write`'s
/// position within `journal.writes.items`.
fn header_sector(
    journal: *const Journal,
    sector_index: usize,
    write: *const Journal.Write,
) Sector {
    assert(journal.status != .init);
    assert(journal.writes.items.len == journal.write_headers_sectors.len);
    assert(sector_index < @divFloor(slot_count, headers_per_sector));

    const sector_slot = Slot{ .index = sector_index * headers_per_sector };
    assert(sector_slot.index < slot_count);

    const write_offset = @intFromPtr(write) - @intFromPtr(&journal.writes.items);
    const write_index = @divExact(write_offset, @sizeOf(Journal.Write));

    const sector: Sector = &journal.write_headers_sectors[write_index];
    const sector_headers = std.mem.bytesAsSlice(Header.Prepare, sector);
    assert(sector_headers.len == headers_per_sector);

    // Write headers from `headers_redundant` instead of `headers` — we need to avoid
    // writing (leaking) a redundant header before its corresponding prepare is on disk.
    stdx.copy_disjoint(
        .exact,
        Header.Prepare,
        sector_headers,
        journal.headers_redundant[sector_slot.index..][0..headers_per_sector],
    );

    for (sector_headers, sector_slot.index..) |sector_header, slot_index| {
        const slot = Slot{ .index = slot_index };
        const placeholder = sector_header.operation == .reserved and
            sector_header.checksum == 0;
        if (placeholder) {
            // Deliberately write an invalid header until the corresponding prepare is
            // repaired. (See read_prepare_with_op_and_checksum_callback()).
            assert(journal.faulty.bit(slot));
        } else {
            maybe(journal.faulty.bit(slot));
        }
    }

    return sector;
}
|
|
2168
|
+
|
|
2169
|
+
/// Result of `writing()`: whether an in-flight write targets the slot of a given header.
const Writing = enum {
    /// No in-flight write targets the slot.
    none,
    /// A message occupying the same slot as the given op is having its prepare or its
    /// redundant header written. That message may be a different version of the same op,
    /// or a different op which shares the prepare slot.
    slot,
    /// The message with exactly this op/checksum is having its prepare or its redundant
    /// header written.
    exact,
};
|
|
2179
|
+
|
|
2180
|
+
/// Reports whether any in-flight write targets the slot of `header`: `.exact` when the write
/// carries the same checksum, `.slot` when it carries a different one, `.none` otherwise.
/// At most one in-flight write may target a given slot.
pub fn writing(journal: *Journal, header: *const Header.Prepare) Writing {
    const slot = journal.slot_for_header(header);
    var found: Writing = .none;
    var writes = journal.writes.iterate();
    while (writes.next()) |write| {
        const write_header = write.message.header;

        if (journal.slot_for_op(write_header.op).index != slot.index) {
            assert(write_header.op != header.op);
            continue;
        }

        assert(found == .none);

        if (write_header.checksum == header.checksum) {
            assert(write_header.op == header.op);
            found = .exact;
        } else {
            maybe(write_header.op == header.op);
            found = .slot;
        }
    }
    return found;
}
|
|
2202
|
+
};
|
|
2203
|
+
}
|
|
2204
|
+
|
|
2205
|
+
/// @B and @C:
|
|
2206
|
+
/// This prepare is corrupt.
|
|
2207
|
+
/// We may have a valid redundant header, but need to recover the full message.
|
|
2208
|
+
///
|
|
2209
|
+
/// Case @B may be caused by crashing while writing the prepare (torn write).
|
|
2210
|
+
///
|
|
2211
|
+
/// @D:
|
|
2212
|
+
/// This is possibly a torn write to the redundant headers, so when replica_count=1 we must
|
|
2213
|
+
/// repair this locally. The probability that this results in an incorrect recovery is:
|
|
2214
|
+
/// P(crash during first WAL wrap)
|
|
2215
|
+
/// × P(redundant header is corrupt)
|
|
2216
|
+
/// × P(lost write to prepare covered by the corrupt redundant header)
|
|
2217
|
+
/// which is negligible, and does not impact replica_count>1.
|
|
2218
|
+
///
|
|
2219
|
+
/// @E:
|
|
2220
|
+
/// Valid prepare, corrupt header. One of:
|
|
2221
|
+
///
|
|
2222
|
+
/// 1. The replica crashed while writing the redundant header (torn write).
|
|
2223
|
+
/// 2. The read to the header is corrupt or misdirected.
|
|
2224
|
+
/// 3. Multiple faults, for example: the redundant header read is corrupt, and the latest prepare
|
|
2225
|
+
/// write is misdirected.
|
|
2226
|
+
///
|
|
2227
|
+
///
|
|
2228
|
+
/// @F and @G:
|
|
2229
|
+
/// The replica is recovering from a crash after writing the prepare, but before writing the
|
|
2230
|
+
/// redundant header.
|
|
2231
|
+
///
|
|
2232
|
+
///
|
|
2233
|
+
/// @G:
|
|
2234
|
+
/// One of:
|
|
2235
|
+
///
|
|
2236
|
+
/// * The prepare was written, but then truncated, so the redundant header was written as reserved.
|
|
2237
|
+
/// * A misdirected read to a reserved header.
|
|
2238
|
+
/// * The redundant header's write was lost or misdirected.
|
|
2239
|
+
///
|
|
2240
|
+
/// There is a risk of data loss in the case of 2 lost writes.
|
|
2241
|
+
///
|
|
2242
|
+
///
|
|
2243
|
+
/// @H, @I, and @J:
|
|
2244
|
+
/// The prepare/header is valid and is past the prepare_max for the replica's checkpoint. We allow
|
|
2245
|
+
/// replicas to write to a slot past prepare_max when the replica has already committed the prepare
|
|
2246
|
+
/// in that slot.
|
|
2247
|
+
///
|
|
2248
|
+
/// On startup, we must truncate all these prepares so we can replay all prepares in the checkpoint.
|
|
2249
|
+
///
|
|
2250
|
+
///
|
|
2251
|
+
/// @K:
|
|
2252
|
+
/// The redundant header is present & valid, but the corresponding prepare was a lost or misdirected
|
|
2253
|
+
/// read or write.
|
|
2254
|
+
///
|
|
2255
|
+
///
|
|
2256
|
+
/// @L:
|
|
2257
|
+
/// This slot is legitimately reserved — this may be the first fill of the log.
|
|
2258
|
+
///
|
|
2259
|
+
///
|
|
2260
|
+
/// @M and @N:
|
|
2261
|
+
/// When the redundant header & prepare header are both valid but distinct ops, always pick the
|
|
2262
|
+
/// higher op.
|
|
2263
|
+
///
|
|
2264
|
+
/// For example, consider slot_count=10, the op to the left is 12, the op to the right is 14, and
|
|
2265
|
+
/// the tiebreak is between an op=3 and op=13. Choosing op=13 over op=3 is safe because the op=3
|
|
2266
|
+
/// must be from a previous wrap — it is too far back (>pipeline) to have been replaced by a view
|
|
2267
|
+
/// change.
|
|
2268
|
+
///
|
|
2269
|
+
/// The length of the prepare pipeline is the upper bound on how many ops can be reordered during a
|
|
2270
|
+
/// view change.
|
|
2271
|
+
///
|
|
2272
|
+
/// @M:
|
|
2273
|
+
/// When the higher op belongs to the prepare, repair locally.
|
|
2274
|
+
/// The most likely cause for this case is that the log wrapped, but the redundant header write was
|
|
2275
|
+
/// lost.
|
|
2276
|
+
///
|
|
2277
|
+
/// @N:
|
|
2278
|
+
/// When the higher op belongs to the header, mark faulty.
|
|
2279
|
+
///
|
|
2280
|
+
///
|
|
2281
|
+
/// @O:
|
|
2282
|
+
/// Either:
|
|
2283
|
+
/// - The message was rewritten due to a view change.
|
|
2284
|
+
/// - The prepare write was lost, but the previous prepare had the same op (but a different view).
|
|
2285
|
+
///
|
|
2286
|
+
/// The prepare and header have different views, but regardless of which is greater (and in both of
|
|
2287
|
+
/// the above cases), recovery can't distinguish which is actually *newer*. Thus, we can't `fix`,
|
|
2288
|
+
/// despite having a valid prepare.
|
|
2289
|
+
///
|
|
2290
|
+
/// For example, if the header.view=2 and prepare.view=4, any of these scenarios are possible:
|
|
2291
|
+
/// - Before crashing, we wrote the view=4 prepare, and then lost/misdirected the write for the
|
|
2292
|
+
/// view=4 header. The view=2 header is left behind from view=2 or view=3.
|
|
2293
|
+
/// - Before crashing, we wrote the view=2 prepare, and then lost/misdirected the write for the
|
|
2294
|
+
/// view=2 header. The view=4 header is left behind from view=3.
|
|
2295
|
+
/// - Before crashing, we wrote the view=4 prepare, and then crashed before we could write the
|
|
2296
|
+
/// view=4 header. The view=2 header is left behind from view=2 or view=3.
|
|
2297
|
+
/// (This last case is the most likely.)
|
|
2298
|
+
///
|
|
2299
|
+
///
|
|
2300
|
+
/// @P:
|
|
2301
|
+
/// The redundant header matches the message's header.
|
|
2302
|
+
/// This is the usual case: both the prepare and header are correct and equivalent.
|
|
2303
|
+
const recovery_cases = cases: {
    // Matchers that select a case:
    const __ = Matcher.any;
    const _0 = Matcher.is_false;
    const _1 = Matcher.is_true;
    // The replica will abort if any of these checks fail:
    const a0 = Matcher.assert_is_false;
    const a1 = Matcher.assert_is_true;

    break :cases [_]Case{
        // Legend:
        //
        //    R>1  replica_count > 1 or standby
        //    R=1  replica_count = 1 and !standby
        //     ok  valid checksum ∧ valid cluster ∧ valid slot ∧ valid command
        //    nil  operation == reserved
        //     ✓∑  header.checksum == prepare.checksum
        //    op⌈  prepare.op is maximum of all prepare.ops
        //   op>₁  prepare.op > op_prepare_max
        //   op>₂  header.op > op_prepare_max
        //    op=  header.op == prepare.op
        //    op<  header.op < prepare.op
        //   view  header.view == prepare.view
        //
        //        Label  Decision      Header  Prepare         Compare
        //               R>1   R=1     ok  nil ok  nil op⌈ op> op> ✓∑  op= op< view
        Case.init("@A", .vsr, .vsr, .{ _0, __, _0, __, __, a0, a0, __, __, __, __ }),
        Case.init("@B", .vsr, .vsr, .{ _1, _1, _0, __, __, a0, __, __, __, __, __ }),
        Case.init("@C", .vsr, .vsr, .{ _1, _0, _0, __, __, a0, __, __, __, __, __ }),
        Case.init("@D", .vsr, .fix, .{ _0, __, _1, _1, __, __, a0, __, __, __, __ }),
        Case.init("@E", .vsr, .fix, .{ _0, __, _1, _0, _0, _0, a0, __, __, __, __ }),
        Case.init("@F", .fix, .fix, .{ _0, __, _1, _0, _1, _0, a0, __, __, __, __ }),
        Case.init("@G", .fix, .fix, .{ _1, _1, _1, _0, __, _0, __, __, __, __, __ }),
        Case.init("@H", .cut, .unr, .{ __, __, _1, _0, __, _1, __, __, __, __, __ }), // prepare.op > op_prepare_max
        Case.init("@I", .cut, .unr, .{ _1, _0, _1, _0, __, _0, _1, __, __, a0, __ }), // header.op > op_prepare_max, prepare !reserved
        Case.init("@J", .cut, .unr, .{ _1, _0, _1, _1, __, __, _1, __, __, a0, __ }), // header.op > op_prepare_max, prepare reserved
        Case.init("@K", .vsr, .vsr, .{ _1, _0, _1, _1, __, __, _0, __, __, __, __ }),
        Case.init("@L", .nil, .nil, .{ _1, _1, _1, _1, __, __, __, a1, a1, a0, a1 }), // normal path: reserved
        Case.init("@M", .fix, .fix, .{ _1, _0, _1, _0, __, _0, _0, _0, _0, _1, __ }), // header.op < prepare.op
        Case.init("@N", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _0, _0, _0, _0, __ }), // header.op > prepare.op
        Case.init("@O", .vsr, .vsr, .{ _1, _0, _1, _0, __, _0, _0, _0, _1, a0, a0 }), // header.view != prepare.view
        Case.init("@P", .eql, .eql, .{ _1, _0, _1, _0, __, _0, _0, _1, a1, a0, a1 }), // normal path: prepare
    };
};
|
|
2346
|
+
|
|
2347
|
+
/// Standalone case whose decision is `.cut_torn` for both single- and multi-replica clusters.
/// Its pattern is left undefined, so it must not be passed to `Case.check`.
const case_cut_torn: Case = .{
    .label = "@TruncateTorn",
    .decision_multiple = .cut_torn,
    .decision_single = .cut_torn,
    .pattern = undefined,
};
|
|
2353
|
+
|
|
2354
|
+
/// What recovery does with a slot, given the state of its redundant header and its prepare.
const RecoveryDecision = enum {
    /// Header and prepare are identical, so nothing needs repair.
    eql,
    /// The slot is reserved. Nothing needs repair; dirty/faulty are clear.
    nil,
    /// The prepare is intact: use it to repair the redundant header. Dirty/faulty are clear.
    fix,
    /// With replica_count>1 or a standby: repair with VSR `request_prepare`; the slot is
    /// marked dirty and faulty.
    /// With replica_count=1 and !standby: fail, because safe recovery is impossible.
    vsr,
    /// The prepare is from the next checkpoint. Truncate, set to reserved, clear
    /// dirty/faulty.
    cut,
    /// Truncate the op, setting it to reserved. Dirty/faulty are clear.
    cut_torn,
    /// This combination of header and prepare states is unreachable.
    unr,
};
|
|
2371
|
+
|
|
2372
|
+
/// How one position of a `Case` pattern constrains the corresponding boolean parameter
/// (see `Case.check`).
const Matcher = enum {
    /// Accepts either value.
    any,
    /// The case does not match when the parameter is true.
    is_false,
    /// The case does not match when the parameter is false.
    is_true,
    /// `Case.check` returns `error.ExpectFalse` when the parameter is true.
    assert_is_false,
    /// `Case.check` returns `error.ExpectTrue` when the parameter is false.
    assert_is_true,
};
|
|
2373
|
+
|
|
2374
|
+
/// One row of the recovery decision table: a label, the decision to take, and the pattern of
/// matchers that the slot's parameters must satisfy.
const Case = struct {
    label: []const u8,
    /// Decision when replica_count>1 (returned by `decision` when `solo` is false).
    decision_multiple: RecoveryDecision,
    /// Decision when replica_count=1 (returned by `decision` when `solo` is true).
    decision_single: RecoveryDecision,
    /// One matcher per parameter, in this order:
    ///
    ///  0: header_ok(header)
    ///  1: header.operation == reserved
    ///  2: header_ok(prepare) ∧ valid_checksum_body
    ///  3: prepare.operation == reserved
    ///  4: prepare.op is maximum of all prepare.ops
    ///  5: prepare.op > op_prepare_max
    ///  6: header.op > op_prepare_max
    ///  7: header.checksum == prepare.checksum
    ///  8: header.op == prepare.op
    ///  9: header.op < prepare.op
    /// 10: header.view == prepare.view
    pattern: [pattern_size]Matcher,

    const pattern_size = 11;

    /// Builds a `Case` from its four fields.
    fn init(
        label: []const u8,
        decision_multiple: RecoveryDecision,
        decision_single: RecoveryDecision,
        pattern: [pattern_size]Matcher,
    ) Case {
        return Case{
            .pattern = pattern,
            .decision_single = decision_single,
            .decision_multiple = decision_multiple,
            .label = label,
        };
    }

    /// Tests `parameters` against the pattern, position by position.
    /// Returns false at the first `is_true`/`is_false` mismatch, an error at the first
    /// violated `assert_is_true`/`assert_is_false`, and true when every position passes.
    fn check(case: *const Case, parameters: [pattern_size]bool) !bool {
        for (case.pattern, parameters) |matcher, actual| {
            switch (matcher) {
                .any => continue,
                .is_true, .is_false => {
                    const expected = (matcher == .is_true);
                    if (actual != expected) return false;
                },
                .assert_is_true => if (!actual) return error.ExpectTrue,
                .assert_is_false => if (actual) return error.ExpectFalse,
            }
        }
        return true;
    }

    /// Selects the decision for a single-replica (`solo`) or multi-replica cluster.
    fn decision(case: *const Case, solo: bool) RecoveryDecision {
        return if (solo) case.decision_single else case.decision_multiple;
    }
};
|
|
2430
|
+
|
|
2431
|
+
/// Finds the unique entry of `recovery_cases` whose pattern matches the given header/prepare.
///
/// A null `header` or `prepare` yields `false` for parameter 0 or 2 respectively, and `false`
/// for every parameter that depends on the missing value. A non-null header must pass
/// `invalid() == null`.
///
/// Violating an assertion matcher is logged and then treated as unreachable.
fn recovery_case(
    header: ?Header.Prepare,
    prepare: ?Header.Prepare,
    data: struct {
        op_max: u64,
        op_prepare_max: u64,
        op_checkpoint: u64,
    },
) *const Case {
    if (header) |h| assert(h.invalid() == null);
    if (prepare) |p| assert(p.invalid() == null);

    const header_present = header != null;
    const prepare_present = prepare != null;
    const both_present = header_present and prepare_present;

    // `and` short-circuits, so `.?` is only evaluated when the optional is non-null.
    const parameters: [Case.pattern_size]bool = .{
        header_present,
        header_present and header.?.operation == .reserved,
        prepare_present,
        prepare_present and prepare.?.operation == .reserved,
        prepare_present and prepare.?.op == data.op_max,
        prepare_present and prepare.?.op > data.op_prepare_max,
        header_present and header.?.op > data.op_prepare_max,
        both_present and header.?.checksum == prepare.?.checksum,
        both_present and header.?.op == prepare.?.op,
        both_present and header.?.op < prepare.?.op,
        both_present and header.?.view == prepare.?.view,
    };

    var matched: ?*const Case = null;
    for (&recovery_cases) |*candidate| {
        const is_match = candidate.check(parameters) catch {
            log.err("recovery_case: impossible state: case={s} parameters={any}", .{
                candidate.label,
                parameters,
            });
            unreachable;
        };
        if (!is_match) continue;

        // At most one case may match.
        assert(matched == null);
        matched = candidate;
    }

    // The recovery table is exhaustive.
    // Every combination of parameters matches exactly one case.
    return matched.?;
}
|
|
2478
|
+
|
|
2479
|
+
/// Returns a copy of the header, only if the header:
/// * has a valid checksum, and
/// * has command=prepare, and
/// * has the expected cluster, and
/// * resides in the correct slot
///   (`header.op` for a reserved header, `header.op % slot_count` otherwise).
///
/// Returns null otherwise.
fn header_ok(
    cluster: u128,
    slot: Slot,
    header: *const Header.Prepare,
) ?Header.Prepare {
    // The checksum must be validated before any other field is read.
    // Otherwise, we may hit undefined data or an out-of-bounds enum and cause a runtime crash.
    if (!header.valid_checksum()) return null;
    if (header.command != .prepare) return null;

    // A header with the wrong cluster, or in the wrong slot, may indicate a misdirected
    // read/write. All journalled headers should be reserved or else prepares.
    // A misdirected read/write to or from another storage zone may return the wrong message.
    if (header.cluster != cluster) return null;

    const slot_matches = switch (header.operation) {
        .reserved => slot.index == header.op,
        else => slot.index == header.op % slot_count,
    };
    if (!slot_matches) return null;

    return header.*;
}
|
|
2506
|
+
|
|
2507
|
+
test "recovery_cases" {
    // Verify that every pattern matches exactly one case.
    //
    // Every possible combination of parameters must either:
    // * have a matching case
    // * have a case that fails (which would result in a panic).
    const combination_count = 1 << Case.pattern_size;

    for (0..combination_count) |combination| {
        // Expand the bits of `combination` into one boolean per parameter.
        var parameters: [Case.pattern_size]bool = undefined;
        inline for (0..Case.pattern_size) |bit_index| {
            parameters[bit_index] = (combination & (1 << bit_index)) != 0;
        }

        var any_failed: bool = false;
        var matched: ?*const Case = null;
        for (&recovery_cases) |*case| {
            // Assertion patterns (a0/a1) act as wildcards for the purpose of matching.
            // Thus, it is possible for multiple cases to "match" a pattern iff they all fail an
            // assertion. (For example, simultaneous op= and op<).
            const is_match = case.check(parameters) catch {
                assert(matched == null);

                any_failed = true;
                continue;
            };
            if (!is_match) continue;

            assert(!any_failed);

            try std.testing.expectEqual(matched, null);
            matched = case;
        }

        // Exactly one of the two outcomes must hold: a failed assertion, or a single match.
        assert(any_failed == (matched == null));
    }
}
|
|
2542
|
+
|
|
2543
|
+
/// A dynamically sized bit set addressed by `Slot`, with a running count of the set bits.
pub const BitSet = struct {
    bits: std.DynamicBitSetUnmanaged,

    /// The number of bits set (updated incrementally as bits are set or cleared):
    count: u64 = 0,

    /// Allocates a set of `count` bits, all of them initially set.
    fn init_full(allocator: Allocator, count: usize) !BitSet {
        const bits = try std.DynamicBitSetUnmanaged.initFull(allocator, count);
        errdefer bits.deinit(allocator);

        const bit_set: BitSet = .{
            .bits = bits,
            .count = count,
        };
        return bit_set;
    }

    /// Frees the underlying bits, after checking that the running count is still accurate.
    fn deinit(bit_set: *BitSet, allocator: Allocator) void {
        assert(bit_set.bits.count() == bit_set.count);

        bit_set.bits.deinit(allocator);
    }

    /// Clear the bit for a slot (idempotent):
    pub fn clear(bit_set: *BitSet, slot: Slot) void {
        if (!bit_set.bits.isSet(slot.index)) return;

        bit_set.bits.unset(slot.index);
        bit_set.count -= 1;
    }

    /// Whether the bit for a slot is set:
    pub fn bit(bit_set: *const BitSet, slot: Slot) bool {
        const is_set = bit_set.bits.isSet(slot.index);
        return is_set;
    }

    /// Set the bit for a slot (idempotent):
    pub fn set(bit_set: *BitSet, slot: Slot) void {
        if (bit_set.bits.isSet(slot.index)) return;

        bit_set.bits.set(slot.index);
        bit_set.count += 1;
        assert(bit_set.count <= bit_set.bits.bit_length);
    }
};
|