tigerbeetle 0.0.36 → 0.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/tb_client/extconf.rb +13 -13
- data/ext/tb_client/tigerbeetle/LICENSE +177 -0
- data/ext/tb_client/tigerbeetle/build.zig +2327 -0
- data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
- data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
- data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
- data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
- data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
- data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
- data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
- data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
- data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
- data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
- data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
- data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
- data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
- data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
- data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
- data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
- data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
- data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
- data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
- data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
- data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
- data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
- data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
- data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
- data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
- data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
- data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
- data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
- data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
- data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
- data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
- data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
- data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
- data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
- data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
- data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
- data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
- data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
- data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
- data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
- data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
- data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
- data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
- data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
- data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
- data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
- data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
- data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
- data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
- data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
- data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
- data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
- data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
- data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
- data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
- data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
- data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
- data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
- data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
- data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
- data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
- data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
- data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
- data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
- data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
- data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
- data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
- data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
- data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
- data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
- data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
- data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
- data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
- data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
- data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
- data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
- data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
- data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
- data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
- data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
- data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
- data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
- data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
- data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
- data/lib/tb_client/shared_lib.rb +12 -5
- data/lib/tigerbeetle/platforms.rb +9 -0
- data/lib/tigerbeetle/version.rb +1 -1
- data/tigerbeetle.gemspec +22 -5
- metadata +242 -3
- data/ext/tb_client/pkg.tar.gz +0 -0
|
@@ -0,0 +1,1460 @@
|
|
|
1
|
+
const std = @import("std");
|
|
2
|
+
const builtin = @import("builtin");
|
|
3
|
+
const assert = std.debug.assert;
|
|
4
|
+
const maybe = stdx.maybe;
|
|
5
|
+
const mem = std.mem;
|
|
6
|
+
|
|
7
|
+
const constants = @import("../constants.zig");
|
|
8
|
+
const vsr = @import("../vsr.zig");
|
|
9
|
+
const schema = @import("../lsm/schema.zig");
|
|
10
|
+
|
|
11
|
+
const SuperBlockType = vsr.SuperBlockType;
|
|
12
|
+
const QueueType = @import("../queue.zig").QueueType;
|
|
13
|
+
const IOPSType = @import("../iops.zig").IOPSType;
|
|
14
|
+
const SetAssociativeCacheType = @import("../lsm/set_associative_cache.zig").SetAssociativeCacheType;
|
|
15
|
+
const stdx = @import("stdx");
|
|
16
|
+
const GridBlocksMissing = @import("./grid_blocks_missing.zig").GridBlocksMissing;
|
|
17
|
+
const Tracer = vsr.trace.Tracer;
|
|
18
|
+
|
|
19
|
+
const FreeSet = @import("./free_set.zig").FreeSet;
|
|
20
|
+
|
|
21
|
+
const log = stdx.log.scoped(.grid);
|
|
22
|
+
|
|
23
|
+
pub const BlockPtr = *align(constants.sector_size) [constants.block_size]u8;
|
|
24
|
+
pub const BlockPtrConst = *align(constants.sector_size) const [constants.block_size]u8;
|
|
25
|
+
|
|
26
|
+
/// Allocates a single block: `constants.block_size` bytes aligned to `constants.sector_size`,
/// with every byte set to zero.
///
/// Returns `error.OutOfMemory` if the allocator cannot satisfy the request.
///
/// This lives outside `GridType` so that modules which don't know about Storage can call it.
pub fn allocate_block(
    allocator: mem.Allocator,
) error{OutOfMemory}!*align(constants.sector_size) [constants.block_size]u8 {
    const bytes = try allocator.alignedAlloc(u8, constants.sector_size, constants.block_size);
    @memset(bytes, 0);
    // Narrow the slice to a pointer-to-array so the length is part of the type.
    return bytes[0..constants.block_size];
}
|
|
34
|
+
|
|
35
|
+
/// The Grid provides access to on-disk blocks (blobs of `block_size` bytes).
|
|
36
|
+
/// Each block is identified by an "address" (`u64`, beginning at 1).
|
|
37
|
+
///
|
|
38
|
+
/// Recently/frequently-used blocks are transparently cached in memory.
|
|
39
|
+
pub fn GridType(comptime Storage: type) type {
|
|
40
|
+
const block_size = constants.block_size;
|
|
41
|
+
const SuperBlock = SuperBlockType(Storage);
|
|
42
|
+
|
|
43
|
+
return struct {
|
|
44
|
+
const Grid = @This();
|
|
45
|
+
const CheckpointTrailer = vsr.CheckpointTrailerType(Storage);
|
|
46
|
+
|
|
47
|
+
pub const read_iops_max = constants.grid_iops_read_max;
|
|
48
|
+
pub const write_iops_max = constants.grid_iops_write_max;
|
|
49
|
+
|
|
50
|
+
pub const RepairTable = GridBlocksMissing.RepairTable;
|
|
51
|
+
pub const RepairTableResult = GridBlocksMissing.RepairTableResult;
|
|
52
|
+
pub const Reservation = @import("./free_set.zig").Reservation;
|
|
53
|
+
|
|
54
|
+
// Grid just reuses the Storage's NextTick abstraction for simplicity.
|
|
55
|
+
pub const NextTick = Storage.NextTick;
|
|
56
|
+
|
|
57
|
+
pub const Write = struct {
    /// Function pointer that receives this `Write`.
    /// NOTE(review): presumably invoked once the write has completed — confirm against caller.
    callback: *const fn (*Grid.Write) void,
    /// Address of the block being written.
    address: u64,
    /// Whether this is a repair write (see `checkpoint_id`, which only constrains
    /// non-repair writes).
    repair: bool,
    block: *BlockPtr,
    /// The checkpoint that was current when the write started.
    /// Used to verify that the checkpoint does not advance while a (non-repair) write
    /// is in flight.
    checkpoint_id: u128,

    /// Link used by the `Grid.write_queue` linked list.
    link: QueueType(Write).Link = .{},
};
|
|
69
|
+
|
|
70
|
+
/// Per-slot state stored in `Grid.write_iops`: pairs a `Storage.Write` completion
/// with the `Write` it belongs to and the owning `Grid`.
const WriteIOP = struct {
    grid: *Grid,
    completion: Storage.Write,
    write: *Write,
};
|
|
75
|
+
|
|
76
|
+
/// The two flavours of read completion callback.
const ReadBlockCallback = union(enum) {
    /// A failed local read is reported to the callback as an error result.
    from_local_storage: *const fn (*Grid.Read, ReadBlockResult) void,
    /// A failed local read is instead added to a linked list that Replica can
    /// interrogate on each tick(). The callback is not called until the block
    /// has been recovered.
    from_local_or_global_storage: *const fn (*Grid.Read, BlockPtrConst) void,
};
|
|
84
|
+
|
|
85
|
+
pub const Read = struct {
    callback: ReadBlockCallback,
    /// Address of the block being read.
    address: u64,
    /// Checksum of the block being read.
    checksum: u128,
    /// The checkpoint that was current when the read started.
    /// Used to verify that the checkpoint does not advance while the read is in progress.
    checkpoint_id: u128,
    checkpoint_durable: bool,

    /// coherent=true means:
    /// - the block (address+checksum) belongs to the current checkpoint,
    /// - the read completes before the next checkpoint occurs,
    /// - callback == .from_local_or_global_storage.
    /// coherent=false means:
    /// - the block (address+checksum) does not necessarily belong to the current checkpoint,
    /// - the read may complete after a future checkpoint,
    /// - callback == .from_local_storage.
    coherent: bool,
    cache_read: bool,
    cache_write: bool,
    pending: ReadPending = .{},
    resolves: QueueType(ReadPending) = QueueType(ReadPending).init(.{ .name = null }),

    grid: *Grid,
    next_tick: Grid.NextTick = undefined,

    /// Link used by the Grid.read_queue/Grid.read_global_queue linked lists.
    link: QueueType(Read).Link = .{},
};
|
|
114
|
+
|
|
115
|
+
/// Outcome of reading a block. The different invalid variants exist only so that
/// the reason can be logged; logic does not depend on which one occurred.
pub const ReadBlockResult = union(enum) {
    valid: BlockPtrConst,
    /// The block header's checksum is invalid.
    invalid_checksum,
    /// The block body's checksum is invalid.
    invalid_checksum_body,
    /// The block header is valid, but `header.command` is not `block`.
    /// (Misdirected IO can cause this.)
    unexpected_command,
    /// A valid block was read, but it is not the block that was expected.
    unexpected_checksum,
    /// A valid, expected block was read, but the padding of its last sector is
    /// corrupt, so it will be repaired just to be safe.
    invalid_padding,
};
|
|
132
|
+
|
|
133
|
+
/// Queue node embedded in `Read` (as `Read.pending`) and linked into
/// `QueueType(ReadPending)` queues.
const ReadPending = struct {
    /// Link used by the Read.resolves linked lists.
    link: QueueType(ReadPending).Link = .{},
};
|
|
137
|
+
|
|
138
|
+
/// Per-slot state stored in `Grid.read_iops`: pairs a `Storage.Read` completion
/// with the `Read` it belongs to.
const ReadIOP = struct {
    completion: Storage.Read,
    read: *Read,
};
|
|
142
|
+
|
|
143
|
+
/// Key/hash functions handed to `SetAssociativeCacheType` for the grid cache,
/// whose key and value types are both `u64` block addresses.
const cache_interface = struct {
    /// Hashes a block address. Address zero is rejected by assertion.
    inline fn hash_address(address: u64) u64 {
        assert(address > 0);
        return stdx.hash_inline(address);
    }

    /// Returns the address stored behind the pointer.
    inline fn address_from_address(address: *const u64) u64 {
        return address.*;
    }
};
|
|
153
|
+
|
|
154
|
+
const set_associative_cache_ways = 16;

/// Set-associative cache over block addresses (`u64` key, `u64` value).
pub const Cache = SetAssociativeCacheType(
    u64,
    u64,
    cache_interface.address_from_address,
    cache_interface.hash_address,
    .{
        .ways = set_associative_cache_ways,
        // layout.cache_line_size does not feed into any computation; the
        // SetAssociativeCache only uses it to assert() against sub-optimal values.
        // Here it is preferable to let the user run with a much smaller grid cache
        // (256MiB vs 1GiB!) than to insist on being completely optimal.
        .cache_line_size = 16,
        .value_alignment = @alignOf(u64),
    },
);
|
|
171
|
+
|
|
172
|
+
superblock: *SuperBlock,
|
|
173
|
+
trace: *Tracer,
|
|
174
|
+
free_set: FreeSet,
|
|
175
|
+
free_set_checkpoint_blocks_acquired: CheckpointTrailer,
|
|
176
|
+
free_set_checkpoint_blocks_released: CheckpointTrailer,
|
|
177
|
+
|
|
178
|
+
blocks_missing: GridBlocksMissing,
|
|
179
|
+
|
|
180
|
+
cache: Cache,
|
|
181
|
+
/// Each entry in cache has a corresponding block.
|
|
182
|
+
cache_blocks: []BlockPtr,
|
|
183
|
+
|
|
184
|
+
write_iops: IOPSType(WriteIOP, write_iops_max) = .{},
|
|
185
|
+
write_queue: QueueType(Write) = QueueType(Write).init(.{ .name = "grid_write" }),
|
|
186
|
+
|
|
187
|
+
// Each read_iops has a corresponding block.
|
|
188
|
+
read_iop_blocks: [read_iops_max]BlockPtr,
|
|
189
|
+
read_iops: IOPSType(ReadIOP, read_iops_max) = .{},
|
|
190
|
+
read_queue: QueueType(Read) = QueueType(Read).init(.{ .name = "grid_read" }),
|
|
191
|
+
|
|
192
|
+
// List of Read.pending's which are in `read_queue` but also waiting for a free `read_iops`.
|
|
193
|
+
read_pending_queue: QueueType(ReadPending) = QueueType(ReadPending).init(.{
|
|
194
|
+
.name = "grid_read_pending",
|
|
195
|
+
}),
|
|
196
|
+
/// List of `Read`s which are waiting for a block repair from another replica.
|
|
197
|
+
/// (Reads in this queue have already failed locally).
|
|
198
|
+
///
|
|
199
|
+
/// Invariants:
|
|
200
|
+
/// - For each read, read.callback=from_local_or_global_storage.
|
|
201
|
+
read_global_queue: QueueType(Read) = QueueType(Read).init(.{ .name = "grid_read_global" }),
|
|
202
|
+
// True if there's a read that is resolving callbacks.
|
|
203
|
+
// If so, the read cache must not be invalidated.
|
|
204
|
+
read_resolving: bool = false,
|
|
205
|
+
|
|
206
|
+
callback: union(enum) {
|
|
207
|
+
none,
|
|
208
|
+
open: *const fn (*Grid) void,
|
|
209
|
+
checkpoint: *const fn (*Grid) void,
|
|
210
|
+
checkpoint_durable: *const fn (*Grid) void,
|
|
211
|
+
cancel: *const fn (*Grid) void,
|
|
212
|
+
} = .none,
|
|
213
|
+
|
|
214
|
+
canceling_tick_context: NextTick = undefined,
|
|
215
|
+
|
|
216
|
+
/// Allocates everything a `Grid` owns and returns it by value.
///
/// Allocation order: the free set, the two free-set checkpoint trailers
/// (blocks acquired, blocks released), the missing-blocks tracker, the cache block
/// array and its blocks, the cache itself, and finally one block per read IOP.
/// If any step fails, everything allocated by the earlier steps is released before
/// the error is returned.
pub fn init(allocator: mem.Allocator, options: struct {
    superblock: *SuperBlock,
    trace: *Tracer,
    cache_blocks_count: u64 = Cache.value_count_max_multiple,
    missing_blocks_max: usize,
    missing_tables_max: usize,
    blocks_released_prior_checkpoint_durability_max: usize,
}) !Grid {
    var free_set = try FreeSet.init(allocator, .{
        .grid_size_limit = options.superblock.grid_size_limit(),
        .blocks_released_prior_checkpoint_durability_max = options
            .blocks_released_prior_checkpoint_durability_max,
    });
    errdefer free_set.deinit(allocator);

    // Both trailers are initialized with the free set's `encode_size_max()`.
    const trailer_size_max = free_set.encode_size_max();

    var trailer_blocks_acquired =
        try CheckpointTrailer.init(allocator, .free_set, trailer_size_max);
    errdefer trailer_blocks_acquired.deinit(allocator);

    var trailer_blocks_released =
        try CheckpointTrailer.init(allocator, .free_set, trailer_size_max);
    errdefer trailer_blocks_released.deinit(allocator);

    var blocks_missing = try GridBlocksMissing.init(allocator, .{
        .blocks_max = options.missing_blocks_max,
        .tables_max = options.missing_tables_max,
    });
    errdefer blocks_missing.deinit(allocator);

    const cache_blocks = try allocator.alloc(BlockPtr, options.cache_blocks_count);
    errdefer allocator.free(cache_blocks);

    // Tracks how many cache blocks exist so far, so that an error at any later point
    // frees exactly the blocks that were allocated.
    var cache_blocks_allocated: usize = 0;
    errdefer for (cache_blocks[0..cache_blocks_allocated]) |block| allocator.free(block);
    for (cache_blocks) |*cache_block| {
        cache_block.* = try allocate_block(allocator);
        cache_blocks_allocated += 1;
    }

    var cache = try Cache.init(allocator, options.cache_blocks_count, .{ .name = "grid" });
    errdefer cache.deinit(allocator);

    var read_iop_blocks: [read_iops_max]BlockPtr = undefined;

    // Same counting scheme as for the cache blocks above.
    var read_iop_blocks_allocated: usize = 0;
    errdefer for (read_iop_blocks[0..read_iop_blocks_allocated]) |block| {
        allocator.free(block);
    };
    for (&read_iop_blocks) |*read_iop_block| {
        read_iop_block.* = try allocate_block(allocator);
        read_iop_blocks_allocated += 1;
    }

    return Grid{
        .superblock = options.superblock,
        .trace = options.trace,
        .free_set = free_set,
        .free_set_checkpoint_blocks_acquired = trailer_blocks_acquired,
        .free_set_checkpoint_blocks_released = trailer_blocks_released,
        .blocks_missing = blocks_missing,
        .cache = cache,
        .cache_blocks = cache_blocks,
        .read_iop_blocks = read_iop_blocks,
    };
}
|
|
278
|
+
|
|
279
|
+
/// Releases everything allocated by `init`, using the same allocator, and then
/// marks the whole `Grid` as undefined.
pub fn deinit(grid: *Grid, allocator: mem.Allocator) void {
    // Per-IOP read blocks.
    for (grid.read_iop_blocks) |read_block| {
        allocator.free(read_block);
    }

    // Cache blocks first, then the array that holds the pointers to them.
    for (grid.cache_blocks) |cache_block| {
        allocator.free(cache_block);
    }
    allocator.free(grid.cache_blocks);

    grid.cache.deinit(allocator);
    grid.blocks_missing.deinit(allocator);

    grid.free_set_checkpoint_blocks_acquired.deinit(allocator);
    grid.free_set_checkpoint_blocks_released.deinit(allocator);

    grid.free_set.deinit(allocator);

    grid.* = undefined;
}
|
|
295
|
+
|
|
296
|
+
/// Starts opening the grid. Requires that no other grid callback is pending.
///
/// Records `callback` as the pending `.open` callback, then opens both free-set
/// checkpoint trailers (blocks acquired first, then blocks released), each with
/// the reference taken from the working superblock.
pub fn open(grid: *Grid, callback: *const fn (*Grid) void) void {
    assert(grid.callback == .none);
    grid.callback = .{ .open = callback };

    const reference_blocks_acquired =
        grid.superblock.working.free_set_reference(.blocks_acquired);
    grid.free_set_checkpoint_blocks_acquired.open(
        grid,
        reference_blocks_acquired,
        open_free_set_callback_blocks_acquired,
    );

    const reference_blocks_released =
        grid.superblock.working.free_set_reference(.blocks_released);
    grid.free_set_checkpoint_blocks_released.open(
        grid,
        reference_blocks_released,
        open_free_set_callback_blocks_released,
    );
}
|
|
311
|
+
|
|
312
|
+
/// Callback passed to the `blocks_acquired` trailer's `open()`.
/// Asserts that the trailer has no callback pending, recovers the owning `Grid` from
/// the trailer field, and forwards to the shared `open_free_set_callback`.
fn open_free_set_callback_blocks_acquired(trailer: *CheckpointTrailer) void {
    assert(trailer.callback == .none);

    const owner: *Grid = @fieldParentPtr("free_set_checkpoint_blocks_acquired", trailer);
    owner.open_free_set_callback();
}
|
|
317
|
+
|
|
318
|
+
/// Callback passed to the `blocks_released` trailer's `open()`.
/// Asserts that the trailer has no callback pending, recovers the owning `Grid` from
/// the trailer field, and forwards to the shared `open_free_set_callback`.
fn open_free_set_callback_blocks_released(trailer: *CheckpointTrailer) void {
    assert(trailer.callback == .none);

    const owner: *Grid = @fieldParentPtr("free_set_checkpoint_blocks_released", trailer);
    owner.open_free_set_callback();
}
|
|
323
|
+
|
|
324
|
+
/// Join point for the two trailer `open` callbacks.
///
/// Returns early while either trailer is still opening. Once both are done, opens the
/// free set from the decoded trailers, cross-checks it against the trailers and the
/// working superblock, clears `grid.callback` and invokes the `open()` callback.
fn open_free_set_callback(grid: *Grid) void {
    const trailer_acquired = &grid.free_set_checkpoint_blocks_acquired;
    const trailer_released = &grid.free_set_checkpoint_blocks_released;

    // We are only called from a trailer callback, so at least one trailer is finished.
    assert(trailer_acquired.callback == .none or trailer_released.callback == .none);

    // Read the payload before the early returns (the `.open` tag must be active here).
    const callback = grid.callback.open;

    // Either CheckpointTrailer (`blocks_acquired` first, then `blocks_released`) may
    // still be reading its blocks.
    for ([_]*CheckpointTrailer{ trailer_acquired, trailer_released }) |trailer| {
        if (trailer.callback == .open) return;
        assert(trailer.callback == .none);
    }

    {
        assert(!grid.free_set.opened);
        defer assert(grid.free_set.opened);

        const block_count_acquired = trailer_acquired.block_count();
        const block_count_released = trailer_released.block_count();
        grid.free_set.open(.{
            .encoded = .{
                .blocks_acquired = trailer_acquired.decode_chunks(),
                .blocks_released = trailer_released.decode_chunks(),
            },
            .free_set_block_addresses = .{
                .blocks_acquired = trailer_acquired.block_addresses[0..block_count_acquired],
                .blocks_released = trailer_released.block_addresses[0..block_count_released],
            },
        });
        assert((grid.free_set.count_acquired() > 0) == (trailer_acquired.size > 0));

        // The highest acquired address must be compatible with the checkpoint's
        // storage_size.
        const storage_size_expected: u64 = size: {
            const address = grid.free_set.highest_address_acquired() orelse {
                // Nothing acquired: both trailers are empty and nothing is released.
                assert(trailer_acquired.size == 0);
                assert(trailer_released.size == 0);

                assert(grid.free_set.count_released() == 0);
                break :size vsr.superblock.data_file_size_min;
            };
            assert(address > 0);
            assert(trailer_acquired.size > 0);
            maybe(trailer_released.size == 0);

            break :size vsr.superblock.data_file_size_min + address * constants.block_size;
        };
        assert(storage_size_expected ==
            grid.superblock.working.vsr_state.checkpoint.storage_size);

        // The released count is at least the number of blocks holding the trailers.
        assert(grid.free_set.count_released() >=
            (trailer_acquired.block_count() + trailer_released.block_count()));

        assert(grid.free_set.count_reservations() == 0);
    }

    grid.callback = .none;
    callback(grid);
}
|
|
388
|
+
|
|
389
|
+
/// Checkpointing the free set is a delicate, ordered process:
/// 1. Encode the free set.
/// 2. Derive how many blocks are needed to store the encoding.
/// 3. Allocate free set blocks for the encoding (in the old checkpoint).
/// 4. Write the free set blocks to disk.
/// 5. Mark the free set's own blocks as released (but not yet free).
///
/// This function performs step 1 and then calls CheckpointTrailer.checkpoint for each
/// trailer, which performs steps 2-4.
/// Step 5 is the caller's job, via Grid.mark_checkpoint_not_durable.
pub fn checkpoint(grid: *Grid, callback: *const fn (*Grid) void) void {
    assert(grid.callback == .none);
    assert(grid.read_global_queue.empty());

    const trailer_acquired = &grid.free_set_checkpoint_blocks_acquired;
    const trailer_released = &grid.free_set_checkpoint_blocks_released;

    {
        assert(grid.free_set.count_reservations() == 0);

        // Encode both bitsets into the trailers' chunk buffers.
        const encoded = grid.free_set.encode_chunks(
            trailer_acquired.encode_chunks(),
            trailer_released.encode_chunks(),
        );

        trailer_acquired.size = encoded.encoded_size_blocks_acquired;
        trailer_released.size = encoded.encoded_size_blocks_released;

        // The encoded sizes are whole multiples of the free set word size.
        assert(trailer_acquired.size % @sizeOf(FreeSet.Word) == 0);
        assert(trailer_released.size % @sizeOf(FreeSet.Word) == 0);
    }

    grid.callback = .{ .checkpoint = callback };
    trailer_acquired.checkpoint(checkpoint_free_set_blocks_acquired_callback);
    trailer_released.checkpoint(checkpoint_free_set_blocks_released_callback);
}
|
|
425
|
+
|
|
426
|
+
/// Completion callback for checkpointing the `blocks_acquired` trailer.
/// Recovers the owning grid from the trailer field and runs the checkpoint join.
fn checkpoint_free_set_blocks_acquired_callback(trailer: *CheckpointTrailer) void {
    assert(trailer.callback == .none);

    const grid: *Grid = @fieldParentPtr("free_set_checkpoint_blocks_acquired", trailer);
    assert(grid.callback == .checkpoint);
    Grid.checkpoint_join(grid);
}
|
|
433
|
+
|
|
434
|
+
/// Completion callback for checkpointing the `blocks_released` trailer.
/// Recovers the owning grid from the trailer field and runs the checkpoint join.
fn checkpoint_free_set_blocks_released_callback(trailer: *CheckpointTrailer) void {
    assert(trailer.callback == .none);

    const grid: *Grid = @fieldParentPtr("free_set_checkpoint_blocks_released", trailer);
    assert(grid.callback == .checkpoint);
    Grid.checkpoint_join(grid);
}
|
|
441
|
+
|
|
442
|
+
/// Join point for the two trailer `checkpoint` callbacks.
/// Once neither trailer is still checkpointing, clears `grid.callback` and invokes
/// the callback that was passed to `checkpoint()`.
fn checkpoint_join(grid: *Grid) void {
    assert(grid.callback == .checkpoint);
    assert(grid.read_global_queue.empty());

    // Checked in order: `blocks_acquired`, then `blocks_released`.
    for ([_]*CheckpointTrailer{
        &grid.free_set_checkpoint_blocks_acquired,
        &grid.free_set_checkpoint_blocks_released,
    }) |trailer| {
        // Still writing this free set bitset.
        if (trailer.callback == .checkpoint) return;
        assert(trailer.callback == .none);
    }

    const callback = grid.callback.checkpoint;
    grid.callback = .none;
    callback(grid);
}
|
|
460
|
+
|
|
461
|
+
/// Marks the current checkpoint as not durable, and only then releases the blocks that
/// were acquired for the FreeSet checkpoints (they are to be freed when the *next*
/// checkpoint becomes durable).
///
/// The order matters: releasing these blocks before the checkpoint is marked as not
/// durable would cause them to be erroneously freed when the *current* checkpoint
/// becomes durable.
pub fn mark_checkpoint_not_durable(grid: *Grid) void {
    assert(grid.free_set.checkpoint_durable);
    defer assert(!grid.free_set.checkpoint_durable);

    grid.free_set.mark_checkpoint_not_durable();

    // Release the trailer blocks: `blocks_acquired` first, then `blocks_released`.
    for ([_]*CheckpointTrailer{
        &grid.free_set_checkpoint_blocks_acquired,
        &grid.free_set_checkpoint_blocks_released,
    }) |trailer| {
        grid.release(trailer.block_addresses[0..trailer.block_count()]);
    }
}
|
|
477
|
+
|
|
478
|
+
/// Called once the checkpoint is durable on a commit quorum of replicas. Making the
/// checkpoint durable takes two steps:
/// 1. Await all pending repair-writes to blocks that are about to be freed.
/// 2. Mark currently released blocks as free and eligible for acquisition.
///
/// This function handles step 1.
/// The caller is responsible for calling FreeSet.checkpoint which handles 2.
pub fn checkpoint_durable(grid: *Grid, callback: *const fn (*Grid) void) void {
    assert(!grid.free_set.checkpoint_durable);
    grid.callback = .{ .checkpoint_durable = callback };

    grid.blocks_missing.checkpoint_durable_commence(&grid.free_set);

    // Writes are still aborting: `write_block_callback` re-runs the join later.
    if (grid.blocks_missing.state.checkpoint_durable.aborting != 0) return;

    Grid.checkpoint_durable_join(grid);
}
|
|
493
|
+
|
|
494
|
+
/// Completes `checkpoint_durable()` once `blocks_missing` reports that it is done.
/// Verifies that no remaining write targets a free or about-to-be-freed block, marks
/// the free set checkpoint as durable, and invokes the stored callback.
fn checkpoint_durable_join(grid: *Grid) void {
    assert(grid.callback == .checkpoint_durable);

    if (!grid.blocks_missing.checkpoint_durable_complete()) {
        // Some blocks released during the previous checkpoint interval are still being
        // repaired, so at least one write must be executing.
        assert(grid.write_iops.executing() > 0);
        return;
    }

    // Every queued write is a repair of a block that is neither free nor about to be
    // freed.
    var queued_writes = grid.write_queue.iterate();
    while (queued_writes.next()) |write| {
        assert(write.repair);
        assert(!grid.free_set.is_free(write.address));
        assert(!grid.free_set.to_be_freed_at_checkpoint_durability(write.address));
    }

    // Likewise for the address of every executing write.
    var executing_writes = grid.write_iops.iterate();
    while (executing_writes.next()) |iop| {
        assert(!grid.free_set.is_free(iop.write.address));
        assert(!grid.free_set.to_be_freed_at_checkpoint_durability(iop.write.address));
    }

    // No writes target released blocks anymore, so it is safe to mark them as free and
    // to mark the checkpoint as durable.
    assert(!grid.free_set.checkpoint_durable);
    defer assert(grid.free_set.checkpoint_durable);

    grid.free_set.mark_checkpoint_durable();

    const callback = grid.callback.checkpoint_durable;
    grid.callback = .none;
    callback(grid);
}
|
|
527
|
+
|
|
528
|
+
/// Cancel the grid's pending work: cancel `blocks_missing`, reset every read/write
/// queue, reset the storage's LSM next-tick queue, and schedule
/// `cancel_tick_callback` on the `.vsr` next-tick queue.
/// `callback` is stored as the pending `.cancel` callback.
pub fn cancel(grid: *Grid, callback: *const fn (*Grid) void) void {
    switch (grid.callback) {
        // grid.open() is cancellable the same way that read_block()/write_block() are.
        .none, .open, .checkpoint_durable => {},
        .checkpoint, .cancel => unreachable,
    }

    grid.callback = .{ .cancel = callback };

    grid.blocks_missing.cancel();
    grid.read_queue.reset();
    grid.read_pending_queue.reset();
    grid.read_global_queue.reset();
    grid.write_queue.reset();
    grid.superblock.storage.reset_next_tick_lsm();
    grid.superblock.storage.on_next_tick(
        .vsr,
        cancel_tick_callback,
        &grid.canceling_tick_context,
    );
}
|
|
552
|
+
|
|
553
|
+
/// Next-tick callback scheduled by `cancel()`.
/// Does nothing unless the grid is still cancelling; otherwise checks that all queues
/// are empty and attempts to complete the cancellation.
fn cancel_tick_callback(next_tick: *NextTick) void {
    const grid: *Grid = @alignCast(@fieldParentPtr("canceling_tick_context", next_tick));

    if (grid.callback == .cancel) {
        assert(grid.read_queue.empty());
        assert(grid.read_pending_queue.empty());
        assert(grid.read_global_queue.empty());
        assert(grid.write_queue.empty());

        Grid.cancel_join_callback(grid);
    }
}
|
|
564
|
+
|
|
565
|
+
/// Completes `cancel()` once no read or write IOP is executing: clears
/// `grid.callback` and invokes the stored cancel callback. Otherwise returns without
/// side effects.
fn cancel_join_callback(grid: *Grid) void {
    assert(grid.callback == .cancel);
    assert(grid.read_queue.empty());
    assert(grid.read_pending_queue.empty());
    assert(grid.read_global_queue.empty());
    assert(grid.write_queue.empty());

    // Wait for in-flight IO to drain.
    if (grid.read_iops.executing() != 0) return;
    if (grid.write_iops.executing() != 0) return;

    const callback = grid.callback.cancel;
    grid.callback = .none;

    callback(grid);
}
|
|
581
|
+
|
|
582
|
+
/// Schedule `callback` with `next_tick` on the storage's `.lsm` next-tick queue.
/// Must not be called while the grid is cancelling.
pub fn on_next_tick(
    grid: *Grid,
    callback: *const fn (*Grid.NextTick) void,
    next_tick: *Grid.NextTick,
) void {
    assert(grid.callback != .cancel);

    grid.superblock.storage.on_next_tick(
        .lsm,
        callback,
        next_tick,
    );
}
|
|
590
|
+
|
|
591
|
+
/// Reserve `blocks_count` blocks from the free set.
/// If the free set cannot fill the reservation, this aborts via `vsr.fatal`.
/// (If a caller ever prefers a null return, this could be split into panicking and
/// non-panicking variants.)
pub fn reserve(grid: *Grid, blocks_count: usize) Reservation {
    assert(grid.callback == .none);

    const reservation = grid.free_set.reserve(blocks_count) orelse vsr.fatal(
        .storage_size_would_exceed_limit,
        "data file would become too large size={} + reservation={} > limit={}, " ++
            "restart the replica increasing '--limit-storage'",
        .{
            grid.superblock.working.vsr_state.checkpoint.storage_size,
            blocks_count * constants.block_size,
            grid.superblock.storage_size_limit,
        },
    );
    return reservation;
}
|
|
607
|
+
|
|
608
|
+
/// Give a reservation back to the free set.
/// Only valid while no grid operation is pending (`grid.callback == .none`).
pub fn forfeit(grid: *Grid, reservation: Reservation) void {
    assert(grid.callback == .none);

    grid.free_set.forfeit(reservation);
}
|
|
613
|
+
|
|
614
|
+
/// Acquire a block from `reservation` and return its address.
/// The caller must not acquire more blocks than it reserved; an exhausted
/// reservation is treated as unreachable.
pub fn acquire(grid: *Grid, reservation: Reservation) u64 {
    assert(grid.callback == .none);

    return grid.free_set.acquire(reservation) orelse unreachable;
}
|
|
620
|
+
|
|
621
|
+
/// Release `addresses`. Use this rather than calling release() on the free set
/// directly, because it also demotes each address within the block cache.
/// Demoting reduces conflict misses in the block cache, by freeing ways soon after
/// they are released.
///
/// The blocks are not removed from the cache — they can be read until the next
/// checkpoint.
///
/// Asserts that no address is the target of a non-repair (`.create`) write.
pub fn release(grid: *Grid, addresses: []const u64) void {
    assert(grid.callback == .none);

    for (addresses) |address| {
        assert(address > 0);

        // Releasing an address that is being read from is safe: the superblock will
        // not allow it to be overwritten before the end of the bar.
        const write_state = grid.writing(address, null);
        assert(write_state != .create);

        grid.cache.demote(address);
        grid.free_set.release(address);
    }
}
|
|
643
|
+
|
|
644
|
+
/// Result of `writing()`: the kind of write (if any) that targets an address.
const Writing = enum {
    /// The address is being written to by a non-repair write.
    create,
    /// The address is being written to by a repair write.
    repair,
    /// The address is not being written to.
    not_writing,
};
|
|
645
|
+
|
|
646
|
+
/// Report whether `address` is the target of a queued or executing write:
/// - `.create` when it is being written to by a non-repair,
/// - `.repair` when it is being written to by a repair,
/// - `.not_writing` otherwise.
///
/// At most one write may target the address (asserted).
/// When `block` is non-null, also asserts that no write is using that block pointer.
pub fn writing(grid: *Grid, address: u64, block: ?BlockPtrConst) Writing {
    assert(address > 0);

    var result: Writing = .not_writing;

    // Writes waiting for an IOP.
    var queued_writes = grid.write_queue.iterate();
    while (queued_writes.next()) |queued_write| {
        assert(block != queued_write.block.*);
        if (address != queued_write.address) continue;

        assert(result == .not_writing);
        result = if (queued_write.repair) .repair else .create;
    }

    // Writes that hold an IOP.
    var executing_writes = grid.write_iops.iterate();
    while (executing_writes.next()) |iop| {
        assert(block != iop.write.block.*);
        if (address != iop.write.address) continue;

        assert(result == .not_writing);
        result = if (iop.write.repair) .repair else .create;
    }

    return result;
}
|
|
677
|
+
|
|
678
|
+
/// Asserts that no coherent read (queued, global, or executing) targets `address`;
/// non-coherent reads are disregarded.
/// When `block` is non-null, also asserts that it is not one of the blocks used by
/// the executing read IOPs.
fn assert_not_reading(grid: *Grid, address: u64, block: ?BlockPtrConst) void {
    assert(address > 0);

    for ([_]*const QueueType(Read){
        &grid.read_queue,
        &grid.read_global_queue,
    }) |queue| {
        var queued_reads = queue.iterate();
        while (queued_reads.next()) |queued_read| {
            assert(!queued_read.coherent or address != queued_read.address);
        }
    }

    var executing_reads = grid.read_iops.iterate();
    while (executing_reads.next()) |iop| {
        assert(!iop.read.coherent or address != iop.read.address);

        const iop_block = grid.read_iop_blocks[grid.read_iops.index(iop)];
        assert(block != iop_block);
    }
}
|
|
705
|
+
|
|
706
|
+
/// Asserts that the only grid IO outstanding is repair-related: no global reads, no
/// coherent queued reads, and only repair writes (queued or executing), none of which
/// target a free address.
pub fn assert_only_repairing(grid: *Grid) void {
    assert(grid.callback != .cancel);
    assert(grid.read_global_queue.empty());

    var queued_reads = grid.read_queue.iterate();
    while (queued_reads.next()) |read| {
        // Scrubber reads are independent from LSM operations.
        assert(!read.coherent);
    }

    var queued_writes = grid.write_queue.iterate();
    while (queued_writes.next()) |write| {
        assert(write.repair);
        assert(!grid.free_set.is_free(write.address));
    }

    var executing_writes = grid.write_iops.iterate();
    while (executing_writes.next()) |iop| {
        assert(iop.write.repair);
        assert(!grid.free_set.is_free(iop.write.address));
    }
}
|
|
728
|
+
|
|
729
|
+
/// Try to satisfy a read in `read_global_queue` with `block`.
/// The first queued read whose checksum and address match the block's header is
/// removed from the queue and resolved with the block.
/// Returns true if such a read was found, false otherwise.
pub fn fulfill_block(grid: *Grid, block: BlockPtrConst) bool {
    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);

    const block_header = schema.header_from_block(block);
    assert(block_header.cluster == grid.superblock.working.cluster);

    var reads_iterator = grid.read_global_queue.iterate();
    while (reads_iterator.next()) |read| {
        if (read.checksum != block_header.checksum) continue;
        if (read.address != block_header.address) continue;

        assert(block_header.release.value <=
            grid.superblock.working.vsr_state.checkpoint.release.value);
        grid.read_global_queue.remove(read);
        grid.read_block_resolve(read, .{ .valid = block });
        return true;
    }

    return false;
}
|
|
750
|
+
|
|
751
|
+
/// Returns whether `blocks_missing` is waiting for the block identified by
/// `address` and `checksum`.
pub fn repair_block_waiting(grid: *Grid, address: u64, checksum: u128) bool {
    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);

    const waiting = grid.blocks_missing.block_waiting(address, checksum);
    return waiting;
}
|
|
756
|
+
|
|
757
|
+
/// Write a block that should already exist but (maybe) doesn't, because of either:
/// - a disk fault, or
/// - the block having been missed due to state sync.
///
/// The block must be one that `blocks_missing` is waiting for, must not already be
/// the target of a write, and must not be free.
///
/// NOTE: This will consume `block` and replace it with a fresh block.
pub fn repair_block(
    grid: *Grid,
    callback: *const fn (*Grid.Write) void,
    write: *Grid.Write,
    block: *BlockPtr,
) void {
    const block_header = schema.header_from_block(block.*);
    const address = block_header.address;
    const checksum = block_header.checksum;

    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);
    assert(grid.writing(address, block.*) == .not_writing);
    assert(grid.blocks_missing.block_waiting(address, checksum));
    assert(!grid.free_set.is_free(address));

    grid.blocks_missing.write_commence(address, checksum);
    grid.write_block(callback, write, block, .repair);
}
|
|
778
|
+
|
|
779
|
+
/// Write a block for the first time.
///
/// Only permitted while the grid is idle or checkpointing; while checkpointing, the
/// block must be a `.free_set` block (and vice versa). The address must not be free,
/// awaited by `blocks_missing`, or the target of any write or coherent read.
///
/// NOTE: This will consume `block` and replace it with a fresh block.
pub fn create_block(
    grid: *Grid,
    callback: *const fn (*Grid.Write) void,
    write: *Grid.Write,
    block: *BlockPtr,
) void {
    const block_header = schema.header_from_block(block.*);
    const address = block_header.address;

    assert(grid.superblock.opened);
    assert(grid.callback == .none or grid.callback == .checkpoint);
    assert((grid.callback == .checkpoint) == (block_header.block_type == .free_set));
    assert(grid.writing(address, block.*) == .not_writing);
    assert(!grid.blocks_missing.block_waiting(address, block_header.checksum));
    assert(!grid.free_set.is_free(address));
    grid.assert_not_reading(address, block.*);

    grid.write_block(callback, write, block, .create);
}
|
|
801
|
+
|
|
802
|
+
/// Shared implementation behind `create_block()` and `repair_block()`.
/// Validates the block, zeroes its sector padding, initializes `write`, and either
/// starts the write on a free IOP or pushes it onto `write_queue`.
///
/// NOTE: This will consume `block` and replace it with a fresh block.
fn write_block(
    grid: *Grid,
    callback: *const fn (*Grid.Write) void,
    write: *Grid.Write,
    block: *BlockPtr,
    trigger: enum { create, repair },
) void {
    const header = schema.header_from_block(block.*);
    assert(header.cluster == grid.superblock.working.cluster);
    assert(header.release.value <=
        grid.superblock.working.vsr_state.checkpoint.release.value);

    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);
    assert(grid.writing(header.address, block.*) == .not_writing);
    assert(!grid.free_set.is_free(header.address));
    grid.assert_coherent(header.address, header.checksum);

    if (constants.verify) {
        // The block being written must not be one of the cache's own blocks.
        for (grid.cache_blocks) |cache_block| assert(cache_block != block.*);
    }

    // Zero the padding between the end of the block's data and the sector boundary.
    const size_padded = vsr.sector_ceil(header.size);
    @memset(block.*[header.size..size_padded], 0);

    write.* = .{
        .callback = callback,
        .address = header.address,
        .repair = switch (trigger) {
            .repair => true,
            .create => false,
        },
        .block = block,
        .checkpoint_id = grid.superblock.working.checkpoint_id(),
    };

    if (grid.write_iops.acquire()) |iop| {
        grid.write_block_with(iop, write);
    } else {
        // No IOP available: the write starts when an executing write completes.
        grid.write_queue.push(write);
    }
}
|
|
845
|
+
|
|
846
|
+
/// Start `write` on `iop`: begin the trace span, bind the IOP to the write, check the
/// block's size and zeroed padding, and submit the sector-padded block to storage.
/// `write_block_callback` runs on completion.
fn write_block_with(grid: *Grid, iop: *WriteIOP, write: *Write) void {
    assert(!grid.free_set.is_free(write.address));

    grid.trace.start(.{ .grid_write = .{ .iop = grid.write_iops.index(iop) } });

    iop.* = .{
        .grid = grid,
        .completion = undefined,
        .write = write,
    };

    const write_header = schema.header_from_block(write.block.*);
    const size = write_header.size;
    const size_padded = vsr.sector_ceil(size);

    assert(size > @sizeOf(vsr.Header));
    assert(size <= constants.block_size);
    // The padding up to the sector boundary was zeroed before the write was started.
    assert(stdx.zeroed(write.block.*[size..size_padded]));

    grid.superblock.storage.write_sectors(
        write_block_callback,
        &iop.completion,
        write.block.*[0..size_padded],
        .grid,
        block_offset(write.address),
    );
}
|
|
872
|
+
|
|
873
|
+
/// Storage completion callback for a block write.
///
/// Swaps the written block into the cache, then either joins a pending cancellation,
/// or hands the IOP to the next queued write (releasing it if there is none) and
/// invokes the completed write's callback.
fn write_block_callback(completion: *Storage.Write) void {
    const iop: *WriteIOP = @fieldParentPtr("completion", completion);

    // Copy these to the stack: the IOP's fields are overwritten once the IOP is
    // released and potentially reused to start a queued write.
    const grid = iop.grid;
    const completed_write = iop.write;

    // The cache may only be updated while the Grid is not resolving callbacks with a
    // cache block.
    assert(!grid.read_resolving);
    assert(!grid.free_set.is_free(completed_write.address));

    // A non-repair write completes within the checkpoint in which it was started.
    assert(completed_write.repair or
        grid.superblock.working.checkpoint_id() == completed_write.checkpoint_id);

    // Put the written block into the cache; the writer receives the evicted block.
    const cache_block = &grid.cache_blocks[grid.cache.upsert(&completed_write.address).index];
    std.mem.swap(BlockPtr, cache_block, completed_write.block);
    // The evicted block's content won't be used again. Zeroing only its header is
    // cheaper than overwriting the entire block.
    @memset(completed_write.block.*[0..@sizeOf(vsr.Header)], 0);

    const cache_block_header = schema.header_from_block(cache_block.*);
    assert(cache_block_header.address == completed_write.address);
    grid.assert_coherent(completed_write.address, cache_block_header.checksum);

    grid.trace.stop(.{ .grid_write = .{ .iop = grid.write_iops.index(iop) } });

    if (grid.callback == .cancel) {
        assert(grid.write_queue.empty());

        grid.write_iops.release(iop);
        grid.cancel_join_callback();
        return;
    }

    // Start a queued write (if any) *before* invoking the completed write's callback,
    // so that a Grid.write_block() call made from that callback cannot preempt the
    // queue.
    if (grid.write_queue.pop()) |queued_write| {
        grid.write_block_with(iop, queued_write);
    } else {
        grid.write_iops.release(iop);
    }

    // This must precede the write's callback, since the callback takes back ownership
    // of the block.
    if (completed_write.repair) grid.blocks_missing.write_complete(cache_block.*);

    // The callback runs after the IOP is (logically) released. Otherwise a callback
    // that calls write_block() could trip the assertions which forbid concurrent
    // writes using the same block/address.
    completed_write.callback(completed_write);

    // Pending repairs are awaited from the moment the checkpoint becomes durable.
    if (grid.callback == .checkpoint_durable) grid.checkpoint_durable_join();
}
|
|
935
|
+
|
|
936
|
+
/// Look the block up in the cache synchronously.
/// Returns the cached block when the address is cached and its header checksum equals
/// `checksum`; returns null otherwise.
/// The returned block pointer is only valid until the next Grid write.
pub fn read_block_from_cache(
    grid: *Grid,
    address: u64,
    checksum: u128,
    options: struct { coherent: bool },
) ?BlockPtrConst {
    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);
    if (options.coherent) {
        assert(grid.writing(address, null) != .create);
        assert(!grid.free_set.is_free(address));
        grid.assert_coherent(address, checksum);
    }

    assert(address > 0);

    const cache_index = grid.cache.get_index(address) orelse return null;
    const cache_block = grid.cache_blocks[cache_index];

    const header = schema.header_from_block(cache_block);
    assert(header.address == address);
    assert(header.cluster == grid.superblock.working.cluster);
    assert(header.release.value <=
        grid.superblock.working.vsr_state.checkpoint.release.value);

    if (header.checksum != checksum) {
        // A coherent read may find a different checksum only if `sync_op_max > 0`.
        if (options.coherent) {
            assert(grid.superblock.working.vsr_state.sync_op_max > 0);
        }
        return null;
    }

    if (constants.verify and
        options.coherent and
        grid.superblock.working.vsr_state.sync_op_max == 0)
    {
        grid.verify_read_from_cache(address, cache_block);
    }

    return cache_block;
}
|
|
980
|
+
|
|
981
|
+
/// Begin reading the block at `address` with the expected `checksum`.
///
/// `callback` selects the kind of read: `.from_local_or_global_storage` reads are
/// marked coherent and must target an allocated block that has no `.create` write in
/// progress; `.from_local_storage` reads carry no such requirements.
/// With `options.cache_read` the read continues on the next tick; otherwise it
/// continues immediately.
pub fn read_block(
    grid: *Grid,
    callback: ReadBlockCallback,
    read: *Grid.Read,
    address: u64,
    checksum: u128,
    options: struct {
        cache_read: bool,
        cache_write: bool,
    },
) void {
    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);
    assert(address > 0);

    switch (callback) {
        .from_local_storage => {
            maybe(grid.callback == .checkpoint);
            // The read is attempted even when the block is free: a recently released
            // block might be found on disk anyway.
            maybe(grid.free_set.is_free(address));
            maybe(grid.writing(address, null) == .create);
        },
        .from_local_or_global_storage => {
            assert(grid.callback != .checkpoint);
            assert(!grid.free_set.is_free(address));
            assert(grid.writing(address, null) != .create);
            grid.assert_coherent(address, checksum);
        },
    }

    read.* = .{
        .callback = callback,
        .address = address,
        .checksum = checksum,
        .coherent = callback == .from_local_or_global_storage,
        .checkpoint_durable = grid.free_set.checkpoint_durable,
        .cache_read = options.cache_read,
        .cache_write = options.cache_write,
        .checkpoint_id = grid.superblock.working.checkpoint_id(),
        .grid = grid,
    };

    if (!options.cache_read) {
        read_block_tick_callback(&read.next_tick);
        return;
    }
    grid.on_next_tick(read_block_tick_callback, &read.next_tick);
}
|
|
1030
|
+
|
|
1031
|
+
/// Continuation of `read_block()`.
///
/// In order: merge with an in-progress read of the same address and checksum; else
/// (when `cache_read` is set) resolve from the cache; else become the queued read for
/// the address and start it on an IOP, or wait in `read_pending_queue` for one.
fn read_block_tick_callback(next_tick: *Storage.NextTick) void {
    const read: *Grid.Read = @alignCast(@fieldParentPtr("next_tick", next_tick));
    const grid = read.grid;
    assert(grid.superblock.opened);
    assert(grid.callback != .cancel);
    if (read.coherent) {
        assert(!grid.free_set.is_free(read.address));
        assert(grid.writing(read.address, null) != .create);
    }

    assert(read.address > 0);

    // If a read of this block is already processing/recovering, merge with it.
    for ([_]*const QueueType(Read){
        &grid.read_queue,
        &grid.read_global_queue,
    }) |queue| {
        // Don't remote-repair repairs – the block may not belong in our current
        // checkpoint.
        if (read.callback == .from_local_storage and queue == &grid.read_global_queue) {
            continue;
        }

        var queued_reads = queue.iterate();
        while (queued_reads.next()) |queued_read| {
            if (queued_read.address != read.address) continue;

            // TODO check all read options match
            if (queued_read.checksum == read.checksum) {
                queued_read.resolves.push(&read.pending);
                return;
            }
            // Same address but different checksum: at most one read is coherent.
            assert(!queued_read.coherent or !read.coherent);
        }
    }

    // When Read.cache_read is set, the caller of read_block() is responsible for
    // calling us via next_tick().
    if (read.cache_read) {
        const cached = grid.read_block_from_cache(
            read.address,
            read.checksum,
            .{ .coherent = read.coherent },
        );
        if (cached) |cache_block| {
            grid.read_block_resolve(read, .{ .valid = cache_block });
            return;
        }
    }

    // This read becomes the "root" read which fetches the block for its address. The
    // fetch is asynchronous, to avoid stack-overflow and nested cache invalidation.
    grid.read_queue.push(read);

    // Take an IOP to resolve the block from storage. If none is free, the read is
    // queued to receive an IOP when one finishes.
    if (grid.read_iops.acquire()) |iop| {
        grid.read_block_with(iop, read);
    } else {
        grid.read_pending_queue.push(&read.pending);
    }
}
|
|
1093
|
+
|
|
1094
|
+
/// Starts the storage read for `read` using `iop`: (re)initializes the IOP to point at
/// `read` and reads the block's grid sectors into the IOP's block buffer, completing in
/// `read_block_callback`.
fn read_block_with(grid: *Grid, iop: *Grid.ReadIOP, read: *Grid.Read) void {
    assert(read.address > 0);

    // We can only update the cache if the Grid is not resolving callbacks with a cache
    // block.
    assert(!grid.read_resolving);

    const iop_index = grid.read_iops.index(iop);
    grid.trace.start(.{ .grid_read = .{ .iop = iop_index } });

    iop.* = .{
        // Handed to read_sectors() below.
        .completion = undefined,
        .read = read,
    };

    grid.superblock.storage.read_sectors(
        read_block_callback,
        &iop.completion,
        grid.read_iop_blocks[iop_index],
        .grid,
        block_offset(read.address),
    );
}
|
|
1118
|
+
|
|
1119
|
+
/// Completion handler for the storage read started by `read_block_with()`.
///
/// Optionally moves the block that was read into the cache, recycles the IOP (hands it to a
/// pending read, or releases it), validates the block against the expected address/checksum,
/// and finally resolves the read (and every read merged into it) with the result.
fn read_block_callback(completion: *Storage.Read) void {
    const iop: *ReadIOP = @fieldParentPtr("completion", completion);
    const read = iop.read;
    const grid = read.grid;
    const iop_index = grid.read_iops.index(iop);
    const iop_block = &grid.read_iop_blocks[iop_index];

    grid.trace.stop(.{ .grid_read = .{ .iop = iop_index } });

    if (grid.callback == .cancel) {
        grid.read_iops.release(iop);
        grid.cancel_join_callback();
        return;
    }

    // Insert the block into the cache, and give the evicted block to `iop`.
    const block = if (read.cache_write) cached: {
        const cache_block = &grid.cache_blocks[grid.cache.upsert(&read.address).index];
        std.mem.swap(BlockPtr, iop_block, cache_block);
        // The evicted block's content won't be used again. Zeroing just the header is
        // cheaper than overwriting the entire block.
        @memset(iop_block.*[0..@sizeOf(vsr.Header)], 0);
        break :cached cache_block;
    } else iop_block;

    // Handoff the iop to a pending read or release it before resolving the callbacks below.
    // NOTE(review): when `cache_write` is false, `block` still refers to this IOP's buffer,
    // which the handed-off read also targets -- confirm this cannot be overwritten before
    // the callbacks below have finished with it.
    if (grid.read_pending_queue.pop()) |pending| {
        const next_read: *Read = @alignCast(@fieldParentPtr("pending", pending));
        grid.read_block_with(iop, next_read);
    } else {
        grid.read_iops.release(iop);
    }

    // Remove the "root" read so that the address is no longer actively reading / locked.
    grid.read_queue.remove(read);

    const result = read_block_validate(block.*, .{
        .address = read.address,
        .checksum = read.checksum,
    });

    if (result != .valid) {
        const found =
            mem.bytesAsValue(vsr.Header.Block, block.*[0..@sizeOf(vsr.Header)]);
        log.warn(
            "{}: {s}: expected address={} checksum={x:0>32}, " ++
                "found address={} checksum={x:0>32}",
            .{
                grid.superblock.replica_index.?,
                @tagName(result),
                read.address,
                read.checksum,
                found.address,
                found.checksum,
            },
        );

        if (read.cache_write) {
            // Don't cache a corrupt or incorrect block.
            const evicted = grid.cache.remove(read.address);
            assert(evicted != null);
        }

        if (constants.verify) grid.verify_read_fault(read);
    }

    grid.read_block_resolve(read, result);
}
|
|
1192
|
+
|
|
1193
|
+
/// Checks that `block` holds the block identified by `expect`.
///
/// Returns `.valid` (carrying `block`) on success. Otherwise returns the first failed check,
/// tested in this order: header checksum, command, body checksum, expected checksum, and
/// zeroed padding up to the sector boundary.
fn read_block_validate(block: BlockPtrConst, expect: struct {
    address: u64,
    checksum: u128,
}) ReadBlockResult {
    const header_size = @sizeOf(vsr.Header);
    const header = mem.bytesAsValue(vsr.Header.Block, block[0..header_size]);

    if (!header.valid_checksum()) return .invalid_checksum;
    if (header.command != .block) return .unexpected_command;

    // The header checksum passed, so the size is expected to be in bounds.
    assert(header.size >= header_size);
    assert(header.size <= constants.block_size);

    const body = block[header_size..header.size];
    if (!header.valid_checksum_body(body)) return .invalid_checksum_body;

    if (header.checksum != expect.checksum) return .unexpected_checksum;

    const padding = block[header.size..vsr.sector_ceil(header.size)];
    if (!stdx.zeroed(padding)) return .invalid_padding;

    // The checksum matched what the caller expects, so the address must match as well.
    assert(header.address == expect.address);
    return .{ .valid = block };
}
|
|
1219
|
+
|
|
1220
|
+
/// Delivers `result` to `read` and to every read queued on `read.resolves`.
///
/// `from_local_storage` callbacks always receive `result`. `from_local_or_global_storage`
/// callbacks are invoked only for a valid block; on any other result those reads are
/// gathered, the first of them is pushed onto `read_global_queue` carrying the rest as its
/// `resolves`, and a block repair is requested if a repair slot is available.
fn read_block_resolve(grid: *Grid, read: *Grid.Read, result: ReadBlockResult) void {
    assert(grid.callback != .cancel);

    // Guard to make sure the cache cannot be updated by any read.callbacks() below.
    assert(!grid.read_resolving);
    grid.read_resolving = true;
    defer {
        assert(grid.read_resolving);
        grid.read_resolving = false;
    }

    if (read.coherent) {
        assert(!grid.free_set.is_free(read.address));
        assert(read.checkpoint_id == grid.superblock.working.checkpoint_id());
        grid.assert_coherent(read.address, read.checksum);
    }

    switch (result) {
        .valid => |valid_block| {
            const header = schema.header_from_block(valid_block);
            assert(header.cluster == grid.superblock.working.cluster);
            assert(header.release.value <=
                grid.superblock.working.vsr_state.checkpoint.release.value);
            assert(header.address == read.address);
            assert(header.checksum == read.checksum);
        },
        else => {},
    }

    // Reads that could not be satisfied locally and may be repaired from other replicas.
    var remote_waiters: QueueType(ReadPending) = QueueType(ReadPending).init(.{
        .name = read.resolves.any.name,
    });

    // Resolve all reads queued to the address with the block.
    while (read.resolves.pop()) |node| {
        const waiter: *Read = @alignCast(@fieldParentPtr("pending", node));
        assert(waiter.address == read.address);
        assert(waiter.checksum == read.checksum);
        if (waiter.coherent) {
            assert(waiter.checkpoint_id == grid.superblock.working.checkpoint_id());
        }

        switch (waiter.callback) {
            .from_local_storage => |callback| callback(waiter, result),
            .from_local_or_global_storage => |callback| switch (result) {
                .valid => |valid_block| callback(waiter, valid_block),
                else => remote_waiters.push(&waiter.pending),
            },
        }
    }

    // Then invoke the callback with the cache block (which should be valid for the duration
    // of the callback as any nested Grid calls cannot synchronously update the cache).
    switch (read.callback) {
        .from_local_storage => |callback| callback(read, result),
        .from_local_or_global_storage => |callback| switch (result) {
            .valid => |valid_block| callback(read, valid_block),
            else => remote_waiters.push(&read.pending),
        },
    }

    // On the result of an invalid block, move the first remote-capable read (and all others
    // collected with it) to the recovery queue. Future reads on the same address will see
    // that read in the recovery queue and enqueue to it.
    const head_node = remote_waiters.pop() orelse return;
    const head: *Read = @alignCast(@fieldParentPtr("pending", head_node));
    assert(head.callback == .from_local_or_global_storage);
    assert(head.coherent);

    log.debug("{}: read_block: fault: address={} checksum={x:0>32}", .{
        grid.superblock.replica_index.?,
        head.address,
        head.checksum,
    });

    head.resolves = remote_waiters;
    grid.read_global_queue.push(head);

    if (grid.blocks_missing.repair_blocks_available() > 0) {
        grid.blocks_missing.repair_block(head.address, head.checksum);
    }
}
|
|
1311
|
+
|
|
1312
|
+
/// Maps a grid block address to an offset: addresses start at 1, so address `n` maps to
/// `(n - 1) * block_size`.
fn block_offset(address: u64) u64 {
    assert(address > 0);

    const block_index = address - 1;
    return block_index * block_size;
}
|
|
1317
|
+
|
|
1318
|
+
/// Asserts that `address` is not free. When running on the test storage with a grid checker
/// configured, additionally asks the checker to verify `address`/`checksum` against both
/// the working and the staging superblock checkpoints.
fn assert_coherent(grid: *const Grid, address: u64, checksum: u128) void {
    assert(!grid.free_set.is_free(address));

    // The checker only exists on the test storage.
    const TestStorage = @import("../testing/storage.zig").Storage;
    if (Storage != TestStorage) return;

    const checker = grid.superblock.storage.options.grid_checker orelse return;

    checker.assert_coherent(
        &grid.superblock.working.vsr_state.checkpoint,
        grid.free_set.checkpoint_durable,
        address,
        checksum,
    );

    const checkpoints_match =
        grid.superblock.working.checkpoint_id() == grid.superblock.staging.checkpoint_id();
    const staging_checkpoint_durable = if (checkpoints_match)
        grid.free_set.checkpoint_durable
    else in_flight: {
        // Checkpoint is currently being written to the superblock. Pass
        // checkpoint_durable=false as we update free_set.checkpoint_durable
        // only *after* the checkpoint is written to the superblock.
        assert(grid.superblock.staging.parent_checkpoint_id() ==
            grid.superblock.working.checkpoint_id());
        assert(grid.free_set.checkpoint_durable);
        break :in_flight false;
    };

    checker.assert_coherent(
        &grid.superblock.staging.vsr_state.checkpoint,
        staging_checkpoint_durable,
        address,
        checksum,
    );
}
|
|
1354
|
+
|
|
1355
|
+
/// Verification-only check (requires `constants.verify`): on the test storage, asserts that
/// `cached_block` matches the block the storage holds at `address`, in both header checksum
/// and bytes up to the header's size.
fn verify_read_from_cache(
    grid: *const Grid,
    address: u64,
    cached_block: BlockPtrConst,
) void {
    comptime assert(constants.verify);

    const TestStorage = @import("../testing/storage.zig").Storage;
    if (Storage != TestStorage) return;

    const stored_block = grid.superblock.storage.grid_block(address).?;
    const stored_header = schema.header_from_block(stored_block);
    const cached_header = schema.header_from_block(cached_block);
    assert(cached_header.checksum == stored_header.checksum);

    const cached_bytes = cached_block[0..cached_header.size];
    const stored_bytes = stored_block[0..stored_header.size];
    assert(std.mem.eql(u8, cached_bytes, stored_bytes));
}
|
|
1376
|
+
|
|
1377
|
+
/// Called when we fail to read a block.
///
/// Verification-only check (requires `constants.verify`): on the test storage, asserts that
/// a failed coherent read is explained either by a faulty storage area or by the replica
/// syncing.
fn verify_read_fault(grid: *const Grid, read: *const Read) void {
    comptime assert(constants.verify);

    const TestStorage = @import("../testing/storage.zig").Storage;
    if (Storage != TestStorage) return;

    // Only check coherent reads -- i.e., when we know for certain that the read's
    // address/checksum belongs in our current checkpoint.
    if (!read.coherent) return;

    // Check our storage (bypassing faults).
    const stored_block = grid.superblock.storage.grid_block(read.address) orelse {
        // No block found -- since this is a coherent read, we must be syncing.
        assert(grid.superblock.working.vsr_state.sync_op_max > 0);
        return;
    };

    const stored_header = schema.header_from_block(stored_block);
    if (stored_header.checksum != read.checksum) {
        // Different block found -- since this is a coherent read, we must be syncing.
        assert(grid.superblock.working.vsr_state.sync_op_max > 0);
        return;
    }

    // Exact block found. Since the read failed anyway, it must have been a simulated read
    // fault.
    assert(grid.superblock.storage.area_faulty(.{
        .grid = .{ .address = read.address },
    }));
}
|
|
1406
|
+
|
|
1407
|
+
/// Mark all blocks in the grid cache as MADV_DONTDUMP. Must be done after transitioning
/// to static, as the combination of madvise() + mremap() can cause an EFAULT.
///
/// It's OK that some blocks, such as the blocks used by compaction, escape this -- this is
/// not to stop sensitive data from appearing in core dumps, but rather to keep the core
/// dump size manageable even with a large grid cache.
///
/// Does nothing on targets other than Linux. Returns any error from `madvise()`.
pub fn madv_dont_dump(grid: *const Grid) !void {
    if (builtin.target.os.tag != .linux) return;

    assert(grid.cache_blocks.len > 0);

    // Each block could be its own isolated memory mapping, with how things are done
    // using allocate_block(), but it's extremely unlikely. Coalesce adjacent blocks into
    // runs to save on madvise() syscalls.
    var run_start = @intFromPtr(grid.cache_blocks[0]);
    var run_len = grid.cache_blocks[0].len;
    var madvise_bytes: usize = 0;
    var madvise_calls: usize = 0;

    for (grid.cache_blocks[1..]) |cache_block| {
        if (run_start + run_len == @intFromPtr(cache_block.ptr)) {
            // This block directly follows the current run: extend it.
            run_len += cache_block.len;
            continue;
        }

        // Gap found: flush the current run, then start a new one at this block.
        try std.posix.madvise(
            @ptrFromInt(run_start),
            run_len,
            std.posix.MADV.DONTDUMP,
        );
        madvise_bytes += run_len;
        madvise_calls += 1;

        run_start = @intFromPtr(cache_block.ptr);
        run_len = cache_block.len;
    }

    // Flush the final run.
    try std.posix.madvise(
        @ptrFromInt(run_start),
        run_len,
        std.posix.MADV.DONTDUMP,
    );
    madvise_bytes += run_len;
    madvise_calls += 1;

    assert(madvise_bytes == constants.block_size * grid.cache_blocks.len);
    assert(madvise_calls <= grid.cache_blocks.len);

    log.debug("marked {} bytes as MADV_DONTDUMP with {} calls", .{
        madvise_bytes,
        madvise_calls,
    });
}
|
|
1459
|
+
};
|
|
1460
|
+
}
|