tigerbeetle 0.0.34 → 0.0.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/tb_client/extconf.rb +13 -13
- data/ext/tb_client/tigerbeetle/LICENSE +177 -0
- data/ext/tb_client/tigerbeetle/build.zig +2327 -0
- data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
- data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
- data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
- data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
- data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
- data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
- data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
- data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
- data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
- data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
- data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
- data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
- data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
- data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
- data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
- data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
- data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
- data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
- data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
- data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
- data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
- data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
- data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
- data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
- data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
- data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
- data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
- data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
- data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
- data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
- data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
- data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
- data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
- data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
- data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
- data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
- data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
- data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
- data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
- data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
- data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
- data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
- data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
- data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
- data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
- data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
- data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
- data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
- data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
- data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
- data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
- data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
- data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
- data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
- data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
- data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
- data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
- data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
- data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
- data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
- data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
- data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
- data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
- data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
- data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
- data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
- data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
- data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
- data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
- data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
- data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
- data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
- data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
- data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
- data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
- data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
- data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
- data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
- data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
- data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
- data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
- data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
- data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
- data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
- data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
- data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
- data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
- data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
- data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
- data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
- data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
- data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
- data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
- data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
- data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
- data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
- data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
- data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
- data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
- data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
- data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
- data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
- data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
- data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
- data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
- data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
- data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
- data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
- data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
- data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
- data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
- data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
- data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
- data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
- data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
- data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
- data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
- data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
- data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
- data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
- data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
- data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
- data/lib/tb_client/shared_lib.rb +12 -5
- data/lib/tigerbeetle/client.rb +1 -1
- data/lib/tigerbeetle/platforms.rb +9 -0
- data/lib/tigerbeetle/version.rb +2 -2
- data/tigerbeetle.gemspec +22 -5
- metadata +242 -3
- data/ext/tb_client/pkg.tar.gz +0 -0
|
@@ -0,0 +1,2117 @@
|
|
|
1
|
+
//! Compaction moves or merges a table's values from the previous level.
|
|
2
|
+
//!
|
|
3
|
+
//! Each Compaction is paced to run in an arbitrary amount of beats, by the forest.
|
|
4
|
+
//!
|
|
5
|
+
//! Compaction overview:
|
|
6
|
+
//!
|
|
7
|
+
//! 1. Given:
|
|
8
|
+
//!
|
|
9
|
+
//! - levels A and B, where A+1=B
|
|
10
|
+
//! - a single table in level A ("table A")
|
|
11
|
+
//! - all tables from level B which intersect table A's key range ("tables B")
|
|
12
|
+
//! (This can include anything between 0 tables and all of level B's tables.)
|
|
13
|
+
//!
|
|
14
|
+
//! 2. If table A's key range is disjoint from the keys in level B, move table A into level B.
|
|
15
|
+
//! All done! (But if the key ranges intersect, jump to step 3).
|
|
16
|
+
//!
|
|
17
|
+
//! 3. Create an iterator from the sort-merge of table A and the concatenation of tables B.
|
|
18
|
+
//! If the same key exists in level A and B, take A's and discard B's. †
|
|
19
|
+
//!
|
|
20
|
+
//! 4. Write the sort-merge iterator into a sequence of new tables on disk.
|
|
21
|
+
//!
|
|
22
|
+
//! 5. Update the input tables in the Manifest with their new `snapshot_max` so that they become
|
|
23
|
+
//! invisible to subsequent read transactions.
|
|
24
|
+
//!
|
|
25
|
+
//! 6. Insert the new level-B tables into the Manifest.
|
|
26
|
+
//!
|
|
27
|
+
//! † When A's value is a tombstone, there is a special case for garbage collection. When either:
|
|
28
|
+
//! * level B is the final level, or
|
|
29
|
+
//! * A's key does not exist in B or any deeper level,
|
|
30
|
+
//! then the tombstone is omitted from the compacted output, see: `compaction_must_drop_tombstones`.
|
|
31
|
+
//!
|
|
32
|
+
const std = @import("std");
|
|
33
|
+
const mem = std.mem;
|
|
34
|
+
const math = std.math;
|
|
35
|
+
const assert = std.debug.assert;
|
|
36
|
+
const Allocator = std.mem.Allocator;
|
|
37
|
+
|
|
38
|
+
const log = std.log.scoped(.compaction);
|
|
39
|
+
|
|
40
|
+
const constants = @import("../constants.zig");
|
|
41
|
+
|
|
42
|
+
const stdx = @import("stdx");
|
|
43
|
+
const maybe = stdx.maybe;
|
|
44
|
+
const vsr = @import("../vsr.zig");
|
|
45
|
+
const trace = @import("../trace.zig");
|
|
46
|
+
const StackType = @import("../stack.zig").StackType;
|
|
47
|
+
const IOPSType = @import("../iops.zig").IOPSType;
|
|
48
|
+
const GridType = @import("../vsr/grid.zig").GridType;
|
|
49
|
+
const BlockPtr = @import("../vsr/grid.zig").BlockPtr;
|
|
50
|
+
const BlockPtrConst = @import("../vsr/grid.zig").BlockPtrConst;
|
|
51
|
+
const allocate_block = @import("../vsr/grid.zig").allocate_block;
|
|
52
|
+
const TableInfoType = @import("manifest.zig").TreeTableInfoType;
|
|
53
|
+
const ManifestType = @import("manifest.zig").ManifestType;
|
|
54
|
+
const schema = @import("schema.zig");
|
|
55
|
+
const RingBufferType = stdx.RingBufferType;
|
|
56
|
+
|
|
57
|
+
/// The upper-bound count of input tables to a single tree's compaction.
|
|
58
|
+
///
|
|
59
|
+
/// - +1 from level A.
|
|
60
|
+
/// - +lsm_growth_factor from level B. The A-input table cannot overlap with an extra B-input table
|
|
61
|
+
/// because input table selection is least-overlap. If the input table overlaps on one or both
|
|
62
|
+
/// edges, there must be another table with less overlap to select.
|
|
63
|
+
pub const compaction_tables_input_max = 1 + constants.lsm_growth_factor;
|
|
64
|
+
|
|
65
|
+
/// The upper-bound count of output tables from a single tree's compaction.
|
|
66
|
+
/// In the "worst" case, no keys are overwritten/merged, and no tombstones are dropped.
|
|
67
|
+
pub const compaction_tables_output_max = compaction_tables_input_max;
|
|
68
|
+
|
|
69
|
+
/// The minimum number of blocks required for a single beat of a single compaction.
|
|
70
|
+
///
|
|
71
|
+
/// Compaction needs to carry over the output index block and all input blocks to the next beat:
|
|
72
|
+
/// One index and one value block for the output table, one index block for level A, two index
|
|
73
|
+
/// blocks for level B (to allow prefetching), and `lsm_compaction_queue_read_max/2` value blocks
|
|
74
|
+
/// for each of the two input tables.
|
|
75
|
+
pub const compaction_block_count_beat_min: u32 =
|
|
76
|
+
(1 + 1) + (1 + 2) + constants.lsm_compaction_queue_read_max;
|
|
77
|
+
|
|
78
|
+
const half_bar_beat_count = @divExact(constants.lsm_compaction_ops, 2);
|
|
79
|
+
|
|
80
|
+
/// Resources shared by all compactions.
|
|
81
|
+
///
|
|
82
|
+
/// ResourcePool is a singleton owned by the Forest, but it doesn't depend on Forest type.
|
|
83
|
+
pub fn ResourcePoolType(comptime Grid: type) type {
|
|
84
|
+
return struct {
|
|
85
|
+
reads: IOPSType(BlockRead, constants.lsm_compaction_iops_read_max) = .{},
|
|
86
|
+
writes: IOPSType(BlockWrite, constants.lsm_compaction_iops_write_max) = .{},
|
|
87
|
+
cpus: IOPSType(CPU, 1) = .{},
|
|
88
|
+
blocks: StackType(Block),
|
|
89
|
+
blocks_backing_storage: []Block,
|
|
90
|
+
grid_reservation: ?Grid.Reservation = null,
|
|
91
|
+
|
|
92
|
+
const ResourcePool = @This();
|
|
93
|
+
|
|
94
|
+
/// Element type of the pool's `reads` IOPS.
///
/// Bundles a `Grid.Read`, a pool `Block`, and a type-erased pointer to a compaction.
const BlockRead = struct {
    grid_read: Grid.Read,
    block: *Block,
    compaction: *anyopaque,

    /// Casts the type-erased `compaction` pointer back to `*Compaction`.
    /// The caller supplies the concrete `Compaction` type; only alignment is checked here.
    fn parent(read: *BlockRead, comptime Compaction: type) *Compaction {
        const owner: *Compaction = @ptrCast(@alignCast(read.compaction));
        return owner;
    }
};
|
|
103
|
+
|
|
104
|
+
/// Element type of the pool's `writes` IOPS.
///
/// Bundles a `Grid.Write`, a pool `Block`, and a type-erased pointer to a compaction.
const BlockWrite = struct {
    grid_write: Grid.Write,
    block: *Block,
    compaction: *anyopaque,

    /// Casts the type-erased `compaction` pointer back to `*Compaction`.
    /// The caller supplies the concrete `Compaction` type; only alignment is checked here.
    fn parent(write: *BlockWrite, comptime Compaction: type) *Compaction {
        const owner: *Compaction = @ptrCast(@alignCast(write.compaction));
        return owner;
    }
};
|
|
113
|
+
|
|
114
|
+
/// While we don't currently have a CPU pool, we already treat CPU as a resource: the pool's
/// `cpus` field is an `IOPSType(CPU, 1)`, i.e. it is instantiated with a size of one.
const CPU = struct {
    next_tick: Grid.NextTick,
    block: *Block,
    compaction: *anyopaque,

    /// Casts the type-erased `compaction` pointer back to `*Compaction`.
    /// The caller supplies the concrete `Compaction` type; only alignment is checked here.
    fn parent(cpu: *CPU, comptime Compaction: type) *Compaction {
        const owner: *Compaction = @ptrCast(@alignCast(cpu.compaction));
        return owner;
    }
};
|
|
125
|
+
|
|
126
|
+
/// A pool block: the block memory (`ptr`), the stage it is currently in, and the intrusive
/// link used by the pool's `StackType(Block)` free stack.
const Block = struct {
    ptr: BlockPtr,
    stage: enum {
        // Block is in the resource pool.
        free,

        // Block is owned by a table builder.
        build_index_block,
        build_value_block,

        // Block is in the read queue.
        read_index_block,
        read_index_block_done,
        read_value_block,
        read_value_block_done,

        // Block is in the read queue and is used by merge.
        // Next stage is either free or loops back to read_value_block_done.
        merge,

        // Block is in the write queue. Goes directly to free from here.
        write_value_block,
        write_index_block,
    },

    link: StackType(Block).Link,
};
|
|
153
|
+
|
|
154
|
+
/// Builds a pool of `block_count` blocks.
///
/// Allocates the `Block` array, then one block buffer per entry via `allocate_block`, marks
/// every entry `.free`, and pushes all of them onto the free stack. If any allocation fails,
/// the buffers allocated so far and the array itself are freed before the error is returned.
pub fn init(allocator: mem.Allocator, block_count: u32) !ResourcePool {
    const backing = try allocator.alloc(Block, block_count);

    // Counts the entries whose `ptr` is valid, so that cleanup frees exactly those.
    var blocks_initialized: u32 = 0;
    errdefer {
        for (backing[0..blocks_initialized]) |block| {
            allocator.free(block.ptr);
        }
        allocator.free(backing);
    }

    while (blocks_initialized < block_count) : (blocks_initialized += 1) {
        const block_ptr = try allocate_block(allocator);
        backing[blocks_initialized] = .{
            .ptr = block_ptr,
            .stage = .free,
            .link = .{},
        };
    }
    assert(blocks_initialized == block_count);

    var free_blocks = StackType(Block).init(.{
        .capacity = blocks_initialized,
        .verify_push = false,
    });
    for (backing) |*block| {
        free_blocks.push(block);
    }

    return .{
        .blocks = free_blocks,
        .blocks_backing_storage = backing,
    };
}
|
|
185
|
+
|
|
186
|
+
/// Frees every block buffer and then the backing `Block` array.
/// `allocator` is presumably the one that was passed to `init` — confirm at call sites.
pub fn deinit(pool: *ResourcePool, allocator: Allocator) void {
    const backing = pool.blocks_backing_storage;
    for (backing) |block| allocator.free(block.ptr);
    allocator.free(backing);
}
|
|
192
|
+
|
|
193
|
+
/// Returns the pool to its freshly-initialized state without reallocating.
///
/// Every field not listed in the literal below (`reads`, `writes`, `cpus`,
/// `grid_reservation`) falls back to its declared default. The block buffers are kept; each
/// block is marked `.free`, unlinked, and pushed back onto a new, empty free stack.
pub fn reset(pool: *ResourcePool) void {
    // Read what must survive before the pool is overwritten.
    const backing = pool.blocks_backing_storage;
    const stack_capacity = pool.blocks.capacity();

    pool.* = .{
        .blocks = StackType(Block).init(.{
            .capacity = stack_capacity,
            .verify_push = false,
        }),
        .blocks_backing_storage = backing,
    };

    for (backing) |*block| {
        // `ptr` is intentionally left as is: the buffer is reused.
        block.stage = .free;
        block.link = .{};
        pool.blocks.push(block);
    }
}
|
|
210
|
+
|
|
211
|
+
// NB: idle does not check that no blocks are acquired! Although no IO can happen between
// the beats, it is valid to carry over some blocks.
//
// Returns true only when `reads`, `writes` and `cpus` all report zero executing operations.
pub fn idle(pool: *ResourcePool) bool {
    if (pool.reads.executing() != 0) return false;
    if (pool.writes.executing() != 0) return false;
    return pool.cpus.executing() == 0;
}
|
|
218
|
+
|
|
219
|
+
/// Number of blocks currently taken out of the pool: the size of the backing storage minus
/// the number of blocks on the free stack.
pub fn blocks_acquired(pool: *ResourcePool) u32 {
    const block_total = pool.blocks_backing_storage.len;
    const free_count = pool.blocks.count();
    assert(free_count <= block_total);
    return @intCast(block_total - free_count);
}
|
|
223
|
+
|
|
224
|
+
/// Number of blocks currently on the pool's free stack.
pub fn blocks_free(pool: *ResourcePool) u32 {
    const free_count: u32 = pool.blocks.count();
    return free_count;
}
|
|
227
|
+
|
|
228
|
+
/// Pops a block off the free stack, or returns null when the stack is empty.
/// A popped block must be in the `.free` stage and must not be linked to another block.
fn block_acquire(pool: *@This()) ?*Block {
    if (pool.blocks.pop()) |block| {
        assert(block.stage == .free);
        assert(block.link.next == null);
        return block;
    }
    return null;
}
|
|
234
|
+
|
|
235
|
+
/// Pushes `block` back onto the free stack.
/// The caller must already have set the block's stage to `.free` and left it unlinked.
fn block_release(pool: *@This(), block: *Block) void {
    assert(block.link.next == null);
    assert(block.stage == .free);
    pool.blocks.push(block);
}
|
|
240
|
+
|
|
241
|
+
/// `std.fmt` hook: prints available/total for reads, writes and cpus, followed by the
/// block figures.
pub fn format(
    self: @This(),
    comptime _: []const u8,
    _: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    const template = "ResourcePool{{ " ++
        ".reads = {}/{}, .writes = {}/{}, .cpus = {}/{}, .blocks = {}/{} }}";

    return writer.print(template, .{
        self.reads.available(),
        self.reads.total(),
        self.writes.available(),
        self.writes.total(),
        self.cpus.available(),
        self.cpus.total(),
        // NOTE(review): the other methods of this pool call `blocks.count()`, whereas here
        // `count` is referenced without a call. Confirm this compiles (and prints the
        // intended number) once `format` is actually instantiated.
        self.blocks.count,
        self.blocks_backing_storage.len,
    });
}
|
|
255
|
+
};
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
pub fn CompactionType(
|
|
259
|
+
comptime Table: type,
|
|
260
|
+
comptime Tree: type,
|
|
261
|
+
comptime Storage: type,
|
|
262
|
+
) type {
|
|
263
|
+
return struct {
|
|
264
|
+
const Compaction = @This();
|
|
265
|
+
|
|
266
|
+
const Grid = GridType(Storage);
|
|
267
|
+
const ResourcePool = ResourcePoolType(Grid);
|
|
268
|
+
|
|
269
|
+
const Manifest = ManifestType(Table, Storage);
|
|
270
|
+
const TableInfo = TableInfoType(Table);
|
|
271
|
+
const TableInfoReference = Manifest.TableInfoReference;
|
|
272
|
+
const CompactionRange = Manifest.CompactionRange;
|
|
273
|
+
|
|
274
|
+
const Value = Table.Value;
|
|
275
|
+
const key_from_value = Table.key_from_value;
|
|
276
|
+
const tombstone = Table.tombstone;
|
|
277
|
+
|
|
278
|
+
const TableInfoA = union(enum) {
|
|
279
|
+
immutable: []Value,
|
|
280
|
+
disk: TableInfoReference,
|
|
281
|
+
};
|
|
282
|
+
|
|
283
|
+
const Position = struct {
|
|
284
|
+
index_block: u32 = 0,
|
|
285
|
+
value_block: u32 = 0,
|
|
286
|
+
value: u32 = 0,
|
|
287
|
+
|
|
288
|
+
pub fn format(
|
|
289
|
+
self: @This(),
|
|
290
|
+
comptime _: []const u8,
|
|
291
|
+
_: std.fmt.FormatOptions,
|
|
292
|
+
writer: anytype,
|
|
293
|
+
) !void {
|
|
294
|
+
return writer.print("Position{{ .index_block = {}, " ++
|
|
295
|
+
".value_block = {}, .value = {} }}", .{
|
|
296
|
+
self.index_block,
|
|
297
|
+
self.value_block,
|
|
298
|
+
self.value,
|
|
299
|
+
});
|
|
300
|
+
}
|
|
301
|
+
};
|
|
302
|
+
|
|
303
|
+
// Globally scoped fields:
|
|
304
|
+
// ----------------------
|
|
305
|
+
grid: *Grid,
|
|
306
|
+
tree: *Tree,
|
|
307
|
+
level_b: u8,
|
|
308
|
+
|
|
309
|
+
stage: enum {
|
|
310
|
+
inactive,
|
|
311
|
+
beat,
|
|
312
|
+
beat_quota_done,
|
|
313
|
+
paused,
|
|
314
|
+
} = .inactive,
|
|
315
|
+
|
|
316
|
+
// Bar-scoped fields:
|
|
317
|
+
// -----------------
|
|
318
|
+
|
|
319
|
+
/// `op_min` is the first op/beat of this compaction's half-bar.
|
|
320
|
+
/// `op_min` is used as a snapshot — the compaction's input tables must be visible
|
|
321
|
+
/// to `op_min`.
|
|
322
|
+
///
|
|
323
|
+
/// After this compaction finishes:
|
|
324
|
+
/// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
|
|
325
|
+
/// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
|
|
326
|
+
op_min: u64 = 0,
|
|
327
|
+
|
|
328
|
+
table_info_a: ?TableInfoA = null,
|
|
329
|
+
range_b: ?CompactionRange = null,
|
|
330
|
+
|
|
331
|
+
/// Whether this compaction will use the move-table optimization.
|
|
332
|
+
/// Specifically, this field is set to True if the optimal compaction
|
|
333
|
+
/// table in level A can simply be moved to level B.
|
|
334
|
+
move_table: bool = false,
|
|
335
|
+
/// Levels may choose to drop tombstones if keys aren't included in the lower levels.
|
|
336
|
+
/// This invariant is always true for the last level as it doesn't have any lower ones.
|
|
337
|
+
drop_tombstones: bool = false,
|
|
338
|
+
|
|
339
|
+
/// Counters track physical IO and are not fully deterministic. In particular, `in` and
|
|
340
|
+
/// `dropped` values can vary between the replicas.
|
|
341
|
+
///
|
|
342
|
+
/// Counters obey accounting equation of compaction:
|
|
343
|
+
/// out = in - dropped
|
|
344
|
+
counters: struct {
|
|
345
|
+
in: u64 = 0,
|
|
346
|
+
dropped: u64 = 0, // Tombstones.
|
|
347
|
+
out: u64 = 0,
|
|
348
|
+
|
|
349
|
+
fn consistent(counters: @This()) bool {
|
|
350
|
+
return counters.out == counters.in - counters.dropped;
|
|
351
|
+
}
|
|
352
|
+
} = .{},
|
|
353
|
+
|
|
354
|
+
/// Quotas track logical progress of compaction, determine pacing and must be deterministic.
|
|
355
|
+
/// Quotas count consumed input values. That is, every time an output block is written,
|
|
356
|
+
/// the done quota is incremented by the number of input values which contributed to the
|
|
357
|
+
/// output block.
|
|
358
|
+
///
|
|
359
|
+
/// At the start of the bar, the total number of input values is known. The beat quota is
|
|
360
|
+
/// then set based on the number of values left and beats left.
|
|
361
|
+
quotas: struct {
|
|
362
|
+
beat: u64 = 0,
|
|
363
|
+
beat_done: u64 = 0,
|
|
364
|
+
bar: u64 = 0,
|
|
365
|
+
bar_done: u64 = 0,
|
|
366
|
+
|
|
367
|
+
fn beat_exhausted(quotas: @This()) bool {
|
|
368
|
+
assert(quotas.beat_done <= quotas.bar_done);
|
|
369
|
+
assert(quotas.bar_done <= quotas.bar);
|
|
370
|
+
return quotas.beat_done >= quotas.beat;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
fn bar_exhausted(quotas: @This()) bool {
|
|
374
|
+
assert(quotas.bar_done <= quotas.bar);
|
|
375
|
+
return quotas.bar_done == quotas.bar;
|
|
376
|
+
}
|
|
377
|
+
} = .{},
|
|
378
|
+
|
|
379
|
+
// Position points at the next value from the layer that should be fed into the merge
|
|
380
|
+
// algorithm.
|
|
381
|
+
level_a_position: Position = .{},
|
|
382
|
+
level_b_position: Position = .{},
|
|
383
|
+
|
|
384
|
+
/// Manifest log appends are queued up until bar_complete is explicitly called to ensure
|
|
385
|
+
/// they are applied deterministically relative to other concurrent compactions.
|
|
386
|
+
// Worst-case manifest updates:
|
|
387
|
+
// See docs/about/internals/lsm.md "Compaction Table Overlap" for more detail.
|
|
388
|
+
manifest_entries: stdx.BoundedArrayType(struct {
|
|
389
|
+
operation: enum {
|
|
390
|
+
insert_to_level_b,
|
|
391
|
+
move_to_level_b,
|
|
392
|
+
},
|
|
393
|
+
table: TableInfo,
|
|
394
|
+
}, compaction_tables_output_max) = .{},
|
|
395
|
+
|
|
396
|
+
table_builder: Table.Builder = .{},
|
|
397
|
+
table_builder_index_block: ?*ResourcePool.Block = null,
|
|
398
|
+
table_builder_value_block: ?*ResourcePool.Block = null,
|
|
399
|
+
|
|
400
|
+
// The progress through the immutable table is persisted throughout the bar.
|
|
401
|
+
level_a_immutable_stage: enum { ready, merge, exhausted } = .ready,
|
|
402
|
+
|
|
403
|
+
// Beat-scoped fields:
|
|
404
|
+
// ------------------
|
|
405
|
+
pool: ?*ResourcePool = null,
|
|
406
|
+
callback: ?*const fn (pool: *ResourcePool, tree_id: u16, values_consumed: u64) void = null,
|
|
407
|
+
|
|
408
|
+
// IO queues:
|
|
409
|
+
//
|
|
410
|
+
// Compaction structure is such that the data can be read (and written) concurrently, but
|
|
411
|
+
// the merge must happen sequentially. It is reminiscent of state machine's prefetch/execute
|
|
412
|
+
// split.
|
|
413
|
+
//
|
|
414
|
+
// When a block is read from disk, it is added to the tail of the corresponding queue. When
|
|
415
|
+
// the head block from both level a and level b queues is in the .read_value_block_done
|
|
416
|
+
// state, the two blocks are popped off the queues and passed down to the merge. At this
|
|
417
|
+
// point, any number of the blocks still in the queues can continue their read operations.
|
|
418
|
+
//
|
|
419
|
+
// For index blocks, queues of length one are used. Because an index block is freed as soon
|
|
420
|
+
// as the read for the last value block is scheduled, the pipeline should not dry out even
|
|
421
|
+
// when switching between the tables.
|
|
422
|
+
//
|
|
423
|
+
// Note that level_{a,b}_position fields track the logical, deterministic progression of
|
|
424
|
+
// compaction.
|
|
425
|
+
//
|
|
426
|
+
// For output blocks:
|
|
427
|
+
// - the order of completions doesn't matter,
|
|
428
|
+
// - the blocks are not used after the completion of the IO. That is, only the number of
|
|
429
|
+
// outstanding operations needs to be tracked. Use a RingBuffer with void elements rather
|
|
430
|
+
// than a u32 for API uniformity and a comptime upper bound. We have
|
|
431
|
+
// <https://github.com/ziglang/zig/issues/3806> at home.
|
|
432
|
+
//
|
|
433
|
+
// In addition to static max size, the queues are additionally limited at runtime by the
|
|
434
|
+
// number of available free blocks. The queues are not limited by IOPS --- it is assumed
|
|
435
|
+
// that there are enough IOPS to fill up all the queues.
|
|
436
|
+
level_a_index_block: RingBufferType(*ResourcePool.Block, .{
|
|
437
|
+
.array = 1,
|
|
438
|
+
}) = .{ .buffer = undefined },
|
|
439
|
+
|
|
440
|
+
level_a_value_block: RingBufferType(*ResourcePool.Block, .{
|
|
441
|
+
.array = @divExact(constants.lsm_compaction_queue_read_max, 2),
|
|
442
|
+
}) = .{ .buffer = undefined },
|
|
443
|
+
|
|
444
|
+
level_b_index_block: RingBufferType(*ResourcePool.Block, .{
|
|
445
|
+
.array = 2, // Prefetch next table's index block
|
|
446
|
+
}) = .{ .buffer = undefined },
|
|
447
|
+
|
|
448
|
+
level_b_value_block: RingBufferType(*ResourcePool.Block, .{
|
|
449
|
+
.array = @divExact(constants.lsm_compaction_queue_read_max, 2),
|
|
450
|
+
}) = .{ .buffer = undefined },
|
|
451
|
+
|
|
452
|
+
output_blocks: RingBufferType(void, .{
|
|
453
|
+
.array = constants.lsm_compaction_queue_write_max,
|
|
454
|
+
}) = .{ .buffer = undefined },
|
|
455
|
+
|
|
456
|
+
pub fn init(tree: *Tree, grid: *Grid, level_b: u8) Compaction {
|
|
457
|
+
assert(level_b < constants.lsm_levels);
|
|
458
|
+
|
|
459
|
+
return Compaction{
|
|
460
|
+
.grid = grid,
|
|
461
|
+
.tree = tree,
|
|
462
|
+
.level_b = level_b,
|
|
463
|
+
};
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
pub fn reset(compaction: *Compaction) void {
|
|
467
|
+
compaction.grid.trace.cancel(.compact_beat);
|
|
468
|
+
compaction.grid.trace.cancel(.compact_beat_merge);
|
|
469
|
+
compaction.* = .{
|
|
470
|
+
.grid = compaction.grid,
|
|
471
|
+
.tree = compaction.tree,
|
|
472
|
+
.level_b = compaction.level_b,
|
|
473
|
+
};
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
/// Assert consistency of the compaction counters between beats. This isn't as
|
|
477
|
+
/// straightforward as calling counters.consistent() as we do at the end of the bar, since
|
|
478
|
+
/// we may carry multiple input value blocks, and an output value block, over to the
|
|
479
|
+
/// next beat.
|
|
480
|
+
///
|
|
481
|
+
///
|
|
482
|
+
/// Compute values_in, values_dropped, values_out, values_in_flight, and assert:
|
|
483
|
+
/// values_out + values_in_flight == values_in - values_dropped
|
|
484
|
+
///
|
|
485
|
+
// values_in: Values from the input value blocks that have been read from disk.
|
|
486
|
+
// values_dropped: Values dropped during merge.
|
|
487
|
+
// values_out: Values from the output value blocks that have been written to disk,
|
|
488
|
+
// plus the values from the output value block being carried over to the
|
|
489
|
+
// next beat (output of merge but not written to disk yet).
|
|
490
|
+
// values_in_flight: Values from the input value blocks being carried over to the next
|
|
491
|
+
// beat, minus the values that have already been consumed during
|
|
492
|
+
// merge.
|
|
493
|
+
fn assert_counter_consistency_between_beats(compaction: *const Compaction) void {
|
|
494
|
+
const values_in = compaction.counters.in;
|
|
495
|
+
const values_out = compaction.counters.out + compaction.table_builder.value_count;
|
|
496
|
+
const values_dropped = compaction.counters.dropped;
|
|
497
|
+
|
|
498
|
+
var values_in_flight: u64 = 0;
|
|
499
|
+
|
|
500
|
+
if (compaction.table_info_a.? == .immutable) {
|
|
501
|
+
assert(compaction.level_a_value_block.empty());
|
|
502
|
+
if (compaction.level_a_immutable_stage != .exhausted) {
|
|
503
|
+
values_in_flight += compaction.table_info_a.?.immutable.len;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
var level_a_value_block_iterator = compaction.level_a_value_block.iterator();
|
|
508
|
+
while (level_a_value_block_iterator.next()) |block| {
|
|
509
|
+
values_in_flight += Table.value_block_values_used(block.ptr).len;
|
|
510
|
+
}
|
|
511
|
+
values_in_flight -= compaction.level_a_position.value;
|
|
512
|
+
|
|
513
|
+
var level_b_value_block_iterator = compaction.level_b_value_block.iterator();
|
|
514
|
+
while (level_b_value_block_iterator.next()) |block| {
|
|
515
|
+
values_in_flight += Table.value_block_values_used(block.ptr).len;
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
values_in_flight -= compaction.level_b_position.value;
|
|
519
|
+
|
|
520
|
+
assert(values_out + values_in_flight == values_in - values_dropped);
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
pub fn assert_between_bars(compaction: *const Compaction) void {
|
|
524
|
+
assert(compaction.stage == .inactive);
|
|
525
|
+
assert(compaction.idle());
|
|
526
|
+
assert(compaction.block_queues_empty_output());
|
|
527
|
+
assert(compaction.block_queues_empty_input());
|
|
528
|
+
|
|
529
|
+
assert(compaction.table_builder.state == .no_blocks);
|
|
530
|
+
assert(compaction.table_builder_value_block == null);
|
|
531
|
+
assert(compaction.table_builder_index_block == null);
|
|
532
|
+
assert(compaction.manifest_entries.empty());
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
fn idle(compaction: *const Compaction) bool {
|
|
536
|
+
return compaction.pool == null and
|
|
537
|
+
compaction.callback == null and
|
|
538
|
+
compaction.quotas.beat_exhausted();
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
fn block_queues_empty_output(compaction: *const Compaction) bool {
|
|
542
|
+
return compaction.output_blocks.empty();
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
fn block_queues_empty_input(compaction: *const Compaction) bool {
|
|
546
|
+
return compaction.level_a_index_block.empty() and
|
|
547
|
+
compaction.level_a_value_block.empty() and
|
|
548
|
+
compaction.level_b_index_block.empty() and
|
|
549
|
+
compaction.level_b_value_block.empty();
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
/// Plan the work for the bar:
|
|
553
|
+
/// - check if compaction is needed at all (if the level_a is full),
|
|
554
|
+
/// - find a table on level_a and the corresponding range on level_b that should be
|
|
555
|
+
/// compacted,
|
|
556
|
+
/// - compute the bar quota (just the total number of values in all input tables),
|
|
557
|
+
/// - execute move table optimization if range_b turns out to be empty.
|
|
558
|
+
pub fn bar_commence(compaction: *Compaction, op: u64) u64 {
|
|
559
|
+
assert(compaction.idle());
|
|
560
|
+
assert(compaction.block_queues_empty_output());
|
|
561
|
+
assert(compaction.block_queues_empty_input());
|
|
562
|
+
|
|
563
|
+
assert(compaction.stage == .inactive);
|
|
564
|
+
assert(op == compaction_op_min(op));
|
|
565
|
+
|
|
566
|
+
compaction.stage = .paused;
|
|
567
|
+
compaction.op_min = op;
|
|
568
|
+
|
|
569
|
+
if (compaction.level_b == 0) {
|
|
570
|
+
// Do not start compaction if the immutable table does not require compaction.
|
|
571
|
+
if (compaction.tree.table_immutable.mutability.immutable.flushed) {
|
|
572
|
+
assert(compaction.quotas.bar == 0);
|
|
573
|
+
assert(compaction.quotas.bar_exhausted());
|
|
574
|
+
log.debug("{s}:{}: bar_commence: immutable table flushed", .{
|
|
575
|
+
compaction.tree.config.name,
|
|
576
|
+
compaction.level_b,
|
|
577
|
+
});
|
|
578
|
+
return 0;
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
const table_value_count_limit = Table.value_count_max;
|
|
582
|
+
assert(compaction.tree.table_immutable.count() > 0);
|
|
583
|
+
assert(compaction.tree.table_immutable.count() <= table_value_count_limit);
|
|
584
|
+
|
|
585
|
+
// If the mutable table will fit in the free capacity of the immutable table (even
|
|
586
|
+
// in the projected "worst" case of all full batches during the second half-bar),
|
|
587
|
+
// then defer compacting the immutable table into level 0.
|
|
588
|
+
//
|
|
589
|
+
// This optimization cannot apply to the last bar before a checkpoint trigger, since
|
|
590
|
+
// recovery from the checkpoint only replays that final bar, which must reconstruct
|
|
591
|
+
// the original immutable table.
|
|
592
|
+
// TODO(Snapshots) This optimization must be disabled to take a persistent snapshot.
|
|
593
|
+
const mutable_count_half_bar_first = compaction.tree.table_mutable.count();
|
|
594
|
+
const mutable_count_half_bar_last = @divExact(table_value_count_limit, 2);
|
|
595
|
+
const mutable_count = mutable_count_half_bar_first + mutable_count_half_bar_last;
|
|
596
|
+
const immutable_count = compaction.tree.table_immutable.count();
|
|
597
|
+
if (immutable_count + mutable_count <= table_value_count_limit) {
|
|
598
|
+
const op_checkpoint =
|
|
599
|
+
compaction.grid.superblock.working.vsr_state.checkpoint.header.op;
|
|
600
|
+
const op_checkpoint_next = vsr.Checkpoint.checkpoint_after(op_checkpoint);
|
|
601
|
+
const op_checkpoint_trigger_next =
|
|
602
|
+
vsr.Checkpoint.trigger_for_checkpoint(op_checkpoint_next).?;
|
|
603
|
+
const compaction_op_max = op + (half_bar_beat_count - 1);
|
|
604
|
+
const last_half_bar_of_checkpoint =
|
|
605
|
+
compaction_op_max == op_checkpoint_trigger_next;
|
|
606
|
+
|
|
607
|
+
if (!last_half_bar_of_checkpoint) {
|
|
608
|
+
assert(compaction.quotas.bar == 0);
|
|
609
|
+
assert(compaction.quotas.bar_exhausted());
|
|
610
|
+
log.debug("{s}:{}: bar_commence: immutable table flush skipped " ++
|
|
611
|
+
"({}+{}+{} ≤ {})", .{
|
|
612
|
+
compaction.tree.config.name,
|
|
613
|
+
compaction.level_b,
|
|
614
|
+
immutable_count,
|
|
615
|
+
mutable_count_half_bar_first,
|
|
616
|
+
mutable_count_half_bar_last,
|
|
617
|
+
table_value_count_limit,
|
|
618
|
+
});
|
|
619
|
+
return 0;
|
|
620
|
+
}
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
compaction.table_info_a = .{
|
|
624
|
+
.immutable = compaction.tree.table_immutable.values_used(),
|
|
625
|
+
};
|
|
626
|
+
|
|
627
|
+
compaction.range_b = compaction.tree.manifest.immutable_table_compaction_range(
|
|
628
|
+
compaction.tree.table_immutable.key_min(),
|
|
629
|
+
compaction.tree.table_immutable.key_max(),
|
|
630
|
+
.{ .value_count = compaction.tree.table_immutable.count() },
|
|
631
|
+
);
|
|
632
|
+
|
|
633
|
+
// +1 to count the immutable table (level A).
|
|
634
|
+
assert(compaction.range_b.?.tables.count() + 1 <= compaction_tables_input_max);
|
|
635
|
+
assert(compaction.range_b.?.key_min <= compaction.tree.table_immutable.key_min());
|
|
636
|
+
assert(compaction.tree.table_immutable.key_max() <= compaction.range_b.?.key_max);
|
|
637
|
+
} else {
|
|
638
|
+
const level_a = compaction.level_b - 1;
|
|
639
|
+
|
|
640
|
+
// Do not start compaction if level A does not require compaction.
|
|
641
|
+
const table_range = compaction.tree.manifest.compaction_table(level_a) orelse {
|
|
642
|
+
assert(compaction.quotas.bar == 0);
|
|
643
|
+
assert(compaction.quotas.bar_exhausted());
|
|
644
|
+
log.debug("{s}:{}: bar_commence: nothing to compact", .{
|
|
645
|
+
compaction.tree.config.name,
|
|
646
|
+
compaction.level_b,
|
|
647
|
+
});
|
|
648
|
+
return 0;
|
|
649
|
+
};
|
|
650
|
+
|
|
651
|
+
compaction.table_info_a = .{ .disk = table_range.table_a };
|
|
652
|
+
compaction.range_b = table_range.range_b;
|
|
653
|
+
|
|
654
|
+
assert(compaction.range_b.?.tables.count() + 1 <= compaction_tables_input_max);
|
|
655
|
+
assert(compaction.table_info_a.?.disk.table_info.key_min <=
|
|
656
|
+
compaction.table_info_a.?.disk.table_info.key_max);
|
|
657
|
+
assert(compaction.range_b.?.key_min <=
|
|
658
|
+
compaction.table_info_a.?.disk.table_info.key_min);
|
|
659
|
+
assert(compaction.table_info_a.?.disk.table_info.key_max <=
|
|
660
|
+
compaction.range_b.?.key_max);
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
switch (compaction.table_info_a.?) {
|
|
664
|
+
.immutable => {},
|
|
665
|
+
.disk => |table| {
|
|
666
|
+
assert(!compaction.grid.free_set.is_released(table.table_info.address));
|
|
667
|
+
assert(!compaction.grid.free_set.is_free(table.table_info.address));
|
|
668
|
+
},
|
|
669
|
+
}
|
|
670
|
+
for (compaction.range_b.?.tables.slice()) |table| {
|
|
671
|
+
assert(!compaction.grid.free_set.is_released(table.table_info.address));
|
|
672
|
+
assert(!compaction.grid.free_set.is_free(table.table_info.address));
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
var quota_bar = switch (compaction.table_info_a.?) {
|
|
676
|
+
.immutable => compaction.tree.table_immutable.count(),
|
|
677
|
+
.disk => |table| table.table_info.value_count,
|
|
678
|
+
};
|
|
679
|
+
for (compaction.range_b.?.tables.const_slice()) |*table| {
|
|
680
|
+
quota_bar += table.table_info.value_count;
|
|
681
|
+
}
|
|
682
|
+
compaction.quotas = .{
|
|
683
|
+
.beat = 0,
|
|
684
|
+
.beat_done = 0,
|
|
685
|
+
.bar = quota_bar,
|
|
686
|
+
.bar_done = 0,
|
|
687
|
+
};
|
|
688
|
+
|
|
689
|
+
log.debug("{s}:{}: bar_commence: quota_bar_done={} quota_bar={}", .{
|
|
690
|
+
compaction.tree.config.name,
|
|
691
|
+
compaction.level_b,
|
|
692
|
+
compaction.quotas.bar_done,
|
|
693
|
+
compaction.quotas.bar,
|
|
694
|
+
});
|
|
695
|
+
compaction.move_table = compaction.table_info_a.? == .disk and
|
|
696
|
+
compaction.range_b.?.tables.empty();
|
|
697
|
+
compaction.drop_tombstones = compaction.tree.manifest
|
|
698
|
+
.compaction_must_drop_tombstones(compaction.level_b, &compaction.range_b.?);
|
|
699
|
+
|
|
700
|
+
// The last level must always drop tombstones.
|
|
701
|
+
if (compaction.level_b == constants.lsm_levels - 1) assert(compaction.drop_tombstones);
|
|
702
|
+
|
|
703
|
+
assert(std.meta.eql(compaction.counters, .{}));
|
|
704
|
+
inline for (.{ compaction.level_a_position, compaction.level_b_position }) |position| {
|
|
705
|
+
assert(std.meta.eql(position, .{}));
|
|
706
|
+
}
|
|
707
|
+
|
|
708
|
+
// Append the entries to the manifest update queue here and now if we're doing
|
|
709
|
+
// move table. They'll be applied later by bar_complete().
|
|
710
|
+
if (compaction.move_table) {
|
|
711
|
+
const snapshot_max = snapshot_max_for_table_input(compaction.op_min);
|
|
712
|
+
assert(compaction.table_info_a.?.disk.table_info.snapshot_max >= snapshot_max);
|
|
713
|
+
|
|
714
|
+
compaction.manifest_entries.push(.{
|
|
715
|
+
.operation = .move_to_level_b,
|
|
716
|
+
.table = compaction.table_info_a.?.disk.table_info.*,
|
|
717
|
+
});
|
|
718
|
+
|
|
719
|
+
const value_count = compaction.table_info_a.?.disk.table_info.value_count;
|
|
720
|
+
|
|
721
|
+
compaction.quotas.beat = value_count;
|
|
722
|
+
compaction.quotas.beat_done = value_count;
|
|
723
|
+
compaction.quotas.bar_done = value_count;
|
|
724
|
+
|
|
725
|
+
assert(compaction.quotas.beat_exhausted());
|
|
726
|
+
assert(compaction.quotas.bar_exhausted());
|
|
727
|
+
|
|
728
|
+
return 0;
|
|
729
|
+
} else {
|
|
730
|
+
if (compaction.table_info_a.? == .immutable) {
|
|
731
|
+
compaction.counters.in += compaction.table_info_a.?.immutable.len;
|
|
732
|
+
}
|
|
733
|
+
return compaction.quotas.bar;
|
|
734
|
+
}
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
/// Apply the changes that have been accumulated in memory to the manifest and remove any
|
|
738
|
+
/// tables that are now invisible.
|
|
739
|
+
pub fn bar_complete(compaction: *Compaction) void {
|
|
740
|
+
assert(compaction.idle());
|
|
741
|
+
assert(compaction.block_queues_empty_output());
|
|
742
|
+
assert(compaction.block_queues_empty_input());
|
|
743
|
+
|
|
744
|
+
assert(compaction.stage == .paused);
|
|
745
|
+
assert(compaction.counters.consistent());
|
|
746
|
+
assert(compaction.quotas.bar_exhausted());
|
|
747
|
+
// Assert blocks have been released back to the pipeline.
|
|
748
|
+
assert(compaction.table_builder.state == .no_blocks);
|
|
749
|
+
assert(compaction.table_builder_index_block == null);
|
|
750
|
+
assert(compaction.table_builder_value_block == null);
|
|
751
|
+
|
|
752
|
+
defer {
|
|
753
|
+
compaction.* = .{
|
|
754
|
+
.grid = compaction.grid,
|
|
755
|
+
.tree = compaction.tree,
|
|
756
|
+
.level_b = compaction.level_b,
|
|
757
|
+
};
|
|
758
|
+
assert(compaction.stage == .inactive);
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
if (compaction.table_info_a == null) {
|
|
762
|
+
assert(compaction.range_b == null);
|
|
763
|
+
assert(compaction.manifest_entries.count() == 0);
|
|
764
|
+
assert(compaction.quotas.bar == 0);
|
|
765
|
+
if (compaction.level_b == 0) {
|
|
766
|
+
// Either:
|
|
767
|
+
// - the immutable table is empty (already flushed), or
|
|
768
|
+
// - the mutable table will be absorbed into the immutable table.
|
|
769
|
+
maybe(compaction.tree.table_immutable.mutability.immutable.flushed);
|
|
770
|
+
}
|
|
771
|
+
return;
|
|
772
|
+
}
|
|
773
|
+
assert(compaction.table_info_a != null);
|
|
774
|
+
assert(compaction.range_b != null);
|
|
775
|
+
assert(compaction.quotas.bar > 0);
|
|
776
|
+
|
|
777
|
+
switch (compaction.table_info_a.?) {
|
|
778
|
+
.immutable => {},
|
|
779
|
+
.disk => |table| {
|
|
780
|
+
if (compaction.move_table) {
|
|
781
|
+
assert(!compaction.grid.free_set.is_released(table.table_info.address));
|
|
782
|
+
assert(!compaction.grid.free_set.is_free(table.table_info.address));
|
|
783
|
+
} else {
|
|
784
|
+
assert(compaction.grid.free_set.is_released(table.table_info.address));
|
|
785
|
+
}
|
|
786
|
+
},
|
|
787
|
+
}
|
|
788
|
+
for (compaction.range_b.?.tables.slice()) |table| {
|
|
789
|
+
assert(compaction.grid.free_set.is_released(table.table_info.address));
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
log.debug("{s}:{}: bar_complete: " ++
|
|
793
|
+
"values_in={} values_out={} values_dropped={}", .{
|
|
794
|
+
compaction.tree.config.name,
|
|
795
|
+
compaction.level_b,
|
|
796
|
+
compaction.counters.in,
|
|
797
|
+
compaction.counters.out,
|
|
798
|
+
compaction.counters.dropped,
|
|
799
|
+
});
|
|
800
|
+
|
|
801
|
+
compaction.grid.trace.count(
|
|
802
|
+
.{ .compaction_values_physical = .{
|
|
803
|
+
.tree = @enumFromInt(compaction.tree.config.id),
|
|
804
|
+
} },
|
|
805
|
+
compaction.counters.out,
|
|
806
|
+
);
|
|
807
|
+
if (compaction.level_b == 0) {
|
|
808
|
+
if (compaction.table_info_a.? == .immutable) {
|
|
809
|
+
compaction.grid.trace.count(
|
|
810
|
+
.{ .compaction_values_logical = .{
|
|
811
|
+
.tree = @enumFromInt(compaction.tree.config.id),
|
|
812
|
+
} },
|
|
813
|
+
compaction.table_info_a.?.immutable.len,
|
|
814
|
+
);
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
|
|
818
|
+
// Mark the immutable table as flushed, if we were compacting into level 0.
|
|
819
|
+
if (compaction.level_b == 0) {
|
|
820
|
+
assert(!compaction.tree.table_immutable.mutability.immutable.flushed);
|
|
821
|
+
compaction.tree.table_immutable.mutability.immutable.flushed = true;
|
|
822
|
+
}
|
|
823
|
+
|
|
824
|
+
// Each compaction's manifest updates are deferred to the end of the last
|
|
825
|
+
// bar to ensure:
|
|
826
|
+
// - manifest log updates are ordered deterministically relative to one another, and
|
|
827
|
+
// - manifest updates are not visible until after the blocks are all on disk.
|
|
828
|
+
const manifest = &compaction.tree.manifest;
|
|
829
|
+
const level_b = compaction.level_b;
|
|
830
|
+
const snapshot_max = snapshot_max_for_table_input(compaction.op_min);
|
|
831
|
+
|
|
832
|
+
var manifest_removed_value_count: u64 = 0;
|
|
833
|
+
var manifest_added_value_count: u64 = 0;
|
|
834
|
+
|
|
835
|
+
if (compaction.move_table) {
|
|
836
|
+
// If no compaction is required, don't update snapshot_max.
|
|
837
|
+
} else {
|
|
838
|
+
// These updates MUST precede insert_table() and move_table() since they use
|
|
839
|
+
// references to modify the ManifestLevel in-place.
|
|
840
|
+
switch (compaction.table_info_a.?) {
|
|
841
|
+
.immutable => {
|
|
842
|
+
manifest_removed_value_count = compaction.tree.table_immutable.count();
|
|
843
|
+
},
|
|
844
|
+
.disk => |table_info| {
|
|
845
|
+
manifest_removed_value_count += table_info.table_info.value_count;
|
|
846
|
+
manifest.update_table(level_b - 1, snapshot_max, table_info);
|
|
847
|
+
},
|
|
848
|
+
}
|
|
849
|
+
for (compaction.range_b.?.tables.const_slice()) |table| {
|
|
850
|
+
manifest_removed_value_count += table.table_info.value_count;
|
|
851
|
+
manifest.update_table(level_b, snapshot_max, table);
|
|
852
|
+
}
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
for (compaction.manifest_entries.slice()) |*entry| {
|
|
856
|
+
switch (entry.operation) {
|
|
857
|
+
.insert_to_level_b => {
|
|
858
|
+
manifest.insert_table(level_b, &entry.table);
|
|
859
|
+
manifest_added_value_count += entry.table.value_count;
|
|
860
|
+
},
|
|
861
|
+
.move_to_level_b => {
|
|
862
|
+
manifest.move_table(level_b - 1, level_b, &entry.table);
|
|
863
|
+
manifest_removed_value_count += entry.table.value_count;
|
|
864
|
+
manifest_added_value_count += entry.table.value_count;
|
|
865
|
+
},
|
|
866
|
+
}
|
|
867
|
+
}
|
|
868
|
+
if (compaction.move_table) {
|
|
869
|
+
assert(std.meta.eql(compaction.counters, .{}));
|
|
870
|
+
assert(manifest_added_value_count == manifest_removed_value_count);
|
|
871
|
+
assert(manifest_added_value_count > 0);
|
|
872
|
+
} else {
|
|
873
|
+
assert(manifest_added_value_count == compaction.counters.out);
|
|
874
|
+
assert(manifest_removed_value_count == compaction.counters.in);
|
|
875
|
+
assert(manifest_removed_value_count - manifest_added_value_count ==
|
|
876
|
+
compaction.counters.dropped);
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
// Hide any tables that are now invisible.
|
|
880
|
+
manifest.remove_invisible_tables(
|
|
881
|
+
level_b,
|
|
882
|
+
&.{},
|
|
883
|
+
compaction.range_b.?.key_min,
|
|
884
|
+
compaction.range_b.?.key_max,
|
|
885
|
+
);
|
|
886
|
+
if (level_b > 0) {
|
|
887
|
+
manifest.remove_invisible_tables(
|
|
888
|
+
level_b - 1,
|
|
889
|
+
&.{},
|
|
890
|
+
compaction.range_b.?.key_min,
|
|
891
|
+
compaction.range_b.?.key_max,
|
|
892
|
+
);
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
pub fn beat_commence(
|
|
897
|
+
compaction: *Compaction,
|
|
898
|
+
values_count: u64,
|
|
899
|
+
) void {
|
|
900
|
+
assert(compaction.idle());
|
|
901
|
+
assert(compaction.stage == .paused);
|
|
902
|
+
assert(compaction.block_queues_empty_output());
|
|
903
|
+
// We may be carrying over some blocks from the previous beat.
|
|
904
|
+
maybe(compaction.block_queues_empty_input());
|
|
905
|
+
|
|
906
|
+
if (compaction.move_table) assert(compaction.quotas.bar_exhausted());
|
|
907
|
+
|
|
908
|
+
// Run the compaction up to completion of the bar quota, if possible.
|
|
909
|
+
const values_remaining = (compaction.quotas.bar - compaction.quotas.bar_done);
|
|
910
|
+
|
|
911
|
+
compaction.quotas.beat = @min(values_count, values_remaining);
|
|
912
|
+
compaction.quotas.beat_done = 0;
|
|
913
|
+
assert(compaction.quotas.beat <= compaction.quotas.bar);
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
/// The entry point to the actual compaction work for the beat. Called by the forest.
|
|
917
|
+
pub fn compaction_dispatch_enter(
|
|
918
|
+
compaction: *Compaction,
|
|
919
|
+
options: struct {
|
|
920
|
+
pool: *ResourcePool,
|
|
921
|
+
callback: *const fn (pool: *ResourcePool, tree_id: u16, values_consumed: u64) void,
|
|
922
|
+
},
|
|
923
|
+
) enum { pending, ready } {
|
|
924
|
+
assert(compaction.stage == .paused);
|
|
925
|
+
assert(compaction.block_queues_empty_output());
|
|
926
|
+
// We may be carrying over some blocks from the previous beat.
|
|
927
|
+
maybe(compaction.block_queues_empty_input());
|
|
928
|
+
|
|
929
|
+
if (compaction.move_table) assert(compaction.quotas.bar_exhausted());
|
|
930
|
+
|
|
931
|
+
if (compaction.quotas.bar_exhausted()) {
|
|
932
|
+
log.debug("{}: {s}:{}: beat_commence: bar quota={} fulfilled, done={}", .{
|
|
933
|
+
compaction.grid.superblock.replica_index.?,
|
|
934
|
+
compaction.tree.config.name,
|
|
935
|
+
compaction.level_b,
|
|
936
|
+
compaction.quotas.bar,
|
|
937
|
+
compaction.quotas.bar_done,
|
|
938
|
+
});
|
|
939
|
+
return .ready;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
if (compaction.quotas.beat_exhausted()) {
|
|
943
|
+
log.debug("{s}:{}: beat_commence: beat quota={} fulfilled, done={}", .{
|
|
944
|
+
compaction.tree.config.name,
|
|
945
|
+
compaction.level_b,
|
|
946
|
+
compaction.quotas.beat,
|
|
947
|
+
compaction.quotas.beat_done,
|
|
948
|
+
});
|
|
949
|
+
return .ready;
|
|
950
|
+
}
|
|
951
|
+
|
|
952
|
+
assert(!compaction.move_table);
|
|
953
|
+
|
|
954
|
+
compaction.grid.trace.start(.{ .compact_beat = .{
|
|
955
|
+
.tree = @enumFromInt(compaction.tree.config.id),
|
|
956
|
+
.level_b = compaction.level_b,
|
|
957
|
+
} });
|
|
958
|
+
|
|
959
|
+
assert(options.pool.idle());
|
|
960
|
+
assert(options.pool.grid_reservation != null);
|
|
961
|
+
|
|
962
|
+
compaction.pool = options.pool;
|
|
963
|
+
compaction.callback = options.callback;
|
|
964
|
+
compaction.stage = .beat;
|
|
965
|
+
|
|
966
|
+
compaction.compaction_dispatch();
|
|
967
|
+
return .pending;
|
|
968
|
+
}
|
|
969
|
+
|
|
970
|
+
// While beat_commence is called by the forest sequentially for each compaction, to get
|
|
971
|
+
// deterministic grid reservations, each compaction completes its own beat's work
|
|
972
|
+
// asynchronously.
|
|
973
|
+
fn beat_complete(compaction: *Compaction) void {
|
|
974
|
+
assert(compaction.stage == .beat_quota_done);
|
|
975
|
+
switch (compaction.table_builder.state) {
|
|
976
|
+
.no_blocks => {},
|
|
977
|
+
.index_and_value_block => {
|
|
978
|
+
assert(!compaction.table_builder.index_block_full());
|
|
979
|
+
assert(!compaction.table_builder.value_block_full());
|
|
980
|
+
assert(!compaction.quotas.bar_exhausted());
|
|
981
|
+
},
|
|
982
|
+
.index_block => {
|
|
983
|
+
assert(!compaction.table_builder.index_block_full());
|
|
984
|
+
assert(!compaction.quotas.bar_exhausted());
|
|
985
|
+
},
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
if (compaction.table_info_a.? == .immutable) {
|
|
989
|
+
switch (compaction.level_a_immutable_stage) {
|
|
990
|
+
.ready, .exhausted => {},
|
|
991
|
+
.merge => unreachable,
|
|
992
|
+
}
|
|
993
|
+
}
|
|
994
|
+
|
|
995
|
+
assert(compaction.block_queues_empty_output());
|
|
996
|
+
// We may be carrying over some input blocks to the next beat.
|
|
997
|
+
maybe(compaction.block_queues_empty_input());
|
|
998
|
+
|
|
999
|
+
compaction.assert_counter_consistency_between_beats();
|
|
1000
|
+
|
|
1001
|
+
assert(compaction.pool.?.idle());
|
|
1002
|
+
maybe(compaction.pool.?.blocks_acquired() > 0);
|
|
1003
|
+
|
|
1004
|
+
if (compaction.quotas.bar_exhausted()) {
|
|
1005
|
+
assert(compaction.table_builder.state == .no_blocks);
|
|
1006
|
+
assert(compaction.table_builder_value_block == null);
|
|
1007
|
+
assert(compaction.table_builder_index_block == null);
|
|
1008
|
+
assert(compaction.block_queues_empty_input());
|
|
1009
|
+
}
|
|
1010
|
+
|
|
1011
|
+
const pool = compaction.pool.?;
|
|
1012
|
+
const callback = compaction.callback.?;
|
|
1013
|
+
|
|
1014
|
+
compaction.stage = .paused;
|
|
1015
|
+
compaction.callback = null;
|
|
1016
|
+
compaction.pool = null;
|
|
1017
|
+
|
|
1018
|
+
assert(compaction.idle());
|
|
1019
|
+
assert(pool.idle());
|
|
1020
|
+
log.debug("{s}:{}: beat_complete: quota_beat_done={} quota_beat={} " ++
|
|
1021
|
+
"quota_bar_done={} quota_bar={}", .{
|
|
1022
|
+
compaction.tree.config.name,
|
|
1023
|
+
compaction.level_b,
|
|
1024
|
+
compaction.quotas.beat_done,
|
|
1025
|
+
compaction.quotas.beat,
|
|
1026
|
+
compaction.quotas.bar_done,
|
|
1027
|
+
compaction.quotas.bar,
|
|
1028
|
+
});
|
|
1029
|
+
|
|
1030
|
+
callback(pool, compaction.tree.config.id, compaction.quotas.beat_done);
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
// Compaction is a lot of work: read input tables from both levels, merge their value
|
|
1034
|
+
// blocks, write the results to disk. Many of these jobs can proceed in parallel. For
|
|
1035
|
+
// example, only a single value block from each level is needed to start a merge.
|
|
1036
|
+
//
|
|
1037
|
+
// The job of compaction_dispatch is to kick off all the jobs. There are several additional
|
|
1038
|
+
// concerns:
|
|
1039
|
+
// - All jobs use the same common pool of resources (ResourcePool). The jobs are started
|
|
1040
|
+
// in the order that splits resources fairly (e.g., reads from level a and level b
|
|
1041
|
+
// alternate). Fairness also ensures that the process does not deadlock.
|
|
1042
|
+
// - Jobs have dependencies --- merging needs value blocks, reading a value block needs the
|
|
1043
|
+
// corresponding index blocks.
|
|
1044
|
+
// - A single beat of compaction should process only a fraction of the input, so the
|
|
1045
|
+
// processes can be suspended in the middle.
|
|
1046
|
+
//
|
|
1047
|
+
// A beat of compaction ends when both:
|
|
1048
|
+
// - at least quotas.beat of input values is consumed,
|
|
1049
|
+
// - there are no incomplete output value blocks.
|
|
1050
|
+
//
|
|
1051
|
+
// In other words, the only compaction state that gets carried over to the next beat is a
|
|
1052
|
+
// partially full index block. The current beat must end with writing a value block, and the
|
|
1053
|
+
// next beat must start with re-reading level_a and level_b index and value blocks.
|
|
1054
|
+
fn compaction_dispatch(compaction: *Compaction) void {
    // Starts every job (index/value block reads, merges, value/index block writes) that can
    // currently make progress, then returns. Completion callbacks re-enter this function.
    // Once the stage is .beat_quota_done, control is handed over to
    // compaction_dispatch_beat_quota_done instead.
    switch (compaction.stage) {
        .beat, .beat_quota_done => {},
        .inactive, .paused => unreachable,
    }

    const pool = compaction.pool.?;

    // Each iteration that reports progress has acquired a read, a write or a CPU, so the
    // number of productive iterations is bounded by what is available right now. The extra
    // iteration is the final one, which observes that nothing progressed.
    const iterations_max =
        pool.reads.available() + pool.writes.available() + pool.cpus.available() + 1;
    var made_progress = true;
    for (0..iterations_max) |_| {
        if (!made_progress) break;
        made_progress = false;

        if (compaction.stage == .beat_quota_done) {
            // Nothing new is started; just wait for the in-flight jobs to complete.
            return compaction.compaction_dispatch_beat_quota_done();
        }

        // The table builder's blocks are allocated before any input blocks, to avoid
        // deadlocks on the shared block pool.
        if (compaction.table_builder.state == .no_blocks) {
            assert(compaction.table_builder_index_block == null);
            if (pool.block_acquire()) |builder_index_block| {
                assert(builder_index_block.stage == .free);
                builder_index_block.stage = .build_index_block;
                compaction.table_builder.set_index_block(builder_index_block.ptr);
                compaction.table_builder_index_block = builder_index_block;
            } else {
                assert(compaction.output_blocks.count > 0);
            }
        }

        if (compaction.table_builder.state == .index_block) {
            assert(compaction.table_builder_value_block == null);
            if (pool.block_acquire()) |builder_value_block| {
                assert(builder_value_block.stage == .free);
                builder_value_block.stage = .build_value_block;
                compaction.table_builder.set_value_block(builder_value_block.ptr);
                compaction.table_builder_value_block = builder_value_block;
            } else {
                assert(compaction.output_blocks.count > 0);
            }
        }

        // Index of the next block to request per queue: the current position plus the
        // number of blocks already queued.
        const level_a_index_block_next = compaction.level_a_position.index_block +
            @as(u32, @intCast(compaction.level_a_index_block.count));
        const level_b_index_block_next = compaction.level_b_position.index_block +
            @as(u32, @intCast(compaction.level_b_index_block.count));
        const level_a_value_block_next = compaction.level_a_position.value_block +
            @as(u32, @intCast(compaction.level_a_value_block.count));
        const level_b_value_block_next = compaction.level_b_position.value_block +
            @as(u32, @intCast(compaction.level_b_value_block.count));

        // Level A index block: only exists when level A is an on-disk table (level_b > 0),
        // and there is exactly one of it.
        if (compaction.table_info_a.? == .disk) {
            assert(compaction.level_b > 0);
            const level_a_index_wanted = !compaction.level_a_index_block.full() and
                level_a_index_block_next < 1;
            if (level_a_index_wanted) {
                if (pool.block_acquire()) |block| {
                    const read = pool.reads.acquire().?;

                    assert(block.stage == .free);
                    block.stage = .read_index_block;
                    compaction.level_a_index_block.push_assume_capacity(block);

                    compaction.read_index_block(.level_a, read, block);
                    made_progress = true;
                } else {
                    assert(compaction.level_a_index_block.count > 0 or
                        compaction.output_blocks.count > 0);
                }
            }
        }

        // Level B index block: one per table in range_b.
        const level_b_index_wanted = !compaction.level_b_index_block.full() and
            level_b_index_block_next < compaction.range_b.?.tables.count();
        if (level_b_index_wanted) {
            if (pool.block_acquire()) |block| {
                const read = pool.reads.acquire().?;

                assert(block.stage == .free);
                block.stage = .read_index_block;
                compaction.level_b_index_block.push_assume_capacity(block);

                compaction.read_index_block(.level_b, read, block);
                made_progress = true;
            } else {
                assert(compaction.level_b_index_block.count > 0 or
                    compaction.output_blocks.count > 0);
            }
        }

        // Level A value block.
        switch (compaction.table_info_a.?) {
            // The whole table is already in memory; there is nothing to read.
            .immutable => assert(compaction.level_a_index_block.count == 0),
            .disk => if (compaction.level_a_index_block.head()) |index_block| {
                if (index_block.stage != .read_index_block_done) {
                    // The index block read has not completed yet.
                    assert(index_block.stage == .read_index_block);
                } else {
                    const index_schema = schema.TableIndex.from(index_block.ptr);
                    const value_blocks_count =
                        index_schema.value_blocks_used(index_block.ptr);
                    const level_a_value_wanted = !compaction.level_a_value_block.full() and
                        level_a_value_block_next < value_blocks_count;
                    if (level_a_value_wanted) {
                        if (pool.block_acquire()) |block| {
                            const read = pool.reads.acquire().?;

                            assert(block.stage == .free);
                            block.stage = .read_value_block;
                            compaction.level_a_value_block.push_assume_capacity(block);

                            compaction.read_value_block(.level_a, read, block);
                            made_progress = true;
                        } else {
                            assert(compaction.level_a_value_block.count > 0 or
                                compaction.output_blocks.count > 0);
                        }
                    }
                }
            },
        }

        // Level B value block.
        if (compaction.level_b_index_block.head()) |index_block| {
            if (index_block.stage != .read_index_block_done) {
                // The index block read has not completed yet.
                assert(index_block.stage == .read_index_block);
            } else {
                const index_schema = schema.TableIndex.from(index_block.ptr);
                const value_blocks_count =
                    index_schema.value_blocks_used(index_block.ptr);
                const level_b_value_wanted = !compaction.level_b_value_block.full() and
                    level_b_value_block_next < value_blocks_count;
                if (level_b_value_wanted) {
                    if (pool.block_acquire()) |block| {
                        const read = pool.reads.acquire().?;

                        assert(block.stage == .free);
                        block.stage = .read_value_block;
                        compaction.level_b_value_block.push_assume_capacity(block);

                        compaction.read_value_block(.level_b, read, block);
                        made_progress = true;
                    } else {
                        assert(compaction.level_b_value_block.count > 0 or
                            compaction.output_blocks.count > 0);
                    }
                }
            }
        }

        // A level is "ready" when its head input can be merged right now, and "exhausted"
        // when it has no queued input left at all.
        const level_a_ready = switch (compaction.table_info_a.?) {
            .immutable => compaction.level_a_immutable_stage == .ready,
            .disk => if (compaction.level_a_value_block.head()) |front|
                front.stage == .read_value_block_done
            else
                false,
        };
        const level_a_exhausted = switch (compaction.table_info_a.?) {
            .immutable => compaction.level_a_immutable_stage == .exhausted,
            .disk => compaction.level_a_index_block.count == 0 and
                compaction.level_a_value_block.count == 0,
        };

        const level_b_ready = if (compaction.level_b_value_block.head()) |front|
            front.stage == .read_value_block_done
        else
            false;
        const level_b_exhausted = compaction.level_b_index_block.count == 0 and
            compaction.level_b_value_block.count == 0;

        const levels_exhausted = level_a_exhausted and level_b_exhausted;
        assert(levels_exhausted == compaction.quotas.bar_exhausted());

        if (compaction.table_builder.state == .index_and_value_block) {
            if (levels_exhausted) {
                assert(compaction.stage == .beat_quota_done);
            } else if ((level_a_exhausted or level_a_ready) and
                (level_b_exhausted or level_b_ready) and
                !compaction.table_builder.value_block_full())
            {
                const cpu = pool.cpus.acquire().?;
                compaction.merge(cpu);
                made_progress = true;
            }

            // Value and index blocks must be written together (hence room for two output
            // blocks is required up front); otherwise the index block's capacity for value
            // block addresses could overflow.
            if (compaction.output_blocks.spare_capacity() >= 2) {
                if (compaction.table_builder.value_block_full()) {
                    assert(!compaction.output_blocks.full());
                    const write = pool.writes.acquire().?;
                    compaction.write_value_block(write, .{
                        .address = compaction.grid.acquire(pool.grid_reservation.?),
                    });
                    made_progress = true;
                }

                if (compaction.table_builder.index_block_full()) {
                    assert(!compaction.output_blocks.full());
                    const write = pool.writes.acquire().?;
                    compaction.write_index_block(write, .{
                        .address = compaction.grid.acquire(pool.grid_reservation.?),
                    });
                    made_progress = true;
                }
            }
        }
    } else unreachable; // The iteration bound must never be hit.

    assert(!made_progress);
    assert(!pool.idle());
}
|
|
1288
|
+
|
|
1289
|
+
fn compaction_dispatch_beat_quota_done(compaction: *Compaction) void {
    // Winds the beat down once its quota is done: flushes (or returns to the pool) the table
    // builder's blocks where required, waits for pending output blocks and in-flight reads,
    // and finally calls beat_complete(). Returns early while anything is still pending.
    assert(compaction.stage == .beat_quota_done);

    // Value block: flush it if it is full, or if the bar is exhausted.
    const value_block_due = compaction.table_builder.state == .index_and_value_block and
        (compaction.table_builder.value_block_full() or compaction.quotas.bar_exhausted());
    if (value_block_due) {
        if (compaction.table_builder.value_block_empty()) {
            // Nothing was written into the block: hand it straight back to the pool.
            assert(compaction.quotas.bar_exhausted());
            const unused_block = compaction.table_builder_value_block.?;
            compaction.table_builder_value_block = null;
            compaction.table_builder.state = .index_block;
            assert(unused_block.stage == .build_value_block);
            unused_block.stage = .free;
            compaction.pool.?.block_release(unused_block);
        } else if (!compaction.output_blocks.full()) {
            const write = compaction.pool.?.writes.acquire().?;
            compaction.write_value_block(write, .{
                .address = compaction.grid.acquire(compaction.pool.?.grid_reservation.?),
            });
            assert(compaction.table_builder.state == .index_block);
            assert(compaction.table_builder_value_block == null);
        }
    }

    // Index block: same treatment, once the value block is out of the way.
    const index_block_due = compaction.table_builder.state == .index_block and
        (compaction.table_builder.index_block_full() or compaction.quotas.bar_exhausted());
    if (index_block_due) {
        if (compaction.table_builder.index_block_empty()) {
            assert(compaction.quotas.bar_exhausted());
            const unused_block = compaction.table_builder_index_block.?;
            compaction.table_builder_index_block = null;
            compaction.table_builder.state = .no_blocks;
            assert(unused_block.stage == .build_index_block);
            unused_block.stage = .free;
            compaction.pool.?.block_release(unused_block);
        } else if (!compaction.output_blocks.full()) {
            const write = compaction.pool.?.writes.acquire().?;
            compaction.write_index_block(write, .{
                .address = compaction.grid.acquire(compaction.pool.?.grid_reservation.?),
            });
            assert(compaction.table_builder.state == .no_blocks);
            assert(compaction.table_builder_value_block == null);
            assert(compaction.table_builder_index_block == null);
        }
    }

    // Output blocks are still pending; this function runs again later.
    if (compaction.output_blocks.count > 0) return;

    // With no output pending, whatever the builder still holds must be legitimately partial.
    switch (compaction.table_builder.state) {
        .no_blocks => {},
        .index_and_value_block => {
            assert(!compaction.table_builder.index_block_full());
            assert(!compaction.table_builder.value_block_full());
            assert(!compaction.quotas.bar_exhausted());
        },
        .index_block => {
            assert(!compaction.table_builder.index_block_full());
            assert(!compaction.quotas.bar_exhausted());
        },
    }

    // Any read still in flight postpones the end of the beat.
    var a_value_blocks = compaction.level_a_value_block.iterator();
    while (a_value_blocks.next()) |block| {
        if (block.stage == .read_value_block) return;
        assert(block.stage == .read_value_block_done);
    }

    var a_index_blocks = compaction.level_a_index_block.iterator();
    while (a_index_blocks.next()) |block| {
        if (block.stage == .read_index_block) return;
        assert(block.stage == .read_index_block_done);
    }

    var b_value_blocks = compaction.level_b_value_block.iterator();
    while (b_value_blocks.next()) |block| {
        if (block.stage == .read_value_block) return;
        assert(block.stage == .read_value_block_done);
    }

    var b_index_blocks = compaction.level_b_index_block.iterator();
    while (b_index_blocks.next()) |block| {
        if (block.stage == .read_index_block) return;
        assert(block.stage == .read_index_block_done);
    }

    // Everything is quiescent: close the beat's trace span and complete the beat.
    compaction.grid.trace.stop(.{ .compact_beat = .{
        .tree = @enumFromInt(compaction.tree.config.id),
        .level_b = compaction.level_b,
    } });
    compaction.beat_complete();
}
|
|
1394
|
+
|
|
1395
|
+
fn read_index_block(
    compaction: *Compaction,
    level: enum { level_a, level_b },
    read: *ResourcePool.BlockRead,
    index_block: *ResourcePool.Block,
) void {
    // Issues a grid read for a table's index block.
    //
    // - level: which input the block belongs to. For level A the table is
    //   `table_info_a.?.disk`; for level B it is the `range_b` table at index
    //   `level_b_index_block_next - 1`.
    // - read: read resource to use; it is released by read_index_block_callback.
    // - index_block: destination block, which must be in stage .read_index_block.
    //
    // Completion is reported asynchronously through read_index_block_callback.
    const level_b_index_block_next =
        compaction.level_b_position.index_block +
        @as(u32, @intCast(compaction.level_b_index_block.count));

    assert(compaction.stage == .beat or compaction.stage == .beat_quota_done);
    assert(index_block.stage == .read_index_block);
    switch (level) {
        .level_a => assert(compaction.level_a_position.index_block == 0),
        .level_b => {
            // Check for zero before subtracting: the counter is unsigned, so `next - 1`
            // would overflow and trip the overflow check rather than this assertion.
            assert(level_b_index_block_next > 0);
            assert(level_b_index_block_next - 1 < compaction.range_b.?.tables.count());
        },
    }

    const table_ref = switch (level) {
        .level_a => compaction.table_info_a.?.disk,
        .level_b => compaction.range_b.?.tables.get(level_b_index_block_next - 1),
    };

    // The callback recovers the destination block and the compaction from the read.
    read.block = index_block;
    read.compaction = compaction;
    compaction.grid.read_block(
        .{ .from_local_or_global_storage = read_index_block_callback },
        &read.grid_read,
        table_ref.table_info.address,
        table_ref.table_info.checksum,
        .{ .cache_read = true, .cache_write = true },
    );
}
|
|
1429
|
+
|
|
1430
|
+
fn read_index_block_callback(grid_read: *Grid.Read, index_block: BlockPtrConst) void {
    // Completion handler for read_index_block: copies the block handed over by the grid
    // into the compaction's own block, marks it as read, and resumes dispatching.
    const read: *ResourcePool.BlockRead = @fieldParentPtr("grid_read", grid_read);
    const compaction: *Compaction = read.parent(Compaction);

    // Take the destination block out of the read before the read is released.
    const destination = read.block;
    assert(destination.stage == .read_index_block);
    compaction.pool.?.reads.release(read);

    stdx.copy_disjoint(.exact, u8, destination.ptr, index_block);
    destination.stage = .read_index_block_done;

    compaction.compaction_dispatch();
}
|
|
1441
|
+
|
|
1442
|
+
fn read_value_block(
    compaction: *Compaction,
    level: enum { level_a, level_b },
    read: *ResourcePool.BlockRead,
    value_block: *ResourcePool.Block,
) void {
    // Issues a grid read for a value block of the table at the head of the given level's
    // index block queue. The address and checksum come from that index block; completion
    // is reported asynchronously through read_value_block_callback.
    assert(compaction.stage == .beat or compaction.stage == .beat_quota_done);
    assert(value_block.stage == .read_value_block);
    if (level == .level_a) assert(compaction.table_info_a.? == .disk);

    const index_block = switch (level) {
        .level_a => compaction.level_a_index_block.head().?,
        .level_b => compaction.level_b_index_block.head().?,
    };

    // Position plus queued count for the requested level; the block being read here is the
    // last one counted, so its index within the table is one less.
    const value_block_next = switch (level) {
        .level_a => compaction.level_a_position.value_block +
            @as(u32, @intCast(compaction.level_a_value_block.count)),
        .level_b => compaction.level_b_position.value_block +
            @as(u32, @intCast(compaction.level_b_value_block.count)),
    };
    assert(value_block_next > 0);
    const value_block_index = value_block_next - 1;

    const index_schema = schema.TableIndex.from(index_block.ptr);
    const addresses = index_schema.value_addresses_used(index_block.ptr);
    const checksums = index_schema.value_checksums_used(index_block.ptr);
    const value_block_address = addresses[value_block_index];
    const value_block_checksum = checksums[value_block_index];

    // The callback recovers the destination block and the compaction from the read.
    read.block = value_block;
    read.compaction = compaction;
    compaction.grid.read_block(
        .{ .from_local_or_global_storage = read_value_block_callback },
        &read.grid_read,
        value_block_address,
        value_block_checksum.value,
        .{ .cache_read = true, .cache_write = true },
    );
}
|
|
1494
|
+
|
|
1495
|
+
// TODO: Support for LSM snapshots would require us to only remove blocks
|
|
1496
|
+
// that are invisible.
|
|
1497
|
+
fn read_value_block_release_table(
    compaction: *Compaction,
    index_block: BlockPtrConst,
) void {
    // Releases the grid addresses of a fully consumed input table: all of its used value
    // blocks, then the index block itself.
    const index_schema = schema.TableIndex.from(index_block);
    const value_addresses = index_schema.value_addresses_used(index_block);
    const index_address = Table.block_address(index_block);

    if (!compaction.grid.free_set.is_released(index_address)) {
        compaction.grid.release(value_addresses);
        compaction.grid.release(&.{index_address});
        return;
    }

    // The same index block can be re-read across the bar, so a table may reach this point
    // twice. In that case every value block must already have been released as well.
    for (value_addresses) |value_address| {
        assert(compaction.grid.free_set.is_released(value_address));
    }
}
|
|
1516
|
+
|
|
1517
|
+
fn read_value_block_callback(grid_read: *Grid.Read, value_block: BlockPtrConst) void {
    // Completion handler for read_value_block: copies the block handed over by the grid
    // into the compaction's own block, accounts for its values, and resumes dispatching.
    const read: *ResourcePool.BlockRead = @fieldParentPtr("grid_read", grid_read);
    const compaction: *Compaction = read.parent(Compaction);

    // Take the destination block out of the read before the read is released.
    const destination = read.block;
    assert(destination.stage == .read_value_block);
    compaction.pool.?.reads.release(read);

    stdx.copy_disjoint(.exact, u8, destination.ptr, value_block);
    destination.stage = .read_value_block_done;

    // Count the values that entered the compaction through this block.
    compaction.counters.in += Table.value_block_values_used(destination.ptr).len;

    compaction.compaction_dispatch();
}
|
|
1529
|
+
|
|
1530
|
+
fn merge(compaction: *Compaction, cpu: *ResourcePool.CPU) void {
    // Marks the head input of each level as being merged and schedules merge_callback on
    // the next tick, where the actual merge takes place.
    assert(!compaction.quotas.bar_exhausted());

    // Level A: either the in-memory immutable table or the head on-disk value block.
    switch (compaction.table_info_a.?) {
        .immutable => {
            if (compaction.level_a_immutable_stage != .ready) {
                assert(compaction.level_a_immutable_stage == .exhausted);
            } else {
                compaction.level_a_immutable_stage = .merge;
            }
        },
        .disk => {
            if (compaction.level_a_value_block.head()) |block| {
                assert(block.stage == .read_value_block_done);
                block.stage = .merge;
            } else {
                // With no level A block queued, level B must supply the input.
                assert(compaction.level_b_value_block.head() != null);
            }
        },
    }

    // Level B: the head value block, when there is one.
    if (compaction.level_b_value_block.head()) |block| {
        assert(block.stage == .read_value_block_done);
        block.stage = .merge;
    }

    assert(compaction.table_builder.state == .index_and_value_block);

    // merge_callback recovers the compaction from the CPU resource.
    cpu.compaction = compaction;
    compaction.grid.on_next_tick(merge_callback, &cpu.next_tick);
}
|
|
1554
|
+
|
|
1555
|
+
fn merge_callback(next_tick: *Grid.NextTick) void {
    // Performs one merge step scheduled by merge(): moves values from the level A and/or
    // level B inputs into the table builder's value block, updates positions, quotas and
    // counters, and resumes dispatching.
    const cpu: *ResourcePool.CPU = @fieldParentPtr("next_tick", next_tick);
    const compaction: *Compaction = cpu.parent(Compaction);
    compaction.pool.?.cpus.release(cpu);
    assert(compaction.table_builder.state == .index_and_value_block);

    compaction.grid.trace.start(.{ .compact_beat_merge = .{
        .tree = @enumFromInt(compaction.tree.config.id),
        .level_b = compaction.level_b,
    } });

    const input_a, const input_b = compaction.merge_inputs();
    assert(input_a != null or input_b != null);

    // The unfilled tail of the value block under construction.
    const output = compaction.table_builder
        .value_block_values()[compaction.table_builder.value_count..];

    // Every slice taking part in the merge is non-empty and at most one value block long.
    inline for ([_]?[]const Value{ input_a, input_b, output }) |slice_maybe| {
        if (slice_maybe) |slice| {
            assert(slice.len > 0);
            assert(slice.len <= Table.data.value_count_max);
        }
    }

    // Do the actual merge from the inputs into the output (table builder).
    const merge_result: MergeResult = result: {
        const values_a = input_a orelse {
            // Only level B has input: copy it through.
            const consumed = values_copy(output, input_b.?);
            break :result .{
                .consumed_a = 0,
                .consumed_b = consumed,
                .dropped = 0,
                .produced = consumed,
            };
        };

        const values_b = input_b orelse {
            // Only level A has input: copy it, dropping tombstones when configured to.
            if (compaction.drop_tombstones) {
                const copy_result = values_copy_drop_tombstones(output, values_a);
                break :result .{
                    .consumed_a = copy_result.consumed,
                    .consumed_b = 0,
                    .dropped = copy_result.dropped,
                    .produced = copy_result.produced,
                };
            }

            const consumed = values_copy(output, values_a);
            break :result .{
                .consumed_a = consumed,
                .consumed_b = 0,
                .dropped = 0,
                .produced = consumed,
            };
        };

        // Both levels have input: a real two-way merge.
        break :result values_merge(output, values_a, values_b, compaction.drop_tombstones);
    };

    compaction.level_a_position.value += merge_result.consumed_a;
    compaction.level_b_position.value += merge_result.consumed_b;
    compaction.table_builder.value_count += merge_result.produced;

    // Level A's position is bounded by the whole table when it is the immutable table, and
    // by a single value block when it is read from disk.
    switch (compaction.table_info_a.?) {
        .immutable => assert(compaction.level_a_position.value <= Table.value_count_max),
        .disk => assert(compaction.level_a_position.value <= Table.data.value_count_max),
    }
    assert(compaction.level_b_position.value <= Table.data.value_count_max);
    assert(compaction.table_builder.value_count <= Table.data.value_count_max);

    // Quotas are measured in input values consumed from both levels.
    const consumed_ab = merge_result.consumed_a + merge_result.consumed_b;
    compaction.quotas.bar_done += consumed_ab;
    compaction.quotas.beat_done += consumed_ab;

    compaction.counters.dropped += merge_result.dropped;

    assert(compaction.quotas.bar_done <= compaction.quotas.bar);

    compaction.merge_advance_position();

    // NB: although all the work here is synchronous, trace.stop is deliberately not
    // deferred, so that the span excludes the compaction_dispatch call below.
    compaction.grid.trace.stop(.{ .compact_beat_merge = .{
        .tree = @enumFromInt(compaction.tree.config.id),
        .level_b = compaction.level_b,
    } });
    compaction.compaction_dispatch();
}
|
|
1651
|
+
|
|
1652
|
+
fn merge_inputs(compaction: *const Compaction) struct { ?[]const Value, ?[]const Value } {
    // Returns the not-yet-consumed values of the level A and level B inputs currently being
    // merged, in that order. An entry is null when that level has no input; at least one
    // of the two is always non-null.
    const level_a_values_used: ?[]const Value = used: {
        switch (compaction.table_info_a.?) {
            .immutable => |immutable_values| {
                if (compaction.level_a_immutable_stage == .merge) {
                    break :used immutable_values;
                }
                assert(compaction.level_a_immutable_stage == .exhausted);
                break :used null;
            },
            .disk => {
                const block = compaction.level_a_value_block.head() orelse break :used null;
                assert(block.stage == .merge);
                break :used Table.value_block_values_used(block.ptr);
            },
        }
    };

    const level_b_values_used: ?[]const Value = used: {
        const block = compaction.level_b_value_block.head() orelse break :used null;
        assert(block.stage == .merge);
        break :used Table.value_block_values_used(block.ptr);
    };

    assert(!(level_a_values_used == null and level_b_values_used == null));

    const level_a_values: ?[]const Value = if (level_a_values_used) |values_used| values: {
        const values_remaining = values_used[compaction.level_a_position.value..];
        // Hand out at most one value block's worth at a time, so that a beat never
        // outputs past its quota by more than one value block.
        const limit = @min(Table.data.value_count_max, values_remaining.len);
        break :values values_remaining[0..limit];
    } else null;

    const level_b_values: ?[]const Value = if (level_b_values_used) |values_used|
        values_used[compaction.level_b_position.value..]
    else
        null;

    return .{ level_a_values, level_b_values };
}
|
|
1702
|
+
|
|
1703
|
+
// merge_callback advances just position.values. Here, we implement the carry-flag logic,
|
|
1704
|
+
// advancing value_block and index_block. This is also the place where we determine that the
|
|
1705
|
+
// beat's quota of work is done and begin to wind down the dispatch loop.
|
|
1706
|
+
fn merge_advance_position(compaction: *Compaction) void {
|
|
1707
|
+
if (compaction.table_info_a.? == .immutable) {
|
|
1708
|
+
if (compaction.level_a_immutable_stage == .merge) {
|
|
1709
|
+
if (compaction.level_a_position.value ==
|
|
1710
|
+
compaction.table_info_a.?.immutable.len)
|
|
1711
|
+
{
|
|
1712
|
+
compaction.level_a_position.value_block += 1;
|
|
1713
|
+
assert(compaction.level_a_position.value_block == 1);
|
|
1714
|
+
compaction.level_a_position.value = 0;
|
|
1715
|
+
compaction.level_a_immutable_stage = .exhausted;
|
|
1716
|
+
} else {
|
|
1717
|
+
compaction.level_a_immutable_stage = .ready;
|
|
1718
|
+
}
|
|
1719
|
+
} else {
|
|
1720
|
+
assert(compaction.level_a_immutable_stage == .exhausted);
|
|
1721
|
+
}
|
|
1722
|
+
} else {
|
|
1723
|
+
if (compaction.level_a_value_block.head()) |value_block| {
|
|
1724
|
+
assert(value_block.stage == .merge);
|
|
1725
|
+
if (compaction.level_a_position.value ==
|
|
1726
|
+
Table.value_block_values_used(value_block.ptr).len)
|
|
1727
|
+
{
|
|
1728
|
+
_ = compaction.level_a_value_block.pop();
|
|
1729
|
+
|
|
1730
|
+
compaction.level_a_position.value_block += 1;
|
|
1731
|
+
compaction.level_a_position.value = 0;
|
|
1732
|
+
|
|
1733
|
+
const index_block = compaction.level_a_index_block.head().?;
|
|
1734
|
+
assert(index_block.stage == .read_index_block_done);
|
|
1735
|
+
const index_schema = schema.TableIndex.from(index_block.ptr);
|
|
1736
|
+
const value_blocks_count =
|
|
1737
|
+
index_schema.value_blocks_used(index_block.ptr);
|
|
1738
|
+
|
|
1739
|
+
// It is imperative that we pop the index block when the final value block
|
|
1740
|
+
// is popped. While it is tempting to pop the index block when we issue
|
|
1741
|
+
// a read for the final value block, this would be incorrect as it would
|
|
1742
|
+
// lead to an incorrect index being computed for level_a_value_block_next
|
|
1743
|
+
// in `compaction_dispatch`.
|
|
1744
|
+
if (compaction.level_a_position.value_block == value_blocks_count) {
|
|
1745
|
+
compaction.level_a_position.index_block += 1;
|
|
1746
|
+
assert(compaction.level_a_position.index_block == 1);
|
|
1747
|
+
compaction.level_a_position.value_block = 0;
|
|
1748
|
+
|
|
1749
|
+
const popped = compaction.level_a_index_block.pop().?;
|
|
1750
|
+
assert(popped == index_block);
|
|
1751
|
+
compaction.read_value_block_release_table(index_block.ptr);
|
|
1752
|
+
index_block.stage = .free;
|
|
1753
|
+
compaction.pool.?.block_release(index_block);
|
|
1754
|
+
}
|
|
1755
|
+
|
|
1756
|
+
value_block.stage = .free;
|
|
1757
|
+
compaction.pool.?.block_release(value_block);
|
|
1758
|
+
} else {
|
|
1759
|
+
value_block.stage = .read_value_block_done;
|
|
1760
|
+
}
|
|
1761
|
+
} else {
|
|
1762
|
+
assert(compaction.level_a_position.value == 0); // Level A exhausted.
|
|
1763
|
+
}
|
|
1764
|
+
}
|
|
1765
|
+
|
|
1766
|
+
if (compaction.level_b_value_block.head()) |value_block| {
|
|
1767
|
+
assert(value_block.stage == .merge);
|
|
1768
|
+
if (compaction.level_b_position.value ==
|
|
1769
|
+
Table.value_block_values_used(value_block.ptr).len)
|
|
1770
|
+
{
|
|
1771
|
+
_ = compaction.level_b_value_block.pop().?;
|
|
1772
|
+
compaction.level_b_position.value_block += 1;
|
|
1773
|
+
compaction.level_b_position.value = 0;
|
|
1774
|
+
|
|
1775
|
+
const index_block = compaction.level_b_index_block.head().?;
|
|
1776
|
+
assert(index_block.stage == .read_index_block_done);
|
|
1777
|
+
const index_schema = schema.TableIndex.from(index_block.ptr);
|
|
1778
|
+
const value_blocks_count =
|
|
1779
|
+
index_schema.value_blocks_used(index_block.ptr);
|
|
1780
|
+
|
|
1781
|
+
// It is imperative that we pop the index block when the final value block
|
|
1782
|
+
// is popped. While it is tempting to pop the index block when we issue
|
|
1783
|
+
// a read for the final value block, this would be incorrect as it would
|
|
1784
|
+
// lead to an incorrect index being computed for level_b_value_block_next
|
|
1785
|
+
// in `compaction_dispatch`.
|
|
1786
|
+
if (compaction.level_b_position.value_block == value_blocks_count) {
|
|
1787
|
+
compaction.level_b_position.index_block += 1;
|
|
1788
|
+
compaction.level_b_position.value_block = 0;
|
|
1789
|
+
|
|
1790
|
+
const popped = compaction.level_b_index_block.pop().?;
|
|
1791
|
+
assert(popped == index_block);
|
|
1792
|
+
compaction.read_value_block_release_table(index_block.ptr);
|
|
1793
|
+
index_block.stage = .free;
|
|
1794
|
+
|
|
1795
|
+
compaction.pool.?.block_release(index_block);
|
|
1796
|
+
}
|
|
1797
|
+
|
|
1798
|
+
value_block.stage = .free;
|
|
1799
|
+
compaction.pool.?.block_release(value_block);
|
|
1800
|
+
} else {
|
|
1801
|
+
value_block.stage = .read_value_block_done;
|
|
1802
|
+
}
|
|
1803
|
+
} else {
|
|
1804
|
+
assert(compaction.level_b_position.value == 0); // Level B exhausted.
|
|
1805
|
+
}
|
|
1806
|
+
|
|
1807
|
+
if (compaction.quotas.beat_exhausted()) {
|
|
1808
|
+
assert(compaction.stage == .beat);
|
|
1809
|
+
compaction.stage = .beat_quota_done;
|
|
1810
|
+
}
|
|
1811
|
+
}
|
|
1812
|
+
|
|
1813
|
+
/// Finishes the value block currently held by the table builder and hands it to the grid,
/// passing `write_block_callback` as the callback for the write.
///
/// - `write`: the block-write resource that carries the block and the compaction pointer.
/// - `options.address`: the address stamped into the finished block.
fn write_value_block(
    compaction: *Compaction,
    write: *ResourcePool.BlockWrite,
    options: struct { address: u64 },
) void {
    const value_block = compaction.table_builder_value_block.?;
    assert(value_block.stage == .build_value_block);
    assert(compaction.table_builder.value_block == value_block.ptr);
    // The push onto `output_blocks` below assumes capacity.
    assert(!compaction.output_blocks.full());

    // Add the builder's value count to the `out` counter before the block is finished.
    const values_built = compaction.table_builder.value_count;
    compaction.counters.out += values_built;

    const output_snapshot_min = snapshot_min_for_table_output(compaction.op_min);
    compaction.table_builder.value_block_finish(.{
        .cluster = compaction.grid.superblock.working.cluster,
        .release = compaction.grid.superblock.working.vsr_state.checkpoint.release,
        .address = options.address,
        .snapshot_min = output_snapshot_min,
        .tree_id = compaction.tree.config.id,
    });
    assert(compaction.table_builder.state == .index_block);
    compaction.table_builder_value_block = null;

    // Account for the in-flight write, then mark the block as being written.
    compaction.output_blocks.push_assume_capacity({});
    value_block.stage = .write_value_block;

    write.compaction = compaction;
    write.block = value_block;
    compaction.grid.create_block(write_block_callback, &write.grid_write, &write.block.ptr);
}
|
|
1841
|
+
|
|
1842
|
+
/// Finishes the index block currently held by the table builder, records the resulting table
/// as a manifest entry to be inserted into level B, and hands the block to the grid, passing
/// `write_block_callback` as the callback for the write.
///
/// - `write`: the block-write resource that carries the block and the compaction pointer.
/// - `options.address`: the address stamped into the finished block.
fn write_index_block(
    compaction: *Compaction,
    write: *ResourcePool.BlockWrite,
    options: struct { address: u64 },
) void {
    const index_block = compaction.table_builder_index_block.?;
    assert(index_block.stage == .build_index_block);
    assert(compaction.table_builder.index_block == index_block.ptr);
    // The push onto `output_blocks` below assumes capacity.
    assert(!compaction.output_blocks.full());

    const output_snapshot_min = snapshot_min_for_table_output(compaction.op_min);
    const output_table = compaction.table_builder.index_block_finish(.{
        .cluster = compaction.grid.superblock.working.cluster,
        .release = compaction.grid.superblock.working.vsr_state.checkpoint.release,
        .address = options.address,
        .snapshot_min = output_snapshot_min,
        .tree_id = compaction.tree.config.id,
    });
    // Finishing the index block leaves the builder with no blocks.
    assert(compaction.table_builder.state == .no_blocks);
    compaction.table_builder_index_block = null;

    compaction.manifest_entries.push(.{
        .operation = .insert_to_level_b,
        .table = output_table,
    });

    // Account for the in-flight write, then mark the block as being written.
    compaction.output_blocks.push_assume_capacity({});
    index_block.stage = .write_index_block;

    write.compaction = compaction;
    write.block = index_block;
    compaction.grid.create_block(write_block_callback, &write.grid_write, &write.block.ptr);
}
|
|
1874
|
+
|
|
1875
|
+
/// Callback passed to `grid.create_block` by `write_value_block` and `write_index_block`.
/// Releases the write resource and the written block back to the pool, pops one entry from
/// `output_blocks`, and re-enters `compaction_dispatch`.
fn write_block_callback(grid_write: *Grid.Write) void {
    const write: *ResourcePool.BlockWrite = @fieldParentPtr("grid_write", grid_write);
    const compaction: *Compaction = write.parent(Compaction);
    // Read the block out of `write` before the write resource is released.
    const written_block = write.block;
    compaction.pool.?.writes.release(write);

    const stage_is_write = written_block.stage == .write_value_block or
        written_block.stage == .write_index_block;
    assert(stage_is_write);
    written_block.stage = .free;
    compaction.pool.?.block_release(written_block);

    // Every submitted write pushed one entry; there must be one to pop.
    const output_slot = compaction.output_blocks.pop();
    assert(output_slot != null);

    compaction.compaction_dispatch();
}
|
|
1890
|
+
|
|
1891
|
+
// The three functions below are hot CPU loops doing the actual merging, TigerBeetle's data
|
|
1892
|
+
// plane. To reduce the probability of the optimizer getting confused over pointers, don't
|
|
1893
|
+
// use 'self' and instead specify all inputs and outputs explicitly. It's the caller's job to
|
|
1894
|
+
// apply control plane changes to the compaction state.
|
|
1895
|
+
//
|
|
1896
|
+
// TODO: Add micro benchmarks.
|
|
1897
|
+
|
|
1898
|
+
/// Copies the leading values of `values_source` into the front of `values_target`, as many as
/// both slices can hold. Returns the number of values copied.
///
/// Both slices must be non-empty and no longer than `Table.data.value_count_max`.
fn values_copy(values_target: []Value, values_source: []const Value) u32 {
    assert(values_source.len > 0);
    assert(values_source.len <= Table.data.value_count_max);
    assert(values_target.len > 0);
    assert(values_target.len <= Table.data.value_count_max);

    // The shorter of the two slices bounds the copy.
    const copy_count: u32 = @intCast(@min(values_source.len, values_target.len));
    const target_prefix = values_target[0..copy_count];
    const source_prefix = values_source[0..copy_count];
    stdx.copy_disjoint(.exact, Value, target_prefix, source_prefix);

    return copy_count;
}
|
|
1914
|
+
|
|
1915
|
+
/// Counts reported by `values_copy_drop_tombstones`.
const CopyDropTombstonesResult = struct {
    /// Number of source values read, whether copied or dropped.
    consumed: u32,
    /// Number of source values read but not written to the target (tombstones).
    dropped: u32,
    /// Number of values written to the target.
    produced: u32,
};

/// Copies values from `values_source` to `values_target`, skipping tombstones along the way.
/// Stops when the source is exhausted or the target is full, whichever comes first.
///
/// Both slices must be non-empty and no longer than `Table.data.value_count_max`.
fn values_copy_drop_tombstones(
    values_target: []Value,
    values_source: []const Value,
) CopyDropTombstonesResult {
    assert(values_source.len > 0);
    assert(values_source.len <= Table.data.value_count_max);
    assert(values_target.len > 0);
    assert(values_target.len <= Table.data.value_count_max);

    var count_consumed: usize = 0;
    var count_produced: usize = 0;
    // Every iteration consumes exactly one source value, whether it is copied or dropped.
    while (count_consumed < values_source.len and
        count_produced < values_target.len) : (count_consumed += 1)
    {
        const value_in = &values_source[count_consumed];
        if (tombstone(value_in)) {
            assert(Table.usage != .secondary_index);
            continue;
        }
        values_target[count_produced] = value_in.*;
        count_produced += 1;
    }

    const copy_result: CopyDropTombstonesResult = .{
        .consumed = @intCast(count_consumed),
        .dropped = @intCast(count_consumed - count_produced),
        .produced = @intCast(count_produced),
    };
    assert(copy_result.consumed > 0);
    assert(copy_result.consumed <= values_source.len);
    assert(copy_result.dropped <= copy_result.consumed);
    assert(copy_result.produced <= values_target.len);
    assert(copy_result.produced == copy_result.consumed - copy_result.dropped);
    return copy_result;
}
|
|
1957
|
+
|
|
1958
|
+
/// Counts reported by `values_merge`.
const MergeResult = struct {
    /// Number of values read from `values_source_a`.
    consumed_a: u32,
    /// Number of values read from `values_source_b`.
    consumed_b: u32,
    /// Number of consumed values that were not written to the target.
    dropped: u32,
    /// Number of values written to the target.
    produced: u32,
};

/// Merges values from `values_source_a` and `values_source_b` into `values_target`, always
/// taking the value with the smaller key next. When both keys are equal the value from source a
/// takes precedence. If `drop_tombstones` is set, tombstones coming from source a are dropped
/// instead of being written.
///
/// Stops as soon as either source is exhausted or the target is full.
/// All three slices must be non-empty and no longer than `Table.data.value_count_max`.
fn values_merge(
    values_target: []Value,
    values_source_a: []const Value,
    values_source_b: []const Value,
    drop_tombstones: bool,
) MergeResult {
    assert(values_source_a.len > 0);
    assert(values_source_a.len <= Table.data.value_count_max);
    assert(values_source_b.len > 0);
    assert(values_source_b.len <= Table.data.value_count_max);
    assert(values_target.len > 0);
    assert(values_target.len <= Table.data.value_count_max);

    var cursor_a: usize = 0;
    var cursor_b: usize = 0;
    var cursor_target: usize = 0;

    while (cursor_a < values_source_a.len and
        cursor_b < values_source_b.len and
        cursor_target < values_target.len)
    {
        const value_a = &values_source_a[cursor_a];
        const value_b = &values_source_b[cursor_b];
        const key_order = std.math.order(key_from_value(value_a), key_from_value(value_b));
        switch (key_order) {
            // Source a has the smaller key: take it, unless it is a tombstone being dropped.
            .lt => {
                cursor_a += 1;
                if (drop_tombstones and tombstone(value_a)) {
                    assert(Table.usage != .secondary_index);
                } else {
                    values_target[cursor_target] = value_a.*;
                    cursor_target += 1;
                }
            },
            // Source b has the smaller key: take it as is.
            .gt => {
                values_target[cursor_target] = value_b.*;
                cursor_target += 1;
                cursor_b += 1;
            },
            // Equal keys: both values are consumed and collapsed.
            .eq => {
                cursor_a += 1;
                cursor_b += 1;

                if (comptime Table.usage == .secondary_index) {
                    // Secondary index optimization: a put and a remove cancel out, so
                    // neither value is written.
                    assert(tombstone(value_a) != tombstone(value_b));
                } else {
                    const drop_value_a = drop_tombstones and tombstone(value_a);
                    if (!drop_value_a) {
                        values_target[cursor_target] = value_a.*;
                        cursor_target += 1;
                    }
                }
            },
        }
    }

    const merge_result: MergeResult = .{
        .consumed_a = @intCast(cursor_a),
        .consumed_b = @intCast(cursor_b),
        .dropped = @intCast(cursor_a + cursor_b - cursor_target),
        .produced = @intCast(cursor_target),
    };
    assert(merge_result.consumed_a > 0 or merge_result.consumed_b > 0);
    assert(merge_result.consumed_a <= values_source_a.len);
    assert(merge_result.consumed_b <= values_source_b.len);
    assert(merge_result.dropped <= merge_result.consumed_a + merge_result.consumed_b);
    assert(merge_result.produced <= values_target.len);
    assert(merge_result.produced ==
        merge_result.consumed_a + merge_result.consumed_b - merge_result.dropped);
    return merge_result;
}
|
|
2036
|
+
};
|
|
2037
|
+
}
|
|
2038
|
+
|
|
2039
|
+
/// Returns the value one below `snapshot_min_for_table_output(op_min)`.
/// `op_min` is subject to the assertions made by `snapshot_min_for_table_output`.
pub fn snapshot_max_for_table_input(op_min: u64) u64 {
    const output_snapshot_min = snapshot_min_for_table_output(op_min);
    return output_snapshot_min - 1;
}
|
|
2042
|
+
|
|
2043
|
+
/// Returns `op_min` advanced by half of `constants.lsm_compaction_ops`.
/// `op_min` must be non-zero and a multiple of half of `constants.lsm_compaction_ops`.
pub fn snapshot_min_for_table_output(op_min: u64) u64 {
    const ops_per_half_bar = @divExact(constants.lsm_compaction_ops, 2);
    assert(op_min > 0);
    assert(op_min % ops_per_half_bar == 0);
    return op_min + ops_per_half_bar;
}
|
|
2048
|
+
|
|
2049
|
+
/// Returns the first op of the compaction (Compaction.op_min) for a given op/beat.
|
|
2050
|
+
///
|
|
2051
|
+
/// After this compaction finishes:
|
|
2052
|
+
/// - `op_min + half_bar_beat_count - 1` will be the input tables' snapshot_max.
|
|
2053
|
+
/// - `op_min + half_bar_beat_count` will be the output tables' snapshot_min.
|
|
2054
|
+
///
|
|
2055
|
+
/// Each half-bar has a separate op_min (for deriving the output snapshot_min) instead of each full
|
|
2056
|
+
/// bar because this allows the output tables of the first half-bar's compaction to be prefetched
|
|
2057
|
+
/// against earlier — hopefully while they are still warm in the cache from being written.
|
|
2058
|
+
///
|
|
2059
|
+
///
|
|
2060
|
+
/// These charts depict the commit/compact ops over a series of
|
|
2061
|
+
/// commits and compactions (with lsm_compaction_ops=8).
|
|
2062
|
+
///
|
|
2063
|
+
/// Legend:
|
|
2064
|
+
///
|
|
2065
|
+
/// ┼ full bar (first half-bar start)
|
|
2066
|
+
/// ┬ half bar (second half-bar start)
|
|
2067
|
+
/// This is incremented at the end of each compact().
|
|
2068
|
+
/// . op is in mutable table (in memory)
|
|
2069
|
+
/// , op is in immutable table (in memory)
|
|
2070
|
+
/// # op is on disk
|
|
2071
|
+
/// ✓ checkpoint() may follow compact()
|
|
2072
|
+
///
|
|
2073
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
2074
|
+
/// ┼───┬───┼───┬───┼
|
|
2075
|
+
/// . ╷ ╷ init(superblock.commit_min=0)⎤ Compaction is effectively a noop for the
|
|
2076
|
+
/// .. ╷ ╷ commit;compact( 1) start/end ⎥ first bar because there are no tables on
|
|
2077
|
+
/// ... ╷ ╷ commit;compact( 2) start/end ⎥ disk yet, and no immutable table to
|
|
2078
|
+
/// .... ╷ ╷ commit;compact( 3) start/end ⎥ flush.
|
|
2079
|
+
/// ..... ╷ ╷ commit;compact( 4) start/end ⎥
|
|
2080
|
+
/// ...... ╷ ╷ commit;compact( 5) start/end ⎥ This applies:
|
|
2081
|
+
/// ....... ╷ ╷ commit;compact( 6) start/end ⎥ - when the LSM is starting on a freshly
|
|
2082
|
+
/// ........╷ ╷ commit;compact( 7) start ⎤⎥ formatted data file, and also
|
|
2083
|
+
/// ,,,,,,,,. ╷ ✓ compact( 7) end⎦⎦ - when the LSM is recovering from a crash
|
|
2084
|
+
/// ,,,,,,,,. ╷ commit;compact( 8) start/end (see below).
|
|
2085
|
+
/// ,,,,,,,,.. ╷ commit;compact( 9) start/end
|
|
2086
|
+
/// ,,,,,,,,... ╷ commit;compact(10) start/end
|
|
2087
|
+
/// ,,,,,,,,.... ╷ commit;compact(11) start/end
|
|
2088
|
+
/// ,,,,,,,,..... ╷ commit;compact(12) start/end
|
|
2089
|
+
/// ,,,,,,,,...... ╷ commit;compact(13) start/end
|
|
2090
|
+
/// ,,,,,,,,....... ╷ commit;compact(14) start/end
|
|
2091
|
+
/// ,,,,,,,,........╷ commit;compact(15) start ⎤
|
|
2092
|
+
/// ########,,,,,,,,╷ ✓ compact(15) end⎦
|
|
2093
|
+
/// ########,,,,,,,,. commit;compact(16) start/end
|
|
2094
|
+
/// ┼───┬───┼───┬───┼
|
|
2095
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
2096
|
+
/// ┼───┬───┼───┬───┼ Recover with a checkpoint taken at op 15.
|
|
2097
|
+
/// ######## ╷ init(superblock.commit_min=7) At op 15, ops 8…15 are in memory, so they
|
|
2098
|
+
/// ########. ╷ commit ( 8) start/end ⎤ were dropped by the crash.
|
|
2099
|
+
/// ########.. ╷ commit ( 9) start/end ⎥
|
|
2100
|
+
/// ########... ╷ commit (10) start/end ⎥ But compaction is not run for ops 8…15
|
|
2101
|
+
/// ########.... ╷ commit (11) start/end ⎥ because it was already performed
|
|
2102
|
+
/// ########..... ╷ commit (12) start/end ⎥ before the checkpoint.
|
|
2103
|
+
/// ########...... ╷ commit (13) start/end ⎥
|
|
2104
|
+
/// ########....... ╷ commit (14) start/end ⎥ We can begin to compact again at op 16,
|
|
2105
|
+
/// ########........╷ commit (15) start ⎤⎥ because those compactions (if previously
|
|
2106
|
+
/// ########,,,,,,,,╷ ✓ (15) end⎦⎦ performed) are not included in the
|
|
2107
|
+
/// ########,,,,,,,,. commit;compact(16) start/end checkpoint.
|
|
2108
|
+
/// ┼───┬───┼───┬───┼
|
|
2109
|
+
/// 0 2 4 6 8 0 2 4 6
|
|
2110
|
+
///
|
|
2111
|
+
/// Notice how in the checkpoint recovery example above, we are careful not to `compact(op)` twice
|
|
2112
|
+
/// for any op (even if we crash/recover), since that could lead to differences between replicas'
|
|
2113
|
+
/// storage. The last bar of `commit()`s is always only in memory, so it is safe to repeat.
|
|
2114
|
+
pub fn compaction_op_min(op: u64) u64 {
    assert(op >= half_bar_beat_count);
    // Round `op` down to the nearest multiple of `half_bar_beat_count`.
    const beats_into_half_bar = op % half_bar_beat_count;
    return op - beats_into_half_bar;
}
|