tigerbeetle 0.0.34 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/tb_client/extconf.rb +13 -13
  4. data/ext/tb_client/tigerbeetle/LICENSE +177 -0
  5. data/ext/tb_client/tigerbeetle/build.zig +2327 -0
  6. data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
  7. data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
  8. data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
  9. data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
  10. data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
  11. data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
  12. data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
  13. data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
  14. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
  15. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
  16. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
  17. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
  18. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
  19. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
  20. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
  21. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
  22. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
  23. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
  24. data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
  25. data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
  26. data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
  27. data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
  28. data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
  29. data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
  30. data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
  31. data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
  32. data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
  33. data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
  34. data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
  35. data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
  36. data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
  37. data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
  38. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
  39. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
  40. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
  41. data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
  42. data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
  43. data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
  44. data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
  45. data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
  46. data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
  47. data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
  48. data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
  49. data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
  50. data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
  51. data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
  52. data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
  53. data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
  54. data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
  55. data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
  56. data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
  57. data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
  58. data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
  59. data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
  60. data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
  61. data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
  62. data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
  63. data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
  64. data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
  65. data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
  66. data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
  67. data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
  68. data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
  69. data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
  70. data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
  71. data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
  72. data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
  73. data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
  74. data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
  75. data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
  76. data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
  77. data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
  78. data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
  79. data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
  80. data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
  81. data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
  82. data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
  83. data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
  84. data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
  85. data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
  86. data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
  87. data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
  88. data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
  89. data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
  90. data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
  91. data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
  92. data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
  93. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
  94. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
  95. data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
  96. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
  97. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
  98. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
  99. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
  100. data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
  101. data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
  102. data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
  103. data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
  104. data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
  105. data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
  106. data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
  107. data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
  108. data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
  109. data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
  110. data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
  111. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
  112. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
  113. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
  114. data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
  115. data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
  116. data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
  117. data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
  118. data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
  119. data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
  120. data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
  121. data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
  122. data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
  123. data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
  124. data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
  125. data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
  126. data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
  127. data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
  128. data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
  129. data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
  130. data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
  131. data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
  132. data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
  133. data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
  134. data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
  135. data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
  136. data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
  137. data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
  138. data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
  139. data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
  140. data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
  141. data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
  142. data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
  143. data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
  144. data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
  145. data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
  146. data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
  147. data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
  148. data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
  149. data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
  150. data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
  151. data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
  152. data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
  153. data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
  154. data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
  155. data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
  156. data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
  157. data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
  158. data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
  159. data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
  160. data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
  161. data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
  162. data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
  163. data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
  164. data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
  165. data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
  166. data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
  167. data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
  168. data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
  169. data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
  170. data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
  171. data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
  172. data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
  173. data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
  174. data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
  175. data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
  176. data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
  177. data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
  178. data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
  179. data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
  180. data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
  181. data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
  182. data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
  183. data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
  184. data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
  185. data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
  186. data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
  187. data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
  188. data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
  189. data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
  190. data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
  191. data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
  192. data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
  193. data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
  194. data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
  195. data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
  196. data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
  197. data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
  198. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
  199. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
  200. data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
  201. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
  202. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
  203. data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
  204. data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
  205. data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
  206. data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
  207. data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
  208. data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
  209. data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
  210. data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
  211. data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
  212. data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
  213. data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
  214. data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
  215. data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
  216. data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
  217. data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
  218. data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
  219. data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
  220. data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
  221. data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
  222. data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
  223. data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
  224. data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
  225. data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
  226. data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
  227. data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
  228. data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
  229. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
  230. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
  231. data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
  232. data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
  233. data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
  234. data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
  235. data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
  236. data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
  237. data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
  238. data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
  239. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
  240. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
  241. data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
  242. data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
  243. data/lib/tb_client/shared_lib.rb +12 -5
  244. data/lib/tigerbeetle/client.rb +1 -1
  245. data/lib/tigerbeetle/platforms.rb +9 -0
  246. data/lib/tigerbeetle/version.rb +2 -2
  247. data/tigerbeetle.gemspec +22 -5
  248. metadata +242 -3
  249. data/ext/tb_client/pkg.tar.gz +0 -0
@@ -0,0 +1,1263 @@
1
+ //! Maintains a durable manifest log of the latest TableInfo's for every LSM tree's in-memory
2
+ //! manifest.
3
+ //!
4
+ //! Invariants:
5
+ //!
6
+ //! * Checkpointing the manifest log must flush all buffered log blocks.
7
+ //!
8
+ //! * Opening the manifest log must emit only the latest TableInfo's to be inserted.
9
+ //!
10
+ //! * The latest version of a table must never be dropped from the log through a compaction, unless
11
+ //! the table was removed.
12
+ //!
13
+ //! * Removes that are recorded in a log block must also queue that log block for compaction.
14
+ //!
15
+ //! * Compaction must compact partially full blocks, even where it must rewrite all entries to the
16
+ //! tail end of the log.
17
+ //!
18
+ //! * If a remove is dropped from the log, then all prior inserts/updates must already have been
19
+ //! dropped.
20
+
21
+ const std = @import("std");
22
+ const assert = std.debug.assert;
23
+ const mem = std.mem;
24
+ const maybe = stdx.maybe;
25
+
26
+ const log = std.log.scoped(.manifest_log);
27
+
28
+ const constants = @import("../constants.zig");
29
+ const vsr = @import("../vsr.zig");
30
+ const stdx = @import("stdx");
31
+
32
+ const SuperBlockType = vsr.SuperBlockType;
33
+ const GridType = @import("../vsr/grid.zig").GridType;
34
+ const BlockPtr = @import("../vsr/grid.zig").BlockPtr;
35
+ const BlockPtrConst = @import("../vsr/grid.zig").BlockPtrConst;
36
+ const allocate_block = @import("../vsr/grid.zig").allocate_block;
37
+ const compaction = @import("compaction.zig");
38
+ const RingBufferType = stdx.RingBufferType;
39
+ const schema = @import("schema.zig");
40
+ const TableInfo = schema.ManifestNode.TableInfo;
41
+ const BlockReference = vsr.BlockReference;
42
+
43
+ const block_builder_schema = schema.ManifestNode{
44
+ .entry_count = schema.ManifestNode.entry_count_max,
45
+ };
46
+
47
+ pub fn ManifestLogType(comptime Storage: type) type {
48
+ return struct {
49
+ const ManifestLog = @This();
50
+
51
+ const SuperBlock = SuperBlockType(Storage);
52
+ const Grid = GridType(Storage);
53
+
54
+ pub const Callback = *const fn (manifest_log: *ManifestLog) void;
55
+
56
+ pub const OpenEvent = *const fn (manifest_log: *ManifestLog, table: *const TableInfo) void;
57
+
58
+ const Write = struct {
59
+ manifest_log: *ManifestLog,
60
+ write: Grid.Write = undefined,
61
+ };
62
+
63
+ const TableExtents = std.AutoHashMapUnmanaged(u64, TableExtent);
64
+ const TablesRemoved = std.AutoHashMapUnmanaged(u64, void);
65
+
66
+ pub const TableExtent = struct {
67
+ block: u64, // Manifest block address.
68
+ entry: u32, // Index within the manifest block Label/TableInfo arrays.
69
+ };
70
+
71
+ superblock: *SuperBlock,
72
+ grid: *Grid,
73
+ pace: *const Pace,
74
+
75
+ forest_table_count_max: u32,
76
+
77
+ grid_reservation: ?Grid.Reservation = null,
78
+
79
+ /// The number of blocks (remaining) to compact during the current half-bar.
80
+ compact_blocks: ?u32 = null,
81
+
82
+ /// This is a struct-of-arrays of `BlockReference`s.
83
+ /// It includes:
84
+ /// - blocks that are written
85
+ /// - blocks that have closed, but not yet flushed
86
+ /// - blocks that are being flushed
87
+ ///
88
+ /// Entries are ordered from oldest to newest.
89
+ log_block_checksums: RingBufferType(u128, .slice),
90
+ log_block_addresses: RingBufferType(u64, .slice),
91
+
92
+ /// The head block accumulates a full block, to be written at the next flush.
93
+ /// The remaining blocks must accommodate all further appends.
94
+ blocks: RingBufferType(BlockPtr, .slice),
95
+
96
+ /// The number of blocks that have been appended to, filled up, and then closed.
97
+ blocks_closed: u8 = 0,
98
+
99
+ /// The number of entries in the open block.
100
+ ///
101
+ /// Invariants:
102
+ /// - When `entry_count = 0`, there is no open block.
103
+ /// - `entry_count < entry_count_max`. When `entry_count` reaches the maximum, the open
104
+ /// block is closed, and `entry_count` resets to 0.
105
+ entry_count: u32 = 0,
106
+
107
+ opened: bool = false,
108
+ open_event: OpenEvent = undefined,
109
+
110
+ /// Set for the duration of `open` and `compact`.
111
+ reading: bool = false,
112
+ read: Grid.Read = undefined,
113
+ read_callback: ?Callback = null,
114
+
115
+ /// Set for the duration of `flush` and `checkpoint`.
116
+ writing: bool = false,
117
+ writes: []Write,
118
+ writes_pending: usize = 0,
119
+ write_callback: ?Callback = null,
120
+
121
+ next_tick: Grid.NextTick = undefined,
122
+
123
+ /// A map from table address to the manifest block and entry that is the latest extent
124
+ /// version. Used to determine whether a table should be dropped in a compaction.
125
+ table_extents: TableExtents,
126
+
127
+ /// For a particular table in the manifest, the sequence of events is:
128
+ ///
129
+ /// insert(0|1), update(0+), remove(0|1)
130
+ ///
131
+ /// During open(), manifest entries are processed in reverse-chronological order.
132
+ ///
133
+ /// This hash-set tracks tables that have been removed but whose corresponding "insert" has
134
+ /// not yet been encountered. Given that the maximum number of tables in the forest at any
135
+ /// given moment is `forest_table_count_max`, there are likewise at most
136
+ /// `forest_table_count_max` "unpaired" removes to track.
137
+ // TODO(Optimization) This memory (~35MiB) is only needed during open() – maybe borrow it
138
+ // from the grid cache or node pool instead so that we don't pay for it during normal
139
+ // operation.
140
+ tables_removed: TablesRemoved,
141
+
142
/// Initializes `manifest_log` in place, allocating every buffer it needs up front.
///
/// Buffer capacities are taken from `compaction_pace`. If any allocation fails, everything
/// allocated so far is released before the error is returned.
pub fn init(
    manifest_log: *ManifestLog,
    allocator: mem.Allocator,
    grid: *Grid,
    compaction_pace: *const Pace,
) !void {
    // The memory-owning fields start out undefined and are assigned one at a time below.
    manifest_log.* = .{
        .superblock = grid.superblock,
        .grid = grid,
        .forest_table_count_max = compaction_pace.tables_max,
        .pace = compaction_pace,
        .log_block_checksums = undefined,
        .log_block_addresses = undefined,
        .blocks = undefined,
        .writes = undefined,
        .table_extents = undefined,
        .tables_removed = undefined,
    };

    // Log every pacing parameter for this replica.
    inline for (std.meta.fields(Pace)) |pace_field| {
        log.debug("{?}: Manifest.Pace.{s} = {d}", .{
            grid.superblock.replica_index,
            pace_field.name,
            @field(manifest_log.pace, pace_field.name),
        });
    }

    const log_blocks_max = manifest_log.pace.log_blocks_max;

    manifest_log.log_block_checksums =
        try RingBufferType(u128, .slice).init(allocator, log_blocks_max);
    errdefer manifest_log.log_block_checksums.deinit(allocator);

    manifest_log.log_block_addresses =
        try RingBufferType(u64, .slice).init(allocator, log_blocks_max);
    errdefer manifest_log.log_block_addresses.deinit(allocator);

    // The upper-bound of manifest blocks we must buffer.
    //
    // `blocks` must have sufficient capacity for:
    // - a leftover open block from the previous ops (+1 block)
    // - table updates copied from a half bar of manifest compactions
    // - table updates from a half bar of table compactions
    const half_bar_buffer_blocks_max = 1 + manifest_log.pace.half_bar_compact_blocks_max +
        manifest_log.pace.half_bar_append_blocks_max;
    assert(half_bar_buffer_blocks_max >= 3);

    // TODO RingBuffer for .slice should be extended to take care of alignment:
    manifest_log.blocks =
        try RingBufferType(BlockPtr, .slice).init(allocator, half_bar_buffer_blocks_max);
    errdefer manifest_log.blocks.deinit(allocator);

    // Count the blocks allocated so far, so that a single errdefer frees exactly those
    // blocks, whether the failure happens inside this loop or in a later allocation.
    var blocks_allocated: usize = 0;
    errdefer for (manifest_log.blocks.buffer[0..blocks_allocated]) |b| allocator.free(b);
    for (manifest_log.blocks.buffer) |*block| {
        block.* = try allocate_block(allocator);
        blocks_allocated += 1;
    }

    manifest_log.writes = try allocator.alloc(Write, half_bar_buffer_blocks_max);
    errdefer allocator.free(manifest_log.writes);
    @memset(manifest_log.writes, undefined);

    // Allocate space for one additional table, so that the code can still use
    // `getOrPutAssumeCapacity` while making it easier to check if the limit has been
    // exceeded to error with a friendly message.
    const table_extents_capacity = manifest_log.forest_table_count_max + 1;
    manifest_log.table_extents = TableExtents{};
    try manifest_log.table_extents.ensureTotalCapacity(allocator, table_extents_capacity);
    errdefer manifest_log.table_extents.deinit(allocator);

    manifest_log.tables_removed = TablesRemoved{};
    try manifest_log.tables_removed.ensureTotalCapacity(
        allocator,
        manifest_log.forest_table_count_max,
    );
    errdefer manifest_log.tables_removed.deinit(allocator);
}
219
+
220
/// Releases all memory acquired by `init`, in the reverse order of allocation.
pub fn deinit(manifest_log: *ManifestLog, allocator: mem.Allocator) void {
    manifest_log.tables_removed.deinit(allocator);
    manifest_log.table_extents.deinit(allocator);

    allocator.free(manifest_log.writes);

    // Free each block before freeing the ring buffer that holds the block pointers.
    for (manifest_log.blocks.buffer) |block_buffer| {
        allocator.free(block_buffer);
    }
    manifest_log.blocks.deinit(allocator);

    manifest_log.log_block_addresses.deinit(allocator);
    manifest_log.log_block_checksums.deinit(allocator);
}
229
+
230
/// Returns the manifest log to its freshly-initialized state without freeing or
/// reallocating any memory: the buffers are emptied and reused, and every other field
/// falls back to its declared default.
pub fn reset(manifest_log: *ManifestLog) void {
    const checksums_count = manifest_log.log_block_checksums.count;
    const addresses_count = manifest_log.log_block_addresses.count;
    assert(checksums_count == addresses_count);

    manifest_log.grid.trace.cancel(.compact_manifest);

    // Empty the containers in place so that their backing memory can be carried over.
    manifest_log.log_block_checksums.clear();
    manifest_log.log_block_addresses.clear();
    for (manifest_log.blocks.buffer) |block| {
        @memset(block, 0);
    }
    manifest_log.table_extents.clearRetainingCapacity();
    manifest_log.tables_removed.clearRetainingCapacity();

    // Capture everything that must survive before overwriting the struct.
    const superblock = manifest_log.superblock;
    const grid = manifest_log.grid;
    const pace = manifest_log.pace;
    const log_block_checksums = manifest_log.log_block_checksums;
    const log_block_addresses = manifest_log.log_block_addresses;
    const blocks_buffer = manifest_log.blocks.buffer;
    const writes = manifest_log.writes;
    const table_extents = manifest_log.table_extents;
    const tables_removed = manifest_log.tables_removed;

    manifest_log.* = .{
        .superblock = superblock,
        .grid = grid,
        .forest_table_count_max = pace.tables_max,
        .pace = pace,
        .log_block_checksums = log_block_checksums,
        .log_block_addresses = log_block_addresses,
        // Only the buffer is kept; the ring buffer's other fields take their defaults.
        .blocks = .{ .buffer = blocks_buffer },
        .writes = writes,
        .table_extents = table_extents,
        .tables_removed = tables_removed,
    };
}
255
+
256
/// Opens the manifest log.
///
/// The manifest blocks are read in reverse order, starting from the newest block referenced
/// by the working superblock, and extent table inserts are passed to `event`. As a result,
/// `event` only sees the latest version of each table for insertion into the in-memory
/// manifest; older versions found in older manifest blocks are not emitted, as an
/// optimization to avoid replaying every table mutation.
/// `ManifestLog.table_extents` is used to track the latest version of a table.
///
/// `callback` is stored as the read callback for the duration of the open.
// TODO(Optimization): Accumulate tables unordered, then sort all at once to splice into the
// ManifestLevels' SegmentedArrays. (Constructing SegmentedArrays by repeated inserts is
// expensive.)
pub fn open(manifest_log: *ManifestLog, event: OpenEvent, callback: Callback) void {
    // The log must be idle and never opened before.
    assert(!manifest_log.opened);
    assert(!manifest_log.reading);
    assert(!manifest_log.writing);
    assert(manifest_log.read_callback == null);

    // The log must be completely empty.
    assert(manifest_log.log_block_checksums.count == 0);
    assert(manifest_log.log_block_addresses.count == 0);
    assert(manifest_log.blocks.count == 0);
    assert(manifest_log.blocks_closed == 0);
    assert(manifest_log.entry_count == 0);
    assert(manifest_log.table_extents.count() == 0);
    assert(manifest_log.tables_removed.count() == 0);

    manifest_log.open_event = event;
    manifest_log.reading = true;
    manifest_log.read_callback = callback;

    const references = manifest_log.superblock.working.manifest_references();
    assert(references.block_count <= manifest_log.log_block_checksums.buffer.len);

    if (references.empty()) {
        // Nothing to read: finish the open on the next tick rather than inline.
        manifest_log.grid.on_next_tick(open_next_tick_callback, &manifest_log.next_tick);
        return;
    }

    const newest_block: BlockReference = .{
        .checksum = references.newest_checksum,
        .address = references.newest_address,
    };
    manifest_log.open_read_block(newest_block);
}
295
+
296
/// Next-tick continuation scheduled by `open` when the working superblock has no manifest
/// block references: checks that nothing was loaded, then finishes via `open_done`.
fn open_next_tick_callback(next_tick: *Grid.NextTick) void {
    // Recover the owning manifest log from its embedded `next_tick` field.
    const manifest_log: *ManifestLog = @alignCast(@fieldParentPtr("next_tick", next_tick));

    assert(!manifest_log.opened);
    assert(manifest_log.reading);
    assert(!manifest_log.writing);

    // No blocks were read, so no references or tables can have been recorded.
    assert(manifest_log.log_block_checksums.count == 0);
    assert(manifest_log.log_block_addresses.count == 0);
    assert(manifest_log.table_extents.count() == 0);
    assert(manifest_log.tables_removed.count() == 0);

    const references = manifest_log.superblock.working.manifest_references();
    assert(references.empty());

    manifest_log.open_done();
}
310
+
311
/// Pushes `block_reference` onto the head of the log block reference buffers, then asks
/// the grid to read that block, passing `open_read_block_callback` as the read callback.
fn open_read_block(manifest_log: *ManifestLog, block_reference: BlockReference) void {
    const checksums = &manifest_log.log_block_checksums;
    const addresses = &manifest_log.log_block_addresses;

    assert(!manifest_log.opened);
    assert(manifest_log.reading);
    assert(manifest_log.read_callback != null);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(manifest_log.table_extents.count() <= manifest_log.forest_table_count_max);
    assert(manifest_log.tables_removed.count() <= manifest_log.forest_table_count_max);
    // There must be room for one more reference, and the two buffers stay in lockstep.
    assert(checksums.count < checksums.buffer.len);
    assert(checksums.count == addresses.count);
    assert(checksums.count <
        manifest_log.superblock.working.vsr_state.checkpoint.manifest_block_count);
    assert(manifest_log.blocks.count == 0);
    assert(manifest_log.blocks_closed == 0);
    assert(manifest_log.entry_count == 0);
    assert(block_reference.address > 0);

    if (constants.verify) {
        // The manifest block list has no cycles.
        var address_iterator = addresses.iterator();
        while (address_iterator.next()) |seen_address| {
            assert(seen_address != block_reference.address);
        }
    }

    checksums.push_head_assume_capacity(block_reference.checksum);
    addresses.push_head_assume_capacity(block_reference.address);

    manifest_log.grid.read_block(
        .{ .from_local_or_global_storage = open_read_block_callback },
        &manifest_log.read,
        block_reference.address,
        block_reference.checksum,
        .{ .cache_read = true, .cache_write = true },
    );
}
349
+
350
/// Completion handler for a manifest block read issued by `open_read_block()`.
///
/// Verifies the block against the reference at the head of the log ring buffers, replays its
/// entries from last to first, and then either finishes the open (when this is the checkpoint's
/// oldest manifest block) or reads the block referenced as "previous".
fn open_read_block_callback(read: *Grid.Read, block: BlockPtrConst) void {
    const manifest_log: *ManifestLog = @fieldParentPtr("read", read);
    assert(manifest_log.reading);
    assert(!manifest_log.writing);
    assert(!manifest_log.opened);
    assert(manifest_log.log_block_checksums.count > 0);
    assert(manifest_log.log_block_addresses.count > 0);
    assert(!manifest_log.superblock.working.manifest_references().empty());

    // open_read_block() pushed this block's reference onto the ring buffer heads.
    const block_address = manifest_log.log_block_addresses.head().?;
    const block_checksum = manifest_log.log_block_checksums.head().?;
    verify_block(block, block_checksum, block_address);

    const block_schema = schema.ManifestNode.from(block);
    assert(block_schema.entry_count > 0);
    assert(block_schema.entry_count <= schema.ManifestNode.entry_count_max);
    const tables = block_schema.tables_const(block);

    // Walk the block's entries in reverse (last entry first).
    var entries_remaining = block_schema.entry_count;
    while (entries_remaining > 0) {
        entries_remaining -= 1;

        const entry = entries_remaining;
        const table = &tables[entry];
        assert(table.address > 0);

        switch (table.label.event) {
            .reserved => unreachable,
            .remove => {
                // Record the removal; a table address may be recorded as removed only once.
                const removed_before =
                    manifest_log.tables_removed.fetchPutAssumeCapacity(table.address, {});
                assert(removed_before == null);
            },
            .insert, .update => {
                if (manifest_log.tables_removed.get(table.address) != null) {
                    // A remove was already replayed for this table, so this entry is skipped.
                    // Reaching the table's insert additionally clears the removal record.
                    if (table.label.event == .insert) {
                        const removal_cleared =
                            manifest_log.tables_removed.remove(table.address);
                        assert(removal_cleared);
                    }
                } else {
                    const extent =
                        manifest_log.table_extents.getOrPutAssumeCapacity(table.address);
                    // Only the first entry replayed for a table sets its extent and is
                    // reported through `open_event`.
                    if (!extent.found_existing) {
                        manifest_log.check_tables_count();
                        extent.value_ptr.* = .{ .block = block_address, .entry = entry };
                        manifest_log.open_event(manifest_log, table);
                    }
                }
            },
        }
    }

    log.debug("{}: opened: checksum={x:0>32} address={} entries={}", .{
        manifest_log.superblock.replica_index.?,
        block_checksum,
        block_address,
        block_schema.entry_count,
    });

    const checkpoint_state = &manifest_log.superblock.working.vsr_state.checkpoint;
    const is_oldest_block = checkpoint_state.manifest_oldest_address == block_address;
    if (!is_oldest_block) {
        const reference_previous = schema.ManifestNode.previous(block).?;

        manifest_log.open_read_block(.{
            .checksum = reference_previous.checksum,
            .address = reference_previous.address,
        });
        return;
    }

    // When we find the oldest block, stop iterating the linked list – any more blocks
    // have already been compacted away.
    assert(checkpoint_state.manifest_oldest_checksum == block_checksum);

    manifest_log.open_done();
}
420
+
421
/// Finishes opening: checks that every manifest block counted by the checkpoint was read,
/// marks the log as opened, clears the read state, and invokes the pending read callback.
fn open_done(manifest_log: *ManifestLog) void {
    assert(manifest_log.reading);
    assert(manifest_log.read_callback != null);
    assert(!manifest_log.opened);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);

    // No blocks are being built while opening.
    assert(manifest_log.entry_count == 0);
    assert(manifest_log.blocks_closed == 0);
    assert(manifest_log.blocks.count == 0);

    assert(manifest_log.tables_removed.count() <= manifest_log.forest_table_count_max);
    assert(manifest_log.table_extents.count() <= manifest_log.forest_table_count_max);

    // The number of blocks read matches the checkpoint's manifest block count.
    const checkpoint_state = &manifest_log.superblock.working.vsr_state.checkpoint;
    const blocks_read = manifest_log.log_block_checksums.count;
    assert(blocks_read == manifest_log.log_block_addresses.count);
    assert(blocks_read == checkpoint_state.manifest_block_count);

    log.debug("{}: open_done: opened block_count={} table_count={}", .{
        manifest_log.superblock.replica_index.?,
        manifest_log.log_block_checksums.count,
        manifest_log.table_extents.count(),
    });

    // Take the callback before resetting state, and invoke it only after the reset.
    const callback = manifest_log.read_callback.?;
    manifest_log.read_callback = null;
    manifest_log.reading = false;
    manifest_log.open_event = undefined;
    manifest_log.opened = true;

    callback(manifest_log);
}
451
+
452
/// Appends a table's insert/update/remove event for a level to the manifest log.
///
/// Moving a table between levels is recorded as a single update. No remove is logged for the
/// previous level: this is safer (the event order cannot be gotten wrong) and reduces
/// fragmentation.
pub fn append(manifest_log: *ManifestLog, table: *const TableInfo) void {
    maybe(manifest_log.opened);
    maybe(manifest_log.reading);
    assert(!manifest_log.writing);

    const extent_exists = manifest_log.table_extents.get(table.address) != null;
    switch (table.label.event) {
        .reserved => unreachable,
        // An inserted table must not already be tracked.
        .insert => assert(!extent_exists),
        // For updates + removes, the table must have previously been inserted into the log:
        .update, .remove => assert(extent_exists),
    }

    manifest_log.append_internal(table);
}
471
+
472
/// Writes one table event into the open manifest block and updates `table_extents` at once.
///
/// The extent update must not be delayed: otherwise ManifestLog.compact() may append a stale
/// version of a table over the latest one.
///
/// Shared by:
/// - External appends, e.g. events created due to table compaction.
/// - Internal appends, e.g. events recycled by manifest compaction.
fn append_internal(manifest_log: *ManifestLog, table: *const TableInfo) void {
    assert(manifest_log.opened);
    maybe(manifest_log.reading);
    assert(!manifest_log.writing);
    assert(manifest_log.grid_reservation != null);
    assert(table.address > 0);
    assert(table.label.level < constants.lsm_levels);
    assert(table.snapshot_min > 0);
    assert(table.snapshot_max > table.snapshot_min);

    if (manifest_log.entry_count == 0) {
        // There is no open block yet: every block is closed, so start a new one.
        assert(manifest_log.blocks.count == manifest_log.blocks_closed);
        manifest_log.acquire_block();
    } else {
        assert(manifest_log.blocks.count > 0);
    }

    // Exactly one block is open, and it has room for this entry.
    assert(manifest_log.blocks.count - manifest_log.blocks_closed == 1);
    assert(manifest_log.entry_count < schema.ManifestNode.entry_count_max);

    log.debug(
        "{}: {s}: level={} tree={} checksum={x:0>32} address={} snapshot={}..{}",
        .{
            manifest_log.superblock.replica_index.?,
            @tagName(table.label.event),
            table.label.level,
            table.tree_id,
            table.checksum,
            table.address,
            table.snapshot_min,
            table.snapshot_max,
        },
    );

    const entry = manifest_log.entry_count;
    const block_open: BlockPtr = manifest_log.blocks.tail().?;
    const header_open =
        mem.bytesAsValue(vsr.Header.Block, block_open[0..@sizeOf(vsr.Header)]);
    block_builder_schema.tables(block_open)[entry] = table.*;

    switch (table.label.event) {
        .reserved => unreachable,
        .remove => {
            const extent_removed = manifest_log.table_extents.remove(table.address);
            assert(extent_removed);
        },
        .insert, .update => {
            const extent = manifest_log.table_extents.getOrPutAssumeCapacity(table.address);
            if (extent.found_existing) {
                maybe(table.label.event == .insert); // (Compaction.)
            } else {
                assert(table.label.event == .insert);

                // When inserting, check that the insertion didn't cause the number of
                // tables to exceed `forest_table_count_max`.
                manifest_log.check_tables_count();
            }
            // Point the table's extent at the entry that was just written.
            extent.value_ptr.* = .{ .block = header_open.address, .entry = entry };
        },
    }

    manifest_log.entry_count += 1;
    if (manifest_log.entry_count == schema.ManifestNode.entry_count_max) {
        // The open block is full.
        manifest_log.close_block();
        assert(manifest_log.entry_count == 0);
    }
}
546
+
547
/// Reports a fatal error through `vsr.fatal` when `table_extents` holds more tables than
/// `forest_table_count_max`; otherwise does nothing.
fn check_tables_count(manifest_log: *ManifestLog) void {
    const tables_count = manifest_log.table_extents.count();
    if (tables_count <= manifest_log.forest_table_count_max) return;

    vsr.fatal(
        .forest_tables_count_would_exceed_limit,
        "forest_tables_count would exceed limit " ++
            "(tables_count={} tables_max={}) - " ++
            "please contact the team directly who will be able to assist",
        .{ tables_count, manifest_log.forest_table_count_max },
    );
}
559
+
560
/// Starts writing every closed block to the grid. `callback` is invoked (via `flush_done()`)
/// once all of the writes have completed.
///
/// When there are no closed blocks, completion is deferred to the grid's next tick rather than
/// invoking the callback from within this call.
fn flush(manifest_log: *ManifestLog, callback: Callback) void {
    assert(manifest_log.opened);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(!manifest_log.reading);

    const blocks_to_write = manifest_log.blocks_closed;
    log.debug("{}: flush: writing {} block(s)", .{
        manifest_log.superblock.replica_index.?,
        blocks_to_write,
    });

    manifest_log.write_callback = callback;
    manifest_log.writing = true;

    // One write per closed block; write_block() picks the block from `writes_pending`.
    for (0..blocks_to_write) |_| {
        manifest_log.write_block();
    }
    assert(manifest_log.blocks_closed == manifest_log.writes_pending);

    if (manifest_log.writes_pending == 0) {
        manifest_log.grid.on_next_tick(flush_next_tick_callback, &manifest_log.next_tick);
    }
}
581
+
582
/// Next-tick continuation scheduled by `flush()` when there was nothing to write.
/// Recovers the `ManifestLog` from its embedded `next_tick` field and completes the flush.
fn flush_next_tick_callback(next_tick: *Grid.NextTick) void {
    const manifest_log: *ManifestLog = @alignCast(
        @fieldParentPtr("next_tick", next_tick),
    );
    assert(manifest_log.writing);

    manifest_log.flush_done();
}
588
+
589
/// Completes a flush: clears the write state and then invokes the pending write callback.
fn flush_done(manifest_log: *ManifestLog) void {
    assert(manifest_log.blocks_closed == 0);
    assert(manifest_log.writing);
    assert(manifest_log.write_callback != null);

    // Reset the state first, so that the log is no longer "writing" when the callback runs.
    const callback = manifest_log.write_callback.?;
    manifest_log.writing = false;
    manifest_log.write_callback = null;

    callback(manifest_log);
}
599
+
600
/// Submits the grid write for the next closed block that has not been submitted yet.
///
/// The block (and the `writes` slot used for it) is selected by `writes_pending`, which is
/// incremented before the write is handed to the grid.
fn write_block(manifest_log: *ManifestLog) void {
    assert(manifest_log.writing);
    assert(manifest_log.opened);
    assert(manifest_log.blocks_closed > 0);
    assert(manifest_log.writes_pending < manifest_log.blocks_closed);
    assert(manifest_log.blocks_closed <= manifest_log.blocks.count);

    const block_index = manifest_log.writes_pending;
    const block = manifest_log.blocks.get_ptr(block_index).?;
    verify_block(block.*, null, null);

    const header = schema.header_from_block(block.*);
    assert(header.address > 0);

    const block_schema = schema.ManifestNode.from(block.*);
    assert(block_schema.entry_count > 0);

    const is_last_closed_block = block_index == manifest_log.blocks_closed - 1;
    if (is_last_closed_block) {
        // This might be the last block of a checkpoint, which can be a partial block.
        assert(block_schema.entry_count <= schema.ManifestNode.entry_count_max);
    } else {
        // Every other closed block is full.
        assert(block_schema.entry_count == schema.ManifestNode.entry_count_max);
    }

    log.debug("{}: write_block: checksum={x:0>32} address={} entries={}", .{
        manifest_log.superblock.replica_index.?,
        header.checksum,
        header.address,
        block_schema.entry_count,
    });

    // The write slot carries a back-pointer so that the callback can find the ManifestLog.
    const write_slot = &manifest_log.writes[block_index];
    write_slot.* = .{ .manifest_log = manifest_log };

    manifest_log.writes_pending += 1;
    manifest_log.grid.create_block(write_block_callback, &write_slot.write, block);
}
637
+
638
/// Completion handler for a block write submitted by `write_block()`.
///
/// Once the last outstanding write completes, the closed blocks are dropped from the head of
/// `blocks` and the flush is finished.
fn write_block_callback(grid_write: *Grid.Write) void {
    const write: *Write = @fieldParentPtr("write", grid_write);
    const manifest_log = write.manifest_log;
    assert(manifest_log.writing);
    assert(manifest_log.opened);
    assert(manifest_log.blocks_closed <= manifest_log.blocks.count);

    manifest_log.writes_pending -= 1;
    // Other writes are still outstanding.
    if (manifest_log.writes_pending != 0) return;

    // Release one buffered block per closed block, leaving `blocks_closed` at zero.
    while (manifest_log.blocks_closed > 0) : (manifest_log.blocks_closed -= 1) {
        manifest_log.blocks.advance_head();
    }

    if (manifest_log.blocks.count != 0) {
        // At most a single, not-yet-full open block may remain.
        assert(manifest_log.blocks.count == 1);
        assert(manifest_log.entry_count < schema.ManifestNode.entry_count_max);
    } else {
        assert(manifest_log.entry_count == 0);
    }

    manifest_log.flush_done();
}
661
+
662
/// Begins manifest compaction for a half-bar. `callback` is invoked when it has finished.
///
/// `compact` does not close a partial block; that is only necessary during `checkpoint`.
///
/// The (production) block size is large, so the number of blocks compacted per half-bar is
/// relatively small (e.g. ~4). We read them in sequence rather than parallel to spread the
/// work more evenly across the half-bar's beats.
// TODO Make sure block reservation cannot fail — before compaction begins verify that
// enough free blocks are available for all reservations.
pub fn compact(manifest_log: *ManifestLog, callback: Callback, op: u64) void {
    assert(manifest_log.opened);
    assert(!manifest_log.reading);
    assert(manifest_log.read_callback == null);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(manifest_log.compact_blocks == null);
    assert(manifest_log.grid_reservation == null);
    assert(manifest_log.blocks.count ==
        manifest_log.blocks_closed + @intFromBool(manifest_log.entry_count > 0));

    // TODO: Currently manifest compaction is hardcoded to run on the last beat of each
    // half-bar.
    // This is because otherwise it would mess with our grid reserve / forfeit ordering,
    // since we now reserve / forfeit per beat.
    const half_bar_ops = @divExact(constants.lsm_compaction_ops, 2);
    assert((op + 1) % half_bar_ops == 0);

    manifest_log.grid.trace.start(.compact_manifest);

    const compaction_skipped = op < constants.lsm_compaction_ops or
        manifest_log.superblock.working.vsr_state.op_compacted(op);
    if (compaction_skipped) {
        // Nothing to compact for this op: complete on the grid's next tick.
        manifest_log.read_callback = callback;
        manifest_log.grid.on_next_tick(compact_tick_callback, &manifest_log.next_tick);
        return;
    }

    const blocks_paced = manifest_log.pace.half_bar_compact_blocks(.{
        .log_blocks_count = @intCast(manifest_log.log_block_checksums.count),
        .tables_count = manifest_log.table_extents.count(),
    });
    // Never compact closed blocks. (They haven't even been written yet.)
    const blocks_written =
        manifest_log.log_block_checksums.count - manifest_log.blocks_closed;
    manifest_log.compact_blocks = @min(blocks_paced, blocks_written);
    assert(manifest_log.compact_blocks.? <= manifest_log.pace.half_bar_compact_blocks_max);

    // Reserve blocks for both the compaction itself and the half-bar's appends.
    manifest_log.grid_reservation = manifest_log.grid.reserve(
        manifest_log.compact_blocks.? + manifest_log.pace.half_bar_append_blocks_max,
    );

    manifest_log.read_callback = callback;
    manifest_log.flush(compact_next_block);
}
714
+
715
/// Next-tick continuation for a `compact()` call that skipped compaction.
/// Stops the `.compact_manifest` trace and invokes the callback stored by `compact()`.
fn compact_tick_callback(next_tick: *Grid.NextTick) void {
    const manifest_log: *ManifestLog = @alignCast(@fieldParentPtr("next_tick", next_tick));

    // Nothing was reserved, selected for compaction, or buffered.
    assert(manifest_log.grid_reservation == null);
    assert(manifest_log.compact_blocks == null);
    assert(manifest_log.write_callback == null);
    assert(manifest_log.entry_count == 0);
    assert(manifest_log.blocks.count == 0);
    assert(manifest_log.blocks_closed == 0);

    manifest_log.grid.trace.stop(.compact_manifest);

    const callback = manifest_log.read_callback.?;
    manifest_log.read_callback = null;
    callback(manifest_log);
}
730
+
731
/// Compacts the next block, or finishes compaction when `compact_blocks` has reached zero.
///
/// The block to compact is the one at the head of the log ring buffers. It is only read here;
/// `compact_read_block_callback` pops it once the read completes.
fn compact_next_block(manifest_log: *ManifestLog) void {
    assert(manifest_log.opened);
    assert(manifest_log.read_callback != null);
    assert(manifest_log.grid_reservation != null);
    assert(!manifest_log.writing);
    assert(!manifest_log.reading);

    if (manifest_log.compact_blocks.? == 0) {
        manifest_log.compact_done_callback();
        return;
    }

    const oldest_address = manifest_log.log_block_addresses.head().?;
    const oldest_checksum = manifest_log.log_block_checksums.head().?;
    assert(oldest_address > 0);

    manifest_log.reading = true;
    manifest_log.compact_blocks.? -= 1;
    manifest_log.grid.read_block(
        .{ .from_local_or_global_storage = compact_read_block_callback },
        &manifest_log.read,
        oldest_address,
        oldest_checksum,
        .{ .cache_read = true, .cache_write = true },
    );
}
757
+
758
/// Completion handler for the read issued by `compact_next_block()`.
///
/// Pops the block's reference from the log ring buffers, re-appends every entry that is still
/// the latest version of its table, counts the rest as freed, releases the block's address
/// back to the grid, and moves on to the next block.
fn compact_read_block_callback(read: *Grid.Read, block: BlockPtrConst) void {
    const manifest_log: *ManifestLog = @fieldParentPtr("read", read);
    assert(manifest_log.opened);
    assert(manifest_log.reading);
    assert(manifest_log.read_callback != null);
    assert(manifest_log.grid_reservation != null);
    assert(!manifest_log.writing);

    const oldest_checksum = manifest_log.log_block_checksums.pop().?;
    const oldest_address = manifest_log.log_block_addresses.pop().?;
    verify_block(block, oldest_checksum, oldest_address);

    const block_schema = schema.ManifestNode.from(block);
    assert(block_schema.entry_count > 0);
    assert(block_schema.entry_count <= schema.ManifestNode.entry_count_max);

    var frees: u32 = 0;
    for (
        block_schema.tables_const(block),
        0..block_schema.entry_count,
    ) |*table, entry_index| {
        const entry: u32 = @intCast(entry_index);

        // An entry is live only if the table's extent still points at exactly this entry.
        // We must iterate entries in forward order to drop the extent here.
        // Otherwise, stale versions earlier in the block may reappear.
        const entry_is_latest = switch (table.label.event) {
            .reserved => unreachable,
            .insert, .update => std.meta.eql(
                manifest_log.table_extents.get(table.address),
                .{ .block = oldest_address, .entry = entry },
            ),
            // Since we compact oldest blocks first, we know that we have already
            // compacted all inserts that were eclipsed by this remove, so this remove
            // can now be safely dropped.
            .remove => false,
        };

        if (entry_is_latest) {
            // Append the table, updating the table extent:
            manifest_log.append_internal(table);
        } else {
            // For an insert/update, either:
            // - This is not the latest insert for this table, so it can be dropped.
            // - The table was removed some time after this insert.
            frees += 1;
        }
    }

    log.debug("{}: compacted: checksum={x:0>32} address={} free={}/{}", .{
        manifest_log.superblock.replica_index.?,
        oldest_checksum,
        oldest_address,
        frees,
        block_schema.entry_count,
    });

    // Blocks are compacted in sequence – not skipped, even if no entries will be freed.
    // (That should be rare though, since blocks are large.)
    // This is necessary to update the block's "previous block" pointer in the header.
    maybe(frees == 0);
    assert(manifest_log.blocks_closed <= manifest_log.pace.half_bar_compact_blocks_max);

    manifest_log.grid.release(&.{oldest_address});
    manifest_log.reading = false;

    manifest_log.compact_next_block();
}
828
+
829
/// Finishes a compaction: stops the `.compact_manifest` trace, clears `compact_blocks` and the
/// stored callback, and then invokes that callback.
///
/// The grid reservation is left in place here (it is asserted to be non-null and not cleared).
fn compact_done_callback(manifest_log: *ManifestLog) void {
    assert(manifest_log.opened);
    assert(manifest_log.read_callback != null);
    assert(manifest_log.grid_reservation != null);
    assert(manifest_log.compact_blocks.? == 0);
    assert(!manifest_log.writing);
    assert(!manifest_log.reading);
    assert(manifest_log.blocks_closed <= manifest_log.pace.half_bar_compact_blocks_max);

    manifest_log.grid.trace.stop(.compact_manifest);

    const callback = manifest_log.read_callback.?;
    manifest_log.compact_blocks = null;
    manifest_log.read_callback = null;

    callback(manifest_log);
}
846
+
847
/// Ends the half-bar's compaction by forfeiting the grid reservation, if one is held.
pub fn compact_end(manifest_log: *ManifestLog) void {
    assert(manifest_log.opened);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(!manifest_log.reading);
    assert(manifest_log.read_callback == null);

    const reservation = manifest_log.grid_reservation orelse {
        // Compaction was skipped for this half-bar.
        assert(manifest_log.blocks_closed == 0);
        assert(manifest_log.blocks.count == 0);
        assert(manifest_log.entry_count == 0);
        return;
    };

    manifest_log.grid.forfeit(reservation);
    manifest_log.grid_reservation = null;
}
864
+
865
/// Closes the open block (if it holds any entries) and flushes every closed block.
/// `callback` is invoked when the flush completes.
pub fn checkpoint(manifest_log: *ManifestLog, callback: Callback) void {
    assert(manifest_log.opened);
    assert(manifest_log.grid_reservation == null);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(!manifest_log.reading);

    const block_open_has_entries = manifest_log.entry_count > 0;
    if (block_open_has_entries) {
        // Unlike compact(), a checkpoint closes a partial block.
        manifest_log.close_block();
        assert(manifest_log.blocks_closed > 0);
        assert(manifest_log.entry_count == 0);
    }
    // Every buffered block is now closed.
    assert(manifest_log.blocks_closed == manifest_log.blocks.count);

    manifest_log.flush(callback);
}
881
+
882
/// Returns the superblock's manifest references for the current log: the oldest (head) and
/// newest (tail) block references plus the block count, or an all-zero value for an empty log.
///
/// Asserts that no blocks are buffered.
pub fn checkpoint_references(
    manifest_log: *const ManifestLog,
) vsr.SuperBlockManifestReferences {
    assert(manifest_log.opened);
    assert(manifest_log.grid_reservation == null);
    assert(!manifest_log.writing);
    assert(manifest_log.write_callback == null);
    assert(!manifest_log.reading);
    assert(manifest_log.entry_count == 0);
    assert(manifest_log.blocks_closed == 0);
    assert(manifest_log.blocks.count == 0);
    assert(manifest_log.log_block_checksums.count ==
        manifest_log.log_block_addresses.count);

    const log_block_count = manifest_log.log_block_addresses.count;
    if (log_block_count == 0) {
        return std.mem.zeroes(vsr.SuperBlockManifestReferences);
    }

    return .{
        .oldest_checksum = manifest_log.log_block_checksums.head().?,
        .oldest_address = manifest_log.log_block_addresses.head().?,
        .newest_checksum = manifest_log.log_block_checksums.tail().?,
        .newest_address = manifest_log.log_block_addresses.tail().?,
        .block_count = @intCast(log_block_count),
    };
}
908
+
909
/// Opens a new block at the tail of `blocks`: acquires its grid address from the current
/// reservation and initializes its header. `size` and `metadata_bytes` are left undefined.
fn acquire_block(manifest_log: *ManifestLog) void {
    assert(manifest_log.opened);
    maybe(manifest_log.reading);
    assert(!manifest_log.writing);
    assert(manifest_log.entry_count == 0);
    assert(!manifest_log.blocks.full());
    assert(manifest_log.blocks.count == manifest_log.blocks_closed);
    assert(manifest_log.log_block_checksums.count ==
        manifest_log.log_block_addresses.count);

    manifest_log.blocks.advance_tail();
    const block_open: BlockPtr = manifest_log.blocks.tail().?;

    // The ManifestLog acquires block addresses eagerly here, rather than deferring until
    // close_block(). This is because the open block's address must be inserted into
    // `table_extents` at the same time the entry is appended to the open block.
    const address_open = manifest_log.grid.acquire(manifest_log.grid_reservation.?);

    const header_open =
        mem.bytesAsValue(vsr.Header.Block, block_open[0..@sizeOf(vsr.Header)]);
    header_open.* = .{
        .cluster = manifest_log.superblock.working.cluster,
        .address = address_open,
        .snapshot = 0, // TODO(snapshots): Set this properly; it is useful for debugging.
        .size = undefined,
        .command = .block,
        .release = manifest_log.superblock.working.vsr_state.checkpoint.release,
        .metadata_bytes = undefined, // Set by close_block().
        .block_type = .manifest,
    };
}
939
+
940
/// Seals the open block: finalizes its header (size, metadata linking to the previous manifest
/// block, checksums), zeroes the padding, and appends its reference to the log ring buffers.
/// Afterwards the block counts as closed and `entry_count` is reset to zero.
fn close_block(manifest_log: *ManifestLog) void {
    assert(manifest_log.opened);
    maybe(manifest_log.reading);
    assert(!manifest_log.writing);
    assert(manifest_log.log_block_checksums.count <
        manifest_log.log_block_checksums.buffer.len);
    assert(manifest_log.blocks.count == manifest_log.blocks_closed + 1);

    const entry_count = manifest_log.entry_count;
    assert(entry_count > 0);
    assert(entry_count <= schema.ManifestNode.entry_count_max);

    const block_open: BlockPtr = manifest_log.blocks.tail().?;
    const header =
        mem.bytesAsValue(vsr.Header.Block, block_open[0..@sizeOf(vsr.Header)]);
    assert(header.address > 0);
    assert(header.command == .block);
    assert(header.cluster == manifest_log.superblock.working.cluster);

    const block_schema = schema.ManifestNode{ .entry_count = entry_count };
    header.size = block_schema.size();

    // Link to the newest block already in the log; zeroes when the log is empty.
    header.metadata_bytes = @bitCast(schema.ManifestNode.Metadata{
        .previous_manifest_block_checksum = manifest_log.log_block_checksums.tail() orelse 0,
        .previous_manifest_block_address = manifest_log.log_block_addresses.tail() orelse 0,
        .entry_count = entry_count,
    });

    // Zero padding:
    @memset(block_open[header.size..], 0);

    // The body checksum is set before the header checksum.
    header.set_checksum_body(block_open[@sizeOf(vsr.Header)..header.size]);
    header.set_checksum();
    verify_block(block_open, null, null);

    manifest_log.log_block_checksums.push_assume_capacity(header.checksum);
    manifest_log.log_block_addresses.push_assume_capacity(header.address);

    log.debug("{}: close_block: checksum={x:0>32} address={} entries={}/{}", .{
        manifest_log.superblock.replica_index.?,
        header.checksum,
        header.address,
        entry_count,
        schema.ManifestNode.entry_count_max,
    });

    manifest_log.entry_count = 0;
    manifest_log.blocks_closed += 1;
    assert(manifest_log.blocks.count == manifest_log.blocks_closed);
}
990
+
991
/// Asserts that `block` is a well-formed manifest block: valid header and body checksums,
/// block type `.manifest`, and an entry count within `1..=entry_count_max`.
///
/// When `address` and/or `checksum` are non-null, the block's header must match them.
fn verify_block(block: BlockPtrConst, checksum: ?u128, address: ?u64) void {
    {
        const frame = std.mem.bytesAsValue(vsr.Header, block[0..@sizeOf(vsr.Header)]);
        const body = block[@sizeOf(vsr.Header)..frame.size];
        assert(frame.valid_checksum());
        assert(frame.valid_checksum_body(body));
    }

    const header = schema.header_from_block(block);
    assert(header.block_type == .manifest);

    if (address) |address_expected| assert(header.address == address_expected);
    if (checksum) |checksum_expected| assert(header.checksum == checksum_expected);

    const entry_count = schema.ManifestNode.from(block).entry_count;
    assert(entry_count > 0);
    assert(entry_count <= schema.ManifestNode.entry_count_max);
}
1008
+ };
1009
+ }
1010
+
1011
+ /// The goals of manifest log compaction are (in no particular order):
1012
+ ///
1013
+ /// 1. Free enough manifest blocks such that there are always enough free slots in the manifest
1014
+ /// log checksums/addresses ring buffers to accommodate the appends by table compaction.
1015
+ /// 2. Shrink the manifest log: A smaller manifest means that fewer blocks need to be replayed
1016
+ /// during recovery, or repaired during state sync.
1017
+ /// 3. Don't shrink the manifest too much: The more manifest compaction work is deferred, the more
1018
+ /// "efficient" compaction is. Put another way: deferring manifest compaction means that more
1019
+ /// entries are freed per block compacted.
1020
+ /// 4. Spread compaction work evenly between half-bars, to avoid latency spikes.
1021
+ ///
1022
+ /// To address goal 1, we must (on average) "remove" as many blocks from the manifest log as we add.
1023
+ /// But when we compact a block, only a subset of its entries can be freed/dropped – the remainder
1024
+ /// must be re-appended to the manifest log.
1025
+ ///
1026
+ /// The upper-bound number of manifest blocks is related to the rate at which we compact blocks.
1027
+ /// Put simply, the more compaction work we do, the smaller the upper bound.
1028
+ ///
1029
+ ///
1030
+ /// To reason about this relation mathematically, and compute the upper-bound number of manifest
1031
+ /// blocks in terms of the compaction rate:
1032
+ ///
1033
+ /// - Let `A` be the maximum number of manifest blocks that may be created by any single half-bar
1034
+ /// due to appends via table compaction. (In other words, `A` does not count manifest compaction.)
1035
+ /// - Let `T` be the minimum number of manifest blocks to hold `table_count_max` tables (inserts).
1036
+ /// - Let `C` be the maximum number of manifest blocks to compact (i.e. read) during any half-bar.
1037
+ /// - In the worst case, compacting a block frees no entries.
1038
+ /// - (Then `C` is also the worst-case number of manifest blocks *written* due to manifest
1039
+ /// compaction during each half-bar.)
1040
+ ///
1041
+ /// Suppose that at a certain point in time `t₀`, there are `M₀` manifest blocks total.
1042
+ ///
1043
+ /// If we compact at least `C` manifest blocks for each of `⌈M₀/C⌉` half-bars, then any of the
1044
+ /// initial `M₀` manifest blocks that required compaction at time `t₀` have been compacted.
1045
+ /// In the worst case (where all of those `M₀` blocks were full of live entries) we now have as
1046
+ /// many as `M₁ = min(M₀,T) + A×⌈M₀/C⌉` manifest blocks:
1047
+ ///
1048
+ /// - `min(M₀,T)`: After compacting the original `M₀` blocks, we may produce as many as `M₀`
1049
+ /// blocks (if no entries were freed). But if there are more than `T` blocks then some *must* be
1050
+ /// dropped, since `T` is the upper-bound of a fully-compacted manifest.
1051
+ /// - `⌈M₀/C⌉` is the number of half-bars that it takes to compact the initial `M₀` manifest
1052
+ /// blocks.
1053
+ /// - `A×⌈M₀/C⌉` is the maximum number of manifest blocks produced by table compaction while
1054
+ /// compacting the original `M₀` manifest blocks.
1055
+ ///
1056
+ /// If we cycle again, starting with `M₁` manifest blocks this time, then at the end of the cycle
1057
+ /// there are at most `M₂ = min(M₁,T) + A×⌈M₁/C⌉` manifest blocks.
1058
+ ///
1059
+ /// To generalize, at the beginning of any cycle `c`, the maximum number of manifest blocks
1060
+ /// (`MC(c)`) is:
1061
+ ///
1062
+ /// MC(c) = min(T, MC(c-1)) + A×⌈MC(c-1)/C⌉
1063
+ ///
1064
+ ///
1065
+ /// However, *within* a cycle the manifest block count may "burst" temporarily beyond this limit.
1066
+ /// We compact chronologically. If the blocks early in the manifest have no/few free entries, we
1067
+ /// must still compact them anyway, shifting their entries from the prefix of the log to its suffix.
1068
+ /// During that time, the table-compact appends still occur, so the net manifest log size grows.
1069
+ ///
1070
+ /// The lower-bound for the number of blocks freed (`F(k)`) in terms of the number of blocks
1071
+ /// compacted (`k`) is:
1072
+ ///
1073
+ /// F(k) ≥ max(0, k - (T + 1))
1074
+ ///
1075
+ /// In other words:
1076
+ /// - After compacting `T` or fewer blocks, we may not have freed any whole blocks.
1077
+ /// - After compacting `T+1` blocks, we must have freed at least 1 whole block.
1078
+ /// - After compacting `T+2` blocks, we must have freed at least 2 whole blocks.
1079
+ /// - Etc.
1080
+ ///
1081
+ /// Then the upper-bound number of manifest blocks (`MB(b)`) at any half-bar boundary (`b`) is:
1082
+ ///
1083
+ /// MB(b) = min(T, MB(b-1)) + A×⌈MB(b-1)/C⌉ + A×⌈(T+1)/C⌉
1084
+ ///
1085
+ /// As `b` approaches infinity, this recurrence relation converges (iff `C > A`) to the absolute
1086
+ /// upper-bound number of manifest blocks.
1087
+ ///
1088
+ /// As `C` increases (relative to `A`), the manifest block upper-bound decreases, but the amount of
1089
+ /// compaction work performed increases.
1090
+ ///
1091
+ /// If, for any half-bar that the manifest log contains at least `MC(∞)` blocks we compact at least
1092
+ /// `C` blocks, then the total size of the manifest log will never exceed `MB(∞)` blocks.
1093
+ ///
1094
+ /// NOTE: Both the algorithm above and the implementation below make several simplifications:
1095
+ ///
1096
+ /// - The calculation is performed at the granularity of blocks, not entries. In particular, this
1097
+ /// means that "A" might in truth be fractional, but we would round up. For example, if "A" is
1098
+ /// 2.1, for the purposes of the upper-bound it is 3. Because `C` is computed (below) as
1099
+ /// "A + compact_extra_blocks", the result is that we perform more compaction (relative to
1100
+ /// appends) than the block-granular constants indicate.
1101
+ /// As a result, we overestimate the upper-bound (or, equivalently, perform compaction more
1102
+ /// quickly than strictly necessary).
1103
+ /// - The calculation does *not* consider the "padding" appends into a partial block written
1104
+ /// during a checkpoint. This oversight is masked because "A" is overestimated (see previous
1105
+ /// bullet).
1106
+ ///
1107
pub const Pace = struct {
    /// "A":
    /// Upper bound on the manifest blocks that table appends can add within one half-bar.
    ///
    /// Included in the count:
    /// - Updates to input tables in the manifest (their snapshot_max is reduced).
    /// - Removals of input tables from the manifest (when no persistent snapshot holds them).
    /// - Insertions of output tables into the manifest.
    /// Excluded from the count:
    /// - Manifest log compaction.
    /// - Releasing persistent snapshots.
    half_bar_append_blocks_max: u32,

    /// "C":
    /// Upper bound on the manifest blocks compacted (i.e. read) within one half-bar.
    half_bar_compact_blocks_max: u32,

    /// "T":
    /// Upper bound on the number of blocks in a fully-compacted manifest.
    /// (The struct exposes this only so that it can be logged.)
    log_blocks_full_max: u64,

    /// "limit of MC(c) as c approaches ∞"
    log_blocks_cycle_max: u64,
    /// "limit of MB(b) as b approaches ∞"
    log_blocks_max: u64,

    /// Copied from `init`'s `options.tables_max`.
    /// `half_bar_compact_blocks` asserts that the live table count never exceeds it.
    tables_max: u32,

    // Debugging aid: set `log_pace` to true to pass every field of a sample `Pace` to
    // `@compileLog`, one "ManifestLog.Pace.<field> = <value>" line per field.
    comptime {
        const log_pace = false;
        if (log_pace) {
            const sample = Pace.init(.{
                .tree_count = 24,
                .tables_max = 2_300_000,
                .compact_extra_blocks = constants.lsm_manifest_compact_extra_blocks,
            });

            for (std.meta.fields(Pace)) |field| {
                const message = std.fmt.comptimePrint("ManifestLog.Pace.{s} = {d}", .{
                    field.name,
                    @field(sample, field.name),
                });
                @compileLog(message);
            }
        }
    }

    /// Derives the pacing limits ("A", "C", "T", and the limits of MC and MB) from the options.
    ///
    /// Asserts that every option is positive and that `tables_max` exceeds `tree_count`.
    /// Panics if the iteration for the MC limit has not reached a fixed point after 1024 steps.
    pub fn init(options: struct {
        tree_count: u32,
        tables_max: u32,
        compact_extra_blocks: u32,
    }) Pace {
        assert(options.tree_count > 0);
        assert(options.tables_max > 0);
        assert(options.tables_max > options.tree_count);
        assert(options.compact_extra_blocks > 0);

        const entries_per_block = schema.ManifestNode.entry_count_max;

        // trees × (maximum number of compactions per half-bar) × (entries per compaction), where
        // each compaction may update snapshot_max of, and remove, every input table, and insert
        // every output table.
        const append_entries_max = options.tree_count *
            stdx.div_ceil(constants.lsm_levels, 2) *
            (compaction.compaction_tables_input_max +
                compaction.compaction_tables_input_max +
                compaction.compaction_tables_output_max);

        // "A":
        const append_blocks_max = stdx.div_ceil(append_entries_max, entries_per_block);

        // "C": strictly more than "A", by the configured number of extra blocks.
        const compact_blocks_extra = options.compact_extra_blocks;
        assert(compact_blocks_extra > 0);

        const compact_blocks_max = append_blocks_max + compact_blocks_extra;
        assert(compact_blocks_max > append_blocks_max);

        // "T":
        const full_blocks_max = stdx.div_ceil(options.tables_max, entries_per_block);
        assert(full_blocks_max > 0);

        // "limit of MC(c) as c approaches ∞":
        // A closed-form solution for the limit of this recurrence relation is complicated, so
        // iterate until the value stops changing. (The 1024-step cap is an arbitrary safety
        // counter.)
        var blocks_previous: u32 = 0;
        var step: u32 = 0;
        const cycle_blocks_max = while (step < 1024) : (step += 1) {
            const blocks_next = full_blocks_max +
                append_blocks_max * stdx.div_ceil(blocks_previous, compact_blocks_max);

            if (blocks_next == blocks_previous) break blocks_next;
            blocks_previous = blocks_next;
        } else {
            // No fixed point within the step budget: `compact_extra_blocks`
            // (see constants.lsm_manifest_compact_extra_blocks) should probably be raised.
            @panic("ManifestLog.Pace.log_blocks_cycle_max: no convergence");
        };

        // Appends that can accumulate while the first `T + 1` blocks are being compacted.
        const burst_blocks_max = append_blocks_max *
            stdx.div_ceil(full_blocks_max + 1, compact_blocks_max);

        // "limit of MB(b) as b approaches ∞":
        const blocks_max = cycle_blocks_max + burst_blocks_max;

        assert(full_blocks_max < cycle_blocks_max);
        assert(blocks_max > cycle_blocks_max);

        return .{
            .half_bar_append_blocks_max = append_blocks_max,
            .half_bar_compact_blocks_max = compact_blocks_max,
            .log_blocks_full_max = full_blocks_max,
            .log_blocks_cycle_max = cycle_blocks_max,
            .log_blocks_max = blocks_max,
            .tables_max = options.tables_max,
        };
    }

    /// Returns the number of manifest blocks to compact in a half-bar.
    /// The result is never more than `half_bar_compact_blocks_max`.
    ///
    /// Asserts that `tables_count` does not exceed `tables_max`.
    fn half_bar_compact_blocks(pace: Pace, options: struct {
        /// How many manifest blocks exist right now.
        log_blocks_count: u32,
        /// How many tables are live.
        tables_count: u32,
    }) u32 {
        assert(options.tables_count <= pace.tables_max);

        // Count an additional half_bar_append_blocks_max blocks on top of the current ones, so
        // that the switch to the maximum compaction rate always happens before the cycle-max is
        // exceeded.
        const blocks_padded = options.log_blocks_count + pace.half_bar_append_blocks_max;
        if (blocks_padded >= pace.log_blocks_cycle_max) {
            return pace.half_bar_compact_blocks_max;
        }

        // There are enough free manifest blocks to get through a whole "cycle" without
        // compacting any. The exact amount of compaction is not critical here, so spread the
        // work evenly by keeping a constant load factor relative to the cycle-max.

        // The "target" block count scales the cycle-max by the fraction of tables in use
        // (tables_count / tables_max), with a floor of 1 block.
        const blocks_target = @max(1, @divFloor(
            pace.log_blocks_cycle_max * options.tables_count,
            pace.tables_max,
        ));

        const blocks_scaled = @divFloor(
            pace.half_bar_compact_blocks_max * options.log_blocks_count,
            blocks_target,
        );
        return @min(pace.half_bar_compact_blocks_max, blocks_scaled);
    }
};