tigerbeetle 0.0.34 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/tb_client/extconf.rb +13 -13
  4. data/ext/tb_client/tigerbeetle/LICENSE +177 -0
  5. data/ext/tb_client/tigerbeetle/build.zig +2327 -0
  6. data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
  7. data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
  8. data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
  9. data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
  10. data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
  11. data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
  12. data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
  13. data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
  14. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
  15. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
  16. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
  17. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
  18. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
  19. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
  20. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
  21. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
  22. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
  23. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
  24. data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
  25. data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
  26. data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
  27. data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
  28. data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
  29. data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
  30. data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
  31. data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
  32. data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
  33. data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
  34. data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
  35. data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
  36. data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
  37. data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
  38. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
  39. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
  40. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
  41. data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
  42. data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
  43. data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
  44. data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
  45. data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
  46. data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
  47. data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
  48. data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
  49. data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
  50. data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
  51. data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
  52. data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
  53. data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
  54. data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
  55. data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
  56. data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
  57. data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
  58. data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
  59. data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
  60. data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
  61. data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
  62. data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
  63. data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
  64. data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
  65. data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
  66. data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
  67. data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
  68. data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
  69. data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
  70. data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
  71. data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
  72. data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
  73. data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
  74. data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
  75. data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
  76. data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
  77. data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
  78. data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
  79. data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
  80. data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
  81. data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
  82. data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
  83. data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
  84. data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
  85. data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
  86. data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
  87. data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
  88. data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
  89. data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
  90. data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
  91. data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
  92. data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
  93. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
  94. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
  95. data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
  96. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
  97. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
  98. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
  99. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
  100. data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
  101. data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
  102. data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
  103. data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
  104. data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
  105. data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
  106. data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
  107. data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
  108. data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
  109. data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
  110. data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
  111. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
  112. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
  113. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
  114. data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
  115. data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
  116. data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
  117. data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
  118. data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
  119. data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
  120. data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
  121. data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
  122. data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
  123. data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
  124. data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
  125. data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
  126. data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
  127. data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
  128. data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
  129. data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
  130. data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
  131. data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
  132. data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
  133. data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
  134. data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
  135. data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
  136. data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
  137. data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
  138. data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
  139. data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
  140. data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
  141. data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
  142. data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
  143. data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
  144. data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
  145. data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
  146. data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
  147. data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
  148. data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
  149. data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
  150. data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
  151. data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
  152. data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
  153. data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
  154. data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
  155. data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
  156. data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
  157. data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
  158. data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
  159. data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
  160. data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
  161. data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
  162. data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
  163. data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
  164. data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
  165. data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
  166. data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
  167. data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
  168. data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
  169. data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
  170. data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
  171. data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
  172. data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
  173. data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
  174. data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
  175. data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
  176. data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
  177. data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
  178. data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
  179. data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
  180. data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
  181. data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
  182. data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
  183. data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
  184. data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
  185. data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
  186. data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
  187. data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
  188. data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
  189. data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
  190. data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
  191. data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
  192. data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
  193. data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
  194. data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
  195. data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
  196. data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
  197. data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
  198. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
  199. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
  200. data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
  201. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
  202. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
  203. data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
  204. data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
  205. data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
  206. data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
  207. data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
  208. data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
  209. data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
  210. data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
  211. data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
  212. data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
  213. data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
  214. data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
  215. data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
  216. data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
  217. data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
  218. data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
  219. data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
  220. data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
  221. data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
  222. data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
  223. data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
  224. data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
  225. data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
  226. data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
  227. data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
  228. data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
  229. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
  230. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
  231. data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
  232. data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
  233. data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
  234. data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
  235. data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
  236. data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
  237. data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
  238. data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
  239. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
  240. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
  241. data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
  242. data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
  243. data/lib/tb_client/shared_lib.rb +12 -5
  244. data/lib/tigerbeetle/client.rb +1 -1
  245. data/lib/tigerbeetle/platforms.rb +9 -0
  246. data/lib/tigerbeetle/version.rb +2 -2
  247. data/tigerbeetle.gemspec +22 -5
  248. metadata +242 -3
  249. data/ext/tb_client/pkg.tar.gz +0 -0
@@ -0,0 +1,1119 @@
1
+ const std = @import("std");
2
+ const builtin = @import("builtin");
3
+ const assert = std.debug.assert;
4
+ const maybe = stdx.maybe;
5
+ const mem = std.mem;
6
+ const log = std.log.scoped(.forest);
7
+
8
+ const stdx = @import("stdx");
9
+ const constants = @import("../constants.zig");
10
+
11
+ const schema = @import("schema.zig");
12
+ const GridType = @import("../vsr/grid.zig").GridType;
13
+ const NodePool = @import("node_pool.zig").NodePoolType(constants.lsm_manifest_node_size, 16);
14
+ const ManifestLogType = @import("manifest_log.zig").ManifestLogType;
15
+ const ManifestLogPace = @import("manifest_log.zig").Pace;
16
+
17
+ const ScratchMemory = @import("scratch_memory.zig").ScratchMemory;
18
+ const ScanBufferPool = @import("scan_buffer.zig").ScanBufferPool;
19
+ const ResourcePoolType = @import("compaction.zig").ResourcePoolType;
20
+ const snapshot_min_for_table_output = @import("compaction.zig").snapshot_min_for_table_output;
21
+ const compaction_op_min = @import("compaction.zig").compaction_op_min;
22
+ const compaction_block_count_beat_min = @import("compaction.zig").compaction_block_count_beat_min;
23
+ const compaction_input_tables_max = @import("compaction.zig").compaction_tables_input_max;
24
+
25
+ /// The maximum number of tables for the forest as a whole. This is set a bit backwards due to how
26
+ /// the code is structured: a single tree should be able to use all the tables in the forest, so the
27
+ /// table_count_max of the forest is equal to the table_count_max of a single tree.
28
+ /// In future, Forest.table_count_max could exceed Tree.table_count_max.
29
+ pub const table_count_max = @import("tree.zig").table_count_max;
30
+
31
+ pub fn ForestType(comptime _Storage: type, comptime groove_cfg: anytype) type {
32
+ const groove_count = std.meta.fields(@TypeOf(groove_cfg)).len;
33
+ var groove_fields: [groove_count]std.builtin.Type.StructField = undefined;
34
+ var groove_options_fields: [groove_count]std.builtin.Type.StructField = undefined;
35
+
36
+ for (std.meta.fields(@TypeOf(groove_cfg)), 0..) |field, i| {
37
+ const Groove = @field(groove_cfg, field.name);
38
+ groove_fields[i] = .{
39
+ .name = field.name,
40
+ .type = Groove,
41
+ .default_value_ptr = null,
42
+ .is_comptime = false,
43
+ .alignment = @alignOf(Groove),
44
+ };
45
+
46
+ groove_options_fields[i] = .{
47
+ .name = field.name,
48
+ .type = Groove.Options,
49
+ .default_value_ptr = null,
50
+ .is_comptime = false,
51
+ .alignment = @alignOf(Groove),
52
+ };
53
+ }
54
+
55
+ const _Grooves = @Type(.{
56
+ .@"struct" = .{
57
+ .layout = .auto,
58
+ .fields = &groove_fields,
59
+ .decls = &.{},
60
+ .is_tuple = false,
61
+ },
62
+ });
63
+
64
+ const _GroovesOptions = @Type(.{
65
+ .@"struct" = .{
66
+ .layout = .auto,
67
+ .fields = &groove_options_fields,
68
+ .decls = &.{},
69
+ .is_tuple = false,
70
+ },
71
+ });
72
+
73
+ {
74
+ // Verify that every tree id is unique.
75
+ comptime var ids: []const u16 = &.{};
76
+
77
+ inline for (std.meta.fields(_Grooves)) |groove_field| {
78
+ const Groove = groove_field.type;
79
+
80
+ for (std.meta.fields(@TypeOf(Groove.config.ids))) |field| {
81
+ const id = @field(Groove.config.ids, field.name);
82
+ assert(id > 0);
83
+ assert(std.mem.indexOfScalar(u16, ids, id) == null);
84
+
85
+ ids = ids ++ [_]u16{id};
86
+ }
87
+ }
88
+ }
89
+
90
+ const TreeInfo = struct {
91
+ Tree: type,
92
+ tree_name: []const u8,
93
+ tree_id: u16,
94
+ groove_name: []const u8,
95
+ groove_tree: union(enum) { objects, ids, indexes: []const u8 },
96
+ };
97
+
98
+ // Invariants:
99
+ // - tree_infos[tree_id - tree_id_range.min].tree_id == tree_id
100
+ // - tree_infos.len == tree_id_range.max - tree_id_range.min + 1
101
+ const _tree_infos = tree_infos: {
102
+ @setEvalBranchQuota(32_000);
103
+
104
+ var tree_infos: []const TreeInfo = &[_]TreeInfo{};
105
+ for (std.meta.fields(_Grooves)) |groove_field| {
106
+ const Groove = groove_field.type;
107
+
108
+ tree_infos = tree_infos ++ &[_]TreeInfo{.{
109
+ .Tree = Groove.ObjectTree,
110
+ .tree_name = groove_field.name,
111
+ .tree_id = @field(Groove.config.ids, "timestamp"),
112
+ .groove_name = groove_field.name,
113
+ .groove_tree = .objects,
114
+ }};
115
+
116
+ if (Groove.IdTree != void) {
117
+ tree_infos = tree_infos ++ &[_]TreeInfo{.{
118
+ .Tree = Groove.IdTree,
119
+ .tree_name = groove_field.name ++ ".id",
120
+ .tree_id = @field(Groove.config.ids, "id"),
121
+ .groove_name = groove_field.name,
122
+ .groove_tree = .ids,
123
+ }};
124
+ }
125
+
126
+ for (std.meta.fields(Groove.IndexTrees)) |tree_field| {
127
+ tree_infos = tree_infos ++ &[_]TreeInfo{.{
128
+ .Tree = tree_field.type,
129
+ .tree_name = groove_field.name ++ "." ++ tree_field.name,
130
+ .tree_id = @field(Groove.config.ids, tree_field.name),
131
+ .groove_name = groove_field.name,
132
+ .groove_tree = .{ .indexes = tree_field.name },
133
+ }};
134
+ }
135
+ }
136
+
137
+ var tree_id_min = std.math.maxInt(u16);
138
+ for (tree_infos) |tree_info| tree_id_min = @min(tree_id_min, tree_info.tree_id);
139
+
140
+ var tree_infos_sorted: [tree_infos.len]TreeInfo = undefined;
141
+ var tree_infos_set: stdx.BitSetType(tree_infos.len) = .{};
142
+ for (tree_infos) |tree_info| {
143
+ const tree_index = tree_info.tree_id - tree_id_min;
144
+ assert(!tree_infos_set.is_set(tree_index));
145
+
146
+ tree_infos_sorted[tree_index] = tree_info;
147
+ tree_infos_set.set(tree_index);
148
+ }
149
+
150
+ // There are no gaps in the tree ids.
151
+ assert(tree_infos_set.full());
152
+
153
+ break :tree_infos tree_infos_sorted;
154
+ };
155
+
156
+ const _TreeID = comptime tree_id: {
157
+ var fields: [_tree_infos.len]std.builtin.Type.EnumField = undefined;
158
+ for (_tree_infos, 0..) |tree_info, i| {
159
+ fields[i] = .{
160
+ .name = @ptrCast(tree_info.tree_name),
161
+ .value = tree_info.tree_id,
162
+ };
163
+ }
164
+ break :tree_id @Type(.{ .@"enum" = .{
165
+ .tag_type = u16,
166
+ .fields = &fields,
167
+ .decls = &.{},
168
+ .is_exhaustive = true,
169
+ } });
170
+ };
171
+
172
+ comptime {
173
+ assert(std.enums.values(_TreeID).len == _tree_infos.len);
174
+ for (std.enums.values(_TreeID)) |tree_id| {
175
+ const tree_info = _tree_infos[@intFromEnum(tree_id) - _tree_infos[0].tree_id];
176
+ assert(tree_id == @as(_TreeID, @enumFromInt(tree_info.tree_id)));
177
+ }
178
+ }
179
+
180
+ const Grid = GridType(_Storage);
181
+
182
+ return struct {
183
+ const Forest = @This();
184
+
185
+ pub const ManifestLog = ManifestLogType(Storage);
186
+ const CompactionSchedule = CompactionScheduleType(Forest, Grid);
187
+
188
+ const Callback = *const fn (*Forest) void;
189
+
190
+ pub const Storage = _Storage;
191
+ pub const groove_config = groove_cfg;
192
+ pub const Grooves = _Grooves;
193
+ pub const GroovesOptions = _GroovesOptions;
194
+ // TreeID is an enum with a value for each tree type.
195
+ // Individual trees use `u16` to store their own id, to avoid dependency on the entire
196
+ // forest.
197
+ // Use `tree_id_cast` function to convert this type-erased u16 to a TreeID.
198
+ pub const TreeID = _TreeID;
199
+ pub const tree_infos = _tree_infos;
200
+ pub const tree_id_range = .{
201
+ .min = tree_infos[0].tree_id,
202
+ .max = tree_infos[tree_infos.len - 1].tree_id,
203
+ };
204
+
205
+ const manifest_log_compaction_pace = ManifestLogPace.init(.{
206
+ .tree_count = tree_infos.len,
207
+ // TODO Make this a runtime argument (from the CLI, derived from storage-size-max if
208
+ // possible).
209
+ .tables_max = table_count_max,
210
+ .compact_extra_blocks = constants.lsm_manifest_compact_extra_blocks,
211
+ });
212
+ pub const manifest_log_blocks_released_half_bar_max =
213
+ manifest_log_compaction_pace.half_bar_compact_blocks_max;
214
+
215
+ pub const Options = struct {
216
+ node_count: u32,
217
+ /// The number of blocks allocated for compactions. Compactions will be deterministic
218
+ /// regardless of how many blocks you give them, but will run in fewer steps with more
219
+ /// memory.
220
+ compaction_block_count: u32,
221
+
222
+ pub const compaction_block_count_min: u32 = compaction_block_count_beat_min;
223
+ };
224
+
225
+ progress: ?union(enum) {
226
+ open: struct { callback: Callback },
227
+ checkpoint: struct { callback: Callback },
228
+ compact: struct {
229
+ op: u64,
230
+ callback: Callback,
231
+ },
232
+ } = null,
233
+
234
+ compaction_progress: ?struct {
235
+ trees_done: bool,
236
+ manifest_log_done: bool,
237
+
238
/// Returns true only when both the trees and the manifest log have been marked done.
fn all_done(compaction_progress: @This()) bool {
    if (!compaction_progress.trees_done) return false;
    return compaction_progress.manifest_log_done;
}
241
+ } = null,
242
+
243
+ grid: *Grid,
244
+ grooves: Grooves,
245
+ node_pool: NodePool,
246
+ manifest_log: ManifestLog,
247
+
248
+ compaction_schedule: CompactionSchedule,
249
+
250
+ scan_buffer_pool: ScanBufferPool,
251
+
252
+ radix_buffer: ScratchMemory,
253
+
254
/// Initializes `forest` in place: the node pool, the manifest log, the radix scratch buffer,
/// every groove, the compaction schedule and the scan buffer pool, in that order.
///
/// - `options.compaction_block_count` must be at least `Options.compaction_block_count_min`.
/// - `grooves_options` holds one `Groove.Options` per groove, looked up by groove name,
///   (e.g.) .{ .transfers = .{ .cache_entries_max = 128, … }, .accounts = … }
///
/// If any step fails, the components initialized so far are deinitialized again (via the
/// `errdefer`s below) and the error is returned.
pub fn init(
    forest: *Forest,
    allocator: mem.Allocator,
    grid: *Grid,
    options: Options,
    grooves_options: GroovesOptions,
) !void {
    assert(options.compaction_block_count >= Options.compaction_block_count_min);

    // Sized as the largest `Table.value_count_max * @sizeOf(Value)` over all trees.
    // Evaluated at comptime, so it has no effect on the runtime ordering below.
    const radix_buffer_size: usize = comptime max_size: {
        var largest: usize = 0;
        for (std.enums.values(_TreeID)) |tree_id| {
            const info = _tree_infos[@intFromEnum(tree_id) - _tree_infos[0].tree_id];
            const bytes = info.Tree.Table.value_count_max * @sizeOf(info.Tree.Value);
            assert(bytes > 0);
            largest = @max(largest, bytes);
        }
        break :max_size largest;
    };

    // `progress` and `compaction_progress` take their default (`null`); the remaining
    // components are filled in one by one below.
    forest.* = .{
        .grid = grid,
        .grooves = undefined,
        .node_pool = undefined,
        .manifest_log = undefined,
        .compaction_schedule = undefined,
        .scan_buffer_pool = undefined,
        .radix_buffer = undefined,
    };

    // TODO: look into using lsm_table_size_max for the node_count.
    try forest.node_pool.init(allocator, options.node_count);
    errdefer forest.node_pool.deinit(allocator);

    try forest.manifest_log.init(allocator, grid, &manifest_log_compaction_pace);
    errdefer forest.manifest_log.deinit(allocator);

    // Number of grooves whose `init` has succeeded so far; on error only those are
    // deinitialized.
    var grooves_ready: usize = 0;
    errdefer inline for (std.meta.fields(Grooves), 0..) |groove_field, groove_index| {
        if (groove_index < grooves_ready) {
            @field(forest.grooves, groove_field.name).deinit(allocator);
        }
    };

    forest.radix_buffer = try ScratchMemory.init(allocator, radix_buffer_size);
    errdefer forest.radix_buffer.deinit(allocator);

    // Every groove receives pointers to the same node pool and the same radix buffer.
    inline for (std.meta.fields(Grooves)) |groove_field| {
        const Groove = groove_field.type;
        const groove_options: Groove.Options = @field(grooves_options, groove_field.name);

        try @field(forest.grooves, groove_field.name).init(
            allocator,
            &forest.node_pool,
            grid,
            &forest.radix_buffer,
            groove_options,
        );
        grooves_ready += 1;
    }

    try forest.compaction_schedule.init(
        allocator,
        grid,
        forest,
        options.compaction_block_count,
    );
    errdefer forest.compaction_schedule.deinit(allocator);

    try forest.scan_buffer_pool.init(allocator);
    errdefer forest.scan_buffer_pool.deinit(allocator);
}
333
+
334
/// Deinitializes every component of the forest: all grooves first, then the manifest log,
/// node pool, radix buffer, compaction schedule and scan buffer pool.
/// `allocator` is forwarded to each component's `deinit` (presumably the allocator that was
/// given to `init` — confirm against the caller).
pub fn deinit(forest: *Forest, allocator: mem.Allocator) void {
    inline for (std.meta.fields(Grooves)) |groove_field| {
        @field(forest.grooves, groove_field.name).deinit(allocator);
    }

    forest.manifest_log.deinit(allocator);
    forest.node_pool.deinit(allocator);

    forest.radix_buffer.deinit(allocator);

    forest.compaction_schedule.deinit(allocator);
    forest.scan_buffer_pool.deinit(allocator);
}
349
+
350
/// Resets the forest without releasing its components: every groove, the manifest log, the
/// scan buffer pool and the compaction schedule are reset, the lookup/scan trace events are
/// cancelled, and `progress` / `compaction_progress` fall back to their default (`null`).
pub fn reset(forest: *Forest) void {
    // Components using the node_pool must release all nodes they acquired upon reset.
    // Checked on exit, i.e. after everything below has run.
    defer assert(forest.node_pool.free.count() == forest.node_pool.free.bit_length);

    inline for (std.meta.fields(Grooves)) |groove_field| {
        const groove = &@field(forest.grooves, groove_field.name);
        groove.reset();
    }

    inline for (.{ .lookup, .lookup_worker, .scan_tree, .scan_tree_level }) |event| {
        forest.grid.trace.cancel(event);
    }

    forest.manifest_log.reset();
    forest.scan_buffer_pool.reset();
    forest.compaction_schedule.reset();

    // Re-assign the whole struct so that every field not listed here is restored to its
    // declared default.
    forest.* = .{
        // Don't reset the grid – replica is responsible for grid cancellation.
        .grid = forest.grid,
        .grooves = forest.grooves,
        .node_pool = forest.node_pool,
        .manifest_log = forest.manifest_log,
        .compaction_schedule = forest.compaction_schedule,
        .scan_buffer_pool = forest.scan_buffer_pool,
        .radix_buffer = forest.radix_buffer,
    };
}
380
+
381
/// Starts opening the forest. Requires that no other operation is in progress.
/// Records `callback` in `progress`, calls `open_commence` on every groove, and then opens
/// the manifest log with `manifest_log_open_event` / `manifest_log_open_callback` as its
/// handlers; `callback` is invoked from `manifest_log_open_callback`.
pub fn open(forest: *Forest, callback: Callback) void {
    assert(forest.progress == null);
    assert(forest.compaction_progress == null);

    forest.progress = .{ .open = .{ .callback = callback } };

    inline for (std.meta.fields(Grooves)) |groove_field| {
        const groove = &@field(forest.grooves, groove_field.name);
        groove.open_commence(&forest.manifest_log);
    }

    forest.manifest_log.open(manifest_log_open_event, manifest_log_open_callback);
}
393
+
394
/// Event handler passed to `manifest_log.open` by `Forest.open`.
///
/// Recovers the owning forest from the `manifest_log` field pointer, checks that the forest
/// is currently opening, and hands `table` to the tree identified by `table.tree_id`.
/// Panics if `table.tree_id` lies outside `tree_id_range`, i.e. the manifest references a
/// tree this forest does not know about.
fn manifest_log_open_event(
    manifest_log: *ManifestLog,
    table: *const schema.ManifestNode.TableInfo,
) void {
    const forest: *Forest = @fieldParentPtr("manifest_log", manifest_log);
    assert(forest.progress.? == .open);
    assert(forest.compaction_progress == null);
    assert(table.label.level < constants.lsm_levels);
    assert(table.label.event != .remove);

    if (table.tree_id < tree_id_range.min or table.tree_id > tree_id_range.max) {
        log.err("manifest_log_open_event: unknown table in manifest: {}", .{table});
        @panic("Forest.manifest_log_open_event: unknown table in manifest");
    }
    switch (tree_id_cast(table.tree_id)) {
        // `inline else` makes `tree_id` comptime-known in each prong, which is what lets it
        // select the concrete tree type.
        inline else => |tree_id| {
            // The pointer itself is never reassigned, so it is a `const`.
            const tree: *TreeForIdType(tree_id) = forest.tree_for_id(tree_id);
            tree.open_table(table);
        },
    }
}
415
+
416
/// Completion handler passed to `ManifestLog.open()`.
///
/// Verifies the recovered tables, completes opening on every groove, verifies the table
/// extents, and finally invokes the callback that was passed to `open()`.
fn manifest_log_open_callback(manifest_log: *ManifestLog) void {
    const forest: *Forest = @fieldParentPtr("manifest_log", manifest_log);
    assert(forest.progress.? == .open);
    assert(forest.compaction_progress == null);

    forest.verify_tables_recovered();
    inline for (std.meta.fields(Grooves)) |groove_field| {
        const groove = &@field(forest.grooves, groove_field.name);
        groove.open_complete();
    }
    forest.verify_table_extents();

    // Clear the progress before invoking the callback.
    const on_opened = forest.progress.?.open.callback;
    forest.progress = null;
    on_opened(forest);
}
431
+
432
/// Runs one beat of compaction for `op`.
///
/// Tree compaction (driven by the compaction schedule) and manifest log compaction run in
/// parallel and are joined in `compact_finish`, which invokes `callback`.
pub fn compact(forest: *Forest, callback: Callback, op: u64) void {
    const half_bar_beats = @divExact(constants.lsm_compaction_ops, 2);
    const beat = op % constants.lsm_compaction_ops;

    const is_first_beat = beat == 0;
    const is_last_half_beat = beat == half_bar_beats - 1;
    const is_half_beat = beat == half_bar_beats;
    const is_last_beat = beat == constants.lsm_compaction_ops - 1;

    // A beat can play at most one of these four roles.
    const beat_roles = [_]bool{ is_first_beat, is_last_half_beat, is_half_beat, is_last_beat };
    var beat_roles_matched: usize = 0;
    for (beat_roles) |matched| beat_roles_matched += @intFromBool(matched);
    assert(beat_roles_matched <= 1);

    log.debug("entering forest.compact() op={} constants.lsm_compaction_ops={} " ++
        "first_beat={} last_half_beat={} half_beat={} last_beat={}", .{
        op,
        constants.lsm_compaction_ops,
        is_first_beat,
        is_last_half_beat,
        is_half_beat,
        is_last_beat,
    });

    assert(forest.progress == null);
    forest.progress = .{ .compact = .{ .op = op, .callback = callback } };

    // Manifest log compaction only runs on the last beat of each half-bar. On every other
    // beat that side of the join is complete from the start.
    const half_bar_end = is_last_beat or is_last_half_beat;

    // Run trees and manifest log compaction in parallel, join in compact_finish.
    assert(forest.compaction_progress == null);
    forest.compaction_progress = .{
        .trees_done = false,
        .manifest_log_done = !half_bar_end,
    };

    // Start manifest log compaction before forest compaction for lesser fragmentation, as
    // manifest log grid reservations are much smaller than compaction's.
    // TODO: Figure out a plan wrt the pacing here. Putting it on the last beat kinda-sorta
    // balances out, because we expect to naturally do less other compaction work on the
    // last beat.
    // The first bar has no manifest compaction.
    if (half_bar_end) {
        forest.manifest_log.compact(compact_manifest_log_callback, op);
    }

    forest.compaction_schedule.beat_start(compact_trees_callback, op);
}
481
+
482
/// Invoked by the compaction schedule when the trees' share of the beat is done.
///
/// Marks the trees as done and, if the manifest log is done as well, finishes the beat.
fn compact_trees_callback(forest: *Forest) void {
    assert(forest.progress.? == .compact);
    assert(forest.compaction_progress != null);

    const join_state = &forest.compaction_progress.?;
    assert(!join_state.trees_done);
    join_state.trees_done = true;

    // Whichever side of the join completes last finishes the beat.
    if (!join_state.all_done()) return;
    forest.compact_finish();
}
492
+
493
/// Invoked by the manifest log when its compaction for the beat is done.
///
/// Marks the manifest log as done and, if the trees are done as well, finishes the beat.
fn compact_manifest_log_callback(manifest_log: *ManifestLog) void {
    const forest: *Forest = @fieldParentPtr("manifest_log", manifest_log);
    assert(forest.progress.? == .compact);
    assert(forest.compaction_progress != null);

    const join_state = &forest.compaction_progress.?;
    assert(!join_state.manifest_log_done);
    join_state.manifest_log_done = true;

    // Whichever side of the join completes last finishes the beat.
    if (!join_state.all_done()) return;
    forest.compact_finish();
}
505
+
506
/// Joins tree and manifest log compaction at the end of a beat.
///
/// At the end of each half bar the active compactions apply their changes (`bar_complete`)
/// and the manifest log compaction is ended. At the end of the bar every tree additionally
/// swaps its mutable and immutable tables. Finally the callback passed to `compact()` is
/// invoked.
fn compact_finish(forest: *Forest) void {
    assert(forest.progress.? == .compact);
    assert(forest.compaction_progress != null);

    const join_state = forest.compaction_progress.?;
    assert(join_state.trees_done);
    assert(join_state.manifest_log_done);

    const schedule = &forest.compaction_schedule;
    assert(schedule.pool.idle());
    assert(schedule.pool.blocks_acquired() <= compaction_block_count_beat_min);

    forest.verify_table_extents();

    const op = forest.progress.?.compact.op;
    const beat = op % constants.lsm_compaction_ops;
    const bar_end = beat == constants.lsm_compaction_ops - 1;
    const half_bar_end = bar_end or
        beat == @divExact(constants.lsm_compaction_ops, 2) - 1;

    // No compaction is run during the first bar, or for ops that were already compacted.
    const compaction_ran = op >= constants.lsm_compaction_ops and
        !forest.grid.superblock.working.vsr_state.op_compacted(op);

    if (compaction_ran and half_bar_end) {
        // Apply the changes of every active compaction to the manifest.
        for (0..constants.lsm_levels) |level_b| {
            if (!level_active(.{ .level_b = level_b, .op = op })) continue;

            inline for (comptime std.enums.values(Forest.TreeID)) |tree_id| {
                schedule.compaction_at(level_b, tree_id).bar_complete();
            }
        }
    }

    // Groove sync compaction - must be done after all async work for the beat completes.
    inline for (std.meta.fields(Grooves)) |groove_field| {
        const groove = &@field(forest.grooves, groove_field.name);
        groove.compact(op);
    }

    if (half_bar_end) {
        assert(schedule.bar_input_size == 0);

        // At the end of each half bar, make sure that manifest log compaction is finished.
        forest.manifest_log.compact_end();
    }

    if (bar_end) {
        // Swap the mutable and immutable tables; this must happen on the last beat,
        // regardless of pacing.
        inline for (comptime std.enums.values(TreeID)) |tree_id| {
            const tree = forest.tree_for_id(tree_id);

            log.debug("swap_mutable_and_immutable({s})", .{tree.config.name});
            const snapshot_min = snapshot_min_for_table_output(compaction_op_min(op));
            tree.swap_mutable_and_immutable(snapshot_min);

            // Ensure tables haven't overflowed.
            tree.manifest.assert_level_table_counts();
        }
    }

    // Clear all progress before handing control back to the caller.
    const on_compacted = forest.progress.?.compact.callback;
    forest.progress = null;
    forest.compaction_progress = null;

    on_compacted(forest);
}
578
+
579
/// Starts a checkpoint of the forest.
///
/// Asserts that every groove is between bars and that every tree's mutable table is empty,
/// then checkpoints the manifest log. `checkpoint_manifest_log_callback` invokes `callback`
/// once the manifest log checkpoint completes.
pub fn checkpoint(forest: *Forest, callback: Callback) void {
    assert(forest.progress == null);
    assert(forest.compaction_progress == null);
    forest.grid.assert_only_repairing();
    forest.verify_table_extents();

    forest.progress = .{ .checkpoint = .{ .callback = callback } };

    inline for (std.meta.fields(Grooves)) |groove_field| {
        const groove = &@field(forest.grooves, groove_field.name);
        groove.assert_between_bars();
    }

    inline for (comptime std.enums.values(TreeID)) |tree_id| {
        const tree = forest.tree_for_id(tree_id);

        // The last immutable table constructed before the checkpoint must not absorb any
        // mutable table, because otherwise recovering from checkpoint would construct a
        // different immutable table.
        const immutable_absorbed = tree.table_immutable.mutability.immutable.absorbed;
        assert(!immutable_absorbed);

        // The immutable table may or may not hold values; the mutable table must be empty.
        maybe(tree.table_immutable.count() > 0);
        assert(tree.table_mutable.count() == 0);
    }

    forest.manifest_log.checkpoint(checkpoint_manifest_log_callback);
}
604
+
605
/// Completion handler passed to `ManifestLog.checkpoint()`.
///
/// Re-verifies the table extents and the recovered tables, then invokes the callback that
/// was passed to `checkpoint()`.
fn checkpoint_manifest_log_callback(manifest_log: *ManifestLog) void {
    const forest: *Forest = @fieldParentPtr("manifest_log", manifest_log);
    assert(forest.progress.? == .checkpoint);
    assert(forest.compaction_progress == null);

    forest.verify_table_extents();
    forest.verify_tables_recovered();

    // Clear the progress before invoking the callback.
    const on_checkpointed = forest.progress.?.checkpoint.callback;
    forest.progress = null;
    on_checkpointed(forest);
}
616
+
617
/// Converts a raw tree id into the corresponding `TreeID` tag.
///
/// The conversion is an unchecked-by-this-function `@enumFromInt`: callers must pass an id
/// that names a `TreeID` tag.
pub fn tree_id_cast(tree_id: u16) TreeID {
    const id: TreeID = @enumFromInt(tree_id);
    return id;
}
620
+
621
/// Returns the `Tree` type registered in `tree_infos` for `tree_id`.
fn TreeForIdType(comptime tree_id: TreeID) type {
    const tree_id_int = @intFromEnum(tree_id);

    // `tree_infos` is indexed by the offset of the id from the start of the id range.
    const info = tree_infos[tree_id_int - tree_id_range.min];
    assert(info.tree_id == tree_id_int);

    return info.Tree;
}
627
+
628
/// Returns the `TreeInfo` entry registered in `tree_infos` for `tree_id`.
pub fn tree_info_for_id(comptime tree_id: TreeID) TreeInfo {
    const tree_id_int = @intFromEnum(tree_id);

    // `tree_infos` is indexed by the offset of the id from the start of the id range.
    const info = tree_infos[tree_id_int - tree_id_range.min];
    assert(info.tree_id == tree_id_int);

    return info;
}
634
+
635
/// Returns a mutable pointer to the tree identified by `tree_id`.
///
/// The tree is either the groove's object tree, its id tree, or one of its index trees, as
/// recorded in the tree's `TreeInfo`.
pub fn tree_for_id(forest: *Forest, comptime tree_id: TreeID) *TreeForIdType(tree_id) {
    const info = tree_info_for_id(tree_id);
    const groove = &@field(forest.grooves, info.groove_name);

    switch (info.groove_tree) {
        .indexes => |index_name| return &@field(groove.indexes, index_name),
        .ids => return &groove.ids,
        .objects => return &groove.objects,
    }
}
647
+
648
/// Returns a read-only pointer to the tree identified by `tree_id`.
///
/// The tree is either the groove's object tree, its id tree, or one of its index trees, as
/// recorded in the tree's `TreeInfo`.
pub fn tree_for_id_const(
    forest: *const Forest,
    comptime tree_id: TreeID,
) *const TreeForIdType(tree_id) {
    const info = tree_info_for_id(tree_id);
    const groove = &@field(forest.grooves, info.groove_name);

    switch (info.groove_tree) {
        .indexes => |index_name| return &@field(groove.indexes, index_name),
        .ids => return &groove.ids,
        .objects => return &groove.objects,
    }
}
663
+
664
/// Returns whether any level of the table's tree contains this table.
///
/// A match must agree on checksum, address, key range and `snapshot_min`; its
/// `snapshot_max` may be greater than or equal to that of `table`.
pub fn contains_table(
    forest: *const Forest,
    table: *const schema.ManifestNode.TableInfo,
) bool {
    switch (tree_id_cast(table.tree_id)) {
        inline else => |tree_id| {
            const Tree = TreeForIdType(tree_id);
            const tree = forest.tree_for_id_const(tree_id);
            const wanted = Tree.Manifest.TreeTableInfo.decode(table);

            for (&tree.manifest.levels) |manifest_level| {
                const found = manifest_level.find(&wanted) orelse continue;

                // A table that is found must match in everything except `snapshot_max`.
                assert(wanted.checksum == found.table_info.checksum);
                assert(wanted.address == found.table_info.address);
                assert(wanted.key_min == found.table_info.key_min);
                assert(wanted.key_max == found.table_info.key_max);
                assert(wanted.snapshot_min == found.table_info.snapshot_min);
                assert(wanted.snapshot_max <= found.table_info.snapshot_max);
                return true;
            }
            return false;
        },
    }
}
691
+
692
/// Verify that `ManifestLog.table_extents` has an extent for every active table.
///
/// The number of tables summed over every level of every tree must equal the number of
/// extents. When `constants.verify` is set, each table's address is also looked up.
///
/// (Invoked between beats.)
fn verify_table_extents(forest: *const Forest) void {
    const table_extents = &forest.manifest_log.table_extents;

    var tables_total: usize = 0;
    inline for (comptime std.enums.values(TreeID)) |tree_id| {
        const tree = forest.tree_for_id_const(tree_id);

        for (0..constants.lsm_levels) |level| {
            const tree_level = tree.manifest.levels[level];
            tables_total += tree_level.tables.len();

            // The per-table lookup only runs in verification builds.
            if (!constants.verify) continue;

            var level_tables = tree_level.tables.iterator_from_index(0, .ascending);
            while (level_tables.next()) |table| {
                assert(table_extents.get(table.address) != null);
            }
        }
    }
    assert(tables_total == table_extents.count());
}
712
+
713
/// Verify the tables recovered into the ManifestLevels after opening the manifest log.
///
/// The LSM's manifest levels (i.e. the list of tables) can be reconstructed from a
/// superblock manifest in two ways:
///
/// 1. Replay the manifest events in chronological order, applying each
///    insert/update/remove in sequence.
/// 2. Walk the manifest events in reverse-chronological order, ignoring events for tables
///    that have already been encountered.
///
/// Both strategies construct identical manifest levels. `ManifestLog.open()` implements
/// strategy 2 (which minimizes the number of ManifestLevel mutations); this function
/// implements strategy 1 in order to validate it.
///
/// Only runs against the testing storage, and not while the replica is syncing.
///
/// (Invoked immediately after open() or checkpoint()).
fn verify_tables_recovered(forest: *const Forest) void {
    const ForestTableIteratorType =
        @import("./forest_table_iterator.zig").ForestTableIteratorType;
    const ForestTableIterator = ForestTableIteratorType(Forest);

    assert(forest.grid.superblock.opened);
    assert(forest.manifest_log.opened);

    // The verification below reads grid blocks straight out of the testing storage.
    if (Forest.Storage != @import("../testing/storage.zig").Storage) return;

    // The manifest log is opened, which means we have all of the manifest blocks.
    // But if the replica is syncing, those blocks might still be writing (and thus not in
    // the TestStorage when we go to retrieve them).
    if (forest.grid.superblock.working.vsr_state.sync_op_max > 0) return;

    const manifest_log = &forest.manifest_log;

    // The latest version of each table, keyed by table checksum.
    // A table's entry is removed when the table has been deleted.
    var tables_latest = std.AutoHashMap(u128, struct {
        table: schema.ManifestNode.TableInfo,
        manifest_block: u64,
        manifest_entry: u32,
    }).init(forest.grid.superblock.storage.allocator);
    defer tables_latest.deinit();

    // Replay manifest events in chronological order.
    // Accumulate all tables that belong in the recovered forest's ManifestLevels.
    for (0..manifest_log.log_block_checksums.count) |block_index| {
        const block_checksum = manifest_log.log_block_checksums.get(block_index).?;
        const block_address = manifest_log.log_block_addresses.get(block_index).?;
        assert(block_address > 0);

        const block = forest.grid.superblock.storage.grid_block(block_address).?;
        const block_header = schema.header_from_block(block);
        assert(block_header.address == block_address);
        assert(block_header.checksum == block_checksum);
        assert(block_header.block_type == .manifest);

        const block_schema = schema.ManifestNode.from(block);
        assert(block_schema.entry_count > 0);
        assert(block_schema.entry_count <= schema.ManifestNode.entry_count_max);

        for (block_schema.tables_const(block), 0..) |*table, entry| {
            if (table.label.event != .remove) {
                tables_latest.put(table.checksum, .{
                    .table = table.*,
                    .manifest_block = block_address,
                    .manifest_entry = @intCast(entry),
                }) catch @panic("oom");
            } else {
                maybe(tables_latest.remove(table.checksum));
            }
        }

        // Verify the linked-list: every block except the first links to its predecessor.
        if (block_index == 0) continue;
        const block_previous = schema.ManifestNode.previous(block).?;
        assert(block_previous.checksum ==
            manifest_log.log_block_checksums.get(block_index - 1).?);
        assert(block_previous.address ==
            manifest_log.log_block_addresses.get(block_index - 1).?);
    }

    // Verify that the SuperBlock Manifest's table extents are correct.
    var extents_verified: usize = 0;
    var tables_latest_iterator = tables_latest.valueIterator();
    while (tables_latest_iterator.next()) |latest| : (extents_verified += 1) {
        const table_extent = manifest_log.table_extents.get(latest.table.address).?;
        assert(latest.manifest_block == table_extent.block);
        assert(latest.manifest_entry == table_extent.entry);
    }
    assert(extents_verified == manifest_log.table_extents.count());

    // Verify the tables in `tables_latest` are exactly the tables recovered by the Forest:
    // every forest table consumes its entry, and no entries may be left over.
    var forest_tables_iterator = ForestTableIterator{};
    while (forest_tables_iterator.next(forest)) |forest_table_item| {
        const removed = tables_latest.fetchRemove(forest_table_item.checksum).?;
        const latest = removed.value.table;

        assert(latest.label.level == forest_table_item.label.level);
        assert(std.meta.eql(latest.key_min, forest_table_item.key_min));
        assert(std.meta.eql(latest.key_max, forest_table_item.key_max));
        assert(latest.checksum == forest_table_item.checksum);
        assert(latest.address == forest_table_item.address);
        assert(latest.snapshot_min == forest_table_item.snapshot_min);
        assert(latest.snapshot_max == forest_table_item.snapshot_max);
        assert(latest.tree_id == forest_table_item.tree_id);
    }
    assert(tables_latest.count() == 0);
}
824
+
825
/// Calculates the maximum number of blocks that could be released by Tree and ManifestLog
/// compactions before a checkpoint becomes durable on a commit quorum of replicas.
///
/// A checkpoint is guaranteed to be durable when a replica commits the (pipeline + 1)th
/// prepare after checkpoint trigger (see `op_repair_min` in replica.zig for more details).
/// The bound is therefore the maximum number of blocks that the first pipeline of prepares
/// after checkpoint trigger can release.
pub fn compaction_blocks_released_per_pipeline_max() usize {
    const half_bar_ops = @divExact(constants.lsm_compaction_ops, 2);
    const pipeline_half_bars =
        stdx.div_ceil(constants.pipeline_prepare_queue_max, half_bar_ops);

    // Maximum number of blocks released within a single half-bar by compaction: for each
    // tree, half of the levels (rounded up) each release their input tables, and every
    // table counts as one block plus its value blocks.
    const levels_per_half_bar = stdx.div_ceil(constants.lsm_levels, 2);
    var half_bar_max: usize = 0;
    inline for (Forest.tree_infos) |tree_info| {
        const table_blocks_max = 1 + tree_info.Tree.Table.layout.value_block_count_max;
        half_bar_max += levels_per_half_bar * (compaction_input_tables_max * table_blocks_max);
    }

    // Compaction is paced across all beats, so if a pipeline is less than half a bar,
    // for simplicity, use the upper bound for a half a bar (treating pacing as
    // imperfect).
    const compaction_half_bars = if (pipeline_half_bars == 0) 1 else pipeline_half_bars;
    const compaction_pipeline_max = compaction_half_bars * half_bar_max;

    // Maximum number of blocks released within a pipeline by ManifestLog compactions.
    const manifest_log_pipeline_max =
        pipeline_half_bars * Forest.manifest_log_blocks_released_half_bar_max;

    return compaction_pipeline_max + manifest_log_pipeline_max;
}
864
+ };
865
+ }
866
+
867
/// Plans a bar's worth of compaction work across all the trees in the Forest, and schedules it
/// one beat at a time. Each bar is divided into two half bars with `lsm_compaction_ops/2` beats
/// each. Compactions out of even levels (0 → 1, 2 → 3, etc.) are active during the first half
/// bar, and compactions out of odd levels (immutable → 0, 1 → 2, etc.) are active during the
/// second half bar (see `level_active`).
///
/// In the description of the scheduling algorithm below, each (tree, level) combination is
/// called a `Compaction`, for example the compaction from level 0 → 1 in the Accounts tree.
///
/// At the first beat of each half bar:
/// 1. Calculate the half-bar quota for each Compaction, which is the total number of bytes that
///    the Compaction needs to chew through. This serves as an estimate of the time the
///    compaction will take, and is sliced into small chunks to spread the work evenly across
///    the beats of the half bar.
/// 2. Calculate the half-bar quota for the entire Forest by summing up those quotas.
///
/// At each beat:
/// 1. Calculate the Forest's beat quota by equally dividing the half-bar quota across the
///    remaining beats.
/// 2. Resume a suspended Compaction, or start a new one.
/// 3. Run the active Compaction until either the Forest's beat quota is met, or the
///    Compaction's half-bar quota is met. If it's the latter, go to step 2. If it's the
///    former...
/// 4. Suspend the active Compaction and finish the beat.
fn CompactionScheduleType(comptime Forest: type, comptime Grid: type) type {
    return struct {
        grid: *Grid,
        forest: *Forest,
        pool: ResourcePool,
        next_tick: Grid.NextTick = undefined,
        /// Set for the duration of a beat; invoked (on the next tick) when the beat finishes.
        callback: ?*const fn (*Forest) void = null,
        /// Input bytes that remain to be compacted during the current half bar.
        bar_input_size: u64 = 0,
        /// Input bytes that remain to be compacted during the current beat.
        beat_input_size: u64 = 0,

        const CompactionSchedule = @This();
        const ResourcePool = ResourcePoolType(Grid);

        /// Initializes the schedule in place with a resource pool of `block_count` blocks.
        pub fn init(
            self: *CompactionSchedule,
            allocator: mem.Allocator,
            grid: *Grid,
            forest: *Forest,
            block_count: u32,
        ) !void {
            assert(block_count >= compaction_block_count_beat_min);

            self.* = .{ .grid = grid, .forest = forest, .pool = undefined };

            self.pool = try ResourcePool.init(allocator, block_count);
            errdefer self.pool.deinit(allocator);
        }

        /// Releases the resource pool.
        pub fn deinit(self: *CompactionSchedule, allocator: mem.Allocator) void {
            self.pool.deinit(allocator);
        }

        /// Resets the pool and restores every other field, except `grid` and `forest`, to
        /// its default.
        pub fn reset(self: *CompactionSchedule) void {
            self.pool.reset();

            const grid = self.grid;
            const forest = self.forest;
            self.* = .{ .grid = grid, .forest = forest, .pool = self.pool };
        }

        /// Starts the beat for `op`; `callback` is invoked on a later tick once the beat's
        /// share of compaction work is complete.
        pub fn beat_start(self: *CompactionSchedule, callback: Forest.Callback, op: u64) void {
            assert(self.pool.idle());
            assert(self.pool.grid_reservation == null);

            assert(self.callback == null);
            assert(self.beat_input_size == 0);

            self.callback = callback;

            // There is nothing to compact during the first bar, or for ops that have
            // already been compacted.
            const compaction_skipped = op < constants.lsm_compaction_ops or
                self.grid.superblock.working.vsr_state.op_compacted(op);
            if (compaction_skipped) {
                self.beat_finish();
                return;
            }

            const half_bar_beats = @divExact(constants.lsm_compaction_ops, 2);
            const beat = op % constants.lsm_compaction_ops;
            const beats_done = beat % half_bar_beats;

            // First beat of a half bar: commence every active compaction and sum up the
            // input sizes into the half-bar quota.
            if (beats_done == 0) {
                assert(self.pool.blocks_acquired() == 0);
                assert(self.bar_input_size == 0);

                for (0..constants.lsm_levels) |level_b| {
                    if (!level_active(.{ .level_b = level_b, .op = op })) continue;

                    inline for (comptime std.enums.values(Forest.TreeID)) |tree_id| {
                        const Tree = Forest.tree_info_for_id(tree_id).Tree;
                        const compaction = self.compaction_at(level_b, tree_id);

                        assert(
                            self.pool.blocks_free() >=
                                // Input index & value blocks may be carried to the next beat.
                                compaction.level_a_index_block.buffer.len +
                                    compaction.level_a_value_block.buffer.len +
                                    compaction.level_b_index_block.buffer.len +
                                    compaction.level_b_value_block.buffer.len +
                                    // At least one output index & value block.
                                    (1 + 1),
                        );

                        const bar_input_values = compaction.bar_commence(op);
                        self.bar_input_size += (bar_input_values * @sizeOf(Tree.Value));
                    }
                }
            }

            // Spread the remaining half-bar quota evenly over the remaining beats.
            const beats_remaining = half_bar_beats - beats_done;
            self.beat_input_size = stdx.div_ceil(self.bar_input_size, beats_remaining);

            // This is akin to a dry run for the actual compaction work that is going to happen
            // during this beat, wherein we:
            // * Invoke beat_commence on the active compactions to set beat quotas
            // * Reserve blocks in the grid for the output of these compactions
            {
                // 1 since we may have partially finished index/value blocks from the previous
                // beat.
                var index_blocks_max: u64 = 1;
                var value_blocks_max: u64 = 1;

                var input_size_unassigned = self.beat_input_size;
                for (0..constants.lsm_levels) |level_b| {
                    if (!level_active(.{ .level_b = level_b, .op = op })) continue;

                    inline for (comptime std.enums.values(Forest.TreeID)) |tree_id| {
                        const Tree = Forest.tree_info_for_id(tree_id).Tree;
                        const compaction = self.compaction_at(level_b, tree_id);

                        compaction.beat_commence(
                            stdx.div_ceil(input_size_unassigned, @sizeOf(Tree.Value)),
                        );
                        const beat_quota = compaction.quotas.beat;

                        // The +1 is for imperfections in pacing our immutable table, which
                        // might cause us to overshoot by a single block (limited to 1 due
                        // to how the immutable table values are consumed.)
                        value_blocks_max += stdx.div_ceil(
                            beat_quota,
                            Tree.Table.layout.block_value_count_max,
                        ) + 1;

                        index_blocks_max += stdx.div_ceil(
                            value_blocks_max,
                            Tree.Table.value_block_count_max,
                        );

                        input_size_unassigned -|= (beat_quota * @sizeOf(Tree.Value));
                    }
                }
                assert(input_size_unassigned == 0);

                self.pool.grid_reservation = self.grid.reserve(
                    value_blocks_max + index_blocks_max,
                );
            }

            self.beat_resume();
        }

        /// Dispatches the active compactions in order until one of them is pending, or
        /// finishes the beat once the beat quota has been consumed.
        fn beat_resume(self: *CompactionSchedule) void {
            assert(self.callback != null);

            if (self.beat_input_size == 0) {
                self.beat_finish();
                return;
            }
            assert(self.pool.grid_reservation != null);

            const op = self.forest.progress.?.compact.op;

            for (0..constants.lsm_levels) |level_b| {
                if (!level_active(.{ .level_b = level_b, .op = op })) continue;

                inline for (comptime std.enums.values(Forest.TreeID)) |tree_id| {
                    const compaction = self.compaction_at(level_b, tree_id);

                    switch (compaction.compaction_dispatch_enter(.{
                        .pool = &self.pool,
                        .callback = beat_resume_callback,
                    })) {
                        .ready => {},
                        // `beat_resume_callback` re-enters `beat_resume` later.
                        .pending => return,
                    }
                }
            }
        }

        /// Invoked with the number of values a compaction of tree `tree_id` consumed;
        /// deducts the consumed bytes from the quotas and resumes the beat.
        fn beat_resume_callback(pool: *ResourcePool, tree_id: u16, values_consumed: u64) void {
            const self: *CompactionSchedule = @fieldParentPtr("pool", pool);
            assert(self.callback != null);

            const value_size: u64 = switch (Forest.tree_id_cast(tree_id)) {
                inline else => |id| @sizeOf(Forest.tree_info_for_id(id).Tree.Value),
            };
            const input_bytes_consumed = values_consumed * value_size;

            // The half-bar quota is exact; the beat quota may be overshot, so it saturates.
            self.bar_input_size -= input_bytes_consumed;
            self.beat_input_size -|= input_bytes_consumed;

            self.beat_resume();
        }

        /// Forfeits the beat's grid reservation (if any) and schedules the beat's callback
        /// for the next tick.
        fn beat_finish(self: *CompactionSchedule) void {
            assert(self.callback != null);
            assert(self.beat_input_size == 0);

            if (self.pool.grid_reservation) |reservation| {
                self.grid.forfeit(reservation);
                self.pool.grid_reservation = null;
            }

            self.grid.on_next_tick(beat_finish_next_tick, &self.next_tick);
        }

        /// Clears the beat's callback and then invokes it with the forest.
        fn beat_finish_next_tick(next_tick: *Grid.NextTick) void {
            const self: *CompactionSchedule = @alignCast(
                @fieldParentPtr("next_tick", next_tick),
            );

            const on_beat_finished = self.callback.?;
            self.callback = null;
            on_beat_finished(self.forest);
        }

        /// Returns the compaction of tree `tree_id` that is stored at index `level_b`.
        fn compaction_at(
            self: *CompactionSchedule,
            level_b: usize,
            comptime tree_id: Forest.TreeID,
        ) *Forest.TreeForIdType(tree_id).Compaction {
            const tree = self.forest.tree_for_id(tree_id);
            return &tree.compactions[level_b];
        }
    };
}
1104
+
1105
/// Returns whether the compaction into `level_b` is active at `op`.
///
/// Compactions into odd levels are active during the first half of the bar, and
/// compactions into even levels during the second half.
fn level_active(options: struct { level_b: usize, op: u64 }) bool {
    const beat = options.op % constants.lsm_compaction_ops;
    const first_half_bar = beat < @divExact(constants.lsm_compaction_ops, 2);
    const level_b_odd = options.level_b % 2 != 0;

    return if (first_half_bar) level_b_odd else !level_b_odd;
}
1110
+
1111
test level_active {
    // The first beat of a bar, and the first beat of the second half of a bar.
    const bar_first_beat = constants.lsm_compaction_ops;
    const bar_half_beat = @divExact(constants.lsm_compaction_ops, 2);

    const cases = [_]struct { level_b: usize, op: u64, active: bool }{
        .{ .level_b = 0, .op = bar_first_beat, .active = false },
        .{ .level_b = 1, .op = bar_first_beat, .active = true },
        .{ .level_b = 2, .op = bar_first_beat, .active = false },

        .{ .level_b = 0, .op = bar_half_beat, .active = true },
        .{ .level_b = 1, .op = bar_half_beat, .active = false },
        .{ .level_b = 2, .op = bar_half_beat, .active = true },
    };

    for (cases) |case| {
        assert(level_active(.{ .level_b = case.level_b, .op = case.op }) == case.active);
    }
}