tigerbeetle 0.0.34 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/tb_client/extconf.rb +13 -13
  4. data/ext/tb_client/tigerbeetle/LICENSE +177 -0
  5. data/ext/tb_client/tigerbeetle/build.zig +2327 -0
  6. data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
  7. data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
  8. data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
  9. data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
  10. data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
  11. data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
  12. data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
  13. data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
  14. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
  15. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
  16. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
  17. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
  18. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
  19. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
  20. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
  21. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
  22. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
  23. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
  24. data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
  25. data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
  26. data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
  27. data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
  28. data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
  29. data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
  30. data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
  31. data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
  32. data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
  33. data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
  34. data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
  35. data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
  36. data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
  37. data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
  38. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
  39. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
  40. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
  41. data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
  42. data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
  43. data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
  44. data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
  45. data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
  46. data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
  47. data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
  48. data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
  49. data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
  50. data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
  51. data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
  52. data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
  53. data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
  54. data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
  55. data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
  56. data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
  57. data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
  58. data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
  59. data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
  60. data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
  61. data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
  62. data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
  63. data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
  64. data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
  65. data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
  66. data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
  67. data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
  68. data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
  69. data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
  70. data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
  71. data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
  72. data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
  73. data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
  74. data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
  75. data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
  76. data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
  77. data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
  78. data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
  79. data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
  80. data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
  81. data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
  82. data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
  83. data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
  84. data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
  85. data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
  86. data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
  87. data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
  88. data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
  89. data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
  90. data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
  91. data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
  92. data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
  93. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
  94. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
  95. data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
  96. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
  97. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
  98. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
  99. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
  100. data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
  101. data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
  102. data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
  103. data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
  104. data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
  105. data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
  106. data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
  107. data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
  108. data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
  109. data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
  110. data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
  111. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
  112. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
  113. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
  114. data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
  115. data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
  116. data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
  117. data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
  118. data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
  119. data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
  120. data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
  121. data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
  122. data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
  123. data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
  124. data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
  125. data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
  126. data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
  127. data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
  128. data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
  129. data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
  130. data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
  131. data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
  132. data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
  133. data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
  134. data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
  135. data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
  136. data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
  137. data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
  138. data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
  139. data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
  140. data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
  141. data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
  142. data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
  143. data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
  144. data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
  145. data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
  146. data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
  147. data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
  148. data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
  149. data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
  150. data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
  151. data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
  152. data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
  153. data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
  154. data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
  155. data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
  156. data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
  157. data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
  158. data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
  159. data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
  160. data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
  161. data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
  162. data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
  163. data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
  164. data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
  165. data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
  166. data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
  167. data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
  168. data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
  169. data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
  170. data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
  171. data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
  172. data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
  173. data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
  174. data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
  175. data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
  176. data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
  177. data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
  178. data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
  179. data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
  180. data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
  181. data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
  182. data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
  183. data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
  184. data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
  185. data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
  186. data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
  187. data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
  188. data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
  189. data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
  190. data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
  191. data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
  192. data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
  193. data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
  194. data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
  195. data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
  196. data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
  197. data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
  198. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
  199. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
  200. data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
  201. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
  202. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
  203. data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
  204. data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
  205. data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
  206. data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
  207. data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
  208. data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
  209. data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
  210. data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
  211. data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
  212. data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
  213. data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
  214. data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
  215. data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
  216. data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
  217. data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
  218. data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
  219. data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
  220. data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
  221. data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
  222. data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
  223. data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
  224. data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
  225. data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
  226. data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
  227. data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
  228. data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
  229. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
  230. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
  231. data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
  232. data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
  233. data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
  234. data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
  235. data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
  236. data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
  237. data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
  238. data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
  239. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
  240. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
  241. data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
  242. data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
  243. data/lib/tb_client/shared_lib.rb +12 -5
  244. data/lib/tigerbeetle/client.rb +1 -1
  245. data/lib/tigerbeetle/platforms.rb +9 -0
  246. data/lib/tigerbeetle/version.rb +2 -2
  247. data/tigerbeetle.gemspec +22 -5
  248. metadata +242 -3
  249. data/ext/tb_client/pkg.tar.gz +0 -0
@@ -0,0 +1,1075 @@
1
+ //! TigerBeetle replication routing protocol.
2
+ //!
3
+ //! Eight fallacies of distributed computing:
4
+ //!
5
+ //! 1. The network is reliable;
6
+ //! 2. Latency is zero;
7
+ //! 3. Bandwidth is infinite;
8
+ //! 4. The network is secure;
9
+ //! 5. Topology doesn't change;
10
+ //! 6. There is one administrator;
11
+ //! 7. Transport cost is zero;
12
+ //! 8. The network is homogeneous;
13
+ //!
14
+ //! Robust tail principle:
15
+ //! - The same code should handle slow nodes and crashed nodes.
16
+ //! - A single crashed node should not cause retries and risk metastability.
17
+ //!
18
+ //! Algorithm:
19
+ //!
20
+ //! The replication route is V-shaped. Primary is in the middle, and it tosses each prepare at two
21
+ //! of its neighbors. Backups forward prepares to at most one further backup. Neighbors of the
22
+ //! primary have at least one more neighbor (in a six-replica cluster). If any single node fails,
23
+ //! the primary still gets a replication quorum. If the primary and backups disagree about the
24
+ //! replication route, the primary still gets a replication quorum.
25
+ //!
26
+ //! Because topology changes, routes are dynamic. The primary broadcasts the current route in the
27
+ //! ping message. It's enough if the routing information is only eventually consistent.
28
+ //!
29
+ //! To select the best route, the primary uses an outcome-focused explore-exploit approach. Every once in a
30
+ //! while the primary tries an alternative route. The primary captures replication latency for a
31
+ //! route (that is, the arrival time of prepare_ok messages). If the latency for an alternative
32
+ //! route is sufficiently better than current latency, the route is switched. Note that latency
33
+ //! includes both network and disk latency.
34
+ //!
35
+ //! The experiment schedule is defined randomly. All replicas share the same RNG seed, so no
36
+ //! coordination is needed to launch an experiment!
37
+ //!
38
+ //! To remove outliers, the experiment takes two ops. op and op^1 are either both experimental, or
39
+ //! both non-experimental. Experimental route cost is the average of two costs. Active route cost
40
+ //! is maintained as an exponentially weighted moving average.
41
+ const std = @import("std");
42
+ const assert = std.debug.assert;
43
+ const maybe = stdx.maybe;
44
+ const constants = @import("../constants.zig");
45
+ const stdx = @import("stdx");
46
+ const Ratio = stdx.PRNG.Ratio;
47
+ const ratio = stdx.PRNG.ratio;
48
+ const Instant = stdx.Instant;
49
+ const Duration = stdx.Duration;
50
+
51
+ const history_max = constants.pipeline_prepare_queue_max * 2;
52
+
53
+ // This constant serves two purposes:
54
+ // - First, as a cap for all latencies, to make sure that u64 computations can't overflow.
55
+ // - Second, to simplify cost function computations, a replica we haven't heard from is assigned this latency.
56
+ const latency_max: Duration = .{ .ns = 10 * std.time.ns_per_hour };
57
+
58
+ replica: u8,
59
+ replica_count: u8,
60
+ standby_count: u8,
61
+ view: u32,
62
+
63
+ /// The route we currently use.
64
+ a: Route,
65
+ a_cost: ?Cost, // Computed as exponential weighted moving average.
66
+
67
+ /// The best alternative, which might be better or worse than a.
68
+ b: Route,
69
+ b_cost: ?Cost, // Computed as an average of two experiments.
70
+
71
+ experiment_chance: Ratio = ratio(1, 20),
72
+
73
+ history: [history_max]OpHistory,
74
+
75
+ /// A permutation of replicas, where the middle replica is the primary.
76
+ pub const Route = struct {
77
+ replicas: [constants.replicas_max]u8,
78
+ count: u8,
79
+
80
+ // 0 1 2 ... (replica_count - 1)
81
+ fn trivial(replica_count: u8) Route {
82
+ var route: Route = .{ .replicas = undefined, .count = 0 };
83
+ for (0..replica_count) |replica| {
84
+ route.push(@intCast(replica));
85
+ }
86
+ return route;
87
+ }
88
+
89
+ fn push(route: *Route, replica: u8) void {
90
+ assert(route.count < constants.replicas_max);
91
+ route.replicas[route.count] = replica;
92
+ route.count += 1;
93
+ }
94
+
95
+ // Default route for a new view.
96
+ // The underlying cause for a view change is likely an abrupt topology change. Historical
97
+ // data collected so far is of little use and the cluster switches to a deterministic route.
98
+ pub fn view_default(view: u32, replica_count: u8) Route {
99
+ var route: Route = .trivial(replica_count);
100
+ // Rotate primary to the midpoint;
101
+ const primary_index: u8 = @intCast(view % replica_count);
102
+ const midpoint = @divFloor(replica_count, 2);
103
+ const rotation = (replica_count + primary_index - midpoint) % replica_count;
104
+ std.mem.rotate(u8, route.replicas[0..route.count], rotation);
105
+ assert(route.replicas[midpoint] == primary_index);
106
+
107
+ assert(route.valid(view, replica_count));
108
+ return route;
109
+ }
110
+
111
+ // A random route.
112
+ // Every so often replicas try using a random route, in case it turns out to be better than
113
+ // the current one.
114
+ pub fn random(prng: *stdx.PRNG, view: u32, replica_count: u8) Route {
115
+ var route: Route = .trivial(replica_count);
116
+ prng.shuffle(u8, route.replicas[0..route.count]);
117
+
118
+ const primary_index: u8 = @intCast(view % replica_count);
119
+ const midpoint = @divFloor(replica_count, 2);
120
+ const primary_position = std.mem.indexOfScalar(
121
+ u8,
122
+ route.replicas[0..route.count],
123
+ primary_index,
124
+ ).?;
125
+ std.mem.swap(
126
+ u8,
127
+ &route.replicas[primary_position],
128
+ &route.replicas[midpoint],
129
+ );
130
+
131
+ assert(route.valid(view, replica_count));
132
+ return route;
133
+ }
134
+
135
+ // Check that the route is a permutation, and that the primary is in the middle.
136
+ pub fn valid(route: *const Route, view: u32, replica_count: u8) bool {
137
+ if (route.count != replica_count) return false;
138
+
139
+ for (route.replicas[0..route.count]) |replica| {
140
+ if (replica >= replica_count) return false;
141
+ }
142
+ for (0..replica_count) |i| {
143
+ for (0..i) |j| {
144
+ if (route.replicas[i] == route.replicas[j]) return false;
145
+ }
146
+ }
147
+
148
+ const primary_index: u8 = @intCast(view % replica_count);
149
+ const midpoint = @divFloor(replica_count, 2);
150
+ if (route.replicas[midpoint] != primary_index) return false;
151
+
152
+ return true;
153
+ }
154
+
155
+ // Encode a route as a u64.
156
+ // Routes are communicated in pings, which have u64 space in the message header.
157
+ pub fn encode(route: Route) u64 {
158
+ comptime assert(constants.replicas_max <= @sizeOf(u64));
159
+ var code: u64 = 0;
160
+ for (0..@sizeOf(u64)) |index| {
161
+ const byte: u64 = if (index < route.count)
162
+ route.replicas[index]
163
+ else
164
+ @as(u8, 0xFF);
165
+ const shift: u6 = @bitSizeOf(u8) * @as(u6, @intCast(index));
166
+ code |= byte << shift;
167
+ }
168
+ assert(code != 0);
169
+ return code;
170
+ }
171
+
172
+ pub fn decode(code: u64, view: u32, replica_count: u8) ?Route {
173
+ var route: Route = .{
174
+ .replicas = undefined,
175
+ .count = 0,
176
+ };
177
+ for (0..@sizeOf(u64)) |index| {
178
+ const shift: u6 = @bitSizeOf(u8) * @as(u6, @intCast(index));
179
+ const byte: u64 = (code >> shift) & 0xFF;
180
+ if (index < replica_count) {
181
+ if (byte < replica_count) {
182
+ route.push(@intCast(byte));
183
+ } else {
184
+ return null;
185
+ }
186
+ } else {
187
+ if (byte == 0xFF) {
188
+ // "Blanks" are filled with all ones,
189
+ } else {
190
+ return null;
191
+ }
192
+ }
193
+ }
194
+
195
+ if (!route.valid(view, replica_count)) return null;
196
+
197
+ return route;
198
+ }
199
+
200
+ pub fn equal(a: *const Route, b: *const Route) bool {
201
+ assert(a.count == b.count);
202
+ return std.mem.eql(u8, a.replicas[0..a.count], b.replicas[0..b.count]);
203
+ }
204
+
205
+ fn next_hop(route: *const Route, view: u32, replica: u8, hops: *[2]u8) []const u8 {
206
+ assert(replica < route.count);
207
+ assert(route.valid(view, route.count));
208
+
209
+ // We need to return at most two "neighbours" in replication topology.
210
+ // Assume that the route is as follows, with view=7:
211
+ //
212
+ // 0 2 1 3 5 4
213
+ // ^ primary
214
+ //
215
+ // Assume that the route is as follows, with view=7
216
+ //
217
+ // 0 2 1 3 5 4
218
+ // ^ primary
219
+ //
220
+ // If our replica is before 1, we add the previous neighbour. If we are after 1,
221
+ // we add the next neighbour. The primary adds both neighbours.
222
+
223
+ const primary_index: u8 = @intCast(view % route.count);
224
+ const primary_position = @divFloor(route.count, 2);
225
+ assert(route.replicas[primary_position] == primary_index);
226
+
227
+ const replica_position = std.mem.indexOfScalar(
228
+ u8,
229
+ route.replicas[0..route.count],
230
+ replica,
231
+ ).?;
232
+
233
+ var hop_count: usize = 0;
234
+ if (replica_position <= primary_position and replica_position > 0) {
235
+ hops[hop_count] = route.replicas[replica_position - 1];
236
+ hop_count += 1;
237
+ }
238
+ if (replica_position >= primary_position and replica_position < route.count - 1) {
239
+ hops[hop_count] = route.replicas[replica_position + 1];
240
+ hop_count += 1;
241
+ }
242
+
243
+ assert(hop_count <= 2);
244
+ assert((hop_count == 2) ==
245
+ (replica_position == primary_position and route.count >= 3));
246
+ return hops[0..hop_count];
247
+ }
248
+ };
249
+
250
+ const OpHistory = struct {
251
+ op: u64,
252
+ prepare: Instant,
253
+ prepare_ok: [constants.replicas_max]Duration = @splat(latency_max),
254
+ present: stdx.BitSetType(constants.replicas_max) = .{},
255
+
256
+ const root: OpHistory = .{
257
+ .op = 0,
258
+ .prepare = .{ .ns = 0 },
259
+ };
260
+
261
+ fn record_prepare_ok(
262
+ op_history: *OpHistory,
263
+ replica: u8,
264
+ now: Instant,
265
+ ) void {
266
+ const latency = now.duration_since(op_history.prepare);
267
+ if (op_history.present.is_set(replica)) {
268
+ assert(op_history.prepare_ok[replica].ns <= latency.ns);
269
+ return;
270
+ }
271
+ assert(!op_history.present.is_set(replica));
272
+ assert(op_history.prepare_ok[replica].ns == latency_max.ns);
273
+
274
+ op_history.prepare_ok[replica] = Duration.min(latency, latency_max);
275
+ op_history.present.set(replica);
276
+ }
277
+ };
278
+
279
+ const Cost = struct {
280
+ // Left-biased median latency matches replication quorum latency, and directly contributes to
281
+ // user-visible latency.
282
+ median: Duration,
283
+ // Maximum latency corresponds to the time when a prepare is fully replicated, and is important
284
+ // for the overall health of the cluster.
285
+ maximum: Duration,
286
+ // Worst-case latency tracks the length of the critical path, but we want non-critical paths to
287
+ // be as short as possible.
288
+ sum: Duration,
289
+
290
+ fn less(lhs: Cost, rhs: Cost) bool {
291
+ // For sum, smaller than 5% improvement is considered insignificant.
292
+ // For median and maximum the threshold is 10%.
293
+ //
294
+ // Why 5%: If replicas are on a ring in terms of distance, the sum for optimal path is
295
+ //
296
+ // 1 → 2
297
+ // ↗
298
+ // ♔ 3
299
+ // ↘ ↗
300
+ // 5 → 4
301
+ //
302
+ // sum = (1 + 1) + (1 + 1 + 1) (prepares)
303
+ // + (1 + 2) + (1 + 2 + 3) (prepare ok)
304
+ // = 14
305
+ //
306
+ // The sum for next best path is
307
+ //
308
+ // 1 → 2
309
+ // ↗
310
+ // ♔ 3
311
+ // ↘ ↗ ↙
312
+ // 5 4
313
+ //
314
+ // sum = (1 + 1) + (1 + 2 + 1) (prepares, two replicas transposed)
315
+ // + (1 + 2) + (1 + 2 + 3) (prepare ok)
316
+ // = 15
317
+ //
318
+ // The difference between 14 and 15 is less than 10% but more than 5%.
319
+ //
320
+ // Why 10%: just feels like a reasonable number! median varies more than sum, so we need
321
+ // larger tolerance there.
322
+ inline for (.{
323
+ .{ "median", ratio(1, 10) },
324
+ .{ "maximum", ratio(1, 10) },
325
+ .{ "sum", ratio(1, 20) },
326
+ }) |field_threshold| {
327
+ const field, const threshold = field_threshold;
328
+ if (less_significantly(@field(lhs, field), @field(rhs, field), threshold)) return true;
329
+ if (@field(lhs, field).ns > @field(rhs, field).ns) return false;
330
+ }
331
+ return false;
332
+ }
333
+
334
+ // Returns true if lhs + lhs⋅threshold < rhs.
335
+ fn less_significantly(lhs: Duration, rhs: Duration, threshold: Ratio) bool {
336
+ assert(threshold.numerator < threshold.denominator);
337
+ return lhs.ns * (threshold.numerator + threshold.denominator) <
338
+ rhs.ns * threshold.denominator;
339
+ }
340
+
341
+ fn average(lhs: Cost, rhs: Cost) Cost {
342
+ return .{
343
+ .median = .{ .ns = @divFloor(lhs.median.ns + rhs.median.ns, 2) },
344
+ .maximum = .{ .ns = @divFloor(lhs.maximum.ns + rhs.maximum.ns, 2) },
345
+ .sum = .{ .ns = @divFloor(lhs.sum.ns + rhs.sum.ns, 2) },
346
+ };
347
+ }
348
+
349
+ fn ewma_add(old: Cost, new: Cost) Cost {
350
+ return .{
351
+ .median = ewma_add_duration(old.median, new.median),
352
+ .maximum = ewma_add_duration(old.maximum, new.maximum),
353
+ .sum = ewma_add_duration(old.sum, new.sum),
354
+ };
355
+ }
356
+
357
+ fn ewma_add_duration(old: Duration, new: Duration) Duration {
358
+ return .{
359
+ .ns = @divFloor((old.ns * 4) + new.ns, 5),
360
+ };
361
+ }
362
+ };
363
+
364
+ const Routing = @This();
365
+
366
+ pub fn init(options: struct {
367
+ replica: u8,
368
+ replica_count: u8,
369
+ standby_count: u8,
370
+ }) Routing {
371
+ assert(options.replica < options.replica_count + options.standby_count);
372
+ assert(options.replica_count <= constants.replicas_max);
373
+ assert(options.standby_count <= constants.standbys_max);
374
+
375
+ const route = Route.view_default(0, options.replica_count);
376
+
377
+ return .{
378
+ .replica = options.replica,
379
+ .replica_count = options.replica_count,
380
+ .standby_count = options.standby_count,
381
+
382
+ .view = 0,
383
+ .a = route,
384
+ .a_cost = null,
385
+
386
+ .b = route,
387
+ .b_cost = null,
388
+
389
+ .history = @splat(.root),
390
+ };
391
+ }
392
+
393
/// Switches to `view` and resets both routes to the default route of that view.
///
/// The view must strictly increase, except that re-entering view 0 from view 0 is allowed.
/// The history and both costs must already be empty (see `history_reset`).
pub fn view_change(routing: *Routing, view: u32) void {
    const view_advances = view > routing.view;
    const view_initial = view == 0 and routing.view == 0;
    assert(view_advances or view_initial);
    assert(routing.history_empty());
    routing.view = view;

    const route_default = Route.view_default(view, routing.replica_count);
    assert(route_default.valid(view, routing.replica_count));

    routing.a = route_default;
    assert(routing.a_cost == null);

    routing.b = route_default;
    assert(routing.b_cost == null);
}
407
+
408
/// Encodes `route` as a `u64`. The route must be valid for the current view and replica
/// count; the resulting code is asserted to be non-zero.
pub fn route_encode(routing: *const Routing, route: Route) u64 {
    // A route of up to `replicas_max` entries must fit into the bytes of a u64.
    comptime assert(constants.replicas_max <= @sizeOf(u64));
    assert(route.valid(routing.view, routing.replica_count));

    const encoded = route.encode();
    assert(encoded != 0);
    return encoded;
}
415
+
416
// Positive space testing: encode every single route and check that it decodes back to itself.
test route_encode {
    const Gen = @import("../testing/exhaustigen.zig");

    var gen: Gen = .{};
    while (!gen.done()) {
        const replica_count = gen.range_inclusive(u8, 1, constants.replicas_max);
        var route: Route = .trivial(replica_count);
        assert(route.count == replica_count);
        gen.shuffle(u8, route.replicas[0..route.count]);

        // The replica in the middle of the route doubles as the replica index and the view.
        const primary_index = route.replicas[@divFloor(route.count, 2)];
        var routing = Routing.init(.{
            .replica = primary_index,
            .replica_count = replica_count,
            .standby_count = 0,
        });
        routing.view_change(primary_index);

        const code = routing.route_encode(route);
        const route_decoded = routing.route_decode(code).?;
        assert(route_decoded.count == replica_count);

        const replicas_expected = route.replicas[0..route.count];
        const replicas_decoded = route_decoded.replicas[0..route_decoded.count];
        assert(std.mem.eql(u8, replicas_expected, replicas_decoded));
    }
}
447
+
448
/// Decodes `code` against the current view and replica count.
/// Returns null when `Route.decode` rejects the code.
pub fn route_decode(routing: *const Routing, code: u64) ?Route {
    const decoded: ?Route = Route.decode(code, routing.view, routing.replica_count);
    return decoded;
}
451
+
452
+ // Negative space testing: check that if a 'random' number decodes, it decodes to a route.
452
+ // It is possible to write an exhaustigen test here, but it takes 30s in debug, which is too slow.
453
+ test route_decode {
454
+ const T = struct {
455
// Tally of how many generated codes decoded (valid) or were rejected (invalid).
+ const Counts = struct {
456
+ total: u32,
457
+ valid: u32,
458
+ invalid: u32,
459
+ };
460
+
461
// Generates `total` pseudo-random codes and checks that every code that decodes
// re-encodes to exactly the same value.
+ fn check(prng: *stdx.PRNG) Counts {
462
+ var counts: Counts = .{
463
+ .total = 200_000,
464
+ .valid = 0,
465
+ .invalid = 0,
466
+ };
467
+ for (0..counts.total) |_| {
468
+ const replica_count = prng.range_inclusive(u8, 1, constants.replicas_max);
469
+
470
// Build the code byte by byte: each byte is either a small value bounded by
// replicas_max + 1 or 0xFF. The chance of a small byte grows with replica_count.
+ var code_bytes: [8]u8 = @splat(0);
471
+ for (&code_bytes) |*byte| {
472
+ byte.* = if (prng.chance(ratio(replica_count + 1, 8)))
473
+ prng.int_inclusive(u8, constants.replicas_max + 1)
474
+ else
475
+ 0xFF;
476
+ }
477
+ var code: u64 = @bitCast(code_bytes);
478
+
479
// Occasionally perturb the structured code (presumably a single-bit flip; confirm
// against `PRNG.bit`), or replace it with a fully random value.
+ if (prng.chance(ratio(1, 20))) {
480
+ code ^= prng.bit(u64);
481
+ }
482
+ if (prng.chance(ratio(1, 20))) {
483
+ code = prng.int(u64);
484
+ }
485
+
486
+ var routing = Routing.init(.{
487
+ .replica = prng.int_inclusive(u8, replica_count - 1),
488
+ .replica_count = replica_count,
489
+ .standby_count = prng.int_inclusive(u8, constants.standbys_max),
490
+ });
491
+ routing.view_change(prng.int_inclusive(u32, 10_000));
492
+
493
// Anything that decodes must round-trip through encode unchanged.
+ if (routing.route_decode(code)) |route| {
494
+ counts.valid += 1;
495
+ const code_encoded = routing.route_encode(route);
496
+ assert(code == code_encoded);
497
+ } else {
498
+ counts.invalid += 1;
499
+ }
500
+ }
501
+ assert(counts.valid + counts.invalid == counts.total);
502
+ return counts;
503
+ }
504
+ };
505
+
506
+ // Run with a fixed seed first to assert that the test covers both valid and invalid inputs.
507
+ var prng = stdx.PRNG.from_seed(92);
508
+ const counts = T.check(&prng);
509
+ assert(counts.valid > 50);
510
+ assert(counts.invalid > 100_000);
511
+
512
// Then run again with the testing seed; only the round-trip assertions apply here.
+ prng = stdx.PRNG.from_seed_testing();
513
+ _ = T.check(&prng);
514
+ }
516
+
517
/// Installs `route` as the active route (`a`).
///
/// Requires an empty history (see `history_reset`) and a route that is valid for the
/// current view and replica count.
pub fn route_activate(routing: *Routing, route: Route) void {
    const history_is_empty = routing.history_empty();
    assert(history_is_empty);

    const route_is_valid = route.valid(routing.view, routing.replica_count);
    assert(route_is_valid);

    routing.a = route;
    assert(routing.a_cost == null);
}
523
+
524
/// Returns the candidate route `b` when costs are known for both routes and `Cost.less`
/// ranks the candidate's cost below the active route's cost; otherwise returns null.
pub fn route_improvement(routing: *const Routing) ?Route {
    if (routing.a_cost) |cost_active| {
        if (routing.b_cost) |cost_candidate| {
            if (Cost.less(cost_candidate, cost_active)) return routing.b;
        }
    }
    return null;
}
529
+
530
/// Computes which members this member forwards the prepare for `op` to.
///
/// The targets are written into `hops_buffer`; the returned slice aliases that buffer and
/// holds between zero and two member indices.
pub fn op_next_hop(routing: *const Routing, op: u64, hops_buffer: *[2]u8) []const u8 {
    const route = routing.op_route(op);
    assert(route.valid(routing.view, routing.replica_count));

    const hop_count: usize = if (routing.replica < routing.replica_count) active: {
        // Normal replication: replicate to 0-2 other replicas using a dynamic route.
        var count: usize = route.next_hop(routing.view, routing.replica, hops_buffer).len;

        // The first replica in the route kicks off standby replication (in a cluster of six,
        // it is the replica at the end of the shorter branch of the route).
        if (routing.standby_count > 0 and routing.replica == route.replicas[0]) {
            assert(count < 2);
            // Member index `replica_count` is the first standby.
            hops_buffer[count] = routing.replica_count;
            count += 1;
        }
        break :active count;
    } else standby: {
        // Standby replication uses a static ring topology: forward to the next standby,
        // unless this is the last member.
        if (routing.replica + 1 < routing.replica_count + routing.standby_count) {
            hops_buffer[0] = routing.replica + 1;
            break :standby 1;
        }
        break :standby 0;
    };

    assert(hop_count <= 2);
    return hops_buffer[0..hop_count];
}
561
+
562
/// Records on the primary that `op` was prepared at `now`.
///
/// The history is indexed by `op % history_max`. If the slot still holds an earlier op,
/// that op is finalized as evicted before the slot is overwritten.
pub fn op_prepare(routing: *Routing, op: u64, now: Instant) void {
    assert(routing.primary());
    assert(op != 0); // The root op is never prepared.

    const slot = op % history_max;
    const op_previous = routing.history[slot].op;
    if (op_previous != 0) {
        routing.op_finalize(op_previous, .evicted);
    }

    routing.history[slot] = .{ .op = op, .prepare = now };
}
575
+
576
/// Records on the primary that `replica` acknowledged `op` at `now`.
///
/// Acks for op 0 and for ops that no longer occupy their history slot are ignored.
/// Once every replica has acknowledged the op, it is finalized as fully replicated.
pub fn op_prepare_ok(routing: *Routing, op: u64, replica: u8, now: Instant) void {
    assert(routing.primary());
    // Replicas can ack the root op after repair. Preventing replicas from sending such a
    // prepare_ok would make the protocol more complex. Instead, ignore op=0 here and treat
    // it as an empty slot elsewhere.
    if (op == 0) return;
    maybe(replica == routing.replica);

    const entry = &routing.history[op % history_max];
    if (entry.op != op) return;

    entry.record_prepare_ok(replica, now);
    const replicated_fully = entry.present.count() == routing.replica_count;
    if (replicated_fully) {
        routing.op_finalize(op, .replicated_fully);
    }
}
591
+
592
/// Folds the latency data recorded for `op` into the route costs.
///
/// If `op` did not run an experiment (`op_route_b` returns null), its cost is blended into
/// `a_cost`. Otherwise the costs of the experiment pair (`op` and `op ^ 1`) are averaged
/// and, when that average beats the current `b_cost`, the experimental route becomes `b`.
///
/// `reason` says why the op is being finalized. A fully replicated op always counts.
/// An evicted op counts only if it was not fully replicated; for an experiment the two
/// rules apply to the pair (both fully replicated, or fewer than two fully replicated).
fn op_finalize(
    routing: *Routing,
    op: u64,
    reason: enum { evicted, replicated_fully },
) void {
    assert(routing.primary());
    assert(op != 0);

    const entry = &routing.history[op % history_max];
    assert(entry.op == op);
    assert(entry.present.count() <= routing.replica_count);
    if (reason == .replicated_fully) {
        assert(entry.present.count() == routing.replica_count);
    }

    const route_b = routing.op_route_b(op) orelse {
        // Not an experiment: the sample belongs to the active route.
        const sample_counts = switch (reason) {
            .replicated_fully => true,
            .evicted => entry.present.count() < routing.replica_count,
        };
        if (sample_counts) {
            const sample = routing.history_cost(op);
            routing.a_cost = if (routing.a_cost) |cost| Cost.ewma_add(cost, sample) else sample;
        }
        return;
    };

    var replicated_fully_count: u8 = 0;
    var cost_average: ?Cost = null;
    for ([2]u64{ op, op ^ 1 }) |experiment| {
        // Both ops of the pair use the same experimental route.
        assert(std.meta.eql(routing.op_route_b(experiment), route_b));

        const experiment_entry = &routing.history[experiment % history_max];
        // Don't have data for the other experiment yet.
        if (experiment_entry.op != experiment) return;

        if (experiment_entry.present.count() == routing.replica_count) {
            replicated_fully_count += 1;
        }
        const sample = routing.history_cost(experiment);
        cost_average = if (cost_average) |cost| Cost.average(cost, sample) else sample;
    }
    const cost_pair = cost_average.?;

    const pair_counts = switch (reason) {
        .replicated_fully => replicated_fully_count == 2,
        .evicted => replicated_fully_count < 2,
    };
    if (!pair_counts) return;

    const improves = if (routing.b_cost) |cost_best| Cost.less(cost_pair, cost_best) else true;
    if (improves) {
        routing.b = route_b;
        routing.b_cost = cost_pair;
    }
}
644
+
645
/// Route used to replicate `op`: the experimental route if `op` runs an experiment,
/// otherwise the active route `a`.
fn op_route(routing: *const Routing, op: u64) Route {
    if (routing.op_route_b(op)) |route_experiment| return route_experiment;
    return routing.a;
}
648
+
649
/// Decides deterministically whether `op` runs a routing experiment and, if so, with
/// which random route. Returns null when the op uses the active route.
///
/// The generator is seeded with `op | 1`, so `op` and `op ^ 1` get the same seed and
/// therefore the same answer.
fn op_route_b(routing: *const Routing, op: u64) ?Route {
    var prng = stdx.PRNG.from_seed(op | 1);
    if (!prng.chance(routing.experiment_chance)) return null;

    const route_random = Route.random(&prng, routing.view, routing.replica_count);
    assert(route_random.valid(routing.view, routing.replica_count));
    return route_random;
}
658
+
659
/// Returns true when no cost has been recorded for either route and every history slot
/// is empty (a slot with op 0 counts as empty).
///
/// Takes `*const Routing`, like the other read-only methods: nothing is mutated here, and
/// callers holding a `*Routing` still compile because `*T` coerces to `*const T`.
fn history_empty(routing: *const Routing) bool {
    if (routing.a_cost != null) return false;
    if (routing.b_cost != null) return false;
    for (routing.history) |h| {
        if (h.op != 0) return false;
    }
    return true;
}
667
+
668
/// Forgets all measurements: clears both route costs and resets every history slot
/// to the root entry.
pub fn history_reset(routing: *Routing) void {
    routing.a_cost = null;
    routing.b_cost = null;
    routing.history = @splat(.root);
}
673
+
674
/// Summarizes the prepare_ok latencies recorded for `op` as a `Cost`: the (left-leaning)
/// median, the maximum, and the sum over the first `replica_count` latency entries.
/// `op` must currently occupy its history slot.
fn history_cost(routing: *const Routing, op: u64) Cost {
    const entry = &routing.history[op % history_max];
    assert(entry.op == op);
    assert(entry.present.count() <= routing.replica_count);

    // Sort a private copy so that the history itself is left untouched.
    var latencies_buffer: [constants.replicas_max]Duration = entry.prepare_ok;
    const latencies = latencies_buffer[0..routing.replica_count];
    // Use a simpler sort for code size.
    assert(latencies.len < 16);
    std.sort.insertion(Duration, latencies, {}, Duration.sort.asc);

    const index_median = @divFloor(routing.replica_count - 1, 2); // Left leaning median.
    const index_maximum = routing.replica_count - 1;
    const median = latencies[index_median];
    const maximum = latencies[index_maximum];

    var sum: Duration = .{ .ns = 0 };
    for (latencies) |latency| {
        sum.ns += latency.ns;
    }

    assert(median.ns <= maximum.ns);
    assert(maximum.ns <= sum.ns);

    return .{ .median = median, .maximum = maximum, .sum = sum };
}
695
+
696
/// Returns true when this member is the primary of the current view, that is, when its
/// index equals `view % replica_count`.
fn primary(routing: *const Routing) bool {
    return routing.replica == routing.view % routing.replica_count;
}
700
+
701
+ test "Routing finds best route" {
701
+ // This fuzzer arranges replicas into a "ring" physical topology according to a random
702
+ // permutation, and checks that we are able to infer the permutation using our cost function.
703
+ // It checks that the happy path works as intended.
704
+ const Environment = struct {
705
+ const Path = @import("../testing/packet_simulator.zig").Path;
706
+ const Packet = union(enum) { prepare: u64, prepare_ok: u64 };
707
+ const PacketSimulator = @import("../testing/packet_simulator.zig")
708
+ .PacketSimulatorType(Packet);
709
+
710
+ replica_count: u8,
711
+ view: u32,
712
+ primary: u8,
713
+ replicas: []Routing,
714
+ prepare_ok_count: u8 = 0,
715
+ packet_simulator: PacketSimulator,
716
+ permutation: []u8,
717
+
718
+ const Environment = @This();
719
+
720
// Builds a cluster of 1..=replicas_max replicas with a random ring permutation and a
// random view in 0..=32. The caller must release it with `deinit`.
+ pub fn init(gpa: std.mem.Allocator, seed: u64) !Environment {
721
+ var prng = stdx.PRNG.from_seed(seed);
722
+
723
+ const replica_count = prng.range_inclusive(u8, 1, constants.replicas_max);
724
+ var packet_simulator = try PacketSimulator.init(gpa, .{
725
+ .node_count = replica_count,
726
+ .client_count = 0,
727
+ .seed = seed,
728
+ .one_way_delay_mean = .{ .ns = 0 },
729
+ .one_way_delay_min = .{ .ns = 0 },
730
+ .path_maximum_capacity = replica_count,
731
+ .path_clog_duration_mean = .{ .ns = 0 },
732
+ .path_clog_probability = ratio(0, 100),
733
+ }, .{
734
+ .packet_command = packet_command,
735
+ .packet_clone = packet_clone,
736
+ .packet_deinit = packet_deinit,
737
+ .packet_deliver = packet_deliver,
738
+ .packet_delay = packet_delay,
739
+ });
740
+ errdefer packet_simulator.deinit(gpa);
741
+
742
+ const permuation: []u8 = try gpa.alloc(u8, replica_count);
743
+ errdefer gpa.free(permuation);
744
+
745
// Start from the identity mapping, then shuffle to get a random ring position per replica.
+ for (0..replica_count) |i| permuation[i] = @intCast(i);
746
+ prng.shuffle(u8, permuation);
747
+
748
+ const replicas: []Routing = try gpa.alloc(Routing, replica_count);
749
+ errdefer gpa.free(replicas);
750
+
751
+ for (replicas[0..replica_count], 0..) |*replica, replica_index| {
752
+ replica.* = Routing.init(.{
753
+ .replica = @intCast(replica_index),
754
+ .replica_count = replica_count,
755
+ .standby_count = 0,
756
+ });
757
+ }
758
+
759
+ const view = prng.range_inclusive(u32, 0, 32);
760
+ const primary_index = view % replica_count;
761
+
762
+ return .{
763
+ .replica_count = replica_count,
764
+ .view = view,
765
+ .primary = @intCast(primary_index),
766
+ .replicas = replicas,
767
+ .packet_simulator = packet_simulator,
768
+ .permutation = permuation,
769
+ };
770
+ }
771
+
772
+ pub fn deinit(env: *Environment, gpa: std.mem.Allocator) void {
773
+ gpa.free(env.replicas);
774
+ gpa.free(env.permutation);
775
+ env.packet_simulator.deinit(gpa);
776
+ env.* = undefined;
777
+ }
778
+
779
// Current simulated time, derived from the packet simulator's tick counter.
+ pub fn now(env: *const Environment) Instant {
780
+ return .{ .ns = env.packet_simulator.ticks * constants.tick_ms * std.time.ns_per_ms };
781
+ }
782
+
783
// Position of `replica` on the ring; signed so that positions can be subtracted.
+ fn ring_index(env: *const Environment, replica: u8) i8 {
784
+ return @intCast(env.permutation[replica]);
785
+ }
786
+
787
// Shortest distance between two replicas on the ring, going either way around.
+ fn distance(env: *const Environment, source: u8, target: u8) u8 {
788
+ return @min(
789
+ @abs(env.ring_index(source) - env.ring_index(target)),
790
+ env.replica_count - @abs(env.ring_index(target) - env.ring_index(source)),
791
+ );
792
+ }
793
+
794
// Sum of ring distances between consecutive replicas of `route`.
+ fn total_route_distance(env: *const Environment, route: Route) u8 {
795
+ if (env.replica_count == 1) return 0;
796
+ var result: u8 = 0;
797
+ for (
798
+ route.replicas[0 .. env.replica_count - 1],
799
+ route.replicas[1..env.replica_count],
800
+ ) |a, b| {
801
+ result += env.distance(a, b);
802
+ }
803
+ return result;
804
+ }
805
+
806
+ fn packet_command(_: *PacketSimulator, _: Packet) @import("../vsr.zig").Command {
807
+ return .ping; // Doesn't matter.
808
+ }
809
+ fn packet_clone(_: *PacketSimulator, packet: Packet) Packet {
810
+ return packet;
811
+ }
812
+ fn packet_deinit(_: *PacketSimulator, _: Packet) void {}
813
+
814
// Delivery callback. A prepare is acked to the primary and forwarded along the
// recipient's next hops; a prepare_ok is recorded by the primary.
+ fn packet_deliver(packet_simulator: *PacketSimulator, packet: Packet, path: Path) void {
815
+ const env: *Environment = @fieldParentPtr("packet_simulator", packet_simulator);
816
+
817
+ switch (packet) {
818
+ .prepare => |op| {
819
+ if (path.target == env.primary) {
820
+ // Initial prepare injected by the fuzzer.
821
+ assert(path.source == env.primary);
822
+ assert(env.prepare_ok_count == 0);
823
+ env.replicas[env.primary].op_prepare(op, env.now());
824
+ }
825
+ env.packet_simulator.submit_packet(.{ .prepare_ok = op }, .{
826
+ .source = path.target,
827
+ .target = env.primary,
828
+ });
829
+
830
+ var next_hop_buffer: [2]u8 = undefined;
831
+ const next_hop = env.replicas[path.target].op_next_hop(op, &next_hop_buffer);
832
+ assert(next_hop.len <= 2);
833
+ for (next_hop) |target_next| {
834
+ assert(target_next < env.replica_count);
835
+ env.packet_simulator.submit_packet(.{ .prepare = op }, .{
836
+ .source = path.target,
837
+ .target = target_next,
838
+ });
839
+ }
840
+ },
841
+ .prepare_ok => |op| {
842
+ assert(path.target == env.primary);
843
+ env.prepare_ok_count += 1;
844
+ env.replicas[env.primary].op_prepare_ok(op, path.source, env.now());
845
+ },
846
+ }
847
+ }
848
+
849
// Delay callback: 10ms per unit of ring distance between source and target.
+ fn packet_delay(packet_simulator: *PacketSimulator, _: Packet, path: Path) Duration {
850
+ const env: *Environment = @fieldParentPtr("packet_simulator", packet_simulator);
851
+ return .{
852
+ .ns = @as(u64, env.distance(path.source, path.target)) * 10 * std.time.ns_per_ms,
853
+ };
854
+ }
855
+ };
856
+
857
+ for (0..10) |seed| {
858
+ var env = try Environment.init(std.testing.allocator, seed);
859
+ defer env.deinit(std.testing.allocator);
860
+
861
+ for (env.replicas) |*replica| {
862
+ replica.view_change(env.view);
863
+ }
864
+
865
+ // Napkin math:
866
+ // For 6 replicas, there are (replica_count - 1)! = 5! = 120 routes.
867
+ // Two of the routes are optimal, which gives 1/60 chance of success per experiment.
868
+ // Experiment runs on a pair of ops with probability 1/20, so we expect 40 ops per
869
+ // experiment. 10k ops gives us 250 experiments.
870
+ //
871
+ // Probability that we don't select the best route in under 10_000 ops is (59/60)**250≈0.015.
872
+ var op_improvement: usize = 0;
873
+ for (1..10_000) |op| {
874
+ env.prepare_ok_count = 0;
875
+ env.packet_simulator.submit_packet(.{ .prepare = op }, .{
876
+ .source = env.primary,
877
+ .target = env.primary,
878
+ });
879
+
880
// Drive the simulator until every replica has acked this op, with a safety bound.
+ for (0..1_000) |_| {
881
+ while (env.packet_simulator.step()) {}
882
+ env.packet_simulator.tick();
883
+ if (env.prepare_ok_count == env.replica_count) break;
884
+ } else @panic("loop outrun safety counter");
885
+
886
// When the primary finds a cheaper route, reset every replica and activate that route.
+ if (env.replicas[env.primary].route_improvement()) |b| {
887
+ op_improvement = op;
888
+ const a = env.replicas[env.primary].a;
889
+ assert(env.total_route_distance(a) > env.replica_count - 1);
890
+ maybe(env.total_route_distance(b) > env.total_route_distance(a));
891
+
892
+ for (env.replicas) |*replica| {
893
+ replica.history_reset();
894
+ replica.route_activate(b);
895
+ }
896
+ }
897
+ }
// The final active route must have total ring distance replica_count - 1.
+ assert(env.total_route_distance(env.replicas[env.primary].a) == env.replica_count - 1);
899
+ }
900
+ }
902
+
903
+ test "Routing fuzz" {
903
+ // This fuzzer doesn't try to be realistic, and just hammers the API
904
+ // with a random sequence of calls, to make sure nothing breaks.
905
+ const Environment = struct {
906
+ const fuzz = @import("../testing/fuzz.zig");
907
+
908
+ steps: u32 = 10_000,
909
+ prng: stdx.PRNG,
910
+
911
+ routing: Routing,
912
+ pipeline_length: u32,
913
+ view: u32,
914
+ op: u64,
915
+ commit_max: u64,
916
+ time: u64,
917
+
918
+ const Environment = @This();
919
+
920
// Picks a random cluster shape, a random replica index and a random pipeline length,
// all derived from `seed`.
+ fn init(seed: u64) Environment {
921
+ var prng = stdx.PRNG.from_seed(seed);
922
+ const replica_count = prng.range_inclusive(u8, 1, constants.replicas_max);
923
+ const standby_count = prng.range_inclusive(u8, 0, constants.standbys_max);
924
+ const replica = prng.int_inclusive(u8, replica_count - 1);
925
+
926
+ const pipeline_length = prng.range_inclusive(
927
+ u32,
928
+ 1,
929
+ constants.pipeline_prepare_queue_max,
930
+ );
931
+
932
+ return .{
933
+ .prng = prng,
934
+ .routing = Routing.init(.{
935
+ .replica = replica,
936
+ .replica_count = replica_count,
937
+ .standby_count = standby_count,
938
+ }),
939
+ .pipeline_length = pipeline_length,
940
+ .op = 0,
941
+ .commit_max = 0,
942
+ .view = 0,
943
+ .time = 0,
944
+ };
945
+ }
946
+
947
// Runs `steps` random actions, advancing the clock by a random amount before each one.
+ fn run(env: *Environment) !void {
948
+ for (0..env.steps) |_| {
949
+ env.time += env.prng.int_inclusive(u64, 100);
950
+ try env.run_step();
951
+ }
952
+ }
953
+
954
// Performs one weighted-random action against the routing API.
+ fn run_step(env: *Environment) !void {
955
+ const Actions = enum {
956
+ view_change,
957
+ prepare,
958
+ prepare_ok,
959
+ reroute,
960
+ };
961
+
962
+ const action = env.prng.enum_weighted(Actions, .{
963
+ .view_change = 1,
964
+ .prepare = 5,
965
+ .prepare_ok = 10,
966
+ .reroute = 1,
967
+ });
968
+ switch (action) {
969
// Advance the view, pick a new op between commit_max and the old op, and reset the
// history before `view_change`, which requires it to be empty.
+ .view_change => {
970
+ env.view += env.prng.range_inclusive(u8, 1, 2 * env.routing.replica_count);
971
+ env.op = env.prng.range_inclusive(u64, env.commit_max, env.op);
972
+ env.routing.history_reset();
973
+ env.routing.view_change(env.view);
974
+ },
975
// The primary prepares the next op while the pipeline has room; any other member only
// verifies the route of a random nearby op.
+ .prepare => {
976
+ if (env.primary()) {
977
+ if (env.op - env.commit_max < env.pipeline_length) {
978
+ env.op += 1;
979
+ const op = env.op;
980
+ env.tick();
981
+ env.routing.op_prepare(op, .{ .ns = env.time });
982
+ env.verify_route(op);
983
+ }
984
+ } else {
985
+ const op = env.prng.range_inclusive(
986
+ u64,
987
+ env.commit_max,
988
+ env.commit_max + env.pipeline_length,
989
+ );
990
+ env.verify_route(op);
991
+ }
992
+ },
993
// Mostly ack an op near the pipeline; sometimes ack an op from a wider range that
// can lie beyond env.op.
// NOTE(review): `op_prepare_ok` asserts `routing.primary()`, but this branch does not
// check `env.primary()` first — confirm how this holds for the fixed seed.
+ .prepare_ok => {
994
+ const op = if (env.prng.chance(ratio(8, 10)))
995
+ env.prng.range_inclusive(
996
+ u64,
997
+ @max(1, env.commit_max -| env.pipeline_length),
998
+ env.op,
999
+ )
1000
+ else
1001
+ env.prng.range_inclusive(
1002
+ u64,
1003
+ 1,
1004
+ env.op + 2 * env.pipeline_length,
1005
+ );
1006
+ const backup = env.prng.int_inclusive(u8, env.routing.replica_count - 1);
1007
+ env.tick();
1008
+ env.routing.op_prepare_ok(op, backup, .{ .ns = env.time });
1009
+ if (env.prng.boolean()) {
1010
+ env.commit_max = @min(env.op, env.commit_max + 1);
1011
+ }
1012
+ },
1013
// The primary activates a measured improvement, if any; other members activate a
// random route for the current view.
+ .reroute => {
1014
+ if (env.primary()) {
1015
+ if (env.routing.route_improvement()) |improvement| {
1016
+ env.routing.history_reset();
1017
+ env.routing.route_activate(improvement);
1018
+ }
1019
+ } else {
1020
+ const route = Route.random(&env.prng, env.view, env.routing.replica_count);
1021
+ env.routing.route_activate(route);
1022
+ }
1023
+ },
1024
+ }
1025
+ }
1026
+
1027
// Computes the next hops of every member for `op` (using a copy of the routing state
// with `replica` overridden) and checks that no member is targeted twice or targets
// itself, and which members end up visited.
// NOTE(review): `visited` is sized by replicas_max, while hops may name standby
// indices (>= replica_count) — confirm the bit set capacity covers all members.
+ fn verify_route(env: *const Environment, op: u64) void {
1028
+ var visited: stdx.BitSetType(constants.replicas_max) = .{};
1029
+ const member_count = env.routing.replica_count + env.routing.standby_count;
1030
+ for (0..member_count) |replica_usize| {
1031
+ const replica: u8 = @intCast(replica_usize);
1032
+ var routing = env.routing;
1033
+ routing.replica = replica;
1034
+
1035
+ var next_hop_buffer: [2]u8 = undefined;
1036
+ const next_hop = routing.op_next_hop(op, &next_hop_buffer);
1037
+ assert(next_hop.len <= 2);
1038
// The primary's fan-out depends on the cluster size; every other member forwards to
// at most one target.
+ if (replica == env.view % routing.replica_count) {
1039
+ switch (routing.replica_count) {
1040
+ 0 => unreachable,
1041
+ 1 => assert((next_hop.len == 0) == (routing.standby_count == 0)),
1042
+ 2 => assert(next_hop.len == 1),
1043
+ else => assert(next_hop.len == 2),
1044
+ }
1045
+ } else {
1046
+ assert(next_hop.len <= 1);
1047
+ }
1048
+ for (next_hop) |next| {
1049
+ assert(next != replica);
1050
+ assert(!visited.is_set(next));
1051
+ visited.set(next);
1052
+ }
1053
+ }
// Exactly the members other than the primary must have been visited.
+ for (0..constants.replicas_max) |replica| {
1055
+ assert(visited.is_set(replica) ==
1056
+ (replica < member_count and
1057
+ replica != env.view % env.routing.replica_count));
1058
+ }
1059
+ }
1060
+
1061
// Advances the simulated clock by a random amount drawn via `random_int_exponential`.
+ fn tick(env: *Environment) void {
1062
+ env.time += fuzz.random_int_exponential(&env.prng, u64, 10);
1063
+ }
1064
+
1065
// True when the fuzzed member is the primary of the current view.
+ fn primary(env: *const Environment) bool {
1066
+ return env.routing.replica == env.view % env.routing.replica_count;
1067
+ }
1068
+ };
1069
+
1070
// Fixed seed; the final assertions check that the run made real progress.
+ var env = Environment.init(92);
1071
+ try env.run();
1072
+ assert(env.op > 1_000);
1073
+ assert(env.commit_max > 1_000);
1074
+ }