tigerbeetle 0.0.34 → 0.0.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (249) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/ext/tb_client/extconf.rb +13 -13
  4. data/ext/tb_client/tigerbeetle/LICENSE +177 -0
  5. data/ext/tb_client/tigerbeetle/build.zig +2327 -0
  6. data/ext/tb_client/tigerbeetle/src/aof.zig +1000 -0
  7. data/ext/tb_client/tigerbeetle/src/build_multiversion.zig +808 -0
  8. data/ext/tb_client/tigerbeetle/src/cdc/amqp/protocol.zig +1283 -0
  9. data/ext/tb_client/tigerbeetle/src/cdc/amqp/spec.zig +1704 -0
  10. data/ext/tb_client/tigerbeetle/src/cdc/amqp/types.zig +341 -0
  11. data/ext/tb_client/tigerbeetle/src/cdc/amqp.zig +1450 -0
  12. data/ext/tb_client/tigerbeetle/src/cdc/runner.zig +1659 -0
  13. data/ext/tb_client/tigerbeetle/src/clients/c/samples/main.c +406 -0
  14. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/context.zig +1084 -0
  15. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/echo_client.zig +286 -0
  16. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/packet.zig +158 -0
  17. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal.zig +229 -0
  18. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client/signal_fuzz.zig +110 -0
  19. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.h +386 -0
  20. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client.zig +34 -0
  21. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_exports.zig +281 -0
  22. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header.zig +312 -0
  23. data/ext/tb_client/tigerbeetle/src/clients/c/tb_client_header_test.zig +138 -0
  24. data/ext/tb_client/tigerbeetle/src/clients/c/test.zig +466 -0
  25. data/ext/tb_client/tigerbeetle/src/clients/docs_samples.zig +157 -0
  26. data/ext/tb_client/tigerbeetle/src/clients/docs_types.zig +90 -0
  27. data/ext/tb_client/tigerbeetle/src/clients/dotnet/ci.zig +203 -0
  28. data/ext/tb_client/tigerbeetle/src/clients/dotnet/docs.zig +79 -0
  29. data/ext/tb_client/tigerbeetle/src/clients/dotnet/dotnet_bindings.zig +542 -0
  30. data/ext/tb_client/tigerbeetle/src/clients/go/ci.zig +109 -0
  31. data/ext/tb_client/tigerbeetle/src/clients/go/docs.zig +86 -0
  32. data/ext/tb_client/tigerbeetle/src/clients/go/go_bindings.zig +370 -0
  33. data/ext/tb_client/tigerbeetle/src/clients/go/pkg/native/tb_client.h +386 -0
  34. data/ext/tb_client/tigerbeetle/src/clients/java/ci.zig +167 -0
  35. data/ext/tb_client/tigerbeetle/src/clients/java/docs.zig +126 -0
  36. data/ext/tb_client/tigerbeetle/src/clients/java/java_bindings.zig +996 -0
  37. data/ext/tb_client/tigerbeetle/src/clients/java/src/client.zig +748 -0
  38. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni.zig +3238 -0
  39. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_tests.zig +1718 -0
  40. data/ext/tb_client/tigerbeetle/src/clients/java/src/jni_thread_cleaner.zig +190 -0
  41. data/ext/tb_client/tigerbeetle/src/clients/node/ci.zig +104 -0
  42. data/ext/tb_client/tigerbeetle/src/clients/node/docs.zig +75 -0
  43. data/ext/tb_client/tigerbeetle/src/clients/node/node.zig +522 -0
  44. data/ext/tb_client/tigerbeetle/src/clients/node/node_bindings.zig +267 -0
  45. data/ext/tb_client/tigerbeetle/src/clients/node/src/c.zig +3 -0
  46. data/ext/tb_client/tigerbeetle/src/clients/node/src/translate.zig +379 -0
  47. data/ext/tb_client/tigerbeetle/src/clients/python/ci.zig +131 -0
  48. data/ext/tb_client/tigerbeetle/src/clients/python/docs.zig +63 -0
  49. data/ext/tb_client/tigerbeetle/src/clients/python/python_bindings.zig +588 -0
  50. data/ext/tb_client/tigerbeetle/src/clients/rust/assets/tb_client.h +386 -0
  51. data/ext/tb_client/tigerbeetle/src/clients/rust/ci.zig +73 -0
  52. data/ext/tb_client/tigerbeetle/src/clients/rust/docs.zig +106 -0
  53. data/ext/tb_client/tigerbeetle/src/clients/rust/rust_bindings.zig +305 -0
  54. data/ext/tb_client/tigerbeetle/src/config.zig +296 -0
  55. data/ext/tb_client/tigerbeetle/src/constants.zig +790 -0
  56. data/ext/tb_client/tigerbeetle/src/copyhound.zig +202 -0
  57. data/ext/tb_client/tigerbeetle/src/counting_allocator.zig +72 -0
  58. data/ext/tb_client/tigerbeetle/src/direction.zig +11 -0
  59. data/ext/tb_client/tigerbeetle/src/docs_website/build.zig +158 -0
  60. data/ext/tb_client/tigerbeetle/src/docs_website/src/content.zig +156 -0
  61. data/ext/tb_client/tigerbeetle/src/docs_website/src/docs.zig +252 -0
  62. data/ext/tb_client/tigerbeetle/src/docs_website/src/file_checker.zig +313 -0
  63. data/ext/tb_client/tigerbeetle/src/docs_website/src/html.zig +87 -0
  64. data/ext/tb_client/tigerbeetle/src/docs_website/src/page_writer.zig +63 -0
  65. data/ext/tb_client/tigerbeetle/src/docs_website/src/redirects.zig +47 -0
  66. data/ext/tb_client/tigerbeetle/src/docs_website/src/search_index_writer.zig +28 -0
  67. data/ext/tb_client/tigerbeetle/src/docs_website/src/service_worker_writer.zig +61 -0
  68. data/ext/tb_client/tigerbeetle/src/docs_website/src/single_page_writer.zig +169 -0
  69. data/ext/tb_client/tigerbeetle/src/docs_website/src/website.zig +46 -0
  70. data/ext/tb_client/tigerbeetle/src/ewah.zig +445 -0
  71. data/ext/tb_client/tigerbeetle/src/ewah_benchmark.zig +128 -0
  72. data/ext/tb_client/tigerbeetle/src/ewah_fuzz.zig +171 -0
  73. data/ext/tb_client/tigerbeetle/src/fuzz_tests.zig +179 -0
  74. data/ext/tb_client/tigerbeetle/src/integration_tests.zig +662 -0
  75. data/ext/tb_client/tigerbeetle/src/io/common.zig +155 -0
  76. data/ext/tb_client/tigerbeetle/src/io/darwin.zig +1093 -0
  77. data/ext/tb_client/tigerbeetle/src/io/linux.zig +1880 -0
  78. data/ext/tb_client/tigerbeetle/src/io/test.zig +1005 -0
  79. data/ext/tb_client/tigerbeetle/src/io/windows.zig +1598 -0
  80. data/ext/tb_client/tigerbeetle/src/io.zig +34 -0
  81. data/ext/tb_client/tigerbeetle/src/iops.zig +134 -0
  82. data/ext/tb_client/tigerbeetle/src/list.zig +236 -0
  83. data/ext/tb_client/tigerbeetle/src/lsm/binary_search.zig +848 -0
  84. data/ext/tb_client/tigerbeetle/src/lsm/binary_search_benchmark.zig +179 -0
  85. data/ext/tb_client/tigerbeetle/src/lsm/cache_map.zig +424 -0
  86. data/ext/tb_client/tigerbeetle/src/lsm/cache_map_fuzz.zig +420 -0
  87. data/ext/tb_client/tigerbeetle/src/lsm/compaction.zig +2117 -0
  88. data/ext/tb_client/tigerbeetle/src/lsm/composite_key.zig +182 -0
  89. data/ext/tb_client/tigerbeetle/src/lsm/forest.zig +1119 -0
  90. data/ext/tb_client/tigerbeetle/src/lsm/forest_fuzz.zig +1102 -0
  91. data/ext/tb_client/tigerbeetle/src/lsm/forest_table_iterator.zig +200 -0
  92. data/ext/tb_client/tigerbeetle/src/lsm/groove.zig +1495 -0
  93. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge.zig +739 -0
  94. data/ext/tb_client/tigerbeetle/src/lsm/k_way_merge_benchmark.zig +166 -0
  95. data/ext/tb_client/tigerbeetle/src/lsm/manifest.zig +754 -0
  96. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level.zig +1294 -0
  97. data/ext/tb_client/tigerbeetle/src/lsm/manifest_level_fuzz.zig +510 -0
  98. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log.zig +1263 -0
  99. data/ext/tb_client/tigerbeetle/src/lsm/manifest_log_fuzz.zig +628 -0
  100. data/ext/tb_client/tigerbeetle/src/lsm/node_pool.zig +247 -0
  101. data/ext/tb_client/tigerbeetle/src/lsm/scan_buffer.zig +116 -0
  102. data/ext/tb_client/tigerbeetle/src/lsm/scan_builder.zig +543 -0
  103. data/ext/tb_client/tigerbeetle/src/lsm/scan_fuzz.zig +938 -0
  104. data/ext/tb_client/tigerbeetle/src/lsm/scan_lookup.zig +293 -0
  105. data/ext/tb_client/tigerbeetle/src/lsm/scan_merge.zig +362 -0
  106. data/ext/tb_client/tigerbeetle/src/lsm/scan_range.zig +99 -0
  107. data/ext/tb_client/tigerbeetle/src/lsm/scan_state.zig +17 -0
  108. data/ext/tb_client/tigerbeetle/src/lsm/scan_tree.zig +1036 -0
  109. data/ext/tb_client/tigerbeetle/src/lsm/schema.zig +617 -0
  110. data/ext/tb_client/tigerbeetle/src/lsm/scratch_memory.zig +84 -0
  111. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array.zig +1500 -0
  112. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_benchmark.zig +149 -0
  113. data/ext/tb_client/tigerbeetle/src/lsm/segmented_array_fuzz.zig +7 -0
  114. data/ext/tb_client/tigerbeetle/src/lsm/set_associative_cache.zig +865 -0
  115. data/ext/tb_client/tigerbeetle/src/lsm/table.zig +607 -0
  116. data/ext/tb_client/tigerbeetle/src/lsm/table_memory.zig +843 -0
  117. data/ext/tb_client/tigerbeetle/src/lsm/table_value_iterator.zig +105 -0
  118. data/ext/tb_client/tigerbeetle/src/lsm/timestamp_range.zig +40 -0
  119. data/ext/tb_client/tigerbeetle/src/lsm/tree.zig +630 -0
  120. data/ext/tb_client/tigerbeetle/src/lsm/tree_fuzz.zig +933 -0
  121. data/ext/tb_client/tigerbeetle/src/lsm/zig_zag_merge.zig +557 -0
  122. data/ext/tb_client/tigerbeetle/src/message_buffer.zig +469 -0
  123. data/ext/tb_client/tigerbeetle/src/message_bus.zig +1214 -0
  124. data/ext/tb_client/tigerbeetle/src/message_bus_fuzz.zig +936 -0
  125. data/ext/tb_client/tigerbeetle/src/message_pool.zig +343 -0
  126. data/ext/tb_client/tigerbeetle/src/multiversion.zig +2195 -0
  127. data/ext/tb_client/tigerbeetle/src/queue.zig +390 -0
  128. data/ext/tb_client/tigerbeetle/src/repl/completion.zig +201 -0
  129. data/ext/tb_client/tigerbeetle/src/repl/parser.zig +1356 -0
  130. data/ext/tb_client/tigerbeetle/src/repl/terminal.zig +496 -0
  131. data/ext/tb_client/tigerbeetle/src/repl.zig +1034 -0
  132. data/ext/tb_client/tigerbeetle/src/scripts/amqp.zig +973 -0
  133. data/ext/tb_client/tigerbeetle/src/scripts/cfo.zig +1866 -0
  134. data/ext/tb_client/tigerbeetle/src/scripts/changelog.zig +304 -0
  135. data/ext/tb_client/tigerbeetle/src/scripts/ci.zig +227 -0
  136. data/ext/tb_client/tigerbeetle/src/scripts/client_readmes.zig +658 -0
  137. data/ext/tb_client/tigerbeetle/src/scripts/devhub.zig +466 -0
  138. data/ext/tb_client/tigerbeetle/src/scripts/release.zig +1058 -0
  139. data/ext/tb_client/tigerbeetle/src/scripts.zig +105 -0
  140. data/ext/tb_client/tigerbeetle/src/shell.zig +1195 -0
  141. data/ext/tb_client/tigerbeetle/src/stack.zig +260 -0
  142. data/ext/tb_client/tigerbeetle/src/state_machine/auditor.zig +911 -0
  143. data/ext/tb_client/tigerbeetle/src/state_machine/workload.zig +2079 -0
  144. data/ext/tb_client/tigerbeetle/src/state_machine.zig +4872 -0
  145. data/ext/tb_client/tigerbeetle/src/state_machine_fuzz.zig +288 -0
  146. data/ext/tb_client/tigerbeetle/src/state_machine_tests.zig +3128 -0
  147. data/ext/tb_client/tigerbeetle/src/static_allocator.zig +82 -0
  148. data/ext/tb_client/tigerbeetle/src/stdx/bit_set.zig +157 -0
  149. data/ext/tb_client/tigerbeetle/src/stdx/bounded_array.zig +292 -0
  150. data/ext/tb_client/tigerbeetle/src/stdx/debug.zig +65 -0
  151. data/ext/tb_client/tigerbeetle/src/stdx/flags.zig +1414 -0
  152. data/ext/tb_client/tigerbeetle/src/stdx/mlock.zig +92 -0
  153. data/ext/tb_client/tigerbeetle/src/stdx/prng.zig +677 -0
  154. data/ext/tb_client/tigerbeetle/src/stdx/radix.zig +336 -0
  155. data/ext/tb_client/tigerbeetle/src/stdx/ring_buffer.zig +511 -0
  156. data/ext/tb_client/tigerbeetle/src/stdx/sort_test.zig +112 -0
  157. data/ext/tb_client/tigerbeetle/src/stdx/stdx.zig +1160 -0
  158. data/ext/tb_client/tigerbeetle/src/stdx/testing/low_level_hash_vectors.zig +142 -0
  159. data/ext/tb_client/tigerbeetle/src/stdx/testing/snaptest.zig +361 -0
  160. data/ext/tb_client/tigerbeetle/src/stdx/time_units.zig +275 -0
  161. data/ext/tb_client/tigerbeetle/src/stdx/unshare.zig +295 -0
  162. data/ext/tb_client/tigerbeetle/src/stdx/vendored/aegis.zig +436 -0
  163. data/ext/tb_client/tigerbeetle/src/stdx/windows.zig +48 -0
  164. data/ext/tb_client/tigerbeetle/src/stdx/zipfian.zig +402 -0
  165. data/ext/tb_client/tigerbeetle/src/storage.zig +489 -0
  166. data/ext/tb_client/tigerbeetle/src/storage_fuzz.zig +180 -0
  167. data/ext/tb_client/tigerbeetle/src/testing/bench.zig +146 -0
  168. data/ext/tb_client/tigerbeetle/src/testing/cluster/grid_checker.zig +53 -0
  169. data/ext/tb_client/tigerbeetle/src/testing/cluster/journal_checker.zig +61 -0
  170. data/ext/tb_client/tigerbeetle/src/testing/cluster/manifest_checker.zig +76 -0
  171. data/ext/tb_client/tigerbeetle/src/testing/cluster/message_bus.zig +110 -0
  172. data/ext/tb_client/tigerbeetle/src/testing/cluster/network.zig +412 -0
  173. data/ext/tb_client/tigerbeetle/src/testing/cluster/state_checker.zig +331 -0
  174. data/ext/tb_client/tigerbeetle/src/testing/cluster/storage_checker.zig +458 -0
  175. data/ext/tb_client/tigerbeetle/src/testing/cluster.zig +1198 -0
  176. data/ext/tb_client/tigerbeetle/src/testing/exhaustigen.zig +128 -0
  177. data/ext/tb_client/tigerbeetle/src/testing/fixtures.zig +181 -0
  178. data/ext/tb_client/tigerbeetle/src/testing/fuzz.zig +144 -0
  179. data/ext/tb_client/tigerbeetle/src/testing/id.zig +97 -0
  180. data/ext/tb_client/tigerbeetle/src/testing/io.zig +317 -0
  181. data/ext/tb_client/tigerbeetle/src/testing/marks.zig +126 -0
  182. data/ext/tb_client/tigerbeetle/src/testing/packet_simulator.zig +533 -0
  183. data/ext/tb_client/tigerbeetle/src/testing/reply_sequence.zig +154 -0
  184. data/ext/tb_client/tigerbeetle/src/testing/state_machine.zig +389 -0
  185. data/ext/tb_client/tigerbeetle/src/testing/storage.zig +1247 -0
  186. data/ext/tb_client/tigerbeetle/src/testing/table.zig +249 -0
  187. data/ext/tb_client/tigerbeetle/src/testing/time.zig +98 -0
  188. data/ext/tb_client/tigerbeetle/src/testing/tmp_tigerbeetle.zig +212 -0
  189. data/ext/tb_client/tigerbeetle/src/testing/vortex/constants.zig +26 -0
  190. data/ext/tb_client/tigerbeetle/src/testing/vortex/faulty_network.zig +580 -0
  191. data/ext/tb_client/tigerbeetle/src/testing/vortex/java_driver/ci.zig +39 -0
  192. data/ext/tb_client/tigerbeetle/src/testing/vortex/logged_process.zig +214 -0
  193. data/ext/tb_client/tigerbeetle/src/testing/vortex/rust_driver/ci.zig +34 -0
  194. data/ext/tb_client/tigerbeetle/src/testing/vortex/supervisor.zig +766 -0
  195. data/ext/tb_client/tigerbeetle/src/testing/vortex/workload.zig +543 -0
  196. data/ext/tb_client/tigerbeetle/src/testing/vortex/zig_driver.zig +181 -0
  197. data/ext/tb_client/tigerbeetle/src/tidy.zig +1448 -0
  198. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_driver.zig +227 -0
  199. data/ext/tb_client/tigerbeetle/src/tigerbeetle/benchmark_load.zig +1069 -0
  200. data/ext/tb_client/tigerbeetle/src/tigerbeetle/cli.zig +1422 -0
  201. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect.zig +1658 -0
  202. data/ext/tb_client/tigerbeetle/src/tigerbeetle/inspect_integrity.zig +518 -0
  203. data/ext/tb_client/tigerbeetle/src/tigerbeetle/libtb_client.zig +36 -0
  204. data/ext/tb_client/tigerbeetle/src/tigerbeetle/main.zig +646 -0
  205. data/ext/tb_client/tigerbeetle/src/tigerbeetle.zig +958 -0
  206. data/ext/tb_client/tigerbeetle/src/time.zig +236 -0
  207. data/ext/tb_client/tigerbeetle/src/trace/event.zig +745 -0
  208. data/ext/tb_client/tigerbeetle/src/trace/statsd.zig +462 -0
  209. data/ext/tb_client/tigerbeetle/src/trace.zig +556 -0
  210. data/ext/tb_client/tigerbeetle/src/unit_tests.zig +321 -0
  211. data/ext/tb_client/tigerbeetle/src/vopr.zig +1785 -0
  212. data/ext/tb_client/tigerbeetle/src/vortex.zig +101 -0
  213. data/ext/tb_client/tigerbeetle/src/vsr/checkpoint_trailer.zig +473 -0
  214. data/ext/tb_client/tigerbeetle/src/vsr/checksum.zig +208 -0
  215. data/ext/tb_client/tigerbeetle/src/vsr/checksum_benchmark.zig +43 -0
  216. data/ext/tb_client/tigerbeetle/src/vsr/client.zig +768 -0
  217. data/ext/tb_client/tigerbeetle/src/vsr/client_replies.zig +532 -0
  218. data/ext/tb_client/tigerbeetle/src/vsr/client_sessions.zig +338 -0
  219. data/ext/tb_client/tigerbeetle/src/vsr/clock.zig +1019 -0
  220. data/ext/tb_client/tigerbeetle/src/vsr/fault_detector.zig +279 -0
  221. data/ext/tb_client/tigerbeetle/src/vsr/free_set.zig +1381 -0
  222. data/ext/tb_client/tigerbeetle/src/vsr/free_set_fuzz.zig +315 -0
  223. data/ext/tb_client/tigerbeetle/src/vsr/grid.zig +1460 -0
  224. data/ext/tb_client/tigerbeetle/src/vsr/grid_blocks_missing.zig +757 -0
  225. data/ext/tb_client/tigerbeetle/src/vsr/grid_scrubber.zig +797 -0
  226. data/ext/tb_client/tigerbeetle/src/vsr/journal.zig +2586 -0
  227. data/ext/tb_client/tigerbeetle/src/vsr/marzullo.zig +308 -0
  228. data/ext/tb_client/tigerbeetle/src/vsr/message_header.zig +1777 -0
  229. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch.zig +715 -0
  230. data/ext/tb_client/tigerbeetle/src/vsr/multi_batch_fuzz.zig +185 -0
  231. data/ext/tb_client/tigerbeetle/src/vsr/repair_budget.zig +333 -0
  232. data/ext/tb_client/tigerbeetle/src/vsr/replica.zig +12355 -0
  233. data/ext/tb_client/tigerbeetle/src/vsr/replica_format.zig +416 -0
  234. data/ext/tb_client/tigerbeetle/src/vsr/replica_reformat.zig +165 -0
  235. data/ext/tb_client/tigerbeetle/src/vsr/replica_test.zig +2910 -0
  236. data/ext/tb_client/tigerbeetle/src/vsr/routing.zig +1075 -0
  237. data/ext/tb_client/tigerbeetle/src/vsr/superblock.zig +1603 -0
  238. data/ext/tb_client/tigerbeetle/src/vsr/superblock_fuzz.zig +484 -0
  239. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums.zig +405 -0
  240. data/ext/tb_client/tigerbeetle/src/vsr/superblock_quorums_fuzz.zig +355 -0
  241. data/ext/tb_client/tigerbeetle/src/vsr/sync.zig +29 -0
  242. data/ext/tb_client/tigerbeetle/src/vsr.zig +1727 -0
  243. data/lib/tb_client/shared_lib.rb +12 -5
  244. data/lib/tigerbeetle/client.rb +1 -1
  245. data/lib/tigerbeetle/platforms.rb +9 -0
  246. data/lib/tigerbeetle/version.rb +2 -2
  247. data/tigerbeetle.gemspec +22 -5
  248. metadata +242 -3
  249. data/ext/tb_client/pkg.tar.gz +0 -0
@@ -0,0 +1,1036 @@
1
+ const std = @import("std");
2
+ const mem = std.mem;
3
+ const assert = std.debug.assert;
4
+
5
+ const stdx = @import("stdx");
6
+ const maybe = stdx.maybe;
7
+ const constants = @import("../constants.zig");
8
+ const schema = @import("schema.zig");
9
+ const binary_search = @import("binary_search.zig");
10
+ const k_way_merge = @import("k_way_merge.zig");
11
+
12
+ const Direction = @import("../direction.zig").Direction;
13
+ const GridType = @import("../vsr/grid.zig").GridType;
14
+ const BlockPtrConst = @import("../vsr/grid.zig").BlockPtrConst;
15
+ const TreeTableInfoType = @import("manifest.zig").TreeTableInfoType;
16
+ const ManifestType = @import("manifest.zig").ManifestType;
17
+ const ScanBuffer = @import("scan_buffer.zig").ScanBuffer;
18
+ const ScanState = @import("scan_state.zig").ScanState;
19
+ const TableValueIteratorType =
20
+ @import("table_value_iterator.zig").TableValueIteratorType;
21
+
22
+ const Pending = error{Pending};
23
+
24
+ /// Scans a range of keys over a Tree, in ascending or descending order.
25
+ /// At a high level, this is an ordered iterator over the values in a tree, at a particular
26
+ /// snapshot, within a given key range, merged across all levels (including the in-memory tables).
27
+ ///
28
+ /// 1. Sort the in-memory tables and perform a binary search on them for the key range.
29
+ /// 2. Fetch from storage and fill the buffer with values from all LSM levels that match the key
30
+ /// range (see `ScanTreeLevel`).
31
+ /// 3. Perform a k-way merge to iterate over buffers from different levels and memory tables in
32
+ /// ascending or descending order.
33
+ /// 4. Repeat step 2 when the buffer of at least one level has been consumed, until all levels
34
+ /// have been exhausted.
35
+ pub fn ScanTreeType(
36
+ comptime Context: type,
37
+ comptime Tree_: type,
38
+ comptime Storage: type,
39
+ ) type {
40
+ return struct {
41
+ const ScanTree = @This();
42
+
43
+ pub const Callback = *const fn (context: Context, scan: *ScanTree) void;
44
+
45
+ const Grid = GridType(Storage);
46
+
47
+ const TableInfo = TreeTableInfoType(Table);
48
+ const Manifest = ManifestType(Table, Storage);
49
+
50
+ pub const Tree = Tree_;
51
+ const Table = Tree.Table;
52
+ const Key = Table.Key;
53
+ const Value = Table.Value;
54
+ const key_from_value = Table.key_from_value;
55
+
56
+ const ScanTreeLevel = ScanTreeLevelType(ScanTree, Storage);
57
+
58
/// Identifies every stream fed into the k-way merge: the mutable table, the immutable
/// table, and one stream per LSM level.
/// Backed by a `u32` because that is what the `KWayMerge` API expects.
const KWayMergeStreams = enum(u32) {
    // The two in-memory tables occupy fixed, well-known indexes.
    table_mutable = 0,
    table_immutable = 1,

    // Any other index stands for an LSM level, hence the non-exhaustive enum.
    _,

    /// Total number of streams: one per LSM level plus the two in-memory tables.
    const streams_count = constants.lsm_levels + 2;
};
71
+
72
/// Merge iterator type combining the results of the in-memory tables and all LSM levels.
const KWayMergeIterator = T: {
    // Adapter that routes a merge stream index to the matching accessor on the scan.
    const stream = struct {
        /// Returns the next key of the stream identified by `stream_index`
        /// without consuming it.
        fn peek(scan: *ScanTree, stream_index: u32) Pending!?ScanTree.Key {
            assert(stream_index < KWayMergeStreams.streams_count);

            const stream_id: KWayMergeStreams = @enumFromInt(stream_index);
            switch (stream_id) {
                .table_mutable => return scan.merge_table_mutable_peek(),
                .table_immutable => return scan.merge_table_immutable_peek(),
                _ => {
                    // Level streams are numbered right after the two in-memory tables.
                    const level_index = @intFromEnum(stream_id) - 2;
                    assert(level_index < constants.lsm_levels);
                    return scan.merge_level_peek(level_index);
                },
            }
        }

        /// Consumes and returns the next value of the stream identified by `stream_index`.
        fn pop(scan: *ScanTree, stream_index: u32) ScanTree.Value {
            assert(stream_index < KWayMergeStreams.streams_count);

            const stream_id: KWayMergeStreams = @enumFromInt(stream_index);
            switch (stream_id) {
                .table_mutable => return scan.merge_table_mutable_pop(),
                .table_immutable => return scan.merge_table_immutable_pop(),
                _ => {
                    // Level streams are numbered right after the two in-memory tables.
                    const level_index = @intFromEnum(stream_id) - 2;
                    assert(level_index < constants.lsm_levels);
                    return scan.merge_level_pop(level_index);
                },
            }
        }
    };

    break :T k_way_merge.KWayMergeIteratorType(
        ScanTree,
        ScanTree.Key,
        ScanTree.Value,
        .{
            .streams_max = KWayMergeStreams.streams_count,
            .deduplicate = true,
        },
        ScanTree.key_from_value,
        stream.peek,
        stream.pop,
    );
};
120
+
121
+ tree: *Tree,
122
+ buffer: *const ScanBuffer,
123
+
124
+ direction: Direction,
125
+ key_min: Key,
126
+ key_max: Key,
127
+ snapshot: u64,
128
+
129
+ table_mutable_values: []const Value,
130
+ table_immutable_values: []const Value,
131
+
132
+ state: union(ScanState) {
133
+ /// The scan has not been executed yet.
134
+ /// All levels are still uninitialized.
135
+ idle,
136
+
137
+ /// The scan is at a valid position and ready to yield values, e.g. calling `next()`.
138
+ /// All levels are either in the state `.buffered` or `.finished`.
139
+ seeking,
140
+
141
+ /// The scan needs to load data from the LSM levels, e.g. calling `read()`.
142
+ /// At least one level is in the state `.fetching`.
143
+ /// It's also possible for levels to be in the state `.buffered` and `.finished`.
144
+ needs_data,
145
+
146
+ /// The scan is attempting to load data from the LSM levels,
147
+ /// e.g. in between calling `read()` and receiving the callback.
148
+ /// Only levels in the state `.fetching` will load from storage.
149
+ /// It's also possible for levels to be in the state `.buffered` and `.finished`.
150
+ buffering: struct {
151
+ context: Context,
152
+ callback: Callback,
153
+ pending_count: u32,
154
+ },
155
+
156
+ /// The scan was aborted and will not yield any more values.
157
+ aborted,
158
+ },
159
+ levels: [constants.lsm_levels]ScanTreeLevel,
160
+
161
+ merge_iterator: ?KWayMergeIterator,
162
+
163
/// Creates an idle scan over `tree` for the inclusive key range `key_min..key_max`.
/// Sorts the mutable table and narrows both in-memory tables to the key range up front.
/// The levels are left undefined here; `read()` initializes them on the first call.
pub fn init(
    tree: *Tree,
    buffer: *const ScanBuffer,
    snapshot: u64,
    key_min: Key,
    key_max: Key,
    direction: Direction,
) ScanTree {
    assert(key_min <= key_max);

    // TODO We currently assume that the snapshot passed in is the current snapshot.
    // This must be changed when persistent snapshots are implemented.
    tree.table_mutable.sort();
    const mutable_all = tree.table_mutable.values_used();
    const mutable_range = binary_search.binary_search_values_range(
        Key,
        Value,
        key_from_value,
        mutable_all,
        key_min,
        key_max,
    );
    const mutable_in_range: []const Value =
        mutable_all[mutable_range.start..][0..mutable_range.count];

    const immutable_in_range: []const Value = blk: {
        const snapshot_min = tree.table_immutable.mutability.immutable.snapshot_min;
        if (snapshot >= snapshot_min) {
            const immutable_all = tree.table_immutable.values_used();
            const immutable_range = binary_search.binary_search_values_range(
                Key,
                Value,
                key_from_value,
                immutable_all,
                key_min,
                key_max,
            );
            break :blk immutable_all[immutable_range.start..][0..immutable_range.count];
        }

        // The snapshot is older than the immutable table: skip the table entirely.
        break :blk &.{};
    };

    return .{
        .tree = tree,
        .buffer = buffer,
        .direction = direction,
        .key_min = key_min,
        .key_max = key_max,
        .snapshot = snapshot,
        .table_mutable_values = mutable_in_range,
        .table_immutable_values = immutable_in_range,
        .state = .idle,
        .levels = undefined,
        .merge_iterator = null,
    };
}
219
+
220
/// Puts the scan into the `.buffering` state and starts a fetch on every level whose
/// values are in the `.fetching` state, counting each one in `pending_count`.
/// On the first call (state `.idle`) each level is initialized before it is inspected.
/// `context` and `callback` are stored in the state and used by `read_complete()`.
pub fn read(self: *ScanTree, context: Context, callback: Callback) void {
    assert(self.state == .idle or self.state == .needs_data);

    // Remember whether this is the very first read before the state is overwritten.
    const first_read = self.state == .idle;

    self.tree.grid.trace.start(
        .{ .scan_tree = .{
            .index = self.buffer.index,
            .tree = @enumFromInt(self.tree.config.id),
        } },
    );

    self.state = .{
        .buffering = .{
            .context = context,
            .callback = callback,
            .pending_count = 0,
        },
    };

    for (&self.levels, 0..) |*level, level_index| {
        // Levels are only initialized once, during the first read.
        if (first_read) {
            level.init(
                self,
                self.buffer.levels[level_index],
                @intCast(level_index),
            );
        }

        switch (level.values) {
            .fetching => {
                assert(level.state == .loading_manifest or
                    level.state == .loading_index or
                    level.state == .iterating);

                if (level.state == .loading_manifest) level.move_next();
                self.state.buffering.pending_count += 1;
                level.fetch();
            },
            .buffered => {
                // Only possible when resuming, never on the first read.
                assert(level.state == .iterating);
                assert(!first_read);
            },
            .finished => {
                // Only possible when resuming, never on the first read.
                assert(level.state == .finished);
                assert(!first_read);
            },
        }
    }
}
270
+
271
/// Marks the scan as aborted, after which `next()` returns `null`.
/// Must not be called while the scan is buffering.
pub fn abort(self: *ScanTree) void {
    switch (self.state) {
        .buffering => unreachable,
        else => self.state = .aborted,
    }
}
275
+
276
/// Advances the iterator and returns the next `Value`, or `null` once there is nothing
/// left to iterate (including after the scan was aborted).
/// Returns `error.Pending` when data must be loaded first: call `read()` and resume
/// iterating after the read callback fires.
pub fn next(self: *ScanTree) Pending!?Value {
    switch (self.state) {
        .idle => {
            // Nothing was read yet, so the merge iterator cannot exist.
            assert(self.merge_iterator == null);
            return error.Pending;
        },
        .seeking => {
            if (self.merge_iterator.?.pop()) |value_or_null| {
                return value_or_null;
            } else |err| switch (err) {
                error.Pending => {
                    // A stream ran dry: remember that a `read()` is required.
                    self.state = .needs_data;
                    return error.Pending;
                },
            }
        },
        .needs_data => return error.Pending,
        .buffering => unreachable,
        .aborted => return null,
    }
}
297
+
298
/// Narrows the key_min/key_max range and repositions the scan at the next value such that
/// `value.key >= probe_key` (ascending) or `value.key <= probe_key` (descending).
/// The scan may become empty or `Pending` _after_ probing.
/// Should not be called when the current key already matches the `probe_key`.
/// Does nothing on an aborted scan; must not be called while buffering.
pub fn probe(self: *ScanTree, probe_key: Key) void {
    switch (self.state) {
        .aborted => return,
        .buffering => unreachable,
        .idle, .seeking, .needs_data => {},
    }

    // A probe outside the current range never widens it. When the key lies beyond
    // the end the scan is heading towards, it can never be reached: abort the scan.
    if (probe_key < self.key_min) {
        switch (self.direction) {
            .descending => self.abort(),
            .ascending => {},
        }
        return;
    }
    if (self.key_max < probe_key) {
        switch (self.direction) {
            .ascending => self.abort(),
            .descending => {},
        }
        return;
    }

    // Probing repeatedly with the same `probe_key` is allowed.
    // The range was already set by the earlier probe, so there is nothing to move.
    const range_edge = switch (self.direction) {
        .ascending => self.key_min,
        .descending => self.key_max,
    };
    if (range_edge == probe_key) {
        assert(self.state == .idle or
            self.state == .seeking or
            self.state == .needs_data);
        return;
    }

    // Tighten the edge of the range that the scan starts from.
    switch (self.direction) {
        .ascending => {
            assert(self.key_min < probe_key);
            assert(probe_key <= self.key_max);
            self.key_min = probe_key;
        },
        .descending => {
            assert(probe_key < self.key_max);
            assert(self.key_min <= probe_key);
            self.key_max = probe_key;
        },
    }

    // Re-slice both in-memory tables to the new range; a slice can only shrink.
    const mutable_narrowed: []const Value =
        probe_values(self.direction, self.table_mutable_values, probe_key);
    assert(mutable_narrowed.len <= self.table_mutable_values.len);
    self.table_mutable_values = mutable_narrowed;

    const immutable_narrowed: []const Value =
        probe_values(self.direction, self.table_immutable_values, probe_key);
    assert(immutable_narrowed.len <= self.table_immutable_values.len);
    self.table_immutable_values = immutable_narrowed;

    // An idle scan has neither initialized levels nor a merge iterator to update.
    if (self.state == .idle) return;
    assert(self.state == .seeking or self.state == .needs_data);

    // Forward the probe to every level.
    for (&self.levels) |*level| {
        level.probe(probe_key);
    }

    // Probing is not expected on a scan that already produced a key equal to
    // or ahead of the probe.
    const iterator = &self.merge_iterator.?;
    if (iterator.key_popped) |key_popped| {
        assert(switch (self.direction) {
            .ascending => key_popped < probe_key,
            .descending => key_popped > probe_key,
        });
    }

    // The underlying streams changed, so the merge iterator must drop its state;
    // otherwise it could keep stale keys buffered.
    iterator.reset();
}
373
+
374
/// Accounts for one finished level read. Once no reads remain pending,
/// completes the whole read through `read_complete()`.
fn levels_read_complete(self: *ScanTree) void {
    assert(self.state == .buffering);

    const buffering = &self.state.buffering;
    assert(buffering.pending_count > 0);

    buffering.pending_count -= 1;
    if (buffering.pending_count > 0) return;

    self.read_complete();
}
381
+
382
/// Called when the next value block of each level is available.
/// Moves the scan to `.seeking`, creates the merge iterator if it does not exist yet,
/// stops the trace started by `read()`, and invokes the stored callback.
fn read_complete(self: *ScanTree) void {
    assert(self.state == .buffering);

    // Copy the callback data out before the state union is overwritten below.
    const pending = self.state.buffering;
    assert(pending.pending_count == 0);

    self.state = .seeking;

    if (self.merge_iterator == null) {
        self.merge_iterator = KWayMergeIterator.init(
            self,
            KWayMergeStreams.streams_count,
            self.direction,
        );
    }

    self.tree.grid.trace.stop(
        .{ .scan_tree = .{
            .index = self.buffer.index,
            .tree = @enumFromInt(self.tree.config.id),
        } },
    );

    pending.callback(pending.context, self);
}
408
+
409
/// Peeks the next key of the mutable table stream, in the scan direction.
fn merge_table_mutable_peek(self: *const ScanTree) Pending!?Key {
    const values = self.table_mutable_values;
    return table_memory_peek(self, values);
}
412
+
413
/// Peeks the next key of the immutable table stream, in the scan direction.
fn merge_table_immutable_peek(self: *const ScanTree) Pending!?Key {
    const values = self.table_immutable_values;
    return table_memory_peek(self, values);
}
416
+
417
/// Pops the next value of the mutable table stream, shrinking `table_mutable_values`.
fn merge_table_mutable_pop(self: *ScanTree) Value {
    return self.table_memory_pop(&self.table_mutable_values);
}
420
+
421
/// Pops the next value of the immutable table stream, shrinking `table_immutable_values`.
fn merge_table_immutable_pop(self: *ScanTree) Value {
    return self.table_memory_pop(&self.table_immutable_values);
}
424
+
425
/// Returns the key of the value that would be popped next from `values`,
/// or `null` when `values` is empty.
/// The next value is the first element when ascending and the last when descending.
inline fn table_memory_peek(
    self: *const ScanTree,
    values: []const Value,
) Pending!?Key {
    assert(self.state == .seeking);
    if (values.len == 0) return null;

    const next_index: usize = switch (self.direction) {
        .ascending => 0,
        .descending => values.len - 1,
    };
    return key_from_value(&values[next_index]);
}
441
+
442
/// Removes and returns the next value from the slice behind `field_reference`.
/// The slice must not be empty. The backing field is re-sliced to exclude the
/// popped value: from the front when ascending, from the back when descending.
inline fn table_memory_pop(
    self: *ScanTree,
    field_reference: *[]const Value,
) Value {
    assert(self.state == .seeking);

    const remaining: []const Value = field_reference.*;
    assert(remaining.len > 0);
    const last = remaining.len - 1;

    // TableMemory already deduplicates, so the popped value and its neighbour
    // (when there is one) must have different keys.
    switch (self.direction) {
        .ascending => {
            if (remaining.len > 1) {
                assert(key_from_value(&remaining[0]) != key_from_value(&remaining[1]));
            }

            const popped = remaining[0];
            field_reference.* = remaining[1..];
            return popped;
        },
        .descending => {
            if (remaining.len > 1) {
                assert(key_from_value(&remaining[last]) !=
                    key_from_value(&remaining[last - 1]));
            }

            const popped = remaining[last];
            field_reference.* = remaining[0..last];
            return popped;
        },
    }
}
475
+
476
/// Peeks the next key of the level at `level_index`; forwards to that level's `peek()`.
fn merge_level_peek(self: *const ScanTree, level_index: u32) Pending!?Key {
    assert(self.state == .seeking);
    assert(level_index < constants.lsm_levels);

    return self.levels[level_index].peek();
}
483
+
484
/// Pops the next value of the level at `level_index`; forwards to that level's `pop()`.
fn merge_level_pop(self: *ScanTree, level_index: u32) Value {
    assert(self.state == .seeking);
    assert(level_index < constants.lsm_levels);

    return self.levels[level_index].pop();
}
491
+
492
/// Narrows `values` to the part that is still reachable once the scan is probed at `key`.
///
/// - Ascending: keeps the suffix starting at the lower-bound index of `key`.
/// - Descending: keeps the prefix ending at the upper-bound index of `key`,
///   including the element at that index when its key is `<= key`.
///
/// Returns an empty slice when nothing remains.
fn probe_values(direction: Direction, values: []const Value, key: Key) []const Value {
    switch (direction) {
        .ascending => {
            const start = binary_search.binary_search_values_upsert_index(
                Key,
                Value,
                key_from_value,
                values,
                key,
                .{ .mode = .lower_bound },
            );

            if (start == values.len) return &.{};
            return values[start..];
        },
        .descending => {
            const index = binary_search.binary_search_values_upsert_index(
                Key,
                Value,
                key_from_value,
                values,
                key,
                .{ .mode = .upper_bound },
            );

            // The element at `index` still belongs to the result when it is not past `key`.
            const include_index =
                index < values.len and key_from_value(&values[index]) <= key;
            const end = if (include_index) index + 1 else index;

            if (end == 0) return &.{};
            return values[0..end];
        },
    }
}
525
+ };
526
+ }
527
+
528
+ /// Scans a range of keys over a single LSM Level, in ascending or descending order.
529
+ ///
530
+ /// 1. Iterate over the in-memory manifest to find the next `table_info` that might
531
+ /// contain the key range.
532
+ /// 2. Load the `index_block` of the selected `table_info`.
533
+ /// 3. Perform a binary search on the `index_block` to retrieve an array of addresses
534
+ /// and checksums of all `value_block`s that might contain the key range.
535
+ /// 4. Load a `value_block` from the address/checksum array (in ascending or descending order).
536
+ /// 5. Perform a binary search on the `value_block` and buffer the entries that match
537
+ /// the key range.
538
+ /// 6. When the buffer is consumed, repeat step [4] for loading the next `value_block`,
539
+ /// or, if there are no more `value_block`s in the current `index_block`,
540
+ /// repeat step [1] for the next `table_info`.
541
+ fn ScanTreeLevelType(comptime ScanTree: type, comptime Storage: type) type {
542
+ return struct {
543
+ const ScanTreeLevel = @This();
544
+
545
+ const Grid = GridType(Storage);
546
+ const TableValueIterator = TableValueIteratorType(Storage);
547
+
548
+ const TableInfo = ScanTree.TableInfo;
549
+ const Manifest = ScanTree.Manifest;
550
+
551
+ const Table = ScanTree.Table;
552
+ const Key = Table.Key;
553
+ const Value = Table.Value;
554
+ const key_from_value = Table.key_from_value;
555
+
556
+ scan: *ScanTree,
557
+ level_index: u8,
558
+ buffer: ScanBuffer.LevelBuffer,
559
+
560
+ state: union(enum) {
561
+ loading_manifest,
562
+ loading_index: struct {
563
+ key_exclusive_next: Key,
564
+ address: u64,
565
+ checksum: u128,
566
+ read: Grid.Read = undefined,
567
+ },
568
+ iterating: struct {
569
+ key_exclusive_next: Key,
570
+ values: union(enum) {
571
+ none,
572
+ iterator: TableValueIterator,
573
+ },
574
+ },
575
+ finished: struct {
576
+ next_tick: Grid.NextTick = undefined,
577
+ },
578
+ },
579
+
580
+ values: union(enum) {
581
+ fetching,
582
+ buffered: []const Value,
583
+ finished,
584
+ },
585
+
586
/// Initializes the level scanner in place.
/// It starts in `.loading_manifest` with `values == .fetching`, i.e. nothing buffered yet.
pub fn init(
    self: *ScanTreeLevel,
    scan: *ScanTree,
    buffer: ScanBuffer.LevelBuffer,
    level_index: u8,
) void {
    assert(level_index < constants.lsm_levels);

    self.* = .{
        .scan = scan,
        .level_index = level_index,
        .buffer = buffer,
        .state = .loading_manifest,
        .values = .fetching,
    };
}
601
+
602
/// Starts the asynchronous step that matches the level's current state, while the
/// owning scan is `.buffering`. Opens a `scan_tree_level` trace event first.
///
/// - `.loading_index`: reads the table's index block (`index_block_callback`).
/// - `.iterating`: reads the next value block (`value_block_callback`).
/// - `.finished`: nothing to read; schedules `finished_callback` on the next tick.
///
/// Must not be called in `.loading_manifest`.
pub fn fetch(self: *ScanTreeLevel) void {
    const scan = self.scan;
    assert(scan.state == .buffering);

    scan.tree.grid.trace.start(.{ .scan_tree_level = .{
        .index = scan.buffer.index,
        .level = self.level_index,
        .tree = @enumFromInt(scan.tree.config.id),
    } });

    switch (self.state) {
        .loading_manifest => unreachable,
        .loading_index => |*index_load| {
            assert(self.values == .fetching);

            // Read the index block of the selected table.
            scan.tree.grid.read_block(
                .{ .from_local_or_global_storage = index_block_callback },
                &index_load.read,
                index_load.address,
                index_load.checksum,
                .{ .cache_read = true, .cache_write = true },
            );
        },
        .iterating => |*iteration| {
            assert(self.values == .fetching);
            assert(iteration.values == .iterator);

            const iterator = &iteration.values.iterator;
            assert(!iterator.empty());
            iterator.next_value_block(value_block_callback);
        },
        .finished => |*done| {
            assert(self.values == .finished);
            scan.tree.grid.on_next_tick(finished_callback, &done.next_tick);
        },
    }
}
641
+
642
/// Returns the key of the next buffered value without consuming it.
///
/// - `error.Pending` while the level is still fetching.
/// - `null` once the level is finished.
/// - Otherwise the key of the first (ascending) or last (descending) buffered value.
pub fn peek(self: *const ScanTreeLevel) Pending!?Key {
    // `peek` can be called in any state during `seeking`.
    assert(self.state == .loading_manifest or
        self.state == .loading_index or
        self.state == .iterating or
        self.state == .finished);
    assert(self.scan.state == .seeking);

    switch (self.values) {
        .fetching => return error.Pending,
        .finished => return null,
        .buffered => |buffered| {
            assert(buffered.len > 0);

            // The buffered slice must start within the level's value block buffer.
            const buffered_address = @intFromPtr(buffered.ptr);
            const block_address = @intFromPtr(self.buffer.value_block);
            assert(buffered_address >= block_address);
            assert(buffered_address <= block_address + self.buffer.value_block.len);

            const next_index: usize = switch (self.scan.direction) {
                .ascending => 0,
                .descending => buffered.len - 1,
            };
            return key_from_value(&buffered[next_index]);
        },
    }
}
669
+
670
/// Consumes and returns the next buffered value: the first one when ascending,
/// the last one when descending. Requires `values == .buffered` and a non-empty buffer.
///
/// When the buffer becomes empty, `move_next()` advances the level to the next
/// `value_block` or `table_info`.
pub fn pop(self: *ScanTreeLevel) Value {
    maybe(self.state == .loading_manifest or
        self.state == .iterating or
        self.state == .finished);
    assert(self.values == .buffered);
    assert(self.scan.state == .seeking);

    const buffered: []const Value = self.values.buffered;
    assert(buffered.len > 0);
    assert(@intFromPtr(buffered.ptr) >= @intFromPtr(self.buffer.value_block));
    assert(@intFromPtr(buffered.ptr) <=
        @intFromPtr(self.buffer.value_block) + self.buffer.value_block.len);

    const last = buffered.len - 1;
    const popped: Value = switch (self.scan.direction) {
        .ascending => buffered[0],
        .descending => buffered[last],
    };
    const rest: []const Value = switch (self.scan.direction) {
        .ascending => buffered[1..],
        .descending => buffered[0..last],
    };

    self.values = .{ .buffered = rest };
    if (rest.len == 0) {
        // Moving to the next `value_block` or `table_info`.
        // This will cause the next `peek()` to return `Pending`.
        self.move_next();
    }

    return popped;
}
705
+
706
/// Repositions the level at `probe_key`.
///
/// Buffered values are narrowed with `ScanTree.probe_values`. If nothing remains,
/// the level advances through `move_next()`. Whenever the level ends up in
/// `.fetching`, its state is reset to `.loading_manifest`. A finished level is
/// left untouched.
pub fn probe(self: *ScanTreeLevel, probe_key: Key) void {
    maybe(self.state == .loading_manifest or
        self.state == .iterating or
        self.state == .finished);

    switch (self.values) {
        .finished => {
            assert(self.state == .finished);
            return;
        },
        .fetching => {},
        .buffered => |buffered| {
            assert(buffered.len > 0);
            const remaining: []const Value = ScanTree.probe_values(
                self.scan.direction,
                buffered,
                probe_key,
            );

            if (remaining.len > 0) {
                // The next exclusive key must be ahead of (or equal to) the probe key,
                // so the level iterator state can be preserved without reading the
                // index block again.
                if (self.state == .iterating) {
                    const key_exclusive_next = self.state.iterating.key_exclusive_next;
                    switch (self.scan.direction) {
                        .ascending => assert(key_exclusive_next >= probe_key),
                        .descending => assert(key_exclusive_next <= probe_key),
                    }
                }

                self.values = .{ .buffered = remaining };
            } else {
                // Moving to the next `value_block` or `table_info`.
                // This will cause the next `peek()` to return `Pending`.
                self.move_next();
            }
        },
    }

    if (self.values != .fetching) return;

    // The key couldn't be found in the buffered data.
    // The level iterator must read the index block again from the new key range.
    //
    // TODO: We may use the already buffered `index_block` to check if the key
    // is present in other value blocks within the same table, advancing the level
    // iterator instead of calling `move_next()`.
    // However, it's most likely the index block is still in the grid cache, so this
    // may not represent any real improvement.
    self.state = .loading_manifest;
}
759
+
760
/// Advances to the next `value_block` or `table_info`, depending on the current state.
///
/// - `.loading_manifest`: selects a table from the manifest with no exclusive key.
/// - `.iterating` with value blocks left: only marks `values` as `.fetching`.
/// - `.iterating` with the table exhausted: finishes the level when the table's
///   exclusive key lies beyond the scan range, otherwise selects the next table.
///
/// Must not be called in `.loading_index` or `.finished`.
fn move_next(self: *ScanTreeLevel) void {
    assert(self.values == .fetching or
        self.values == .buffered);

    switch (self.state) {
        .loading_index, .finished => unreachable,
        .loading_manifest => self.move_next_manifest_table(null),
        .iterating => |*iterating| {
            const table_exhausted = iterating.values == .none or
                iterating.values.iterator.empty();
            if (!table_exhausted) {
                // Keep iterating to the next `value_block`.
                self.values = .fetching;
                return;
            }

            const scan = self.scan;
            const key_exclusive_next = iterating.key_exclusive_next;

            // If the next key is out of the range,
            // there are no more `table_info`s to scan next.
            const range_exhausted = switch (scan.direction) {
                .ascending => key_exclusive_next > scan.key_max,
                .descending => key_exclusive_next < scan.key_min,
            };
            if (range_exhausted) {
                self.state = .{ .finished = .{} };
                self.values = .finished;
                return;
            }

            // Load the next `table_info`.
            self.state = .loading_manifest;
            self.values = .fetching;

            // A probe() skipped past the last table we iterated, so our
            // key_exclusive_next is now out of bounds, superseded by the
            // tightened key_min (ascending) or key_max (descending) bound.
            const superseded = switch (scan.direction) {
                .ascending => key_exclusive_next < scan.key_min,
                .descending => key_exclusive_next > scan.key_max,
            };
            self.move_next_manifest_table(if (superseded) null else key_exclusive_next);
        },
    }
}
806
+
807
/// Moves the iterator to the next `table_info` that might contain the key range.
///
/// `key_exclusive` is forwarded to `manifest.next_table`. When a table is found, the
/// level switches to `.loading_index` for that table; otherwise the level is finished.
fn move_next_manifest_table(
    self: *ScanTreeLevel,
    key_exclusive: ?Key,
) void {
    assert(self.state == .loading_manifest);
    assert(self.values == .fetching);

    const scan = self.scan;
    assert(scan.state == .seeking or
        scan.state == .needs_data or
        scan.state == .buffering);

    const manifest: *Manifest = &scan.tree.manifest;
    const table_info = manifest.next_table(.{
        .level = self.level_index,
        .snapshot = scan.snapshot,
        .key_min = scan.key_min,
        .key_max = scan.key_max,
        .key_exclusive = key_exclusive,
        .direction = scan.direction,
    }) orelse {
        // No more tables on this level for the key range.
        self.state = .{ .finished = .{} };
        self.values = .finished;
        return;
    };

    self.state = .{
        .loading_index = .{
            // The last key of the table, depending on the direction.
            .key_exclusive_next = switch (scan.direction) {
                .ascending => table_info.key_max,
                .descending => table_info.key_min,
            },
            .address = table_info.address,
            .checksum = table_info.checksum,
        },
    };
    self.values = .fetching;
}
847
+
848
/// Completion callback for the index block read started by `fetch()`.
///
/// Copies the index block into the level buffer and searches it for the value
/// blocks that may overlap `[scan.key_min, scan.key_max]`. It then either starts
/// iterating over those value blocks or, when none qualify, moves on to the next
/// table and fetches again.
fn index_block_callback(
    read: *Grid.Read,
    index_block: BlockPtrConst,
) void {
    // Recover the owning level from the `read` embedded in its `loading_index` state.
    const StateUnion = @FieldType(ScanTreeLevel, "state");
    const LoadingIndexPayload = @FieldType(StateUnion, "loading_index");
    const payload: *LoadingIndexPayload = @fieldParentPtr("read", read);
    const state_ptr: *StateUnion = @fieldParentPtr("loading_index", payload);
    const self: *ScanTreeLevel = @fieldParentPtr("state", state_ptr);
    const scan = self.scan;

    assert(self.state == .loading_index);
    assert(self.values == .fetching);
    assert(scan.state == .buffering);
    assert(scan.state.buffering.pending_count > 0);

    // `index_block` is only valid for this callback, so copy its contents.
    stdx.copy_disjoint(.exact, u8, self.buffer.index_block, index_block);

    const Range = struct { start: u32, end: u32 };
    const range_found: ?Range = range: {
        const keys_max = Table.index_value_keys_used(self.buffer.index_block, .key_max);
        const keys_min = Table.index_value_keys_used(self.buffer.index_block, .key_min);

        // The `index_block` *might* contain the key range,
        // otherwise, it shouldn't have been returned by the manifest.
        assert(keys_min.len > 0 and keys_max.len > 0);
        assert(keys_min.len == keys_max.len);
        assert(keys_min[0] <= scan.key_max and
            scan.key_min <= keys_max[keys_max.len - 1]);

        const indexes = binary_search.binary_search_keys_range_upsert_indexes(
            Key,
            keys_max,
            scan.key_min,
            scan.key_max,
        );

        // The key range was not found.
        if (indexes.start == keys_max.len) break :range null;

        // The search runs over `key_max`: without an exact match it yields the index
        // of the next greater key, whose block may still contain the key depending on
        // that block's `key_min`.
        const include_end =
            indexes.end < keys_max.len and keys_min[indexes.end] <= scan.key_max;
        const end = if (include_end) indexes.end + 1 else indexes.end;

        // TODO: Secondary indexes are keyed by `Prefix+timestamp` and, unlike
        // monotonic ids/timestamps, they cannot be efficiently filtered by key_min/key_max.
        // This may be a valid use case for bloom filters (by prefix only).
        if (indexes.start == end) break :range null;
        break :range .{
            .start = indexes.start,
            .end = end,
        };
    };

    const index_schema = schema.TableIndex.from(self.buffer.index_block);
    const data_addresses = index_schema.value_addresses_used(self.buffer.index_block);
    const data_checksums = index_schema.value_checksums_used(self.buffer.index_block);
    assert(data_addresses.len == data_checksums.len);

    // Read the old payload into a local before overwriting the state union.
    const key_exclusive_next = self.state.loading_index.key_exclusive_next;
    self.state = .{
        .iterating = .{
            .key_exclusive_next = key_exclusive_next,
            .values = .none,
        },
    };

    const range = range_found orelse {
        // The current `table_info` does not contain the key range,
        // fetching the next `table_info`.
        self.move_next();

        scan.tree.grid.trace.stop(.{ .scan_tree_level = .{
            .index = scan.buffer.index,
            .level = self.level_index,
            .tree = @enumFromInt(scan.tree.config.id),
        } });

        self.fetch();
        return;
    };

    // The iterator is initialized in place, inside the state union.
    self.state.iterating.values = .{ .iterator = undefined };
    const iterator = &self.state.iterating.values.iterator;
    iterator.init(.{
        .grid = scan.tree.grid,
        .addresses = data_addresses[range.start..range.end],
        .checksums = data_checksums[range.start..range.end],
        .direction = scan.direction,
    });
    iterator.next_value_block(value_block_callback);
}
945
+
946
/// Completion callback for a value block read.
///
/// Values inside `[scan.key_min, scan.key_max]` are copied into the level buffer
/// and become the buffered values. If the block has no matching value, the level
/// advances with `move_next()`. Afterwards the level either keeps fetching or
/// reports completion to the scan.
fn value_block_callback(
    iterator: *TableValueIterator,
    value_block: BlockPtrConst,
) void {
    // Recover the owning level from the iterator embedded in its `iterating` state.
    const StateUnion = @FieldType(ScanTreeLevel, "state");
    const IteratingPayload = @FieldType(StateUnion, "iterating");
    const IteratingValues = @FieldType(IteratingPayload, "values");
    const values_ptr: *IteratingValues = @fieldParentPtr("iterator", iterator);
    const payload: *IteratingPayload = @fieldParentPtr("values", values_ptr);
    const state_ptr: *StateUnion = @fieldParentPtr("iterating", payload);
    const self: *ScanTreeLevel = @fieldParentPtr("state", state_ptr);
    const scan = self.scan;

    assert(self.state == .iterating);
    assert(self.values == .fetching);
    assert(scan.state == .buffering);
    assert(scan.state.buffering.pending_count > 0);

    const block_values = Table.value_block_values_used(value_block);
    const range = binary_search.binary_search_values_range(
        Key,
        Value,
        key_from_value,
        block_values,
        scan.key_min,
        scan.key_max,
    );

    if (range.count == 0) {
        // The `value_block` *might* contain the key range, otherwise it shouldn't
        // have been returned by the iterator: with no match, the scan range must
        // lie strictly between the block's first and last keys.
        const key_first = key_from_value(&block_values[0]);
        const key_last = key_from_value(&block_values[block_values.len - 1]);
        assert(key_first < scan.key_min and
            scan.key_max < key_last);

        // Keep fetching if there are more value blocks on this table,
        // or move to the next table otherwise.
        self.move_next();
    } else {
        // The buffer is a whole grid block, but only the matching values are
        // copied to save memory bandwidth. The buffer `value_block` does not
        // follow the block layout (e.g. header + values).
        const destination: []Value = std.mem.bytesAsSlice(Value, self.buffer.value_block);
        const matched = destination[0..range.count];
        stdx.copy_disjoint(
            .exact,
            Value,
            matched,
            block_values[range.start..][0..range.count],
        );

        // Found values that match the range query.
        self.values = .{ .buffered = matched };
    }

    scan.tree.grid.trace.stop(.{ .scan_tree_level = .{
        .index = scan.buffer.index,
        .level = self.level_index,
        .tree = @enumFromInt(scan.tree.config.id),
    } });

    if (self.values == .fetching) {
        self.fetch();
    } else {
        // `.buffered` or `.finished`: this level is done for the current round.
        scan.levels_read_complete();
    }
}
1012
+
1013
/// Next-tick callback scheduled by `fetch()` for a level that is already finished.
/// Closes the level's trace event and reports completion to the scan.
fn finished_callback(next_tick: *Grid.NextTick) void {
    // Recover the owning level from the `next_tick` embedded in its `finished` state.
    const StateUnion = @FieldType(ScanTreeLevel, "state");
    const FinishedPayload = @FieldType(StateUnion, "finished");
    const payload: *FinishedPayload = @fieldParentPtr("next_tick", next_tick);
    const state_ptr: *StateUnion = @alignCast(@fieldParentPtr("finished", payload));
    const self: *ScanTreeLevel = @fieldParentPtr("state", state_ptr);
    const scan = self.scan;

    assert(self.state == .finished);
    assert(self.values == .finished);
    assert(scan.state == .buffering);
    assert(scan.state.buffering.pending_count > 0);

    scan.tree.grid.trace.stop(.{ .scan_tree_level = .{
        .index = scan.buffer.index,
        .level = self.level_index,
        .tree = @enumFromInt(scan.tree.config.id),
    } });

    scan.levels_read_complete();
}
1035
+ };
1036
+ }