temporalio 0.0.2 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (320) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +25 -23
  3. data/bridge/Cargo.lock +185 -76
  4. data/bridge/Cargo.toml +6 -4
  5. data/bridge/sdk-core/README.md +19 -6
  6. data/bridge/sdk-core/client/src/lib.rs +215 -39
  7. data/bridge/sdk-core/client/src/metrics.rs +17 -8
  8. data/bridge/sdk-core/client/src/raw.rs +4 -4
  9. data/bridge/sdk-core/client/src/retry.rs +32 -20
  10. data/bridge/sdk-core/core/Cargo.toml +22 -9
  11. data/bridge/sdk-core/core/src/abstractions.rs +203 -14
  12. data/bridge/sdk-core/core/src/core_tests/activity_tasks.rs +76 -41
  13. data/bridge/sdk-core/core/src/core_tests/determinism.rs +165 -2
  14. data/bridge/sdk-core/core/src/core_tests/local_activities.rs +204 -83
  15. data/bridge/sdk-core/core/src/core_tests/queries.rs +3 -4
  16. data/bridge/sdk-core/core/src/core_tests/workers.rs +1 -3
  17. data/bridge/sdk-core/core/src/core_tests/workflow_tasks.rs +397 -54
  18. data/bridge/sdk-core/core/src/ephemeral_server/mod.rs +106 -12
  19. data/bridge/sdk-core/core/src/internal_flags.rs +136 -0
  20. data/bridge/sdk-core/core/src/lib.rs +16 -9
  21. data/bridge/sdk-core/core/src/telemetry/log_export.rs +1 -1
  22. data/bridge/sdk-core/core/src/telemetry/metrics.rs +69 -35
  23. data/bridge/sdk-core/core/src/telemetry/mod.rs +29 -13
  24. data/bridge/sdk-core/core/src/telemetry/prometheus_server.rs +17 -12
  25. data/bridge/sdk-core/core/src/test_help/mod.rs +62 -12
  26. data/bridge/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +112 -156
  27. data/bridge/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  28. data/bridge/sdk-core/core/src/worker/activities/local_activities.rs +352 -122
  29. data/bridge/sdk-core/core/src/worker/activities.rs +233 -157
  30. data/bridge/sdk-core/core/src/worker/client/mocks.rs +22 -2
  31. data/bridge/sdk-core/core/src/worker/client.rs +18 -2
  32. data/bridge/sdk-core/core/src/worker/mod.rs +165 -58
  33. data/bridge/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  34. data/bridge/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  35. data/bridge/sdk-core/core/src/worker/workflow/history_update.rs +856 -277
  36. data/bridge/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +100 -43
  37. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +7 -7
  38. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +5 -4
  39. data/bridge/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +87 -27
  40. data/bridge/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +5 -4
  41. data/bridge/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +5 -4
  42. data/bridge/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +5 -4
  43. data/bridge/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +137 -62
  44. data/bridge/sdk-core/core/src/worker/workflow/machines/mod.rs +25 -17
  45. data/bridge/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +7 -6
  46. data/bridge/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +103 -152
  47. data/bridge/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +7 -7
  48. data/bridge/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +9 -9
  49. data/bridge/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  50. data/bridge/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +14 -7
  51. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +5 -16
  52. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +201 -121
  53. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +11 -14
  54. data/bridge/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +30 -15
  55. data/bridge/sdk-core/core/src/worker/workflow/managed_run.rs +1026 -376
  56. data/bridge/sdk-core/core/src/worker/workflow/mod.rs +460 -384
  57. data/bridge/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  58. data/bridge/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  59. data/bridge/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  60. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  61. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  62. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs +448 -718
  63. data/bridge/sdk-core/core-api/Cargo.toml +2 -1
  64. data/bridge/sdk-core/core-api/src/errors.rs +1 -34
  65. data/bridge/sdk-core/core-api/src/lib.rs +6 -2
  66. data/bridge/sdk-core/core-api/src/telemetry.rs +0 -6
  67. data/bridge/sdk-core/core-api/src/worker.rs +14 -1
  68. data/bridge/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +18 -15
  69. data/bridge/sdk-core/fsm/rustfsm_trait/src/lib.rs +8 -3
  70. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  71. data/bridge/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
  72. data/bridge/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
  73. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
  74. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
  75. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
  76. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
  77. data/bridge/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
  78. data/bridge/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
  79. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
  80. data/bridge/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
  81. data/bridge/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  82. data/bridge/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
  83. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
  84. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  85. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +1 -0
  86. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +1 -0
  87. data/bridge/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +1 -0
  88. data/bridge/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  89. data/bridge/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  90. data/bridge/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +1 -0
  91. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +7 -0
  92. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +1 -0
  93. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +6 -0
  94. data/bridge/sdk-core/sdk/Cargo.toml +3 -2
  95. data/bridge/sdk-core/sdk/src/lib.rs +87 -20
  96. data/bridge/sdk-core/sdk/src/workflow_future.rs +9 -8
  97. data/bridge/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  98. data/bridge/sdk-core/sdk-core-protos/build.rs +36 -1
  99. data/bridge/sdk-core/sdk-core-protos/src/history_builder.rs +100 -87
  100. data/bridge/sdk-core/sdk-core-protos/src/history_info.rs +5 -1
  101. data/bridge/sdk-core/sdk-core-protos/src/lib.rs +175 -57
  102. data/bridge/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  103. data/bridge/sdk-core/test-utils/Cargo.toml +3 -1
  104. data/bridge/sdk-core/test-utils/src/canned_histories.rs +106 -296
  105. data/bridge/sdk-core/test-utils/src/histfetch.rs +1 -1
  106. data/bridge/sdk-core/test-utils/src/lib.rs +82 -23
  107. data/bridge/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  108. data/bridge/sdk-core/test-utils/src/workflows.rs +29 -0
  109. data/bridge/sdk-core/tests/fuzzy_workflow.rs +130 -0
  110. data/bridge/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  111. data/bridge/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  112. data/bridge/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
  113. data/bridge/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  114. data/bridge/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
  115. data/bridge/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  116. data/bridge/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  117. data/bridge/sdk-core/tests/integ_tests/workflow_tests/activities.rs +93 -69
  118. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  119. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  120. data/bridge/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +1 -0
  121. data/bridge/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  122. data/bridge/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  123. data/bridge/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +72 -191
  124. data/bridge/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +1 -0
  125. data/bridge/sdk-core/tests/integ_tests/workflow_tests/patches.rs +7 -28
  126. data/bridge/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  127. data/bridge/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  128. data/bridge/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  129. data/bridge/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  130. data/bridge/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  131. data/bridge/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -4
  132. data/bridge/sdk-core/tests/integ_tests/workflow_tests.rs +10 -11
  133. data/bridge/sdk-core/tests/main.rs +3 -13
  134. data/bridge/sdk-core/tests/runner.rs +75 -36
  135. data/bridge/sdk-core/tests/wf_input_replay.rs +32 -0
  136. data/bridge/src/connection.rs +41 -25
  137. data/bridge/src/lib.rs +269 -14
  138. data/bridge/src/runtime.rs +1 -1
  139. data/bridge/src/test_server.rs +153 -0
  140. data/bridge/src/worker.rs +89 -16
  141. data/lib/gen/temporal/api/command/v1/message_pb.rb +4 -18
  142. data/lib/gen/temporal/api/common/v1/message_pb.rb +4 -0
  143. data/lib/gen/temporal/api/enums/v1/command_type_pb.rb +1 -3
  144. data/lib/gen/temporal/api/enums/v1/event_type_pb.rb +3 -3
  145. data/lib/gen/temporal/api/enums/v1/failed_cause_pb.rb +2 -0
  146. data/lib/gen/temporal/api/enums/v1/update_pb.rb +6 -4
  147. data/lib/gen/temporal/api/history/v1/message_pb.rb +27 -19
  148. data/lib/gen/temporal/api/namespace/v1/message_pb.rb +1 -0
  149. data/lib/gen/temporal/api/operatorservice/v1/request_response_pb.rb +3 -0
  150. data/lib/gen/temporal/api/protocol/v1/message_pb.rb +30 -0
  151. data/lib/gen/temporal/api/sdk/v1/task_complete_metadata_pb.rb +23 -0
  152. data/lib/gen/temporal/api/testservice/v1/request_response_pb.rb +49 -0
  153. data/lib/gen/temporal/api/testservice/v1/service_pb.rb +21 -0
  154. data/lib/gen/temporal/api/update/v1/message_pb.rb +72 -0
  155. data/lib/gen/temporal/api/workflowservice/v1/request_response_pb.rb +26 -16
  156. data/lib/gen/temporal/sdk/core/activity_result/activity_result_pb.rb +13 -9
  157. data/lib/gen/temporal/sdk/core/activity_task/activity_task_pb.rb +10 -6
  158. data/lib/gen/temporal/sdk/core/child_workflow/child_workflow_pb.rb +13 -9
  159. data/lib/gen/temporal/sdk/core/common/common_pb.rb +7 -3
  160. data/lib/gen/temporal/sdk/core/core_interface_pb.rb +9 -3
  161. data/lib/gen/temporal/sdk/core/external_data/external_data_pb.rb +7 -3
  162. data/lib/gen/temporal/sdk/core/workflow_activation/workflow_activation_pb.rb +27 -21
  163. data/lib/gen/temporal/sdk/core/workflow_commands/workflow_commands_pb.rb +28 -24
  164. data/lib/gen/temporal/sdk/core/workflow_completion/workflow_completion_pb.rb +12 -5
  165. data/lib/temporalio/activity/context.rb +13 -8
  166. data/lib/temporalio/activity/info.rb +1 -1
  167. data/lib/temporalio/bridge/connect_options.rb +15 -0
  168. data/lib/temporalio/bridge/retry_config.rb +24 -0
  169. data/lib/temporalio/bridge/tls_options.rb +19 -0
  170. data/lib/temporalio/bridge.rb +1 -1
  171. data/lib/temporalio/client/implementation.rb +8 -8
  172. data/lib/temporalio/connection/retry_config.rb +44 -0
  173. data/lib/temporalio/connection/service.rb +20 -0
  174. data/lib/temporalio/connection/test_service.rb +92 -0
  175. data/lib/temporalio/connection/tls_options.rb +51 -0
  176. data/lib/temporalio/connection/workflow_service.rb +731 -0
  177. data/lib/temporalio/connection.rb +55 -720
  178. data/lib/temporalio/interceptor/activity_inbound.rb +22 -0
  179. data/lib/temporalio/interceptor/activity_outbound.rb +24 -0
  180. data/lib/temporalio/interceptor/chain.rb +5 -5
  181. data/lib/temporalio/interceptor/client.rb +8 -4
  182. data/lib/temporalio/interceptor.rb +22 -0
  183. data/lib/temporalio/retry_policy.rb +13 -3
  184. data/lib/temporalio/testing/time_skipping_handle.rb +32 -0
  185. data/lib/temporalio/testing/time_skipping_interceptor.rb +23 -0
  186. data/lib/temporalio/testing/workflow_environment.rb +112 -0
  187. data/lib/temporalio/testing.rb +175 -0
  188. data/lib/temporalio/version.rb +1 -1
  189. data/lib/temporalio/worker/activity_runner.rb +26 -4
  190. data/lib/temporalio/worker/activity_worker.rb +44 -18
  191. data/lib/temporalio/worker/sync_worker.rb +47 -11
  192. data/lib/temporalio/worker.rb +27 -21
  193. data/lib/temporalio/workflow/async.rb +46 -0
  194. data/lib/temporalio/workflow/future.rb +138 -0
  195. data/lib/temporalio/workflow/info.rb +76 -0
  196. data/lib/thermite_patch.rb +10 -0
  197. data/sig/async.rbs +17 -0
  198. data/sig/protobuf.rbs +16 -0
  199. data/sig/protos/dependencies/gogoproto/gogo.rbs +914 -0
  200. data/sig/protos/google/protobuf/any.rbs +157 -0
  201. data/sig/protos/google/protobuf/descriptor.rbs +2825 -0
  202. data/sig/protos/google/protobuf/duration.rbs +114 -0
  203. data/sig/protos/google/protobuf/empty.rbs +36 -0
  204. data/sig/protos/google/protobuf/timestamp.rbs +145 -0
  205. data/sig/protos/google/protobuf/wrappers.rbs +358 -0
  206. data/sig/protos/temporal/api/batch/v1/message.rbs +300 -0
  207. data/sig/protos/temporal/api/command/v1/message.rbs +1399 -0
  208. data/sig/protos/temporal/api/common/v1/message.rbs +528 -0
  209. data/sig/protos/temporal/api/enums/v1/batch_operation.rbs +79 -0
  210. data/sig/protos/temporal/api/enums/v1/command_type.rbs +68 -0
  211. data/sig/protos/temporal/api/enums/v1/common.rbs +118 -0
  212. data/sig/protos/temporal/api/enums/v1/event_type.rbs +264 -0
  213. data/sig/protos/temporal/api/enums/v1/failed_cause.rbs +277 -0
  214. data/sig/protos/temporal/api/enums/v1/namespace.rbs +108 -0
  215. data/sig/protos/temporal/api/enums/v1/query.rbs +81 -0
  216. data/sig/protos/temporal/api/enums/v1/reset.rbs +44 -0
  217. data/sig/protos/temporal/api/enums/v1/schedule.rbs +72 -0
  218. data/sig/protos/temporal/api/enums/v1/task_queue.rbs +92 -0
  219. data/sig/protos/temporal/api/enums/v1/update.rbs +64 -0
  220. data/sig/protos/temporal/api/enums/v1/workflow.rbs +371 -0
  221. data/sig/protos/temporal/api/errordetails/v1/message.rbs +551 -0
  222. data/sig/protos/temporal/api/failure/v1/message.rbs +581 -0
  223. data/sig/protos/temporal/api/filter/v1/message.rbs +171 -0
  224. data/sig/protos/temporal/api/history/v1/message.rbs +4609 -0
  225. data/sig/protos/temporal/api/namespace/v1/message.rbs +410 -0
  226. data/sig/protos/temporal/api/operatorservice/v1/request_response.rbs +643 -0
  227. data/sig/protos/temporal/api/operatorservice/v1/service.rbs +17 -0
  228. data/sig/protos/temporal/api/protocol/v1/message.rbs +84 -0
  229. data/sig/protos/temporal/api/query/v1/message.rbs +182 -0
  230. data/sig/protos/temporal/api/replication/v1/message.rbs +148 -0
  231. data/sig/protos/temporal/api/schedule/v1/message.rbs +1488 -0
  232. data/sig/protos/temporal/api/sdk/v1/task_complete_metadata.rbs +110 -0
  233. data/sig/protos/temporal/api/taskqueue/v1/message.rbs +486 -0
  234. data/sig/protos/temporal/api/testservice/v1/request_response.rbs +249 -0
  235. data/sig/protos/temporal/api/testservice/v1/service.rbs +15 -0
  236. data/sig/protos/temporal/api/update/v1/message.rbs +489 -0
  237. data/sig/protos/temporal/api/version/v1/message.rbs +184 -0
  238. data/sig/protos/temporal/api/workflow/v1/message.rbs +824 -0
  239. data/sig/protos/temporal/api/workflowservice/v1/request_response.rbs +7250 -0
  240. data/sig/protos/temporal/api/workflowservice/v1/service.rbs +22 -0
  241. data/sig/protos/temporal/sdk/core/activity_result/activity_result.rbs +380 -0
  242. data/sig/protos/temporal/sdk/core/activity_task/activity_task.rbs +386 -0
  243. data/sig/protos/temporal/sdk/core/child_workflow/child_workflow.rbs +323 -0
  244. data/sig/protos/temporal/sdk/core/common/common.rbs +62 -0
  245. data/sig/protos/temporal/sdk/core/core_interface.rbs +101 -0
  246. data/sig/protos/temporal/sdk/core/external_data/external_data.rbs +119 -0
  247. data/sig/protos/temporal/sdk/core/workflow_activation/workflow_activation.rbs +1473 -0
  248. data/sig/protos/temporal/sdk/core/workflow_commands/workflow_commands.rbs +1784 -0
  249. data/sig/protos/temporal/sdk/core/workflow_completion/workflow_completion.rbs +180 -0
  250. data/sig/ruby.rbs +12 -0
  251. data/sig/temporalio/activity/context.rbs +29 -0
  252. data/sig/temporalio/activity/info.rbs +43 -0
  253. data/sig/temporalio/activity.rbs +19 -0
  254. data/sig/temporalio/bridge/connect_options.rbs +19 -0
  255. data/sig/temporalio/bridge/error.rbs +8 -0
  256. data/sig/temporalio/bridge/retry_config.rbs +21 -0
  257. data/sig/temporalio/bridge/tls_options.rbs +17 -0
  258. data/sig/temporalio/bridge.rbs +71 -0
  259. data/sig/temporalio/client/implementation.rbs +38 -0
  260. data/sig/temporalio/client/workflow_handle.rbs +41 -0
  261. data/sig/temporalio/client.rbs +35 -0
  262. data/sig/temporalio/connection/retry_config.rbs +37 -0
  263. data/sig/temporalio/connection/service.rbs +14 -0
  264. data/sig/temporalio/connection/test_service.rbs +13 -0
  265. data/sig/temporalio/connection/tls_options.rbs +43 -0
  266. data/sig/temporalio/connection/workflow_service.rbs +48 -0
  267. data/sig/temporalio/connection.rbs +30 -0
  268. data/sig/temporalio/data_converter.rbs +35 -0
  269. data/sig/temporalio/error/failure.rbs +121 -0
  270. data/sig/temporalio/error/workflow_failure.rbs +9 -0
  271. data/sig/temporalio/errors.rbs +36 -0
  272. data/sig/temporalio/failure_converter/base.rbs +12 -0
  273. data/sig/temporalio/failure_converter/basic.rbs +86 -0
  274. data/sig/temporalio/failure_converter.rbs +5 -0
  275. data/sig/temporalio/interceptor/activity_inbound.rbs +21 -0
  276. data/sig/temporalio/interceptor/activity_outbound.rbs +10 -0
  277. data/sig/temporalio/interceptor/chain.rbs +24 -0
  278. data/sig/temporalio/interceptor/client.rbs +148 -0
  279. data/sig/temporalio/interceptor.rbs +6 -0
  280. data/sig/temporalio/payload_codec/base.rbs +12 -0
  281. data/sig/temporalio/payload_converter/base.rbs +12 -0
  282. data/sig/temporalio/payload_converter/bytes.rbs +9 -0
  283. data/sig/temporalio/payload_converter/composite.rbs +19 -0
  284. data/sig/temporalio/payload_converter/encoding_base.rbs +14 -0
  285. data/sig/temporalio/payload_converter/json.rbs +9 -0
  286. data/sig/temporalio/payload_converter/nil.rbs +9 -0
  287. data/sig/temporalio/payload_converter.rbs +5 -0
  288. data/sig/temporalio/retry_policy.rbs +25 -0
  289. data/sig/temporalio/retry_state.rbs +20 -0
  290. data/sig/temporalio/runtime.rbs +12 -0
  291. data/sig/temporalio/testing/time_skipping_handle.rbs +15 -0
  292. data/sig/temporalio/testing/time_skipping_interceptor.rbs +13 -0
  293. data/sig/temporalio/testing/workflow_environment.rbs +22 -0
  294. data/sig/temporalio/testing.rbs +35 -0
  295. data/sig/temporalio/timeout_type.rbs +15 -0
  296. data/sig/temporalio/version.rbs +3 -0
  297. data/sig/temporalio/worker/activity_runner.rbs +35 -0
  298. data/sig/temporalio/worker/activity_worker.rbs +44 -0
  299. data/sig/temporalio/worker/reactor.rbs +22 -0
  300. data/sig/temporalio/worker/runner.rbs +21 -0
  301. data/sig/temporalio/worker/sync_worker.rbs +23 -0
  302. data/sig/temporalio/worker/thread_pool_executor.rbs +23 -0
  303. data/sig/temporalio/worker.rbs +46 -0
  304. data/sig/temporalio/workflow/async.rbs +9 -0
  305. data/sig/temporalio/workflow/execution_info.rbs +55 -0
  306. data/sig/temporalio/workflow/execution_status.rbs +21 -0
  307. data/sig/temporalio/workflow/future.rbs +40 -0
  308. data/sig/temporalio/workflow/id_reuse_policy.rbs +15 -0
  309. data/sig/temporalio/workflow/info.rbs +55 -0
  310. data/sig/temporalio/workflow/query_reject_condition.rbs +14 -0
  311. data/sig/temporalio.rbs +2 -0
  312. data/sig/thermite_patch.rbs +15 -0
  313. data/temporalio.gemspec +6 -4
  314. metadata +183 -17
  315. data/bridge/sdk-core/Cargo.lock +0 -2606
  316. data/bridge/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
  317. data/lib/bridge.so +0 -0
  318. data/lib/gen/temporal/api/enums/v1/interaction_type_pb.rb +0 -25
  319. data/lib/gen/temporal/api/interaction/v1/message_pb.rb +0 -49
  320. data/lib/gen/temporal/sdk/core/bridge/bridge_pb.rb +0 -222
@@ -1,250 +1,177 @@
1
+ #[cfg(feature = "save_wf_inputs")]
2
+ mod saved_wf_inputs;
3
+ #[cfg(feature = "save_wf_inputs")]
4
+ mod tonic_status_serde;
5
+
6
+ #[cfg(feature = "save_wf_inputs")]
7
+ pub use saved_wf_inputs::replay_wf_state_inputs;
8
+
1
9
  use crate::{
2
- abstractions::{dbg_panic, stream_when_allowed, MeteredSemaphore},
3
- protosext::ValidPollWFTQResponse,
4
- telemetry::metrics::workflow_worker_type,
5
- worker::{
6
- workflow::{history_update::NextPageToken, run_cache::RunCache, *},
7
- LocalActRequest, LocalActivityResolution, LEGACY_QUERY_ID,
10
+ abstractions::dbg_panic,
11
+ worker::workflow::{
12
+ managed_run::RunUpdateAct,
13
+ run_cache::RunCache,
14
+ wft_extraction::{HistfetchRC, HistoryFetchReq, WFTExtractorOutput},
15
+ *,
8
16
  },
9
17
  MetricsContext,
10
18
  };
11
19
  use futures::{stream, stream::PollNext, Stream, StreamExt};
12
- use std::{collections::VecDeque, fmt::Debug, future, sync::Arc, time::Instant};
13
- use temporal_sdk_core_api::errors::{PollWfError, WFMachinesError};
14
- use temporal_sdk_core_protos::{
15
- coresdk::{
16
- workflow_activation::{
17
- create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
18
- workflow_activation_job,
19
- },
20
- workflow_completion::Failure,
21
- },
22
- temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure as TFailure},
23
- };
24
- use tokio::sync::{mpsc::unbounded_channel, oneshot};
25
- use tokio_stream::wrappers::UnboundedReceiverStream;
20
+ use std::{collections::VecDeque, fmt::Debug, future, sync::Arc};
21
+ use temporal_sdk_core_api::errors::PollWfError;
22
+ use temporal_sdk_core_protos::coresdk::workflow_activation::remove_from_cache::EvictionReason;
26
23
  use tokio_util::sync::CancellationToken;
27
24
  use tracing::{Level, Span};
28
25
 
29
- /// This struct holds all the state needed for tracking what workflow runs are currently cached
30
- /// and how WFTs should be dispatched to them, etc.
26
+ /// This struct holds all the state needed for tracking the state of currently cached workflow runs
27
+ /// and directs all actions which affect them. It is ultimately the top-level arbiter of nearly
28
+ /// everything important relating to workflow state.
31
29
  ///
32
30
  /// See [WFStream::build] for more
33
- pub(crate) struct WFStream {
31
+ pub(super) struct WFStream {
34
32
  runs: RunCache,
35
33
  /// Buffered polls for new runs which need a cache slot to open up before we can handle them
36
34
  buffered_polls_need_cache_slot: VecDeque<PermittedWFT>,
35
+ /// Is filled with runs that we decided need to have their history fetched during state
36
+ /// manipulation. Must be drained after handling each input.
37
+ runs_needing_fetching: VecDeque<HistoryFetchReq>,
37
38
 
38
- /// Client for accessing server for history pagination etc.
39
- client: Arc<dyn WorkerClient>,
40
-
41
- /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
42
- wft_semaphore: MeteredSemaphore,
39
+ history_fetch_refcounter: Arc<HistfetchRC>,
43
40
  shutdown_token: CancellationToken,
44
41
  ignore_evicts_on_shutdown: bool,
45
42
 
46
43
  metrics: MetricsContext,
47
- }
48
- impl WFStream {
49
- fn record_span_fields(&mut self, run_id: &str, span: &Span) {
50
- if let Some(run_handle) = self.runs.get_mut(run_id) {
51
- if let Some(spid) = span.id() {
52
- if run_handle.recorded_span_ids.contains(&spid) {
53
- return;
54
- }
55
- run_handle.recorded_span_ids.insert(spid);
56
-
57
- if let Some(wid) = run_handle.wft.as_ref().map(|wft| &wft.info.wf_id) {
58
- span.record("workflow_id", wid.as_str());
59
- }
60
- }
61
- }
62
- }
63
- }
64
44
 
65
- /// All possible inputs to the [WFStream]
66
- #[derive(derive_more::From, Debug)]
67
- enum WFStreamInput {
68
- NewWft(PermittedWFT),
69
- Local(LocalInput),
70
- /// The stream given to us which represents the poller (or a mock) terminated.
71
- PollerDead,
72
- /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
73
- /// error while polling
74
- PollerError(tonic::Status),
75
- }
76
- impl From<RunUpdateResponse> for WFStreamInput {
77
- fn from(r: RunUpdateResponse) -> Self {
78
- WFStreamInput::Local(LocalInput {
79
- input: LocalInputs::RunUpdateResponse(r.kind),
80
- span: r.span,
81
- })
82
- }
83
- }
84
- /// A non-poller-received input to the [WFStream]
85
- #[derive(derive_more::DebugCustom)]
86
- #[debug(fmt = "LocalInput {{ {:?} }}", input)]
87
- pub(super) struct LocalInput {
88
- pub input: LocalInputs,
89
- pub span: Span,
90
- }
91
- /// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
92
- /// new polls.
93
- #[derive(Debug, derive_more::From)]
94
- pub(super) enum LocalInputs {
95
- Completion(WFActCompleteMsg),
96
- LocalResolution(LocalResolutionMsg),
97
- PostActivation(PostActivationMsg),
98
- RunUpdateResponse(RunUpdateResponseKind),
99
- RequestEviction(RequestEvictMsg),
100
- GetStateInfo(GetStateInfoMsg),
101
- }
102
- impl LocalInputs {
103
- fn run_id(&self) -> Option<&str> {
104
- Some(match self {
105
- LocalInputs::Completion(c) => c.completion.run_id(),
106
- LocalInputs::LocalResolution(lr) => &lr.run_id,
107
- LocalInputs::PostActivation(pa) => &pa.run_id,
108
- LocalInputs::RunUpdateResponse(rur) => rur.run_id(),
109
- LocalInputs::RequestEviction(re) => &re.run_id,
110
- LocalInputs::GetStateInfo(_) => return None,
111
- })
112
- }
113
- }
114
- #[derive(Debug, derive_more::From)]
115
- #[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
116
- enum ExternalPollerInputs {
117
- NewWft(PermittedWFT),
118
- PollerDead,
119
- PollerError(tonic::Status),
120
- }
121
- impl From<ExternalPollerInputs> for WFStreamInput {
122
- fn from(l: ExternalPollerInputs) -> Self {
123
- match l {
124
- ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
125
- ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
126
- ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
127
- }
128
- }
45
+ #[cfg(feature = "save_wf_inputs")]
46
+ wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
129
47
  }
130
-
131
48
  impl WFStream {
132
49
  /// Constructs workflow state management and returns a stream which outputs activations.
133
50
  ///
134
- /// * `external_wfts` is a stream of validated poll responses as returned by a poller (or mock)
135
- /// * `wfts_from_complete` is the recv side of a channel that new WFTs from completions should
136
- /// come down.
51
+ /// * `wft_stream` is a stream of validated poll responses and fetched history pages as returned
52
+ /// by a poller (or mock), via [WFTExtractor].
137
53
  /// * `local_rx` is a stream of actions that workflow state needs to see. Things like
138
- /// completions, local activities finishing, etc. See [LocalInputs].
54
+ /// completions, local activities finishing, etc. See [LocalInputs].
55
+ /// * `local_activity_request_sink` is used to handle outgoing requests to start or cancel
56
+ /// local activities, and may return resolutions that need to be handled immediately.
139
57
  ///
140
- /// These inputs are combined, along with an internal feedback channel for run-specific updates,
141
- /// to form the inputs to a stream of [WFActStreamInput]s. The stream processor then takes
142
- /// action on those inputs, and then may yield activations.
58
+ /// The stream inputs are combined into a stream of [WFActStreamInput]s. The stream processor
59
+ /// then takes action on those inputs, mutating the [WFStream] state, and then may yield
60
+ /// activations.
143
61
  ///
144
- /// Updating runs may need to do async work like fetching additional history. In order to
145
- /// facilitate this, each run lives in its own task which is communicated with by sending
146
- /// [RunAction]s and receiving [RunUpdateResponse]s via its [ManagedRunHandle].
62
+ /// Importantly, nothing async happens while actually mutating state. This means all changes to
63
+ /// all workflow state can be represented purely via the stream of inputs, plus the
64
+ /// calls/retvals from the LA request sink, which is the last unfortunate bit of impurity in
65
+ /// the design. Eliminating it would be nice, so that all inputs come from the passed-in streams
66
+ /// and all outputs flow from the return stream, but it's difficult to do so since it would
67
+ /// require "pausing" in-progress changes to a run while sending & waiting for response from
68
+ /// local activity management. Likely the best option would be to move the pure state info
69
+ /// needed to determine immediate responses into LA state machines themselves (out of the LA
70
+ /// manager), which is a quite substantial change.
147
71
  pub(super) fn build(
148
72
  basics: WorkflowBasics,
149
- external_wfts: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
73
+ wft_stream: impl Stream<Item = Result<WFTExtractorOutput, tonic::Status>> + Send + 'static,
150
74
  local_rx: impl Stream<Item = LocalInput> + Send + 'static,
151
- client: Arc<dyn WorkerClient>,
152
- local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
153
- + Send
154
- + Sync
155
- + 'static,
156
- ) -> impl Stream<Item = Result<ActivationOrAuto, PollWfError>> {
157
- let wft_semaphore = MeteredSemaphore::new(
158
- basics.max_outstanding_wfts,
159
- basics.metrics.with_new_attrs([workflow_worker_type()]),
160
- MetricsContext::available_task_slots,
161
- );
162
- let wft_sem_clone = wft_semaphore.clone();
163
- let proceeder = stream::unfold(wft_sem_clone, |sem| async move {
164
- Some((sem.acquire_owned().await.unwrap(), sem))
165
- });
166
- let poller_wfts = stream_when_allowed(external_wfts, proceeder);
167
- let (run_update_tx, run_update_rx) = unbounded_channel();
168
- let local_rx = stream::select(
169
- local_rx.map(Into::into),
170
- UnboundedReceiverStream::new(run_update_rx).map(Into::into),
171
- );
75
+ local_activity_request_sink: impl LocalActivityRequestSink,
76
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
172
77
  let all_inputs = stream::select_with_strategy(
173
- local_rx,
174
- poller_wfts
175
- .map(|(wft, permit)| match wft {
176
- Ok(wft) => ExternalPollerInputs::NewWft(PermittedWFT { wft, permit }),
177
- Err(e) => ExternalPollerInputs::PollerError(e),
178
- })
78
+ local_rx.map(Into::into),
79
+ wft_stream
80
+ .map(Into::into)
179
81
  .chain(stream::once(async { ExternalPollerInputs::PollerDead }))
180
82
  .map(Into::into)
181
83
  .boxed(),
182
84
  // Priority always goes to the local stream
183
85
  |_: &mut ()| PollNext::Left,
184
86
  );
87
+ Self::build_internal(all_inputs, basics, local_activity_request_sink)
88
+ }
89
+
90
+ fn build_internal(
91
+ all_inputs: impl Stream<Item = WFStreamInput>,
92
+ basics: WorkflowBasics,
93
+ local_activity_request_sink: impl LocalActivityRequestSink,
94
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
185
95
  let mut state = WFStream {
186
96
  buffered_polls_need_cache_slot: Default::default(),
187
97
  runs: RunCache::new(
188
98
  basics.max_cached_workflows,
189
99
  basics.namespace.clone(),
190
- run_update_tx,
191
- Arc::new(local_activity_request_sink),
100
+ basics.server_capabilities.clone(),
101
+ local_activity_request_sink,
192
102
  basics.metrics.clone(),
193
103
  ),
194
- client,
195
- wft_semaphore,
196
104
  shutdown_token: basics.shutdown_token,
197
105
  ignore_evicts_on_shutdown: basics.ignore_evicts_on_shutdown,
198
106
  metrics: basics.metrics,
107
+ runs_needing_fetching: Default::default(),
108
+ history_fetch_refcounter: Arc::new(HistfetchRC {}),
109
+
110
+ #[cfg(feature = "save_wf_inputs")]
111
+ wf_state_inputs: basics.wf_state_inputs,
199
112
  };
200
113
  all_inputs
201
- .map(move |action| {
114
+ .map(move |action: WFStreamInput| {
202
115
  let span = span!(Level::DEBUG, "new_stream_input", action=?action);
203
116
  let _span_g = span.enter();
204
117
 
205
- let maybe_activation = match action {
118
+ #[cfg(feature = "save_wf_inputs")]
119
+ let maybe_write = state.prep_input(&action);
120
+
121
+ let mut activations = vec![];
122
+ let maybe_act = match action {
206
123
  WFStreamInput::NewWft(pwft) => {
207
- debug!(run_id=%pwft.wft.workflow_execution.run_id, "New WFT");
208
- state.instantiate_or_update(pwft);
209
- None
124
+ debug!(run_id=%pwft.work.execution.run_id, "New WFT");
125
+ state.instantiate_or_update(pwft)
210
126
  }
211
127
  WFStreamInput::Local(local_input) => {
212
128
  let _span_g = local_input.span.enter();
213
129
  if let Some(rid) = local_input.input.run_id() {
214
- state.record_span_fields(rid, &local_input.span);
130
+ if let Some(rh) = state.runs.get_mut(rid) {
131
+ rh.record_span_fields(&local_input.span);
132
+ }
215
133
  }
216
134
  match local_input.input {
217
- LocalInputs::RunUpdateResponse(resp) => {
218
- state.process_run_update_response(resp)
219
- }
220
135
  LocalInputs::Completion(completion) => {
221
- state.process_completion(completion);
222
- None
136
+ activations.extend(
137
+ state.process_completion(NewOrFetchedComplete::New(completion)),
138
+ );
139
+ None // completions can return more than one activation
140
+ }
141
+ LocalInputs::FetchedPageCompletion { paginator, update } => {
142
+ activations.extend(state.process_completion(
143
+ NewOrFetchedComplete::Fetched(update, paginator),
144
+ ));
145
+ None // completions can return more than one activation
223
146
  }
224
147
  LocalInputs::PostActivation(report) => {
225
- state.process_post_activation(report);
226
- None
148
+ state.process_post_activation(report)
227
149
  }
228
- LocalInputs::LocalResolution(res) => {
229
- state.local_resolution(res);
230
- None
150
+ LocalInputs::LocalResolution(res) => state.local_resolution(res),
151
+ LocalInputs::HeartbeatTimeout(hbt) => {
152
+ state.process_heartbeat_timeout(hbt)
231
153
  }
232
154
  LocalInputs::RequestEviction(evict) => {
233
- state.request_eviction(evict);
234
- None
155
+ state.request_eviction(evict).into_run_update_resp()
235
156
  }
236
157
  LocalInputs::GetStateInfo(gsi) => {
237
158
  let _ = gsi.response_tx.send(WorkflowStateInfo {
238
159
  cached_workflows: state.runs.len(),
239
160
  outstanding_wft: state.outstanding_wfts(),
240
- available_wft_permits: state.wft_semaphore.available_permits(),
241
161
  });
242
162
  None
243
163
  }
244
164
  }
245
165
  }
166
+ WFStreamInput::FailedFetch { run_id, err } => state
167
+ .request_eviction(RequestEvictMsg {
168
+ run_id,
169
+ message: format!("Fetching history failed: {err:?}"),
170
+ reason: EvictionReason::PaginationOrHistoryFetch,
171
+ })
172
+ .into_run_update_resp(),
246
173
  WFStreamInput::PollerDead => {
247
- debug!("WFT poller died, shutting down");
174
+ debug!("WFT poller died, beginning shutdown");
248
175
  state.shutdown_token.cancel();
249
176
  None
250
177
  }
@@ -254,457 +181,228 @@ impl WFStream {
254
181
  }
255
182
  };
256
183
 
257
- if let Some(ref act) = maybe_activation {
258
- if let Some(run_handle) = state.runs.get_mut(act.run_id()) {
259
- run_handle.insert_outstanding_activation(act);
260
- } else {
261
- dbg_panic!("Tried to insert activation for missing run!");
262
- }
184
+ activations.extend(maybe_act.into_iter());
185
+ activations.extend(state.reconcile_buffered());
186
+
187
+ // Always flush *after* actually handling the input, as this allows LA sink
188
+ // responses to be recorded before the input, so they can be read and buffered to be
189
+ // replayed during the handling of the input itself.
190
+ #[cfg(feature = "save_wf_inputs")]
191
+ if let Some(write) = maybe_write {
192
+ state.flush_write(write);
263
193
  }
264
- state.reconcile_buffered();
194
+
265
195
  if state.shutdown_done() {
196
+ info!("Workflow shutdown is done");
266
197
  return Err(PollWfError::ShutDown);
267
198
  }
268
199
 
269
- Ok(maybe_activation)
200
+ Ok(WFStreamOutput {
201
+ activations: activations.into(),
202
+ fetch_histories: std::mem::take(&mut state.runs_needing_fetching),
203
+ })
270
204
  })
271
- .filter_map(|o| {
272
- future::ready(match o {
273
- Ok(None) => None,
274
- Ok(Some(v)) => Some(Ok(v)),
275
- Err(e) => {
276
- if !matches!(e, PollWfError::ShutDown) {
277
- error!(
205
+ .inspect(|o| {
206
+ if let Some(e) = o.as_ref().err() {
207
+ if !matches!(e, PollWfError::ShutDown) {
208
+ error!(
278
209
  "Workflow processing encountered fatal error and must shut down {:?}",
279
210
  e
280
- );
281
- }
282
- Some(Err(e))
211
+ );
283
212
  }
284
- })
213
+ }
285
214
  })
286
215
  // Stop the stream once we have shut down
287
216
  .take_while(|o| future::ready(!matches!(o, Err(PollWfError::ShutDown))))
288
217
  }
289
218
 
290
- fn process_run_update_response(
291
- &mut self,
292
- resp: RunUpdateResponseKind,
293
- ) -> Option<ActivationOrAuto> {
294
- debug!(resp=%resp, "Processing run update response from machines");
295
- match resp {
296
- RunUpdateResponseKind::Good(mut resp) => {
297
- let run_handle = self
298
- .runs
299
- .get_mut(&resp.run_id)
300
- .expect("Workflow must exist, it just sent us an update response");
301
- run_handle.have_seen_terminal_event = resp.have_seen_terminal_event;
302
- run_handle.more_pending_work = resp.more_pending_work;
303
- run_handle.last_action_acked = true;
304
- run_handle.most_recently_processed_event_number =
305
- resp.most_recently_processed_event_number;
306
-
307
- let r = match resp.outgoing_activation {
308
- Some(ActivationOrAuto::LangActivation(mut activation)) => {
309
- if resp.in_response_to_wft {
310
- let wft = run_handle
311
- .wft
312
- .as_mut()
313
- .expect("WFT must exist for run just updated with one");
314
- // If there are in-poll queries, insert jobs for those queries into the
315
- // activation, but only if we hit the cache. If we didn't, those queries
316
- // will need to be dealt with once replay is over
317
- if wft.hit_cache {
318
- put_queries_in_act(&mut activation, wft);
319
- }
320
- }
321
-
322
- if activation.jobs.is_empty() {
323
- dbg_panic!("Should not send lang activation with no jobs");
324
- }
325
- Some(ActivationOrAuto::LangActivation(activation))
326
- }
327
- Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
328
- if let Some(wft) = run_handle.wft.as_mut() {
329
- put_queries_in_act(&mut act, wft);
330
- Some(ActivationOrAuto::LangActivation(act))
331
- } else {
332
- dbg_panic!("Ready for queries but no WFT!");
333
- None
334
- }
335
- }
336
- a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
337
- None => {
338
- // If the response indicates there is no activation to send yet but there
339
- // is more pending work, we should check again.
340
- if run_handle.more_pending_work {
341
- run_handle.check_more_activations();
342
- None
343
- } else if let Some(reason) = run_handle.trying_to_evict.as_ref() {
344
- // If a run update came back and had nothing to do, but we're trying to
345
- // evict, just do that now as long as there's no other outstanding work.
346
- if run_handle.activation.is_none() && !run_handle.more_pending_work {
347
- let mut evict_act = create_evict_activation(
348
- resp.run_id,
349
- reason.message.clone(),
350
- reason.reason,
351
- );
352
- evict_act.history_length =
353
- run_handle.most_recently_processed_event_number as u32;
354
- Some(ActivationOrAuto::LangActivation(evict_act))
355
- } else {
356
- None
357
- }
358
- } else {
359
- None
360
- }
361
- }
362
- };
363
- if let Some(f) = resp.fulfillable_complete.take() {
364
- f.fulfill();
365
- }
366
-
367
- // After each run update, check if it's ready to handle any buffered poll
368
- if matches!(&r, Some(ActivationOrAuto::Autocomplete { .. }) | None)
369
- && !run_handle.has_any_pending_work(false, true)
370
- {
371
- if let Some(bufft) = run_handle.buffered_resp.take() {
372
- self.instantiate_or_update(bufft);
373
- }
374
- }
375
- r
376
- }
377
- RunUpdateResponseKind::Fail(fail) => {
378
- if let Some(r) = self.runs.get_mut(&fail.run_id) {
379
- r.last_action_acked = true;
380
- }
381
-
382
- if let Some(resp_chan) = fail.completion_resp {
383
- // Automatically fail the workflow task in the event we couldn't update machines
384
- let fail_cause = if matches!(&fail.err, WFMachinesError::Nondeterminism(_)) {
385
- WorkflowTaskFailedCause::NonDeterministicError
386
- } else {
387
- WorkflowTaskFailedCause::Unspecified
388
- };
389
- let wft_fail_str = format!("{:?}", fail.err);
390
- self.failed_completion(
391
- fail.run_id,
392
- fail_cause,
393
- fail.err.evict_reason(),
394
- TFailure::application_failure(wft_fail_str, false).into(),
395
- resp_chan,
396
- );
397
- } else {
398
- // TODO: This should probably also fail workflow tasks, but that wasn't
399
- // implemented pre-refactor either.
400
- warn!(error=?fail.err, run_id=%fail.run_id, "Error while updating workflow");
401
- self.request_eviction(RequestEvictMsg {
402
- run_id: fail.run_id,
403
- message: format!("Error while updating workflow: {:?}", fail.err),
404
- reason: fail.err.evict_reason(),
405
- });
406
- }
407
- None
219
+ /// Instantiate or update run machines with a new WFT
220
+ #[instrument(skip(self, pwft)
221
+ fields(run_id=%pwft.work.execution.run_id,
222
+ workflow_id=%pwft.work.execution.workflow_id))]
223
+ fn instantiate_or_update(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
224
+ match self._instantiate_or_update(pwft) {
225
+ Err(histfetch) => {
226
+ self.runs_needing_fetching.push_back(histfetch);
227
+ Default::default()
408
228
  }
229
+ Ok(r) => r,
409
230
  }
410
231
  }
411
232
 
412
- #[instrument(skip(self, pwft),
413
- fields(run_id=%pwft.wft.workflow_execution.run_id,
414
- workflow_id=%pwft.wft.workflow_execution.workflow_id))]
415
- fn instantiate_or_update(&mut self, pwft: PermittedWFT) {
416
- let (mut work, permit) = if let Some(w) = self.buffer_resp_if_outstanding_work(pwft) {
417
- (w.wft, w.permit)
233
+ fn _instantiate_or_update(
234
+ &mut self,
235
+ pwft: PermittedWFT,
236
+ ) -> Result<RunUpdateAct, HistoryFetchReq> {
237
+ // If the run already exists, possibly buffer the work and return early if we can't handle
238
+ // it yet.
239
+ let pwft = if let Some(rh) = self.runs.get_mut(&pwft.work.execution.run_id) {
240
+ if let Some(w) = rh.buffer_wft_if_outstanding_work(pwft) {
241
+ w
242
+ } else {
243
+ return Ok(None);
244
+ }
418
245
  } else {
419
- return;
246
+ pwft
420
247
  };
421
248
 
422
- let run_id = work.workflow_execution.run_id.clone();
249
+ let run_id = pwft.work.execution.run_id.clone();
423
250
  // If our cache is full and this WFT is for an unseen run we must first evict a run before
424
251
  // we can deal with this task. So, buffer the task in that case.
425
252
  if !self.runs.has_run(&run_id) && self.runs.is_full() {
426
- self.buffer_resp_on_full_cache(PermittedWFT { wft: work, permit });
427
- return;
253
+ self.buffer_resp_on_full_cache(pwft);
254
+ return Ok(None);
428
255
  }
429
256
 
430
- let start_event_id = work.history.events.first().map(|e| e.event_id);
431
- debug!(
432
- run_id = %run_id,
433
- task_token = %&work.task_token,
434
- history_length = %work.history.events.len(),
435
- start_event_id = ?start_event_id,
436
- has_legacy_query = %work.legacy_query.is_some(),
437
- attempt = %work.attempt,
438
- "Applying new workflow task from server"
439
- );
440
-
441
- let wft_info = WorkflowTaskInfo {
442
- attempt: work.attempt,
443
- task_token: work.task_token,
444
- wf_id: work.workflow_execution.workflow_id.clone(),
445
- };
446
- let poll_resp_is_incremental = work
447
- .history
448
- .events
449
- .get(0)
450
- .map(|ev| ev.event_id > 1)
451
- .unwrap_or_default();
452
- let poll_resp_is_incremental = poll_resp_is_incremental || work.history.events.is_empty();
453
-
454
- let mut did_miss_cache = !poll_resp_is_incremental;
455
-
456
- let page_token = if !self.runs.has_run(&run_id) && poll_resp_is_incremental {
257
+ // This check can't really be lifted up higher since we could EX: See it's in the cache,
258
+ // not fetch more history, send the task, see cache is full, buffer it, then evict that
259
+ // run, and now we still have a cache miss.
260
+ if !self.runs.has_run(&run_id) && pwft.work.is_incremental() {
457
261
  debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
458
262
  cache. Will fetch history");
459
263
  self.metrics.sticky_cache_miss();
460
- did_miss_cache = true;
461
- NextPageToken::FetchFromStart
462
- } else {
463
- work.next_page_token.into()
464
- };
465
- let history_update = HistoryUpdate::new(
466
- HistoryPaginator::new(
467
- work.history,
468
- work.workflow_execution.workflow_id.clone(),
469
- run_id.clone(),
470
- page_token,
471
- self.client.clone(),
472
- ),
473
- work.previous_started_event_id,
474
- );
475
- let legacy_query_from_poll = work
476
- .legacy_query
477
- .take()
478
- .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
479
-
480
- let mut pending_queries = work.query_requests.into_iter().collect::<Vec<_>>();
481
- if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
482
- error!(
483
- "Server issued both normal and legacy queries. This should not happen. Please \
484
- file a bug report."
485
- );
486
- self.request_eviction(RequestEvictMsg {
487
- run_id,
488
- message: "Server issued both normal and legacy query".to_string(),
489
- reason: EvictionReason::Fatal,
490
- });
491
- return;
492
- }
493
- if let Some(lq) = legacy_query_from_poll {
494
- pending_queries.push(lq);
264
+ return Err(HistoryFetchReq::Full(
265
+ CacheMissFetchReq { original_wft: pwft },
266
+ self.history_fetch_refcounter.clone(),
267
+ ));
495
268
  }
496
269
 
497
- let start_time = Instant::now();
498
- let run_handle = self.runs.instantiate_or_update(
499
- &run_id,
500
- &work.workflow_execution.workflow_id,
501
- &work.workflow_type,
502
- history_update,
503
- start_time,
504
- );
505
- run_handle.wft = Some(OutstandingTask {
506
- info: wft_info,
507
- hit_cache: !did_miss_cache,
508
- pending_queries,
509
- start_time,
510
- permit,
511
- })
270
+ let rur = self.runs.instantiate_or_update(pwft);
271
+ Ok(rur)
512
272
  }
513
273
 
514
- fn process_completion(&mut self, complete: WFActCompleteMsg) {
515
- match complete.completion {
516
- ValidatedCompletion::Success { run_id, commands } => {
517
- self.successful_completion(run_id, commands, complete.response_tx);
518
- }
519
- ValidatedCompletion::Fail { run_id, failure } => {
520
- self.failed_completion(
521
- run_id,
522
- WorkflowTaskFailedCause::Unspecified,
274
+ fn process_completion(&mut self, complete: NewOrFetchedComplete) -> Vec<ActivationOrAuto> {
275
+ let rh = if let Some(rh) = self.runs.get_mut(complete.run_id()) {
276
+ rh
277
+ } else {
278
+ dbg_panic!("Run missing during completion {:?}", complete);
279
+ return vec![];
280
+ };
281
+ let mut acts: Vec<_> = match complete {
282
+ NewOrFetchedComplete::New(complete) => match complete.completion {
283
+ ValidatedCompletion::Success {
284
+ commands,
285
+ used_flags,
286
+ ..
287
+ } => match rh.successful_completion(commands, used_flags, complete.response_tx) {
288
+ Ok(acts) => acts,
289
+ Err(npr) => {
290
+ self.runs_needing_fetching
291
+ .push_back(HistoryFetchReq::NextPage(
292
+ npr,
293
+ self.history_fetch_refcounter.clone(),
294
+ ));
295
+ None
296
+ }
297
+ },
298
+ ValidatedCompletion::Fail { failure, .. } => rh.failed_completion(
299
+ failure.force_cause(),
523
300
  EvictionReason::LangFail,
524
301
  failure,
525
302
  complete.response_tx,
526
- );
303
+ ),
304
+ },
305
+ NewOrFetchedComplete::Fetched(update, paginator) => {
306
+ rh.fetched_page_completion(update, paginator)
527
307
  }
528
308
  }
309
+ .into_iter()
310
+ .collect();
529
311
  // Always queue evictions after completion when we have a zero-size cache
530
312
  if self.runs.cache_capacity() == 0 {
531
- self.request_eviction_of_lru_run();
313
+ acts.extend(self.request_eviction_of_lru_run().into_run_update_resp())
532
314
  }
315
+ acts
533
316
  }
534
317
 
535
- fn successful_completion(
536
- &mut self,
537
- run_id: String,
538
- mut commands: Vec<WFCommand>,
539
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
540
- ) {
541
- let activation_was_only_eviction = self.activation_has_only_eviction(&run_id);
542
- let (task_token, has_pending_query, start_time) =
543
- if let Some(entry) = self.get_task(&run_id) {
544
- (
545
- entry.info.task_token.clone(),
546
- !entry.pending_queries.is_empty(),
547
- entry.start_time,
548
- )
549
- } else {
550
- if !activation_was_only_eviction {
551
- // Not an error if this was an eviction, since it's normal to issue eviction
552
- // activations without an associated workflow task in that case.
553
- dbg_panic!(
554
- "Attempted to complete activation for run {} without associated workflow task",
318
+ fn process_post_activation(&mut self, report: PostActivationMsg) -> RunUpdateAct {
319
+ let run_id = &report.run_id;
320
+ let wft_from_complete = report.wft_from_complete;
321
+ if let Some((wft, _)) = &wft_from_complete {
322
+ if &wft.execution.run_id != run_id {
323
+ dbg_panic!(
324
+ "Server returned a WFT on completion for a different run ({}) than the \
325
+ one being completed ({}). This is a server bug.",
326
+ wft.execution.run_id,
555
327
  run_id
556
- );
557
- }
558
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
559
- return;
560
- };
561
-
562
- // If the only command from the activation is a legacy query response, that means we need
563
- // to respond differently than a typical activation.
564
- if matches!(&commands.as_slice(),
565
- &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
566
- {
567
- let qr = match commands.remove(0) {
568
- WFCommand::QueryResponse(qr) => qr,
569
- _ => unreachable!("We just verified this is the only command"),
570
- };
571
- self.reply_to_complete(
572
- &run_id,
573
- ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
574
- task_token,
575
- action: ActivationAction::RespondLegacyQuery {
576
- result: Box::new(qr),
577
- },
578
- }),
579
- resp_chan,
580
- );
581
- } else {
582
- // First strip out query responses from other commands that actually affect machines
583
- // Would be prettier with `drain_filter`
584
- let mut i = 0;
585
- let mut query_responses = vec![];
586
- while i < commands.len() {
587
- if matches!(commands[i], WFCommand::QueryResponse(_)) {
588
- if let WFCommand::QueryResponse(qr) = commands.remove(i) {
589
- query_responses.push(qr);
590
- }
591
- } else {
592
- i += 1;
593
- }
594
- }
595
-
596
- let activation_was_eviction = self.activation_has_eviction(&run_id);
597
- if let Some(rh) = self.runs.get_mut(&run_id) {
598
- rh.send_completion(RunActivationCompletion {
599
- task_token,
600
- start_time,
601
- commands,
602
- activation_was_eviction,
603
- activation_was_only_eviction,
604
- has_pending_query,
605
- query_responses,
606
- resp_chan: Some(resp_chan),
607
- });
608
- } else {
609
- dbg_panic!("Run {} missing during completion", run_id);
328
+ );
610
329
  }
611
- };
612
- }
613
-
614
- fn failed_completion(
615
- &mut self,
616
- run_id: String,
617
- cause: WorkflowTaskFailedCause,
618
- reason: EvictionReason,
619
- failure: Failure,
620
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
621
- ) {
622
- let tt = if let Some(tt) = self.get_task(&run_id).map(|t| t.info.task_token.clone()) {
623
- tt
624
- } else {
625
- dbg_panic!(
626
- "No workflow task for run id {} found when trying to fail activation",
627
- run_id
628
- );
629
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
630
- return;
631
- };
632
-
633
- if let Some(m) = self.run_metrics(&run_id) {
634
- m.wf_task_failed();
635
330
  }
636
- let message = format!("Workflow activation completion failed: {:?}", &failure);
637
- // Blow up any cached data associated with the workflow
638
- let should_report = match self.request_eviction(RequestEvictMsg {
639
- run_id: run_id.clone(),
640
- message,
641
- reason,
642
- }) {
643
- EvictionRequestResult::EvictionRequested(Some(attempt))
644
- | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
645
- _ => false,
646
- };
647
- // If the outstanding WFT is a legacy query task, report that we need to fail it
648
- let outcome = if self
649
- .runs
650
- .get(&run_id)
651
- .map(|rh| rh.pending_work_is_legacy_query())
652
- .unwrap_or_default()
653
- {
654
- ActivationCompleteOutcome::ReportWFTFail(
655
- FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
656
- )
657
- } else if should_report {
658
- ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
659
- tt, cause, failure,
660
- ))
661
- } else {
662
- ActivationCompleteOutcome::DoNothing
663
- };
664
- self.reply_to_complete(&run_id, outcome, resp_chan);
665
- }
666
331
 
667
- fn process_post_activation(&mut self, report: PostActivationMsg) {
668
- let run_id = &report.run_id;
332
+ let mut res = None;
669
333
 
670
334
  // If we reported to server, we always want to mark it complete.
671
- let maybe_t = self.complete_wft(run_id, report.reported_wft_to_server);
335
+ let maybe_t = self.complete_wft(run_id, report.wft_report_status);
336
+ // Delete the activation
337
+ let activation = self
338
+ .runs
339
+ .get_mut(run_id)
340
+ .and_then(|rh| rh.delete_activation());
341
+
342
+ // Evict the run if the activation contained an eviction
343
+ let mut applied_buffered_poll_for_this_run = false;
344
+ if activation.map(|a| a.has_eviction()).unwrap_or_default() {
345
+ debug!(run_id=%run_id, "Evicting run");
346
+
347
+ if let Some(mut rh) = self.runs.remove(run_id) {
348
+ if let Some(buff) = rh.take_buffered_wft() {
349
+ // Don't try to apply a buffered poll for this run if we just got a new WFT
350
+ // from completing, because by definition that buffered poll is now an
351
+ // out-of-date WFT.
352
+ if wft_from_complete.is_none() {
353
+ res = self.instantiate_or_update(buff);
354
+ applied_buffered_poll_for_this_run = true;
355
+ }
356
+ }
357
+ }
672
358
 
673
- if self
674
- .get_activation(run_id)
675
- .map(|a| a.has_eviction())
676
- .unwrap_or_default()
677
- {
678
- self.evict_run(run_id);
359
+ // Attempt to apply a buffered poll for some *other* run, if we didn't have a wft
360
+ // from complete or a buffered poll for *this* run.
361
+ if wft_from_complete.is_none() && !applied_buffered_poll_for_this_run {
362
+ if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
363
+ res = self.instantiate_or_update(buff);
364
+ }
365
+ }
679
366
  };
680
367
 
681
- if let Some(wft) = report.wft_from_complete {
682
- debug!(run_id=%wft.workflow_execution.run_id, "New WFT from completion");
368
+ if let Some((wft, pag)) = wft_from_complete {
369
+ debug!(run_id=%wft.execution.run_id, "New WFT from completion");
683
370
  if let Some(t) = maybe_t {
684
- self.instantiate_or_update(PermittedWFT {
685
- wft,
371
+ res = self.instantiate_or_update(PermittedWFT {
372
+ work: wft,
686
373
  permit: t.permit,
687
- })
374
+ paginator: pag,
375
+ });
688
376
  }
689
377
  }
690
378
 
691
- if let Some(rh) = self.runs.get_mut(run_id) {
692
- // Delete the activation
693
- rh.activation.take();
694
- // Attempt to produce the next activation if needed
695
- rh.check_more_activations();
379
+ if res.is_none() {
380
+ if let Some(rh) = self.runs.get_mut(run_id) {
381
+ // Attempt to produce the next activation if needed
382
+ res = rh.check_more_activations();
383
+ }
696
384
  }
385
+ res
697
386
  }
698
387
 
699
- fn local_resolution(&mut self, msg: LocalResolutionMsg) {
388
+ fn local_resolution(&mut self, msg: LocalResolutionMsg) -> RunUpdateAct {
700
389
  let run_id = msg.run_id;
701
390
  if let Some(rh) = self.runs.get_mut(&run_id) {
702
- rh.send_local_resolution(msg.res)
391
+ rh.local_resolution(msg.res)
703
392
  } else {
704
393
  // It isn't an explicit error if the machine is missing when a local activity resolves.
705
394
  // This can happen if an activity reports a timeout after we stopped caring about it.
706
395
  debug!(run_id = %run_id,
707
396
  "Tried to resolve a local activity for a run we are no longer tracking");
397
+ None
398
+ }
399
+ }
400
+
401
+ fn process_heartbeat_timeout(&mut self, run_id: String) -> RunUpdateAct {
402
+ if let Some(rh) = self.runs.get_mut(&run_id) {
403
+ rh.heartbeat_timeout()
404
+ } else {
405
+ None
708
406
  }
709
407
  }
710
408
 
@@ -712,17 +410,8 @@ impl WFStream {
712
410
  /// activation to evict the workflow from the lang side. Workflow will not *actually* be evicted
713
411
  /// until lang replies to that activation
714
412
  fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
715
- let activation_has_eviction = self.activation_has_eviction(&info.run_id);
716
413
  if let Some(rh) = self.runs.get_mut(&info.run_id) {
717
- let attempts = rh.wft.as_ref().map(|wt| wt.info.attempt);
718
- if !activation_has_eviction && rh.trying_to_evict.is_none() {
719
- debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
720
- rh.trying_to_evict = Some(info);
721
- rh.check_more_activations();
722
- EvictionRequestResult::EvictionRequested(attempts)
723
- } else {
724
- EvictionRequestResult::EvictionAlreadyRequested(attempts)
725
- }
414
+ rh.request_eviction(info)
726
415
  } else {
727
416
  debug!(run_id=%info.run_id, "Eviction requested for unknown run");
728
417
  EvictionRequestResult::NotFound
@@ -743,36 +432,10 @@ impl WFStream {
743
432
  }
744
433
  }
745
434
 
746
- /// Evict a workflow from the cache by its run id. Any existing pending activations will be
747
- /// destroyed, and any outstanding activations invalidated.
748
- fn evict_run(&mut self, run_id: &str) {
749
- debug!(run_id=%run_id, "Evicting run");
750
-
751
- let mut did_take_buff = false;
752
- // Now it can safely be deleted, it'll get recreated once the un-buffered poll is handled if
753
- // there was one.
754
- if let Some(mut rh) = self.runs.remove(run_id) {
755
- rh.handle.abort();
756
-
757
- if let Some(buff) = rh.buffered_resp.take() {
758
- self.instantiate_or_update(buff);
759
- did_take_buff = true;
760
- }
761
- }
762
-
763
- if !did_take_buff {
764
- // If there wasn't a buffered poll, there might be one for a different run which needs
765
- // a free cache slot, and now there is.
766
- if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
767
- self.instantiate_or_update(buff);
768
- }
769
- }
770
- }
771
-
772
435
  fn complete_wft(
773
436
  &mut self,
774
437
  run_id: &str,
775
- reported_wft_to_server: bool,
438
+ wft_report_status: WFTReportStatus,
776
439
  ) -> Option<OutstandingTask> {
777
440
  // If the WFT completion wasn't sent to the server, but we did see the final event, we still
778
441
  // want to clear the workflow task. This can really only happen in replay testing, where we
@@ -782,9 +445,9 @@ impl WFStream {
782
445
  let saw_final = self
783
446
  .runs
784
447
  .get(run_id)
785
- .map(|r| r.have_seen_terminal_event)
448
+ .map(|r| r.have_seen_terminal_event())
786
449
  .unwrap_or_default();
787
- if !saw_final && !reported_wft_to_server {
450
+ if !saw_final && matches!(wft_report_status, WFTReportStatus::NotReported) {
788
451
  return None;
789
452
  }
790
453
 
@@ -792,60 +455,26 @@ impl WFStream {
792
455
  // Can't mark the WFT complete if there are pending queries, as doing so would destroy
793
456
  // them.
794
457
  if rh
795
- .wft
796
- .as_ref()
458
+ .wft()
797
459
  .map(|wft| !wft.pending_queries.is_empty())
798
460
  .unwrap_or_default()
799
461
  {
800
462
  return None;
801
463
  }
802
464
 
803
- debug!("Marking WFT completed");
804
- let retme = rh.wft.take();
805
- if let Some(ot) = &retme {
806
- if let Some(m) = self.run_metrics(run_id) {
807
- m.wf_task_latency(ot.start_time.elapsed());
808
- }
809
- }
810
- retme
465
+ rh.mark_wft_complete(wft_report_status)
811
466
  } else {
812
467
  None
813
468
  }
814
469
  }
815
470
 
816
- /// Stores some work if there is any outstanding WFT or activation for the run. If there was
817
- /// not, returns the work back out inside the option.
818
- fn buffer_resp_if_outstanding_work(&mut self, work: PermittedWFT) -> Option<PermittedWFT> {
819
- let run_id = &work.wft.workflow_execution.run_id;
820
- if let Some(mut run) = self.runs.get_mut(run_id) {
821
- let about_to_issue_evict = run.trying_to_evict.is_some() && !run.last_action_acked;
822
- let has_wft = run.wft.is_some();
823
- let has_activation = run.activation.is_some();
824
- if has_wft
825
- || has_activation
826
- || about_to_issue_evict
827
- || run.more_pending_work
828
- || !run.last_action_acked
829
- {
830
- debug!(run_id = %run_id, run = ?run,
831
- "Got new WFT for a run with outstanding work, buffering it");
832
- run.buffered_resp = Some(work);
833
- None
834
- } else {
835
- Some(work)
836
- }
837
- } else {
838
- Some(work)
839
- }
840
- }
841
-
842
471
  fn buffer_resp_on_full_cache(&mut self, work: PermittedWFT) {
843
- debug!(run_id=%work.wft.workflow_execution.run_id, "Buffering WFT because cache is full");
472
+ debug!(run_id=%work.work.execution.run_id, "Buffering WFT because cache is full");
844
473
  // If there's already a buffered poll for the run, replace it.
845
474
  if let Some(rh) = self
846
475
  .buffered_polls_need_cache_slot
847
476
  .iter_mut()
848
- .find(|w| w.wft.workflow_execution.run_id == work.wft.workflow_execution.run_id)
477
+ .find(|w| w.work.execution.run_id == work.work.execution.run_id)
849
478
  {
850
479
  *rh = work;
851
480
  } else {
@@ -856,7 +485,7 @@ impl WFStream {
856
485
 
857
486
  /// Makes sure we have enough pending evictions to fulfill the needs of buffered WFTs who are
858
487
  /// waiting on a cache slot
859
- fn reconcile_buffered(&mut self) {
488
+ fn reconcile_buffered(&mut self) -> Vec<ActivationOrAuto> {
860
489
  // We must ensure that there are at least as many pending evictions as there are tasks
861
490
  // that we might need to un-buffer (skipping runs which already have buffered tasks for
862
491
  // themselves)
@@ -865,121 +494,222 @@ impl WFStream {
865
494
  let num_existing_evictions = self
866
495
  .runs
867
496
  .runs_lru_order()
868
- .filter(|(_, h)| h.trying_to_evict.is_some())
497
+ .filter(|(_, h)| h.is_trying_to_evict())
869
498
  .count();
870
499
  let mut num_evicts_needed = num_in_buff.saturating_sub(num_existing_evictions);
871
500
  for (rid, handle) in self.runs.runs_lru_order() {
872
501
  if num_evicts_needed == 0 {
873
502
  break;
874
503
  }
875
- if handle.buffered_resp.is_none() {
504
+ if !handle.has_buffered_wft() {
876
505
  num_evicts_needed -= 1;
877
506
  evict_these.push(rid.to_string());
878
507
  }
879
508
  }
509
+ let mut acts = vec![];
880
510
  for run_id in evict_these {
881
- self.request_eviction(RequestEvictMsg {
882
- run_id,
883
- message: "Workflow cache full".to_string(),
884
- reason: EvictionReason::CacheFull,
885
- });
511
+ acts.extend(
512
+ self.request_eviction(RequestEvictMsg {
513
+ run_id,
514
+ message: "Workflow cache full".to_string(),
515
+ reason: EvictionReason::CacheFull,
516
+ })
517
+ .into_run_update_resp(),
518
+ );
886
519
  }
887
- }
888
-
889
- fn reply_to_complete(
890
- &self,
891
- run_id: &str,
892
- outcome: ActivationCompleteOutcome,
893
- chan: oneshot::Sender<ActivationCompleteResult>,
894
- ) {
895
- let most_recently_processed_event = self
896
- .runs
897
- .peek(run_id)
898
- .map(|rh| rh.most_recently_processed_event_number)
899
- .unwrap_or_default();
900
- chan.send(ActivationCompleteResult {
901
- most_recently_processed_event,
902
- outcome,
903
- })
904
- .expect("Rcv half of activation reply not dropped");
520
+ acts
905
521
  }
906
522
 
907
523
  fn shutdown_done(&self) -> bool {
908
- let all_runs_ready = self
909
- .runs
910
- .handles()
911
- .all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
912
- if self.shutdown_token.is_cancelled() && all_runs_ready {
913
- info!("Workflow shutdown is done");
914
- true
915
- } else {
916
- false
524
+ if self.shutdown_token.is_cancelled() {
525
+ if Arc::strong_count(&self.history_fetch_refcounter) > 1 {
526
+ // Don't exit if there are outstanding fetch requests
527
+ return false;
528
+ }
529
+ let all_runs_ready = self
530
+ .runs
531
+ .handles()
532
+ .all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
533
+ if all_runs_ready {
534
+ return true;
535
+ }
917
536
  }
918
- }
919
-
920
- fn get_task(&mut self, run_id: &str) -> Option<&OutstandingTask> {
921
- self.runs.get(run_id).and_then(|rh| rh.wft.as_ref())
922
- }
923
-
924
- fn get_activation(&mut self, run_id: &str) -> Option<&OutstandingActivation> {
925
- self.runs.get(run_id).and_then(|rh| rh.activation.as_ref())
926
- }
927
-
928
- fn run_metrics(&mut self, run_id: &str) -> Option<&MetricsContext> {
929
- self.runs.get(run_id).map(|r| &r.metrics)
930
- }
931
-
932
- fn activation_has_only_eviction(&mut self, run_id: &str) -> bool {
933
- self.runs
934
- .get(run_id)
935
- .and_then(|rh| rh.activation)
936
- .map(OutstandingActivation::has_only_eviction)
937
- .unwrap_or_default()
938
- }
939
-
940
- fn activation_has_eviction(&mut self, run_id: &str) -> bool {
941
- self.runs
942
- .get(run_id)
943
- .and_then(|rh| rh.activation)
944
- .map(OutstandingActivation::has_eviction)
945
- .unwrap_or_default()
537
+ false
946
538
  }
947
539
 
948
540
  fn outstanding_wfts(&self) -> usize {
949
- self.runs.handles().filter(|r| r.wft.is_some()).count()
541
+ self.runs.handles().filter(|r| r.wft().is_some()).count()
950
542
  }
951
543
 
952
544
  // Useful when debugging
953
545
  #[allow(dead_code)]
954
546
  fn info_dump(&self, run_id: &str) {
955
547
  if let Some(r) = self.runs.peek(run_id) {
956
- info!(run_id, wft=?r.wft, activation=?r.activation, buffered=r.buffered_resp.is_some(),
957
- trying_to_evict=r.trying_to_evict.is_some(), more_work=r.more_pending_work,
958
- last_action_acked=r.last_action_acked);
548
+ info!(run_id, wft=?r.wft(), activation=?r.activation(),
549
+ buffered_wft=r.has_buffered_wft(),
550
+ trying_to_evict=r.is_trying_to_evict(), more_work=r.more_pending_work());
959
551
  } else {
960
552
  info!(run_id, "Run not found");
961
553
  }
962
554
  }
963
555
  }
964
556
 
965
- /// Drains pending queries from the workflow task and appends them to the activation's jobs
966
- fn put_queries_in_act(act: &mut WorkflowActivation, wft: &mut OutstandingTask) {
967
- // Nothing to do if there are no pending queries
968
- if wft.pending_queries.is_empty() {
969
- return;
970
- }
557
+ /// All possible inputs to the [WFStream]
558
+ #[derive(derive_more::From, Debug)]
559
+ #[cfg_attr(
560
+ feature = "save_wf_inputs",
561
+ derive(serde::Serialize, serde::Deserialize)
562
+ )]
563
+ enum WFStreamInput {
564
+ NewWft(PermittedWFT),
565
+ Local(LocalInput),
566
+ /// The stream given to us which represents the poller (or a mock) terminated.
567
+ PollerDead,
568
+ /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
569
+ /// error while polling
570
+ PollerError(
571
+ #[cfg_attr(
572
+ feature = "save_wf_inputs",
573
+ serde(with = "tonic_status_serde::SerdeStatus")
574
+ )]
575
+ tonic::Status,
576
+ ),
577
+ FailedFetch {
578
+ run_id: String,
579
+ #[cfg_attr(
580
+ feature = "save_wf_inputs",
581
+ serde(with = "tonic_status_serde::SerdeStatus")
582
+ )]
583
+ err: tonic::Status,
584
+ },
585
+ }
971
586
 
972
- let has_legacy = wft.has_pending_legacy_query();
973
- // Cannot dispatch legacy query if there are any other jobs - which can happen if, ex, a local
974
- // activity resolves while we've gotten a legacy query after heartbeating.
975
- if has_legacy && !act.jobs.is_empty() {
976
- return;
587
+ /// A non-poller-received input to the [WFStream]
588
+ #[derive(derive_more::DebugCustom)]
589
+ #[cfg_attr(
590
+ feature = "save_wf_inputs",
591
+ derive(serde::Serialize, serde::Deserialize)
592
+ )]
593
+ #[debug(fmt = "LocalInput {{ {input:?} }}")]
594
+ pub(super) struct LocalInput {
595
+ pub input: LocalInputs,
596
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip, default = "Span::current"))]
597
+ pub span: Span,
598
+ }
599
+ impl From<HeartbeatTimeoutMsg> for LocalInput {
600
+ fn from(hb: HeartbeatTimeoutMsg) -> Self {
601
+ Self {
602
+ input: LocalInputs::HeartbeatTimeout(hb.run_id),
603
+ span: hb.span,
604
+ }
605
+ }
606
+ }
607
+ /// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
608
+ /// new polls.
609
+ #[derive(Debug, derive_more::From)]
610
+ #[cfg_attr(
611
+ feature = "save_wf_inputs",
612
+ derive(serde::Serialize, serde::Deserialize)
613
+ )]
614
+ pub(super) enum LocalInputs {
615
+ Completion(WFActCompleteMsg),
616
+ FetchedPageCompletion {
617
+ paginator: HistoryPaginator,
618
+ update: HistoryUpdate,
619
+ },
620
+ LocalResolution(LocalResolutionMsg),
621
+ PostActivation(PostActivationMsg),
622
+ RequestEviction(RequestEvictMsg),
623
+ HeartbeatTimeout(String),
624
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
625
+ GetStateInfo(GetStateInfoMsg),
626
+ }
627
+ impl LocalInputs {
628
+ fn run_id(&self) -> Option<&str> {
629
+ Some(match self {
630
+ LocalInputs::Completion(c) => c.completion.run_id(),
631
+ LocalInputs::FetchedPageCompletion { paginator, .. } => &paginator.run_id,
632
+ LocalInputs::LocalResolution(lr) => &lr.run_id,
633
+ LocalInputs::PostActivation(pa) => &pa.run_id,
634
+ LocalInputs::RequestEviction(re) => &re.run_id,
635
+ LocalInputs::HeartbeatTimeout(hb) => hb,
636
+ LocalInputs::GetStateInfo(_) => return None,
637
+ })
638
+ }
639
+ }
640
+ #[derive(Debug)]
641
+ #[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
642
+ enum ExternalPollerInputs {
643
+ NewWft(PermittedWFT),
644
+ PollerDead,
645
+ PollerError(tonic::Status),
646
+ FetchedUpdate(PermittedWFT),
647
+ NextPage {
648
+ paginator: HistoryPaginator,
649
+ update: HistoryUpdate,
650
+ span: Span,
651
+ },
652
+ FailedFetch {
653
+ run_id: String,
654
+ err: tonic::Status,
655
+ },
656
+ }
657
+ impl From<ExternalPollerInputs> for WFStreamInput {
658
+ fn from(l: ExternalPollerInputs) -> Self {
659
+ match l {
660
+ ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
661
+ ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
662
+ ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
663
+ ExternalPollerInputs::FetchedUpdate(wft) => WFStreamInput::NewWft(wft),
664
+ ExternalPollerInputs::FailedFetch { run_id, err } => {
665
+ WFStreamInput::FailedFetch { run_id, err }
666
+ }
667
+ ExternalPollerInputs::NextPage {
668
+ paginator,
669
+ update,
670
+ span,
671
+ } => WFStreamInput::Local(LocalInput {
672
+ input: LocalInputs::FetchedPageCompletion { paginator, update },
673
+ span,
674
+ }),
675
+ }
676
+ }
677
+ }
678
+ impl From<Result<WFTExtractorOutput, tonic::Status>> for ExternalPollerInputs {
679
+ fn from(v: Result<WFTExtractorOutput, tonic::Status>) -> Self {
680
+ match v {
681
+ Ok(WFTExtractorOutput::NewWFT(pwft)) => ExternalPollerInputs::NewWft(pwft),
682
+ Ok(WFTExtractorOutput::FetchResult(updated_wft, _)) => {
683
+ ExternalPollerInputs::FetchedUpdate(updated_wft)
684
+ }
685
+ Ok(WFTExtractorOutput::NextPage {
686
+ paginator,
687
+ update,
688
+ span,
689
+ rc: _rc,
690
+ }) => ExternalPollerInputs::NextPage {
691
+ paginator,
692
+ update,
693
+ span,
694
+ },
695
+ Ok(WFTExtractorOutput::FailedFetch { run_id, err }) => {
696
+ ExternalPollerInputs::FailedFetch { run_id, err }
697
+ }
698
+ Ok(WFTExtractorOutput::PollerDead) => ExternalPollerInputs::PollerDead,
699
+ Err(e) => ExternalPollerInputs::PollerError(e),
700
+ }
701
+ }
702
+ }
703
+ #[derive(Debug)]
704
+ enum NewOrFetchedComplete {
705
+ New(WFActCompleteMsg),
706
+ Fetched(HistoryUpdate, HistoryPaginator),
707
+ }
708
+ impl NewOrFetchedComplete {
709
+ fn run_id(&self) -> &str {
710
+ match self {
711
+ NewOrFetchedComplete::New(c) => c.completion.run_id(),
712
+ NewOrFetchedComplete::Fetched(_, p) => &p.run_id,
713
+ }
977
714
  }
978
-
979
- debug!(queries=?wft.pending_queries, "Dispatching queries");
980
- let query_jobs = wft
981
- .pending_queries
982
- .drain(..)
983
- .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
984
- act.jobs.extend(query_jobs);
985
715
  }