temporalio 0.0.1 → 0.1.0

Files changed (310)
  1. checksums.yaml +4 -4
  2. data/README.md +180 -7
  3. data/bridge/Cargo.lock +208 -76
  4. data/bridge/Cargo.toml +5 -2
  5. data/bridge/sdk-core/Cargo.toml +1 -1
  6. data/bridge/sdk-core/README.md +20 -10
  7. data/bridge/sdk-core/client/Cargo.toml +1 -1
  8. data/bridge/sdk-core/client/src/lib.rs +227 -59
  9. data/bridge/sdk-core/client/src/metrics.rs +17 -8
  10. data/bridge/sdk-core/client/src/raw.rs +13 -12
  11. data/bridge/sdk-core/client/src/retry.rs +132 -43
  12. data/bridge/sdk-core/core/Cargo.toml +28 -15
  13. data/bridge/sdk-core/core/benches/workflow_replay.rs +13 -10
  14. data/bridge/sdk-core/core/src/abstractions.rs +225 -36
  15. data/bridge/sdk-core/core/src/core_tests/activity_tasks.rs +217 -79
  16. data/bridge/sdk-core/core/src/core_tests/determinism.rs +165 -2
  17. data/bridge/sdk-core/core/src/core_tests/local_activities.rs +565 -34
  18. data/bridge/sdk-core/core/src/core_tests/queries.rs +247 -90
  19. data/bridge/sdk-core/core/src/core_tests/workers.rs +3 -5
  20. data/bridge/sdk-core/core/src/core_tests/workflow_cancels.rs +1 -1
  21. data/bridge/sdk-core/core/src/core_tests/workflow_tasks.rs +430 -67
  22. data/bridge/sdk-core/core/src/ephemeral_server/mod.rs +106 -12
  23. data/bridge/sdk-core/core/src/internal_flags.rs +136 -0
  24. data/bridge/sdk-core/core/src/lib.rs +148 -34
  25. data/bridge/sdk-core/core/src/protosext/mod.rs +1 -1
  26. data/bridge/sdk-core/core/src/replay/mod.rs +185 -41
  27. data/bridge/sdk-core/core/src/telemetry/log_export.rs +190 -0
  28. data/bridge/sdk-core/core/src/telemetry/metrics.rs +219 -140
  29. data/bridge/sdk-core/core/src/telemetry/mod.rs +326 -315
  30. data/bridge/sdk-core/core/src/telemetry/prometheus_server.rs +20 -14
  31. data/bridge/sdk-core/core/src/test_help/mod.rs +85 -21
  32. data/bridge/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +112 -156
  33. data/bridge/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  34. data/bridge/sdk-core/core/src/worker/activities/local_activities.rs +364 -128
  35. data/bridge/sdk-core/core/src/worker/activities.rs +263 -170
  36. data/bridge/sdk-core/core/src/worker/client/mocks.rs +23 -3
  37. data/bridge/sdk-core/core/src/worker/client.rs +48 -6
  38. data/bridge/sdk-core/core/src/worker/mod.rs +186 -75
  39. data/bridge/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  40. data/bridge/sdk-core/core/src/worker/workflow/driven_workflow.rs +13 -24
  41. data/bridge/sdk-core/core/src/worker/workflow/history_update.rs +879 -226
  42. data/bridge/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +101 -48
  43. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +8 -12
  44. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +6 -9
  45. data/bridge/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +90 -32
  46. data/bridge/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +6 -9
  47. data/bridge/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +7 -10
  48. data/bridge/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +6 -9
  49. data/bridge/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +160 -83
  50. data/bridge/sdk-core/core/src/worker/workflow/machines/mod.rs +36 -54
  51. data/bridge/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +179 -0
  52. data/bridge/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +104 -157
  53. data/bridge/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +8 -12
  54. data/bridge/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +9 -13
  55. data/bridge/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +10 -4
  56. data/bridge/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +14 -11
  57. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
  58. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +395 -299
  59. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +12 -20
  60. data/bridge/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +33 -18
  61. data/bridge/sdk-core/core/src/worker/workflow/managed_run.rs +1032 -374
  62. data/bridge/sdk-core/core/src/worker/workflow/mod.rs +525 -392
  63. data/bridge/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  64. data/bridge/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  65. data/bridge/sdk-core/core/src/worker/workflow/wft_poller.rs +3 -6
  66. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  67. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  68. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs +456 -681
  69. data/bridge/sdk-core/core-api/Cargo.toml +6 -4
  70. data/bridge/sdk-core/core-api/src/errors.rs +1 -34
  71. data/bridge/sdk-core/core-api/src/lib.rs +7 -45
  72. data/bridge/sdk-core/core-api/src/telemetry.rs +141 -0
  73. data/bridge/sdk-core/core-api/src/worker.rs +27 -1
  74. data/bridge/sdk-core/etc/deps.svg +115 -140
  75. data/bridge/sdk-core/etc/regen-depgraph.sh +5 -0
  76. data/bridge/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +18 -15
  77. data/bridge/sdk-core/fsm/rustfsm_procmacro/tests/trybuild/no_handle_conversions_require_into_fail.stderr +1 -1
  78. data/bridge/sdk-core/fsm/rustfsm_trait/src/lib.rs +8 -3
  79. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  80. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-23_history.bin +0 -0
  81. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-85_history.bin +0 -0
  82. data/bridge/sdk-core/protos/api_upstream/buf.yaml +0 -3
  83. data/bridge/sdk-core/protos/api_upstream/build/go.mod +7 -0
  84. data/bridge/sdk-core/protos/api_upstream/build/go.sum +5 -0
  85. data/bridge/sdk-core/protos/api_upstream/{temporal/api/enums/v1/cluster.proto → build/tools.go} +7 -18
  86. data/bridge/sdk-core/protos/api_upstream/go.mod +6 -0
  87. data/bridge/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +12 -9
  88. data/bridge/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +15 -26
  89. data/bridge/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +13 -2
  90. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
  91. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +4 -9
  92. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
  93. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +10 -8
  94. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +28 -2
  95. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
  96. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
  97. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
  98. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
  99. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
  100. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +24 -19
  101. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
  102. data/bridge/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
  103. data/bridge/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
  104. data/bridge/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
  105. data/bridge/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +62 -26
  106. data/bridge/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +4 -2
  107. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +24 -61
  108. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -21
  109. data/bridge/sdk-core/protos/api_upstream/temporal/api/protocol/v1/message.proto +57 -0
  110. data/bridge/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
  111. data/bridge/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
  112. data/bridge/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +110 -31
  113. data/bridge/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  114. data/bridge/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +4 -4
  115. data/bridge/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +71 -6
  116. data/bridge/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
  117. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +3 -2
  118. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +111 -36
  119. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +19 -5
  120. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +1 -0
  121. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +1 -0
  122. data/bridge/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +1 -0
  123. data/bridge/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  124. data/bridge/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  125. data/bridge/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +1 -0
  126. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +9 -0
  127. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +9 -1
  128. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +6 -0
  129. data/bridge/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
  130. data/bridge/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
  131. data/bridge/sdk-core/sdk/Cargo.toml +4 -3
  132. data/bridge/sdk-core/sdk/src/interceptors.rs +36 -3
  133. data/bridge/sdk-core/sdk/src/lib.rs +94 -25
  134. data/bridge/sdk-core/sdk/src/workflow_context.rs +13 -2
  135. data/bridge/sdk-core/sdk/src/workflow_future.rs +10 -13
  136. data/bridge/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  137. data/bridge/sdk-core/sdk-core-protos/build.rs +36 -2
  138. data/bridge/sdk-core/sdk-core-protos/src/history_builder.rs +164 -104
  139. data/bridge/sdk-core/sdk-core-protos/src/history_info.rs +27 -23
  140. data/bridge/sdk-core/sdk-core-protos/src/lib.rs +252 -74
  141. data/bridge/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  142. data/bridge/sdk-core/test-utils/Cargo.toml +4 -1
  143. data/bridge/sdk-core/test-utils/src/canned_histories.rs +106 -296
  144. data/bridge/sdk-core/test-utils/src/histfetch.rs +1 -1
  145. data/bridge/sdk-core/test-utils/src/lib.rs +161 -50
  146. data/bridge/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  147. data/bridge/sdk-core/test-utils/src/workflows.rs +29 -0
  148. data/bridge/sdk-core/tests/fuzzy_workflow.rs +130 -0
  149. data/bridge/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  150. data/bridge/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  151. data/bridge/sdk-core/tests/integ_tests/heartbeat_tests.rs +10 -5
  152. data/bridge/sdk-core/tests/integ_tests/metrics_tests.rs +239 -0
  153. data/bridge/sdk-core/tests/integ_tests/polling_tests.rs +4 -60
  154. data/bridge/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  155. data/bridge/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  156. data/bridge/sdk-core/tests/integ_tests/workflow_tests/activities.rs +93 -69
  157. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  158. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  159. data/bridge/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +1 -0
  160. data/bridge/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  161. data/bridge/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  162. data/bridge/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +151 -116
  163. data/bridge/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +54 -0
  164. data/bridge/sdk-core/tests/integ_tests/workflow_tests/patches.rs +7 -28
  165. data/bridge/sdk-core/tests/integ_tests/workflow_tests/replay.rs +115 -24
  166. data/bridge/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  167. data/bridge/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  168. data/bridge/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  169. data/bridge/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  170. data/bridge/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -4
  171. data/bridge/sdk-core/tests/integ_tests/workflow_tests.rs +27 -18
  172. data/bridge/sdk-core/tests/main.rs +8 -16
  173. data/bridge/sdk-core/tests/runner.rs +75 -36
  174. data/bridge/sdk-core/tests/wf_input_replay.rs +32 -0
  175. data/bridge/src/connection.rs +117 -82
  176. data/bridge/src/lib.rs +356 -42
  177. data/bridge/src/runtime.rs +10 -3
  178. data/bridge/src/test_server.rs +153 -0
  179. data/bridge/src/worker.rs +133 -9
  180. data/lib/gen/temporal/api/batch/v1/message_pb.rb +8 -6
  181. data/lib/gen/temporal/api/command/v1/message_pb.rb +10 -16
  182. data/lib/gen/temporal/api/common/v1/message_pb.rb +5 -1
  183. data/lib/gen/temporal/api/enums/v1/batch_operation_pb.rb +2 -1
  184. data/lib/gen/temporal/api/enums/v1/command_type_pb.rb +3 -3
  185. data/lib/gen/temporal/api/enums/v1/common_pb.rb +2 -1
  186. data/lib/gen/temporal/api/enums/v1/event_type_pb.rb +5 -4
  187. data/lib/gen/temporal/api/enums/v1/failed_cause_pb.rb +9 -1
  188. data/lib/gen/temporal/api/enums/v1/namespace_pb.rb +1 -1
  189. data/lib/gen/temporal/api/enums/v1/query_pb.rb +1 -1
  190. data/lib/gen/temporal/api/enums/v1/reset_pb.rb +1 -1
  191. data/lib/gen/temporal/api/enums/v1/schedule_pb.rb +1 -1
  192. data/lib/gen/temporal/api/enums/v1/task_queue_pb.rb +1 -1
  193. data/lib/gen/temporal/api/enums/v1/update_pb.rb +7 -10
  194. data/lib/gen/temporal/api/enums/v1/workflow_pb.rb +1 -1
  195. data/lib/gen/temporal/api/errordetails/v1/message_pb.rb +1 -1
  196. data/lib/gen/temporal/api/failure/v1/message_pb.rb +1 -1
  197. data/lib/gen/temporal/api/filter/v1/message_pb.rb +1 -1
  198. data/lib/gen/temporal/api/history/v1/message_pb.rb +34 -25
  199. data/lib/gen/temporal/api/namespace/v1/message_pb.rb +2 -1
  200. data/lib/gen/temporal/api/operatorservice/v1/request_response_pb.rb +14 -51
  201. data/lib/gen/temporal/api/operatorservice/v1/service_pb.rb +1 -1
  202. data/lib/gen/temporal/api/protocol/v1/message_pb.rb +30 -0
  203. data/lib/gen/temporal/api/query/v1/message_pb.rb +1 -1
  204. data/lib/gen/temporal/api/replication/v1/message_pb.rb +1 -1
  205. data/lib/gen/temporal/api/schedule/v1/message_pb.rb +22 -1
  206. data/lib/gen/temporal/api/sdk/v1/task_complete_metadata_pb.rb +23 -0
  207. data/lib/gen/temporal/api/taskqueue/v1/message_pb.rb +2 -2
  208. data/lib/gen/temporal/api/testservice/v1/request_response_pb.rb +49 -0
  209. data/lib/gen/temporal/api/testservice/v1/service_pb.rb +21 -0
  210. data/lib/gen/temporal/api/update/v1/message_pb.rb +49 -3
  211. data/lib/gen/temporal/api/version/v1/message_pb.rb +1 -1
  212. data/lib/gen/temporal/api/workflow/v1/message_pb.rb +2 -1
  213. data/lib/gen/temporal/api/workflowservice/v1/request_response_pb.rb +47 -20
  214. data/lib/gen/temporal/api/workflowservice/v1/service_pb.rb +1 -1
  215. data/lib/gen/temporal/sdk/core/activity_result/activity_result_pb.rb +13 -9
  216. data/lib/gen/temporal/sdk/core/activity_task/activity_task_pb.rb +10 -6
  217. data/lib/gen/temporal/sdk/core/child_workflow/child_workflow_pb.rb +13 -9
  218. data/lib/gen/temporal/sdk/core/common/common_pb.rb +7 -3
  219. data/lib/gen/temporal/sdk/core/core_interface_pb.rb +9 -3
  220. data/lib/gen/temporal/sdk/core/external_data/external_data_pb.rb +7 -3
  221. data/lib/gen/temporal/sdk/core/workflow_activation/workflow_activation_pb.rb +28 -21
  222. data/lib/gen/temporal/sdk/core/workflow_commands/workflow_commands_pb.rb +32 -24
  223. data/lib/gen/temporal/sdk/core/workflow_completion/workflow_completion_pb.rb +12 -5
  224. data/lib/temporalio/activity/context.rb +102 -0
  225. data/lib/temporalio/activity/info.rb +67 -0
  226. data/lib/temporalio/activity.rb +85 -0
  227. data/lib/temporalio/bridge/connect_options.rb +15 -0
  228. data/lib/temporalio/bridge/error.rb +8 -0
  229. data/lib/temporalio/bridge/retry_config.rb +24 -0
  230. data/lib/temporalio/bridge/tls_options.rb +19 -0
  231. data/lib/temporalio/bridge.rb +14 -0
  232. data/lib/{temporal → temporalio}/client/implementation.rb +57 -56
  233. data/lib/{temporal → temporalio}/client/workflow_handle.rb +35 -35
  234. data/lib/{temporal → temporalio}/client.rb +19 -32
  235. data/lib/temporalio/connection/retry_config.rb +44 -0
  236. data/lib/temporalio/connection/service.rb +20 -0
  237. data/lib/temporalio/connection/test_service.rb +92 -0
  238. data/lib/temporalio/connection/tls_options.rb +51 -0
  239. data/lib/temporalio/connection/workflow_service.rb +731 -0
  240. data/lib/temporalio/connection.rb +86 -0
  241. data/lib/{temporal → temporalio}/data_converter.rb +76 -35
  242. data/lib/{temporal → temporalio}/error/failure.rb +6 -6
  243. data/lib/{temporal → temporalio}/error/workflow_failure.rb +4 -2
  244. data/lib/{temporal → temporalio}/errors.rb +19 -1
  245. data/lib/{temporal → temporalio}/failure_converter/base.rb +5 -5
  246. data/lib/{temporal → temporalio}/failure_converter/basic.rb +58 -52
  247. data/lib/temporalio/failure_converter.rb +7 -0
  248. data/lib/temporalio/interceptor/activity_inbound.rb +22 -0
  249. data/lib/temporalio/interceptor/activity_outbound.rb +24 -0
  250. data/lib/{temporal → temporalio}/interceptor/chain.rb +7 -6
  251. data/lib/{temporal → temporalio}/interceptor/client.rb +27 -2
  252. data/lib/temporalio/interceptor.rb +22 -0
  253. data/lib/{temporal → temporalio}/payload_codec/base.rb +5 -5
  254. data/lib/{temporal → temporalio}/payload_converter/base.rb +3 -3
  255. data/lib/{temporal → temporalio}/payload_converter/bytes.rb +4 -3
  256. data/lib/{temporal → temporalio}/payload_converter/composite.rb +7 -5
  257. data/lib/{temporal → temporalio}/payload_converter/encoding_base.rb +4 -4
  258. data/lib/{temporal → temporalio}/payload_converter/json.rb +4 -3
  259. data/lib/{temporal → temporalio}/payload_converter/nil.rb +4 -3
  260. data/lib/temporalio/payload_converter.rb +14 -0
  261. data/lib/{temporal → temporalio}/retry_policy.rb +17 -7
  262. data/lib/{temporal → temporalio}/retry_state.rb +1 -1
  263. data/lib/temporalio/runtime.rb +25 -0
  264. data/lib/temporalio/testing/time_skipping_handle.rb +32 -0
  265. data/lib/temporalio/testing/time_skipping_interceptor.rb +23 -0
  266. data/lib/temporalio/testing/workflow_environment.rb +112 -0
  267. data/lib/temporalio/testing.rb +175 -0
  268. data/lib/{temporal → temporalio}/timeout_type.rb +2 -2
  269. data/lib/temporalio/version.rb +3 -0
  270. data/lib/temporalio/worker/activity_runner.rb +114 -0
  271. data/lib/temporalio/worker/activity_worker.rb +164 -0
  272. data/lib/temporalio/worker/reactor.rb +46 -0
  273. data/lib/temporalio/worker/runner.rb +63 -0
  274. data/lib/temporalio/worker/sync_worker.rb +124 -0
  275. data/lib/temporalio/worker/thread_pool_executor.rb +51 -0
  276. data/lib/temporalio/worker.rb +204 -0
  277. data/lib/temporalio/workflow/async.rb +46 -0
  278. data/lib/{temporal → temporalio}/workflow/execution_info.rb +4 -4
  279. data/lib/{temporal → temporalio}/workflow/execution_status.rb +1 -1
  280. data/lib/temporalio/workflow/future.rb +138 -0
  281. data/lib/{temporal → temporalio}/workflow/id_reuse_policy.rb +6 -6
  282. data/lib/temporalio/workflow/info.rb +76 -0
  283. data/lib/{temporal → temporalio}/workflow/query_reject_condition.rb +5 -5
  284. data/lib/temporalio.rb +12 -3
  285. data/temporalio.gemspec +11 -6
  286. metadata +137 -64
  287. data/bridge/sdk-core/Cargo.lock +0 -2606
  288. data/bridge/sdk-core/bridge-ffi/Cargo.toml +0 -24
  289. data/bridge/sdk-core/bridge-ffi/LICENSE.txt +0 -23
  290. data/bridge/sdk-core/bridge-ffi/build.rs +0 -25
  291. data/bridge/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -249
  292. data/bridge/sdk-core/bridge-ffi/src/lib.rs +0 -825
  293. data/bridge/sdk-core/bridge-ffi/src/wrappers.rs +0 -211
  294. data/bridge/sdk-core/core/src/log_export.rs +0 -62
  295. data/bridge/sdk-core/core/src/worker/workflow/machines/mutable_side_effect_state_machine.rs +0 -127
  296. data/bridge/sdk-core/core/src/worker/workflow/machines/side_effect_state_machine.rs +0 -71
  297. data/bridge/sdk-core/protos/api_upstream/temporal/api/cluster/v1/message.proto +0 -83
  298. data/bridge/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
  299. data/bridge/sdk-core/sdk/src/conversions.rs +0 -8
  300. data/lib/bridge.so +0 -0
  301. data/lib/gen/temporal/api/cluster/v1/message_pb.rb +0 -67
  302. data/lib/gen/temporal/api/enums/v1/cluster_pb.rb +0 -26
  303. data/lib/gen/temporal/sdk/core/bridge/bridge_pb.rb +0 -222
  304. data/lib/temporal/bridge.rb +0 -14
  305. data/lib/temporal/connection.rb +0 -736
  306. data/lib/temporal/failure_converter.rb +0 -8
  307. data/lib/temporal/payload_converter.rb +0 -14
  308. data/lib/temporal/runtime.rb +0 -22
  309. data/lib/temporal/version.rb +0 -3
  310. data/lib/temporal.rb +0 -8
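
The single rendered diff below appears to cover data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs (entry 68 in the list above); only part of that file's changes are shown. The file list itself also documents the gem-wide move of the Ruby code from lib/temporal to lib/temporalio and the new Connection, Worker, Activity, and Testing classes.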
@@ -1,217 +1,177 @@
+ #[cfg(feature = "save_wf_inputs")]
+ mod saved_wf_inputs;
+ #[cfg(feature = "save_wf_inputs")]
+ mod tonic_status_serde;
+
+ #[cfg(feature = "save_wf_inputs")]
+ pub use saved_wf_inputs::replay_wf_state_inputs;
+
  use crate::{
- abstractions::{dbg_panic, stream_when_allowed, MeteredSemaphore},
- protosext::ValidPollWFTQResponse,
- telemetry::metrics::workflow_worker_type,
- worker::{
- workflow::{history_update::NextPageToken, run_cache::RunCache, *},
- LocalActRequest, LocalActivityResolution, LEGACY_QUERY_ID,
+ abstractions::dbg_panic,
+ worker::workflow::{
+ managed_run::RunUpdateAct,
+ run_cache::RunCache,
+ wft_extraction::{HistfetchRC, HistoryFetchReq, WFTExtractorOutput},
+ *,
  },
  MetricsContext,
  };
  use futures::{stream, stream::PollNext, Stream, StreamExt};
- use std::{collections::VecDeque, fmt::Debug, future, sync::Arc, time::Instant};
- use temporal_sdk_core_api::errors::{PollWfError, WFMachinesError};
- use temporal_sdk_core_protos::{
- coresdk::{
- workflow_activation::{
- create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
- workflow_activation_job,
- },
- workflow_completion::Failure,
- },
- temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure as TFailure},
- };
- use tokio::sync::{mpsc::unbounded_channel, oneshot};
- use tokio_stream::wrappers::UnboundedReceiverStream;
+ use std::{collections::VecDeque, fmt::Debug, future, sync::Arc};
+ use temporal_sdk_core_api::errors::PollWfError;
+ use temporal_sdk_core_protos::coresdk::workflow_activation::remove_from_cache::EvictionReason;
  use tokio_util::sync::CancellationToken;
  use tracing::{Level, Span};

- /// This struct holds all the state needed for tracking what workflow runs are currently cached
- /// and how WFTs should be dispatched to them, etc.
+ /// This struct holds all the state needed for tracking the state of currently cached workflow runs
+ /// and directs all actions which affect them. It is ultimately the top-level arbiter of nearly
+ /// everything important relating to workflow state.
  ///
  /// See [WFStream::build] for more
- pub(crate) struct WFStream {
+ pub(super) struct WFStream {
  runs: RunCache,
  /// Buffered polls for new runs which need a cache slot to open up before we can handle them
  buffered_polls_need_cache_slot: VecDeque<PermittedWFT>,
+ /// Is filled with runs that we decided need to have their history fetched during state
+ /// manipulation. Must be drained after handling each input.
+ runs_needing_fetching: VecDeque<HistoryFetchReq>,

- /// Client for accessing server for history pagination etc.
- client: Arc<dyn WorkerClient>,
-
- /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
- wft_semaphore: MeteredSemaphore,
+ history_fetch_refcounter: Arc<HistfetchRC>,
  shutdown_token: CancellationToken,
+ ignore_evicts_on_shutdown: bool,

  metrics: MetricsContext,
- }
- /// All possible inputs to the [WFStream]
- #[derive(derive_more::From, Debug)]
- enum WFStreamInput {
- NewWft(PermittedWFT),
- Local(LocalInput),
- /// The stream given to us which represents the poller (or a mock) terminated.
- PollerDead,
- /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
- /// error while polling
- PollerError(tonic::Status),
- }
- impl From<RunUpdateResponse> for WFStreamInput {
- fn from(r: RunUpdateResponse) -> Self {
- WFStreamInput::Local(LocalInput {
- input: LocalInputs::RunUpdateResponse(r.kind),
- span: r.span,
- })
- }
- }
- /// A non-poller-received input to the [WFStream]
- #[derive(derive_more::DebugCustom)]
- #[debug(fmt = "LocalInput {{ {:?} }}", input)]
- pub(super) struct LocalInput {
- pub input: LocalInputs,
- pub span: Span,
- }
- /// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
- /// new polls.
- #[derive(Debug, derive_more::From)]
- pub(super) enum LocalInputs {
- Completion(WFActCompleteMsg),
- LocalResolution(LocalResolutionMsg),
- PostActivation(PostActivationMsg),
- RunUpdateResponse(RunUpdateResponseKind),
- RequestEviction(RequestEvictMsg),
- GetStateInfo(GetStateInfoMsg),
- }
- #[derive(Debug, derive_more::From)]
- #[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
- enum ExternalPollerInputs {
- NewWft(PermittedWFT),
- PollerDead,
- PollerError(tonic::Status),
- }
- impl From<ExternalPollerInputs> for WFStreamInput {
- fn from(l: ExternalPollerInputs) -> Self {
- match l {
- ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
- ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
- ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
- }
- }
- }

+ #[cfg(feature = "save_wf_inputs")]
+ wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
+ }
  impl WFStream {
  /// Constructs workflow state management and returns a stream which outputs activations.
  ///
- /// * `external_wfts` is a stream of validated poll responses as returned by a poller (or mock)
- /// * `wfts_from_complete` is the recv side of a channel that new WFTs from completions should
- /// come down.
+ /// * `wft_stream` is a stream of validated poll responses and fetched history pages as returned
+ /// by a poller (or mock), via [WFTExtractor].
  /// * `local_rx` is a stream of actions that workflow state needs to see. Things like
- /// completions, local activities finishing, etc. See [LocalInputs].
+ /// completions, local activities finishing, etc. See [LocalInputs].
+ /// * `local_activity_request_sink` is used to handle outgoing requests to start or cancel
+ /// local activities, and may return resolutions that need to be handled immediately.
  ///
- /// These inputs are combined, along with an internal feedback channel for run-specific updates,
- /// to form the inputs to a stream of [WFActStreamInput]s. The stream processor then takes
- /// action on those inputs, and then may yield activations.
+ /// The stream inputs are combined into a stream of [WFActStreamInput]s. The stream processor
+ /// then takes action on those inputs, mutating the [WFStream] state, and then may yield
+ /// activations.
  ///
- /// Updating runs may need to do async work like fetching additional history. In order to
- /// facilitate this, each run lives in its own task which is communicated with by sending
- /// [RunAction]s and receiving [RunUpdateResponse]s via its [ManagedRunHandle].
+ /// Importantly, nothing async happens while actually mutating state. This means all changes to
+ /// all workflow state can be represented purely via the stream of inputs, plus the
+ /// calls/retvals from the LA request sink, which is the last unfortunate bit of impurity in
+ /// the design. Eliminating it would be nice, so that all inputs come from the passed-in streams
+ /// and all outputs flow from the return stream, but it's difficult to do so since it would
+ /// require "pausing" in-progress changes to a run while sending & waiting for response from
+ /// local activity management. Likely the best option would be to move the pure state info
+ /// needed to determine immediate responses into LA state machines themselves (out of the LA
+ /// manager), which is a quite substantial change.
  pub(super) fn build(
  basics: WorkflowBasics,
- external_wfts: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
+ wft_stream: impl Stream<Item = Result<WFTExtractorOutput, tonic::Status>> + Send + 'static,
  local_rx: impl Stream<Item = LocalInput> + Send + 'static,
- client: Arc<dyn WorkerClient>,
- local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
- + Send
- + Sync
- + 'static,
- ) -> impl Stream<Item = Result<ActivationOrAuto, PollWfError>> {
- let wft_semaphore = MeteredSemaphore::new(
- basics.max_outstanding_wfts,
- basics.metrics.with_new_attrs([workflow_worker_type()]),
- MetricsContext::available_task_slots,
- );
- let wft_sem_clone = wft_semaphore.clone();
- let proceeder = move || {
- let wft_sem_clone = wft_sem_clone.clone();
- async move { wft_sem_clone.acquire_owned().await.unwrap() }
- };
- let poller_wfts = stream_when_allowed(external_wfts, proceeder);
- let (run_update_tx, run_update_rx) = unbounded_channel();
- let local_rx = stream::select(
- local_rx.map(Into::into),
- UnboundedReceiverStream::new(run_update_rx).map(Into::into),
- );
+ local_activity_request_sink: impl LocalActivityRequestSink,
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
  let all_inputs = stream::select_with_strategy(
- local_rx,
- poller_wfts
- .map(|(wft, permit)| match wft {
- Ok(wft) => ExternalPollerInputs::NewWft(PermittedWFT { wft, permit }),
- Err(e) => ExternalPollerInputs::PollerError(e),
- })
+ local_rx.map(Into::into),
+ wft_stream
+ .map(Into::into)
  .chain(stream::once(async { ExternalPollerInputs::PollerDead }))
  .map(Into::into)
  .boxed(),
  // Priority always goes to the local stream
  |_: &mut ()| PollNext::Left,
  );
+ Self::build_internal(all_inputs, basics, local_activity_request_sink)
+ }
+
+ fn build_internal(
+ all_inputs: impl Stream<Item = WFStreamInput>,
+ basics: WorkflowBasics,
+ local_activity_request_sink: impl LocalActivityRequestSink,
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
  let mut state = WFStream {
  buffered_polls_need_cache_slot: Default::default(),
  runs: RunCache::new(
  basics.max_cached_workflows,
  basics.namespace.clone(),
- run_update_tx,
- Arc::new(local_activity_request_sink),
+ basics.server_capabilities.clone(),
+ local_activity_request_sink,
  basics.metrics.clone(),
  ),
- client,
- wft_semaphore,
  shutdown_token: basics.shutdown_token,
+ ignore_evicts_on_shutdown: basics.ignore_evicts_on_shutdown,
  metrics: basics.metrics,
+ runs_needing_fetching: Default::default(),
+ history_fetch_refcounter: Arc::new(HistfetchRC {}),
+
+ #[cfg(feature = "save_wf_inputs")]
+ wf_state_inputs: basics.wf_state_inputs,
  };
  all_inputs
- .map(move |action| {
+ .map(move |action: WFStreamInput| {
  let span = span!(Level::DEBUG, "new_stream_input", action=?action);
  let _span_g = span.enter();

- let maybe_activation = match action {
+ #[cfg(feature = "save_wf_inputs")]
+ let maybe_write = state.prep_input(&action);
+
+ let mut activations = vec![];
+ let maybe_act = match action {
  WFStreamInput::NewWft(pwft) => {
- debug!(run_id=%pwft.wft.workflow_execution.run_id, "New WFT");
- state.instantiate_or_update(pwft);
- None
+ debug!(run_id=%pwft.work.execution.run_id, "New WFT");
+ state.instantiate_or_update(pwft)
  }
  WFStreamInput::Local(local_input) => {
  let _span_g = local_input.span.enter();
- match local_input.input {
- LocalInputs::RunUpdateResponse(resp) => {
- state.process_run_update_response(resp)
+ if let Some(rid) = local_input.input.run_id() {
+ if let Some(rh) = state.runs.get_mut(rid) {
+ rh.record_span_fields(&local_input.span);
  }
+ }
+ match local_input.input {
  LocalInputs::Completion(completion) => {
- state.process_completion(completion);
- None
+ activations.extend(
+ state.process_completion(NewOrFetchedComplete::New(completion)),
+ );
+ None // completions can return more than one activation
+ }
+ LocalInputs::FetchedPageCompletion { paginator, update } => {
+ activations.extend(state.process_completion(
+ NewOrFetchedComplete::Fetched(update, paginator),
+ ));
+ None // completions can return more than one activation
  }
  LocalInputs::PostActivation(report) => {
- state.process_post_activation(report);
- None
+ state.process_post_activation(report)
  }
- LocalInputs::LocalResolution(res) => {
- state.local_resolution(res);
- None
+ LocalInputs::LocalResolution(res) => state.local_resolution(res),
+ LocalInputs::HeartbeatTimeout(hbt) => {
+ state.process_heartbeat_timeout(hbt)
  }
  LocalInputs::RequestEviction(evict) => {
- state.request_eviction(evict);
- None
+ state.request_eviction(evict).into_run_update_resp()
  }
  LocalInputs::GetStateInfo(gsi) => {
  let _ = gsi.response_tx.send(WorkflowStateInfo {
  cached_workflows: state.runs.len(),
  outstanding_wft: state.outstanding_wfts(),
- available_wft_permits: state.wft_semaphore.available_permits(),
  });
  None
  }
  }
  }
+ WFStreamInput::FailedFetch { run_id, err } => state
+ .request_eviction(RequestEvictMsg {
+ run_id,
+ message: format!("Fetching history failed: {err:?}"),
+ reason: EvictionReason::PaginationOrHistoryFetch,
+ })
+ .into_run_update_resp(),
  WFStreamInput::PollerDead => {
- debug!("WFT poller died, shutting down");
+ debug!("WFT poller died, beginning shutdown");
  state.shutdown_token.cancel();
  None
  }
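
A standalone sketch of the input-merging pattern used by WFStream::build in the hunk above, assuming only the futures crate and invented Input variants (this is not SDK code): the local stream always wins polling priority over the poller stream, and the poller stream is capped with a PollerDead-style sentinel.

    use futures::executor::block_on;
    use futures::stream::{self, PollNext, StreamExt};

    #[derive(Debug)]
    enum Input {
        Local(&'static str),
        Poll(&'static str),
        PollerDead,
    }

    fn main() {
        block_on(async {
            // Stand-ins for the real local_rx and wft_stream inputs.
            let local_rx =
                stream::iter(vec![Input::Local("completion"), Input::Local("eviction")]);
            let poller = stream::iter(vec![Input::Poll("wft-1"), Input::Poll("wft-2")])
                // As in the diff, the poller stream ends with a sentinel value.
                .chain(stream::once(async { Input::PollerDead }))
                .boxed();

            // The left (local) stream is always polled first, mirroring
            // `|_: &mut ()| PollNext::Left` in the diff.
            let mut all_inputs =
                stream::select_with_strategy(local_rx, poller, |_: &mut ()| PollNext::Left);

            while let Some(input) = all_inputs.next().await {
                println!("{input:?}");
            }
        });
    }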
@@ -221,457 +181,228 @@ impl WFStream {
221
181
  }
222
182
  };
223
183
 
224
- if let Some(ref act) = maybe_activation {
225
- if let Some(run_handle) = state.runs.get_mut(act.run_id()) {
226
- run_handle.insert_outstanding_activation(act);
227
- } else {
228
- dbg_panic!("Tried to insert activation for missing run!");
229
- }
184
+ activations.extend(maybe_act.into_iter());
185
+ activations.extend(state.reconcile_buffered());
186
+
187
+ // Always flush *after* actually handling the input, as this allows LA sink
188
+ // responses to be recorded before the input, so they can be read and buffered to be
189
+ // replayed during the handling of the input itself.
190
+ #[cfg(feature = "save_wf_inputs")]
191
+ if let Some(write) = maybe_write {
192
+ state.flush_write(write);
230
193
  }
231
- state.reconcile_buffered();
194
+
232
195
  if state.shutdown_done() {
196
+ info!("Workflow shutdown is done");
233
197
  return Err(PollWfError::ShutDown);
234
198
  }
235
199
 
236
- Ok(maybe_activation)
200
+ Ok(WFStreamOutput {
201
+ activations: activations.into(),
202
+ fetch_histories: std::mem::take(&mut state.runs_needing_fetching),
203
+ })
237
204
  })
238
- .filter_map(|o| {
239
- future::ready(match o {
240
- Ok(None) => None,
241
- Ok(Some(v)) => Some(Ok(v)),
242
- Err(e) => {
243
- if !matches!(e, PollWfError::ShutDown) {
244
- error!(
205
+ .inspect(|o| {
206
+ if let Some(e) = o.as_ref().err() {
207
+ if !matches!(e, PollWfError::ShutDown) {
208
+ error!(
245
209
  "Workflow processing encountered fatal error and must shut down {:?}",
246
210
  e
247
- );
248
- }
249
- Some(Err(e))
211
+ );
250
212
  }
251
- })
213
+ }
252
214
  })
253
215
  // Stop the stream once we have shut down
254
216
  .take_while(|o| future::ready(!matches!(o, Err(PollWfError::ShutDown))))
255
217
  }
256
218
 
257
- fn process_run_update_response(
258
- &mut self,
259
- resp: RunUpdateResponseKind,
260
- ) -> Option<ActivationOrAuto> {
261
- debug!(resp=%resp, "Processing run update response from machines");
262
- match resp {
263
- RunUpdateResponseKind::Good(mut resp) => {
264
- let run_handle = self
265
- .runs
266
- .get_mut(&resp.run_id)
267
- .expect("Workflow must exist, it just sent us an update response");
268
- run_handle.have_seen_terminal_event = resp.have_seen_terminal_event;
269
- run_handle.more_pending_work = resp.more_pending_work;
270
- run_handle.last_action_acked = true;
271
- run_handle.most_recently_processed_event_number =
272
- resp.most_recently_processed_event_number;
273
-
274
- let r = match resp.outgoing_activation {
275
- Some(ActivationOrAuto::LangActivation(mut activation)) => {
276
- if resp.in_response_to_wft {
277
- let wft = run_handle
278
- .wft
279
- .as_mut()
280
- .expect("WFT must exist for run just updated with one");
281
- // If there are in-poll queries, insert jobs for those queries into the
282
- // activation, but only if we hit the cache. If we didn't, those queries
283
- // will need to be dealt with once replay is over
284
- if !wft.pending_queries.is_empty() && wft.hit_cache {
285
- put_queries_in_act(&mut activation, wft);
286
- }
287
- }
288
-
289
- if activation.jobs.is_empty() {
290
- dbg_panic!("Should not send lang activation with no jobs");
291
- }
292
- Some(ActivationOrAuto::LangActivation(activation))
293
- }
294
- Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
295
- if let Some(wft) = run_handle.wft.as_mut() {
296
- put_queries_in_act(&mut act, wft);
297
- Some(ActivationOrAuto::LangActivation(act))
298
- } else {
299
- dbg_panic!("Ready for queries but no WFT!");
300
- None
301
- }
302
- }
303
- a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
304
- None => {
305
- // If the response indicates there is no activation to send yet but there
306
- // is more pending work, we should check again.
307
- if run_handle.more_pending_work {
308
- run_handle.check_more_activations();
309
- None
310
- } else if let Some(reason) = run_handle.trying_to_evict.as_ref() {
311
- // If a run update came back and had nothing to do, but we're trying to
312
- // evict, just do that now as long as there's no other outstanding work.
313
- if run_handle.activation.is_none() && !run_handle.more_pending_work {
314
- let mut evict_act = create_evict_activation(
315
- resp.run_id,
316
- reason.message.clone(),
317
- reason.reason,
318
- );
319
- evict_act.history_length =
320
- run_handle.most_recently_processed_event_number as u32;
321
- Some(ActivationOrAuto::LangActivation(evict_act))
322
- } else {
323
- None
324
- }
325
- } else {
326
- None
327
- }
328
- }
329
- };
330
- if let Some(f) = resp.fulfillable_complete.take() {
331
- f.fulfill();
332
- }
333
-
334
- // After each run update, check if it's ready to handle any buffered poll
335
- if matches!(&r, Some(ActivationOrAuto::Autocomplete { .. }) | None)
336
- && !run_handle.has_any_pending_work(false, true)
337
- {
338
- if let Some(bufft) = run_handle.buffered_resp.take() {
339
- self.instantiate_or_update(bufft);
340
- }
341
- }
342
- r
343
- }
344
- RunUpdateResponseKind::Fail(fail) => {
345
- if let Some(r) = self.runs.get_mut(&fail.run_id) {
346
- r.last_action_acked = true;
347
- }
348
-
349
- if let Some(resp_chan) = fail.completion_resp {
350
- // Automatically fail the workflow task in the event we couldn't update machines
351
- let fail_cause = if matches!(&fail.err, WFMachinesError::Nondeterminism(_)) {
352
- WorkflowTaskFailedCause::NonDeterministicError
353
- } else {
354
- WorkflowTaskFailedCause::Unspecified
355
- };
356
- let wft_fail_str = format!("{:?}", fail.err);
357
- self.failed_completion(
358
- fail.run_id,
359
- fail_cause,
360
- fail.err.evict_reason(),
361
- TFailure::application_failure(wft_fail_str, false).into(),
362
- resp_chan,
363
- );
364
- } else {
365
- // TODO: This should probably also fail workflow tasks, but that wasn't
366
- // implemented pre-refactor either.
367
- warn!(error=?fail.err, run_id=%fail.run_id, "Error while updating workflow");
368
- self.request_eviction(RequestEvictMsg {
369
- run_id: fail.run_id,
370
- message: format!("Error while updating workflow: {:?}", fail.err),
371
- reason: fail.err.evict_reason(),
372
- });
373
- }
374
- None
219
+ /// Instantiate or update run machines with a new WFT
220
+ #[instrument(skip(self, pwft)
221
+ fields(run_id=%pwft.work.execution.run_id,
222
+ workflow_id=%pwft.work.execution.workflow_id))]
223
+ fn instantiate_or_update(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
224
+ match self._instantiate_or_update(pwft) {
225
+ Err(histfetch) => {
226
+ self.runs_needing_fetching.push_back(histfetch);
227
+ Default::default()
375
228
  }
229
+ Ok(r) => r,
376
230
  }
377
231
  }
378
232
 
379
- #[instrument(level = "debug", skip(self, pwft),
380
- fields(run_id=%pwft.wft.workflow_execution.run_id))]
381
- fn instantiate_or_update(&mut self, pwft: PermittedWFT) {
382
- let (mut work, permit) = if let Some(w) = self.buffer_resp_if_outstanding_work(pwft) {
383
- (w.wft, w.permit)
233
+ fn _instantiate_or_update(
234
+ &mut self,
235
+ pwft: PermittedWFT,
236
+ ) -> Result<RunUpdateAct, HistoryFetchReq> {
237
+ // If the run already exists, possibly buffer the work and return early if we can't handle
238
+ // it yet.
239
+ let pwft = if let Some(rh) = self.runs.get_mut(&pwft.work.execution.run_id) {
240
+ if let Some(w) = rh.buffer_wft_if_outstanding_work(pwft) {
241
+ w
242
+ } else {
243
+ return Ok(None);
244
+ }
384
245
  } else {
385
- return;
246
+ pwft
386
247
  };
387
248
 
388
- let run_id = work.workflow_execution.run_id.clone();
249
+ let run_id = pwft.work.execution.run_id.clone();
389
250
  // If our cache is full and this WFT is for an unseen run we must first evict a run before
390
251
  // we can deal with this task. So, buffer the task in that case.
391
252
  if !self.runs.has_run(&run_id) && self.runs.is_full() {
392
- self.buffer_resp_on_full_cache(PermittedWFT { wft: work, permit });
393
- return;
253
+ self.buffer_resp_on_full_cache(pwft);
254
+ return Ok(None);
394
255
  }
395
256
 
396
- let start_event_id = work.history.events.first().map(|e| e.event_id);
397
- debug!(
398
- run_id = %run_id,
399
- task_token = %&work.task_token,
400
- history_length = %work.history.events.len(),
401
- start_event_id = ?start_event_id,
402
- has_legacy_query = %work.legacy_query.is_some(),
403
- attempt = %work.attempt,
404
- "Applying new workflow task from server"
405
- );
406
-
407
- let wft_info = WorkflowTaskInfo {
408
- attempt: work.attempt,
409
- task_token: work.task_token,
410
- };
411
- let poll_resp_is_incremental = work
412
- .history
413
- .events
414
- .get(0)
415
- .map(|ev| ev.event_id > 1)
416
- .unwrap_or_default();
417
- let poll_resp_is_incremental = poll_resp_is_incremental || work.history.events.is_empty();
418
-
419
- let mut did_miss_cache = !poll_resp_is_incremental;
420
-
421
- let page_token = if !self.runs.has_run(&run_id) && poll_resp_is_incremental {
257
+ // This check can't really be lifted up higher since we could EX: See it's in the cache,
258
+ // not fetch more history, send the task, see cache is full, buffer it, then evict that
259
+ // run, and now we still have a cache miss.
260
+ if !self.runs.has_run(&run_id) && pwft.work.is_incremental() {
422
261
  debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
423
262
  cache. Will fetch history");
424
263
  self.metrics.sticky_cache_miss();
425
- did_miss_cache = true;
426
- NextPageToken::FetchFromStart
427
- } else {
428
- work.next_page_token.into()
429
- };
430
- let history_update = HistoryUpdate::new(
431
- HistoryPaginator::new(
432
- work.history,
433
- work.workflow_execution.workflow_id.clone(),
434
- run_id.clone(),
435
- page_token,
436
- self.client.clone(),
437
- ),
438
- work.previous_started_event_id,
439
- );
440
- let legacy_query_from_poll = work
441
- .legacy_query
442
- .take()
443
- .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
444
-
445
- let mut pending_queries = work.query_requests.into_iter().collect::<Vec<_>>();
446
- if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
447
- error!(
448
- "Server issued both normal and legacy queries. This should not happen. Please \
449
- file a bug report."
450
- );
451
- self.request_eviction(RequestEvictMsg {
452
- run_id,
453
- message: "Server issued both normal and legacy query".to_string(),
454
- reason: EvictionReason::Fatal,
455
- });
456
- return;
457
- }
458
- if let Some(lq) = legacy_query_from_poll {
459
- pending_queries.push(lq);
264
+ return Err(HistoryFetchReq::Full(
265
+ CacheMissFetchReq { original_wft: pwft },
266
+ self.history_fetch_refcounter.clone(),
267
+ ));
460
268
  }
461
269
 
462
- let start_time = Instant::now();
463
- let run_handle = self.runs.instantiate_or_update(
464
- &run_id,
465
- &work.workflow_execution.workflow_id,
466
- &work.workflow_type,
467
- history_update,
468
- start_time,
469
- );
470
- run_handle.wft = Some(OutstandingTask {
471
- info: wft_info,
472
- hit_cache: !did_miss_cache,
473
- pending_queries,
474
- start_time,
475
- permit,
476
- })
270
+ let rur = self.runs.instantiate_or_update(pwft);
271
+ Ok(rur)
477
272
  }
478
273
 
479
- #[instrument(level = "debug", skip(self, complete),
480
- fields(run_id=%complete.completion.run_id()))]
481
- fn process_completion(&mut self, complete: WFActCompleteMsg) {
482
- match complete.completion {
483
- ValidatedCompletion::Success { run_id, commands } => {
484
- self.successful_completion(run_id, commands, complete.response_tx);
485
- }
486
- ValidatedCompletion::Fail { run_id, failure } => {
487
- self.failed_completion(
488
- run_id,
489
- WorkflowTaskFailedCause::Unspecified,
274
+ fn process_completion(&mut self, complete: NewOrFetchedComplete) -> Vec<ActivationOrAuto> {
275
+ let rh = if let Some(rh) = self.runs.get_mut(complete.run_id()) {
276
+ rh
277
+ } else {
278
+ dbg_panic!("Run missing during completion {:?}", complete);
279
+ return vec![];
280
+ };
281
+ let mut acts: Vec<_> = match complete {
282
+ NewOrFetchedComplete::New(complete) => match complete.completion {
283
+ ValidatedCompletion::Success {
284
+ commands,
285
+ used_flags,
286
+ ..
287
+ } => match rh.successful_completion(commands, used_flags, complete.response_tx) {
288
+ Ok(acts) => acts,
289
+ Err(npr) => {
290
+ self.runs_needing_fetching
291
+ .push_back(HistoryFetchReq::NextPage(
292
+ npr,
293
+ self.history_fetch_refcounter.clone(),
294
+ ));
295
+ None
296
+ }
297
+ },
298
+ ValidatedCompletion::Fail { failure, .. } => rh.failed_completion(
299
+ failure.force_cause(),
490
300
  EvictionReason::LangFail,
491
301
  failure,
492
302
  complete.response_tx,
493
- );
303
+ ),
304
+ },
305
+ NewOrFetchedComplete::Fetched(update, paginator) => {
306
+ rh.fetched_page_completion(update, paginator)
494
307
  }
495
308
  }
309
+ .into_iter()
310
+ .collect();
496
311
  // Always queue evictions after completion when we have a zero-size cache
497
312
  if self.runs.cache_capacity() == 0 {
498
- self.request_eviction_of_lru_run();
313
+ acts.extend(self.request_eviction_of_lru_run().into_run_update_resp())
499
314
  }
315
+ acts
500
316
  }
501
317
 
502
- fn successful_completion(
503
- &mut self,
504
- run_id: String,
505
- mut commands: Vec<WFCommand>,
506
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
507
- ) {
508
- let activation_was_only_eviction = self.activation_has_only_eviction(&run_id);
509
- let (task_token, has_pending_query, start_time) =
510
- if let Some(entry) = self.get_task(&run_id) {
511
- (
512
- entry.info.task_token.clone(),
513
- !entry.pending_queries.is_empty(),
514
- entry.start_time,
515
- )
516
- } else {
517
- if !activation_was_only_eviction {
518
- // Not an error if this was an eviction, since it's normal to issue eviction
519
- // activations without an associated workflow task in that case.
520
- dbg_panic!(
521
- "Attempted to complete activation for run {} without associated workflow task",
318
+ fn process_post_activation(&mut self, report: PostActivationMsg) -> RunUpdateAct {
319
+ let run_id = &report.run_id;
320
+ let wft_from_complete = report.wft_from_complete;
321
+ if let Some((wft, _)) = &wft_from_complete {
322
+ if &wft.execution.run_id != run_id {
323
+ dbg_panic!(
324
+ "Server returned a WFT on completion for a different run ({}) than the \
325
+ one being completed ({}). This is a server bug.",
326
+ wft.execution.run_id,
522
327
  run_id
523
- );
524
- }
525
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
526
- return;
527
- };
528
-
529
- // If the only command from the activation is a legacy query response, that means we need
530
- // to respond differently than a typical activation.
531
- if matches!(&commands.as_slice(),
532
- &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
533
- {
534
- let qr = match commands.remove(0) {
535
- WFCommand::QueryResponse(qr) => qr,
536
- _ => unreachable!("We just verified this is the only command"),
537
- };
538
- self.reply_to_complete(
539
- &run_id,
540
- ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
541
- task_token,
542
- action: ActivationAction::RespondLegacyQuery {
543
- result: Box::new(qr),
544
- },
545
- }),
546
- resp_chan,
547
- );
548
- } else {
549
- // First strip out query responses from other commands that actually affect machines
550
- // Would be prettier with `drain_filter`
551
- let mut i = 0;
552
- let mut query_responses = vec![];
553
- while i < commands.len() {
554
- if matches!(commands[i], WFCommand::QueryResponse(_)) {
555
- if let WFCommand::QueryResponse(qr) = commands.remove(i) {
556
- query_responses.push(qr);
557
- }
558
- } else {
559
- i += 1;
560
- }
561
- }
562
-
563
- let activation_was_eviction = self.activation_has_eviction(&run_id);
564
- if let Some(rh) = self.runs.get_mut(&run_id) {
565
- rh.send_completion(RunActivationCompletion {
566
- task_token,
567
- start_time,
568
- commands,
569
- activation_was_eviction,
570
- activation_was_only_eviction,
571
- has_pending_query,
572
- query_responses,
573
- resp_chan: Some(resp_chan),
574
- });
575
- } else {
576
- dbg_panic!("Run {} missing during completion", run_id);
328
+ );
577
329
  }
578
- };
579
- }
580
-
581
- fn failed_completion(
582
- &mut self,
583
- run_id: String,
584
- cause: WorkflowTaskFailedCause,
585
- reason: EvictionReason,
586
- failure: Failure,
587
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
588
- ) {
589
- let tt = if let Some(tt) = self.get_task(&run_id).map(|t| t.info.task_token.clone()) {
590
- tt
591
- } else {
592
- dbg_panic!(
593
- "No workflow task for run id {} found when trying to fail activation",
594
- run_id
595
- );
596
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
597
- return;
598
- };
599
-
600
- if let Some(m) = self.run_metrics(&run_id) {
601
- m.wf_task_failed();
602
330
  }
603
- let message = format!("Workflow activation completion failed: {:?}", &failure);
604
- // Blow up any cached data associated with the workflow
605
- let should_report = match self.request_eviction(RequestEvictMsg {
606
- run_id: run_id.clone(),
607
- message,
608
- reason,
609
- }) {
610
- EvictionRequestResult::EvictionRequested(Some(attempt))
611
- | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
612
- _ => false,
613
- };
614
- // If the outstanding WFT is a legacy query task, report that we need to fail it
615
- let outcome = if self
616
- .runs
617
- .get(&run_id)
618
- .map(|rh| rh.pending_work_is_legacy_query())
619
- .unwrap_or_default()
620
- {
621
- ActivationCompleteOutcome::ReportWFTFail(
622
- FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
623
- )
624
- } else if should_report {
625
- ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
626
- tt, cause, failure,
627
- ))
628
- } else {
629
- ActivationCompleteOutcome::DoNothing
630
- };
631
- self.reply_to_complete(&run_id, outcome, resp_chan);
632
- }
633
331
 
634
-    fn process_post_activation(&mut self, report: PostActivationMsg) {
-        let run_id = &report.run_id;
+        let mut res = None;
 
         // If we reported to server, we always want to mark it complete.
-        let maybe_t = self.complete_wft(run_id, report.reported_wft_to_server);
+        let maybe_t = self.complete_wft(run_id, report.wft_report_status);
+        // Delete the activation
+        let activation = self
+            .runs
+            .get_mut(run_id)
+            .and_then(|rh| rh.delete_activation());
+
+        // Evict the run if the activation contained an eviction
+        let mut applied_buffered_poll_for_this_run = false;
+        if activation.map(|a| a.has_eviction()).unwrap_or_default() {
+            debug!(run_id=%run_id, "Evicting run");
+
+            if let Some(mut rh) = self.runs.remove(run_id) {
+                if let Some(buff) = rh.take_buffered_wft() {
+                    // Don't try to apply a buffered poll for this run if we just got a new WFT
+                    // from completing, because by definition that buffered poll is now an
+                    // out-of-date WFT.
+                    if wft_from_complete.is_none() {
+                        res = self.instantiate_or_update(buff);
+                        applied_buffered_poll_for_this_run = true;
+                    }
+                }
+            }
 
-        if self
-            .get_activation(run_id)
-            .map(|a| a.has_eviction())
-            .unwrap_or_default()
-        {
-            self.evict_run(run_id);
+            // Attempt to apply a buffered poll for some *other* run, if we didn't have a wft
+            // from complete or a buffered poll for *this* run.
+            if wft_from_complete.is_none() && !applied_buffered_poll_for_this_run {
+                if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
+                    res = self.instantiate_or_update(buff);
+                }
+            }
         };
 
-        if let Some(wft) = report.wft_from_complete {
-            debug!(run_id=%wft.workflow_execution.run_id, "New WFT from completion");
+        if let Some((wft, pag)) = wft_from_complete {
+            debug!(run_id=%wft.execution.run_id, "New WFT from completion");
             if let Some(t) = maybe_t {
-                self.instantiate_or_update(PermittedWFT {
-                    wft,
+                res = self.instantiate_or_update(PermittedWFT {
+                    work: wft,
                     permit: t.permit,
-                })
+                    paginator: pag,
+                });
             }
         }
 
-        if let Some(rh) = self.runs.get_mut(run_id) {
-            // Delete the activation
-            rh.activation.take();
-            // Attempt to produce the next activation if needed
-            rh.check_more_activations();
+        if res.is_none() {
+            if let Some(rh) = self.runs.get_mut(run_id) {
+                // Attempt to produce the next activation if needed
+                res = rh.check_more_activations();
+            }
         }
+        res
     }
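With this change a single post-activation pass produces at most one follow-up action, and the possible sources are consulted in a fixed order: a WFT handed back by the completion itself wins over a buffered poll for the evicted run, which wins over a buffered poll waiting on a cache slot, which wins over whatever the run handle can produce next. A minimal, self-contained sketch of that "first producer wins" ordering; the enum and function here are illustrative stand-ins, not the SDK's actual types:

    /// Illustrative stand-in for the sources of the next piece of work.
    #[derive(Debug, PartialEq)]
    enum NextWork {
        WftFromCompletion,
        BufferedForThisRun,
        BufferedNeedingCacheSlot,
        MoreFromRunHandle,
    }

    /// First available source wins, mirroring the `res.is_none()` chain in the diff.
    fn pick_next(
        wft_from_complete: bool,
        buffered_for_run: bool,
        buffered_needing_slot: bool,
        run_has_more: bool,
    ) -> Option<NextWork> {
        if wft_from_complete {
            Some(NextWork::WftFromCompletion)
        } else if buffered_for_run {
            Some(NextWork::BufferedForThisRun)
        } else if buffered_needing_slot {
            Some(NextWork::BufferedNeedingCacheSlot)
        } else if run_has_more {
            Some(NextWork::MoreFromRunHandle)
        } else {
            None
        }
    }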
 
-    fn local_resolution(&mut self, msg: LocalResolutionMsg) {
+    fn local_resolution(&mut self, msg: LocalResolutionMsg) -> RunUpdateAct {
         let run_id = msg.run_id;
         if let Some(rh) = self.runs.get_mut(&run_id) {
-            rh.send_local_resolution(msg.res)
+            rh.local_resolution(msg.res)
         } else {
             // It isn't an explicit error if the machine is missing when a local activity resolves.
             // This can happen if an activity reports a timeout after we stopped caring about it.
             debug!(run_id = %run_id,
                    "Tried to resolve a local activity for a run we are no longer tracking");
+            None
+        }
+    }
+
+    fn process_heartbeat_timeout(&mut self, run_id: String) -> RunUpdateAct {
+        if let Some(rh) = self.runs.get_mut(&run_id) {
+            rh.heartbeat_timeout()
+        } else {
+            None
         }
     }
 
@@ -679,19 +410,10 @@ impl WFStream {
     /// activation to evict the workflow from the lang side. Workflow will not *actually* be evicted
     /// until lang replies to that activation
     fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
-        let activation_has_eviction = self.activation_has_eviction(&info.run_id);
         if let Some(rh) = self.runs.get_mut(&info.run_id) {
-            let attempts = rh.wft.as_ref().map(|wt| wt.info.attempt);
-            if !activation_has_eviction && rh.trying_to_evict.is_none() {
-                debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
-                rh.trying_to_evict = Some(info);
-                rh.check_more_activations();
-                EvictionRequestResult::EvictionRequested(attempts)
-            } else {
-                EvictionRequestResult::EvictionAlreadyRequested(attempts)
-            }
+            rh.request_eviction(info)
         } else {
-            warn!(run_id=%info.run_id, "Eviction requested for unknown run");
+            debug!(run_id=%info.run_id, "Eviction requested for unknown run");
             EvictionRequestResult::NotFound
         }
     }
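The per-run eviction bookkeeping now lives on the run handle, but callers still branch on `EvictionRequestResult`. A sketch of how a caller can decide whether a failed activation is worth reporting, with the enum shape inferred only from the variants referenced in this diff (the attempt counter is assumed to be a `u32`):

    /// Shape inferred from the variants used in this diff; the real definition
    /// lives in sdk-core and may differ.
    enum EvictionRequestResult {
        EvictionRequested(Option<u32>),
        EvictionAlreadyRequested(Option<u32>),
        NotFound,
    }

    /// Report a failed activation to the server only on the first WFT attempt, so
    /// retries of the same task don't produce duplicate failure reports.
    fn should_report_failure(res: &EvictionRequestResult) -> bool {
        match res {
            EvictionRequestResult::EvictionRequested(Some(attempt))
            | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => *attempt <= 1,
            _ => false,
        }
    }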
@@ -710,36 +432,10 @@ impl WFStream {
         }
     }
 
-    /// Evict a workflow from the cache by its run id. Any existing pending activations will be
-    /// destroyed, and any outstanding activations invalidated.
-    fn evict_run(&mut self, run_id: &str) {
-        debug!(run_id=%run_id, "Evicting run");
-
-        let mut did_take_buff = false;
-        // Now it can safely be deleted, it'll get recreated once the un-buffered poll is handled if
-        // there was one.
-        if let Some(mut rh) = self.runs.remove(run_id) {
-            rh.handle.abort();
-
-            if let Some(buff) = rh.buffered_resp.take() {
-                self.instantiate_or_update(buff);
-                did_take_buff = true;
-            }
-        }
-
-        if !did_take_buff {
-            // If there wasn't a buffered poll, there might be one for a different run which needs
-            // a free cache slot, and now there is.
-            if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
-                self.instantiate_or_update(buff);
-            }
-        }
-    }
-
     fn complete_wft(
         &mut self,
         run_id: &str,
-        reported_wft_to_server: bool,
+        wft_report_status: WFTReportStatus,
     ) -> Option<OutstandingTask> {
         // If the WFT completion wasn't sent to the server, but we did see the final event, we still
         // want to clear the workflow task. This can really only happen in replay testing, where we
@@ -749,9 +445,9 @@ impl WFStream {
         let saw_final = self
             .runs
             .get(run_id)
-            .map(|r| r.have_seen_terminal_event)
+            .map(|r| r.have_seen_terminal_event())
             .unwrap_or_default();
-        if !saw_final && !reported_wft_to_server {
+        if !saw_final && matches!(wft_report_status, WFTReportStatus::NotReported) {
             return None;
         }
 
@@ -759,60 +455,26 @@ impl WFStream {
             // Can't mark the WFT complete if there are pending queries, as doing so would destroy
             // them.
             if rh
-                .wft
-                .as_ref()
+                .wft()
                 .map(|wft| !wft.pending_queries.is_empty())
                 .unwrap_or_default()
             {
                 return None;
            }
 
-            debug!("Marking WFT completed");
-            let retme = rh.wft.take();
-            if let Some(ot) = &retme {
-                if let Some(m) = self.run_metrics(run_id) {
-                    m.wf_task_latency(ot.start_time.elapsed());
-                }
-            }
-            retme
+            rh.mark_wft_complete(wft_report_status)
         } else {
             None
         }
     }
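`complete_wft` now takes a `WFTReportStatus` instead of the old `reported_wft_to_server` bool, and it only declines to clear the task when nothing was reported and the run's terminal event was never seen. A sketch of that gate; the enum shape is assumed from the single variant matched in this diff and the real enum may carry more variants:

    /// Only `NotReported` is matched in this diff; the real enum may have more variants.
    enum WFTReportStatus {
        Reported,
        NotReported,
    }

    /// Mirrors the early-return in `complete_wft`: keep the task outstanding only when
    /// nothing was reported to the server and the run's terminal event was never seen.
    fn should_clear_wft(saw_final_event: bool, status: WFTReportStatus) -> bool {
        saw_final_event || !matches!(status, WFTReportStatus::NotReported)
    }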
 
-    /// Stores some work if there is any outstanding WFT or activation for the run. If there was
-    /// not, returns the work back out inside the option.
-    fn buffer_resp_if_outstanding_work(&mut self, work: PermittedWFT) -> Option<PermittedWFT> {
-        let run_id = &work.wft.workflow_execution.run_id;
-        if let Some(mut run) = self.runs.get_mut(run_id) {
-            let about_to_issue_evict = run.trying_to_evict.is_some() && !run.last_action_acked;
-            let has_wft = run.wft.is_some();
-            let has_activation = run.activation.is_some();
-            if has_wft
-                || has_activation
-                || about_to_issue_evict
-                || run.more_pending_work
-                || !run.last_action_acked
-            {
-                debug!(run_id = %run_id, run = ?run,
-                       "Got new WFT for a run with outstanding work, buffering it");
-                run.buffered_resp = Some(work);
-                None
-            } else {
-                Some(work)
-            }
-        } else {
-            Some(work)
-        }
-    }
-
     fn buffer_resp_on_full_cache(&mut self, work: PermittedWFT) {
-        debug!(run_id=%work.wft.workflow_execution.run_id, "Buffering WFT because cache is full");
+        debug!(run_id=%work.work.execution.run_id, "Buffering WFT because cache is full");
         // If there's already a buffered poll for the run, replace it.
         if let Some(rh) = self
             .buffered_polls_need_cache_slot
             .iter_mut()
-            .find(|w| w.wft.workflow_execution.run_id == work.wft.workflow_execution.run_id)
+            .find(|w| w.work.execution.run_id == work.work.execution.run_id)
         {
             *rh = work;
         } else {
@@ -823,7 +485,7 @@ impl WFStream {
 
     /// Makes sure we have enough pending evictions to fulfill the needs of buffered WFTs who are
     /// waiting on a cache slot
-    fn reconcile_buffered(&mut self) {
+    fn reconcile_buffered(&mut self) -> Vec<ActivationOrAuto> {
         // We must ensure that there are at least as many pending evictions as there are tasks
         // that we might need to un-buffer (skipping runs which already have buffered tasks for
         // themselves)
@@ -832,109 +494,222 @@ impl WFStream {
         let num_existing_evictions = self
             .runs
             .runs_lru_order()
-            .filter(|(_, h)| h.trying_to_evict.is_some())
+            .filter(|(_, h)| h.is_trying_to_evict())
             .count();
         let mut num_evicts_needed = num_in_buff.saturating_sub(num_existing_evictions);
         for (rid, handle) in self.runs.runs_lru_order() {
             if num_evicts_needed == 0 {
                 break;
             }
-            if handle.buffered_resp.is_none() {
+            if !handle.has_buffered_wft() {
                 num_evicts_needed -= 1;
                 evict_these.push(rid.to_string());
             }
         }
+        let mut acts = vec![];
         for run_id in evict_these {
-            self.request_eviction(RequestEvictMsg {
-                run_id,
-                message: "Workflow cache full".to_string(),
-                reason: EvictionReason::CacheFull,
-            });
+            acts.extend(
+                self.request_eviction(RequestEvictMsg {
+                    run_id,
+                    message: "Workflow cache full".to_string(),
+                    reason: EvictionReason::CacheFull,
+                })
+                .into_run_update_resp(),
+            );
         }
-    }
-
-    fn reply_to_complete(
-        &self,
-        run_id: &str,
-        outcome: ActivationCompleteOutcome,
-        chan: oneshot::Sender<ActivationCompleteResult>,
-    ) {
-        let most_recently_processed_event = self
-            .runs
-            .peek(run_id)
-            .map(|rh| rh.most_recently_processed_event_number)
-            .unwrap_or_default();
-        chan.send(ActivationCompleteResult {
-            most_recently_processed_event,
-            outcome,
-        })
-        .expect("Rcv half of activation reply not dropped");
+        acts
     }
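`reconcile_buffered` now returns the activations produced by the eviction requests it issues, but the sizing rule is unchanged: request only as many additional evictions as there are buffered WFTs still lacking a cache slot, after counting evictions already in flight. A small worked sketch of that arithmetic:

    /// Number of additional evictions to request so every buffered WFT waiting on a
    /// cache slot can eventually be instantiated. Saturating, so having more evictions
    /// in flight than buffered tasks never underflows.
    fn evictions_needed(buffered_needing_slot: usize, evictions_in_flight: usize) -> usize {
        buffered_needing_slot.saturating_sub(evictions_in_flight)
    }

    fn main() {
        // Three buffered WFTs waiting on slots, one eviction already pending => two more.
        assert_eq!(evictions_needed(3, 1), 2);
        // More evictions in flight than buffered tasks => nothing extra to do.
        assert_eq!(evictions_needed(1, 4), 0);
    }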
 
     fn shutdown_done(&self) -> bool {
-        let all_runs_ready = self
-            .runs
-            .handles()
-            .all(|r| !r.has_any_pending_work(true, false));
-        if self.shutdown_token.is_cancelled() && all_runs_ready {
-            info!("Workflow shutdown is done");
-            true
-        } else {
-            false
+        if self.shutdown_token.is_cancelled() {
+            if Arc::strong_count(&self.history_fetch_refcounter) > 1 {
+                // Don't exit if there are outstanding fetch requests
+                return false;
+            }
+            let all_runs_ready = self
+                .runs
+                .handles()
+                .all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
+            if all_runs_ready {
+                return true;
+            }
         }
-    }
-
-    fn get_task(&mut self, run_id: &str) -> Option<&OutstandingTask> {
-        self.runs.get(run_id).and_then(|rh| rh.wft.as_ref())
-    }
-
-    fn get_activation(&mut self, run_id: &str) -> Option<&OutstandingActivation> {
-        self.runs.get(run_id).and_then(|rh| rh.activation.as_ref())
-    }
-
-    fn run_metrics(&mut self, run_id: &str) -> Option<&MetricsContext> {
-        self.runs.get(run_id).map(|r| &r.metrics)
-    }
-
-    fn activation_has_only_eviction(&mut self, run_id: &str) -> bool {
-        self.runs
-            .get(run_id)
-            .and_then(|rh| rh.activation)
-            .map(OutstandingActivation::has_only_eviction)
-            .unwrap_or_default()
-    }
-
-    fn activation_has_eviction(&mut self, run_id: &str) -> bool {
-        self.runs
-            .get(run_id)
-            .and_then(|rh| rh.activation)
-            .map(OutstandingActivation::has_eviction)
-            .unwrap_or_default()
+        false
     }
 
     fn outstanding_wfts(&self) -> usize {
-        self.runs.handles().filter(|r| r.wft.is_some()).count()
+        self.runs.handles().filter(|r| r.wft().is_some()).count()
    }
 
     // Useful when debugging
     #[allow(dead_code)]
     fn info_dump(&self, run_id: &str) {
         if let Some(r) = self.runs.peek(run_id) {
-            info!(run_id, wft=?r.wft, activation=?r.activation, buffered=r.buffered_resp.is_some(),
-                  trying_to_evict=r.trying_to_evict.is_some(), more_work=r.more_pending_work,
-                  last_action_acked=r.last_action_acked);
+            info!(run_id, wft=?r.wft(), activation=?r.activation(),
+                  buffered_wft=r.has_buffered_wft(),
+                  trying_to_evict=r.is_trying_to_evict(), more_work=r.more_pending_work());
         } else {
             info!(run_id, "Run not found");
         }
     }
 }
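Shutdown is now additionally gated on outstanding history fetches: the stream keeps one reference to a shared handle and every in-flight fetch holds another, so a strong count above one means work is still pending. A minimal sketch of that refcount idea; the names here are illustrative, not the SDK's fields:

    use std::sync::Arc;

    // Hypothetical guard a fetch would hold while it is in flight.
    struct FetchGuard(Arc<()>);

    fn shutdown_allowed(cancelled: bool, fetch_refcounter: &Arc<()>, all_runs_ready: bool) -> bool {
        // One strong reference is always held by the stream itself; anything above that
        // is an in-flight fetch that must finish before shutdown can complete.
        cancelled && Arc::strong_count(fetch_refcounter) == 1 && all_runs_ready
    }

    fn main() {
        let counter = Arc::new(());
        assert!(shutdown_allowed(true, &counter, true));
        let _in_flight = FetchGuard(counter.clone());
        assert!(!shutdown_allowed(true, &counter, true));
    }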
 
-/// Drains pending queries from the workflow task and appends them to the activation's jobs
-fn put_queries_in_act(act: &mut WorkflowActivation, wft: &mut OutstandingTask) {
-    debug!(queries=?wft.pending_queries, "Dispatching queries");
-    let query_jobs = wft
-        .pending_queries
-        .drain(..)
-        .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
-    act.jobs.extend(query_jobs);
+/// All possible inputs to the [WFStream]
+#[derive(derive_more::From, Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+enum WFStreamInput {
+    NewWft(PermittedWFT),
+    Local(LocalInput),
+    /// The stream given to us which represents the poller (or a mock) terminated.
+    PollerDead,
+    /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
+    /// error while polling
+    PollerError(
+        #[cfg_attr(
+            feature = "save_wf_inputs",
+            serde(with = "tonic_status_serde::SerdeStatus")
+        )]
+        tonic::Status,
+    ),
+    FailedFetch {
+        run_id: String,
+        #[cfg_attr(
+            feature = "save_wf_inputs",
+            serde(with = "tonic_status_serde::SerdeStatus")
+        )]
+        err: tonic::Status,
+    },
+}
+
+/// A non-poller-received input to the [WFStream]
+#[derive(derive_more::DebugCustom)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+#[debug(fmt = "LocalInput {{ {input:?} }}")]
+pub(super) struct LocalInput {
+    pub input: LocalInputs,
+    #[cfg_attr(feature = "save_wf_inputs", serde(skip, default = "Span::current"))]
+    pub span: Span,
+}
+impl From<HeartbeatTimeoutMsg> for LocalInput {
+    fn from(hb: HeartbeatTimeoutMsg) -> Self {
+        Self {
+            input: LocalInputs::HeartbeatTimeout(hb.run_id),
+            span: hb.span,
+        }
+    }
+}
+/// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
+/// new polls.
+#[derive(Debug, derive_more::From)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+pub(super) enum LocalInputs {
+    Completion(WFActCompleteMsg),
+    FetchedPageCompletion {
+        paginator: HistoryPaginator,
+        update: HistoryUpdate,
+    },
+    LocalResolution(LocalResolutionMsg),
+    PostActivation(PostActivationMsg),
+    RequestEviction(RequestEvictMsg),
+    HeartbeatTimeout(String),
+    #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
+    GetStateInfo(GetStateInfoMsg),
+}
+impl LocalInputs {
+    fn run_id(&self) -> Option<&str> {
+        Some(match self {
+            LocalInputs::Completion(c) => c.completion.run_id(),
+            LocalInputs::FetchedPageCompletion { paginator, .. } => &paginator.run_id,
+            LocalInputs::LocalResolution(lr) => &lr.run_id,
+            LocalInputs::PostActivation(pa) => &pa.run_id,
+            LocalInputs::RequestEviction(re) => &re.run_id,
+            LocalInputs::HeartbeatTimeout(hb) => hb,
+            LocalInputs::GetStateInfo(_) => return None,
+        })
+    }
+}
+#[derive(Debug)]
+#[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
+enum ExternalPollerInputs {
+    NewWft(PermittedWFT),
+    PollerDead,
+    PollerError(tonic::Status),
+    FetchedUpdate(PermittedWFT),
+    NextPage {
+        paginator: HistoryPaginator,
+        update: HistoryUpdate,
+        span: Span,
+    },
+    FailedFetch {
+        run_id: String,
+        err: tonic::Status,
+    },
+}
+impl From<ExternalPollerInputs> for WFStreamInput {
+    fn from(l: ExternalPollerInputs) -> Self {
+        match l {
+            ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
+            ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
+            ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
+            ExternalPollerInputs::FetchedUpdate(wft) => WFStreamInput::NewWft(wft),
+            ExternalPollerInputs::FailedFetch { run_id, err } => {
+                WFStreamInput::FailedFetch { run_id, err }
+            }
+            ExternalPollerInputs::NextPage {
+                paginator,
+                update,
+                span,
+            } => WFStreamInput::Local(LocalInput {
+                input: LocalInputs::FetchedPageCompletion { paginator, update },
+                span,
+            }),
+        }
+    }
+}
+impl From<Result<WFTExtractorOutput, tonic::Status>> for ExternalPollerInputs {
+    fn from(v: Result<WFTExtractorOutput, tonic::Status>) -> Self {
+        match v {
+            Ok(WFTExtractorOutput::NewWFT(pwft)) => ExternalPollerInputs::NewWft(pwft),
+            Ok(WFTExtractorOutput::FetchResult(updated_wft, _)) => {
+                ExternalPollerInputs::FetchedUpdate(updated_wft)
+            }
+            Ok(WFTExtractorOutput::NextPage {
+                paginator,
+                update,
+                span,
+                rc: _rc,
+            }) => ExternalPollerInputs::NextPage {
+                paginator,
+                update,
+                span,
+            },
+            Ok(WFTExtractorOutput::FailedFetch { run_id, err }) => {
+                ExternalPollerInputs::FailedFetch { run_id, err }
+            }
+            Ok(WFTExtractorOutput::PollerDead) => ExternalPollerInputs::PollerDead,
+            Err(e) => ExternalPollerInputs::PollerError(e),
+        }
+    }
+}
+#[derive(Debug)]
+enum NewOrFetchedComplete {
+    New(WFActCompleteMsg),
+    Fetched(HistoryUpdate, HistoryPaginator),
+}
+impl NewOrFetchedComplete {
+    fn run_id(&self) -> &str {
+        match self {
+            NewOrFetchedComplete::New(c) => c.completion.run_id(),
+            NewOrFetchedComplete::Fetched(_, p) => &p.run_id,
+        }
+    }
 }
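`LocalInputs` are documented above as always taking priority over new polls. One way to get that behavior when merging two channels is a biased select, so the local channel is checked before the poller channel whenever both are ready; the sketch below is only an illustration of that priority rule under assumed channel types, not the SDK's actual stream wiring:

    use tokio::sync::mpsc;

    #[derive(Debug)]
    enum Merged {
        Local(String),
        Poll(String),
    }

    /// Drain both channels, always preferring a ready local input over a ready poll.
    async fn next_input(
        local_rx: &mut mpsc::Receiver<String>,
        poll_rx: &mut mpsc::Receiver<String>,
    ) -> Option<Merged> {
        tokio::select! {
            // `biased` makes select! check branches top-to-bottom instead of randomly,
            // so local inputs win whenever both channels have something ready.
            biased;
            Some(l) = local_rx.recv() => Some(Merged::Local(l)),
            Some(p) = poll_rx.recv() => Some(Merged::Poll(p)),
            else => None,
        }
    }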