temporalio 0.0.2 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (320)
  1. checksums.yaml +4 -4
  2. data/README.md +25 -23
  3. data/bridge/Cargo.lock +185 -76
  4. data/bridge/Cargo.toml +6 -4
  5. data/bridge/sdk-core/README.md +19 -6
  6. data/bridge/sdk-core/client/src/lib.rs +215 -39
  7. data/bridge/sdk-core/client/src/metrics.rs +17 -8
  8. data/bridge/sdk-core/client/src/raw.rs +4 -4
  9. data/bridge/sdk-core/client/src/retry.rs +32 -20
  10. data/bridge/sdk-core/core/Cargo.toml +22 -9
  11. data/bridge/sdk-core/core/src/abstractions.rs +203 -14
  12. data/bridge/sdk-core/core/src/core_tests/activity_tasks.rs +76 -41
  13. data/bridge/sdk-core/core/src/core_tests/determinism.rs +165 -2
  14. data/bridge/sdk-core/core/src/core_tests/local_activities.rs +204 -83
  15. data/bridge/sdk-core/core/src/core_tests/queries.rs +3 -4
  16. data/bridge/sdk-core/core/src/core_tests/workers.rs +1 -3
  17. data/bridge/sdk-core/core/src/core_tests/workflow_tasks.rs +397 -54
  18. data/bridge/sdk-core/core/src/ephemeral_server/mod.rs +106 -12
  19. data/bridge/sdk-core/core/src/internal_flags.rs +136 -0
  20. data/bridge/sdk-core/core/src/lib.rs +16 -9
  21. data/bridge/sdk-core/core/src/telemetry/log_export.rs +1 -1
  22. data/bridge/sdk-core/core/src/telemetry/metrics.rs +69 -35
  23. data/bridge/sdk-core/core/src/telemetry/mod.rs +29 -13
  24. data/bridge/sdk-core/core/src/telemetry/prometheus_server.rs +17 -12
  25. data/bridge/sdk-core/core/src/test_help/mod.rs +62 -12
  26. data/bridge/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +112 -156
  27. data/bridge/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  28. data/bridge/sdk-core/core/src/worker/activities/local_activities.rs +352 -122
  29. data/bridge/sdk-core/core/src/worker/activities.rs +233 -157
  30. data/bridge/sdk-core/core/src/worker/client/mocks.rs +22 -2
  31. data/bridge/sdk-core/core/src/worker/client.rs +18 -2
  32. data/bridge/sdk-core/core/src/worker/mod.rs +165 -58
  33. data/bridge/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  34. data/bridge/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  35. data/bridge/sdk-core/core/src/worker/workflow/history_update.rs +856 -277
  36. data/bridge/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +100 -43
  37. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +7 -7
  38. data/bridge/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +5 -4
  39. data/bridge/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +87 -27
  40. data/bridge/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +5 -4
  41. data/bridge/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +5 -4
  42. data/bridge/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +5 -4
  43. data/bridge/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +137 -62
  44. data/bridge/sdk-core/core/src/worker/workflow/machines/mod.rs +25 -17
  45. data/bridge/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +7 -6
  46. data/bridge/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +103 -152
  47. data/bridge/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +7 -7
  48. data/bridge/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +9 -9
  49. data/bridge/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  50. data/bridge/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +14 -7
  51. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +5 -16
  52. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +201 -121
  53. data/bridge/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +11 -14
  54. data/bridge/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +30 -15
  55. data/bridge/sdk-core/core/src/worker/workflow/managed_run.rs +1026 -376
  56. data/bridge/sdk-core/core/src/worker/workflow/mod.rs +460 -384
  57. data/bridge/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
  58. data/bridge/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  59. data/bridge/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  60. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
  61. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  62. data/bridge/sdk-core/core/src/worker/workflow/workflow_stream.rs +448 -718
  63. data/bridge/sdk-core/core-api/Cargo.toml +2 -1
  64. data/bridge/sdk-core/core-api/src/errors.rs +1 -34
  65. data/bridge/sdk-core/core-api/src/lib.rs +6 -2
  66. data/bridge/sdk-core/core-api/src/telemetry.rs +0 -6
  67. data/bridge/sdk-core/core-api/src/worker.rs +14 -1
  68. data/bridge/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +18 -15
  69. data/bridge/sdk-core/fsm/rustfsm_trait/src/lib.rs +8 -3
  70. data/bridge/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  71. data/bridge/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
  72. data/bridge/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
  73. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
  74. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
  75. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
  76. data/bridge/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
  77. data/bridge/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
  78. data/bridge/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
  79. data/bridge/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
  80. data/bridge/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
  81. data/bridge/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  82. data/bridge/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
  83. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
  84. data/bridge/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  85. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +1 -0
  86. data/bridge/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +1 -0
  87. data/bridge/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +1 -0
  88. data/bridge/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  89. data/bridge/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  90. data/bridge/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +1 -0
  91. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +7 -0
  92. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +1 -0
  93. data/bridge/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +6 -0
  94. data/bridge/sdk-core/sdk/Cargo.toml +3 -2
  95. data/bridge/sdk-core/sdk/src/lib.rs +87 -20
  96. data/bridge/sdk-core/sdk/src/workflow_future.rs +9 -8
  97. data/bridge/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  98. data/bridge/sdk-core/sdk-core-protos/build.rs +36 -1
  99. data/bridge/sdk-core/sdk-core-protos/src/history_builder.rs +100 -87
  100. data/bridge/sdk-core/sdk-core-protos/src/history_info.rs +5 -1
  101. data/bridge/sdk-core/sdk-core-protos/src/lib.rs +175 -57
  102. data/bridge/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  103. data/bridge/sdk-core/test-utils/Cargo.toml +3 -1
  104. data/bridge/sdk-core/test-utils/src/canned_histories.rs +106 -296
  105. data/bridge/sdk-core/test-utils/src/histfetch.rs +1 -1
  106. data/bridge/sdk-core/test-utils/src/lib.rs +82 -23
  107. data/bridge/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  108. data/bridge/sdk-core/test-utils/src/workflows.rs +29 -0
  109. data/bridge/sdk-core/tests/fuzzy_workflow.rs +130 -0
  110. data/bridge/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
  111. data/bridge/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  112. data/bridge/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
  113. data/bridge/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  114. data/bridge/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
  115. data/bridge/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
  116. data/bridge/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
  117. data/bridge/sdk-core/tests/integ_tests/workflow_tests/activities.rs +93 -69
  118. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  119. data/bridge/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
  120. data/bridge/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +1 -0
  121. data/bridge/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
  122. data/bridge/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
  123. data/bridge/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +72 -191
  124. data/bridge/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +1 -0
  125. data/bridge/sdk-core/tests/integ_tests/workflow_tests/patches.rs +7 -28
  126. data/bridge/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  127. data/bridge/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  128. data/bridge/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
  129. data/bridge/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
  130. data/bridge/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
  131. data/bridge/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -4
  132. data/bridge/sdk-core/tests/integ_tests/workflow_tests.rs +10 -11
  133. data/bridge/sdk-core/tests/main.rs +3 -13
  134. data/bridge/sdk-core/tests/runner.rs +75 -36
  135. data/bridge/sdk-core/tests/wf_input_replay.rs +32 -0
  136. data/bridge/src/connection.rs +41 -25
  137. data/bridge/src/lib.rs +269 -14
  138. data/bridge/src/runtime.rs +1 -1
  139. data/bridge/src/test_server.rs +153 -0
  140. data/bridge/src/worker.rs +89 -16
  141. data/lib/gen/temporal/api/command/v1/message_pb.rb +4 -18
  142. data/lib/gen/temporal/api/common/v1/message_pb.rb +4 -0
  143. data/lib/gen/temporal/api/enums/v1/command_type_pb.rb +1 -3
  144. data/lib/gen/temporal/api/enums/v1/event_type_pb.rb +3 -3
  145. data/lib/gen/temporal/api/enums/v1/failed_cause_pb.rb +2 -0
  146. data/lib/gen/temporal/api/enums/v1/update_pb.rb +6 -4
  147. data/lib/gen/temporal/api/history/v1/message_pb.rb +27 -19
  148. data/lib/gen/temporal/api/namespace/v1/message_pb.rb +1 -0
  149. data/lib/gen/temporal/api/operatorservice/v1/request_response_pb.rb +3 -0
  150. data/lib/gen/temporal/api/protocol/v1/message_pb.rb +30 -0
  151. data/lib/gen/temporal/api/sdk/v1/task_complete_metadata_pb.rb +23 -0
  152. data/lib/gen/temporal/api/testservice/v1/request_response_pb.rb +49 -0
  153. data/lib/gen/temporal/api/testservice/v1/service_pb.rb +21 -0
  154. data/lib/gen/temporal/api/update/v1/message_pb.rb +72 -0
  155. data/lib/gen/temporal/api/workflowservice/v1/request_response_pb.rb +26 -16
  156. data/lib/gen/temporal/sdk/core/activity_result/activity_result_pb.rb +13 -9
  157. data/lib/gen/temporal/sdk/core/activity_task/activity_task_pb.rb +10 -6
  158. data/lib/gen/temporal/sdk/core/child_workflow/child_workflow_pb.rb +13 -9
  159. data/lib/gen/temporal/sdk/core/common/common_pb.rb +7 -3
  160. data/lib/gen/temporal/sdk/core/core_interface_pb.rb +9 -3
  161. data/lib/gen/temporal/sdk/core/external_data/external_data_pb.rb +7 -3
  162. data/lib/gen/temporal/sdk/core/workflow_activation/workflow_activation_pb.rb +27 -21
  163. data/lib/gen/temporal/sdk/core/workflow_commands/workflow_commands_pb.rb +28 -24
  164. data/lib/gen/temporal/sdk/core/workflow_completion/workflow_completion_pb.rb +12 -5
  165. data/lib/temporalio/activity/context.rb +13 -8
  166. data/lib/temporalio/activity/info.rb +1 -1
  167. data/lib/temporalio/bridge/connect_options.rb +15 -0
  168. data/lib/temporalio/bridge/retry_config.rb +24 -0
  169. data/lib/temporalio/bridge/tls_options.rb +19 -0
  170. data/lib/temporalio/bridge.rb +1 -1
  171. data/lib/temporalio/client/implementation.rb +8 -8
  172. data/lib/temporalio/connection/retry_config.rb +44 -0
  173. data/lib/temporalio/connection/service.rb +20 -0
  174. data/lib/temporalio/connection/test_service.rb +92 -0
  175. data/lib/temporalio/connection/tls_options.rb +51 -0
  176. data/lib/temporalio/connection/workflow_service.rb +731 -0
  177. data/lib/temporalio/connection.rb +55 -720
  178. data/lib/temporalio/interceptor/activity_inbound.rb +22 -0
  179. data/lib/temporalio/interceptor/activity_outbound.rb +24 -0
  180. data/lib/temporalio/interceptor/chain.rb +5 -5
  181. data/lib/temporalio/interceptor/client.rb +8 -4
  182. data/lib/temporalio/interceptor.rb +22 -0
  183. data/lib/temporalio/retry_policy.rb +13 -3
  184. data/lib/temporalio/testing/time_skipping_handle.rb +32 -0
  185. data/lib/temporalio/testing/time_skipping_interceptor.rb +23 -0
  186. data/lib/temporalio/testing/workflow_environment.rb +112 -0
  187. data/lib/temporalio/testing.rb +175 -0
  188. data/lib/temporalio/version.rb +1 -1
  189. data/lib/temporalio/worker/activity_runner.rb +26 -4
  190. data/lib/temporalio/worker/activity_worker.rb +44 -18
  191. data/lib/temporalio/worker/sync_worker.rb +47 -11
  192. data/lib/temporalio/worker.rb +27 -21
  193. data/lib/temporalio/workflow/async.rb +46 -0
  194. data/lib/temporalio/workflow/future.rb +138 -0
  195. data/lib/temporalio/workflow/info.rb +76 -0
  196. data/lib/thermite_patch.rb +10 -0
  197. data/sig/async.rbs +17 -0
  198. data/sig/protobuf.rbs +16 -0
  199. data/sig/protos/dependencies/gogoproto/gogo.rbs +914 -0
  200. data/sig/protos/google/protobuf/any.rbs +157 -0
  201. data/sig/protos/google/protobuf/descriptor.rbs +2825 -0
  202. data/sig/protos/google/protobuf/duration.rbs +114 -0
  203. data/sig/protos/google/protobuf/empty.rbs +36 -0
  204. data/sig/protos/google/protobuf/timestamp.rbs +145 -0
  205. data/sig/protos/google/protobuf/wrappers.rbs +358 -0
  206. data/sig/protos/temporal/api/batch/v1/message.rbs +300 -0
  207. data/sig/protos/temporal/api/command/v1/message.rbs +1399 -0
  208. data/sig/protos/temporal/api/common/v1/message.rbs +528 -0
  209. data/sig/protos/temporal/api/enums/v1/batch_operation.rbs +79 -0
  210. data/sig/protos/temporal/api/enums/v1/command_type.rbs +68 -0
  211. data/sig/protos/temporal/api/enums/v1/common.rbs +118 -0
  212. data/sig/protos/temporal/api/enums/v1/event_type.rbs +264 -0
  213. data/sig/protos/temporal/api/enums/v1/failed_cause.rbs +277 -0
  214. data/sig/protos/temporal/api/enums/v1/namespace.rbs +108 -0
  215. data/sig/protos/temporal/api/enums/v1/query.rbs +81 -0
  216. data/sig/protos/temporal/api/enums/v1/reset.rbs +44 -0
  217. data/sig/protos/temporal/api/enums/v1/schedule.rbs +72 -0
  218. data/sig/protos/temporal/api/enums/v1/task_queue.rbs +92 -0
  219. data/sig/protos/temporal/api/enums/v1/update.rbs +64 -0
  220. data/sig/protos/temporal/api/enums/v1/workflow.rbs +371 -0
  221. data/sig/protos/temporal/api/errordetails/v1/message.rbs +551 -0
  222. data/sig/protos/temporal/api/failure/v1/message.rbs +581 -0
  223. data/sig/protos/temporal/api/filter/v1/message.rbs +171 -0
  224. data/sig/protos/temporal/api/history/v1/message.rbs +4609 -0
  225. data/sig/protos/temporal/api/namespace/v1/message.rbs +410 -0
  226. data/sig/protos/temporal/api/operatorservice/v1/request_response.rbs +643 -0
  227. data/sig/protos/temporal/api/operatorservice/v1/service.rbs +17 -0
  228. data/sig/protos/temporal/api/protocol/v1/message.rbs +84 -0
  229. data/sig/protos/temporal/api/query/v1/message.rbs +182 -0
  230. data/sig/protos/temporal/api/replication/v1/message.rbs +148 -0
  231. data/sig/protos/temporal/api/schedule/v1/message.rbs +1488 -0
  232. data/sig/protos/temporal/api/sdk/v1/task_complete_metadata.rbs +110 -0
  233. data/sig/protos/temporal/api/taskqueue/v1/message.rbs +486 -0
  234. data/sig/protos/temporal/api/testservice/v1/request_response.rbs +249 -0
  235. data/sig/protos/temporal/api/testservice/v1/service.rbs +15 -0
  236. data/sig/protos/temporal/api/update/v1/message.rbs +489 -0
  237. data/sig/protos/temporal/api/version/v1/message.rbs +184 -0
  238. data/sig/protos/temporal/api/workflow/v1/message.rbs +824 -0
  239. data/sig/protos/temporal/api/workflowservice/v1/request_response.rbs +7250 -0
  240. data/sig/protos/temporal/api/workflowservice/v1/service.rbs +22 -0
  241. data/sig/protos/temporal/sdk/core/activity_result/activity_result.rbs +380 -0
  242. data/sig/protos/temporal/sdk/core/activity_task/activity_task.rbs +386 -0
  243. data/sig/protos/temporal/sdk/core/child_workflow/child_workflow.rbs +323 -0
  244. data/sig/protos/temporal/sdk/core/common/common.rbs +62 -0
  245. data/sig/protos/temporal/sdk/core/core_interface.rbs +101 -0
  246. data/sig/protos/temporal/sdk/core/external_data/external_data.rbs +119 -0
  247. data/sig/protos/temporal/sdk/core/workflow_activation/workflow_activation.rbs +1473 -0
  248. data/sig/protos/temporal/sdk/core/workflow_commands/workflow_commands.rbs +1784 -0
  249. data/sig/protos/temporal/sdk/core/workflow_completion/workflow_completion.rbs +180 -0
  250. data/sig/ruby.rbs +12 -0
  251. data/sig/temporalio/activity/context.rbs +29 -0
  252. data/sig/temporalio/activity/info.rbs +43 -0
  253. data/sig/temporalio/activity.rbs +19 -0
  254. data/sig/temporalio/bridge/connect_options.rbs +19 -0
  255. data/sig/temporalio/bridge/error.rbs +8 -0
  256. data/sig/temporalio/bridge/retry_config.rbs +21 -0
  257. data/sig/temporalio/bridge/tls_options.rbs +17 -0
  258. data/sig/temporalio/bridge.rbs +71 -0
  259. data/sig/temporalio/client/implementation.rbs +38 -0
  260. data/sig/temporalio/client/workflow_handle.rbs +41 -0
  261. data/sig/temporalio/client.rbs +35 -0
  262. data/sig/temporalio/connection/retry_config.rbs +37 -0
  263. data/sig/temporalio/connection/service.rbs +14 -0
  264. data/sig/temporalio/connection/test_service.rbs +13 -0
  265. data/sig/temporalio/connection/tls_options.rbs +43 -0
  266. data/sig/temporalio/connection/workflow_service.rbs +48 -0
  267. data/sig/temporalio/connection.rbs +30 -0
  268. data/sig/temporalio/data_converter.rbs +35 -0
  269. data/sig/temporalio/error/failure.rbs +121 -0
  270. data/sig/temporalio/error/workflow_failure.rbs +9 -0
  271. data/sig/temporalio/errors.rbs +36 -0
  272. data/sig/temporalio/failure_converter/base.rbs +12 -0
  273. data/sig/temporalio/failure_converter/basic.rbs +86 -0
  274. data/sig/temporalio/failure_converter.rbs +5 -0
  275. data/sig/temporalio/interceptor/activity_inbound.rbs +21 -0
  276. data/sig/temporalio/interceptor/activity_outbound.rbs +10 -0
  277. data/sig/temporalio/interceptor/chain.rbs +24 -0
  278. data/sig/temporalio/interceptor/client.rbs +148 -0
  279. data/sig/temporalio/interceptor.rbs +6 -0
  280. data/sig/temporalio/payload_codec/base.rbs +12 -0
  281. data/sig/temporalio/payload_converter/base.rbs +12 -0
  282. data/sig/temporalio/payload_converter/bytes.rbs +9 -0
  283. data/sig/temporalio/payload_converter/composite.rbs +19 -0
  284. data/sig/temporalio/payload_converter/encoding_base.rbs +14 -0
  285. data/sig/temporalio/payload_converter/json.rbs +9 -0
  286. data/sig/temporalio/payload_converter/nil.rbs +9 -0
  287. data/sig/temporalio/payload_converter.rbs +5 -0
  288. data/sig/temporalio/retry_policy.rbs +25 -0
  289. data/sig/temporalio/retry_state.rbs +20 -0
  290. data/sig/temporalio/runtime.rbs +12 -0
  291. data/sig/temporalio/testing/time_skipping_handle.rbs +15 -0
  292. data/sig/temporalio/testing/time_skipping_interceptor.rbs +13 -0
  293. data/sig/temporalio/testing/workflow_environment.rbs +22 -0
  294. data/sig/temporalio/testing.rbs +35 -0
  295. data/sig/temporalio/timeout_type.rbs +15 -0
  296. data/sig/temporalio/version.rbs +3 -0
  297. data/sig/temporalio/worker/activity_runner.rbs +35 -0
  298. data/sig/temporalio/worker/activity_worker.rbs +44 -0
  299. data/sig/temporalio/worker/reactor.rbs +22 -0
  300. data/sig/temporalio/worker/runner.rbs +21 -0
  301. data/sig/temporalio/worker/sync_worker.rbs +23 -0
  302. data/sig/temporalio/worker/thread_pool_executor.rbs +23 -0
  303. data/sig/temporalio/worker.rbs +46 -0
  304. data/sig/temporalio/workflow/async.rbs +9 -0
  305. data/sig/temporalio/workflow/execution_info.rbs +55 -0
  306. data/sig/temporalio/workflow/execution_status.rbs +21 -0
  307. data/sig/temporalio/workflow/future.rbs +40 -0
  308. data/sig/temporalio/workflow/id_reuse_policy.rbs +15 -0
  309. data/sig/temporalio/workflow/info.rbs +55 -0
  310. data/sig/temporalio/workflow/query_reject_condition.rbs +14 -0
  311. data/sig/temporalio.rbs +2 -0
  312. data/sig/thermite_patch.rbs +15 -0
  313. data/temporalio.gemspec +6 -4
  314. metadata +183 -17
  315. data/bridge/sdk-core/Cargo.lock +0 -2606
  316. data/bridge/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
  317. data/lib/bridge.so +0 -0
  318. data/lib/gen/temporal/api/enums/v1/interaction_type_pb.rb +0 -25
  319. data/lib/gen/temporal/api/interaction/v1/message_pb.rb +0 -49
  320. data/lib/gen/temporal/sdk/core/bridge/bridge_pb.rb +0 -222
@@ -1,188 +1,217 @@
1
1
  #[cfg(test)]
2
2
  mod managed_wf_test;
3
3
 
4
+ #[cfg(test)]
5
+ pub(crate) use managed_wf_test::ManagedWFFunc;
6
+
4
7
  use crate::{
8
+ abstractions::dbg_panic,
9
+ protosext::WorkflowActivationExt,
5
10
  worker::{
6
11
  workflow::{
7
- machines::WorkflowMachines, ActivationAction, ActivationCompleteOutcome, HistoryUpdate,
8
- LocalResolution, NewIncomingWFT, OutgoingServerCommands, RequestEvictMsg, RunActions,
9
- RunActivationCompletion, RunUpdateResponse, ServerCommandsWithWorkflowInfo, WFCommand,
10
- WorkflowBridge,
12
+ history_update::HistoryPaginator, machines::WorkflowMachines, ActivationAction,
13
+ ActivationCompleteOutcome, ActivationCompleteResult, ActivationOrAuto,
14
+ EvictionRequestResult, FailedActivationWFTReport, HeartbeatTimeoutMsg, HistoryUpdate,
15
+ LocalActivityRequestSink, LocalResolution, NextPageReq, OutgoingServerCommands,
16
+ OutstandingActivation, OutstandingTask, PermittedWFT, RequestEvictMsg, RunBasics,
17
+ ServerCommandsWithWorkflowInfo, WFCommand, WFMachinesError, WFTReportStatus,
18
+ WorkflowBridge, WorkflowTaskInfo, WFT_HEARTBEAT_TIMEOUT_FRACTION,
11
19
  },
12
- LocalActRequest,
20
+ LocalActRequest, LEGACY_QUERY_ID,
13
21
  },
14
22
  MetricsContext,
15
23
  };
16
- use futures::{stream, StreamExt};
24
+ use futures_util::future::AbortHandle;
17
25
  use std::{
26
+ collections::HashSet,
18
27
  ops::Add,
28
+ rc::Rc,
19
29
  sync::mpsc::Sender,
20
30
  time::{Duration, Instant},
21
31
  };
22
- use temporal_sdk_core_api::errors::WFMachinesError;
23
- use temporal_sdk_core_protos::coresdk::{
24
- workflow_activation::{RemoveFromCache, WorkflowActivation},
25
- workflow_commands::QueryResult,
26
- };
27
- use tokio::{
28
- sync::{
29
- mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
30
- oneshot,
32
+ use temporal_sdk_core_protos::{
33
+ coresdk::{
34
+ workflow_activation::{
35
+ create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
36
+ workflow_activation_job, RemoveFromCache, WorkflowActivation,
37
+ },
38
+ workflow_commands::QueryResult,
39
+ workflow_completion,
31
40
  },
32
- task,
33
- task::JoinHandle,
41
+ temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure},
42
+ TaskToken,
34
43
  };
35
- use tokio_stream::wrappers::UnboundedReceiverStream;
44
+ use tokio::sync::oneshot;
36
45
  use tracing::Span;
37
- use tracing_futures::Instrument;
38
-
39
- use crate::worker::workflow::{
40
- ActivationCompleteResult, ActivationOrAuto, FailRunUpdate, FulfillableActivationComplete,
41
- GoodRunUpdate, LocalActivityRequestSink, RunAction, RunUpdateResponseKind,
42
- };
43
- use temporal_sdk_core_protos::TaskToken;
44
-
45
- use crate::abstractions::dbg_panic;
46
- #[cfg(test)]
47
- pub(crate) use managed_wf_test::ManagedWFFunc;
48
46
 
49
47
  type Result<T, E = WFMachinesError> = std::result::Result<T, E>;
50
- /// What percentage of a WFT timeout we are willing to wait before sending a WFT heartbeat when
51
- /// necessary.
52
- const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
48
+ pub(super) type RunUpdateAct = Option<ActivationOrAuto>;
53
49
 
50
+ /// Manages access to a specific workflow run. Everything inside is entirely synchronous and should
51
+ /// remain that way.
52
+ #[derive(derive_more::DebugCustom)]
53
+ #[debug(
54
+ fmt = "ManagedRun {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
55
+ trying_to_evict: {} }}",
56
+ wft,
57
+ activation,
58
+ buffered_resp,
59
+ "trying_to_evict.is_some()"
60
+ )]
54
61
  pub(super) struct ManagedRun {
55
62
  wfm: WorkflowManager,
56
- update_tx: UnboundedSender<RunUpdateResponse>,
57
- local_activity_request_sink: LocalActivityRequestSink,
63
+ /// Called when the machines need to produce local activity requests. This can't be lifted up
64
+ /// easily as return values, because sometimes local activity requests trigger immediate
65
+ /// resolutions (ex: too many attempts). Thus lifting it up creates a lot of unneeded complexity
66
+ /// pushing things out and then directly back in. The downside is this is the only "impure" part
67
+ /// of the in/out nature of workflow state management. If there's ever a sensible way to lift it
68
+ /// up, that'd be nice.
69
+ local_activity_request_sink: Rc<dyn LocalActivityRequestSink>,
70
+ /// Set if the run is currently waiting on the execution of some local activities.
58
71
  waiting_on_la: Option<WaitingOnLAs>,
59
- // Is set to true if the machines encounter an error and the only subsequent thing we should
60
- // do is be evicted.
72
+ /// Is set to true if the machines encounter an error and the only subsequent thing we should
73
+ /// do is be evicted.
61
74
  am_broken: bool,
62
- }
75
+ /// If set, the WFT this run is currently/will be processing.
76
+ wft: Option<OutstandingTask>,
77
+ /// An outstanding activation to lang
78
+ activation: Option<OutstandingActivation>,
79
+ /// If set, it indicates there is a buffered poll response from the server that applies to this
80
+ /// run. This can happen when lang takes too long to complete a task and the task times out, for
81
+ /// example. Upon next completion, the buffered response will be removed and can be made ready
82
+ /// to be returned from polling
83
+ buffered_resp: Option<PermittedWFT>,
84
+ /// Is set if an eviction has been requested for this run
85
+ trying_to_evict: Option<RequestEvictMsg>,
63
86
 
64
- /// If an activation completion needed to wait on LA completions (or heartbeat timeout) we use
65
- /// this struct to store the data we need to finish the completion once that has happened
66
- struct WaitingOnLAs {
67
- wft_timeout: Duration,
68
- /// If set, we are waiting for LAs to complete as part of a just-finished workflow activation.
69
- /// If unset, we already had a heartbeat timeout and got a new WFT without any new work while
70
- /// there are still incomplete LAs.
71
- completion_dat: Option<(
72
- CompletionDataForWFT,
73
- oneshot::Sender<ActivationCompleteResult>,
74
- )>,
75
- hb_chan: UnboundedSender<Span>,
76
- heartbeat_timeout_task: JoinHandle<()>,
77
- }
78
-
79
- #[derive(Debug)]
80
- struct CompletionDataForWFT {
81
- task_token: TaskToken,
82
- query_responses: Vec<QueryResult>,
83
- has_pending_query: bool,
84
- activation_was_only_eviction: bool,
87
+ /// We track if we have recorded useful debugging values onto a certain span yet, to overcome
88
+ /// duplicating field values. Remove this once https://github.com/tokio-rs/tracing/issues/2334
89
+ /// is fixed.
90
+ recorded_span_ids: HashSet<tracing::Id>,
91
+ metrics: MetricsContext,
92
+ /// We store the paginator used for our own run's history fetching
93
+ paginator: Option<HistoryPaginator>,
94
+ completion_waiting_on_page_fetch: Option<RunActivationCompletion>,
85
95
  }
86
-
87
96
  impl ManagedRun {
88
97
  pub(super) fn new(
89
- wfm: WorkflowManager,
90
- update_tx: UnboundedSender<RunUpdateResponse>,
91
- local_activity_request_sink: LocalActivityRequestSink,
98
+ basics: RunBasics,
99
+ local_activity_request_sink: Rc<dyn LocalActivityRequestSink>,
92
100
  ) -> Self {
101
+ let metrics = basics.metrics.clone();
102
+ let wfm = WorkflowManager::new(basics);
93
103
  Self {
94
104
  wfm,
95
- update_tx,
96
105
  local_activity_request_sink,
97
106
  waiting_on_la: None,
98
107
  am_broken: false,
108
+ wft: None,
109
+ activation: None,
110
+ buffered_resp: None,
111
+ trying_to_evict: None,
112
+ recorded_span_ids: Default::default(),
113
+ metrics,
114
+ paginator: None,
115
+ completion_waiting_on_page_fetch: None,
99
116
  }
100
117
  }
101
118
 
102
- pub(super) async fn run(self, run_actions_rx: UnboundedReceiver<RunAction>) {
103
- let (heartbeat_tx, heartbeat_rx) = unbounded_channel();
104
- stream::select(
105
- UnboundedReceiverStream::new(run_actions_rx),
106
- UnboundedReceiverStream::new(heartbeat_rx).map(|trace_span| RunAction {
107
- action: RunActions::HeartbeatTimeout,
108
- trace_span,
109
- }),
110
- )
111
- .fold((self, heartbeat_tx), |(mut me, heartbeat_tx), action| {
112
- let span = action.trace_span;
113
- let action = action.action;
114
- let mut no_wft = false;
115
- async move {
116
- let res = match action {
117
- RunActions::NewIncomingWFT(wft) => me
118
- .incoming_wft(wft)
119
- .await
120
- .map(RunActionOutcome::AfterNewWFT),
121
- RunActions::ActivationCompletion(completion) => me
122
- .completion(completion, &heartbeat_tx)
123
- .await
124
- .map(RunActionOutcome::AfterCompletion),
125
- RunActions::CheckMoreWork {
126
- want_to_evict,
127
- has_pending_queries,
128
- has_wft,
129
- } => {
130
- if !has_wft {
131
- no_wft = true;
132
- }
133
- me.check_more_work(want_to_evict, has_pending_queries, has_wft)
134
- .await
135
- .map(RunActionOutcome::AfterCheckWork)
136
- }
137
- RunActions::LocalResolution(r) => me
138
- .local_resolution(r)
139
- .await
140
- .map(RunActionOutcome::AfterLocalResolution),
141
- RunActions::HeartbeatTimeout => {
142
- let maybe_act = if me.heartbeat_timeout() {
143
- Some(ActivationOrAuto::Autocomplete {
144
- run_id: me.wfm.machines.run_id.clone(),
145
- })
146
- } else {
147
- None
148
- };
149
- Ok(RunActionOutcome::AfterHeartbeatTimeout(maybe_act))
150
- }
151
- };
152
- match res {
153
- Ok(outcome) => {
154
- me.send_update_response(outcome, no_wft);
155
- }
156
- Err(e) => {
157
- error!(error=?e, "Error in run machines");
158
- me.am_broken = true;
159
- me.update_tx
160
- .send(RunUpdateResponse {
161
- kind: RunUpdateResponseKind::Fail(FailRunUpdate {
162
- run_id: me.wfm.machines.run_id.clone(),
163
- err: e.source,
164
- completion_resp: e.complete_resp_chan,
165
- }),
166
- span: Span::current(),
167
- })
168
- .expect("Machine can send update");
169
- }
170
- }
171
- (me, heartbeat_tx)
172
- }
173
- .instrument(span)
174
- })
175
- .await;
119
+ /// Returns true if there are pending jobs that need to be sent to lang.
120
+ pub(super) fn more_pending_work(&self) -> bool {
121
+ // We don't want to consider there to be more local-only work to be done if there is
122
+ // no workflow task associated with the run right now. This can happen if, ex, we
123
+ // complete a local activity while waiting for server to send us the next WFT.
124
+ // Activating lang would be harmful at this stage, as there might be work returned
125
+ // in that next WFT which should be part of the next activation.
126
+ self.wft.is_some() && self.wfm.machines.has_pending_jobs()
127
+ }
128
+
129
+ pub(super) fn have_seen_terminal_event(&self) -> bool {
130
+ self.wfm.machines.have_seen_terminal_event
176
131
  }
177
132
 
178
- async fn incoming_wft(
133
+ /// Returns a ref to info about the currently tracked workflow task, if any.
134
+ pub(super) fn wft(&self) -> Option<&OutstandingTask> {
135
+ self.wft.as_ref()
136
+ }
137
+
138
+ /// Returns a ref to info about the currently tracked workflow activation, if any.
139
+ pub(super) fn activation(&self) -> Option<&OutstandingActivation> {
140
+ self.activation.as_ref()
141
+ }
142
+
143
+ /// Returns true if this run has already been told it will be evicted.
144
+ pub(super) fn is_trying_to_evict(&self) -> bool {
145
+ self.trying_to_evict.is_some()
146
+ }
147
+
148
+ /// Called whenever a new workflow task is obtained for this run
149
+ pub(super) fn incoming_wft(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
150
+ let res = self._incoming_wft(pwft);
151
+ self.update_to_acts(res.map(Into::into), true)
152
+ }
153
+
154
+ fn _incoming_wft(
179
155
  &mut self,
180
- wft: NewIncomingWFT,
156
+ pwft: PermittedWFT,
181
157
  ) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
182
- let activation = if let Some(h) = wft.history_update {
183
- self.wfm.feed_history_from_server(h).await?
158
+ if self.wft.is_some() {
159
+ dbg_panic!("Trying to send a new WFT for a run which already has one!");
160
+ }
161
+ let start_time = Instant::now();
162
+
163
+ let work = pwft.work;
164
+ let did_miss_cache = !work.is_incremental() || !work.update.is_real();
165
+ debug!(
166
+ run_id = %work.execution.run_id,
167
+ task_token = %&work.task_token,
168
+ update = ?work.update,
169
+ has_legacy_query = %work.legacy_query.is_some(),
170
+ attempt = %work.attempt,
171
+ "Applying new workflow task from server"
172
+ );
173
+ let wft_info = WorkflowTaskInfo {
174
+ attempt: work.attempt,
175
+ task_token: work.task_token,
176
+ wf_id: work.execution.workflow_id.clone(),
177
+ };
178
+
179
+ let legacy_query_from_poll = work
180
+ .legacy_query
181
+ .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
182
+
183
+ let mut pending_queries = work.query_requests;
184
+ if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
185
+ error!(
186
+ "Server issued both normal and legacy queries. This should not happen. Please \
187
+ file a bug report."
188
+ );
189
+ return Err(RunUpdateErr {
190
+ source: WFMachinesError::Fatal(
191
+ "Server issued both normal and legacy query".to_string(),
192
+ ),
193
+ complete_resp_chan: None,
194
+ });
195
+ }
196
+ if let Some(lq) = legacy_query_from_poll {
197
+ pending_queries.push(lq);
198
+ }
199
+
200
+ self.paginator = Some(pwft.paginator);
201
+ self.wft = Some(OutstandingTask {
202
+ info: wft_info,
203
+ hit_cache: !did_miss_cache,
204
+ pending_queries,
205
+ start_time,
206
+ permit: pwft.permit,
207
+ });
208
+
209
+ // The update field is only populated in the event we hit the cache
210
+ let activation = if work.update.is_real() {
211
+ self.metrics.sticky_cache_hit();
212
+ self.wfm.feed_history_from_server(work.update)?
184
213
  } else {
185
- let r = self.wfm.get_next_activation().await?;
214
+ let r = self.wfm.get_next_activation()?;
186
215
  if r.jobs.is_empty() {
187
216
  return Err(RunUpdateErr {
188
217
  source: WFMachinesError::Fatal(format!(
@@ -197,16 +226,17 @@ impl ManagedRun {
197
226
 
198
227
  if activation.jobs.is_empty() {
199
228
  if self.wfm.machines.outstanding_local_activity_count() > 0 {
200
- // If the activation has no jobs but there are outstanding LAs, we need to restart the
201
- // WFT heartbeat.
229
+ // If the activation has no jobs but there are outstanding LAs, we need to restart
230
+ // the WFT heartbeat.
202
231
  if let Some(ref mut lawait) = self.waiting_on_la {
203
232
  if lawait.completion_dat.is_some() {
204
233
  panic!("Should not have completion dat when getting new wft & empty jobs")
205
234
  }
206
- lawait.heartbeat_timeout_task.abort();
207
- lawait.heartbeat_timeout_task = start_heartbeat_timeout_task(
208
- lawait.hb_chan.clone(),
209
- wft.start_time,
235
+ lawait.hb_timeout_handle.abort();
236
+ lawait.hb_timeout_handle = sink_heartbeat_timeout_start(
237
+ self.wfm.machines.run_id.clone(),
238
+ self.local_activity_request_sink.as_ref(),
239
+ start_time,
210
240
  lawait.wft_timeout,
211
241
  );
212
242
  // No activation needs to be sent to lang. We just need to wait for another
@@ -228,41 +258,340 @@ impl ManagedRun {
228
258
  Ok(Some(ActivationOrAuto::LangActivation(activation)))
229
259
  }
230
260
 
231
- async fn completion(
261
+ /// Deletes the currently tracked WFT & records latency metrics. Should be called after it has
262
+ /// been responded to (server has been told). Returns the WFT if there was one.
263
+ pub(super) fn mark_wft_complete(
232
264
  &mut self,
233
- mut completion: RunActivationCompletion,
234
- heartbeat_tx: &UnboundedSender<Span>,
235
- ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
236
- let resp_chan = completion
237
- .resp_chan
238
- .take()
239
- .expect("Completion response channel must be populated");
240
-
241
- let outcome = async move {
242
- // Send commands from lang into the machines then check if the workflow run
243
- // needs another activation and mark it if so
244
- self.wfm.push_commands(completion.commands).await?;
245
- // Don't bother applying the next task if we're evicting at the end of
246
- // this activation
247
- if !completion.activation_was_eviction {
248
- self.wfm.apply_next_task_if_ready().await?;
265
+ report_status: WFTReportStatus,
266
+ ) -> Option<OutstandingTask> {
267
+ debug!("Marking WFT completed");
268
+ let retme = self.wft.take();
269
+
270
+ // Only record latency metrics if we genuinely reported to server
271
+ if matches!(report_status, WFTReportStatus::Reported) {
272
+ if let Some(ot) = &retme {
273
+ self.metrics.wf_task_latency(ot.start_time.elapsed());
249
274
  }
250
- let new_local_acts = self.wfm.drain_queued_local_activities();
275
+ // Tell the LA manager that we're done with the WFT
276
+ self.local_activity_request_sink.sink_reqs(vec![
277
+ LocalActRequest::IndicateWorkflowTaskCompleted(self.wfm.machines.run_id.clone()),
278
+ ]);
279
+ }
251
280
 
252
- let immediate_resolutions = (self.local_activity_request_sink)(new_local_acts);
253
- for resolution in immediate_resolutions {
254
- self.wfm
255
- .notify_of_local_result(LocalResolution::LocalActivity(resolution))?;
281
+ retme
282
+ }
283
+
284
+ /// Checks if any further activations need to go out for this run and produces them if so.
285
+ pub(super) fn check_more_activations(&mut self) -> RunUpdateAct {
286
+ let res = self._check_more_activations();
287
+ self.update_to_acts(res.map(Into::into), false)
288
+ }
289
+
290
+ fn _check_more_activations(&mut self) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
291
+ // No point in checking for more activations if there's already an outstanding activation.
292
+ if self.activation.is_some() {
293
+ return Ok(None);
294
+ }
295
+ // In the event it's time to evict this run, cancel any outstanding LAs
296
+ if self.trying_to_evict.is_some() {
297
+ self.sink_la_requests(vec![LocalActRequest::CancelAllInRun(
298
+ self.wfm.machines.run_id.clone(),
299
+ )])?;
300
+ }
301
+
302
+ if self.wft.is_none() {
303
+ // It doesn't make sense to do workflow work unless we have a WFT
304
+ return Ok(None);
305
+ }
306
+
307
+ if self.wfm.machines.has_pending_jobs() && !self.am_broken {
308
+ Ok(Some(ActivationOrAuto::LangActivation(
309
+ self.wfm.get_next_activation()?,
310
+ )))
311
+ } else {
312
+ if !self.am_broken {
313
+ let has_pending_queries = self
314
+ .wft
315
+ .as_ref()
316
+ .map(|wft| !wft.pending_queries.is_empty())
317
+ .unwrap_or_default();
318
+ if has_pending_queries {
319
+ return Ok(Some(ActivationOrAuto::ReadyForQueries(
320
+ self.wfm.machines.get_wf_activation(),
321
+ )));
322
+ }
323
+ }
324
+ if let Some(wte) = self.trying_to_evict.clone() {
325
+ let mut act = self.wfm.machines.get_wf_activation();
326
+ // No other jobs make any sense to send if we encountered an error.
327
+ if self.am_broken {
328
+ act.jobs = vec![];
329
+ }
330
+ act.append_evict_job(RemoveFromCache {
331
+ message: wte.message,
332
+ reason: wte.reason as i32,
333
+ });
334
+ Ok(Some(ActivationOrAuto::LangActivation(act)))
335
+ } else {
336
+ Ok(None)
256
337
  }
338
+ }
339
+ }
340
+
341
+ /// Called whenever lang successfully completes a workflow activation. Commands produced by the
342
+ /// activation are passed in. `resp_chan` will be used to unblock the completion call when
343
+ /// everything we need to do to fulfill it has happened.
344
+ ///
345
+ /// Can return an error in the event that another page of history needs to be fetched before
346
+ /// the completion can proceed.
347
+ pub(super) fn successful_completion(
348
+ &mut self,
349
+ mut commands: Vec<WFCommand>,
350
+ used_flags: Vec<u32>,
351
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
352
+ ) -> Result<RunUpdateAct, NextPageReq> {
353
+ let activation_was_only_eviction = self.activation_has_only_eviction();
354
+ let (task_token, has_pending_query, start_time) = if let Some(entry) = self.wft.as_ref() {
355
+ (
356
+ entry.info.task_token.clone(),
357
+ !entry.pending_queries.is_empty(),
358
+ entry.start_time,
359
+ )
360
+ } else {
361
+ if !activation_was_only_eviction {
362
+ // Not an error if this was an eviction, since it's normal to issue eviction
363
+ // activations without an associated workflow task in that case.
364
+ dbg_panic!(
365
+ "Attempted to complete activation for run {} without associated workflow task",
366
+ self.run_id()
367
+ );
368
+ }
369
+ self.reply_to_complete(ActivationCompleteOutcome::DoNothing, resp_chan);
370
+ return Ok(None);
371
+ };
257
372
 
258
- let data = CompletionDataForWFT {
259
- task_token: completion.task_token,
260
- query_responses: completion.query_responses,
261
- has_pending_query: completion.has_pending_query,
262
- activation_was_only_eviction: completion.activation_was_only_eviction,
373
+ // If the only command from the activation is a legacy query response, that means we need
374
+ // to respond differently than a typical activation.
375
+ if matches!(&commands.as_slice(),
376
+ &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
377
+ {
378
+ let qr = match commands.remove(0) {
379
+ WFCommand::QueryResponse(qr) => qr,
380
+ _ => unreachable!("We just verified this is the only command"),
263
381
  };
382
+ self.reply_to_complete(
383
+ ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
384
+ task_token,
385
+ action: ActivationAction::RespondLegacyQuery {
386
+ result: Box::new(qr),
387
+ },
388
+ }),
389
+ resp_chan,
390
+ );
391
+ Ok(None)
392
+ } else {
393
+ // First strip out query responses from other commands that actually affect machines
394
+ // Would be prettier with `drain_filter`
395
+ let mut i = 0;
396
+ let mut query_responses = vec![];
397
+ while i < commands.len() {
398
+ if matches!(commands[i], WFCommand::QueryResponse(_)) {
399
+ if let WFCommand::QueryResponse(qr) = commands.remove(i) {
400
+ query_responses.push(qr);
401
+ }
402
+ } else {
403
+ i += 1;
404
+ }
405
+ }
406
+
407
+ if activation_was_only_eviction && !commands.is_empty() {
408
+ dbg_panic!("Reply to an eviction only containing an eviction included commands");
409
+ }
410
+
411
+ let rac = RunActivationCompletion {
412
+ task_token,
413
+ start_time,
414
+ commands,
415
+ activation_was_eviction: self.activation_has_eviction(),
416
+ activation_was_only_eviction,
417
+ has_pending_query,
418
+ query_responses,
419
+ used_flags,
420
+ resp_chan,
421
+ };
422
+
423
+ // Verify we can actually apply the next workflow task, which will happen as part of
424
+ // applying the completion to machines. If we can't, return early indicating we need
425
+ // to fetch a page.
426
+ if !self.wfm.ready_to_apply_next_wft() {
427
+ return if let Some(paginator) = self.paginator.take() {
428
+ debug!("Need to fetch a history page before next WFT can be applied");
429
+ self.completion_waiting_on_page_fetch = Some(rac);
430
+ Err(NextPageReq {
431
+ paginator,
432
+ span: Span::current(),
433
+ })
434
+ } else {
435
+ Ok(self.update_to_acts(
436
+ Err(RunUpdateErr {
437
+ source: WFMachinesError::Fatal(
438
+ "Run's paginator was absent when attempting to fetch next history \
439
+ page. This is a Core SDK bug."
440
+ .to_string(),
441
+ ),
442
+ complete_resp_chan: rac.resp_chan,
443
+ }),
444
+ false,
445
+ ))
446
+ };
447
+ }
448
+
449
+ Ok(self.process_completion(rac))
450
+ }
451
+ }
452
+
453
+ /// Called after the higher-up machinery has fetched more pages of event history needed to apply
454
+ /// the next workflow task. The history update and paginator used to perform the fetch are
455
+ /// passed in, with the update being used to apply the task, and the paginator stored to be
456
+ /// attached with another fetch request if needed.
457
+ pub(super) fn fetched_page_completion(
458
+ &mut self,
459
+ update: HistoryUpdate,
460
+ paginator: HistoryPaginator,
461
+ ) -> RunUpdateAct {
462
+ let res = self._fetched_page_completion(update, paginator);
463
+ self.update_to_acts(res.map(Into::into), false)
464
+ }
465
+ fn _fetched_page_completion(
466
+ &mut self,
467
+ update: HistoryUpdate,
468
+ paginator: HistoryPaginator,
469
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
470
+ self.paginator = Some(paginator);
471
+ if let Some(d) = self.completion_waiting_on_page_fetch.take() {
472
+ self._process_completion(d, Some(update))
473
+ } else {
474
+ dbg_panic!(
475
+ "Shouldn't be possible to be applying a next-page-fetch update when \
476
+ doing anything other than completing an activation."
477
+ );
478
+ Err(RunUpdateErr::from(WFMachinesError::Fatal(
479
+ "Tried to apply next-page-fetch update to a run that wasn't handling a completion"
480
+ .to_string(),
481
+ )))
482
+ }
483
+ }
484
+
485
+ /// Called whenever either core lang cannot complete a workflow activation. EX: Nondeterminism
486
+ /// or user code threw/panicked, respectively. The `cause` and `reason` fields are determined
487
+ /// inside core always. The `failure` field may come from lang. `resp_chan` will be used to
488
+ /// unblock the completion call when everything we need to do to fulfill it has happened.
489
+ pub(super) fn failed_completion(
490
+ &mut self,
491
+ cause: WorkflowTaskFailedCause,
492
+ reason: EvictionReason,
493
+ failure: workflow_completion::Failure,
494
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
495
+ ) -> RunUpdateAct {
496
+ let tt = if let Some(tt) = self.wft.as_ref().map(|t| t.info.task_token.clone()) {
497
+ tt
498
+ } else {
499
+ dbg_panic!(
500
+ "No workflow task for run id {} found when trying to fail activation",
501
+ self.run_id()
502
+ );
503
+ self.reply_to_complete(ActivationCompleteOutcome::DoNothing, resp_chan);
504
+ return None;
505
+ };
506
+
507
+ self.metrics.wf_task_failed();
508
+ let message = format!("Workflow activation completion failed: {:?}", &failure);
509
+ // Blow up any cached data associated with the workflow
510
+ let evict_req_outcome = self.request_eviction(RequestEvictMsg {
511
+ run_id: self.run_id().to_string(),
512
+ message,
513
+ reason,
514
+ });
515
+ let should_report = match &evict_req_outcome {
516
+ EvictionRequestResult::EvictionRequested(Some(attempt), _)
517
+ | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => *attempt <= 1,
518
+ _ => false,
519
+ };
520
+ let rur = evict_req_outcome.into_run_update_resp();
521
+ // If the outstanding WFT is a legacy query task, report that we need to fail it
522
+ let outcome = if self.pending_work_is_legacy_query() {
523
+ ActivationCompleteOutcome::ReportWFTFail(
524
+ FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
525
+ )
526
+ } else if should_report {
527
+ ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
528
+ tt, cause, failure,
529
+ ))
530
+ } else {
531
+ ActivationCompleteOutcome::WFTFailedDontReport
532
+ };
533
+ self.reply_to_complete(outcome, resp_chan);
534
+ rur
535
+ }
536
+
537
+ /// Delete the currently tracked workflow activation and return it, if any. Should be called
538
+ /// after the processing of the activation completion, and WFT reporting.
539
+ pub(super) fn delete_activation(&mut self) -> Option<OutstandingActivation> {
540
+ self.activation.take()
541
+ }
542
+
543
+ /// Called when local activities resolve
544
+ pub(super) fn local_resolution(&mut self, res: LocalResolution) -> RunUpdateAct {
545
+ let res = self._local_resolution(res);
546
+ self.update_to_acts(res.map(Into::into), false)
547
+ }
548
+
549
+ fn process_completion(&mut self, completion: RunActivationCompletion) -> RunUpdateAct {
550
+ let res = self._process_completion(completion, None);
551
+ self.update_to_acts(res.map(Into::into), false)
552
+ }
553
+
554
+ fn _process_completion(
555
+ &mut self,
556
+ completion: RunActivationCompletion,
557
+ new_update: Option<HistoryUpdate>,
558
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
559
+ let data = CompletionDataForWFT {
560
+ task_token: completion.task_token,
561
+ query_responses: completion.query_responses,
562
+ has_pending_query: completion.has_pending_query,
563
+ activation_was_only_eviction: completion.activation_was_only_eviction,
564
+ };
565
+
566
+ self.wfm.machines.add_lang_used_flags(completion.used_flags);
567
+
568
+ // If this is just bookkeeping after a reply to an only-eviction activation, we can bypass
569
+ // everything, since there is no reason to continue trying to update machines.
570
+ if completion.activation_was_only_eviction {
571
+ return Ok(Some(self.prepare_complete_resp(
572
+ completion.resp_chan,
573
+ data,
574
+ false,
575
+ )));
576
+ }
577
+
578
+ let outcome = (|| {
579
+ // Send commands from lang into the machines then check if the workflow run needs
580
+ // another activation and mark it if so
581
+ self.wfm.push_commands_and_iterate(completion.commands)?;
582
+ // If there was a new update included as part of the completion, apply it.
583
+ if let Some(update) = new_update {
584
+ self.wfm.feed_history_from_new_page(update)?;
585
+ }
586
+ // Don't bother applying the next task if we're evicting at the end of this activation
587
+ if !completion.activation_was_eviction {
588
+ self.wfm.apply_next_task_if_ready()?;
589
+ }
590
+ let new_local_acts = self.wfm.drain_queued_local_activities();
591
+ self.sink_la_requests(new_local_acts)?;
592
+
264
593
  if self.wfm.machines.outstanding_local_activity_count() == 0 {
265
- Ok((None, data, self))
594
+ Ok(None)
266
595
  } else {
267
596
  let wft_timeout: Duration = self
268
597
  .wfm
@@ -275,28 +604,26 @@ impl ManagedRun {
275
604
  .to_string(),
276
605
  )
277
606
  })?;
278
- let heartbeat_tx = heartbeat_tx.clone();
279
- Ok((
280
- Some((heartbeat_tx, completion.start_time, wft_timeout)),
281
- data,
282
- self,
283
- ))
607
+ Ok(Some((completion.start_time, wft_timeout)))
284
608
  }
285
- }
286
- .await;
609
+ })();
287
610
 
288
611
  match outcome {
289
- Ok((None, data, me)) => Ok(Some(me.prepare_complete_resp(resp_chan, data, false))),
290
- Ok((Some((chan, start_t, wft_timeout)), data, me)) => {
291
- if let Some(wola) = me.waiting_on_la.as_mut() {
292
- wola.heartbeat_timeout_task.abort();
612
+ Ok(None) => Ok(Some(self.prepare_complete_resp(
613
+ completion.resp_chan,
614
+ data,
615
+ false,
616
+ ))),
617
+ Ok(Some((start_t, wft_timeout))) => {
618
+ if let Some(wola) = self.waiting_on_la.as_mut() {
619
+ wola.hb_timeout_handle.abort();
293
620
  }
294
- me.waiting_on_la = Some(WaitingOnLAs {
621
+ self.waiting_on_la = Some(WaitingOnLAs {
295
622
  wft_timeout,
296
- completion_dat: Some((data, resp_chan)),
297
- hb_chan: chan.clone(),
298
- heartbeat_timeout_task: start_heartbeat_timeout_task(
299
- chan,
623
+ completion_dat: Some((data, completion.resp_chan)),
624
+ hb_timeout_handle: sink_heartbeat_timeout_start(
625
+ self.run_id().to_string(),
626
+ self.local_activity_request_sink.as_ref(),
300
627
  start_t,
301
628
  wft_timeout,
302
629
  ),
@@ -305,72 +632,347 @@ impl ManagedRun {
305
632
  }
306
633
  Err(e) => Err(RunUpdateErr {
307
634
  source: e,
308
- complete_resp_chan: Some(resp_chan),
635
+ complete_resp_chan: completion.resp_chan,
309
636
  }),
310
637
  }
311
638
  }
312
639
 
313
- async fn check_more_work(
640
+ fn _local_resolution(
314
641
  &mut self,
315
- want_to_evict: Option<RequestEvictMsg>,
316
- has_pending_queries: bool,
317
- has_wft: bool,
318
- ) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
319
- if !has_wft {
320
- // It doesn't make sense to do work unless we have a WFT
321
- return Ok(None);
642
+ res: LocalResolution,
643
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
644
+ debug!(resolution=?res, "Applying local resolution");
645
+ self.wfm.notify_of_local_result(res)?;
646
+ if self.wfm.machines.outstanding_local_activity_count() == 0 {
647
+ if let Some(mut wait_dat) = self.waiting_on_la.take() {
648
+ // Cancel the heartbeat timeout
649
+ wait_dat.hb_timeout_handle.abort();
650
+ if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
651
+ return Ok(Some(self.prepare_complete_resp(
652
+ resp_chan,
653
+ completion_dat,
654
+ false,
655
+ )));
656
+ }
657
+ }
322
658
  }
323
- if self.wfm.machines.has_pending_jobs() && !self.am_broken {
324
- Ok(Some(ActivationOrAuto::LangActivation(
325
- self.wfm.get_next_activation().await?,
326
- )))
659
+ Ok(None)
660
+ }
661
+
662
+ pub(super) fn heartbeat_timeout(&mut self) -> RunUpdateAct {
663
+ let maybe_act = if self._heartbeat_timeout() {
664
+ Some(ActivationOrAuto::Autocomplete {
665
+ run_id: self.wfm.machines.run_id.clone(),
666
+ })
327
667
  } else {
328
- if has_pending_queries && !self.am_broken {
329
- return Ok(Some(ActivationOrAuto::ReadyForQueries(
330
- self.wfm.machines.get_wf_activation(),
331
- )));
668
+ None
669
+ };
670
+ self.update_to_acts(Ok(maybe_act).map(Into::into), false)
671
+ }
672
+ /// Returns `true` if autocompletion should be issued, which will actually cause us to end up
673
+ /// in [completion] again, at which point we'll start a new heartbeat timeout, which will
674
+ /// immediately trigger and thus finish the completion, forcing a new task as it should.
675
+ fn _heartbeat_timeout(&mut self) -> bool {
676
+ if let Some(ref mut wait_dat) = self.waiting_on_la {
677
+ // Cancel the heartbeat timeout
678
+ wait_dat.hb_timeout_handle.abort();
679
+ if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
680
+ let compl = self.prepare_complete_resp(resp_chan, completion_dat, true);
681
+ // Immediately fulfill the completion since the run update will already have
682
+ // been replied to
683
+ compl.fulfill();
684
+ } else {
685
+ // Auto-reply WFT complete
686
+ return true;
332
687
  }
333
- if let Some(wte) = want_to_evict {
334
- let mut act = self.wfm.machines.get_wf_activation();
335
- // No other jobs make any sense to send if we encountered an error.
336
- if self.am_broken {
337
- act.jobs = vec![];
338
- }
339
- act.append_evict_job(RemoveFromCache {
340
- message: wte.message,
341
- reason: wte.reason as i32,
342
- });
343
- Ok(Some(ActivationOrAuto::LangActivation(act)))
688
+ }
689
+ false
690
+ }
691
+
692
+ /// Returns true if the managed run has any form of pending work
693
+ /// If `ignore_evicts` is true, pending evictions do not count as pending work.
694
+ /// If `ignore_buffered` is true, buffered workflow tasks do not count as pending work.
695
+ pub(super) fn has_any_pending_work(&self, ignore_evicts: bool, ignore_buffered: bool) -> bool {
696
+ let evict_work = if ignore_evicts {
697
+ false
698
+ } else {
699
+ self.trying_to_evict.is_some()
700
+ };
701
+ let act_work = if ignore_evicts {
702
+ if let Some(ref act) = self.activation {
703
+ !act.has_only_eviction()
344
704
  } else {
345
- Ok(None)
705
+ false
706
+ }
707
+ } else {
708
+ self.activation.is_some()
709
+ };
710
+ let buffered = if ignore_buffered {
711
+ false
712
+ } else {
713
+ self.buffered_resp.is_some()
714
+ };
715
+ trace!(wft=self.wft.is_some(), buffered=?buffered, more_work=?self.more_pending_work(),
716
+ act_work, evict_work, "Does run have pending work?");
717
+ self.wft.is_some() || buffered || self.more_pending_work() || act_work || evict_work
718
+ }
719
+
720
+ /// Stores some work if there is any outstanding WFT or activation for the run. If there was
721
+ /// not, returns the work back out inside the option.
722
+ pub(super) fn buffer_wft_if_outstanding_work(
723
+ &mut self,
724
+ work: PermittedWFT,
725
+ ) -> Option<PermittedWFT> {
726
+ let about_to_issue_evict = self.trying_to_evict.is_some();
727
+ let has_wft = self.wft().is_some();
728
+ let has_activation = self.activation().is_some();
729
+ if has_wft || has_activation || about_to_issue_evict || self.more_pending_work() {
730
+ debug!(run_id = %self.run_id(),
731
+ "Got new WFT for a run with outstanding work, buffering it");
732
+ self.buffered_resp = Some(work);
733
+ None
734
+ } else {
735
+ Some(work)
736
+ }
737
+ }
738
+
739
+ /// Returns true if there is a buffered workflow task for this run.
740
+ pub(super) fn has_buffered_wft(&self) -> bool {
741
+ self.buffered_resp.is_some()
742
+ }
743
+
744
+ /// Removes and returns the buffered workflow task, if any.
745
+ pub(super) fn take_buffered_wft(&mut self) -> Option<PermittedWFT> {
746
+ self.buffered_resp.take()
747
+ }
748
+
749
+ pub(super) fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
750
+ let attempts = self.wft.as_ref().map(|wt| wt.info.attempt);
751
+
752
+ // If we were waiting on a page fetch and we're getting evicted because fetching failed,
753
+ // then make sure we allow the completion to proceed, otherwise we're stuck waiting forever.
754
+ if self.completion_waiting_on_page_fetch.is_some()
755
+ && matches!(info.reason, EvictionReason::PaginationOrHistoryFetch)
756
+ {
757
+ // We just checked it is some, unwrap OK.
758
+ let c = self.completion_waiting_on_page_fetch.take().unwrap();
759
+ let run_upd = self.failed_completion(
760
+ WorkflowTaskFailedCause::Unspecified,
761
+ info.reason,
762
+ Failure::application_failure(info.message, false).into(),
763
+ c.resp_chan,
764
+ );
765
+ return EvictionRequestResult::EvictionRequested(attempts, run_upd);
766
+ }
767
+
768
+ if !self.activation_has_eviction() && self.trying_to_evict.is_none() {
769
+ debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
770
+ self.trying_to_evict = Some(info);
771
+ EvictionRequestResult::EvictionRequested(attempts, self.check_more_activations())
772
+ } else {
773
+ EvictionRequestResult::EvictionAlreadyRequested(attempts)
774
+ }
775
+ }
776
+
777
+ pub(super) fn record_span_fields(&mut self, span: &Span) {
778
+ if let Some(spid) = span.id() {
779
+ if self.recorded_span_ids.contains(&spid) {
780
+ return;
781
+ }
782
+ self.recorded_span_ids.insert(spid);
783
+
784
+ if let Some(wid) = self.wft().map(|wft| &wft.info.wf_id) {
785
+ span.record("workflow_id", wid.as_str());
786
+ }
787
+ }
788
+ }
789
+
790
+ /// Take the result of some update to ourselves and turn it into a return value of zero or more
791
+ /// actions
792
+ fn update_to_acts(
793
+ &mut self,
794
+ outcome: Result<ActOrFulfill, RunUpdateErr>,
795
+ in_response_to_wft: bool,
796
+ ) -> RunUpdateAct {
797
+ match outcome {
798
+ Ok(act_or_fulfill) => {
799
+ let (mut maybe_act, maybe_fulfill) = match act_or_fulfill {
800
+ ActOrFulfill::OutgoingAct(a) => (a, None),
801
+ ActOrFulfill::FulfillableComplete(c) => (None, c),
802
+ };
803
+ // If there's no activation but is pending work, check and possibly generate one
804
+ if self.more_pending_work() && maybe_act.is_none() {
805
+ match self._check_more_activations() {
806
+ Ok(oa) => maybe_act = oa,
807
+ Err(e) => {
808
+ return self.update_to_acts(Err(e), in_response_to_wft);
809
+ }
810
+ }
811
+ }
812
+ let r = match maybe_act {
813
+ Some(ActivationOrAuto::LangActivation(mut activation)) => {
814
+ if in_response_to_wft {
815
+ let wft = self
816
+ .wft
817
+ .as_mut()
818
+ .expect("WFT must exist for run just updated with one");
819
+ // If there are in-poll queries, insert jobs for those queries into the
820
+ // activation, but only if we hit the cache. If we didn't, those queries
821
+ // will need to be dealt with once replay is over
822
+ if wft.hit_cache {
823
+ put_queries_in_act(&mut activation, wft);
824
+ }
825
+ }
826
+
827
+ if activation.jobs.is_empty() {
828
+ dbg_panic!("Should not send lang activation with no jobs");
829
+ }
830
+ Some(ActivationOrAuto::LangActivation(activation))
831
+ }
832
+ Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
833
+ if let Some(wft) = self.wft.as_mut() {
834
+ put_queries_in_act(&mut act, wft);
835
+ Some(ActivationOrAuto::LangActivation(act))
836
+ } else {
837
+ dbg_panic!("Ready for queries but no WFT!");
838
+ None
839
+ }
840
+ }
841
+ a @ Some(
842
+ ActivationOrAuto::Autocomplete { .. } | ActivationOrAuto::AutoFail { .. },
843
+ ) => a,
844
+ None => {
845
+ if let Some(reason) = self.trying_to_evict.as_ref() {
846
+ // If we had nothing to do, but we're trying to evict, just do that now
847
+ // as long as there's no other outstanding work.
848
+ if self.activation.is_none() && !self.more_pending_work() {
849
+ let mut evict_act = create_evict_activation(
850
+ self.run_id().to_string(),
851
+ reason.message.clone(),
852
+ reason.reason,
853
+ );
854
+ evict_act.history_length =
855
+ self.most_recently_processed_event_number() as u32;
856
+ Some(ActivationOrAuto::LangActivation(evict_act))
857
+ } else {
858
+ None
859
+ }
860
+ } else {
861
+ None
862
+ }
863
+ }
864
+ };
865
+ if let Some(f) = maybe_fulfill {
866
+ f.fulfill();
867
+ }
868
+
869
+ match r {
870
+ // After each run update, check if it's ready to handle any buffered poll
871
+ None | Some(ActivationOrAuto::Autocomplete { .. })
872
+ if !self.has_any_pending_work(false, true) =>
873
+ {
874
+ if let Some(bufft) = self.buffered_resp.take() {
875
+ self.incoming_wft(bufft)
876
+ } else {
877
+ None
878
+ }
879
+ }
880
+ Some(r) => {
881
+ self.insert_outstanding_activation(&r);
882
+ Some(r)
883
+ }
884
+ None => None,
885
+ }
886
+ }
887
+ Err(fail) => {
888
+ self.am_broken = true;
889
+ let rur = if let Some(resp_chan) = fail.complete_resp_chan {
890
+ // Automatically fail the workflow task in the event we couldn't update machines
891
+ let fail_cause = if matches!(&fail.source, WFMachinesError::Nondeterminism(_)) {
892
+ WorkflowTaskFailedCause::NonDeterministicError
893
+ } else {
894
+ WorkflowTaskFailedCause::Unspecified
895
+ };
896
+ let wft_fail_str = format!("{:?}", fail.source);
897
+ self.failed_completion(
898
+ fail_cause,
899
+ fail.source.evict_reason(),
900
+ Failure::application_failure(wft_fail_str, false).into(),
901
+ Some(resp_chan),
902
+ )
903
+ } else {
904
+ warn!(error=?fail.source, "Error while updating workflow");
905
+ Some(ActivationOrAuto::AutoFail {
906
+ run_id: self.run_id().to_owned(),
907
+ machines_err: fail.source,
908
+ })
909
+ };
910
+ rur
346
911
  }
347
912
  }
348
913
  }
349
914
 
915
+ fn insert_outstanding_activation(&mut self, act: &ActivationOrAuto) {
916
+ let act_type = match &act {
917
+ ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
918
+ if act.is_legacy_query() {
919
+ OutstandingActivation::LegacyQuery
920
+ } else {
921
+ OutstandingActivation::Normal {
922
+ contains_eviction: act.eviction_index().is_some(),
923
+ num_jobs: act.jobs.len(),
924
+ }
925
+ }
926
+ }
927
+ ActivationOrAuto::Autocomplete { .. } | ActivationOrAuto::AutoFail { .. } => {
928
+ OutstandingActivation::Autocomplete
929
+ }
930
+ };
931
+ if let Some(old_act) = self.activation {
932
+ // This is a panic because we have screwed up core logic if this is violated. It must be
933
+ // upheld.
934
+ panic!(
935
+ "Attempted to insert a new outstanding activation {act:?}, but there already was \
936
+ one outstanding: {old_act:?}"
937
+ );
938
+ }
939
+ self.activation = Some(act_type);
940
+ }
941
+
350
942
  fn prepare_complete_resp(
351
943
  &mut self,
352
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
944
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
353
945
  data: CompletionDataForWFT,
354
946
  due_to_heartbeat_timeout: bool,
355
947
  ) -> FulfillableActivationComplete {
356
- let outgoing_cmds = self.wfm.get_server_commands();
948
+ let mut outgoing_cmds = self.wfm.get_server_commands();
949
+ if data.activation_was_only_eviction && !outgoing_cmds.commands.is_empty() {
950
+ if self.am_broken {
951
+ // If we broke there could be commands in the pipe that we didn't get a chance to
952
+ // handle properly during replay, just wipe them all out.
953
+ outgoing_cmds.commands = vec![];
954
+ } else {
955
+ dbg_panic!(
956
+ "There should not be any outgoing commands when preparing a completion response \
957
+ if the activation was only an eviction. This is an SDK bug."
958
+ );
959
+ }
960
+ }
961
+
357
962
  let query_responses = data.query_responses;
358
963
  let has_query_responses = !query_responses.is_empty();
359
964
  let is_query_playback = data.has_pending_query && !has_query_responses;
360
965
  let mut force_new_wft = due_to_heartbeat_timeout;
361
966
 
362
- // We only actually want to send commands back to the server if there are no more
363
- // pending activations and we are caught up on replay. We don't want to complete a wft
364
- // if we already saw the final event in the workflow, or if we are playing back for the
365
- // express purpose of fulfilling a query. If the activation we sent was *only* an
366
- // eviction, and there were no commands produced during iteration, don't send that
967
+ // We only actually want to send commands back to the server if there are no more pending
968
+ // activations and we are caught up on replay. We don't want to complete a wft if we already
969
+ // saw the final event in the workflow, or if we are playing back for the express purpose of
970
+ // fulfilling a query. If the activation we sent was *only* an eviction, don't send that
367
971
  // either.
368
- let no_commands_and_evicting =
369
- outgoing_cmds.commands.is_empty() && data.activation_was_only_eviction;
370
972
  let should_respond = !(self.wfm.machines.has_pending_jobs()
371
973
  || outgoing_cmds.replaying
372
974
  || is_query_playback
373
- || no_commands_and_evicting);
975
+ || data.activation_was_only_eviction);
374
976
  // If there are pending LA resolutions, and we're responding to a query here,
375
977
  // we want to make sure to force a new task, as otherwise once we tell lang about
376
978
  // the LA resolution there wouldn't be any task to reply to with the result of iterating
@@ -378,17 +980,17 @@ impl ManagedRun {
378
980
  if has_query_responses && self.wfm.machines.has_pending_la_resolutions() {
379
981
  force_new_wft = true;
380
982
  }
381
- let to_be_sent = ServerCommandsWithWorkflowInfo {
382
- task_token: data.task_token,
383
- action: ActivationAction::WftComplete {
384
- force_new_wft,
385
- commands: outgoing_cmds.commands,
386
- query_responses,
387
- },
388
- };
389
983
 
390
984
  let outcome = if should_respond || has_query_responses {
391
- ActivationCompleteOutcome::ReportWFTSuccess(to_be_sent)
985
+ ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
986
+ task_token: data.task_token,
987
+ action: ActivationAction::WftComplete {
988
+ force_new_wft,
989
+ commands: outgoing_cmds.commands,
990
+ query_responses,
991
+ sdk_metadata: self.wfm.machines.get_metadata_for_wft_complete(),
992
+ },
993
+ })
392
994
  } else {
393
995
  ActivationCompleteOutcome::DoNothing
394
996
  };
@@ -401,131 +1003,136 @@ impl ManagedRun {
401
1003
  }
402
1004
  }
403
1005
 
404
- async fn local_resolution(
1006
+ /// Pump some local activity requests into the sink, applying any immediate results to the
1007
+ /// workflow machines.
1008
+ fn sink_la_requests(
405
1009
  &mut self,
406
- res: LocalResolution,
407
- ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
408
- debug!(resolution=?res, "Applying local resolution");
409
- self.wfm.notify_of_local_result(res)?;
410
- if self.wfm.machines.outstanding_local_activity_count() == 0 {
411
- if let Some(mut wait_dat) = self.waiting_on_la.take() {
412
- // Cancel the heartbeat timeout
413
- wait_dat.heartbeat_timeout_task.abort();
414
- if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
415
- return Ok(Some(self.prepare_complete_resp(
416
- resp_chan,
417
- completion_dat,
418
- false,
419
- )));
420
- }
421
- }
1010
+ new_local_acts: Vec<LocalActRequest>,
1011
+ ) -> Result<(), WFMachinesError> {
1012
+ let immediate_resolutions = self.local_activity_request_sink.sink_reqs(new_local_acts);
1013
+ if !immediate_resolutions.is_empty() {
1014
+ warn!("Immediate res: {:?}", &immediate_resolutions);
422
1015
  }
423
- Ok(None)
1016
+ for resolution in immediate_resolutions {
1017
+ self.wfm
1018
+ .notify_of_local_result(LocalResolution::LocalActivity(resolution))?;
1019
+ }
1020
+ Ok(())
424
1021
  }
425
1022
 
426
- /// Returns `true` if autocompletion should be issued, which will actually cause us to end up
427
- /// in [completion] again, at which point we'll start a new heartbeat timeout, which will
428
- /// immediately trigger and thus finish the completion, forcing a new task as it should.
429
- fn heartbeat_timeout(&mut self) -> bool {
430
- if let Some(ref mut wait_dat) = self.waiting_on_la {
431
- // Cancel the heartbeat timeout
432
- wait_dat.heartbeat_timeout_task.abort();
433
- if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
434
- let compl = self.prepare_complete_resp(resp_chan, completion_dat, true);
435
- // Immediately fulfill the completion since the run update will already have
436
- // been replied to
437
- compl.fulfill();
438
- } else {
439
- // Auto-reply WFT complete
440
- return true;
441
- }
442
- } else {
443
- // If a heartbeat timeout happened, we should always have been waiting on LAs
444
- dbg_panic!("WFT heartbeat timeout fired but we were not waiting on any LAs");
1023
+ fn reply_to_complete(
1024
+ &self,
1025
+ outcome: ActivationCompleteOutcome,
1026
+ chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1027
+ ) {
1028
+ if let Some(chan) = chan {
1029
+ chan.send(ActivationCompleteResult {
1030
+ most_recently_processed_event: self.most_recently_processed_event_number() as usize,
1031
+ outcome,
1032
+ })
1033
+ .expect("Rcv half of activation reply not dropped");
445
1034
  }
446
- false
447
1035
  }
448
1036
 
449
- fn send_update_response(&self, outcome: RunActionOutcome, no_wft: bool) {
450
- let mut in_response_to_wft = false;
451
- let (outgoing_activation, fulfillable_complete) = match outcome {
452
- RunActionOutcome::AfterNewWFT(a) => {
453
- in_response_to_wft = true;
454
- (a, None)
455
- }
456
- RunActionOutcome::AfterCheckWork(a) => (a, None),
457
- RunActionOutcome::AfterLocalResolution(f) => (None, f),
458
- RunActionOutcome::AfterCompletion(f) => (None, f),
459
- RunActionOutcome::AfterHeartbeatTimeout(a) => (a, None),
460
- };
461
- let mut more_pending_work = self.wfm.machines.has_pending_jobs();
462
- // We don't want to consider there to be more local-only work to be done if there is no
463
- // workflow task associated with the run right now. This can happen if, ex, we complete
464
- // a local activity while waiting for server to send us the next WFT. Activating lang would
465
- // be harmful at this stage, as there might be work returned in that next WFT which should
466
- // be part of the next activation.
467
- if no_wft {
468
- more_pending_work = false;
469
- }
470
- self.update_tx
471
- .send(RunUpdateResponse {
472
- kind: RunUpdateResponseKind::Good(GoodRunUpdate {
473
- run_id: self.wfm.machines.run_id.clone(),
474
- outgoing_activation,
475
- fulfillable_complete,
476
- have_seen_terminal_event: self.wfm.machines.have_seen_terminal_event,
477
- more_pending_work,
478
- most_recently_processed_event_number: self.wfm.machines.last_processed_event
479
- as usize,
480
- in_response_to_wft,
481
- }),
482
- span: Span::current(),
483
- })
484
- .expect("Machine can send update");
1037
+ /// Returns true if the handle is currently processing a WFT which contains a legacy query.
1038
+ fn pending_work_is_legacy_query(&self) -> bool {
1039
+ // Either we know because there is a pending legacy query, or it's already been drained and
1040
+ // sent as an activation.
1041
+ matches!(self.activation, Some(OutstandingActivation::LegacyQuery))
1042
+ || self
1043
+ .wft
1044
+ .as_ref()
1045
+ .map(|t| t.has_pending_legacy_query())
1046
+ .unwrap_or_default()
1047
+ }
1048
+
1049
+ fn most_recently_processed_event_number(&self) -> i64 {
1050
+ self.wfm.machines.last_processed_event
1051
+ }
1052
+
1053
+ fn activation_has_eviction(&mut self) -> bool {
1054
+ self.activation
1055
+ .map(OutstandingActivation::has_eviction)
1056
+ .unwrap_or_default()
1057
+ }
1058
+
1059
+ fn activation_has_only_eviction(&mut self) -> bool {
1060
+ self.activation
1061
+ .map(OutstandingActivation::has_only_eviction)
1062
+ .unwrap_or_default()
1063
+ }
1064
+
1065
+ fn run_id(&self) -> &str {
1066
+ &self.wfm.machines.run_id
485
1067
  }
486
1068
  }
487
1069
 
488
- fn start_heartbeat_timeout_task(
489
- chan: UnboundedSender<Span>,
1070
+ /// Drains pending queries from the workflow task and appends them to the activation's jobs
1071
+ fn put_queries_in_act(act: &mut WorkflowActivation, wft: &mut OutstandingTask) {
1072
+ // Nothing to do if there are no pending queries
1073
+ if wft.pending_queries.is_empty() {
1074
+ return;
1075
+ }
1076
+
1077
+ let has_legacy = wft.has_pending_legacy_query();
1078
+ // Cannot dispatch legacy query if there are any other jobs - which can happen if, ex, a local
1079
+ // activity resolves while we've gotten a legacy query after heartbeating.
1080
+ if has_legacy && !act.jobs.is_empty() {
1081
+ return;
1082
+ }
1083
+
1084
+ debug!(queries=?wft.pending_queries, "Dispatching queries");
1085
+ let query_jobs = wft
1086
+ .pending_queries
1087
+ .drain(..)
1088
+ .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
1089
+ act.jobs.extend(query_jobs);
1090
+ }
1091
+ fn sink_heartbeat_timeout_start(
1092
+ run_id: String,
1093
+ sink: &dyn LocalActivityRequestSink,
490
1094
  wft_start_time: Instant,
491
1095
  wft_timeout: Duration,
492
- ) -> JoinHandle<()> {
1096
+ ) -> AbortHandle {
493
1097
  // The heartbeat deadline is 80% of the WFT timeout
494
- let wft_heartbeat_deadline =
495
- wft_start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
496
- task::spawn(async move {
497
- tokio::time::sleep_until(wft_heartbeat_deadline.into()).await;
498
- let _ = chan.send(Span::current());
499
- })
500
- }
501
-
502
- enum RunActionOutcome {
503
- AfterNewWFT(Option<ActivationOrAuto>),
504
- AfterCheckWork(Option<ActivationOrAuto>),
505
- AfterLocalResolution(Option<FulfillableActivationComplete>),
506
- AfterCompletion(Option<FulfillableActivationComplete>),
507
- AfterHeartbeatTimeout(Option<ActivationOrAuto>),
1098
+ let deadline = wft_start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
1099
+ let (abort_handle, abort_reg) = AbortHandle::new_pair();
1100
+ sink.sink_reqs(vec![LocalActRequest::StartHeartbeatTimeout {
1101
+ send_on_elapse: HeartbeatTimeoutMsg {
1102
+ run_id,
1103
+ span: Span::current(),
1104
+ },
1105
+ deadline,
1106
+ abort_reg,
1107
+ }]);
1108
+ abort_handle
508
1109
  }
509
1110
 
510
- #[derive(derive_more::DebugCustom)]
511
- #[debug(fmt = "RunUpdateErr({:?})", source)]
512
- struct RunUpdateErr {
513
- source: WFMachinesError,
514
- complete_resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1111
+ /// If an activation completion needed to wait on LA completions (or heartbeat timeout) we use
1112
+ /// this struct to store the data we need to finish the completion once that has happened
1113
+ struct WaitingOnLAs {
1114
+ wft_timeout: Duration,
1115
+ /// If set, we are waiting for LAs to complete as part of a just-finished workflow activation.
1116
+ /// If unset, we already had a heartbeat timeout and got a new WFT without any new work while
1117
+ /// there are still incomplete LAs.
1118
+ completion_dat: Option<(
1119
+ CompletionDataForWFT,
1120
+ Option<oneshot::Sender<ActivationCompleteResult>>,
1121
+ )>,
1122
+ /// Can be used to abort heartbeat timeouts
1123
+ hb_timeout_handle: AbortHandle,
515
1124
  }
516
-
517
- impl From<WFMachinesError> for RunUpdateErr {
518
- fn from(e: WFMachinesError) -> Self {
519
- RunUpdateErr {
520
- source: e,
521
- complete_resp_chan: None,
522
- }
523
- }
1125
+ #[derive(Debug)]
1126
+ struct CompletionDataForWFT {
1127
+ task_token: TaskToken,
1128
+ query_responses: Vec<QueryResult>,
1129
+ has_pending_query: bool,
1130
+ activation_was_only_eviction: bool,
524
1131
  }
525
1132
 
526
1133
  /// Manages an instance of a [WorkflowMachines], which is not thread-safe, as well as other data
527
1134
  /// associated with that specific workflow run.
528
- pub(crate) struct WorkflowManager {
1135
+ struct WorkflowManager {
529
1136
  machines: WorkflowMachines,
530
1137
  /// Is always `Some` in normal operation. Optional to allow for unit testing with the test
531
1138
  /// workflow driver, which does not need to complete activations the normal way.
@@ -535,24 +1142,9 @@ pub(crate) struct WorkflowManager {
535
1142
  impl WorkflowManager {
536
1143
  /// Create a new workflow manager given workflow history and execution info as would be found
537
1144
  /// in [PollWorkflowTaskQueueResponse]
538
- pub fn new(
539
- history: HistoryUpdate,
540
- namespace: String,
541
- workflow_id: String,
542
- workflow_type: String,
543
- run_id: String,
544
- metrics: MetricsContext,
545
- ) -> Self {
1145
+ fn new(basics: RunBasics) -> Self {
546
1146
  let (wfb, cmd_sink) = WorkflowBridge::new();
547
- let state_machines = WorkflowMachines::new(
548
- namespace,
549
- workflow_id,
550
- workflow_type,
551
- run_id,
552
- history,
553
- Box::new(wfb).into(),
554
- metrics,
555
- );
1147
+ let state_machines = WorkflowMachines::new(basics, Box::new(wfb).into());
556
1148
  Self {
557
1149
  machines: state_machines,
558
1150
  command_sink: Some(cmd_sink),
@@ -560,7 +1152,7 @@ impl WorkflowManager {
560
1152
  }
561
1153
 
562
1154
  #[cfg(test)]
563
- pub const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
1155
+ const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
564
1156
  Self {
565
1157
  machines: workflow_machines,
566
1158
  command_sink: None,
@@ -571,12 +1163,15 @@ impl WorkflowManager {
571
1163
  ///
572
1164
  /// Should only be called when a workflow has caught up on replay (or is just beginning). It
573
1165
  /// will return a workflow activation if one is needed.
574
- async fn feed_history_from_server(
575
- &mut self,
576
- update: HistoryUpdate,
577
- ) -> Result<WorkflowActivation> {
578
- self.machines.new_history_from_server(update).await?;
579
- self.get_next_activation().await
1166
+ fn feed_history_from_server(&mut self, update: HistoryUpdate) -> Result<WorkflowActivation> {
1167
+ self.machines.new_history_from_server(update)?;
1168
+ self.get_next_activation()
1169
+ }
1170
+
1171
+ /// Update the machines with some events from fetching another page of history. Does *not*
1172
+ /// attempt to pull the next activation, unlike [Self::feed_history_from_server].
1173
+ fn feed_history_from_new_page(&mut self, update: HistoryUpdate) -> Result<()> {
1174
+ self.machines.new_history_from_server(update)
580
1175
  }
581
1176
 
582
1177
  /// Let this workflow know that something we've been waiting locally on has resolved, like a
@@ -593,27 +1188,33 @@ impl WorkflowManager {
593
1188
  ///
594
1189
  /// Callers may also need to call [get_server_commands] after this to issue any pending commands
595
1190
  /// to the server.
596
- async fn get_next_activation(&mut self) -> Result<WorkflowActivation> {
1191
+ fn get_next_activation(&mut self) -> Result<WorkflowActivation> {
597
1192
  // First check if there are already some pending jobs, which can be a result of replay.
598
1193
  let activation = self.machines.get_wf_activation();
599
1194
  if !activation.jobs.is_empty() {
600
1195
  return Ok(activation);
601
1196
  }
602
1197
 
603
- self.machines.apply_next_wft_from_history().await?;
1198
+ self.machines.apply_next_wft_from_history()?;
604
1199
  Ok(self.machines.get_wf_activation())
605
1200
  }
606
1201
 
1202
+ /// Returns true if machines are ready to apply the next WFT sequence, false if events will need
1203
+ /// to be fetched in order to create a complete update with the entire next WFT sequence.
1204
+ pub(crate) fn ready_to_apply_next_wft(&self) -> bool {
1205
+ self.machines.ready_to_apply_next_wft()
1206
+ }
1207
+
607
1208
  /// If there are no pending jobs for the workflow, apply the next workflow task and check
608
1209
  /// again if there are any jobs. Importantly, does not *drain* jobs.
609
1210
  ///
610
1211
  /// Returns true if there are jobs (before or after applying the next WFT).
611
- async fn apply_next_task_if_ready(&mut self) -> Result<bool> {
1212
+ fn apply_next_task_if_ready(&mut self) -> Result<bool> {
612
1213
  if self.machines.has_pending_jobs() {
613
1214
  return Ok(true);
614
1215
  }
615
1216
  loop {
616
- let consumed_events = self.machines.apply_next_wft_from_history().await?;
1217
+ let consumed_events = self.machines.apply_next_wft_from_history()?;
617
1218
 
618
1219
  if consumed_events == 0 || !self.machines.replaying || self.machines.has_pending_jobs()
619
1220
  {
@@ -643,13 +1244,62 @@ impl WorkflowManager {
643
1244
 
644
1245
  /// Feed the workflow machines new commands issued by the executing workflow code, and iterate
645
1246
  /// the machines.
646
- async fn push_commands(&mut self, cmds: Vec<WFCommand>) -> Result<()> {
1247
+ fn push_commands_and_iterate(&mut self, cmds: Vec<WFCommand>) -> Result<()> {
647
1248
  if let Some(cs) = self.command_sink.as_mut() {
648
1249
  cs.send(cmds).map_err(|_| {
649
1250
  WFMachinesError::Fatal("Internal error buffering workflow commands".to_string())
650
1251
  })?;
651
1252
  }
652
- self.machines.iterate_machines().await?;
1253
+ self.machines.iterate_machines()?;
653
1254
  Ok(())
654
1255
  }
655
1256
  }
1257
+
1258
+ #[derive(Debug)]
1259
+ struct FulfillableActivationComplete {
1260
+ result: ActivationCompleteResult,
1261
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1262
+ }
1263
+ impl FulfillableActivationComplete {
1264
+ fn fulfill(self) {
1265
+ if let Some(resp_chan) = self.resp_chan {
1266
+ let _ = resp_chan.send(self.result);
1267
+ }
1268
+ }
1269
+ }
1270
+
1271
+ #[derive(Debug)]
1272
+ struct RunActivationCompletion {
1273
+ task_token: TaskToken,
1274
+ start_time: Instant,
1275
+ commands: Vec<WFCommand>,
1276
+ activation_was_eviction: bool,
1277
+ activation_was_only_eviction: bool,
1278
+ has_pending_query: bool,
1279
+ query_responses: Vec<QueryResult>,
1280
+ used_flags: Vec<u32>,
1281
+ /// Used to notify the worker when the completion is done processing and the completion can
1282
+ /// unblock. Must always be `Some` when initialized.
1283
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1284
+ }
1285
+ #[derive(Debug, derive_more::From)]
1286
+ enum ActOrFulfill {
1287
+ OutgoingAct(Option<ActivationOrAuto>),
1288
+ FulfillableComplete(Option<FulfillableActivationComplete>),
1289
+ }
1290
+
1291
+ #[derive(derive_more::DebugCustom)]
1292
+ #[debug(fmt = "RunUpdateErr({source:?})")]
1293
+ struct RunUpdateErr {
1294
+ source: WFMachinesError,
1295
+ complete_resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1296
+ }
1297
+
1298
+ impl From<WFMachinesError> for RunUpdateErr {
1299
+ fn from(e: WFMachinesError) -> Self {
1300
+ RunUpdateErr {
1301
+ source: e,
1302
+ complete_resp_chan: None,
1303
+ }
1304
+ }
1305
+ }