@temporalio/core-bridge 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/Cargo.lock +255 -48
  2. package/package.json +4 -4
  3. package/releases/aarch64-apple-darwin/index.node +0 -0
  4. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  5. package/releases/x86_64-apple-darwin/index.node +0 -0
  6. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  7. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  8. package/sdk-core/.buildkite/pipeline.yml +1 -3
  9. package/sdk-core/.cargo/config.toml +5 -2
  10. package/sdk-core/.github/workflows/heavy.yml +28 -0
  11. package/sdk-core/Cargo.toml +1 -1
  12. package/sdk-core/README.md +9 -5
  13. package/sdk-core/client/src/lib.rs +211 -36
  14. package/sdk-core/client/src/raw.rs +1 -1
  15. package/sdk-core/client/src/retry.rs +32 -20
  16. package/sdk-core/core/Cargo.toml +23 -9
  17. package/sdk-core/core/src/abstractions.rs +11 -0
  18. package/sdk-core/core/src/core_tests/activity_tasks.rs +6 -5
  19. package/sdk-core/core/src/core_tests/local_activities.rs +263 -22
  20. package/sdk-core/core/src/core_tests/queries.rs +2 -2
  21. package/sdk-core/core/src/core_tests/workflow_tasks.rs +249 -5
  22. package/sdk-core/core/src/ephemeral_server/mod.rs +5 -6
  23. package/sdk-core/core/src/lib.rs +2 -0
  24. package/sdk-core/core/src/protosext/mod.rs +1 -1
  25. package/sdk-core/core/src/telemetry/log_export.rs +1 -1
  26. package/sdk-core/core/src/telemetry/mod.rs +23 -8
  27. package/sdk-core/core/src/test_help/mod.rs +8 -1
  28. package/sdk-core/core/src/worker/activities/local_activities.rs +259 -125
  29. package/sdk-core/core/src/worker/activities.rs +3 -2
  30. package/sdk-core/core/src/worker/mod.rs +53 -26
  31. package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  32. package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  33. package/sdk-core/core/src/worker/workflow/history_update.rs +835 -277
  34. package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +9 -17
  35. package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +3 -5
  36. package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +1 -2
  37. package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +3 -5
  38. package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +1 -2
  39. package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +1 -2
  40. package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +1 -2
  41. package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +73 -51
  42. package/sdk-core/core/src/worker/workflow/machines/mod.rs +3 -3
  43. package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +4 -4
  44. package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +1 -2
  45. package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +3 -5
  46. package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +6 -7
  47. package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  48. package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +4 -4
  49. package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
  50. package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +89 -58
  51. package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +4 -7
  52. package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +21 -9
  53. package/sdk-core/core/src/worker/workflow/managed_run.rs +1021 -360
  54. package/sdk-core/core/src/worker/workflow/mod.rs +306 -346
  55. package/sdk-core/core/src/worker/workflow/run_cache.rs +29 -53
  56. package/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  57. package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  58. package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +115 -0
  59. package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  60. package/sdk-core/core/src/worker/workflow/workflow_stream.rs +444 -714
  61. package/sdk-core/core-api/Cargo.toml +2 -0
  62. package/sdk-core/core-api/src/errors.rs +1 -34
  63. package/sdk-core/core-api/src/lib.rs +6 -2
  64. package/sdk-core/core-api/src/worker.rs +14 -1
  65. package/sdk-core/etc/deps.svg +115 -140
  66. package/sdk-core/etc/regen-depgraph.sh +5 -0
  67. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +6 -6
  68. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +7 -3
  69. package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  70. package/sdk-core/protos/api_upstream/Makefile +5 -5
  71. package/sdk-core/protos/api_upstream/build/go.mod +7 -0
  72. package/sdk-core/protos/api_upstream/build/go.sum +5 -0
  73. package/sdk-core/protos/api_upstream/build/tools.go +29 -0
  74. package/sdk-core/protos/api_upstream/go.mod +6 -0
  75. package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
  76. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +12 -19
  77. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +2 -2
  78. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
  79. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -2
  80. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
  81. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +3 -3
  82. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +20 -2
  83. package/sdk-core/protos/api_upstream/temporal/api/{update/v1/message.proto → enums/v1/interaction_type.proto} +11 -18
  84. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
  85. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
  86. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
  87. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
  88. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
  89. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +2 -13
  90. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
  91. package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
  92. package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
  93. package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
  94. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +13 -19
  95. package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +87 -0
  96. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -2
  97. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +2 -2
  98. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
  99. package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
  100. package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
  101. package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
  102. package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
  103. package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
  104. package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
  105. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +13 -8
  106. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  107. package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +2 -0
  108. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
  109. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
  110. package/sdk-core/sdk/Cargo.toml +4 -3
  111. package/sdk-core/sdk/src/lib.rs +87 -21
  112. package/sdk-core/sdk/src/workflow_future.rs +7 -12
  113. package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  114. package/sdk-core/sdk-core-protos/build.rs +36 -2
  115. package/sdk-core/sdk-core-protos/src/history_builder.rs +26 -19
  116. package/sdk-core/sdk-core-protos/src/history_info.rs +4 -0
  117. package/sdk-core/sdk-core-protos/src/lib.rs +78 -34
  118. package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  119. package/sdk-core/test-utils/Cargo.toml +3 -1
  120. package/sdk-core/test-utils/src/histfetch.rs +1 -1
  121. package/sdk-core/test-utils/src/lib.rs +50 -18
  122. package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  123. package/sdk-core/test-utils/src/workflows.rs +29 -0
  124. package/sdk-core/tests/fuzzy_workflow.rs +130 -0
  125. package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +114 -7
  126. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -2
  127. package/sdk-core/tests/integ_tests/metrics_tests.rs +1 -1
  128. package/sdk-core/tests/integ_tests/polling_tests.rs +1 -39
  129. package/sdk-core/tests/integ_tests/queries_tests.rs +2 -127
  130. package/sdk-core/tests/integ_tests/visibility_tests.rs +52 -5
  131. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +74 -1
  132. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +5 -13
  133. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +1 -1
  134. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +2 -10
  135. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +69 -197
  136. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +4 -28
  137. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  138. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +14 -14
  139. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -19
  140. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +3 -19
  141. package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +1 -1
  142. package/sdk-core/tests/integ_tests/workflow_tests.rs +5 -6
  143. package/sdk-core/tests/main.rs +2 -12
  144. package/sdk-core/tests/runner.rs +71 -34
  145. package/sdk-core/tests/wf_input_replay.rs +32 -0
  146. package/sdk-core/bridge-ffi/Cargo.toml +0 -24
  147. package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
  148. package/sdk-core/bridge-ffi/build.rs +0 -25
  149. package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
  150. package/sdk-core/bridge-ffi/src/lib.rs +0 -746
  151. package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
  152. package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
  153. package/sdk-core/sdk/src/conversions.rs +0 -8
@@ -1,188 +1,228 @@
1
1
  #[cfg(test)]
2
2
  mod managed_wf_test;
3
3
 
4
+ #[cfg(test)]
5
+ pub(crate) use managed_wf_test::ManagedWFFunc;
6
+
4
7
  use crate::{
8
+ abstractions::dbg_panic,
9
+ protosext::WorkflowActivationExt,
5
10
  worker::{
6
11
  workflow::{
7
- machines::WorkflowMachines, ActivationAction, ActivationCompleteOutcome, HistoryUpdate,
8
- LocalResolution, NewIncomingWFT, OutgoingServerCommands, RequestEvictMsg, RunActions,
9
- RunActivationCompletion, RunUpdateResponse, ServerCommandsWithWorkflowInfo, WFCommand,
10
- WorkflowBridge,
12
+ history_update::HistoryPaginator, machines::WorkflowMachines, ActivationAction,
13
+ ActivationCompleteOutcome, ActivationCompleteResult, ActivationOrAuto,
14
+ EvictionRequestResult, FailedActivationWFTReport, HeartbeatTimeoutMsg, HistoryUpdate,
15
+ LocalActivityRequestSink, LocalResolution, NextPageReq, OutgoingServerCommands,
16
+ OutstandingActivation, OutstandingTask, PermittedWFT, RequestEvictMsg,
17
+ ServerCommandsWithWorkflowInfo, WFCommand, WFMachinesError, WFTReportStatus,
18
+ WorkflowBridge, WorkflowTaskInfo, WFT_HEARTBEAT_TIMEOUT_FRACTION,
11
19
  },
12
- LocalActRequest,
20
+ LocalActRequest, LEGACY_QUERY_ID,
13
21
  },
14
22
  MetricsContext,
15
23
  };
16
- use futures::{stream, StreamExt};
24
+ use futures_util::future::AbortHandle;
17
25
  use std::{
26
+ collections::HashSet,
18
27
  ops::Add,
19
- sync::mpsc::Sender,
28
+ sync::{mpsc::Sender, Arc},
20
29
  time::{Duration, Instant},
21
30
  };
22
- use temporal_sdk_core_api::errors::WFMachinesError;
23
- use temporal_sdk_core_protos::coresdk::{
24
- workflow_activation::{RemoveFromCache, WorkflowActivation},
25
- workflow_commands::QueryResult,
26
- };
27
- use tokio::{
28
- sync::{
29
- mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
30
- oneshot,
31
+ use temporal_sdk_core_protos::{
32
+ coresdk::{
33
+ workflow_activation::{
34
+ create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
35
+ workflow_activation_job, RemoveFromCache, WorkflowActivation,
36
+ },
37
+ workflow_commands::QueryResult,
38
+ workflow_completion,
31
39
  },
32
- task,
33
- task::JoinHandle,
40
+ temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure},
41
+ TaskToken,
34
42
  };
35
- use tokio_stream::wrappers::UnboundedReceiverStream;
43
+ use tokio::sync::oneshot;
36
44
  use tracing::Span;
37
- use tracing_futures::Instrument;
38
-
39
- use crate::worker::workflow::{
40
- ActivationCompleteResult, ActivationOrAuto, FailRunUpdate, FulfillableActivationComplete,
41
- GoodRunUpdate, LocalActivityRequestSink, RunAction, RunUpdateResponseKind,
42
- };
43
- use temporal_sdk_core_protos::TaskToken;
44
-
45
- use crate::abstractions::dbg_panic;
46
- #[cfg(test)]
47
- pub(crate) use managed_wf_test::ManagedWFFunc;
48
45
 
49
46
  type Result<T, E = WFMachinesError> = std::result::Result<T, E>;
50
- /// What percentage of a WFT timeout we are willing to wait before sending a WFT heartbeat when
51
- /// necessary.
52
- const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
47
+ pub(super) type RunUpdateAct = Option<ActivationOrAuto>;
53
48
 
49
+ /// Manages access to a specific workflow run. Everything inside is entirely synchronous and should
50
+ /// remain that way.
51
+ #[derive(derive_more::DebugCustom)]
52
+ #[debug(
53
+ fmt = "ManagedRun {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
54
+ trying_to_evict: {} }}",
55
+ wft,
56
+ activation,
57
+ buffered_resp,
58
+ "trying_to_evict.is_some()"
59
+ )]
54
60
  pub(super) struct ManagedRun {
55
61
  wfm: WorkflowManager,
56
- update_tx: UnboundedSender<RunUpdateResponse>,
57
- local_activity_request_sink: LocalActivityRequestSink,
62
+ /// Called when the machines need to produce local activity requests. This can't be lifted up
63
+ /// easily as return values, because sometimes local activity requests trigger immediate
64
+ /// resolutions (ex: too many attempts). Thus lifting it up creates a lot of unneeded complexity
65
+ /// pushing things out and then directly back in. The downside is this is the only "impure" part
66
+ /// of the in/out nature of workflow state management. If there's ever a sensible way to lift it
67
+ /// up, that'd be nice.
68
+ local_activity_request_sink: Arc<dyn LocalActivityRequestSink>,
69
+ /// Set if the run is currently waiting on the execution of some local activities.
58
70
  waiting_on_la: Option<WaitingOnLAs>,
59
- // Is set to true if the machines encounter an error and the only subsequent thing we should
60
- // do is be evicted.
71
+ /// Is set to true if the machines encounter an error and the only subsequent thing we should
72
+ /// do is be evicted.
61
73
  am_broken: bool,
62
- }
74
+ /// If set, the WFT this run is currently/will be processing.
75
+ wft: Option<OutstandingTask>,
76
+ /// An outstanding activation to lang
77
+ activation: Option<OutstandingActivation>,
78
+ /// If set, it indicates there is a buffered poll response from the server that applies to this
79
+ /// run. This can happen when lang takes too long to complete a task and the task times out, for
80
+ /// example. Upon next completion, the buffered response will be removed and can be made ready
81
+ /// to be returned from polling
82
+ buffered_resp: Option<PermittedWFT>,
83
+ /// Is set if an eviction has been requested for this run
84
+ trying_to_evict: Option<RequestEvictMsg>,
63
85
 
64
- /// If an activation completion needed to wait on LA completions (or heartbeat timeout) we use
65
- /// this struct to store the data we need to finish the completion once that has happened
66
- struct WaitingOnLAs {
67
- wft_timeout: Duration,
68
- /// If set, we are waiting for LAs to complete as part of a just-finished workflow activation.
69
- /// If unset, we already had a heartbeat timeout and got a new WFT without any new work while
70
- /// there are still incomplete LAs.
71
- completion_dat: Option<(
72
- CompletionDataForWFT,
73
- oneshot::Sender<ActivationCompleteResult>,
74
- )>,
75
- hb_chan: UnboundedSender<Span>,
76
- heartbeat_timeout_task: JoinHandle<()>,
77
- }
78
-
79
- #[derive(Debug)]
80
- struct CompletionDataForWFT {
81
- task_token: TaskToken,
82
- query_responses: Vec<QueryResult>,
83
- has_pending_query: bool,
84
- activation_was_only_eviction: bool,
86
+ /// We track if we have recorded useful debugging values onto a certain span yet, to overcome
87
+ /// duplicating field values. Remove this once https://github.com/tokio-rs/tracing/issues/2334
88
+ /// is fixed.
89
+ recorded_span_ids: HashSet<tracing::Id>,
90
+ metrics: MetricsContext,
91
+ /// We store the paginator used for our own run's history fetching
92
+ paginator: Option<HistoryPaginator>,
93
+ completion_waiting_on_page_fetch: Option<RunActivationCompletion>,
85
94
  }
86
-
87
95
  impl ManagedRun {
96
+ #[allow(clippy::too_many_arguments)] // Ok with this here. Nothing reusable to extract.
88
97
  pub(super) fn new(
89
- wfm: WorkflowManager,
90
- update_tx: UnboundedSender<RunUpdateResponse>,
91
- local_activity_request_sink: LocalActivityRequestSink,
98
+ history_update: HistoryUpdate,
99
+ namespace: String,
100
+ workflow_id: String,
101
+ workflow_type: String,
102
+ run_id: String,
103
+ local_activity_request_sink: Arc<dyn LocalActivityRequestSink>,
104
+ metrics: MetricsContext,
92
105
  ) -> Self {
106
+ let wfm = WorkflowManager::new(
107
+ history_update,
108
+ namespace,
109
+ workflow_id,
110
+ workflow_type,
111
+ run_id,
112
+ metrics.clone(),
113
+ );
93
114
  Self {
94
115
  wfm,
95
- update_tx,
96
116
  local_activity_request_sink,
97
117
  waiting_on_la: None,
98
118
  am_broken: false,
119
+ wft: None,
120
+ activation: None,
121
+ buffered_resp: None,
122
+ trying_to_evict: None,
123
+ recorded_span_ids: Default::default(),
124
+ metrics,
125
+ paginator: None,
126
+ completion_waiting_on_page_fetch: None,
99
127
  }
100
128
  }
101
129
 
102
- pub(super) async fn run(self, run_actions_rx: UnboundedReceiver<RunAction>) {
103
- let (heartbeat_tx, heartbeat_rx) = unbounded_channel();
104
- stream::select(
105
- UnboundedReceiverStream::new(run_actions_rx),
106
- UnboundedReceiverStream::new(heartbeat_rx).map(|trace_span| RunAction {
107
- action: RunActions::HeartbeatTimeout,
108
- trace_span,
109
- }),
110
- )
111
- .fold((self, heartbeat_tx), |(mut me, heartbeat_tx), action| {
112
- let span = action.trace_span;
113
- let action = action.action;
114
- let mut no_wft = false;
115
- async move {
116
- let res = match action {
117
- RunActions::NewIncomingWFT(wft) => me
118
- .incoming_wft(wft)
119
- .await
120
- .map(RunActionOutcome::AfterNewWFT),
121
- RunActions::ActivationCompletion(completion) => me
122
- .completion(completion, &heartbeat_tx)
123
- .await
124
- .map(RunActionOutcome::AfterCompletion),
125
- RunActions::CheckMoreWork {
126
- want_to_evict,
127
- has_pending_queries,
128
- has_wft,
129
- } => {
130
- if !has_wft {
131
- no_wft = true;
132
- }
133
- me.check_more_work(want_to_evict, has_pending_queries, has_wft)
134
- .await
135
- .map(RunActionOutcome::AfterCheckWork)
136
- }
137
- RunActions::LocalResolution(r) => me
138
- .local_resolution(r)
139
- .await
140
- .map(RunActionOutcome::AfterLocalResolution),
141
- RunActions::HeartbeatTimeout => {
142
- let maybe_act = if me.heartbeat_timeout() {
143
- Some(ActivationOrAuto::Autocomplete {
144
- run_id: me.wfm.machines.run_id.clone(),
145
- })
146
- } else {
147
- None
148
- };
149
- Ok(RunActionOutcome::AfterHeartbeatTimeout(maybe_act))
150
- }
151
- };
152
- match res {
153
- Ok(outcome) => {
154
- me.send_update_response(outcome, no_wft);
155
- }
156
- Err(e) => {
157
- error!(error=?e, "Error in run machines");
158
- me.am_broken = true;
159
- me.update_tx
160
- .send(RunUpdateResponse {
161
- kind: RunUpdateResponseKind::Fail(FailRunUpdate {
162
- run_id: me.wfm.machines.run_id.clone(),
163
- err: e.source,
164
- completion_resp: e.complete_resp_chan,
165
- }),
166
- span: Span::current(),
167
- })
168
- .expect("Machine can send update");
169
- }
170
- }
171
- (me, heartbeat_tx)
172
- }
173
- .instrument(span)
174
- })
175
- .await;
130
+ /// Returns true if there are pending jobs that need to be sent to lang.
131
+ pub(super) fn more_pending_work(&self) -> bool {
132
+ // We don't want to consider there to be more local-only work to be done if there is
133
+ // no workflow task associated with the run right now. This can happen if, ex, we
134
+ // complete a local activity while waiting for server to send us the next WFT.
135
+ // Activating lang would be harmful at this stage, as there might be work returned
136
+ // in that next WFT which should be part of the next activation.
137
+ self.wft.is_some() && self.wfm.machines.has_pending_jobs()
138
+ }
139
+
140
+ pub(super) fn have_seen_terminal_event(&self) -> bool {
141
+ self.wfm.machines.have_seen_terminal_event
142
+ }
143
+
144
+ /// Returns a ref to info about the currently tracked workflow task, if any.
145
+ pub(super) fn wft(&self) -> Option<&OutstandingTask> {
146
+ self.wft.as_ref()
147
+ }
148
+
149
+ /// Returns a ref to info about the currently tracked workflow activation, if any.
150
+ pub(super) fn activation(&self) -> Option<&OutstandingActivation> {
151
+ self.activation.as_ref()
152
+ }
153
+
154
+ /// Returns true if this run has already been told it will be evicted.
155
+ pub(super) fn is_trying_to_evict(&self) -> bool {
156
+ self.trying_to_evict.is_some()
176
157
  }
177
158
 
178
- async fn incoming_wft(
159
+ /// Called whenever a new workflow task is obtained for this run
160
+ pub(super) fn incoming_wft(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
161
+ let res = self._incoming_wft(pwft);
162
+ self.update_to_acts(res.map(Into::into), true)
163
+ }
164
+
165
+ fn _incoming_wft(
179
166
  &mut self,
180
- wft: NewIncomingWFT,
167
+ pwft: PermittedWFT,
181
168
  ) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
182
- let activation = if let Some(h) = wft.history_update {
183
- self.wfm.feed_history_from_server(h).await?
169
+ if self.wft.is_some() {
170
+ dbg_panic!("Trying to send a new WFT for a run which already has one!");
171
+ }
172
+ let start_time = Instant::now();
173
+
174
+ let work = pwft.work;
175
+ let did_miss_cache = !work.is_incremental() || !work.update.is_real();
176
+ debug!(
177
+ run_id = %work.execution.run_id,
178
+ task_token = %&work.task_token,
179
+ update = ?work.update,
180
+ has_legacy_query = %work.legacy_query.is_some(),
181
+ attempt = %work.attempt,
182
+ "Applying new workflow task from server"
183
+ );
184
+ let wft_info = WorkflowTaskInfo {
185
+ attempt: work.attempt,
186
+ task_token: work.task_token,
187
+ wf_id: work.execution.workflow_id.clone(),
188
+ };
189
+
190
+ let legacy_query_from_poll = work
191
+ .legacy_query
192
+ .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
193
+
194
+ let mut pending_queries = work.query_requests;
195
+ if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
196
+ error!(
197
+ "Server issued both normal and legacy queries. This should not happen. Please \
198
+ file a bug report."
199
+ );
200
+ return Err(RunUpdateErr {
201
+ source: WFMachinesError::Fatal(
202
+ "Server issued both normal and legacy query".to_string(),
203
+ ),
204
+ complete_resp_chan: None,
205
+ });
206
+ }
207
+ if let Some(lq) = legacy_query_from_poll {
208
+ pending_queries.push(lq);
209
+ }
210
+
211
+ self.paginator = Some(pwft.paginator);
212
+ self.wft = Some(OutstandingTask {
213
+ info: wft_info,
214
+ hit_cache: !did_miss_cache,
215
+ pending_queries,
216
+ start_time,
217
+ permit: pwft.permit,
218
+ });
219
+
220
+ // The update field is only populated in the event we hit the cache
221
+ let activation = if work.update.is_real() {
222
+ self.metrics.sticky_cache_hit();
223
+ self.wfm.feed_history_from_server(work.update)?
184
224
  } else {
185
- let r = self.wfm.get_next_activation().await?;
225
+ let r = self.wfm.get_next_activation()?;
186
226
  if r.jobs.is_empty() {
187
227
  return Err(RunUpdateErr {
188
228
  source: WFMachinesError::Fatal(format!(
@@ -197,16 +237,17 @@ impl ManagedRun {
197
237
 
198
238
  if activation.jobs.is_empty() {
199
239
  if self.wfm.machines.outstanding_local_activity_count() > 0 {
200
- // If the activation has no jobs but there are outstanding LAs, we need to restart the
201
- // WFT heartbeat.
240
+ // If the activation has no jobs but there are outstanding LAs, we need to restart
241
+ // the WFT heartbeat.
202
242
  if let Some(ref mut lawait) = self.waiting_on_la {
203
243
  if lawait.completion_dat.is_some() {
204
244
  panic!("Should not have completion dat when getting new wft & empty jobs")
205
245
  }
206
- lawait.heartbeat_timeout_task.abort();
207
- lawait.heartbeat_timeout_task = start_heartbeat_timeout_task(
208
- lawait.hb_chan.clone(),
209
- wft.start_time,
246
+ lawait.hb_timeout_handle.abort();
247
+ lawait.hb_timeout_handle = sink_heartbeat_timeout_start(
248
+ self.wfm.machines.run_id.clone(),
249
+ self.local_activity_request_sink.as_ref(),
250
+ start_time,
210
251
  lawait.wft_timeout,
211
252
  );
212
253
  // No activation needs to be sent to lang. We just need to wait for another
@@ -228,41 +269,332 @@ impl ManagedRun {
228
269
  Ok(Some(ActivationOrAuto::LangActivation(activation)))
229
270
  }
230
271
 
231
- async fn completion(
272
+ /// Deletes the currently tracked WFT & records latency metrics. Should be called after it has
273
+ /// been responded to (server has been told). Returns the WFT if there was one.
274
+ pub(super) fn mark_wft_complete(
232
275
  &mut self,
233
- mut completion: RunActivationCompletion,
234
- heartbeat_tx: &UnboundedSender<Span>,
235
- ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
236
- let resp_chan = completion
237
- .resp_chan
238
- .take()
239
- .expect("Completion response channel must be populated");
240
-
241
- let outcome = async move {
242
- // Send commands from lang into the machines then check if the workflow run
243
- // needs another activation and mark it if so
244
- self.wfm.push_commands(completion.commands).await?;
245
- // Don't bother applying the next task if we're evicting at the end of
246
- // this activation
247
- if !completion.activation_was_eviction {
248
- self.wfm.apply_next_task_if_ready().await?;
276
+ report_status: WFTReportStatus,
277
+ ) -> Option<OutstandingTask> {
278
+ debug!("Marking WFT completed");
279
+ let retme = self.wft.take();
280
+
281
+ // Only record latency metrics if we genuinely reported to server
282
+ if matches!(report_status, WFTReportStatus::Reported) {
283
+ if let Some(ot) = &retme {
284
+ self.metrics.wf_task_latency(ot.start_time.elapsed());
249
285
  }
250
- let new_local_acts = self.wfm.drain_queued_local_activities();
286
+ }
287
+
288
+ retme
289
+ }
290
+
291
+ /// Checks if any further activations need to go out for this run and produces them if so.
292
+ pub(super) fn check_more_activations(&mut self) -> RunUpdateAct {
293
+ let res = self._check_more_activations();
294
+ self.update_to_acts(res.map(Into::into), false)
295
+ }
296
+
297
+ fn _check_more_activations(&mut self) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
298
+ // No point in checking for more activations if there's already an outstanding activation.
299
+ if self.activation.is_some() {
300
+ return Ok(None);
301
+ }
302
+ // In the event it's time to evict this run, cancel any outstanding LAs
303
+ if self.trying_to_evict.is_some() {
304
+ self.sink_la_requests(vec![LocalActRequest::CancelAllInRun(
305
+ self.wfm.machines.run_id.clone(),
306
+ )])?;
307
+ }
308
+
309
+ if self.wft.is_none() {
310
+ // It doesn't make sense to do workflow work unless we have a WFT
311
+ return Ok(None);
312
+ }
313
+
314
+ if self.wfm.machines.has_pending_jobs() && !self.am_broken {
315
+ Ok(Some(ActivationOrAuto::LangActivation(
316
+ self.wfm.get_next_activation()?,
317
+ )))
318
+ } else {
319
+ if !self.am_broken {
320
+ let has_pending_queries = self
321
+ .wft
322
+ .as_ref()
323
+ .map(|wft| !wft.pending_queries.is_empty())
324
+ .unwrap_or_default();
325
+ if has_pending_queries {
326
+ return Ok(Some(ActivationOrAuto::ReadyForQueries(
327
+ self.wfm.machines.get_wf_activation(),
328
+ )));
329
+ }
330
+ }
331
+ if let Some(wte) = self.trying_to_evict.clone() {
332
+ let mut act = self.wfm.machines.get_wf_activation();
333
+ // No other jobs make any sense to send if we encountered an error.
334
+ if self.am_broken {
335
+ act.jobs = vec![];
336
+ }
337
+ act.append_evict_job(RemoveFromCache {
338
+ message: wte.message,
339
+ reason: wte.reason as i32,
340
+ });
341
+ Ok(Some(ActivationOrAuto::LangActivation(act)))
342
+ } else {
343
+ Ok(None)
344
+ }
345
+ }
346
+ }
251
347
 
252
- let immediate_resolutions = (self.local_activity_request_sink)(new_local_acts);
253
- for resolution in immediate_resolutions {
254
- self.wfm
255
- .notify_of_local_result(LocalResolution::LocalActivity(resolution))?;
348
+ /// Called whenever lang successfully completes a workflow activation. Commands produced by the
349
+ /// activation are passed in. `resp_chan` will be used to unblock the completion call when
350
+ /// everything we need to do to fulfill it has happened.
351
+ ///
352
+ /// Can return an error in the event that another page of history needs to be fetched before
353
+ /// the completion can proceed.
354
+ pub(super) fn successful_completion(
355
+ &mut self,
356
+ mut commands: Vec<WFCommand>,
357
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
358
+ ) -> Result<RunUpdateAct, NextPageReq> {
359
+ let activation_was_only_eviction = self.activation_has_only_eviction();
360
+ let (task_token, has_pending_query, start_time) = if let Some(entry) = self.wft.as_ref() {
361
+ (
362
+ entry.info.task_token.clone(),
363
+ !entry.pending_queries.is_empty(),
364
+ entry.start_time,
365
+ )
366
+ } else {
367
+ if !activation_was_only_eviction {
368
+ // Not an error if this was an eviction, since it's normal to issue eviction
369
+ // activations without an associated workflow task in that case.
370
+ dbg_panic!(
371
+ "Attempted to complete activation for run {} without associated workflow task",
372
+ self.run_id()
373
+ );
256
374
  }
375
+ self.reply_to_complete(ActivationCompleteOutcome::DoNothing, resp_chan);
376
+ return Ok(None);
377
+ };
257
378
 
258
- let data = CompletionDataForWFT {
259
- task_token: completion.task_token,
260
- query_responses: completion.query_responses,
261
- has_pending_query: completion.has_pending_query,
262
- activation_was_only_eviction: completion.activation_was_only_eviction,
379
+ // If the only command from the activation is a legacy query response, that means we need
380
+ // to respond differently than a typical activation.
381
+ if matches!(&commands.as_slice(),
382
+ &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
383
+ {
384
+ let qr = match commands.remove(0) {
385
+ WFCommand::QueryResponse(qr) => qr,
386
+ _ => unreachable!("We just verified this is the only command"),
263
387
  };
388
+ self.reply_to_complete(
389
+ ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
390
+ task_token,
391
+ action: ActivationAction::RespondLegacyQuery {
392
+ result: Box::new(qr),
393
+ },
394
+ }),
395
+ resp_chan,
396
+ );
397
+ Ok(None)
398
+ } else {
399
+ // First strip out query responses from other commands that actually affect machines
400
+ // Would be prettier with `drain_filter`
401
+ let mut i = 0;
402
+ let mut query_responses = vec![];
403
+ while i < commands.len() {
404
+ if matches!(commands[i], WFCommand::QueryResponse(_)) {
405
+ if let WFCommand::QueryResponse(qr) = commands.remove(i) {
406
+ query_responses.push(qr);
407
+ }
408
+ } else {
409
+ i += 1;
410
+ }
411
+ }
412
+
413
+ if activation_was_only_eviction && !commands.is_empty() {
414
+ dbg_panic!("Reply to an eviction only containing an eviction included commands");
415
+ }
416
+
417
+ let rac = RunActivationCompletion {
418
+ task_token,
419
+ start_time,
420
+ commands,
421
+ activation_was_eviction: self.activation_has_eviction(),
422
+ activation_was_only_eviction,
423
+ has_pending_query,
424
+ query_responses,
425
+ resp_chan,
426
+ };
427
+
428
+ // Verify we can actually apply the next workflow task, which will happen as part of
429
+ // applying the completion to machines. If we can't, return early indicating we need
430
+ // to fetch a page.
431
+ if !self.wfm.ready_to_apply_next_wft() {
432
+ return if let Some(paginator) = self.paginator.take() {
433
+ debug!("Need to fetch a history page before next WFT can be applied");
434
+ self.completion_waiting_on_page_fetch = Some(rac);
435
+ Err(NextPageReq {
436
+ paginator,
437
+ span: Span::current(),
438
+ })
439
+ } else {
440
+ Ok(self.update_to_acts(
441
+ Err(RunUpdateErr {
442
+ source: WFMachinesError::Fatal(
443
+ "Run's paginator was absent when attempting to fetch next history \
444
+ page. This is a Core SDK bug."
445
+ .to_string(),
446
+ ),
447
+ complete_resp_chan: rac.resp_chan,
448
+ }),
449
+ false,
450
+ ))
451
+ };
452
+ }
453
+
454
+ Ok(self.process_completion(rac))
455
+ }
456
+ }
457
+
458
+ /// Called after the higher-up machinery has fetched more pages of event history needed to apply
459
+ /// the next workflow task. The history update and paginator used to perform the fetch are
460
+ /// passed in, with the update being used to apply the task, and the paginator stored to be
461
+ /// attached with another fetch request if needed.
462
+ pub(super) fn fetched_page_completion(
463
+ &mut self,
464
+ update: HistoryUpdate,
465
+ paginator: HistoryPaginator,
466
+ ) -> RunUpdateAct {
467
+ let res = self._fetched_page_completion(update, paginator);
468
+ self.update_to_acts(res.map(Into::into), false)
469
+ }
470
+ fn _fetched_page_completion(
471
+ &mut self,
472
+ update: HistoryUpdate,
473
+ paginator: HistoryPaginator,
474
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
475
+ self.paginator = Some(paginator);
476
+ if let Some(d) = self.completion_waiting_on_page_fetch.take() {
477
+ self._process_completion(d, Some(update))
478
+ } else {
479
+ dbg_panic!(
480
+ "Shouldn't be possible to be applying a next-page-fetch update when \
481
+ doing anything other than completing an activation."
482
+ );
483
+ Err(RunUpdateErr::from(WFMachinesError::Fatal(
484
+ "Tried to apply next-page-fetch update to a run that wasn't handling a completion"
485
+ .to_string(),
486
+ )))
487
+ }
488
+ }
489
+
490
+ /// Called whenever either core or lang cannot complete a workflow activation. EX: Nondeterminism
491
+ /// or user code threw/panicked, respectively. The `cause` and `reason` fields are determined
492
+ /// inside core always. The `failure` field may come from lang. `resp_chan` will be used to
493
+ /// unblock the completion call when everything we need to do to fulfill it has happened.
494
+ pub(super) fn failed_completion(
495
+ &mut self,
496
+ cause: WorkflowTaskFailedCause,
497
+ reason: EvictionReason,
498
+ failure: workflow_completion::Failure,
499
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
500
+ ) -> RunUpdateAct {
501
+ let tt = if let Some(tt) = self.wft.as_ref().map(|t| t.info.task_token.clone()) {
502
+ tt
503
+ } else {
504
+ dbg_panic!(
505
+ "No workflow task for run id {} found when trying to fail activation",
506
+ self.run_id()
507
+ );
508
+ self.reply_to_complete(ActivationCompleteOutcome::DoNothing, resp_chan);
509
+ return None;
510
+ };
511
+
512
+ self.metrics.wf_task_failed();
513
+ let message = format!("Workflow activation completion failed: {:?}", &failure);
514
+ // Blow up any cached data associated with the workflow
515
+ let evict_req_outcome = self.request_eviction(RequestEvictMsg {
516
+ run_id: self.run_id().to_string(),
517
+ message,
518
+ reason,
519
+ });
520
+ let should_report = match &evict_req_outcome {
521
+ EvictionRequestResult::EvictionRequested(Some(attempt), _)
522
+ | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => *attempt <= 1,
523
+ _ => false,
524
+ };
525
+ let rur = evict_req_outcome.into_run_update_resp();
526
+ // If the outstanding WFT is a legacy query task, report that we need to fail it
527
+ let outcome = if self.pending_work_is_legacy_query() {
528
+ ActivationCompleteOutcome::ReportWFTFail(
529
+ FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
530
+ )
531
+ } else if should_report {
532
+ ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
533
+ tt, cause, failure,
534
+ ))
535
+ } else {
536
+ ActivationCompleteOutcome::WFTFailedDontReport
537
+ };
538
+ self.reply_to_complete(outcome, resp_chan);
539
+ rur
540
+ }
541
+
542
+ /// Delete the currently tracked workflow activation and return it, if any. Should be called
543
+ /// after the processing of the activation completion, and WFT reporting.
544
+ pub(super) fn delete_activation(&mut self) -> Option<OutstandingActivation> {
545
+ self.activation.take()
546
+ }
547
+
548
+ /// Called when local activities resolve
549
+ pub(super) fn local_resolution(&mut self, res: LocalResolution) -> RunUpdateAct {
550
+ let res = self._local_resolution(res);
551
+ self.update_to_acts(res.map(Into::into), false)
552
+ }
553
+
554
+ fn process_completion(&mut self, completion: RunActivationCompletion) -> RunUpdateAct {
555
+ let res = self._process_completion(completion, None);
556
+ self.update_to_acts(res.map(Into::into), false)
557
+ }
558
+
559
+ fn _process_completion(
560
+ &mut self,
561
+ completion: RunActivationCompletion,
562
+ new_update: Option<HistoryUpdate>,
563
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
564
+ let data = CompletionDataForWFT {
565
+ task_token: completion.task_token,
566
+ query_responses: completion.query_responses,
567
+ has_pending_query: completion.has_pending_query,
568
+ activation_was_only_eviction: completion.activation_was_only_eviction,
569
+ };
570
+
571
+ // If this is just bookkeeping after a reply to an only-eviction activation, we can bypass
572
+ // everything, since there is no reason to continue trying to update machines.
573
+ if completion.activation_was_only_eviction {
574
+ return Ok(Some(self.prepare_complete_resp(
575
+ completion.resp_chan,
576
+ data,
577
+ false,
578
+ )));
579
+ }
580
+
581
+ let outcome = (|| {
582
+ // Send commands from lang into the machines then check if the workflow run needs
583
+ // another activation and mark it if so
584
+ self.wfm.push_commands_and_iterate(completion.commands)?;
585
+ // If there was a new update included as part of the completion, apply it.
586
+ if let Some(update) = new_update {
587
+ self.wfm.feed_history_from_new_page(update)?;
588
+ }
589
+ // Don't bother applying the next task if we're evicting at the end of this activation
590
+ if !completion.activation_was_eviction {
591
+ self.wfm.apply_next_task_if_ready()?;
592
+ }
593
+ let new_local_acts = self.wfm.drain_queued_local_activities();
594
+ self.sink_la_requests(new_local_acts)?;
595
+
264
596
  if self.wfm.machines.outstanding_local_activity_count() == 0 {
265
- Ok((None, data, self))
597
+ Ok(None)
266
598
  } else {
267
599
  let wft_timeout: Duration = self
268
600
  .wfm
@@ -275,28 +607,26 @@ impl ManagedRun {
275
607
  .to_string(),
276
608
  )
277
609
  })?;
278
- let heartbeat_tx = heartbeat_tx.clone();
279
- Ok((
280
- Some((heartbeat_tx, completion.start_time, wft_timeout)),
281
- data,
282
- self,
283
- ))
610
+ Ok(Some((completion.start_time, wft_timeout)))
284
611
  }
285
- }
286
- .await;
612
+ })();
287
613
 
288
614
  match outcome {
289
- Ok((None, data, me)) => Ok(Some(me.prepare_complete_resp(resp_chan, data, false))),
290
- Ok((Some((chan, start_t, wft_timeout)), data, me)) => {
291
- if let Some(wola) = me.waiting_on_la.as_mut() {
292
- wola.heartbeat_timeout_task.abort();
615
+ Ok(None) => Ok(Some(self.prepare_complete_resp(
616
+ completion.resp_chan,
617
+ data,
618
+ false,
619
+ ))),
620
+ Ok(Some((start_t, wft_timeout))) => {
621
+ if let Some(wola) = self.waiting_on_la.as_mut() {
622
+ wola.hb_timeout_handle.abort();
293
623
  }
294
- me.waiting_on_la = Some(WaitingOnLAs {
624
+ self.waiting_on_la = Some(WaitingOnLAs {
295
625
  wft_timeout,
296
- completion_dat: Some((data, resp_chan)),
297
- hb_chan: chan.clone(),
298
- heartbeat_timeout_task: start_heartbeat_timeout_task(
299
- chan,
626
+ completion_dat: Some((data, completion.resp_chan)),
627
+ hb_timeout_handle: sink_heartbeat_timeout_start(
628
+ self.run_id().to_string(),
629
+ self.local_activity_request_sink.as_ref(),
300
630
  start_t,
301
631
  wft_timeout,
302
632
  ),
@@ -305,72 +635,342 @@ impl ManagedRun {
305
635
  }
306
636
  Err(e) => Err(RunUpdateErr {
307
637
  source: e,
308
- complete_resp_chan: Some(resp_chan),
638
+ complete_resp_chan: completion.resp_chan,
309
639
  }),
310
640
  }
311
641
  }
312
642
 
313
- async fn check_more_work(
643
+ fn _local_resolution(
314
644
  &mut self,
315
- want_to_evict: Option<RequestEvictMsg>,
316
- has_pending_queries: bool,
317
- has_wft: bool,
318
- ) -> Result<Option<ActivationOrAuto>, RunUpdateErr> {
319
- if !has_wft {
320
- // It doesn't make sense to do work unless we have a WFT
321
- return Ok(None);
645
+ res: LocalResolution,
646
+ ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
647
+ debug!(resolution=?res, "Applying local resolution");
648
+ self.wfm.notify_of_local_result(res)?;
649
+ if self.wfm.machines.outstanding_local_activity_count() == 0 {
650
+ if let Some(mut wait_dat) = self.waiting_on_la.take() {
651
+ // Cancel the heartbeat timeout
652
+ wait_dat.hb_timeout_handle.abort();
653
+ if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
654
+ return Ok(Some(self.prepare_complete_resp(
655
+ resp_chan,
656
+ completion_dat,
657
+ false,
658
+ )));
659
+ }
660
+ }
322
661
  }
323
- if self.wfm.machines.has_pending_jobs() && !self.am_broken {
324
- Ok(Some(ActivationOrAuto::LangActivation(
325
- self.wfm.get_next_activation().await?,
326
- )))
662
+ Ok(None)
663
+ }
664
+
665
+ pub(super) fn heartbeat_timeout(&mut self) -> RunUpdateAct {
666
+ let maybe_act = if self._heartbeat_timeout() {
667
+ Some(ActivationOrAuto::Autocomplete {
668
+ run_id: self.wfm.machines.run_id.clone(),
669
+ })
327
670
  } else {
328
- if has_pending_queries && !self.am_broken {
329
- return Ok(Some(ActivationOrAuto::ReadyForQueries(
330
- self.wfm.machines.get_wf_activation(),
331
- )));
671
+ None
672
+ };
673
+ self.update_to_acts(Ok(maybe_act).map(Into::into), false)
674
+ }
675
+ /// Returns `true` if autocompletion should be issued, which will actually cause us to end up
676
+ /// in [completion] again, at which point we'll start a new heartbeat timeout, which will
677
+ /// immediately trigger and thus finish the completion, forcing a new task as it should.
678
+ fn _heartbeat_timeout(&mut self) -> bool {
679
+ if let Some(ref mut wait_dat) = self.waiting_on_la {
680
+ // Cancel the heartbeat timeout
681
+ wait_dat.hb_timeout_handle.abort();
682
+ if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
683
+ let compl = self.prepare_complete_resp(resp_chan, completion_dat, true);
684
+ // Immediately fulfill the completion since the run update will already have
685
+ // been replied to
686
+ compl.fulfill();
687
+ } else {
688
+ // Auto-reply WFT complete
689
+ return true;
332
690
  }
333
- if let Some(wte) = want_to_evict {
334
- let mut act = self.wfm.machines.get_wf_activation();
335
- // No other jobs make any sense to send if we encountered an error.
336
- if self.am_broken {
337
- act.jobs = vec![];
338
- }
339
- act.append_evict_job(RemoveFromCache {
340
- message: wte.message,
341
- reason: wte.reason as i32,
342
- });
343
- Ok(Some(ActivationOrAuto::LangActivation(act)))
691
+ } else {
692
+ // If a heartbeat timeout happened, we should always have been waiting on LAs
693
+ dbg_panic!("WFT heartbeat timeout fired but we were not waiting on any LAs");
694
+ }
695
+ false
696
+ }
697
+
698
+ /// Returns true if the managed run has any form of pending work.
699
+ /// If `ignore_evicts` is true, pending evictions do not count as pending work.
700
+ /// If `ignore_buffered` is true, buffered workflow tasks do not count as pending work.
701
+ pub(super) fn has_any_pending_work(&self, ignore_evicts: bool, ignore_buffered: bool) -> bool {
702
+ let evict_work = if ignore_evicts {
703
+ false
704
+ } else {
705
+ self.trying_to_evict.is_some()
706
+ };
707
+ let act_work = if ignore_evicts {
708
+ if let Some(ref act) = self.activation {
709
+ !act.has_only_eviction()
344
710
  } else {
345
- Ok(None)
711
+ false
712
+ }
713
+ } else {
714
+ self.activation.is_some()
715
+ };
716
+ let buffered = if ignore_buffered {
717
+ false
718
+ } else {
719
+ self.buffered_resp.is_some()
720
+ };
721
+ trace!(wft=self.wft.is_some(), buffered=?buffered, more_work=?self.more_pending_work(),
722
+ act_work, evict_work, "Does run have pending work?");
723
+ self.wft.is_some() || buffered || self.more_pending_work() || act_work || evict_work
724
+ }
725
+
726
+ /// Stores some work if there is any outstanding WFT or activation for the run. If there was
727
+ /// not, returns the work back out inside the option.
728
+ pub(super) fn buffer_wft_if_outstanding_work(
729
+ &mut self,
730
+ work: PermittedWFT,
731
+ ) -> Option<PermittedWFT> {
732
+ let about_to_issue_evict = self.trying_to_evict.is_some();
733
+ let has_wft = self.wft().is_some();
734
+ let has_activation = self.activation().is_some();
735
+ if has_wft || has_activation || about_to_issue_evict || self.more_pending_work() {
736
+ debug!(run_id = %self.run_id(),
737
+ "Got new WFT for a run with outstanding work, buffering it");
738
+ self.buffered_resp = Some(work);
739
+ None
740
+ } else {
741
+ Some(work)
742
+ }
743
+ }
744
+
745
+ /// Returns true if there is a buffered workflow task for this run.
746
+ pub(super) fn has_buffered_wft(&self) -> bool {
747
+ self.buffered_resp.is_some()
748
+ }
749
+
750
+ /// Removes and returns the buffered workflow task, if any.
751
+ pub(super) fn take_buffered_wft(&mut self) -> Option<PermittedWFT> {
752
+ self.buffered_resp.take()
753
+ }
754
+
755
+ pub(super) fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
756
+ let attempts = self.wft.as_ref().map(|wt| wt.info.attempt);
757
+
758
+ // If we were waiting on a page fetch and we're getting evicted because fetching failed,
759
+ // then make sure we allow the completion to proceed, otherwise we're stuck waiting forever.
760
+ if self.completion_waiting_on_page_fetch.is_some()
761
+ && matches!(info.reason, EvictionReason::PaginationOrHistoryFetch)
762
+ {
763
+ // We just checked it is some, unwrap OK.
764
+ let c = self.completion_waiting_on_page_fetch.take().unwrap();
765
+ let run_upd = self.failed_completion(
766
+ WorkflowTaskFailedCause::Unspecified,
767
+ info.reason,
768
+ Failure::application_failure(info.message, false).into(),
769
+ c.resp_chan,
770
+ );
771
+ return EvictionRequestResult::EvictionRequested(attempts, run_upd);
772
+ }
773
+
774
+ if !self.activation_has_eviction() && self.trying_to_evict.is_none() {
775
+ debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
776
+ self.trying_to_evict = Some(info);
777
+ EvictionRequestResult::EvictionRequested(attempts, self.check_more_activations())
778
+ } else {
779
+ EvictionRequestResult::EvictionAlreadyRequested(attempts)
780
+ }
781
+ }
782
+
783
+ pub(super) fn record_span_fields(&mut self, span: &Span) {
784
+ if let Some(spid) = span.id() {
785
+ if self.recorded_span_ids.contains(&spid) {
786
+ return;
787
+ }
788
+ self.recorded_span_ids.insert(spid);
789
+
790
+ if let Some(wid) = self.wft().map(|wft| &wft.info.wf_id) {
791
+ span.record("workflow_id", wid.as_str());
792
+ }
793
+ }
794
+ }
795
+
796
+ /// Take the result of some update to ourselves and turn it into a return value of zero or more
797
+ /// actions
798
+ fn update_to_acts(
799
+ &mut self,
800
+ outcome: Result<ActOrFulfill, RunUpdateErr>,
801
+ in_response_to_wft: bool,
802
+ ) -> RunUpdateAct {
803
+ match outcome {
804
+ Ok(act_or_fulfill) => {
805
+ let (mut maybe_act, maybe_fulfill) = match act_or_fulfill {
806
+ ActOrFulfill::OutgoingAct(a) => (a, None),
807
+ ActOrFulfill::FulfillableComplete(c) => (None, c),
808
+ };
809
+ // If there's no activation but there is pending work, check and possibly generate one
810
+ if self.more_pending_work() && maybe_act.is_none() {
811
+ match self._check_more_activations() {
812
+ Ok(oa) => maybe_act = oa,
813
+ Err(e) => {
814
+ return self.update_to_acts(Err(e), in_response_to_wft);
815
+ }
816
+ }
817
+ }
818
+ let r = match maybe_act {
819
+ Some(ActivationOrAuto::LangActivation(mut activation)) => {
820
+ if in_response_to_wft {
821
+ let wft = self
822
+ .wft
823
+ .as_mut()
824
+ .expect("WFT must exist for run just updated with one");
825
+ // If there are in-poll queries, insert jobs for those queries into the
826
+ // activation, but only if we hit the cache. If we didn't, those queries
827
+ // will need to be dealt with once replay is over
828
+ if wft.hit_cache {
829
+ put_queries_in_act(&mut activation, wft);
830
+ }
831
+ }
832
+
833
+ if activation.jobs.is_empty() {
834
+ dbg_panic!("Should not send lang activation with no jobs");
835
+ }
836
+ Some(ActivationOrAuto::LangActivation(activation))
837
+ }
838
+ Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
839
+ if let Some(wft) = self.wft.as_mut() {
840
+ put_queries_in_act(&mut act, wft);
841
+ Some(ActivationOrAuto::LangActivation(act))
842
+ } else {
843
+ dbg_panic!("Ready for queries but no WFT!");
844
+ None
845
+ }
846
+ }
847
+ a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
848
+ None => {
849
+ if let Some(reason) = self.trying_to_evict.as_ref() {
850
+ // If we had nothing to do, but we're trying to evict, just do that now
851
+ // as long as there's no other outstanding work.
852
+ if self.activation.is_none() && !self.more_pending_work() {
853
+ let mut evict_act = create_evict_activation(
854
+ self.run_id().to_string(),
855
+ reason.message.clone(),
856
+ reason.reason,
857
+ );
858
+ evict_act.history_length =
859
+ self.most_recently_processed_event_number() as u32;
860
+ Some(ActivationOrAuto::LangActivation(evict_act))
861
+ } else {
862
+ None
863
+ }
864
+ } else {
865
+ None
866
+ }
867
+ }
868
+ };
869
+ if let Some(f) = maybe_fulfill {
870
+ f.fulfill();
871
+ }
872
+
873
+ match r {
874
+ // After each run update, check if it's ready to handle any buffered poll
875
+ None | Some(ActivationOrAuto::Autocomplete { .. })
876
+ if !self.has_any_pending_work(false, true) =>
877
+ {
878
+ if let Some(bufft) = self.buffered_resp.take() {
879
+ self.incoming_wft(bufft)
880
+ } else {
881
+ None
882
+ }
883
+ }
884
+ Some(r) => {
885
+ self.insert_outstanding_activation(&r);
886
+ Some(r)
887
+ }
888
+ None => None,
889
+ }
890
+ }
891
+ Err(fail) => {
892
+ self.am_broken = true;
893
+ let rur = if let Some(resp_chan) = fail.complete_resp_chan {
894
+ // Automatically fail the workflow task in the event we couldn't update machines
895
+ let fail_cause = if matches!(&fail.source, WFMachinesError::Nondeterminism(_)) {
896
+ WorkflowTaskFailedCause::NonDeterministicError
897
+ } else {
898
+ WorkflowTaskFailedCause::Unspecified
899
+ };
900
+ let wft_fail_str = format!("{:?}", fail.source);
901
+ self.failed_completion(
902
+ fail_cause,
903
+ fail.source.evict_reason(),
904
+ Failure::application_failure(wft_fail_str, false).into(),
905
+ Some(resp_chan),
906
+ )
907
+ } else {
908
+ warn!(error=?fail.source, "Error while updating workflow");
909
+ self.request_eviction(RequestEvictMsg {
910
+ run_id: self.run_id().to_string(),
911
+ message: format!("Error while updating workflow: {:?}", fail.source),
912
+ reason: fail.source.evict_reason(),
913
+ })
914
+ .into_run_update_resp()
915
+ };
916
+ rur
917
+ }
918
+ }
919
+ }
920
+
921
+ fn insert_outstanding_activation(&mut self, act: &ActivationOrAuto) {
922
+ let act_type = match &act {
923
+ ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
924
+ if act.is_legacy_query() {
925
+ OutstandingActivation::LegacyQuery
926
+ } else {
927
+ OutstandingActivation::Normal {
928
+ contains_eviction: act.eviction_index().is_some(),
929
+ num_jobs: act.jobs.len(),
930
+ }
931
+ }
346
932
  }
933
+ ActivationOrAuto::Autocomplete { .. } => OutstandingActivation::Autocomplete,
934
+ };
935
+ if let Some(old_act) = self.activation {
936
+ // This is a panic because we have screwed up core logic if this is violated. It must be
937
+ // upheld.
938
+ panic!(
939
+ "Attempted to insert a new outstanding activation {act:?}, but there already was \
940
+ one outstanding: {old_act:?}"
941
+ );
347
942
  }
943
+ self.activation = Some(act_type);
348
944
  }
349
945
 
350
946
  fn prepare_complete_resp(
351
947
  &mut self,
352
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
948
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
353
949
  data: CompletionDataForWFT,
354
950
  due_to_heartbeat_timeout: bool,
355
951
  ) -> FulfillableActivationComplete {
356
952
  let outgoing_cmds = self.wfm.get_server_commands();
953
+ if data.activation_was_only_eviction && !outgoing_cmds.commands.is_empty() {
954
+ dbg_panic!(
955
+ "There should not be any outgoing commands when preparing a completion response \
956
+ if the activation was only an eviction. This is an SDK bug."
957
+ );
958
+ }
959
+
357
960
  let query_responses = data.query_responses;
358
961
  let has_query_responses = !query_responses.is_empty();
359
962
  let is_query_playback = data.has_pending_query && !has_query_responses;
360
963
  let mut force_new_wft = due_to_heartbeat_timeout;
361
964
 
362
- // We only actually want to send commands back to the server if there are no more
363
- // pending activations and we are caught up on replay. We don't want to complete a wft
364
- // if we already saw the final event in the workflow, or if we are playing back for the
365
- // express purpose of fulfilling a query. If the activation we sent was *only* an
366
- // eviction, and there were no commands produced during iteration, don't send that
965
+ // We only actually want to send commands back to the server if there are no more pending
966
+ // activations and we are caught up on replay. We don't want to complete a wft if we already
967
+ // saw the final event in the workflow, or if we are playing back for the express purpose of
968
+ // fulfilling a query. If the activation we sent was *only* an eviction, don't send that
367
969
  // either.
368
- let no_commands_and_evicting =
369
- outgoing_cmds.commands.is_empty() && data.activation_was_only_eviction;
370
970
  let should_respond = !(self.wfm.machines.has_pending_jobs()
371
971
  || outgoing_cmds.replaying
372
972
  || is_query_playback
373
- || no_commands_and_evicting);
973
+ || data.activation_was_only_eviction);
374
974
  // If there are pending LA resolutions, and we're responding to a query here,
375
975
  // we want to make sure to force a new task, as otherwise once we tell lang about
376
976
  // the LA resolution there wouldn't be any task to reply to with the result of iterating
@@ -378,17 +978,16 @@ impl ManagedRun {
378
978
  if has_query_responses && self.wfm.machines.has_pending_la_resolutions() {
379
979
  force_new_wft = true;
380
980
  }
381
- let to_be_sent = ServerCommandsWithWorkflowInfo {
382
- task_token: data.task_token,
383
- action: ActivationAction::WftComplete {
384
- force_new_wft,
385
- commands: outgoing_cmds.commands,
386
- query_responses,
387
- },
388
- };
389
981
 
390
982
  let outcome = if should_respond || has_query_responses {
391
- ActivationCompleteOutcome::ReportWFTSuccess(to_be_sent)
983
+ ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
984
+ task_token: data.task_token,
985
+ action: ActivationAction::WftComplete {
986
+ force_new_wft,
987
+ commands: outgoing_cmds.commands,
988
+ query_responses,
989
+ },
990
+ })
392
991
  } else {
393
992
  ActivationCompleteOutcome::DoNothing
394
993
  };
@@ -401,131 +1000,136 @@ impl ManagedRun {
401
1000
  }
402
1001
  }
403
1002
 
404
- async fn local_resolution(
1003
+ /// Pump some local activity requests into the sink, applying any immediate results to the
1004
+ /// workflow machines.
1005
+ fn sink_la_requests(
405
1006
  &mut self,
406
- res: LocalResolution,
407
- ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
408
- debug!(resolution=?res, "Applying local resolution");
409
- self.wfm.notify_of_local_result(res)?;
410
- if self.wfm.machines.outstanding_local_activity_count() == 0 {
411
- if let Some(mut wait_dat) = self.waiting_on_la.take() {
412
- // Cancel the heartbeat timeout
413
- wait_dat.heartbeat_timeout_task.abort();
414
- if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
415
- return Ok(Some(self.prepare_complete_resp(
416
- resp_chan,
417
- completion_dat,
418
- false,
419
- )));
420
- }
421
- }
1007
+ new_local_acts: Vec<LocalActRequest>,
1008
+ ) -> Result<(), WFMachinesError> {
1009
+ let immediate_resolutions = self.local_activity_request_sink.sink_reqs(new_local_acts);
1010
+ if !immediate_resolutions.is_empty() {
1011
+ warn!("Immediate res: {:?}", &immediate_resolutions);
422
1012
  }
423
- Ok(None)
1013
+ for resolution in immediate_resolutions {
1014
+ self.wfm
1015
+ .notify_of_local_result(LocalResolution::LocalActivity(resolution))?;
1016
+ }
1017
+ Ok(())
424
1018
  }
425
1019
 
426
- /// Returns `true` if autocompletion should be issued, which will actually cause us to end up
427
- /// in [completion] again, at which point we'll start a new heartbeat timeout, which will
428
- /// immediately trigger and thus finish the completion, forcing a new task as it should.
429
- fn heartbeat_timeout(&mut self) -> bool {
430
- if let Some(ref mut wait_dat) = self.waiting_on_la {
431
- // Cancel the heartbeat timeout
432
- wait_dat.heartbeat_timeout_task.abort();
433
- if let Some((completion_dat, resp_chan)) = wait_dat.completion_dat.take() {
434
- let compl = self.prepare_complete_resp(resp_chan, completion_dat, true);
435
- // Immediately fulfill the completion since the run update will already have
436
- // been replied to
437
- compl.fulfill();
438
- } else {
439
- // Auto-reply WFT complete
440
- return true;
441
- }
442
- } else {
443
- // If a heartbeat timeout happened, we should always have been waiting on LAs
444
- dbg_panic!("WFT heartbeat timeout fired but we were not waiting on any LAs");
1020
+ fn reply_to_complete(
1021
+ &self,
1022
+ outcome: ActivationCompleteOutcome,
1023
+ chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1024
+ ) {
1025
+ if let Some(chan) = chan {
1026
+ chan.send(ActivationCompleteResult {
1027
+ most_recently_processed_event: self.most_recently_processed_event_number() as usize,
1028
+ outcome,
1029
+ })
1030
+ .expect("Rcv half of activation reply not dropped");
445
1031
  }
446
- false
447
1032
  }
448
1033
 
449
- fn send_update_response(&self, outcome: RunActionOutcome, no_wft: bool) {
450
- let mut in_response_to_wft = false;
451
- let (outgoing_activation, fulfillable_complete) = match outcome {
452
- RunActionOutcome::AfterNewWFT(a) => {
453
- in_response_to_wft = true;
454
- (a, None)
455
- }
456
- RunActionOutcome::AfterCheckWork(a) => (a, None),
457
- RunActionOutcome::AfterLocalResolution(f) => (None, f),
458
- RunActionOutcome::AfterCompletion(f) => (None, f),
459
- RunActionOutcome::AfterHeartbeatTimeout(a) => (a, None),
460
- };
461
- let mut more_pending_work = self.wfm.machines.has_pending_jobs();
462
- // We don't want to consider there to be more local-only work to be done if there is no
463
- // workflow task associated with the run right now. This can happen if, ex, we complete
464
- // a local activity while waiting for server to send us the next WFT. Activating lang would
465
- // be harmful at this stage, as there might be work returned in that next WFT which should
466
- // be part of the next activation.
467
- if no_wft {
468
- more_pending_work = false;
469
- }
470
- self.update_tx
471
- .send(RunUpdateResponse {
472
- kind: RunUpdateResponseKind::Good(GoodRunUpdate {
473
- run_id: self.wfm.machines.run_id.clone(),
474
- outgoing_activation,
475
- fulfillable_complete,
476
- have_seen_terminal_event: self.wfm.machines.have_seen_terminal_event,
477
- more_pending_work,
478
- most_recently_processed_event_number: self.wfm.machines.last_processed_event
479
- as usize,
480
- in_response_to_wft,
481
- }),
482
- span: Span::current(),
483
- })
484
- .expect("Machine can send update");
1034
+ /// Returns true if the handle is currently processing a WFT which contains a legacy query.
1035
+ fn pending_work_is_legacy_query(&self) -> bool {
1036
+ // Either we know because there is a pending legacy query, or it's already been drained and
1037
+ // sent as an activation.
1038
+ matches!(self.activation, Some(OutstandingActivation::LegacyQuery))
1039
+ || self
1040
+ .wft
1041
+ .as_ref()
1042
+ .map(|t| t.has_pending_legacy_query())
1043
+ .unwrap_or_default()
1044
+ }
1045
+
1046
+ fn most_recently_processed_event_number(&self) -> i64 {
1047
+ self.wfm.machines.last_processed_event
1048
+ }
1049
+
1050
+ fn activation_has_eviction(&mut self) -> bool {
1051
+ self.activation
1052
+ .map(OutstandingActivation::has_eviction)
1053
+ .unwrap_or_default()
1054
+ }
1055
+
1056
+ fn activation_has_only_eviction(&mut self) -> bool {
1057
+ self.activation
1058
+ .map(OutstandingActivation::has_only_eviction)
1059
+ .unwrap_or_default()
1060
+ }
1061
+
1062
+ fn run_id(&self) -> &str {
1063
+ &self.wfm.machines.run_id
485
1064
  }
486
1065
  }
487
1066
 
488
- fn start_heartbeat_timeout_task(
489
- chan: UnboundedSender<Span>,
1067
+ /// Drains pending queries from the workflow task and appends them to the activation's jobs
1068
+ fn put_queries_in_act(act: &mut WorkflowActivation, wft: &mut OutstandingTask) {
1069
+ // Nothing to do if there are no pending queries
1070
+ if wft.pending_queries.is_empty() {
1071
+ return;
1072
+ }
1073
+
1074
+ let has_legacy = wft.has_pending_legacy_query();
1075
+ // Cannot dispatch legacy query if there are any other jobs - which can happen if, ex, a local
1076
+ // activity resolves while we've gotten a legacy query after heartbeating.
1077
+ if has_legacy && !act.jobs.is_empty() {
1078
+ return;
1079
+ }
1080
+
1081
+ debug!(queries=?wft.pending_queries, "Dispatching queries");
1082
+ let query_jobs = wft
1083
+ .pending_queries
1084
+ .drain(..)
1085
+ .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
1086
+ act.jobs.extend(query_jobs);
1087
+ }
1088
+ fn sink_heartbeat_timeout_start(
1089
+ run_id: String,
1090
+ sink: &dyn LocalActivityRequestSink,
490
1091
  wft_start_time: Instant,
491
1092
  wft_timeout: Duration,
492
- ) -> JoinHandle<()> {
1093
+ ) -> AbortHandle {
493
1094
  // The heartbeat deadline is 80% of the WFT timeout
494
- let wft_heartbeat_deadline =
495
- wft_start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
496
- task::spawn(async move {
497
- tokio::time::sleep_until(wft_heartbeat_deadline.into()).await;
498
- let _ = chan.send(Span::current());
499
- })
500
- }
501
-
502
- enum RunActionOutcome {
503
- AfterNewWFT(Option<ActivationOrAuto>),
504
- AfterCheckWork(Option<ActivationOrAuto>),
505
- AfterLocalResolution(Option<FulfillableActivationComplete>),
506
- AfterCompletion(Option<FulfillableActivationComplete>),
507
- AfterHeartbeatTimeout(Option<ActivationOrAuto>),
1095
+ let deadline = wft_start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
1096
+ let (abort_handle, abort_reg) = AbortHandle::new_pair();
1097
+ sink.sink_reqs(vec![LocalActRequest::StartHeartbeatTimeout {
1098
+ send_on_elapse: HeartbeatTimeoutMsg {
1099
+ run_id,
1100
+ span: Span::current(),
1101
+ },
1102
+ deadline,
1103
+ abort_reg,
1104
+ }]);
1105
+ abort_handle
508
1106
  }
509
1107
 
510
- #[derive(derive_more::DebugCustom)]
511
- #[debug(fmt = "RunUpdateErr({:?})", source)]
512
- struct RunUpdateErr {
513
- source: WFMachinesError,
514
- complete_resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1108
+ /// If an activation completion needed to wait on LA completions (or heartbeat timeout) we use
1109
+ /// this struct to store the data we need to finish the completion once that has happened
1110
+ struct WaitingOnLAs {
1111
+ wft_timeout: Duration,
1112
+ /// If set, we are waiting for LAs to complete as part of a just-finished workflow activation.
1113
+ /// If unset, we already had a heartbeat timeout and got a new WFT without any new work while
1114
+ /// there are still incomplete LAs.
1115
+ completion_dat: Option<(
1116
+ CompletionDataForWFT,
1117
+ Option<oneshot::Sender<ActivationCompleteResult>>,
1118
+ )>,
1119
+ /// Can be used to abort heartbeat timeouts
1120
+ hb_timeout_handle: AbortHandle,
515
1121
  }
516
-
517
- impl From<WFMachinesError> for RunUpdateErr {
518
- fn from(e: WFMachinesError) -> Self {
519
- RunUpdateErr {
520
- source: e,
521
- complete_resp_chan: None,
522
- }
523
- }
1122
+ #[derive(Debug)]
1123
+ struct CompletionDataForWFT {
1124
+ task_token: TaskToken,
1125
+ query_responses: Vec<QueryResult>,
1126
+ has_pending_query: bool,
1127
+ activation_was_only_eviction: bool,
524
1128
  }
525
1129
 
526
1130
  /// Manages an instance of a [WorkflowMachines], which is not thread-safe, as well as other data
527
1131
  /// associated with that specific workflow run.
528
- pub(crate) struct WorkflowManager {
1132
+ struct WorkflowManager {
529
1133
  machines: WorkflowMachines,
530
1134
  /// Is always `Some` in normal operation. Optional to allow for unit testing with the test
531
1135
  /// workflow driver, which does not need to complete activations the normal way.
@@ -535,7 +1139,7 @@ pub(crate) struct WorkflowManager {
535
1139
  impl WorkflowManager {
536
1140
  /// Create a new workflow manager given workflow history and execution info as would be found
537
1141
  /// in [PollWorkflowTaskQueueResponse]
538
- pub fn new(
1142
+ fn new(
539
1143
  history: HistoryUpdate,
540
1144
  namespace: String,
541
1145
  workflow_id: String,
@@ -560,7 +1164,7 @@ impl WorkflowManager {
560
1164
  }
561
1165
 
562
1166
  #[cfg(test)]
563
- pub const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
1167
+ const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
564
1168
  Self {
565
1169
  machines: workflow_machines,
566
1170
  command_sink: None,
@@ -571,12 +1175,15 @@ impl WorkflowManager {
571
1175
  ///
572
1176
  /// Should only be called when a workflow has caught up on replay (or is just beginning). It
573
1177
  /// will return a workflow activation if one is needed.
574
- async fn feed_history_from_server(
575
- &mut self,
576
- update: HistoryUpdate,
577
- ) -> Result<WorkflowActivation> {
578
- self.machines.new_history_from_server(update).await?;
579
- self.get_next_activation().await
1178
+ fn feed_history_from_server(&mut self, update: HistoryUpdate) -> Result<WorkflowActivation> {
1179
+ self.machines.new_history_from_server(update)?;
1180
+ self.get_next_activation()
1181
+ }
1182
+
1183
+ /// Update the machines with some events from fetching another page of history. Does *not*
1184
+ /// attempt to pull the next activation, unlike [Self::feed_history_from_server].
1185
+ fn feed_history_from_new_page(&mut self, update: HistoryUpdate) -> Result<()> {
1186
+ self.machines.new_history_from_server(update)
580
1187
  }
581
1188
 
582
1189
  /// Let this workflow know that something we've been waiting locally on has resolved, like a
@@ -593,27 +1200,33 @@ impl WorkflowManager {
593
1200
  ///
594
1201
  /// Callers may also need to call [get_server_commands] after this to issue any pending commands
595
1202
  /// to the server.
596
- async fn get_next_activation(&mut self) -> Result<WorkflowActivation> {
1203
+ fn get_next_activation(&mut self) -> Result<WorkflowActivation> {
597
1204
  // First check if there are already some pending jobs, which can be a result of replay.
598
1205
  let activation = self.machines.get_wf_activation();
599
1206
  if !activation.jobs.is_empty() {
600
1207
  return Ok(activation);
601
1208
  }
602
1209
 
603
- self.machines.apply_next_wft_from_history().await?;
1210
+ self.machines.apply_next_wft_from_history()?;
604
1211
  Ok(self.machines.get_wf_activation())
605
1212
  }
606
1213
 
1214
+ /// Returns true if machines are ready to apply the next WFT sequence, false if events will need
1215
+ /// to be fetched in order to create a complete update with the entire next WFT sequence.
1216
+ pub(crate) fn ready_to_apply_next_wft(&self) -> bool {
1217
+ self.machines.ready_to_apply_next_wft()
1218
+ }
1219
+
607
1220
  /// If there are no pending jobs for the workflow, apply the next workflow task and check
608
1221
  /// again if there are any jobs. Importantly, does not *drain* jobs.
609
1222
  ///
610
1223
  /// Returns true if there are jobs (before or after applying the next WFT).
611
- async fn apply_next_task_if_ready(&mut self) -> Result<bool> {
1224
+ fn apply_next_task_if_ready(&mut self) -> Result<bool> {
612
1225
  if self.machines.has_pending_jobs() {
613
1226
  return Ok(true);
614
1227
  }
615
1228
  loop {
616
- let consumed_events = self.machines.apply_next_wft_from_history().await?;
1229
+ let consumed_events = self.machines.apply_next_wft_from_history()?;
617
1230
 
618
1231
  if consumed_events == 0 || !self.machines.replaying || self.machines.has_pending_jobs()
619
1232
  {
@@ -643,13 +1256,61 @@ impl WorkflowManager {
643
1256
 
644
1257
  /// Feed the workflow machines new commands issued by the executing workflow code, and iterate
645
1258
  /// the machines.
646
- async fn push_commands(&mut self, cmds: Vec<WFCommand>) -> Result<()> {
1259
+ fn push_commands_and_iterate(&mut self, cmds: Vec<WFCommand>) -> Result<()> {
647
1260
  if let Some(cs) = self.command_sink.as_mut() {
648
1261
  cs.send(cmds).map_err(|_| {
649
1262
  WFMachinesError::Fatal("Internal error buffering workflow commands".to_string())
650
1263
  })?;
651
1264
  }
652
- self.machines.iterate_machines().await?;
1265
+ self.machines.iterate_machines()?;
653
1266
  Ok(())
654
1267
  }
655
1268
  }
1269
+
1270
+ #[derive(Debug)]
1271
+ struct FulfillableActivationComplete {
1272
+ result: ActivationCompleteResult,
1273
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1274
+ }
1275
+ impl FulfillableActivationComplete {
1276
+ fn fulfill(self) {
1277
+ if let Some(resp_chan) = self.resp_chan {
1278
+ let _ = resp_chan.send(self.result);
1279
+ }
1280
+ }
1281
+ }
1282
+
1283
+ #[derive(Debug)]
1284
+ struct RunActivationCompletion {
1285
+ task_token: TaskToken,
1286
+ start_time: Instant,
1287
+ commands: Vec<WFCommand>,
1288
+ activation_was_eviction: bool,
1289
+ activation_was_only_eviction: bool,
1290
+ has_pending_query: bool,
1291
+ query_responses: Vec<QueryResult>,
1292
+ /// Used to notify the worker when the completion is done processing and the completion can
1293
+ /// unblock. Must always be `Some` when initialized.
1294
+ resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1295
+ }
1296
+ #[derive(Debug, derive_more::From)]
1297
+ enum ActOrFulfill {
1298
+ OutgoingAct(Option<ActivationOrAuto>),
1299
+ FulfillableComplete(Option<FulfillableActivationComplete>),
1300
+ }
1301
+
1302
+ #[derive(derive_more::DebugCustom)]
1303
+ #[debug(fmt = "RunUpdateErr({source:?})")]
1304
+ struct RunUpdateErr {
1305
+ source: WFMachinesError,
1306
+ complete_resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
1307
+ }
1308
+
1309
+ impl From<WFMachinesError> for RunUpdateErr {
1310
+ fn from(e: WFMachinesError) -> Self {
1311
+ RunUpdateErr {
1312
+ source: e,
1313
+ complete_resp_chan: None,
1314
+ }
1315
+ }
1316
+ }