@temporalio/core-bridge 0.17.2 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. package/Cargo.lock +339 -226
  2. package/Cargo.toml +7 -3
  3. package/common.js +50 -0
  4. package/index.d.ts +7 -0
  5. package/index.js +12 -0
  6. package/package.json +7 -4
  7. package/releases/aarch64-apple-darwin/index.node +0 -0
  8. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  9. package/{index.node → releases/index.node} +0 -0
  10. package/releases/x86_64-apple-darwin/index.node +0 -0
  11. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  12. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  13. package/scripts/build.js +10 -50
  14. package/sdk-core/.buildkite/docker/Dockerfile +1 -1
  15. package/sdk-core/.buildkite/docker/docker-compose.yaml +2 -2
  16. package/sdk-core/.buildkite/pipeline.yml +2 -0
  17. package/sdk-core/Cargo.toml +1 -88
  18. package/sdk-core/README.md +30 -6
  19. package/sdk-core/bridge-ffi/Cargo.toml +24 -0
  20. package/sdk-core/bridge-ffi/LICENSE.txt +23 -0
  21. package/sdk-core/bridge-ffi/build.rs +25 -0
  22. package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +216 -0
  23. package/sdk-core/bridge-ffi/src/lib.rs +829 -0
  24. package/sdk-core/bridge-ffi/src/wrappers.rs +193 -0
  25. package/sdk-core/client/Cargo.toml +32 -0
  26. package/sdk-core/{src/pollers/gateway.rs → client/src/lib.rs} +101 -195
  27. package/sdk-core/client/src/metrics.rs +89 -0
  28. package/sdk-core/client/src/mocks.rs +167 -0
  29. package/sdk-core/{src/pollers → client/src}/retry.rs +172 -14
  30. package/sdk-core/core/Cargo.toml +96 -0
  31. package/sdk-core/{src → core/src}/core_tests/activity_tasks.rs +193 -37
  32. package/sdk-core/{src → core/src}/core_tests/child_workflows.rs +14 -14
  33. package/sdk-core/{src → core/src}/core_tests/determinism.rs +8 -8
  34. package/sdk-core/core/src/core_tests/local_activities.rs +328 -0
  35. package/sdk-core/{src → core/src}/core_tests/mod.rs +6 -9
  36. package/sdk-core/{src → core/src}/core_tests/queries.rs +45 -52
  37. package/sdk-core/{src → core/src}/core_tests/replay_flag.rs +8 -12
  38. package/sdk-core/{src → core/src}/core_tests/workers.rs +120 -33
  39. package/sdk-core/{src → core/src}/core_tests/workflow_cancels.rs +16 -26
  40. package/sdk-core/{src → core/src}/core_tests/workflow_tasks.rs +264 -286
  41. package/sdk-core/core/src/lib.rs +374 -0
  42. package/sdk-core/{src → core/src}/log_export.rs +3 -27
  43. package/sdk-core/core/src/pending_activations.rs +162 -0
  44. package/sdk-core/{src → core/src}/pollers/mod.rs +4 -22
  45. package/sdk-core/{src → core/src}/pollers/poll_buffer.rs +1 -1
  46. package/sdk-core/core/src/protosext/mod.rs +396 -0
  47. package/sdk-core/core/src/replay/mod.rs +210 -0
  48. package/sdk-core/core/src/retry_logic.rs +144 -0
  49. package/sdk-core/{src → core/src}/telemetry/metrics.rs +3 -58
  50. package/sdk-core/{src → core/src}/telemetry/mod.rs +8 -8
  51. package/sdk-core/{src → core/src}/telemetry/prometheus_server.rs +0 -0
  52. package/sdk-core/{src → core/src}/test_help/mod.rs +34 -73
  53. package/sdk-core/{src → core/src}/worker/activities/activity_heartbeat_manager.rs +95 -42
  54. package/sdk-core/core/src/worker/activities/local_activities.rs +973 -0
  55. package/sdk-core/{src → core/src}/worker/activities.rs +52 -33
  56. package/sdk-core/{src → core/src}/worker/dispatcher.rs +8 -6
  57. package/sdk-core/{src → core/src}/worker/mod.rs +305 -195
  58. package/sdk-core/core/src/worker/wft_delivery.rs +81 -0
  59. package/sdk-core/{src → core/src}/workflow/bridge.rs +5 -2
  60. package/sdk-core/{src → core/src}/workflow/driven_workflow.rs +17 -7
  61. package/sdk-core/{src → core/src}/workflow/history_update.rs +33 -7
  62. package/sdk-core/{src → core/src/workflow}/machines/activity_state_machine.rs +26 -26
  63. package/sdk-core/{src → core/src/workflow}/machines/cancel_external_state_machine.rs +8 -11
  64. package/sdk-core/{src → core/src/workflow}/machines/cancel_workflow_state_machine.rs +19 -21
  65. package/sdk-core/{src → core/src/workflow}/machines/child_workflow_state_machine.rs +19 -21
  66. package/sdk-core/{src → core/src/workflow}/machines/complete_workflow_state_machine.rs +3 -5
  67. package/sdk-core/{src → core/src/workflow}/machines/continue_as_new_workflow_state_machine.rs +18 -18
  68. package/sdk-core/{src → core/src/workflow}/machines/fail_workflow_state_machine.rs +5 -6
  69. package/sdk-core/core/src/workflow/machines/local_activity_state_machine.rs +1451 -0
  70. package/sdk-core/{src → core/src/workflow}/machines/mod.rs +54 -107
  71. package/sdk-core/{src → core/src/workflow}/machines/mutable_side_effect_state_machine.rs +0 -0
  72. package/sdk-core/{src → core/src/workflow}/machines/patch_state_machine.rs +29 -30
  73. package/sdk-core/{src → core/src/workflow}/machines/side_effect_state_machine.rs +0 -0
  74. package/sdk-core/{src → core/src/workflow}/machines/signal_external_state_machine.rs +17 -19
  75. package/sdk-core/{src → core/src/workflow}/machines/timer_state_machine.rs +20 -21
  76. package/sdk-core/{src → core/src/workflow}/machines/transition_coverage.rs +5 -2
  77. package/sdk-core/{src → core/src/workflow}/machines/upsert_search_attributes_state_machine.rs +0 -0
  78. package/sdk-core/core/src/workflow/machines/workflow_machines/local_acts.rs +96 -0
  79. package/sdk-core/{src → core/src/workflow}/machines/workflow_machines.rs +344 -160
  80. package/sdk-core/{src → core/src/workflow}/machines/workflow_task_state_machine.rs +1 -1
  81. package/sdk-core/{src → core/src}/workflow/mod.rs +200 -39
  82. package/sdk-core/{src → core/src}/workflow/workflow_tasks/cache_manager.rs +0 -0
  83. package/sdk-core/{src → core/src}/workflow/workflow_tasks/concurrency_manager.rs +38 -5
  84. package/sdk-core/{src → core/src}/workflow/workflow_tasks/mod.rs +297 -81
  85. package/sdk-core/{test_utils → core-api}/Cargo.toml +10 -7
  86. package/sdk-core/{src → core-api/src}/errors.rs +42 -90
  87. package/sdk-core/core-api/src/lib.rs +158 -0
  88. package/sdk-core/{src/worker/config.rs → core-api/src/worker.rs} +18 -23
  89. package/sdk-core/etc/deps.svg +156 -0
  90. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +5 -5
  91. package/sdk-core/fsm/rustfsm_procmacro/tests/trybuild/no_handle_conversions_require_into_fail.stderr +3 -5
  92. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +7 -1
  93. package/sdk-core/histories/fail_wf_task.bin +0 -0
  94. package/sdk-core/histories/timer_workflow_history.bin +0 -0
  95. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +44 -13
  96. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +19 -1
  97. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +1 -1
  98. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +9 -0
  99. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +1 -0
  100. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +1 -0
  101. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +13 -0
  102. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +14 -7
  103. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +176 -18
  104. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +6 -0
  105. package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +11 -0
  106. package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +3 -0
  107. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +156 -7
  108. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +135 -104
  109. package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +78 -0
  110. package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +78 -0
  111. package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +205 -0
  112. package/sdk-core/protos/local/temporal/sdk/core/bridge/service.proto +61 -0
  113. package/sdk-core/protos/local/{child_workflow.proto → temporal/sdk/core/child_workflow/child_workflow.proto} +1 -1
  114. package/sdk-core/protos/local/{common.proto → temporal/sdk/core/common/common.proto} +5 -3
  115. package/sdk-core/protos/local/{core_interface.proto → temporal/sdk/core/core_interface.proto} +10 -10
  116. package/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +30 -0
  117. package/sdk-core/protos/local/{workflow_activation.proto → temporal/sdk/core/workflow_activation/workflow_activation.proto} +35 -11
  118. package/sdk-core/protos/local/{workflow_commands.proto → temporal/sdk/core/workflow_commands/workflow_commands.proto} +55 -4
  119. package/sdk-core/protos/local/{workflow_completion.proto → temporal/sdk/core/workflow_completion/workflow_completion.proto} +3 -3
  120. package/sdk-core/sdk/Cargo.toml +32 -0
  121. package/sdk-core/{src/prototype_rust_sdk → sdk/src}/conversions.rs +0 -0
  122. package/sdk-core/sdk/src/lib.rs +699 -0
  123. package/sdk-core/sdk/src/payload_converter.rs +11 -0
  124. package/sdk-core/sdk/src/workflow_context/options.rs +180 -0
  125. package/sdk-core/{src/prototype_rust_sdk → sdk/src}/workflow_context.rs +201 -124
  126. package/sdk-core/{src/prototype_rust_sdk → sdk/src}/workflow_future.rs +63 -30
  127. package/sdk-core/sdk-core-protos/Cargo.toml +10 -0
  128. package/sdk-core/sdk-core-protos/build.rs +28 -6
  129. package/sdk-core/sdk-core-protos/src/constants.rs +7 -0
  130. package/sdk-core/{src/test_help → sdk-core-protos/src}/history_builder.rs +134 -49
  131. package/sdk-core/sdk-core-protos/src/history_info.rs +216 -0
  132. package/sdk-core/sdk-core-protos/src/lib.rs +594 -168
  133. package/sdk-core/sdk-core-protos/src/task_token.rs +38 -0
  134. package/sdk-core/sdk-core-protos/src/utilities.rs +14 -0
  135. package/sdk-core/test-utils/Cargo.toml +32 -0
  136. package/sdk-core/{src/test_help → test-utils/src}/canned_histories.rs +59 -78
  137. package/sdk-core/test-utils/src/histfetch.rs +28 -0
  138. package/sdk-core/{test_utils → test-utils}/src/lib.rs +131 -68
  139. package/sdk-core/tests/integ_tests/client_tests.rs +1 -1
  140. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +11 -7
  141. package/sdk-core/tests/integ_tests/polling_tests.rs +12 -11
  142. package/sdk-core/tests/integ_tests/queries_tests.rs +82 -78
  143. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +91 -71
  144. package/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +3 -4
  145. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +2 -4
  146. package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +4 -6
  147. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +4 -6
  148. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -4
  149. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +496 -0
  150. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +5 -8
  151. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +125 -0
  152. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +7 -13
  153. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +33 -5
  154. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +12 -16
  155. package/sdk-core/tests/integ_tests/workflow_tests.rs +85 -82
  156. package/sdk-core/tests/load_tests.rs +6 -6
  157. package/sdk-core/tests/main.rs +2 -2
  158. package/src/conversions.rs +24 -21
  159. package/src/errors.rs +8 -0
  160. package/src/lib.rs +323 -211
  161. package/sdk-core/protos/local/activity_result.proto +0 -46
  162. package/sdk-core/protos/local/activity_task.proto +0 -66
  163. package/sdk-core/src/core_tests/retry.rs +0 -147
  164. package/sdk-core/src/lib.rs +0 -403
  165. package/sdk-core/src/machines/local_activity_state_machine.rs +0 -117
  166. package/sdk-core/src/pending_activations.rs +0 -249
  167. package/sdk-core/src/protosext/mod.rs +0 -160
  168. package/sdk-core/src/prototype_rust_sdk.rs +0 -412
  169. package/sdk-core/src/task_token.rs +0 -20
  170. package/sdk-core/src/test_help/history_info.rs +0 -158
@@ -4,33 +4,46 @@ mod cache_manager;
4
4
  mod concurrency_manager;
5
5
 
6
6
  use crate::{
7
- errors::{WorkflowMissingError, WorkflowUpdateError},
8
- machines::{ProtoCommand, WFCommand, WFMachinesError},
9
7
  pending_activations::PendingActivations,
10
- pollers::GatewayRef,
11
- protosext::{ValidPollWFTQResponse, WfActivationExt},
12
- task_token::TaskToken,
8
+ protosext::{ValidPollWFTQResponse, WorkflowActivationExt},
13
9
  telemetry::metrics::MetricsContext,
10
+ worker::{LocalActRequest, LocalActivityResolution},
14
11
  workflow::{
12
+ machines::WFMachinesError,
15
13
  workflow_tasks::{
16
14
  cache_manager::WorkflowCacheManager, concurrency_manager::WorkflowConcurrencyManager,
17
15
  },
18
- HistoryPaginator, HistoryUpdate, WorkflowCachingPolicy, WorkflowManager, LEGACY_QUERY_ID,
16
+ HistoryPaginator, HistoryUpdate, LocalResolution, WFCommand, WorkflowCachingPolicy,
17
+ WorkflowManager, LEGACY_QUERY_ID,
19
18
  },
20
19
  };
21
20
  use crossbeam::queue::SegQueue;
22
21
  use futures::FutureExt;
23
22
  use parking_lot::Mutex;
24
- use std::{fmt::Debug, time::Instant};
25
- use temporal_sdk_core_protos::coresdk::{
26
- workflow_activation::{
27
- create_evict_activation, create_query_activation, wf_activation_job, QueryWorkflow,
28
- WfActivation,
23
+ use std::{
24
+ fmt::Debug,
25
+ ops::Add,
26
+ sync::Arc,
27
+ time::{Duration, Instant},
28
+ };
29
+ use temporal_client::ServerGatewayApis;
30
+ use temporal_sdk_core_protos::{
31
+ coresdk::{
32
+ workflow_activation::{
33
+ create_query_activation, remove_from_cache::EvictionReason, workflow_activation_job,
34
+ QueryWorkflow, WorkflowActivation,
35
+ },
36
+ workflow_commands::QueryResult,
37
+ FromPayloadsExt,
29
38
  },
30
- workflow_commands::QueryResult,
31
- FromPayloadsExt,
39
+ temporal::api::command::v1::Command as ProtoCommand,
40
+ TaskToken,
32
41
  };
33
- use tokio::sync::watch;
42
+ use tokio::{sync::Notify, time::timeout_at};
43
+
44
+ /// What percentage of a WFT timeout we are willing to wait before sending a WFT heartbeat when
45
+ /// necessary.
46
+ const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
34
47
 
35
48
  /// Centralizes concerns related to applying new workflow tasks and reporting the activations they
36
49
  /// produce.
@@ -43,10 +56,13 @@ pub struct WorkflowTaskManager {
43
56
  /// when cancelling an activity in try-cancel/abandon mode), or for other reasons such as a
44
57
  /// requested eviction. They queue here.
45
58
  pending_activations: PendingActivations,
59
+ /// Holds activations which are purely query activations needed to respond to legacy queries.
60
+ /// Activations may only be added here for runs which do not have other pending activations.
61
+ pending_legacy_queries: SegQueue<WorkflowActivation>,
46
62
  /// Holds poll wft responses from the server that need to be applied
47
63
  ready_buffered_wft: SegQueue<ValidPollWFTQResponse>,
48
64
  /// Used to wake blocked workflow task polling
49
- pending_activations_notifier: watch::Sender<bool>,
65
+ pending_activations_notifier: Arc<Notify>,
50
66
  /// Lock guarded cache manager, which is the authority for limit-based workflow machine eviction
51
67
  /// from the cache.
52
68
  // TODO: Also should be moved inside concurrency manager, but there is some complexity around
@@ -98,7 +114,7 @@ pub struct WorkflowTaskInfo {
98
114
  #[derive(Debug, derive_more::From)]
99
115
  pub(crate) enum NewWfTaskOutcome {
100
116
  /// A new activation for the workflow should be issued to lang
101
- IssueActivation(WfActivation),
117
+ IssueActivation(WorkflowActivation),
102
118
  /// The poll loop should be restarted, there is nothing to do
103
119
  TaskBuffered,
104
120
  /// The workflow task should be auto-completed with an empty command list, as it must be replied
@@ -108,6 +124,8 @@ pub(crate) enum NewWfTaskOutcome {
108
124
  CacheMiss,
109
125
  /// The workflow task ran into problems while being applied and we must now evict the workflow
110
126
  Evict(WorkflowUpdateError),
127
+ /// No action should be taken. Possibly we are waiting for local activities to complete
128
+ LocalActsOutstanding,
111
129
  }
112
130
 
113
131
  #[derive(Debug)]
@@ -118,17 +136,18 @@ pub enum FailedActivationOutcome {
118
136
  }
119
137
 
120
138
  #[derive(Debug)]
121
- pub struct ServerCommandsWithWorkflowInfo {
139
+ pub(crate) struct ServerCommandsWithWorkflowInfo {
122
140
  pub task_token: TaskToken,
123
141
  pub action: ActivationAction,
124
142
  }
125
143
 
126
144
  #[derive(Debug)]
127
- pub enum ActivationAction {
145
+ pub(crate) enum ActivationAction {
128
146
  /// We should respond that the workflow task is complete
129
147
  WftComplete {
130
148
  commands: Vec<ProtoCommand>,
131
149
  query_responses: Vec<QueryResult>,
150
+ force_new_wft: bool,
132
151
  },
133
152
  /// We should respond to a legacy query request
134
153
  RespondLegacyQuery { result: QueryResult },
@@ -149,13 +168,14 @@ macro_rules! machine_mut {
149
168
 
150
169
  impl WorkflowTaskManager {
151
170
  pub(crate) fn new(
152
- pending_activations_notifier: watch::Sender<bool>,
171
+ pending_activations_notifier: Arc<Notify>,
153
172
  eviction_policy: WorkflowCachingPolicy,
154
173
  metrics: MetricsContext,
155
174
  ) -> Self {
156
175
  Self {
157
176
  workflow_machines: WorkflowConcurrencyManager::new(),
158
177
  pending_activations: Default::default(),
178
+ pending_legacy_queries: Default::default(),
159
179
  ready_buffered_wft: Default::default(),
160
180
  pending_activations_notifier,
161
181
  cache_manager: Mutex::new(WorkflowCacheManager::new(eviction_policy, metrics.clone())),
@@ -163,7 +183,11 @@ impl WorkflowTaskManager {
163
183
  }
164
184
  }
165
185
 
166
- pub(crate) fn next_pending_activation(&self) -> Option<WfActivation> {
186
+ pub(crate) fn next_pending_activation(&self) -> Option<WorkflowActivation> {
187
+ // Dispatch pending legacy queries first
188
+ if let leg_q @ Some(_) = self.pending_legacy_queries.pop() {
189
+ return leg_q;
190
+ }
167
191
  // It is important that we do not issue pending activations for any workflows which already
168
192
  // have an outstanding activation. If we did, it can result in races where an in-progress
169
193
  // completion may appear to be the last in a task (no more pending activations) because
@@ -172,37 +196,69 @@ impl WorkflowTaskManager {
172
196
  let maybe_act = self
173
197
  .pending_activations
174
198
  .pop_first_matching(|rid| self.workflow_machines.get_activation(rid).is_none());
175
- if let Some(act) = maybe_act.as_ref() {
176
- if let Err(WorkflowMissingError { run_id }) = self.insert_outstanding_activation(act) {
177
- self.request_eviction(&run_id, "Pending activation present for missing run");
199
+ if let Some(pending_info) = maybe_act {
200
+ if let Ok(act) = self
201
+ .workflow_machines
202
+ .access_sync(&pending_info.run_id, |wfm| wfm.machines.get_wf_activation())
203
+ .and_then(|mut act| {
204
+ if let Some(reason) = pending_info.needs_eviction {
205
+ act.append_evict_job(reason);
206
+ }
207
+ self.insert_outstanding_activation(&act)?;
208
+ Ok(act)
209
+ })
210
+ {
211
+ self.cache_manager.lock().touch(&act.run_id);
212
+ Some(act)
213
+ } else {
214
+ self.request_eviction(
215
+ &pending_info.run_id,
216
+ "Tried to apply pending activation for missing run",
217
+ EvictionReason::Fatal,
218
+ );
178
219
  // Continue trying to return a valid pending activation
179
- return self.next_pending_activation();
220
+ self.next_pending_activation()
180
221
  }
181
- self.cache_manager.lock().touch(&act.run_id);
222
+ } else {
223
+ None
182
224
  }
183
- maybe_act
184
225
  }
185
226
 
186
- pub fn next_buffered_poll(&self) -> Option<ValidPollWFTQResponse> {
227
+ pub(crate) fn next_buffered_poll(&self) -> Option<ValidPollWFTQResponse> {
187
228
  self.ready_buffered_wft.pop()
188
229
  }
189
230
 
190
- pub fn outstanding_wft(&self) -> usize {
231
+ pub(crate) fn outstanding_wft(&self) -> usize {
191
232
  self.workflow_machines.outstanding_wft()
192
233
  }
193
234
 
235
+ /// Returns the event id of the most recently processed event for the provided run id.
236
+ pub(crate) fn most_recently_processed_event(
237
+ &self,
238
+ run_id: &str,
239
+ ) -> Result<i64, WorkflowMissingError> {
240
+ self.workflow_machines
241
+ .access_sync(run_id, |wfm| wfm.machines.last_processed_event)
242
+ }
243
+
194
244
  /// Request a workflow eviction. This will queue up an activation to evict the workflow from
195
245
  /// the lang side. Workflow will not *actually* be evicted until lang replies to that activation
196
246
  ///
197
247
  /// Returns, if found, the number of attempts on the current workflow task
198
- pub fn request_eviction(&self, run_id: &str, reason: impl Into<String>) -> Option<u32> {
248
+ pub(crate) fn request_eviction(
249
+ &self,
250
+ run_id: &str,
251
+ message: impl Into<String>,
252
+ reason: EvictionReason,
253
+ ) -> Option<u32> {
199
254
  if self.workflow_machines.exists(run_id) {
200
255
  if !self.activation_has_eviction(run_id) {
201
- debug!(%run_id, "Eviction requested");
256
+ let message = message.into();
257
+ debug!(%run_id, %message, "Eviction requested");
202
258
  // Queue up an eviction activation
203
259
  self.pending_activations
204
- .push(create_evict_activation(run_id.to_string(), reason.into()));
205
- let _ = self.pending_activations_notifier.send(true);
260
+ .notify_needs_eviction(run_id, message, reason);
261
+ self.pending_activations_notifier.notify_waiters();
206
262
  }
207
263
  self.workflow_machines
208
264
  .get_task(run_id)
@@ -235,15 +291,14 @@ impl WorkflowTaskManager {
235
291
  }
236
292
 
237
293
  /// Given a validated poll response from the server, prepare an activation (if there is one) to
238
- /// be sent to lang. If applying the response to the workflow's state does not produce a new
239
- /// activation, `None` is returned.
294
+ /// be sent to lang.
240
295
  ///
241
296
  /// The new activation is immediately considered to be an outstanding workflow task - so it is
242
297
  /// expected that new activations will be dispatched to lang right away.
243
298
  pub(crate) async fn apply_new_poll_resp(
244
299
  &self,
245
300
  work: ValidPollWFTQResponse,
246
- gateway: &GatewayRef,
301
+ gateway: Arc<dyn ServerGatewayApis + Send + Sync>,
247
302
  ) -> NewWfTaskOutcome {
248
303
  let mut work = if let Some(w) = self.workflow_machines.buffer_resp_if_outstanding_work(work)
249
304
  {
@@ -285,7 +340,7 @@ impl WorkflowTaskManager {
285
340
  debug!("Dispatching legacy query {:?}", &lq);
286
341
  next_activation
287
342
  .jobs
288
- .push(wf_activation_job::Variant::QueryWorkflow(lq).into());
343
+ .push(workflow_activation_job::Variant::QueryWorkflow(lq).into());
289
344
  }
290
345
  None
291
346
  } else {
@@ -304,7 +359,20 @@ impl WorkflowTaskManager {
304
359
  .expect("Workflow machines must exist, we just created/updated them");
305
360
 
306
361
  if next_activation.jobs.is_empty() {
307
- NewWfTaskOutcome::Autocomplete
362
+ let outstanding_las = self
363
+ .workflow_machines
364
+ .access_sync(&next_activation.run_id, |wfm| {
365
+ wfm.machines.outstanding_local_activity_count()
366
+ })
367
+ .expect("Workflow machines must exist, we just created/updated them");
368
+ if outstanding_las > 0 {
369
+ // If there are outstanding local activities, we don't want to autocomplete the
370
+ // workflow task. We want to give them a chance to complete. If they take longer
371
+ // than the WFT timeout, we will force a new WFT just before the timeout.
372
+ NewWfTaskOutcome::LocalActsOutstanding
373
+ } else {
374
+ NewWfTaskOutcome::Autocomplete
375
+ }
308
376
  } else {
309
377
  if let Err(wme) = self.insert_outstanding_activation(&next_activation) {
310
378
  return NewWfTaskOutcome::Evict(wme.into());
@@ -319,15 +387,20 @@ impl WorkflowTaskManager {
319
387
  &self,
320
388
  run_id: &str,
321
389
  mut commands: Vec<WFCommand>,
390
+ local_activity_request_sink: impl FnOnce(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>,
322
391
  ) -> Result<Option<ServerCommandsWithWorkflowInfo>, WorkflowUpdateError> {
323
392
  // No-command replies to evictions can simply skip everything
324
393
  if commands.is_empty() && self.activation_has_eviction(run_id) {
325
394
  return Ok(None);
326
395
  }
327
396
 
328
- let (task_token, is_leg_query_task) =
397
+ let (task_token, is_leg_query_task, start_time) =
329
398
  if let Some(entry) = self.workflow_machines.get_task(run_id) {
330
- (entry.info.task_token.clone(), entry.legacy_query.is_some())
399
+ (
400
+ entry.info.task_token.clone(),
401
+ entry.legacy_query.is_some(),
402
+ entry.start_time,
403
+ )
331
404
  } else {
332
405
  if !self.activation_has_eviction(run_id) {
333
406
  // Don't bother warning if this was an eviction, since it's normal to issue
@@ -340,7 +413,7 @@ impl WorkflowTaskManager {
340
413
  return Ok(None);
341
414
  };
342
415
 
343
- // If the only command in the activation is a legacy query response, that means we need
416
+ // If the only command from the activation is a legacy query response, that means we need
344
417
  // to respond differently than a typical activation.
345
418
  let ret = if matches!(&commands.as_slice(),
346
419
  &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
@@ -363,7 +436,11 @@ impl WorkflowTaskManager {
363
436
  if let WFCommand::QueryResponse(qr) = commands.remove(i) {
364
437
  if qr.query_id == LEGACY_QUERY_ID {
365
438
  return Err(WorkflowUpdateError {
366
- source: WFMachinesError::Fatal("Legacy query activation response included other commands, this is not allowed and constitutes an error in the lang SDK".to_string()),
439
+ source: WFMachinesError::Fatal(
440
+ "Legacy query activation response included other commands, \
441
+ this is not allowed and constitutes an error in the lang SDK"
442
+ .to_string(),
443
+ ),
367
444
  run_id: run_id.to_string(),
368
445
  });
369
446
  }
@@ -374,25 +451,57 @@ impl WorkflowTaskManager {
374
451
  }
375
452
  }
376
453
 
377
- // Send commands from lang into the machines
378
- machine_mut!(self, run_id, |wfm: &mut WorkflowManager| {
379
- wfm.push_commands(commands).boxed()
380
- })?;
381
- // Check if the workflow run needs another activation and queue it up if there is one
382
- // by pushing it into the pending activations list
383
- let next_activation = machine_mut!(self, run_id, move |mgr: &mut WorkflowManager| mgr
384
- .get_next_activation()
385
- .boxed())?;
386
- if !next_activation.jobs.is_empty() {
387
- self.pending_activations.push(next_activation);
388
- let _ = self.pending_activations_notifier.send(true);
454
+ let (are_pending, server_cmds, local_activities, wft_timeout) = machine_mut!(
455
+ self,
456
+ run_id,
457
+ |wfm: &mut WorkflowManager| {
458
+ async move {
459
+ // Send commands from lang into the machines then check if the workflow run
460
+ // needs another activation and mark it if so
461
+ wfm.push_commands(commands).await?;
462
+ let are_pending = wfm.apply_next_task_if_ready().await?;
463
+ // We want to fetch the outgoing commands only after a next WFT may have
464
+ // been applied, as outgoing server commands may be affected.
465
+ let outgoing_cmds = wfm.get_server_commands();
466
+ let new_local_acts = wfm.drain_queued_local_activities();
467
+
468
+ let wft_timeout: Duration = wfm
469
+ .machines
470
+ .started_attrs()
471
+ .and_then(|attrs| attrs.workflow_task_timeout.clone())
472
+ .and_then(|tt| tt.try_into().ok())
473
+ .ok_or_else(|| {
474
+ WFMachinesError::Fatal(
475
+ "Workflow's start attribs were missing a well formed task timeout"
476
+ .to_string(),
477
+ )
478
+ })?;
479
+
480
+ Ok((are_pending, outgoing_cmds, new_local_acts, wft_timeout))
481
+ }
482
+ .boxed()
483
+ }
484
+ )?;
485
+
486
+ if are_pending {
487
+ self.needs_activation(run_id);
389
488
  }
390
- // We want to fetch the outgoing commands only after any new activation has been queued,
391
- // as doing so may have altered the outgoing commands.
392
- let server_cmds = machine_mut!(self, run_id, |wfm: &mut WorkflowManager| {
393
- async move { Ok(wfm.get_server_commands()) }.boxed()
394
- })?;
489
+ let immediate_resolutions = local_activity_request_sink(local_activities);
490
+ for resolution in immediate_resolutions {
491
+ self.notify_of_local_result(run_id, LocalResolution::LocalActivity(resolution))
492
+ .await?;
493
+ }
494
+
495
+ // The heartbeat deadline is 80% of the WFT timeout
496
+ let wft_heartbeat_deadline =
497
+ start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
498
+ // Wait on local activities to resolve if there are any, or for the WFT timeout to
499
+ // be about to expire, in which case we will need to send a WFT heartbeat.
500
+ let must_heartbeat = self
501
+ .wait_for_local_acts_or_heartbeat(run_id, wft_heartbeat_deadline)
502
+ .await;
395
503
  let is_query_playback = is_leg_query_task && query_responses.is_empty();
504
+
396
505
  // We only actually want to send commands back to the server if there are no more
397
506
  // pending activations and we are caught up on replay. We don't want to complete a wft
398
507
  // if we already saw the final event in the workflow, or if we are playing back for the
@@ -404,6 +513,8 @@ impl WorkflowTaskManager {
404
513
  Some(ServerCommandsWithWorkflowInfo {
405
514
  task_token,
406
515
  action: ActivationAction::WftComplete {
516
+ // TODO: Don't force if also sending complete execution cmd
517
+ force_new_wft: must_heartbeat,
407
518
  commands: server_cmds.commands,
408
519
  query_responses,
409
520
  },
@@ -416,6 +527,7 @@ impl WorkflowTaskManager {
416
527
  action: ActivationAction::WftComplete {
417
528
  commands: vec![],
418
529
  query_responses,
530
+ force_new_wft: false,
419
531
  },
420
532
  })
421
533
  }
@@ -425,7 +537,12 @@ impl WorkflowTaskManager {
425
537
 
426
538
  /// Record that an activation failed, returns enum that indicates if failure should be reported
427
539
  /// to the server
428
- pub(crate) fn failed_activation(&self, run_id: &str) -> FailedActivationOutcome {
540
+ pub(crate) fn failed_activation(
541
+ &self,
542
+ run_id: &str,
543
+ reason: EvictionReason,
544
+ failstr: String,
545
+ ) -> FailedActivationOutcome {
429
546
  let tt = if let Some(tt) = self
430
547
  .workflow_machines
431
548
  .get_task(run_id)
@@ -450,7 +567,7 @@ impl WorkflowTaskManager {
450
567
  } else {
451
568
  // Blow up any cached data associated with the workflow
452
569
  let should_report = self
453
- .request_eviction(run_id, "Activation failed")
570
+ .request_eviction(run_id, failstr, reason)
454
571
  .map_or(true, |attempt| attempt <= 1);
455
572
  if should_report {
456
573
  FailedActivationOutcome::Report(tt)
@@ -467,8 +584,8 @@ impl WorkflowTaskManager {
467
584
  async fn instantiate_or_update_workflow(
468
585
  &self,
469
586
  poll_wf_resp: ValidPollWFTQResponse,
470
- gateway: &GatewayRef,
471
- ) -> Result<(WorkflowTaskInfo, WfActivation), WorkflowUpdateError> {
587
+ gateway: Arc<dyn ServerGatewayApis + Send + Sync>,
588
+ ) -> Result<(WorkflowTaskInfo, WorkflowActivation), WorkflowUpdateError> {
472
589
  let run_id = poll_wf_resp.workflow_execution.run_id.clone();
473
590
 
474
591
  let wft_info = WorkflowTaskInfo {
@@ -486,12 +603,12 @@ impl WorkflowTaskManager {
486
603
  poll_wf_resp.workflow_execution.workflow_id.clone(),
487
604
  poll_wf_resp.workflow_execution.run_id.clone(),
488
605
  poll_wf_resp.next_page_token,
489
- gateway.gw.clone(),
606
+ gateway.clone(),
490
607
  ),
491
608
  poll_wf_resp.previous_started_event_id,
492
609
  ),
493
610
  &poll_wf_resp.workflow_execution.workflow_id,
494
- &gateway.options.namespace,
611
+ &gateway.get_options().namespace,
495
612
  &poll_wf_resp.workflow_type,
496
613
  &self.metrics,
497
614
  )
@@ -503,7 +620,7 @@ impl WorkflowTaskManager {
503
620
  let query_jobs = poll_wf_resp
504
621
  .query_requests
505
622
  .into_iter()
506
- .map(|q| wf_activation_job::Variant::QueryWorkflow(q).into());
623
+ .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
507
624
  activation.jobs.extend(query_jobs);
508
625
  }
509
626
 
@@ -513,13 +630,13 @@ impl WorkflowTaskManager {
513
630
  }
514
631
  }
515
632
 
516
- /// Called after every WFT completion or failure, updates outstanding task status & issues
517
- /// evictions if required. It is important this is called *after* reporting a successful WFT
518
- /// to server, as some replies (task not found) may require an eviction, which could be avoided
519
- /// if this is called too early.
633
+ /// Called after every workflow activation completion or failure, updates outstanding task
634
+ /// status & issues evictions if required. It is important this is called *after* potentially
635
+ /// reporting a successful WFT to server, as some replies (task not found) may require an
636
+ /// eviction, which could be avoided if this is called too early.
520
637
  ///
521
- /// Returns true if WFT is complete
522
- pub(crate) fn after_wft_report(&self, run_id: &str) -> bool {
638
+ /// Returns true if WFT was marked completed internally
639
+ pub(crate) fn after_wft_report(&self, run_id: &str, did_complete_wft: bool) -> bool {
523
640
  let mut just_evicted = false;
524
641
 
525
642
  if let Some(OutstandingActivation::Normal {
@@ -529,9 +646,10 @@ impl WorkflowTaskManager {
529
646
  self.evict_run(run_id);
530
647
  just_evicted = true;
531
648
  };
649
+
532
650
  // Workflows with no more pending activations (IE: They have completed a WFT) must be
533
651
  // removed from the outstanding tasks map
534
- if !self.pending_activations.has_pending(run_id) {
652
+ let retme = if !self.pending_activations.has_pending(run_id) {
535
653
  if !just_evicted {
536
654
  // Check if there was a legacy query which must be fulfilled, and if there is create
537
655
  // a new pending activation for it.
@@ -542,8 +660,8 @@ impl WorkflowTaskManager {
542
660
  {
543
661
  if let Some(query) = ot.legacy_query.take() {
544
662
  let na = create_query_activation(run_id.to_string(), [query]);
545
- self.pending_activations.push(na);
546
- let _ = self.pending_activations_notifier.send(true);
663
+ self.pending_legacy_queries.push(na);
664
+ self.pending_activations_notifier.notify_waiters();
547
665
  return false;
548
666
  }
549
667
  }
@@ -551,7 +669,11 @@ impl WorkflowTaskManager {
551
669
  // Evict run id if cache is full. Non-sticky will always evict.
552
670
  let maybe_evicted = self.cache_manager.lock().insert(run_id);
553
671
  if let Some(evicted_run_id) = maybe_evicted {
554
- self.request_eviction(&evicted_run_id, "Workflow cache full");
672
+ self.request_eviction(
673
+ &evicted_run_id,
674
+ "Workflow cache full",
675
+ EvictionReason::CacheFull,
676
+ );
555
677
  }
556
678
 
557
679
  // If there was a buffered poll response from the server, it is now ready to
@@ -563,9 +685,14 @@ impl WorkflowTaskManager {
563
685
 
564
686
  // The evict may or may not have already done this, but even when we aren't evicting
565
687
  // we want to clear the outstanding workflow task since it's now complete.
566
- return self.workflow_machines.complete_wft(run_id).is_some();
567
- }
568
- false
688
+ self.workflow_machines
689
+ .complete_wft(run_id, did_complete_wft)
690
+ .is_some()
691
+ } else {
692
+ false
693
+ };
694
+ self.on_activation_done(run_id);
695
+ retme
569
696
  }
570
697
 
571
698
  /// Must be called after *every* activation is replied to, regardless of whether or not we
@@ -573,20 +700,41 @@ impl WorkflowTaskManager {
573
700
  /// every activation we issue to lang has exactly one reply.
574
701
  ///
575
702
  /// Any subsequent action that needs to be taken will be created as a new activation
576
- pub(crate) fn on_activation_done(&self, run_id: &str) {
703
+ fn on_activation_done(&self, run_id: &str) {
577
704
  if self.workflow_machines.delete_activation(run_id).is_some() {
578
- let _ = self.pending_activations_notifier.send(true);
705
+ self.pending_activations_notifier.notify_waiters();
579
706
  }
580
707
  // It's possible the activation is already removed due to completing an eviction
581
708
  }
582
709
 
710
+ /// Let a workflow know that something we've been waiting locally on has resolved, like a local
711
+ /// activity or side effect
712
+ #[instrument(level = "debug", skip(self, resolved))]
713
+ pub(crate) async fn notify_of_local_result(
714
+ &self,
715
+ run_id: &str,
716
+ resolved: LocalResolution,
717
+ ) -> Result<(), WorkflowUpdateError> {
718
+ self.workflow_machines
719
+ .access_sync(run_id, |wfm: &mut WorkflowManager| {
720
+ wfm.notify_of_local_result(resolved)
721
+ })?
722
+ .map_err(|wfme| WorkflowUpdateError {
723
+ source: wfme,
724
+ run_id: run_id.to_string(),
725
+ })?;
726
+
727
+ self.needs_activation(run_id);
728
+ Ok(())
729
+ }
730
+
583
731
  fn make_buffered_poll_ready(&self, buffd: ValidPollWFTQResponse) {
584
732
  self.ready_buffered_wft.push(buffd);
585
733
  }
586
734
 
587
735
  fn insert_outstanding_activation(
588
736
  &self,
589
- act: &WfActivation,
737
+ act: &WorkflowActivation,
590
738
  ) -> Result<(), WorkflowMissingError> {
591
739
  let act_type = if act.is_legacy_query() {
592
740
  OutstandingActivation::LegacyQuery
@@ -619,4 +767,72 @@ impl WorkflowTaskManager {
619
767
  .map(OutstandingActivation::has_eviction)
620
768
  .unwrap_or_default()
621
769
  }
770
+
771
+ fn needs_activation(&self, run_id: &str) {
772
+ self.pending_activations.notify_needs_activation(run_id);
773
+ self.pending_activations_notifier.notify_waiters();
774
+ }
775
+
776
+ /// Wait for either all local activities to resolve, or for 80% of the WFT timeout, in which
777
+ /// case we will "heartbeat" by completing the WFT, even if there are no commands to send.
778
+ ///
779
+ /// Returns true if we must heartbeat
780
+ async fn wait_for_local_acts_or_heartbeat(
781
+ &self,
782
+ run_id: &str,
783
+ wft_heartbeat_deadline: Instant,
784
+ ) -> bool {
785
+ loop {
786
+ let la_count = self
787
+ .workflow_machines
788
+ .access_sync(run_id, |wfm| {
789
+ wfm.machines.outstanding_local_activity_count()
790
+ })
791
+ .expect("Workflow cannot go missing while we are waiting on LAs");
792
+ if la_count == 0 {
793
+ return false;
794
+ } else if Instant::now() >= wft_heartbeat_deadline {
795
+ // We must heartbeat b/c there are still pending local activities
796
+ return true;
797
+ }
798
+ // Since an LA resolution always results in a new pending activation, we can wait on
799
+ // notifications of that to re-check if they're all resolved.
800
+ let _ = timeout_at(
801
+ wft_heartbeat_deadline.into(),
802
+ self.pending_activations_notifier.notified(),
803
+ )
804
+ .await;
805
+ }
806
+ }
807
+ }
808
+
809
+ #[derive(Debug)]
810
+ pub(crate) struct WorkflowUpdateError {
811
+ /// Underlying workflow error
812
+ pub source: WFMachinesError,
813
+ /// The run id of the erring workflow
814
+ #[allow(dead_code)] // Useful in debug output
815
+ pub run_id: String,
816
+ }
817
+
818
+ impl WorkflowUpdateError {
819
+ pub fn evict_reason(&self) -> EvictionReason {
820
+ self.source.evict_reason()
821
+ }
822
+ }
823
+
824
+ impl From<WorkflowMissingError> for WorkflowUpdateError {
825
+ fn from(wme: WorkflowMissingError) -> Self {
826
+ Self {
827
+ source: WFMachinesError::Fatal("Workflow machines missing".to_string()),
828
+ run_id: wme.run_id,
829
+ }
830
+ }
831
+ }
832
+
833
+ /// The workflow machines were expected to be in the cache but were not
834
+ #[derive(Debug)]
835
+ pub(crate) struct WorkflowMissingError {
836
+ /// The run id of the erring workflow
837
+ pub run_id: String,
622
838
  }