@temporalio/core-bridge 0.13.0 → 0.16.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +203 -78
- package/Cargo.toml +3 -3
- package/index.d.ts +195 -0
- package/index.node +0 -0
- package/package.json +10 -6
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/{x86_64-pc-windows-gnu → aarch64-unknown-linux-gnu}/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/scripts/build.js +77 -34
- package/sdk-core/.buildkite/docker/Dockerfile +1 -1
- package/sdk-core/CODEOWNERS +1 -1
- package/sdk-core/Cargo.toml +6 -5
- package/sdk-core/fsm/Cargo.toml +1 -1
- package/sdk-core/fsm/rustfsm_procmacro/Cargo.toml +2 -2
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +8 -9
- package/sdk-core/fsm/rustfsm_procmacro/tests/trybuild/no_handle_conversions_require_into_fail.stderr +13 -7
- package/sdk-core/fsm/rustfsm_trait/Cargo.toml +2 -2
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +1 -1
- package/sdk-core/protos/local/activity_result.proto +10 -1
- package/sdk-core/protos/local/workflow_activation.proto +6 -3
- package/sdk-core/sdk-core-protos/Cargo.toml +4 -4
- package/sdk-core/sdk-core-protos/src/lib.rs +44 -49
- package/sdk-core/src/core_tests/activity_tasks.rs +5 -5
- package/sdk-core/src/core_tests/child_workflows.rs +55 -29
- package/sdk-core/src/core_tests/determinism.rs +19 -9
- package/sdk-core/src/core_tests/mod.rs +3 -3
- package/sdk-core/src/core_tests/retry.rs +96 -2
- package/sdk-core/src/core_tests/workers.rs +1 -1
- package/sdk-core/src/core_tests/workflow_tasks.rs +278 -4
- package/sdk-core/src/errors.rs +27 -44
- package/sdk-core/src/lib.rs +13 -3
- package/sdk-core/src/machines/activity_state_machine.rs +44 -5
- package/sdk-core/src/machines/child_workflow_state_machine.rs +31 -11
- package/sdk-core/src/machines/complete_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/continue_as_new_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/mod.rs +18 -23
- package/sdk-core/src/machines/patch_state_machine.rs +8 -8
- package/sdk-core/src/machines/signal_external_state_machine.rs +22 -1
- package/sdk-core/src/machines/timer_state_machine.rs +21 -3
- package/sdk-core/src/machines/transition_coverage.rs +3 -3
- package/sdk-core/src/machines/workflow_machines.rs +11 -11
- package/sdk-core/src/pending_activations.rs +27 -22
- package/sdk-core/src/pollers/gateway.rs +28 -7
- package/sdk-core/src/pollers/poll_buffer.rs +6 -5
- package/sdk-core/src/pollers/retry.rs +193 -136
- package/sdk-core/src/prototype_rust_sdk/workflow_context.rs +61 -46
- package/sdk-core/src/prototype_rust_sdk/workflow_future.rs +13 -12
- package/sdk-core/src/prototype_rust_sdk.rs +17 -23
- package/sdk-core/src/telemetry/metrics.rs +2 -4
- package/sdk-core/src/telemetry/mod.rs +6 -7
- package/sdk-core/src/test_help/canned_histories.rs +17 -93
- package/sdk-core/src/test_help/history_builder.rs +51 -2
- package/sdk-core/src/test_help/history_info.rs +2 -2
- package/sdk-core/src/test_help/mod.rs +21 -34
- package/sdk-core/src/worker/activities/activity_heartbeat_manager.rs +246 -138
- package/sdk-core/src/worker/activities.rs +47 -45
- package/sdk-core/src/worker/config.rs +11 -0
- package/sdk-core/src/worker/dispatcher.rs +5 -5
- package/sdk-core/src/worker/mod.rs +86 -56
- package/sdk-core/src/workflow/driven_workflow.rs +3 -3
- package/sdk-core/src/workflow/history_update.rs +1 -1
- package/sdk-core/src/workflow/mod.rs +2 -1
- package/sdk-core/src/workflow/workflow_tasks/cache_manager.rs +13 -17
- package/sdk-core/src/workflow/workflow_tasks/concurrency_manager.rs +10 -18
- package/sdk-core/src/workflow/workflow_tasks/mod.rs +72 -57
- package/sdk-core/test_utils/Cargo.toml +1 -1
- package/sdk-core/test_utils/src/lib.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +131 -2
- package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +49 -0
- package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests.rs +74 -47
- package/src/conversions.rs +17 -0
- package/src/errors.rs +0 -7
- package/src/lib.rs +0 -20
|
@@ -10,7 +10,6 @@ use activity_heartbeat_manager::ActivityHeartbeatManager;
|
|
|
10
10
|
use dashmap::DashMap;
|
|
11
11
|
use std::{
|
|
12
12
|
convert::TryInto,
|
|
13
|
-
ops::Div,
|
|
14
13
|
sync::Arc,
|
|
15
14
|
time::{Duration, Instant},
|
|
16
15
|
};
|
|
@@ -80,6 +79,9 @@ pub(crate) struct WorkerActivityTasks {
|
|
|
80
79
|
activities_semaphore: Semaphore,
|
|
81
80
|
|
|
82
81
|
metrics: MetricsContext,
|
|
82
|
+
|
|
83
|
+
max_heartbeat_throttle_interval: Duration,
|
|
84
|
+
default_heartbeat_throttle_interval: Duration,
|
|
83
85
|
}
|
|
84
86
|
|
|
85
87
|
impl WorkerActivityTasks {
|
|
@@ -88,6 +90,8 @@ impl WorkerActivityTasks {
|
|
|
88
90
|
poller: BoxedActPoller,
|
|
89
91
|
sg: Arc<impl ServerGatewayApis + Send + Sync + 'static + ?Sized>,
|
|
90
92
|
metrics: MetricsContext,
|
|
93
|
+
max_heartbeat_throttle_interval: Duration,
|
|
94
|
+
default_heartbeat_throttle_interval: Duration,
|
|
91
95
|
) -> Self {
|
|
92
96
|
Self {
|
|
93
97
|
heartbeat_manager: ActivityHeartbeatManager::new(sg),
|
|
@@ -95,12 +99,13 @@ impl WorkerActivityTasks {
|
|
|
95
99
|
poller,
|
|
96
100
|
activities_semaphore: Semaphore::new(max_activity_tasks),
|
|
97
101
|
metrics,
|
|
102
|
+
max_heartbeat_throttle_interval,
|
|
103
|
+
default_heartbeat_throttle_interval,
|
|
98
104
|
}
|
|
99
105
|
}
|
|
100
106
|
|
|
101
107
|
pub(crate) fn notify_shutdown(&self) {
|
|
102
108
|
self.poller.notify_shutdown();
|
|
103
|
-
self.heartbeat_manager.notify_shutdown();
|
|
104
109
|
}
|
|
105
110
|
|
|
106
111
|
pub(crate) async fn shutdown(self) {
|
|
@@ -171,17 +176,21 @@ impl WorkerActivityTasks {
|
|
|
171
176
|
status: activity_result::Status,
|
|
172
177
|
gateway: &(dyn ServerGatewayApis + Send + Sync),
|
|
173
178
|
) -> Result<(), CompleteActivityError> {
|
|
174
|
-
if let Some(act_info) = self.outstanding_activity_tasks.
|
|
179
|
+
if let Some((_, act_info)) = self.outstanding_activity_tasks.remove(&task_token) {
|
|
175
180
|
let act_metrics = self.metrics.with_new_attrs([
|
|
176
181
|
activity_type(act_info.activity_type.clone()),
|
|
177
182
|
workflow_type(act_info.workflow_type.clone()),
|
|
178
183
|
]);
|
|
179
184
|
act_metrics.act_execution_latency(act_info.start_time.elapsed());
|
|
185
|
+
self.activities_semaphore.add_permits(1);
|
|
186
|
+
self.heartbeat_manager.evict(task_token.clone());
|
|
187
|
+
let known_not_found = act_info.known_not_found;
|
|
188
|
+
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
180
189
|
|
|
181
190
|
// No need to report activities which we already know the server doesn't care about
|
|
182
|
-
|
|
183
|
-
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
191
|
+
if !known_not_found {
|
|
184
192
|
let maybe_net_err = match status {
|
|
193
|
+
activity_result::Status::WillCompleteAsync(_) => None,
|
|
185
194
|
activity_result::Status::Completed(ar::Success { result }) => gateway
|
|
186
195
|
.complete_activity_task(task_token.clone(), result.map(Into::into))
|
|
187
196
|
.await
|
|
@@ -194,19 +203,17 @@ impl WorkerActivityTasks {
|
|
|
194
203
|
.err()
|
|
195
204
|
}
|
|
196
205
|
activity_result::Status::Cancelled(ar::Cancellation { failure }) => {
|
|
197
|
-
let details =
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
None
|
|
209
|
-
}
|
|
206
|
+
let details = if let Some(Failure {
|
|
207
|
+
failure_info:
|
|
208
|
+
Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo { details })),
|
|
209
|
+
..
|
|
210
|
+
}) = failure
|
|
211
|
+
{
|
|
212
|
+
details
|
|
213
|
+
} else {
|
|
214
|
+
warn!(task_token = ? task_token,
|
|
215
|
+
"Expected activity cancelled status with CanceledFailureInfo");
|
|
216
|
+
None
|
|
210
217
|
};
|
|
211
218
|
gateway
|
|
212
219
|
.cancel_activity_task(task_token.clone(), details.map(Into::into))
|
|
@@ -214,37 +221,24 @@ impl WorkerActivityTasks {
|
|
|
214
221
|
.err()
|
|
215
222
|
}
|
|
216
223
|
};
|
|
217
|
-
|
|
218
|
-
|
|
224
|
+
|
|
225
|
+
if let Some(e) = maybe_net_err {
|
|
226
|
+
if e.code() == tonic::Code::NotFound {
|
|
219
227
|
warn!(task_token = ?task_token, details = ?e, "Activity not found on \
|
|
220
228
|
completion. This may happen if the activity has already been cancelled but \
|
|
221
229
|
completed anyway.");
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
}
|
|
227
|
-
} else {
|
|
228
|
-
true
|
|
230
|
+
} else {
|
|
231
|
+
return Err(e.into());
|
|
232
|
+
};
|
|
233
|
+
};
|
|
229
234
|
};
|
|
230
|
-
|
|
231
|
-
if should_remove
|
|
232
|
-
&& self
|
|
233
|
-
.outstanding_activity_tasks
|
|
234
|
-
.remove(&task_token)
|
|
235
|
-
.is_some()
|
|
236
|
-
{
|
|
237
|
-
self.activities_semaphore.add_permits(1);
|
|
238
|
-
self.heartbeat_manager.evict(task_token);
|
|
239
|
-
}
|
|
240
|
-
Ok(())
|
|
241
235
|
} else {
|
|
242
236
|
warn!(
|
|
243
237
|
"Attempted to complete activity task {} but we were not tracking it",
|
|
244
238
|
&task_token
|
|
245
239
|
);
|
|
246
|
-
Ok(())
|
|
247
240
|
}
|
|
241
|
+
Ok(())
|
|
248
242
|
}
|
|
249
243
|
|
|
250
244
|
/// Attempt to record an activity heartbeat
|
|
@@ -253,22 +247,30 @@ impl WorkerActivityTasks {
|
|
|
253
247
|
details: ActivityHeartbeat,
|
|
254
248
|
) -> Result<(), ActivityHeartbeatError> {
|
|
255
249
|
// TODO: Propagate these back as cancels. Silent fails is too nonobvious
|
|
256
|
-
let
|
|
250
|
+
let heartbeat_timeout: Duration = self
|
|
257
251
|
.outstanding_activity_tasks
|
|
258
252
|
.get(&TaskToken(details.task_token.clone()))
|
|
259
253
|
.ok_or(ActivityHeartbeatError::UnknownActivity)?
|
|
260
254
|
.heartbeat_timeout
|
|
261
255
|
.clone()
|
|
262
|
-
|
|
256
|
+
// We treat None as 0 (even though heartbeat_timeout is never set to None by the server)
|
|
257
|
+
.unwrap_or_default()
|
|
263
258
|
.try_into()
|
|
259
|
+
// This technically should never happen since prost duration should be directly mappable
|
|
260
|
+
// to std::time::Duration.
|
|
264
261
|
.or(Err(ActivityHeartbeatError::InvalidHeartbeatTimeout))?;
|
|
262
|
+
|
|
265
263
|
// There is a bug in the server that translates non-set heartbeat timeouts into 0 duration.
|
|
266
264
|
// That's why we treat 0 the same way as None, otherwise we wouldn't know which aggregation
|
|
267
265
|
// delay to use, and using 0 is not a good idea as SDK would hammer the server too hard.
|
|
268
|
-
if
|
|
269
|
-
|
|
270
|
-
}
|
|
271
|
-
|
|
266
|
+
let throttle_interval = if heartbeat_timeout.as_millis() == 0 {
|
|
267
|
+
self.default_heartbeat_throttle_interval
|
|
268
|
+
} else {
|
|
269
|
+
heartbeat_timeout.mul_f64(0.8)
|
|
270
|
+
};
|
|
271
|
+
let throttle_interval =
|
|
272
|
+
std::cmp::min(throttle_interval, self.max_heartbeat_throttle_interval);
|
|
273
|
+
self.heartbeat_manager.record(details, throttle_interval)
|
|
272
274
|
}
|
|
273
275
|
|
|
274
276
|
async fn next_pending_cancel_task(&self) -> Result<Option<ActivityTask>, PollActivityError> {
|
|
@@ -51,6 +51,17 @@ pub struct WorkerConfig {
|
|
|
51
51
|
/// and moved to the non-sticky queue where it may be picked up by any worker.
|
|
52
52
|
#[builder(default = "Duration::from_secs(10)")]
|
|
53
53
|
pub sticky_queue_schedule_to_start_timeout: Duration,
|
|
54
|
+
|
|
55
|
+
/// Longest interval for throttling activity heartbeats
|
|
56
|
+
#[builder(default = "Duration::from_secs(60)")]
|
|
57
|
+
pub max_heartbeat_throttle_interval: Duration,
|
|
58
|
+
|
|
59
|
+
/// Default interval for throttling activity heartbeats in case
|
|
60
|
+
/// `ActivityOptions.heartbeat_timeout` is unset.
|
|
61
|
+
/// When the timeout *is* set in the `ActivityOptions`, throttling is set to
|
|
62
|
+
/// `heartbeat_timeout * 0.8`.
|
|
63
|
+
#[builder(default = "Duration::from_secs(30)")]
|
|
64
|
+
pub default_heartbeat_throttle_interval: Duration,
|
|
54
65
|
}
|
|
55
66
|
|
|
56
67
|
impl WorkerConfigBuilder {
|
|
@@ -6,7 +6,7 @@ use crate::{
|
|
|
6
6
|
};
|
|
7
7
|
use arc_swap::ArcSwap;
|
|
8
8
|
use futures::future::join_all;
|
|
9
|
-
use std::{collections::HashMap, ops::Deref, sync::Arc};
|
|
9
|
+
use std::{collections::HashMap, ops::Deref, option::Option, sync::Arc};
|
|
10
10
|
use tokio::sync::Notify;
|
|
11
11
|
|
|
12
12
|
/// Allows access to workers by task queue name
|
|
@@ -40,7 +40,7 @@ impl WorkerDispatcher {
|
|
|
40
40
|
.workers
|
|
41
41
|
.load()
|
|
42
42
|
.get(&tq)
|
|
43
|
-
.map(
|
|
43
|
+
.map(Option::is_some)
|
|
44
44
|
.unwrap_or_default()
|
|
45
45
|
{
|
|
46
46
|
return Err(WorkerRegistrationError::WorkerAlreadyRegisteredForQueue(tq));
|
|
@@ -77,7 +77,7 @@ impl WorkerDispatcher {
|
|
|
77
77
|
self.workers.rcu(|map| {
|
|
78
78
|
let mut map = HashMap::clone(map);
|
|
79
79
|
if maybe_worker.is_none() {
|
|
80
|
-
maybe_worker = map.get_mut(task_queue).and_then(
|
|
80
|
+
maybe_worker = map.get_mut(task_queue).and_then(Option::take);
|
|
81
81
|
}
|
|
82
82
|
map
|
|
83
83
|
});
|
|
@@ -149,7 +149,7 @@ impl Deref for WorkerRefCt {
|
|
|
149
149
|
type Target = Worker;
|
|
150
150
|
|
|
151
151
|
fn deref(&self) -> &Self::Target {
|
|
152
|
-
self.inner.
|
|
152
|
+
self.inner.as_deref().expect("Must exist")
|
|
153
153
|
}
|
|
154
154
|
}
|
|
155
155
|
|
|
@@ -161,7 +161,7 @@ impl Drop for WorkerRefCt {
|
|
|
161
161
|
Some(arc) => {
|
|
162
162
|
// We wait until 2 rather than 1 because we ourselves still have an Arc
|
|
163
163
|
if Arc::strong_count(arc) == 2 {
|
|
164
|
-
self.notify.notify_one()
|
|
164
|
+
self.notify.notify_one();
|
|
165
165
|
}
|
|
166
166
|
}
|
|
167
167
|
};
|
|
@@ -6,7 +6,7 @@ pub use crate::worker::config::{WorkerConfig, WorkerConfigBuilder};
|
|
|
6
6
|
pub(crate) use dispatcher::WorkerDispatcher;
|
|
7
7
|
|
|
8
8
|
use crate::{
|
|
9
|
-
errors::
|
|
9
|
+
errors::CompleteWfError,
|
|
10
10
|
machines::{EmptyWorkflowCommandErr, WFMachinesError},
|
|
11
11
|
pollers::{
|
|
12
12
|
new_activity_task_buffer, new_workflow_task_buffer, BoxedActPoller, BoxedWFPoller,
|
|
@@ -98,6 +98,7 @@ impl Worker {
|
|
|
98
98
|
let mut wf_task_poll_buffer = new_workflow_task_buffer(
|
|
99
99
|
sg.gw.clone(),
|
|
100
100
|
config.task_queue.clone(),
|
|
101
|
+
false,
|
|
101
102
|
max_nonsticky_polls,
|
|
102
103
|
max_nonsticky_polls * 2,
|
|
103
104
|
);
|
|
@@ -107,6 +108,7 @@ impl Worker {
|
|
|
107
108
|
let mut sp = new_workflow_task_buffer(
|
|
108
109
|
sg.gw.clone(),
|
|
109
110
|
sqn.clone(),
|
|
111
|
+
true,
|
|
110
112
|
max_sticky_polls,
|
|
111
113
|
max_sticky_polls * 2,
|
|
112
114
|
);
|
|
@@ -172,6 +174,8 @@ impl Worker {
|
|
|
172
174
|
ap,
|
|
173
175
|
sg.gw.clone(),
|
|
174
176
|
metrics.clone(),
|
|
177
|
+
config.max_heartbeat_throttle_interval,
|
|
178
|
+
config.default_heartbeat_throttle_interval,
|
|
175
179
|
)
|
|
176
180
|
}),
|
|
177
181
|
workflows_semaphore: Semaphore::new(config.max_outstanding_workflow_tasks),
|
|
@@ -219,6 +223,11 @@ impl Worker {
|
|
|
219
223
|
self.wft_manager.outstanding_wft()
|
|
220
224
|
}
|
|
221
225
|
|
|
226
|
+
#[cfg(test)]
|
|
227
|
+
pub(crate) fn available_wft_permits(&self) -> usize {
|
|
228
|
+
self.workflows_semaphore.available_permits()
|
|
229
|
+
}
|
|
230
|
+
|
|
222
231
|
/// Wait until not at the outstanding activity limit, and then poll this worker's task queue for
|
|
223
232
|
/// new activities.
|
|
224
233
|
///
|
|
@@ -246,7 +255,7 @@ impl Worker {
|
|
|
246
255
|
if let Some(at_mgr) = self.at_task_mgr.as_ref() {
|
|
247
256
|
let tt = details.task_token.clone();
|
|
248
257
|
if let Err(e) = at_mgr.record_heartbeat(details) {
|
|
249
|
-
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.")
|
|
258
|
+
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.");
|
|
250
259
|
}
|
|
251
260
|
}
|
|
252
261
|
}
|
|
@@ -275,7 +284,7 @@ impl Worker {
|
|
|
275
284
|
// We must first check if there are pending workflow activations for workflows that are
|
|
276
285
|
// currently replaying or otherwise need immediate jobs, and issue those before
|
|
277
286
|
// bothering the server.
|
|
278
|
-
if let Some(pa) = self.wft_manager.next_pending_activation()
|
|
287
|
+
if let Some(pa) = self.wft_manager.next_pending_activation() {
|
|
279
288
|
debug!(activation=%pa, "Sending pending activation to lang");
|
|
280
289
|
return Ok(pa);
|
|
281
290
|
}
|
|
@@ -284,7 +293,7 @@ impl Worker {
|
|
|
284
293
|
// activations, since there may be an eviction etc for whatever run is popped here.
|
|
285
294
|
if let Some(buff_wft) = self.wft_manager.next_buffered_poll() {
|
|
286
295
|
match self.apply_server_work(buff_wft).await? {
|
|
287
|
-
|
|
296
|
+
Some(a) => return Ok(a),
|
|
288
297
|
_ => continue,
|
|
289
298
|
}
|
|
290
299
|
}
|
|
@@ -304,14 +313,8 @@ impl Worker {
|
|
|
304
313
|
|
|
305
314
|
if let Some(work) = selected_f {
|
|
306
315
|
self.metrics.wf_tq_poll_ok();
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
NewWfTaskOutcome::TaskBuffered => {
|
|
310
|
-
// If the task was buffered, it's not actually outstanding, so we can
|
|
311
|
-
// immediately return a permit.
|
|
312
|
-
self.return_workflow_task_permit();
|
|
313
|
-
}
|
|
314
|
-
_ => {}
|
|
316
|
+
if let Some(a) = self.apply_server_work(work).await? {
|
|
317
|
+
return Ok(a);
|
|
315
318
|
}
|
|
316
319
|
}
|
|
317
320
|
|
|
@@ -326,7 +329,7 @@ impl Worker {
|
|
|
326
329
|
completion: WfActivationCompletion,
|
|
327
330
|
) -> Result<(), CompleteWfError> {
|
|
328
331
|
let wfstatus = completion.status;
|
|
329
|
-
let
|
|
332
|
+
let did_complete_wft = match wfstatus {
|
|
330
333
|
Some(wf_activation_completion::Status::Successful(success)) => {
|
|
331
334
|
self.wf_activation_success(&completion.run_id, success)
|
|
332
335
|
.await
|
|
@@ -338,11 +341,9 @@ impl Worker {
|
|
|
338
341
|
reason: "Workflow completion had empty status field".to_owned(),
|
|
339
342
|
completion: None,
|
|
340
343
|
}),
|
|
341
|
-
}
|
|
342
|
-
self.
|
|
343
|
-
|
|
344
|
-
self.maybe_notify_wtfs_drained();
|
|
345
|
-
r
|
|
344
|
+
}?;
|
|
345
|
+
self.after_workflow_activation(&completion.run_id, did_complete_wft);
|
|
346
|
+
Ok(())
|
|
346
347
|
}
|
|
347
348
|
|
|
348
349
|
fn maybe_notify_wtfs_drained(&self) {
|
|
@@ -355,11 +356,11 @@ impl Worker {
|
|
|
355
356
|
|
|
356
357
|
/// Tell the worker a workflow task has completed, for tracking max outstanding WFTs
|
|
357
358
|
pub(crate) fn return_workflow_task_permit(&self) {
|
|
358
|
-
self.workflows_semaphore.add_permits(1)
|
|
359
|
+
self.workflows_semaphore.add_permits(1);
|
|
359
360
|
}
|
|
360
361
|
|
|
361
|
-
pub(crate) fn request_wf_eviction(&self, run_id: &str) {
|
|
362
|
-
self.wft_manager.request_eviction(run_id);
|
|
362
|
+
pub(crate) fn request_wf_eviction(&self, run_id: &str, reason: impl Into<String>) {
|
|
363
|
+
self.wft_manager.request_eviction(run_id, reason);
|
|
363
364
|
}
|
|
364
365
|
|
|
365
366
|
/// Resolves with WFT poll response or `PollWfError::ShutDown` if WFTs have been drained
|
|
@@ -443,18 +444,24 @@ impl Worker {
|
|
|
443
444
|
async fn apply_server_work(
|
|
444
445
|
&self,
|
|
445
446
|
work: ValidPollWFTQResponse,
|
|
446
|
-
) -> Result<
|
|
447
|
+
) -> Result<Option<WfActivation>, PollWfError> {
|
|
447
448
|
let we = work.workflow_execution.clone();
|
|
448
449
|
let tt = work.task_token.clone();
|
|
449
450
|
let res = self
|
|
450
451
|
.wft_manager
|
|
451
452
|
.apply_new_poll_resp(work, &self.server_gateway)
|
|
452
|
-
.await
|
|
453
|
-
match
|
|
453
|
+
.await;
|
|
454
|
+
Ok(match res {
|
|
454
455
|
NewWfTaskOutcome::IssueActivation(a) => {
|
|
455
456
|
debug!(activation=%a, "Sending activation to lang");
|
|
457
|
+
Some(a)
|
|
458
|
+
}
|
|
459
|
+
NewWfTaskOutcome::TaskBuffered => {
|
|
460
|
+
// If the task was buffered, it's not actually outstanding, so we can
|
|
461
|
+
// immediately return a permit.
|
|
462
|
+
self.return_workflow_task_permit();
|
|
463
|
+
None
|
|
456
464
|
}
|
|
457
|
-
NewWfTaskOutcome::TaskBuffered => {}
|
|
458
465
|
NewWfTaskOutcome::Autocomplete => {
|
|
459
466
|
debug!(workflow_execution=?we,
|
|
460
467
|
"No work for lang to perform after polling server. Sending autocomplete.");
|
|
@@ -464,6 +471,7 @@ impl Worker {
|
|
|
464
471
|
status: Some(workflow_completion::Success::from_variants(vec![]).into()),
|
|
465
472
|
})
|
|
466
473
|
.await?;
|
|
474
|
+
None
|
|
467
475
|
}
|
|
468
476
|
NewWfTaskOutcome::CacheMiss => {
|
|
469
477
|
debug!(workflow_execution=?we, "Unable to process workflow task with partial \
|
|
@@ -480,17 +488,27 @@ impl Worker {
|
|
|
480
488
|
}),
|
|
481
489
|
)
|
|
482
490
|
.await?;
|
|
491
|
+
None
|
|
483
492
|
}
|
|
484
|
-
|
|
485
|
-
|
|
493
|
+
NewWfTaskOutcome::Evict(e) => {
|
|
494
|
+
warn!(error=?e, run_id=%we.run_id, "Error while applying poll response to workflow");
|
|
495
|
+
self.request_wf_eviction(
|
|
496
|
+
&we.run_id,
|
|
497
|
+
format!("Error while applying poll response to workflow: {:?}", e),
|
|
498
|
+
);
|
|
499
|
+
None
|
|
500
|
+
}
|
|
501
|
+
})
|
|
486
502
|
}
|
|
487
503
|
|
|
488
|
-
/// Handle a successful workflow
|
|
504
|
+
/// Handle a successful workflow activation
|
|
505
|
+
///
|
|
506
|
+
/// Returns true if we actually reported WFT completion to server (success or failure)
|
|
489
507
|
async fn wf_activation_success(
|
|
490
508
|
&self,
|
|
491
509
|
run_id: &str,
|
|
492
510
|
success: workflow_completion::Success,
|
|
493
|
-
) -> Result<
|
|
511
|
+
) -> Result<bool, CompleteWfError> {
|
|
494
512
|
// Convert to wf commands
|
|
495
513
|
let cmds = success
|
|
496
514
|
.commands
|
|
@@ -534,6 +552,7 @@ impl Worker {
|
|
|
534
552
|
.await
|
|
535
553
|
})
|
|
536
554
|
.await?;
|
|
555
|
+
Ok(true)
|
|
537
556
|
}
|
|
538
557
|
Ok(Some(ServerCommandsWithWorkflowInfo {
|
|
539
558
|
task_token,
|
|
@@ -543,8 +562,9 @@ impl Worker {
|
|
|
543
562
|
self.server_gateway
|
|
544
563
|
.respond_legacy_query(task_token, result)
|
|
545
564
|
.await?;
|
|
565
|
+
Ok(true)
|
|
546
566
|
}
|
|
547
|
-
Ok(None) =>
|
|
567
|
+
Ok(None) => Ok(false),
|
|
548
568
|
Err(update_err) => {
|
|
549
569
|
// Automatically fail the workflow task in the event we couldn't update machines
|
|
550
570
|
let fail_cause = if matches!(&update_err.source, WFMachinesError::Nondeterminism(_))
|
|
@@ -554,35 +574,42 @@ impl Worker {
|
|
|
554
574
|
WorkflowTaskFailedCause::Unspecified
|
|
555
575
|
};
|
|
556
576
|
|
|
577
|
+
warn!(run_id, error=?update_err, "Failing workflow task");
|
|
578
|
+
|
|
557
579
|
if let Some(ref tt) = update_err.task_token {
|
|
580
|
+
let wft_fail_str = format!("{:?}", update_err);
|
|
558
581
|
self.handle_wft_reporting_errs(run_id, || async {
|
|
559
582
|
self.server_gateway
|
|
560
583
|
.fail_workflow_task(
|
|
561
584
|
tt.clone(),
|
|
562
585
|
fail_cause,
|
|
563
|
-
Some(Failure::application_failure(
|
|
564
|
-
format!("{:?}", update_err),
|
|
565
|
-
false,
|
|
566
|
-
)),
|
|
586
|
+
Some(Failure::application_failure(wft_fail_str.clone(), false)),
|
|
567
587
|
)
|
|
568
588
|
.await
|
|
569
589
|
})
|
|
570
590
|
.await?;
|
|
591
|
+
// We must evict the workflow since we've failed a WFT
|
|
592
|
+
self.request_wf_eviction(
|
|
593
|
+
run_id,
|
|
594
|
+
format!("Workflow task failure: {}", wft_fail_str),
|
|
595
|
+
);
|
|
596
|
+
Ok(true)
|
|
597
|
+
} else {
|
|
598
|
+
Ok(false)
|
|
571
599
|
}
|
|
572
|
-
return Err(update_err.into());
|
|
573
600
|
}
|
|
574
601
|
}
|
|
575
|
-
|
|
576
|
-
Ok(())
|
|
577
602
|
}
|
|
578
603
|
|
|
579
604
|
/// Handle a failed workflow completion
|
|
605
|
+
///
|
|
606
|
+
/// Returns true if we actually reported WFT completion to server
|
|
580
607
|
async fn wf_activation_failed(
|
|
581
608
|
&self,
|
|
582
609
|
run_id: &str,
|
|
583
610
|
failure: workflow_completion::Failure,
|
|
584
|
-
) -> Result<
|
|
585
|
-
match self.wft_manager.failed_activation(run_id) {
|
|
611
|
+
) -> Result<bool, CompleteWfError> {
|
|
612
|
+
Ok(match self.wft_manager.failed_activation(run_id) {
|
|
586
613
|
FailedActivationOutcome::Report(tt) => {
|
|
587
614
|
self.handle_wft_reporting_errs(run_id, || async {
|
|
588
615
|
self.server_gateway
|
|
@@ -594,23 +621,25 @@ impl Worker {
|
|
|
594
621
|
.await
|
|
595
622
|
})
|
|
596
623
|
.await?;
|
|
624
|
+
true
|
|
597
625
|
}
|
|
598
626
|
FailedActivationOutcome::ReportLegacyQueryFailure(task_token) => {
|
|
599
627
|
self.server_gateway
|
|
600
628
|
.respond_legacy_query(task_token, legacy_query_failure(failure))
|
|
601
629
|
.await?;
|
|
630
|
+
true
|
|
602
631
|
}
|
|
603
|
-
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
Ok(())
|
|
632
|
+
FailedActivationOutcome::NoReport => false,
|
|
633
|
+
})
|
|
607
634
|
}
|
|
608
635
|
|
|
609
|
-
fn
|
|
610
|
-
|
|
636
|
+
fn after_workflow_activation(&self, run_id: &str, did_complete_wft: bool) {
|
|
637
|
+
self.wft_manager.after_wft_report(run_id);
|
|
638
|
+
if did_complete_wft {
|
|
611
639
|
self.return_workflow_task_permit();
|
|
612
|
-
}
|
|
613
|
-
|
|
640
|
+
}
|
|
641
|
+
self.wft_manager.on_activation_done(run_id);
|
|
642
|
+
self.maybe_notify_wtfs_drained();
|
|
614
643
|
}
|
|
615
644
|
|
|
616
645
|
/// Handle server errors from either completing or failing a workflow task. Returns any errors
|
|
@@ -630,11 +659,12 @@ impl Worker {
|
|
|
630
659
|
// Silence unhandled command errors since the lang SDK cannot do anything about
|
|
631
660
|
// them besides poll again, which it will do anyway.
|
|
632
661
|
tonic::Code::InvalidArgument if err.message() == "UnhandledCommand" => {
|
|
633
|
-
warn!("Unhandled command response when completing
|
|
662
|
+
warn!(error = %err, "Unhandled command response when completing");
|
|
663
|
+
should_evict = true;
|
|
634
664
|
Ok(())
|
|
635
665
|
}
|
|
636
666
|
tonic::Code::NotFound => {
|
|
637
|
-
warn!("Task not found when completing
|
|
667
|
+
warn!(error = %err, "Task not found when completing");
|
|
638
668
|
should_evict = true;
|
|
639
669
|
Ok(())
|
|
640
670
|
}
|
|
@@ -644,7 +674,7 @@ impl Worker {
|
|
|
644
674
|
_ => Ok(()),
|
|
645
675
|
};
|
|
646
676
|
if should_evict {
|
|
647
|
-
self.
|
|
677
|
+
self.request_wf_eviction(run_id, "Error reporting WFT to server");
|
|
648
678
|
}
|
|
649
679
|
res.map_err(Into::into)
|
|
650
680
|
}
|
|
@@ -703,7 +733,7 @@ mod tests {
|
|
|
703
733
|
|
|
704
734
|
let cfg = WorkerConfigBuilder::default()
|
|
705
735
|
.task_queue("whatever")
|
|
706
|
-
.max_outstanding_activities(
|
|
736
|
+
.max_outstanding_activities(5_usize)
|
|
707
737
|
.build()
|
|
708
738
|
.unwrap();
|
|
709
739
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -716,12 +746,12 @@ mod tests {
|
|
|
716
746
|
let mut mock_gateway = MockServerGatewayApis::new();
|
|
717
747
|
mock_gateway
|
|
718
748
|
.expect_poll_workflow_task()
|
|
719
|
-
.returning(|_| Ok(PollWorkflowTaskQueueResponse::default()));
|
|
749
|
+
.returning(|_, _| Ok(PollWorkflowTaskQueueResponse::default()));
|
|
720
750
|
let gwref = GatewayRef::new(Arc::new(mock_gateway), fake_sg_opts());
|
|
721
751
|
|
|
722
752
|
let cfg = WorkerConfigBuilder::default()
|
|
723
753
|
.task_queue("whatever")
|
|
724
|
-
.max_outstanding_workflow_tasks(
|
|
754
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
725
755
|
.build()
|
|
726
756
|
.unwrap();
|
|
727
757
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -739,7 +769,7 @@ mod tests {
|
|
|
739
769
|
|
|
740
770
|
let cfg = WorkerConfigBuilder::default()
|
|
741
771
|
.task_queue("whatever")
|
|
742
|
-
.max_outstanding_activities(
|
|
772
|
+
.max_outstanding_activities(5_usize)
|
|
743
773
|
.build()
|
|
744
774
|
.unwrap();
|
|
745
775
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -752,12 +782,12 @@ mod tests {
|
|
|
752
782
|
let mut mock_gateway = MockServerGatewayApis::new();
|
|
753
783
|
mock_gateway
|
|
754
784
|
.expect_poll_workflow_task()
|
|
755
|
-
.returning(|_| Err(tonic::Status::internal("ahhh")));
|
|
785
|
+
.returning(|_, _| Err(tonic::Status::internal("ahhh")));
|
|
756
786
|
let gwref = GatewayRef::new(Arc::new(mock_gateway), fake_sg_opts());
|
|
757
787
|
|
|
758
788
|
let cfg = WorkerConfigBuilder::default()
|
|
759
789
|
.task_queue("whatever")
|
|
760
|
-
.max_outstanding_workflow_tasks(
|
|
790
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
761
791
|
.build()
|
|
762
792
|
.unwrap();
|
|
763
793
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -33,7 +33,7 @@ impl DrivenWorkflow {
|
|
|
33
33
|
/// Start the workflow
|
|
34
34
|
pub fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) {
|
|
35
35
|
debug!(run_id = %attribs.original_execution_run_id, "Driven WF start");
|
|
36
|
-
self.started_attrs = Some(attribs)
|
|
36
|
+
self.started_attrs = Some(attribs);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
/// Enqueue a new job to be sent to the driven workflow
|
|
@@ -51,12 +51,12 @@ impl DrivenWorkflow {
|
|
|
51
51
|
|
|
52
52
|
/// Signal the workflow
|
|
53
53
|
pub fn signal(&mut self, signal: SignalWorkflow) {
|
|
54
|
-
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal))
|
|
54
|
+
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal));
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
/// Cancel the workflow
|
|
58
58
|
pub fn cancel(&mut self, attribs: CancelWorkflow) {
|
|
59
|
-
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs))
|
|
59
|
+
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs));
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
@@ -62,7 +62,7 @@ impl WorkflowManager {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
#[cfg(test)]
|
|
65
|
-
pub fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
65
|
+
pub const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
66
66
|
Self {
|
|
67
67
|
machines: workflow_machines,
|
|
68
68
|
command_sink: None,
|
|
@@ -277,6 +277,7 @@ pub mod managed_wf {
|
|
|
277
277
|
// Send an eviction to ensure wf exits if it has not finished (ex: feeding partial hist)
|
|
278
278
|
let _ = self.activation_tx.send(create_evict_activation(
|
|
279
279
|
"not actually important".to_string(),
|
|
280
|
+
"force shutdown".to_string(),
|
|
280
281
|
));
|
|
281
282
|
self.future_handle.take().unwrap().await.unwrap()
|
|
282
283
|
}
|