@temporalio/core-bridge 0.14.0 → 0.16.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +162 -38
- package/Cargo.toml +3 -3
- package/index.d.ts +14 -1
- package/index.node +0 -0
- package/package.json +8 -5
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/{x86_64-pc-windows-gnu → aarch64-unknown-linux-gnu}/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/scripts/build.js +77 -34
- package/sdk-core/.buildkite/docker/Dockerfile +1 -1
- package/sdk-core/Cargo.toml +6 -5
- package/sdk-core/fsm/Cargo.toml +1 -1
- package/sdk-core/fsm/rustfsm_procmacro/Cargo.toml +2 -2
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +8 -9
- package/sdk-core/fsm/rustfsm_procmacro/tests/trybuild/no_handle_conversions_require_into_fail.stderr +13 -7
- package/sdk-core/fsm/rustfsm_trait/Cargo.toml +2 -2
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +1 -1
- package/sdk-core/protos/local/workflow_activation.proto +6 -3
- package/sdk-core/sdk-core-protos/Cargo.toml +4 -4
- package/sdk-core/sdk-core-protos/src/lib.rs +38 -50
- package/sdk-core/src/core_tests/activity_tasks.rs +5 -5
- package/sdk-core/src/core_tests/child_workflows.rs +55 -29
- package/sdk-core/src/core_tests/determinism.rs +19 -9
- package/sdk-core/src/core_tests/mod.rs +3 -3
- package/sdk-core/src/core_tests/retry.rs +14 -8
- package/sdk-core/src/core_tests/workers.rs +1 -1
- package/sdk-core/src/core_tests/workflow_tasks.rs +347 -4
- package/sdk-core/src/errors.rs +27 -44
- package/sdk-core/src/lib.rs +13 -3
- package/sdk-core/src/machines/activity_state_machine.rs +44 -5
- package/sdk-core/src/machines/child_workflow_state_machine.rs +31 -11
- package/sdk-core/src/machines/complete_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/continue_as_new_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/mod.rs +18 -23
- package/sdk-core/src/machines/patch_state_machine.rs +8 -8
- package/sdk-core/src/machines/signal_external_state_machine.rs +22 -1
- package/sdk-core/src/machines/timer_state_machine.rs +21 -3
- package/sdk-core/src/machines/transition_coverage.rs +3 -3
- package/sdk-core/src/machines/workflow_machines.rs +11 -11
- package/sdk-core/src/pending_activations.rs +27 -22
- package/sdk-core/src/pollers/gateway.rs +15 -7
- package/sdk-core/src/pollers/poll_buffer.rs +6 -5
- package/sdk-core/src/pollers/retry.rs +153 -120
- package/sdk-core/src/prototype_rust_sdk/workflow_context.rs +61 -46
- package/sdk-core/src/prototype_rust_sdk/workflow_future.rs +13 -12
- package/sdk-core/src/prototype_rust_sdk.rs +17 -23
- package/sdk-core/src/telemetry/metrics.rs +2 -4
- package/sdk-core/src/telemetry/mod.rs +6 -7
- package/sdk-core/src/test_help/canned_histories.rs +17 -93
- package/sdk-core/src/test_help/history_builder.rs +61 -2
- package/sdk-core/src/test_help/history_info.rs +21 -2
- package/sdk-core/src/test_help/mod.rs +26 -34
- package/sdk-core/src/worker/activities/activity_heartbeat_manager.rs +246 -138
- package/sdk-core/src/worker/activities.rs +46 -45
- package/sdk-core/src/worker/config.rs +11 -0
- package/sdk-core/src/worker/dispatcher.rs +5 -5
- package/sdk-core/src/worker/mod.rs +86 -56
- package/sdk-core/src/workflow/driven_workflow.rs +3 -3
- package/sdk-core/src/workflow/history_update.rs +1 -1
- package/sdk-core/src/workflow/mod.rs +2 -1
- package/sdk-core/src/workflow/workflow_tasks/cache_manager.rs +13 -17
- package/sdk-core/src/workflow/workflow_tasks/concurrency_manager.rs +10 -18
- package/sdk-core/src/workflow/workflow_tasks/mod.rs +72 -57
- package/sdk-core/test_utils/Cargo.toml +1 -1
- package/sdk-core/test_utils/src/lib.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +61 -1
- package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +49 -0
- package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests.rs +1 -0
- package/src/conversions.rs +17 -0
- package/src/errors.rs +0 -7
- package/src/lib.rs +0 -20
|
@@ -10,7 +10,6 @@ use activity_heartbeat_manager::ActivityHeartbeatManager;
|
|
|
10
10
|
use dashmap::DashMap;
|
|
11
11
|
use std::{
|
|
12
12
|
convert::TryInto,
|
|
13
|
-
ops::Div,
|
|
14
13
|
sync::Arc,
|
|
15
14
|
time::{Duration, Instant},
|
|
16
15
|
};
|
|
@@ -80,6 +79,9 @@ pub(crate) struct WorkerActivityTasks {
|
|
|
80
79
|
activities_semaphore: Semaphore,
|
|
81
80
|
|
|
82
81
|
metrics: MetricsContext,
|
|
82
|
+
|
|
83
|
+
max_heartbeat_throttle_interval: Duration,
|
|
84
|
+
default_heartbeat_throttle_interval: Duration,
|
|
83
85
|
}
|
|
84
86
|
|
|
85
87
|
impl WorkerActivityTasks {
|
|
@@ -88,6 +90,8 @@ impl WorkerActivityTasks {
|
|
|
88
90
|
poller: BoxedActPoller,
|
|
89
91
|
sg: Arc<impl ServerGatewayApis + Send + Sync + 'static + ?Sized>,
|
|
90
92
|
metrics: MetricsContext,
|
|
93
|
+
max_heartbeat_throttle_interval: Duration,
|
|
94
|
+
default_heartbeat_throttle_interval: Duration,
|
|
91
95
|
) -> Self {
|
|
92
96
|
Self {
|
|
93
97
|
heartbeat_manager: ActivityHeartbeatManager::new(sg),
|
|
@@ -95,12 +99,13 @@ impl WorkerActivityTasks {
|
|
|
95
99
|
poller,
|
|
96
100
|
activities_semaphore: Semaphore::new(max_activity_tasks),
|
|
97
101
|
metrics,
|
|
102
|
+
max_heartbeat_throttle_interval,
|
|
103
|
+
default_heartbeat_throttle_interval,
|
|
98
104
|
}
|
|
99
105
|
}
|
|
100
106
|
|
|
101
107
|
pub(crate) fn notify_shutdown(&self) {
|
|
102
108
|
self.poller.notify_shutdown();
|
|
103
|
-
self.heartbeat_manager.notify_shutdown();
|
|
104
109
|
}
|
|
105
110
|
|
|
106
111
|
pub(crate) async fn shutdown(self) {
|
|
@@ -171,16 +176,19 @@ impl WorkerActivityTasks {
|
|
|
171
176
|
status: activity_result::Status,
|
|
172
177
|
gateway: &(dyn ServerGatewayApis + Send + Sync),
|
|
173
178
|
) -> Result<(), CompleteActivityError> {
|
|
174
|
-
if let Some(act_info) = self.outstanding_activity_tasks.
|
|
179
|
+
if let Some((_, act_info)) = self.outstanding_activity_tasks.remove(&task_token) {
|
|
175
180
|
let act_metrics = self.metrics.with_new_attrs([
|
|
176
181
|
activity_type(act_info.activity_type.clone()),
|
|
177
182
|
workflow_type(act_info.workflow_type.clone()),
|
|
178
183
|
]);
|
|
179
184
|
act_metrics.act_execution_latency(act_info.start_time.elapsed());
|
|
185
|
+
self.activities_semaphore.add_permits(1);
|
|
186
|
+
self.heartbeat_manager.evict(task_token.clone());
|
|
187
|
+
let known_not_found = act_info.known_not_found;
|
|
188
|
+
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
180
189
|
|
|
181
190
|
// No need to report activities which we already know the server doesn't care about
|
|
182
|
-
|
|
183
|
-
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
191
|
+
if !known_not_found {
|
|
184
192
|
let maybe_net_err = match status {
|
|
185
193
|
activity_result::Status::WillCompleteAsync(_) => None,
|
|
186
194
|
activity_result::Status::Completed(ar::Success { result }) => gateway
|
|
@@ -195,19 +203,17 @@ impl WorkerActivityTasks {
|
|
|
195
203
|
.err()
|
|
196
204
|
}
|
|
197
205
|
activity_result::Status::Cancelled(ar::Cancellation { failure }) => {
|
|
198
|
-
let details =
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
None
|
|
210
|
-
}
|
|
206
|
+
let details = if let Some(Failure {
|
|
207
|
+
failure_info:
|
|
208
|
+
Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo { details })),
|
|
209
|
+
..
|
|
210
|
+
}) = failure
|
|
211
|
+
{
|
|
212
|
+
details
|
|
213
|
+
} else {
|
|
214
|
+
warn!(task_token = ? task_token,
|
|
215
|
+
"Expected activity cancelled status with CanceledFailureInfo");
|
|
216
|
+
None
|
|
211
217
|
};
|
|
212
218
|
gateway
|
|
213
219
|
.cancel_activity_task(task_token.clone(), details.map(Into::into))
|
|
@@ -215,37 +221,24 @@ impl WorkerActivityTasks {
|
|
|
215
221
|
.err()
|
|
216
222
|
}
|
|
217
223
|
};
|
|
218
|
-
|
|
219
|
-
|
|
224
|
+
|
|
225
|
+
if let Some(e) = maybe_net_err {
|
|
226
|
+
if e.code() == tonic::Code::NotFound {
|
|
220
227
|
warn!(task_token = ?task_token, details = ?e, "Activity not found on \
|
|
221
228
|
completion. This may happen if the activity has already been cancelled but \
|
|
222
229
|
completed anyway.");
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
}
|
|
228
|
-
} else {
|
|
229
|
-
true
|
|
230
|
+
} else {
|
|
231
|
+
return Err(e.into());
|
|
232
|
+
};
|
|
233
|
+
};
|
|
230
234
|
};
|
|
231
|
-
|
|
232
|
-
if should_remove
|
|
233
|
-
&& self
|
|
234
|
-
.outstanding_activity_tasks
|
|
235
|
-
.remove(&task_token)
|
|
236
|
-
.is_some()
|
|
237
|
-
{
|
|
238
|
-
self.activities_semaphore.add_permits(1);
|
|
239
|
-
self.heartbeat_manager.evict(task_token);
|
|
240
|
-
}
|
|
241
|
-
Ok(())
|
|
242
235
|
} else {
|
|
243
236
|
warn!(
|
|
244
237
|
"Attempted to complete activity task {} but we were not tracking it",
|
|
245
238
|
&task_token
|
|
246
239
|
);
|
|
247
|
-
Ok(())
|
|
248
240
|
}
|
|
241
|
+
Ok(())
|
|
249
242
|
}
|
|
250
243
|
|
|
251
244
|
/// Attempt to record an activity heartbeat
|
|
@@ -254,22 +247,30 @@ impl WorkerActivityTasks {
|
|
|
254
247
|
details: ActivityHeartbeat,
|
|
255
248
|
) -> Result<(), ActivityHeartbeatError> {
|
|
256
249
|
// TODO: Propagate these back as cancels. Silent fails is too nonobvious
|
|
257
|
-
let
|
|
250
|
+
let heartbeat_timeout: Duration = self
|
|
258
251
|
.outstanding_activity_tasks
|
|
259
252
|
.get(&TaskToken(details.task_token.clone()))
|
|
260
253
|
.ok_or(ActivityHeartbeatError::UnknownActivity)?
|
|
261
254
|
.heartbeat_timeout
|
|
262
255
|
.clone()
|
|
263
|
-
|
|
256
|
+
// We treat None as 0 (even though heartbeat_timeout is never set to None by the server)
|
|
257
|
+
.unwrap_or_default()
|
|
264
258
|
.try_into()
|
|
259
|
+
// This technically should never happen since prost duration should be directly mappable
|
|
260
|
+
// to std::time::Duration.
|
|
265
261
|
.or(Err(ActivityHeartbeatError::InvalidHeartbeatTimeout))?;
|
|
262
|
+
|
|
266
263
|
// There is a bug in the server that translates non-set heartbeat timeouts into 0 duration.
|
|
267
264
|
// That's why we treat 0 the same way as None, otherwise we wouldn't know which aggregation
|
|
268
265
|
// delay to use, and using 0 is not a good idea as SDK would hammer the server too hard.
|
|
269
|
-
if
|
|
270
|
-
|
|
271
|
-
}
|
|
272
|
-
|
|
266
|
+
let throttle_interval = if heartbeat_timeout.as_millis() == 0 {
|
|
267
|
+
self.default_heartbeat_throttle_interval
|
|
268
|
+
} else {
|
|
269
|
+
heartbeat_timeout.mul_f64(0.8)
|
|
270
|
+
};
|
|
271
|
+
let throttle_interval =
|
|
272
|
+
std::cmp::min(throttle_interval, self.max_heartbeat_throttle_interval);
|
|
273
|
+
self.heartbeat_manager.record(details, throttle_interval)
|
|
273
274
|
}
|
|
274
275
|
|
|
275
276
|
async fn next_pending_cancel_task(&self) -> Result<Option<ActivityTask>, PollActivityError> {
|
|
@@ -51,6 +51,17 @@ pub struct WorkerConfig {
|
|
|
51
51
|
/// and moved to the non-sticky queue where it may be picked up by any worker.
|
|
52
52
|
#[builder(default = "Duration::from_secs(10)")]
|
|
53
53
|
pub sticky_queue_schedule_to_start_timeout: Duration,
|
|
54
|
+
|
|
55
|
+
/// Longest interval for throttling activity heartbeats
|
|
56
|
+
#[builder(default = "Duration::from_secs(60)")]
|
|
57
|
+
pub max_heartbeat_throttle_interval: Duration,
|
|
58
|
+
|
|
59
|
+
/// Default interval for throttling activity heartbeats in case
|
|
60
|
+
/// `ActivityOptions.heartbeat_timeout` is unset.
|
|
61
|
+
/// When the timeout *is* set in the `ActivityOptions`, throttling is set to
|
|
62
|
+
/// `heartbeat_timeout * 0.8`.
|
|
63
|
+
#[builder(default = "Duration::from_secs(30)")]
|
|
64
|
+
pub default_heartbeat_throttle_interval: Duration,
|
|
54
65
|
}
|
|
55
66
|
|
|
56
67
|
impl WorkerConfigBuilder {
|
|
@@ -6,7 +6,7 @@ use crate::{
|
|
|
6
6
|
};
|
|
7
7
|
use arc_swap::ArcSwap;
|
|
8
8
|
use futures::future::join_all;
|
|
9
|
-
use std::{collections::HashMap, ops::Deref, sync::Arc};
|
|
9
|
+
use std::{collections::HashMap, ops::Deref, option::Option, sync::Arc};
|
|
10
10
|
use tokio::sync::Notify;
|
|
11
11
|
|
|
12
12
|
/// Allows access to workers by task queue name
|
|
@@ -40,7 +40,7 @@ impl WorkerDispatcher {
|
|
|
40
40
|
.workers
|
|
41
41
|
.load()
|
|
42
42
|
.get(&tq)
|
|
43
|
-
.map(
|
|
43
|
+
.map(Option::is_some)
|
|
44
44
|
.unwrap_or_default()
|
|
45
45
|
{
|
|
46
46
|
return Err(WorkerRegistrationError::WorkerAlreadyRegisteredForQueue(tq));
|
|
@@ -77,7 +77,7 @@ impl WorkerDispatcher {
|
|
|
77
77
|
self.workers.rcu(|map| {
|
|
78
78
|
let mut map = HashMap::clone(map);
|
|
79
79
|
if maybe_worker.is_none() {
|
|
80
|
-
maybe_worker = map.get_mut(task_queue).and_then(
|
|
80
|
+
maybe_worker = map.get_mut(task_queue).and_then(Option::take);
|
|
81
81
|
}
|
|
82
82
|
map
|
|
83
83
|
});
|
|
@@ -149,7 +149,7 @@ impl Deref for WorkerRefCt {
|
|
|
149
149
|
type Target = Worker;
|
|
150
150
|
|
|
151
151
|
fn deref(&self) -> &Self::Target {
|
|
152
|
-
self.inner.
|
|
152
|
+
self.inner.as_deref().expect("Must exist")
|
|
153
153
|
}
|
|
154
154
|
}
|
|
155
155
|
|
|
@@ -161,7 +161,7 @@ impl Drop for WorkerRefCt {
|
|
|
161
161
|
Some(arc) => {
|
|
162
162
|
// We wait until 2 rather than 1 because we ourselves still have an Arc
|
|
163
163
|
if Arc::strong_count(arc) == 2 {
|
|
164
|
-
self.notify.notify_one()
|
|
164
|
+
self.notify.notify_one();
|
|
165
165
|
}
|
|
166
166
|
}
|
|
167
167
|
};
|
|
@@ -6,7 +6,7 @@ pub use crate::worker::config::{WorkerConfig, WorkerConfigBuilder};
|
|
|
6
6
|
pub(crate) use dispatcher::WorkerDispatcher;
|
|
7
7
|
|
|
8
8
|
use crate::{
|
|
9
|
-
errors::
|
|
9
|
+
errors::CompleteWfError,
|
|
10
10
|
machines::{EmptyWorkflowCommandErr, WFMachinesError},
|
|
11
11
|
pollers::{
|
|
12
12
|
new_activity_task_buffer, new_workflow_task_buffer, BoxedActPoller, BoxedWFPoller,
|
|
@@ -98,6 +98,7 @@ impl Worker {
|
|
|
98
98
|
let mut wf_task_poll_buffer = new_workflow_task_buffer(
|
|
99
99
|
sg.gw.clone(),
|
|
100
100
|
config.task_queue.clone(),
|
|
101
|
+
false,
|
|
101
102
|
max_nonsticky_polls,
|
|
102
103
|
max_nonsticky_polls * 2,
|
|
103
104
|
);
|
|
@@ -107,6 +108,7 @@ impl Worker {
|
|
|
107
108
|
let mut sp = new_workflow_task_buffer(
|
|
108
109
|
sg.gw.clone(),
|
|
109
110
|
sqn.clone(),
|
|
111
|
+
true,
|
|
110
112
|
max_sticky_polls,
|
|
111
113
|
max_sticky_polls * 2,
|
|
112
114
|
);
|
|
@@ -172,6 +174,8 @@ impl Worker {
|
|
|
172
174
|
ap,
|
|
173
175
|
sg.gw.clone(),
|
|
174
176
|
metrics.clone(),
|
|
177
|
+
config.max_heartbeat_throttle_interval,
|
|
178
|
+
config.default_heartbeat_throttle_interval,
|
|
175
179
|
)
|
|
176
180
|
}),
|
|
177
181
|
workflows_semaphore: Semaphore::new(config.max_outstanding_workflow_tasks),
|
|
@@ -219,6 +223,11 @@ impl Worker {
|
|
|
219
223
|
self.wft_manager.outstanding_wft()
|
|
220
224
|
}
|
|
221
225
|
|
|
226
|
+
#[cfg(test)]
|
|
227
|
+
pub(crate) fn available_wft_permits(&self) -> usize {
|
|
228
|
+
self.workflows_semaphore.available_permits()
|
|
229
|
+
}
|
|
230
|
+
|
|
222
231
|
/// Wait until not at the outstanding activity limit, and then poll this worker's task queue for
|
|
223
232
|
/// new activities.
|
|
224
233
|
///
|
|
@@ -246,7 +255,7 @@ impl Worker {
|
|
|
246
255
|
if let Some(at_mgr) = self.at_task_mgr.as_ref() {
|
|
247
256
|
let tt = details.task_token.clone();
|
|
248
257
|
if let Err(e) = at_mgr.record_heartbeat(details) {
|
|
249
|
-
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.")
|
|
258
|
+
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.");
|
|
250
259
|
}
|
|
251
260
|
}
|
|
252
261
|
}
|
|
@@ -275,7 +284,7 @@ impl Worker {
|
|
|
275
284
|
// We must first check if there are pending workflow activations for workflows that are
|
|
276
285
|
// currently replaying or otherwise need immediate jobs, and issue those before
|
|
277
286
|
// bothering the server.
|
|
278
|
-
if let Some(pa) = self.wft_manager.next_pending_activation()
|
|
287
|
+
if let Some(pa) = self.wft_manager.next_pending_activation() {
|
|
279
288
|
debug!(activation=%pa, "Sending pending activation to lang");
|
|
280
289
|
return Ok(pa);
|
|
281
290
|
}
|
|
@@ -284,7 +293,7 @@ impl Worker {
|
|
|
284
293
|
// activations, since there may be an eviction etc for whatever run is popped here.
|
|
285
294
|
if let Some(buff_wft) = self.wft_manager.next_buffered_poll() {
|
|
286
295
|
match self.apply_server_work(buff_wft).await? {
|
|
287
|
-
|
|
296
|
+
Some(a) => return Ok(a),
|
|
288
297
|
_ => continue,
|
|
289
298
|
}
|
|
290
299
|
}
|
|
@@ -304,14 +313,8 @@ impl Worker {
|
|
|
304
313
|
|
|
305
314
|
if let Some(work) = selected_f {
|
|
306
315
|
self.metrics.wf_tq_poll_ok();
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
NewWfTaskOutcome::TaskBuffered => {
|
|
310
|
-
// If the task was buffered, it's not actually outstanding, so we can
|
|
311
|
-
// immediately return a permit.
|
|
312
|
-
self.return_workflow_task_permit();
|
|
313
|
-
}
|
|
314
|
-
_ => {}
|
|
316
|
+
if let Some(a) = self.apply_server_work(work).await? {
|
|
317
|
+
return Ok(a);
|
|
315
318
|
}
|
|
316
319
|
}
|
|
317
320
|
|
|
@@ -326,7 +329,7 @@ impl Worker {
|
|
|
326
329
|
completion: WfActivationCompletion,
|
|
327
330
|
) -> Result<(), CompleteWfError> {
|
|
328
331
|
let wfstatus = completion.status;
|
|
329
|
-
let
|
|
332
|
+
let did_complete_wft = match wfstatus {
|
|
330
333
|
Some(wf_activation_completion::Status::Successful(success)) => {
|
|
331
334
|
self.wf_activation_success(&completion.run_id, success)
|
|
332
335
|
.await
|
|
@@ -338,11 +341,9 @@ impl Worker {
|
|
|
338
341
|
reason: "Workflow completion had empty status field".to_owned(),
|
|
339
342
|
completion: None,
|
|
340
343
|
}),
|
|
341
|
-
}
|
|
342
|
-
self.
|
|
343
|
-
|
|
344
|
-
self.maybe_notify_wtfs_drained();
|
|
345
|
-
r
|
|
344
|
+
}?;
|
|
345
|
+
self.after_workflow_activation(&completion.run_id, did_complete_wft);
|
|
346
|
+
Ok(())
|
|
346
347
|
}
|
|
347
348
|
|
|
348
349
|
fn maybe_notify_wtfs_drained(&self) {
|
|
@@ -355,11 +356,11 @@ impl Worker {
|
|
|
355
356
|
|
|
356
357
|
/// Tell the worker a workflow task has completed, for tracking max outstanding WFTs
|
|
357
358
|
pub(crate) fn return_workflow_task_permit(&self) {
|
|
358
|
-
self.workflows_semaphore.add_permits(1)
|
|
359
|
+
self.workflows_semaphore.add_permits(1);
|
|
359
360
|
}
|
|
360
361
|
|
|
361
|
-
pub(crate) fn request_wf_eviction(&self, run_id: &str) {
|
|
362
|
-
self.wft_manager.request_eviction(run_id);
|
|
362
|
+
pub(crate) fn request_wf_eviction(&self, run_id: &str, reason: impl Into<String>) {
|
|
363
|
+
self.wft_manager.request_eviction(run_id, reason);
|
|
363
364
|
}
|
|
364
365
|
|
|
365
366
|
/// Resolves with WFT poll response or `PollWfError::ShutDown` if WFTs have been drained
|
|
@@ -443,18 +444,24 @@ impl Worker {
|
|
|
443
444
|
async fn apply_server_work(
|
|
444
445
|
&self,
|
|
445
446
|
work: ValidPollWFTQResponse,
|
|
446
|
-
) -> Result<
|
|
447
|
+
) -> Result<Option<WfActivation>, PollWfError> {
|
|
447
448
|
let we = work.workflow_execution.clone();
|
|
448
449
|
let tt = work.task_token.clone();
|
|
449
450
|
let res = self
|
|
450
451
|
.wft_manager
|
|
451
452
|
.apply_new_poll_resp(work, &self.server_gateway)
|
|
452
|
-
.await
|
|
453
|
-
match
|
|
453
|
+
.await;
|
|
454
|
+
Ok(match res {
|
|
454
455
|
NewWfTaskOutcome::IssueActivation(a) => {
|
|
455
456
|
debug!(activation=%a, "Sending activation to lang");
|
|
457
|
+
Some(a)
|
|
458
|
+
}
|
|
459
|
+
NewWfTaskOutcome::TaskBuffered => {
|
|
460
|
+
// If the task was buffered, it's not actually outstanding, so we can
|
|
461
|
+
// immediately return a permit.
|
|
462
|
+
self.return_workflow_task_permit();
|
|
463
|
+
None
|
|
456
464
|
}
|
|
457
|
-
NewWfTaskOutcome::TaskBuffered => {}
|
|
458
465
|
NewWfTaskOutcome::Autocomplete => {
|
|
459
466
|
debug!(workflow_execution=?we,
|
|
460
467
|
"No work for lang to perform after polling server. Sending autocomplete.");
|
|
@@ -464,6 +471,7 @@ impl Worker {
|
|
|
464
471
|
status: Some(workflow_completion::Success::from_variants(vec![]).into()),
|
|
465
472
|
})
|
|
466
473
|
.await?;
|
|
474
|
+
None
|
|
467
475
|
}
|
|
468
476
|
NewWfTaskOutcome::CacheMiss => {
|
|
469
477
|
debug!(workflow_execution=?we, "Unable to process workflow task with partial \
|
|
@@ -480,17 +488,28 @@ impl Worker {
|
|
|
480
488
|
}),
|
|
481
489
|
)
|
|
482
490
|
.await?;
|
|
491
|
+
self.return_workflow_task_permit();
|
|
492
|
+
None
|
|
483
493
|
}
|
|
484
|
-
|
|
485
|
-
|
|
494
|
+
NewWfTaskOutcome::Evict(e) => {
|
|
495
|
+
warn!(error=?e, run_id=%we.run_id, "Error while applying poll response to workflow");
|
|
496
|
+
self.request_wf_eviction(
|
|
497
|
+
&we.run_id,
|
|
498
|
+
format!("Error while applying poll response to workflow: {:?}", e),
|
|
499
|
+
);
|
|
500
|
+
None
|
|
501
|
+
}
|
|
502
|
+
})
|
|
486
503
|
}
|
|
487
504
|
|
|
488
|
-
/// Handle a successful workflow
|
|
505
|
+
/// Handle a successful workflow activation
|
|
506
|
+
///
|
|
507
|
+
/// Returns true if we actually reported WFT completion to server (success or failure)
|
|
489
508
|
async fn wf_activation_success(
|
|
490
509
|
&self,
|
|
491
510
|
run_id: &str,
|
|
492
511
|
success: workflow_completion::Success,
|
|
493
|
-
) -> Result<
|
|
512
|
+
) -> Result<bool, CompleteWfError> {
|
|
494
513
|
// Convert to wf commands
|
|
495
514
|
let cmds = success
|
|
496
515
|
.commands
|
|
@@ -534,6 +553,7 @@ impl Worker {
|
|
|
534
553
|
.await
|
|
535
554
|
})
|
|
536
555
|
.await?;
|
|
556
|
+
Ok(true)
|
|
537
557
|
}
|
|
538
558
|
Ok(Some(ServerCommandsWithWorkflowInfo {
|
|
539
559
|
task_token,
|
|
@@ -543,8 +563,9 @@ impl Worker {
|
|
|
543
563
|
self.server_gateway
|
|
544
564
|
.respond_legacy_query(task_token, result)
|
|
545
565
|
.await?;
|
|
566
|
+
Ok(true)
|
|
546
567
|
}
|
|
547
|
-
Ok(None) =>
|
|
568
|
+
Ok(None) => Ok(false),
|
|
548
569
|
Err(update_err) => {
|
|
549
570
|
// Automatically fail the workflow task in the event we couldn't update machines
|
|
550
571
|
let fail_cause = if matches!(&update_err.source, WFMachinesError::Nondeterminism(_))
|
|
@@ -554,35 +575,42 @@ impl Worker {
|
|
|
554
575
|
WorkflowTaskFailedCause::Unspecified
|
|
555
576
|
};
|
|
556
577
|
|
|
578
|
+
warn!(run_id, error=?update_err, "Failing workflow task");
|
|
579
|
+
|
|
557
580
|
if let Some(ref tt) = update_err.task_token {
|
|
581
|
+
let wft_fail_str = format!("{:?}", update_err);
|
|
558
582
|
self.handle_wft_reporting_errs(run_id, || async {
|
|
559
583
|
self.server_gateway
|
|
560
584
|
.fail_workflow_task(
|
|
561
585
|
tt.clone(),
|
|
562
586
|
fail_cause,
|
|
563
|
-
Some(Failure::application_failure(
|
|
564
|
-
format!("{:?}", update_err),
|
|
565
|
-
false,
|
|
566
|
-
)),
|
|
587
|
+
Some(Failure::application_failure(wft_fail_str.clone(), false)),
|
|
567
588
|
)
|
|
568
589
|
.await
|
|
569
590
|
})
|
|
570
591
|
.await?;
|
|
592
|
+
// We must evict the workflow since we've failed a WFT
|
|
593
|
+
self.request_wf_eviction(
|
|
594
|
+
run_id,
|
|
595
|
+
format!("Workflow task failure: {}", wft_fail_str),
|
|
596
|
+
);
|
|
597
|
+
Ok(true)
|
|
598
|
+
} else {
|
|
599
|
+
Ok(false)
|
|
571
600
|
}
|
|
572
|
-
return Err(update_err.into());
|
|
573
601
|
}
|
|
574
602
|
}
|
|
575
|
-
|
|
576
|
-
Ok(())
|
|
577
603
|
}
|
|
578
604
|
|
|
579
605
|
/// Handle a failed workflow completion
|
|
606
|
+
///
|
|
607
|
+
/// Returns true if we actually reported WFT completion to server
|
|
580
608
|
async fn wf_activation_failed(
|
|
581
609
|
&self,
|
|
582
610
|
run_id: &str,
|
|
583
611
|
failure: workflow_completion::Failure,
|
|
584
|
-
) -> Result<
|
|
585
|
-
match self.wft_manager.failed_activation(run_id) {
|
|
612
|
+
) -> Result<bool, CompleteWfError> {
|
|
613
|
+
Ok(match self.wft_manager.failed_activation(run_id) {
|
|
586
614
|
FailedActivationOutcome::Report(tt) => {
|
|
587
615
|
self.handle_wft_reporting_errs(run_id, || async {
|
|
588
616
|
self.server_gateway
|
|
@@ -594,23 +622,25 @@ impl Worker {
|
|
|
594
622
|
.await
|
|
595
623
|
})
|
|
596
624
|
.await?;
|
|
625
|
+
true
|
|
597
626
|
}
|
|
598
627
|
FailedActivationOutcome::ReportLegacyQueryFailure(task_token) => {
|
|
599
628
|
self.server_gateway
|
|
600
629
|
.respond_legacy_query(task_token, legacy_query_failure(failure))
|
|
601
630
|
.await?;
|
|
631
|
+
true
|
|
602
632
|
}
|
|
603
|
-
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
Ok(())
|
|
633
|
+
FailedActivationOutcome::NoReport => false,
|
|
634
|
+
})
|
|
607
635
|
}
|
|
608
636
|
|
|
609
|
-
fn
|
|
610
|
-
|
|
637
|
+
fn after_workflow_activation(&self, run_id: &str, did_complete_wft: bool) {
|
|
638
|
+
self.wft_manager.after_wft_report(run_id);
|
|
639
|
+
if did_complete_wft {
|
|
611
640
|
self.return_workflow_task_permit();
|
|
612
|
-
}
|
|
613
|
-
|
|
641
|
+
}
|
|
642
|
+
self.wft_manager.on_activation_done(run_id);
|
|
643
|
+
self.maybe_notify_wtfs_drained();
|
|
614
644
|
}
|
|
615
645
|
|
|
616
646
|
/// Handle server errors from either completing or failing a workflow task. Returns any errors
|
|
@@ -630,12 +660,12 @@ impl Worker {
|
|
|
630
660
|
// Silence unhandled command errors since the lang SDK cannot do anything about
|
|
631
661
|
// them besides poll again, which it will do anyway.
|
|
632
662
|
tonic::Code::InvalidArgument if err.message() == "UnhandledCommand" => {
|
|
633
|
-
warn!("Unhandled command response when completing
|
|
663
|
+
warn!(error = %err, "Unhandled command response when completing");
|
|
634
664
|
should_evict = true;
|
|
635
665
|
Ok(())
|
|
636
666
|
}
|
|
637
667
|
tonic::Code::NotFound => {
|
|
638
|
-
warn!("Task not found when completing
|
|
668
|
+
warn!(error = %err, "Task not found when completing");
|
|
639
669
|
should_evict = true;
|
|
640
670
|
Ok(())
|
|
641
671
|
}
|
|
@@ -645,7 +675,7 @@ impl Worker {
|
|
|
645
675
|
_ => Ok(()),
|
|
646
676
|
};
|
|
647
677
|
if should_evict {
|
|
648
|
-
self.
|
|
678
|
+
self.request_wf_eviction(run_id, "Error reporting WFT to server");
|
|
649
679
|
}
|
|
650
680
|
res.map_err(Into::into)
|
|
651
681
|
}
|
|
@@ -704,7 +734,7 @@ mod tests {
|
|
|
704
734
|
|
|
705
735
|
let cfg = WorkerConfigBuilder::default()
|
|
706
736
|
.task_queue("whatever")
|
|
707
|
-
.max_outstanding_activities(
|
|
737
|
+
.max_outstanding_activities(5_usize)
|
|
708
738
|
.build()
|
|
709
739
|
.unwrap();
|
|
710
740
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -717,12 +747,12 @@ mod tests {
|
|
|
717
747
|
let mut mock_gateway = MockServerGatewayApis::new();
|
|
718
748
|
mock_gateway
|
|
719
749
|
.expect_poll_workflow_task()
|
|
720
|
-
.returning(|_| Ok(PollWorkflowTaskQueueResponse::default()));
|
|
750
|
+
.returning(|_, _| Ok(PollWorkflowTaskQueueResponse::default()));
|
|
721
751
|
let gwref = GatewayRef::new(Arc::new(mock_gateway), fake_sg_opts());
|
|
722
752
|
|
|
723
753
|
let cfg = WorkerConfigBuilder::default()
|
|
724
754
|
.task_queue("whatever")
|
|
725
|
-
.max_outstanding_workflow_tasks(
|
|
755
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
726
756
|
.build()
|
|
727
757
|
.unwrap();
|
|
728
758
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -740,7 +770,7 @@ mod tests {
|
|
|
740
770
|
|
|
741
771
|
let cfg = WorkerConfigBuilder::default()
|
|
742
772
|
.task_queue("whatever")
|
|
743
|
-
.max_outstanding_activities(
|
|
773
|
+
.max_outstanding_activities(5_usize)
|
|
744
774
|
.build()
|
|
745
775
|
.unwrap();
|
|
746
776
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -753,12 +783,12 @@ mod tests {
|
|
|
753
783
|
let mut mock_gateway = MockServerGatewayApis::new();
|
|
754
784
|
mock_gateway
|
|
755
785
|
.expect_poll_workflow_task()
|
|
756
|
-
.returning(|_| Err(tonic::Status::internal("ahhh")));
|
|
786
|
+
.returning(|_, _| Err(tonic::Status::internal("ahhh")));
|
|
757
787
|
let gwref = GatewayRef::new(Arc::new(mock_gateway), fake_sg_opts());
|
|
758
788
|
|
|
759
789
|
let cfg = WorkerConfigBuilder::default()
|
|
760
790
|
.task_queue("whatever")
|
|
761
|
-
.max_outstanding_workflow_tasks(
|
|
791
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
762
792
|
.build()
|
|
763
793
|
.unwrap();
|
|
764
794
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -33,7 +33,7 @@ impl DrivenWorkflow {
|
|
|
33
33
|
/// Start the workflow
|
|
34
34
|
pub fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) {
|
|
35
35
|
debug!(run_id = %attribs.original_execution_run_id, "Driven WF start");
|
|
36
|
-
self.started_attrs = Some(attribs)
|
|
36
|
+
self.started_attrs = Some(attribs);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
/// Enqueue a new job to be sent to the driven workflow
|
|
@@ -51,12 +51,12 @@ impl DrivenWorkflow {
|
|
|
51
51
|
|
|
52
52
|
/// Signal the workflow
|
|
53
53
|
pub fn signal(&mut self, signal: SignalWorkflow) {
|
|
54
|
-
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal))
|
|
54
|
+
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal));
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
/// Cancel the workflow
|
|
58
58
|
pub fn cancel(&mut self, attribs: CancelWorkflow) {
|
|
59
|
-
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs))
|
|
59
|
+
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs));
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
@@ -62,7 +62,7 @@ impl WorkflowManager {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
#[cfg(test)]
|
|
65
|
-
pub fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
65
|
+
pub const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
66
66
|
Self {
|
|
67
67
|
machines: workflow_machines,
|
|
68
68
|
command_sink: None,
|
|
@@ -277,6 +277,7 @@ pub mod managed_wf {
|
|
|
277
277
|
// Send an eviction to ensure wf exits if it has not finished (ex: feeding partial hist)
|
|
278
278
|
let _ = self.activation_tx.send(create_evict_activation(
|
|
279
279
|
"not actually important".to_string(),
|
|
280
|
+
"force shutdown".to_string(),
|
|
280
281
|
));
|
|
281
282
|
self.future_handle.take().unwrap().await.unwrap()
|
|
282
283
|
}
|