@temporalio/core-bridge 0.16.0 → 0.17.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +1 -0
- package/index.d.ts +14 -0
- package/index.node +0 -0
- package/package.json +3 -3
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/Cargo.toml +1 -0
- package/sdk-core/fsm/rustfsm_procmacro/Cargo.toml +1 -1
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +8 -9
- package/sdk-core/fsm/rustfsm_trait/Cargo.toml +1 -1
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +1 -1
- package/sdk-core/sdk-core-protos/src/lib.rs +43 -48
- package/sdk-core/src/core_tests/activity_tasks.rs +5 -5
- package/sdk-core/src/core_tests/mod.rs +2 -2
- package/sdk-core/src/core_tests/queries.rs +9 -2
- package/sdk-core/src/core_tests/workflow_tasks.rs +87 -8
- package/sdk-core/src/errors.rs +13 -13
- package/sdk-core/src/lib.rs +2 -2
- package/sdk-core/src/machines/activity_state_machine.rs +3 -3
- package/sdk-core/src/machines/child_workflow_state_machine.rs +6 -15
- package/sdk-core/src/machines/complete_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/continue_as_new_workflow_state_machine.rs +1 -1
- package/sdk-core/src/machines/mod.rs +16 -22
- package/sdk-core/src/machines/patch_state_machine.rs +8 -8
- package/sdk-core/src/machines/signal_external_state_machine.rs +2 -2
- package/sdk-core/src/machines/timer_state_machine.rs +4 -4
- package/sdk-core/src/machines/transition_coverage.rs +3 -3
- package/sdk-core/src/machines/workflow_machines.rs +26 -24
- package/sdk-core/src/pending_activations.rs +19 -20
- package/sdk-core/src/pollers/gateway.rs +3 -3
- package/sdk-core/src/pollers/poll_buffer.rs +2 -2
- package/sdk-core/src/pollers/retry.rs +4 -4
- package/sdk-core/src/prototype_rust_sdk/workflow_context.rs +3 -3
- package/sdk-core/src/prototype_rust_sdk/workflow_future.rs +4 -4
- package/sdk-core/src/prototype_rust_sdk.rs +3 -11
- package/sdk-core/src/telemetry/metrics.rs +2 -4
- package/sdk-core/src/telemetry/mod.rs +6 -7
- package/sdk-core/src/test_help/canned_histories.rs +8 -5
- package/sdk-core/src/test_help/history_builder.rs +12 -2
- package/sdk-core/src/test_help/history_info.rs +23 -3
- package/sdk-core/src/test_help/mod.rs +24 -40
- package/sdk-core/src/worker/activities/activity_heartbeat_manager.rs +246 -138
- package/sdk-core/src/worker/activities.rs +46 -45
- package/sdk-core/src/worker/config.rs +11 -0
- package/sdk-core/src/worker/dispatcher.rs +5 -5
- package/sdk-core/src/worker/mod.rs +71 -52
- package/sdk-core/src/workflow/driven_workflow.rs +3 -3
- package/sdk-core/src/workflow/history_update.rs +1 -1
- package/sdk-core/src/workflow/mod.rs +1 -1
- package/sdk-core/src/workflow/workflow_tasks/cache_manager.rs +13 -17
- package/sdk-core/src/workflow/workflow_tasks/concurrency_manager.rs +4 -8
- package/sdk-core/src/workflow/workflow_tasks/mod.rs +46 -53
- package/sdk-core/test_utils/src/lib.rs +2 -2
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +61 -1
- package/src/conversions.rs +17 -0
|
@@ -10,7 +10,6 @@ use activity_heartbeat_manager::ActivityHeartbeatManager;
|
|
|
10
10
|
use dashmap::DashMap;
|
|
11
11
|
use std::{
|
|
12
12
|
convert::TryInto,
|
|
13
|
-
ops::Div,
|
|
14
13
|
sync::Arc,
|
|
15
14
|
time::{Duration, Instant},
|
|
16
15
|
};
|
|
@@ -80,6 +79,9 @@ pub(crate) struct WorkerActivityTasks {
|
|
|
80
79
|
activities_semaphore: Semaphore,
|
|
81
80
|
|
|
82
81
|
metrics: MetricsContext,
|
|
82
|
+
|
|
83
|
+
max_heartbeat_throttle_interval: Duration,
|
|
84
|
+
default_heartbeat_throttle_interval: Duration,
|
|
83
85
|
}
|
|
84
86
|
|
|
85
87
|
impl WorkerActivityTasks {
|
|
@@ -88,6 +90,8 @@ impl WorkerActivityTasks {
|
|
|
88
90
|
poller: BoxedActPoller,
|
|
89
91
|
sg: Arc<impl ServerGatewayApis + Send + Sync + 'static + ?Sized>,
|
|
90
92
|
metrics: MetricsContext,
|
|
93
|
+
max_heartbeat_throttle_interval: Duration,
|
|
94
|
+
default_heartbeat_throttle_interval: Duration,
|
|
91
95
|
) -> Self {
|
|
92
96
|
Self {
|
|
93
97
|
heartbeat_manager: ActivityHeartbeatManager::new(sg),
|
|
@@ -95,12 +99,13 @@ impl WorkerActivityTasks {
|
|
|
95
99
|
poller,
|
|
96
100
|
activities_semaphore: Semaphore::new(max_activity_tasks),
|
|
97
101
|
metrics,
|
|
102
|
+
max_heartbeat_throttle_interval,
|
|
103
|
+
default_heartbeat_throttle_interval,
|
|
98
104
|
}
|
|
99
105
|
}
|
|
100
106
|
|
|
101
107
|
pub(crate) fn notify_shutdown(&self) {
|
|
102
108
|
self.poller.notify_shutdown();
|
|
103
|
-
self.heartbeat_manager.notify_shutdown();
|
|
104
109
|
}
|
|
105
110
|
|
|
106
111
|
pub(crate) async fn shutdown(self) {
|
|
@@ -171,16 +176,19 @@ impl WorkerActivityTasks {
|
|
|
171
176
|
status: activity_result::Status,
|
|
172
177
|
gateway: &(dyn ServerGatewayApis + Send + Sync),
|
|
173
178
|
) -> Result<(), CompleteActivityError> {
|
|
174
|
-
if let Some(act_info) = self.outstanding_activity_tasks.
|
|
179
|
+
if let Some((_, act_info)) = self.outstanding_activity_tasks.remove(&task_token) {
|
|
175
180
|
let act_metrics = self.metrics.with_new_attrs([
|
|
176
181
|
activity_type(act_info.activity_type.clone()),
|
|
177
182
|
workflow_type(act_info.workflow_type.clone()),
|
|
178
183
|
]);
|
|
179
184
|
act_metrics.act_execution_latency(act_info.start_time.elapsed());
|
|
185
|
+
self.activities_semaphore.add_permits(1);
|
|
186
|
+
self.heartbeat_manager.evict(task_token.clone());
|
|
187
|
+
let known_not_found = act_info.known_not_found;
|
|
188
|
+
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
180
189
|
|
|
181
190
|
// No need to report activities which we already know the server doesn't care about
|
|
182
|
-
|
|
183
|
-
drop(act_info); // TODO: Get rid of dashmap. If we hold ref across await, bad stuff.
|
|
191
|
+
if !known_not_found {
|
|
184
192
|
let maybe_net_err = match status {
|
|
185
193
|
activity_result::Status::WillCompleteAsync(_) => None,
|
|
186
194
|
activity_result::Status::Completed(ar::Success { result }) => gateway
|
|
@@ -195,19 +203,17 @@ impl WorkerActivityTasks {
|
|
|
195
203
|
.err()
|
|
196
204
|
}
|
|
197
205
|
activity_result::Status::Cancelled(ar::Cancellation { failure }) => {
|
|
198
|
-
let details =
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
None
|
|
210
|
-
}
|
|
206
|
+
let details = if let Some(Failure {
|
|
207
|
+
failure_info:
|
|
208
|
+
Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo { details })),
|
|
209
|
+
..
|
|
210
|
+
}) = failure
|
|
211
|
+
{
|
|
212
|
+
details
|
|
213
|
+
} else {
|
|
214
|
+
warn!(task_token = ? task_token,
|
|
215
|
+
"Expected activity cancelled status with CanceledFailureInfo");
|
|
216
|
+
None
|
|
211
217
|
};
|
|
212
218
|
gateway
|
|
213
219
|
.cancel_activity_task(task_token.clone(), details.map(Into::into))
|
|
@@ -215,37 +221,24 @@ impl WorkerActivityTasks {
|
|
|
215
221
|
.err()
|
|
216
222
|
}
|
|
217
223
|
};
|
|
218
|
-
|
|
219
|
-
|
|
224
|
+
|
|
225
|
+
if let Some(e) = maybe_net_err {
|
|
226
|
+
if e.code() == tonic::Code::NotFound {
|
|
220
227
|
warn!(task_token = ?task_token, details = ?e, "Activity not found on \
|
|
221
228
|
completion. This may happen if the activity has already been cancelled but \
|
|
222
229
|
completed anyway.");
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
}
|
|
228
|
-
} else {
|
|
229
|
-
true
|
|
230
|
+
} else {
|
|
231
|
+
return Err(e.into());
|
|
232
|
+
};
|
|
233
|
+
};
|
|
230
234
|
};
|
|
231
|
-
|
|
232
|
-
if should_remove
|
|
233
|
-
&& self
|
|
234
|
-
.outstanding_activity_tasks
|
|
235
|
-
.remove(&task_token)
|
|
236
|
-
.is_some()
|
|
237
|
-
{
|
|
238
|
-
self.activities_semaphore.add_permits(1);
|
|
239
|
-
self.heartbeat_manager.evict(task_token);
|
|
240
|
-
}
|
|
241
|
-
Ok(())
|
|
242
235
|
} else {
|
|
243
236
|
warn!(
|
|
244
237
|
"Attempted to complete activity task {} but we were not tracking it",
|
|
245
238
|
&task_token
|
|
246
239
|
);
|
|
247
|
-
Ok(())
|
|
248
240
|
}
|
|
241
|
+
Ok(())
|
|
249
242
|
}
|
|
250
243
|
|
|
251
244
|
/// Attempt to record an activity heartbeat
|
|
@@ -254,22 +247,30 @@ impl WorkerActivityTasks {
|
|
|
254
247
|
details: ActivityHeartbeat,
|
|
255
248
|
) -> Result<(), ActivityHeartbeatError> {
|
|
256
249
|
// TODO: Propagate these back as cancels. Silent fails is too nonobvious
|
|
257
|
-
let
|
|
250
|
+
let heartbeat_timeout: Duration = self
|
|
258
251
|
.outstanding_activity_tasks
|
|
259
252
|
.get(&TaskToken(details.task_token.clone()))
|
|
260
253
|
.ok_or(ActivityHeartbeatError::UnknownActivity)?
|
|
261
254
|
.heartbeat_timeout
|
|
262
255
|
.clone()
|
|
263
|
-
|
|
256
|
+
// We treat None as 0 (even though heartbeat_timeout is never set to None by the server)
|
|
257
|
+
.unwrap_or_default()
|
|
264
258
|
.try_into()
|
|
259
|
+
// This technically should never happen since prost duration should be directly mappable
|
|
260
|
+
// to std::time::Duration.
|
|
265
261
|
.or(Err(ActivityHeartbeatError::InvalidHeartbeatTimeout))?;
|
|
262
|
+
|
|
266
263
|
// There is a bug in the server that translates non-set heartbeat timeouts into 0 duration.
|
|
267
264
|
// That's why we treat 0 the same way as None, otherwise we wouldn't know which aggregation
|
|
268
265
|
// delay to use, and using 0 is not a good idea as SDK would hammer the server too hard.
|
|
269
|
-
if
|
|
270
|
-
|
|
271
|
-
}
|
|
272
|
-
|
|
266
|
+
let throttle_interval = if heartbeat_timeout.as_millis() == 0 {
|
|
267
|
+
self.default_heartbeat_throttle_interval
|
|
268
|
+
} else {
|
|
269
|
+
heartbeat_timeout.mul_f64(0.8)
|
|
270
|
+
};
|
|
271
|
+
let throttle_interval =
|
|
272
|
+
std::cmp::min(throttle_interval, self.max_heartbeat_throttle_interval);
|
|
273
|
+
self.heartbeat_manager.record(details, throttle_interval)
|
|
273
274
|
}
|
|
274
275
|
|
|
275
276
|
async fn next_pending_cancel_task(&self) -> Result<Option<ActivityTask>, PollActivityError> {
|
|
@@ -51,6 +51,17 @@ pub struct WorkerConfig {
|
|
|
51
51
|
/// and moved to the non-sticky queue where it may be picked up by any worker.
|
|
52
52
|
#[builder(default = "Duration::from_secs(10)")]
|
|
53
53
|
pub sticky_queue_schedule_to_start_timeout: Duration,
|
|
54
|
+
|
|
55
|
+
/// Longest interval for throttling activity heartbeats
|
|
56
|
+
#[builder(default = "Duration::from_secs(60)")]
|
|
57
|
+
pub max_heartbeat_throttle_interval: Duration,
|
|
58
|
+
|
|
59
|
+
/// Default interval for throttling activity heartbeats in case
|
|
60
|
+
/// `ActivityOptions.heartbeat_timeout` is unset.
|
|
61
|
+
/// When the timeout *is* set in the `ActivityOptions`, throttling is set to
|
|
62
|
+
/// `heartbeat_timeout * 0.8`.
|
|
63
|
+
#[builder(default = "Duration::from_secs(30)")]
|
|
64
|
+
pub default_heartbeat_throttle_interval: Duration,
|
|
54
65
|
}
|
|
55
66
|
|
|
56
67
|
impl WorkerConfigBuilder {
|
|
@@ -6,7 +6,7 @@ use crate::{
|
|
|
6
6
|
};
|
|
7
7
|
use arc_swap::ArcSwap;
|
|
8
8
|
use futures::future::join_all;
|
|
9
|
-
use std::{collections::HashMap, ops::Deref, sync::Arc};
|
|
9
|
+
use std::{collections::HashMap, ops::Deref, option::Option, sync::Arc};
|
|
10
10
|
use tokio::sync::Notify;
|
|
11
11
|
|
|
12
12
|
/// Allows access to workers by task queue name
|
|
@@ -40,7 +40,7 @@ impl WorkerDispatcher {
|
|
|
40
40
|
.workers
|
|
41
41
|
.load()
|
|
42
42
|
.get(&tq)
|
|
43
|
-
.map(
|
|
43
|
+
.map(Option::is_some)
|
|
44
44
|
.unwrap_or_default()
|
|
45
45
|
{
|
|
46
46
|
return Err(WorkerRegistrationError::WorkerAlreadyRegisteredForQueue(tq));
|
|
@@ -77,7 +77,7 @@ impl WorkerDispatcher {
|
|
|
77
77
|
self.workers.rcu(|map| {
|
|
78
78
|
let mut map = HashMap::clone(map);
|
|
79
79
|
if maybe_worker.is_none() {
|
|
80
|
-
maybe_worker = map.get_mut(task_queue).and_then(
|
|
80
|
+
maybe_worker = map.get_mut(task_queue).and_then(Option::take);
|
|
81
81
|
}
|
|
82
82
|
map
|
|
83
83
|
});
|
|
@@ -149,7 +149,7 @@ impl Deref for WorkerRefCt {
|
|
|
149
149
|
type Target = Worker;
|
|
150
150
|
|
|
151
151
|
fn deref(&self) -> &Self::Target {
|
|
152
|
-
self.inner.
|
|
152
|
+
self.inner.as_deref().expect("Must exist")
|
|
153
153
|
}
|
|
154
154
|
}
|
|
155
155
|
|
|
@@ -161,7 +161,7 @@ impl Drop for WorkerRefCt {
|
|
|
161
161
|
Some(arc) => {
|
|
162
162
|
// We wait until 2 rather than 1 because we ourselves still have an Arc
|
|
163
163
|
if Arc::strong_count(arc) == 2 {
|
|
164
|
-
self.notify.notify_one()
|
|
164
|
+
self.notify.notify_one();
|
|
165
165
|
}
|
|
166
166
|
}
|
|
167
167
|
};
|
|
@@ -174,6 +174,8 @@ impl Worker {
|
|
|
174
174
|
ap,
|
|
175
175
|
sg.gw.clone(),
|
|
176
176
|
metrics.clone(),
|
|
177
|
+
config.max_heartbeat_throttle_interval,
|
|
178
|
+
config.default_heartbeat_throttle_interval,
|
|
177
179
|
)
|
|
178
180
|
}),
|
|
179
181
|
workflows_semaphore: Semaphore::new(config.max_outstanding_workflow_tasks),
|
|
@@ -253,7 +255,7 @@ impl Worker {
|
|
|
253
255
|
if let Some(at_mgr) = self.at_task_mgr.as_ref() {
|
|
254
256
|
let tt = details.task_token.clone();
|
|
255
257
|
if let Err(e) = at_mgr.record_heartbeat(details) {
|
|
256
|
-
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.")
|
|
258
|
+
warn!(task_token = ?tt, details = ?e, "Activity heartbeat failed.");
|
|
257
259
|
}
|
|
258
260
|
}
|
|
259
261
|
}
|
|
@@ -327,20 +329,28 @@ impl Worker {
|
|
|
327
329
|
completion: WfActivationCompletion,
|
|
328
330
|
) -> Result<(), CompleteWfError> {
|
|
329
331
|
let wfstatus = completion.status;
|
|
330
|
-
let
|
|
332
|
+
let report_outcome = match wfstatus {
|
|
331
333
|
Some(wf_activation_completion::Status::Successful(success)) => {
|
|
332
334
|
self.wf_activation_success(&completion.run_id, success)
|
|
333
335
|
.await
|
|
334
336
|
}
|
|
337
|
+
|
|
335
338
|
Some(wf_activation_completion::Status::Failed(failure)) => {
|
|
336
|
-
self.wf_activation_failed(
|
|
339
|
+
self.wf_activation_failed(
|
|
340
|
+
&completion.run_id,
|
|
341
|
+
WorkflowTaskFailedCause::Unspecified,
|
|
342
|
+
failure,
|
|
343
|
+
)
|
|
344
|
+
.await
|
|
345
|
+
}
|
|
346
|
+
None => {
|
|
347
|
+
return Err(CompleteWfError::MalformedWorkflowCompletion {
|
|
348
|
+
reason: "Workflow completion had empty status field".to_owned(),
|
|
349
|
+
completion: None,
|
|
350
|
+
})
|
|
337
351
|
}
|
|
338
|
-
None => Err(CompleteWfError::MalformedWorkflowCompletion {
|
|
339
|
-
reason: "Workflow completion had empty status field".to_owned(),
|
|
340
|
-
completion: None,
|
|
341
|
-
}),
|
|
342
352
|
}?;
|
|
343
|
-
self.after_workflow_activation(&completion.run_id,
|
|
353
|
+
self.after_workflow_activation(&completion.run_id, report_outcome);
|
|
344
354
|
Ok(())
|
|
345
355
|
}
|
|
346
356
|
|
|
@@ -354,7 +364,7 @@ impl Worker {
|
|
|
354
364
|
|
|
355
365
|
/// Tell the worker a workflow task has completed, for tracking max outstanding WFTs
|
|
356
366
|
pub(crate) fn return_workflow_task_permit(&self) {
|
|
357
|
-
self.workflows_semaphore.add_permits(1)
|
|
367
|
+
self.workflows_semaphore.add_permits(1);
|
|
358
368
|
}
|
|
359
369
|
|
|
360
370
|
pub(crate) fn request_wf_eviction(&self, run_id: &str, reason: impl Into<String>) {
|
|
@@ -486,6 +496,7 @@ impl Worker {
|
|
|
486
496
|
}),
|
|
487
497
|
)
|
|
488
498
|
.await?;
|
|
499
|
+
self.return_workflow_task_permit();
|
|
489
500
|
None
|
|
490
501
|
}
|
|
491
502
|
NewWfTaskOutcome::Evict(e) => {
|
|
@@ -506,7 +517,7 @@ impl Worker {
|
|
|
506
517
|
&self,
|
|
507
518
|
run_id: &str,
|
|
508
519
|
success: workflow_completion::Success,
|
|
509
|
-
) -> Result<
|
|
520
|
+
) -> Result<WFTReportOutcome, CompleteWfError> {
|
|
510
521
|
// Convert to wf commands
|
|
511
522
|
let cmds = success
|
|
512
523
|
.commands
|
|
@@ -550,7 +561,10 @@ impl Worker {
|
|
|
550
561
|
.await
|
|
551
562
|
})
|
|
552
563
|
.await?;
|
|
553
|
-
Ok(
|
|
564
|
+
Ok(WFTReportOutcome {
|
|
565
|
+
reported_to_server: true,
|
|
566
|
+
failed: false,
|
|
567
|
+
})
|
|
554
568
|
}
|
|
555
569
|
Ok(Some(ServerCommandsWithWorkflowInfo {
|
|
556
570
|
task_token,
|
|
@@ -560,9 +574,15 @@ impl Worker {
|
|
|
560
574
|
self.server_gateway
|
|
561
575
|
.respond_legacy_query(task_token, result)
|
|
562
576
|
.await?;
|
|
563
|
-
Ok(
|
|
577
|
+
Ok(WFTReportOutcome {
|
|
578
|
+
reported_to_server: true,
|
|
579
|
+
failed: false,
|
|
580
|
+
})
|
|
564
581
|
}
|
|
565
|
-
Ok(None) => Ok(
|
|
582
|
+
Ok(None) => Ok(WFTReportOutcome {
|
|
583
|
+
reported_to_server: false,
|
|
584
|
+
failed: false,
|
|
585
|
+
}),
|
|
566
586
|
Err(update_err) => {
|
|
567
587
|
// Automatically fail the workflow task in the event we couldn't update machines
|
|
568
588
|
let fail_cause = if matches!(&update_err.source, WFMachinesError::Nondeterminism(_))
|
|
@@ -571,30 +591,13 @@ impl Worker {
|
|
|
571
591
|
} else {
|
|
572
592
|
WorkflowTaskFailedCause::Unspecified
|
|
573
593
|
};
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
.fail_workflow_task(
|
|
582
|
-
tt.clone(),
|
|
583
|
-
fail_cause,
|
|
584
|
-
Some(Failure::application_failure(wft_fail_str.clone(), false)),
|
|
585
|
-
)
|
|
586
|
-
.await
|
|
587
|
-
})
|
|
588
|
-
.await?;
|
|
589
|
-
// We must evict the workflow since we've failed a WFT
|
|
590
|
-
self.request_wf_eviction(
|
|
591
|
-
run_id,
|
|
592
|
-
format!("Workflow task failure: {}", wft_fail_str),
|
|
593
|
-
);
|
|
594
|
-
Ok(true)
|
|
595
|
-
} else {
|
|
596
|
-
Ok(false)
|
|
597
|
-
}
|
|
594
|
+
let wft_fail_str = format!("{:?}", update_err);
|
|
595
|
+
self.wf_activation_failed(
|
|
596
|
+
run_id,
|
|
597
|
+
fail_cause,
|
|
598
|
+
Failure::application_failure(wft_fail_str.clone(), false).into(),
|
|
599
|
+
)
|
|
600
|
+
.await
|
|
598
601
|
}
|
|
599
602
|
}
|
|
600
603
|
}
|
|
@@ -605,35 +608,46 @@ impl Worker {
|
|
|
605
608
|
async fn wf_activation_failed(
|
|
606
609
|
&self,
|
|
607
610
|
run_id: &str,
|
|
611
|
+
cause: WorkflowTaskFailedCause,
|
|
608
612
|
failure: workflow_completion::Failure,
|
|
609
|
-
) -> Result<
|
|
613
|
+
) -> Result<WFTReportOutcome, CompleteWfError> {
|
|
610
614
|
Ok(match self.wft_manager.failed_activation(run_id) {
|
|
611
615
|
FailedActivationOutcome::Report(tt) => {
|
|
616
|
+
warn!(run_id, failure=?failure, "Failing workflow activation");
|
|
612
617
|
self.handle_wft_reporting_errs(run_id, || async {
|
|
613
618
|
self.server_gateway
|
|
614
|
-
.fail_workflow_task(
|
|
615
|
-
tt,
|
|
616
|
-
WorkflowTaskFailedCause::Unspecified,
|
|
617
|
-
failure.failure.map(Into::into),
|
|
618
|
-
)
|
|
619
|
+
.fail_workflow_task(tt, cause, failure.failure.map(Into::into))
|
|
619
620
|
.await
|
|
620
621
|
})
|
|
621
622
|
.await?;
|
|
622
|
-
|
|
623
|
+
WFTReportOutcome {
|
|
624
|
+
reported_to_server: true,
|
|
625
|
+
failed: true,
|
|
626
|
+
}
|
|
623
627
|
}
|
|
624
628
|
FailedActivationOutcome::ReportLegacyQueryFailure(task_token) => {
|
|
629
|
+
warn!(run_id, failure=?failure, "Failing legacy query request");
|
|
625
630
|
self.server_gateway
|
|
626
631
|
.respond_legacy_query(task_token, legacy_query_failure(failure))
|
|
627
632
|
.await?;
|
|
628
|
-
|
|
633
|
+
WFTReportOutcome {
|
|
634
|
+
reported_to_server: true,
|
|
635
|
+
failed: true,
|
|
636
|
+
}
|
|
629
637
|
}
|
|
630
|
-
FailedActivationOutcome::NoReport =>
|
|
638
|
+
FailedActivationOutcome::NoReport => WFTReportOutcome {
|
|
639
|
+
reported_to_server: false,
|
|
640
|
+
failed: true,
|
|
641
|
+
},
|
|
631
642
|
})
|
|
632
643
|
}
|
|
633
644
|
|
|
634
|
-
fn after_workflow_activation(&self, run_id: &str,
|
|
645
|
+
fn after_workflow_activation(&self, run_id: &str, report_outcome: WFTReportOutcome) {
|
|
635
646
|
self.wft_manager.after_wft_report(run_id);
|
|
636
|
-
if
|
|
647
|
+
if report_outcome.reported_to_server || report_outcome.failed {
|
|
648
|
+
// If we failed the WFT but didn't report anything, we still want to release the WFT
|
|
649
|
+
// permit since the server will eventually time out the task and we've already evicted
|
|
650
|
+
// the run.
|
|
637
651
|
self.return_workflow_task_permit();
|
|
638
652
|
}
|
|
639
653
|
self.wft_manager.on_activation_done(run_id);
|
|
@@ -715,6 +729,11 @@ impl WorkerConfig {
|
|
|
715
729
|
}
|
|
716
730
|
}
|
|
717
731
|
|
|
732
|
+
struct WFTReportOutcome {
|
|
733
|
+
reported_to_server: bool,
|
|
734
|
+
failed: bool,
|
|
735
|
+
}
|
|
736
|
+
|
|
718
737
|
#[cfg(test)]
|
|
719
738
|
mod tests {
|
|
720
739
|
use super::*;
|
|
@@ -731,7 +750,7 @@ mod tests {
|
|
|
731
750
|
|
|
732
751
|
let cfg = WorkerConfigBuilder::default()
|
|
733
752
|
.task_queue("whatever")
|
|
734
|
-
.max_outstanding_activities(
|
|
753
|
+
.max_outstanding_activities(5_usize)
|
|
735
754
|
.build()
|
|
736
755
|
.unwrap();
|
|
737
756
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -749,7 +768,7 @@ mod tests {
|
|
|
749
768
|
|
|
750
769
|
let cfg = WorkerConfigBuilder::default()
|
|
751
770
|
.task_queue("whatever")
|
|
752
|
-
.max_outstanding_workflow_tasks(
|
|
771
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
753
772
|
.build()
|
|
754
773
|
.unwrap();
|
|
755
774
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -767,7 +786,7 @@ mod tests {
|
|
|
767
786
|
|
|
768
787
|
let cfg = WorkerConfigBuilder::default()
|
|
769
788
|
.task_queue("whatever")
|
|
770
|
-
.max_outstanding_activities(
|
|
789
|
+
.max_outstanding_activities(5_usize)
|
|
771
790
|
.build()
|
|
772
791
|
.unwrap();
|
|
773
792
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -785,7 +804,7 @@ mod tests {
|
|
|
785
804
|
|
|
786
805
|
let cfg = WorkerConfigBuilder::default()
|
|
787
806
|
.task_queue("whatever")
|
|
788
|
-
.max_outstanding_workflow_tasks(
|
|
807
|
+
.max_outstanding_workflow_tasks(5_usize)
|
|
789
808
|
.build()
|
|
790
809
|
.unwrap();
|
|
791
810
|
let worker = Worker::new(cfg, None, Arc::new(gwref), Default::default());
|
|
@@ -33,7 +33,7 @@ impl DrivenWorkflow {
|
|
|
33
33
|
/// Start the workflow
|
|
34
34
|
pub fn start(&mut self, attribs: WorkflowExecutionStartedEventAttributes) {
|
|
35
35
|
debug!(run_id = %attribs.original_execution_run_id, "Driven WF start");
|
|
36
|
-
self.started_attrs = Some(attribs)
|
|
36
|
+
self.started_attrs = Some(attribs);
|
|
37
37
|
}
|
|
38
38
|
|
|
39
39
|
/// Enqueue a new job to be sent to the driven workflow
|
|
@@ -51,12 +51,12 @@ impl DrivenWorkflow {
|
|
|
51
51
|
|
|
52
52
|
/// Signal the workflow
|
|
53
53
|
pub fn signal(&mut self, signal: SignalWorkflow) {
|
|
54
|
-
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal))
|
|
54
|
+
self.send_job(wf_activation_job::Variant::SignalWorkflow(signal));
|
|
55
55
|
}
|
|
56
56
|
|
|
57
57
|
/// Cancel the workflow
|
|
58
58
|
pub fn cancel(&mut self, attribs: CancelWorkflow) {
|
|
59
|
-
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs))
|
|
59
|
+
self.send_job(wf_activation_job::Variant::CancelWorkflow(attribs));
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
@@ -62,7 +62,7 @@ impl WorkflowManager {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
#[cfg(test)]
|
|
65
|
-
pub fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
65
|
+
pub const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
|
|
66
66
|
Self {
|
|
67
67
|
machines: workflow_machines,
|
|
68
68
|
command_sink: None,
|
|
@@ -35,17 +35,13 @@ impl WorkflowCacheManager {
|
|
|
35
35
|
// Blindly add a record into the cache, since it still has capacity.
|
|
36
36
|
self.cache.put(run_id.to_owned(), ());
|
|
37
37
|
None
|
|
38
|
-
} else if self.cache.cap()
|
|
39
|
-
let maybe_got_evicted = self.cache.peek_lru().map(|r| r.0.to_owned());
|
|
40
|
-
let already_existed = self.cache.put(run_id.to_owned(), ()).is_some();
|
|
41
|
-
if !already_existed {
|
|
42
|
-
maybe_got_evicted
|
|
43
|
-
} else {
|
|
44
|
-
None
|
|
45
|
-
}
|
|
46
|
-
} else {
|
|
38
|
+
} else if self.cache.cap() == 0 {
|
|
47
39
|
// Run id should be evicted right away as cache size is 0.
|
|
48
40
|
Some(run_id.to_owned())
|
|
41
|
+
} else {
|
|
42
|
+
let maybe_got_evicted = self.cache.peek_lru().map(|r| r.0.clone());
|
|
43
|
+
let not_cached = self.cache.put(run_id.to_owned(), ()).is_none();
|
|
44
|
+
not_cached.then(|| maybe_got_evicted).flatten()
|
|
49
45
|
};
|
|
50
46
|
|
|
51
47
|
self.metrics.cache_size(self.cache.len() as u64);
|
|
@@ -75,7 +71,7 @@ mod tests {
|
|
|
75
71
|
assert_matches!(wcm.insert("1"), None);
|
|
76
72
|
assert_matches!(wcm.insert("2"), None);
|
|
77
73
|
assert_matches!(wcm.insert("3"), Some(run_id) => {
|
|
78
|
-
assert_eq!(run_id, "1")
|
|
74
|
+
assert_eq!(run_id, "1");
|
|
79
75
|
});
|
|
80
76
|
}
|
|
81
77
|
|
|
@@ -88,7 +84,7 @@ mod tests {
|
|
|
88
84
|
wcm.remove("1");
|
|
89
85
|
assert_matches!(wcm.insert("2"), None);
|
|
90
86
|
assert_matches!(wcm.insert("3"), Some(run_id) => {
|
|
91
|
-
assert_eq!(run_id, "2")
|
|
87
|
+
assert_eq!(run_id, "2");
|
|
92
88
|
});
|
|
93
89
|
}
|
|
94
90
|
|
|
@@ -110,7 +106,7 @@ mod tests {
|
|
|
110
106
|
assert_matches!(wcm.insert("2"), None);
|
|
111
107
|
wcm.touch("1");
|
|
112
108
|
assert_matches!(wcm.insert("3"), Some(run_id) => {
|
|
113
|
-
assert_eq!(run_id, "2")
|
|
109
|
+
assert_eq!(run_id, "2");
|
|
114
110
|
});
|
|
115
111
|
}
|
|
116
112
|
|
|
@@ -123,7 +119,7 @@ mod tests {
|
|
|
123
119
|
assert_matches!(wcm.insert("1"), None);
|
|
124
120
|
assert_matches!(wcm.insert("2"), None);
|
|
125
121
|
assert_matches!(wcm.insert("3"), Some(run_id) => {
|
|
126
|
-
assert_eq!(run_id, "1")
|
|
122
|
+
assert_eq!(run_id, "1");
|
|
127
123
|
});
|
|
128
124
|
}
|
|
129
125
|
|
|
@@ -133,10 +129,10 @@ mod tests {
|
|
|
133
129
|
max_cached_workflows: 0,
|
|
134
130
|
});
|
|
135
131
|
assert_matches!(wcm.insert("1"), Some(run_id) => {
|
|
136
|
-
assert_eq!(run_id, "1")
|
|
132
|
+
assert_eq!(run_id, "1");
|
|
137
133
|
});
|
|
138
134
|
assert_matches!(wcm.insert("2"), Some(run_id) => {
|
|
139
|
-
assert_eq!(run_id, "2")
|
|
135
|
+
assert_eq!(run_id, "2");
|
|
140
136
|
});
|
|
141
137
|
}
|
|
142
138
|
|
|
@@ -144,10 +140,10 @@ mod tests {
|
|
|
144
140
|
fn non_sticky_always_pending_eviction() {
|
|
145
141
|
let mut wcm = WorkflowCacheManager::new_test(WorkflowCachingPolicy::NonSticky);
|
|
146
142
|
assert_matches!(wcm.insert("1"), Some(run_id) => {
|
|
147
|
-
assert_eq!(run_id, "1")
|
|
143
|
+
assert_eq!(run_id, "1");
|
|
148
144
|
});
|
|
149
145
|
assert_matches!(wcm.insert("2"), Some(run_id) => {
|
|
150
|
-
assert_eq!(run_id, "2")
|
|
146
|
+
assert_eq!(run_id, "2");
|
|
151
147
|
});
|
|
152
148
|
}
|
|
153
149
|
}
|
|
@@ -184,11 +184,7 @@ impl WorkflowConcurrencyManager {
|
|
|
184
184
|
pub fn delete_activation(&self, run_id: &str) -> Option<OutstandingActivation> {
|
|
185
185
|
let mut writelock = self.runs.write();
|
|
186
186
|
let machine_ref = writelock.get_mut(run_id);
|
|
187
|
-
|
|
188
|
-
run.activation.take()
|
|
189
|
-
} else {
|
|
190
|
-
None
|
|
191
|
-
}
|
|
187
|
+
machine_ref.and_then(|run| run.activation.take())
|
|
192
188
|
}
|
|
193
189
|
|
|
194
190
|
pub fn exists(&self, run_id: &str) -> bool {
|
|
@@ -270,14 +266,14 @@ impl WorkflowConcurrencyManager {
|
|
|
270
266
|
/// Remove the workflow with the provided run id from management
|
|
271
267
|
pub fn evict(&self, run_id: &str) -> Option<ValidPollWFTQResponse> {
|
|
272
268
|
let val = self.runs.write().remove(run_id);
|
|
273
|
-
val.
|
|
269
|
+
val.and_then(|v| v.buffered_resp)
|
|
274
270
|
}
|
|
275
271
|
|
|
276
272
|
/// Clear and return any buffered polling response for this run ID
|
|
277
273
|
pub fn take_buffered_poll(&self, run_id: &str) -> Option<ValidPollWFTQResponse> {
|
|
278
274
|
let mut writelock = self.runs.write();
|
|
279
275
|
let val = writelock.get_mut(run_id);
|
|
280
|
-
val.
|
|
276
|
+
val.and_then(|v| v.buffered_resp.take())
|
|
281
277
|
}
|
|
282
278
|
|
|
283
279
|
pub fn outstanding_wft(&self) -> usize {
|
|
@@ -311,6 +307,6 @@ mod tests {
|
|
|
311
307
|
)
|
|
312
308
|
.await;
|
|
313
309
|
// Should whine that the machines have nothing to do (history empty)
|
|
314
|
-
assert_matches!(res.unwrap_err(), WFMachinesError::Fatal { .. })
|
|
310
|
+
assert_matches!(res.unwrap_err(), WFMachinesError::Fatal { .. });
|
|
315
311
|
}
|
|
316
312
|
}
|