@temporalio/core-bridge 1.16.0 → 1.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.cargo/config.toml +1 -1
- package/sdk-core/crates/sdk-core/src/core_tests/workers.rs +112 -0
- package/sdk-core/crates/sdk-core/src/lib.rs +7 -7
- package/sdk-core/crates/sdk-core/src/pollers/mod.rs +95 -0
- package/sdk-core/crates/sdk-core/src/pollers/poll_buffer.rs +29 -20
- package/sdk-core/crates/sdk-core/src/worker/activities.rs +10 -4
- package/sdk-core/crates/sdk-core/src/worker/mod.rs +89 -52
- package/sdk-core/crates/sdk-core/src/worker/workflow/wft_poller.rs +28 -9
- package/sdk-core/crates/sdk-core/tests/cloud_tests.rs +5 -0
- package/sdk-core/crates/sdk-core/tests/common/mod.rs +4 -0
- package/sdk-core/crates/sdk-core/tests/integ_tests/worker_heartbeat_tests.rs +55 -48
- package/sdk-core/crates/sdk-core/tests/integ_tests/worker_tests.rs +5 -0
- package/sdk-core/crates/sdk-core/tests/shared_tests/mod.rs +123 -5
- package/sdk-core/crates/sdk-core-c-bridge/src/worker.rs +1 -0
- package/src/worker.rs +7 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@temporalio/core-bridge",
|
|
3
|
-
"version": "1.16.0",
|
|
3
|
+
"version": "1.16.2",
|
|
4
4
|
"description": "Temporal.io SDK Core<>Node bridge",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"types": "lib/index.d.ts",
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
"license": "MIT",
|
|
15
15
|
"dependencies": {
|
|
16
16
|
"@grpc/grpc-js": "^1.12.4",
|
|
17
|
-
"@temporalio/common": "1.16.0"
|
|
17
|
+
"@temporalio/common": "1.16.2"
|
|
18
18
|
},
|
|
19
19
|
"devDependencies": {
|
|
20
20
|
"arg": "^5.0.2",
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -13,7 +13,15 @@ use crate::{
|
|
|
13
13
|
},
|
|
14
14
|
};
|
|
15
15
|
use futures_util::{stream, stream::StreamExt};
|
|
16
|
+
use std::sync::{
|
|
17
|
+
Arc,
|
|
18
|
+
atomic::{AtomicBool, Ordering},
|
|
19
|
+
};
|
|
16
20
|
use std::{cell::RefCell, collections::HashMap, time::Duration};
|
|
21
|
+
use temporalio_common::protos::temporal::api::{
|
|
22
|
+
namespace::v1::{NamespaceInfo, namespace_info::Capabilities},
|
|
23
|
+
workflowservice::v1::DescribeNamespaceResponse,
|
|
24
|
+
};
|
|
17
25
|
use temporalio_common::{
|
|
18
26
|
protos::{
|
|
19
27
|
canned_histories,
|
|
@@ -51,6 +59,7 @@ use temporalio_common::{
|
|
|
51
59
|
},
|
|
52
60
|
worker::WorkerTaskTypes,
|
|
53
61
|
};
|
|
62
|
+
use tokio::sync::Notify;
|
|
54
63
|
use tokio::sync::{Barrier, watch};
|
|
55
64
|
use uuid::Uuid;
|
|
56
65
|
|
|
@@ -1209,3 +1218,106 @@ async fn nexus_start_operation_failure_converts_to_legacy_for_old_server(
|
|
|
1209
1218
|
worker.shutdown().await;
|
|
1210
1219
|
worker.finalize_shutdown().await;
|
|
1211
1220
|
}
|
|
1221
|
+
|
|
1222
|
+
/// Verifies that `initiate_shutdown` sends the `ShutdownWorker` RPC so that the server can
|
|
1223
|
+
/// complete in-flight polls. Without this, graceful poll shutdown deadlocks: the SDK waits for
|
|
1224
|
+
/// polls to drain, but the server was never told to flush them.
|
|
1225
|
+
#[tokio::test]
|
|
1226
|
+
async fn graceful_shutdown_sends_shutdown_worker_rpc_during_initiate() {
|
|
1227
|
+
let shutdown_rpc_called = Arc::new(AtomicBool::new(false));
|
|
1228
|
+
let shutdown_rpc_called_clone = shutdown_rpc_called.clone();
|
|
1229
|
+
// When the shutdown_worker RPC fires, it signals polls to complete (simulating server
|
|
1230
|
+
// behavior where ShutdownWorker causes the server to return empty poll responses).
|
|
1231
|
+
let poll_releaser = Arc::new(Notify::new());
|
|
1232
|
+
let poll_releaser_for_rpc = poll_releaser.clone();
|
|
1233
|
+
|
|
1234
|
+
let mut mock_client = MockWorkerClient::new();
|
|
1235
|
+
mock_client
|
|
1236
|
+
.expect_capabilities()
|
|
1237
|
+
.returning(|| Some(*DEFAULT_TEST_CAPABILITIES));
|
|
1238
|
+
mock_client
|
|
1239
|
+
.expect_workers()
|
|
1240
|
+
.returning(|| DEFAULT_WORKERS_REGISTRY.clone());
|
|
1241
|
+
mock_client.expect_is_mock().returning(|| true);
|
|
1242
|
+
mock_client
|
|
1243
|
+
.expect_sdk_name_and_version()
|
|
1244
|
+
.returning(|| ("test-core".to_string(), "0.0.0".to_string()));
|
|
1245
|
+
mock_client
|
|
1246
|
+
.expect_identity()
|
|
1247
|
+
.returning(|| "test-identity".to_string());
|
|
1248
|
+
mock_client
|
|
1249
|
+
.expect_worker_grouping_key()
|
|
1250
|
+
.returning(Uuid::new_v4);
|
|
1251
|
+
mock_client
|
|
1252
|
+
.expect_worker_instance_key()
|
|
1253
|
+
.returning(Uuid::new_v4);
|
|
1254
|
+
mock_client
|
|
1255
|
+
.expect_set_heartbeat_client_fields()
|
|
1256
|
+
.returning(|hb| {
|
|
1257
|
+
hb.sdk_name = "test-core".to_string();
|
|
1258
|
+
hb.sdk_version = "0.0.0".to_string();
|
|
1259
|
+
hb.worker_identity = "test-identity".to_string();
|
|
1260
|
+
hb.heartbeat_time = Some(std::time::SystemTime::now().into());
|
|
1261
|
+
});
|
|
1262
|
+
// Return the worker_poll_complete_on_shutdown capability so validate() enables graceful mode
|
|
1263
|
+
mock_client.expect_describe_namespace().returning(move || {
|
|
1264
|
+
Ok(DescribeNamespaceResponse {
|
|
1265
|
+
namespace_info: Some(NamespaceInfo {
|
|
1266
|
+
capabilities: Some(Capabilities {
|
|
1267
|
+
worker_poll_complete_on_shutdown: true,
|
|
1268
|
+
..Capabilities::default()
|
|
1269
|
+
}),
|
|
1270
|
+
..NamespaceInfo::default()
|
|
1271
|
+
}),
|
|
1272
|
+
..DescribeNamespaceResponse::default()
|
|
1273
|
+
})
|
|
1274
|
+
});
|
|
1275
|
+
// When shutdown_worker RPC is called, mark it and release polls
|
|
1276
|
+
mock_client
|
|
1277
|
+
.expect_shutdown_worker()
|
|
1278
|
+
.returning(move |_, _, _, _| {
|
|
1279
|
+
shutdown_rpc_called_clone.store(true, Ordering::SeqCst);
|
|
1280
|
+
poll_releaser_for_rpc.notify_waiters();
|
|
1281
|
+
Ok(ShutdownWorkerResponse {})
|
|
1282
|
+
});
|
|
1283
|
+
mock_client
|
|
1284
|
+
.expect_complete_workflow_task()
|
|
1285
|
+
.returning(|_| Ok(RespondWorkflowTaskCompletedResponse::default()));
|
|
1286
|
+
|
|
1287
|
+
// Polls block until shutdown_worker RPC releases them (simulating server holding polls
|
|
1288
|
+
// open until it receives the ShutdownWorker signal)
|
|
1289
|
+
let poll_releaser_for_stream = poll_releaser.clone();
|
|
1290
|
+
let stream = stream::unfold(poll_releaser_for_stream, |releaser| async move {
|
|
1291
|
+
releaser.notified().await;
|
|
1292
|
+
Some((
|
|
1293
|
+
Ok(PollWorkflowTaskQueueResponse::default().try_into().unwrap()),
|
|
1294
|
+
releaser,
|
|
1295
|
+
))
|
|
1296
|
+
});
|
|
1297
|
+
|
|
1298
|
+
let mw = MockWorkerInputs::new(stream.boxed());
|
|
1299
|
+
let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw));
|
|
1300
|
+
|
|
1301
|
+
// validate() reads describe_namespace and sets capabilities.graceful_poll_shutdown = true
|
|
1302
|
+
worker.validate().await.unwrap();
|
|
1303
|
+
|
|
1304
|
+
let poll_fut = worker.poll_workflow_activation();
|
|
1305
|
+
let shutdown_fut = async {
|
|
1306
|
+
// initiate_shutdown must send the ShutdownWorker RPC, which releases the polls
|
|
1307
|
+
worker.initiate_shutdown();
|
|
1308
|
+
};
|
|
1309
|
+
|
|
1310
|
+
let (poll_result, _) = tokio::time::timeout(Duration::from_secs(5), async {
|
|
1311
|
+
tokio::join!(poll_fut, shutdown_fut)
|
|
1312
|
+
})
|
|
1313
|
+
.await
|
|
1314
|
+
.expect("Shutdown should complete within 5s -- if it hangs, the ShutdownWorker RPC was not sent during initiate_shutdown");
|
|
1315
|
+
|
|
1316
|
+
assert_matches!(poll_result.unwrap_err(), PollError::ShutDown);
|
|
1317
|
+
assert!(
|
|
1318
|
+
shutdown_rpc_called.load(Ordering::SeqCst),
|
|
1319
|
+
"ShutdownWorker RPC must be called during initiate_shutdown"
|
|
1320
|
+
);
|
|
1321
|
+
|
|
1322
|
+
worker.finalize_shutdown().await;
|
|
1323
|
+
}
|
|
@@ -42,13 +42,13 @@ pub use temporalio_common::protos::TaskToken;
|
|
|
42
42
|
pub use url::Url;
|
|
43
43
|
pub use worker::{
|
|
44
44
|
ActivitySlotKind, CompleteActivityError, CompleteNexusError, CompleteWfError,
|
|
45
|
-
FixedSizeSlotSupplier, LocalActivitySlotKind, NexusSlotKind, PollError,
|
|
46
|
-
ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder,
|
|
47
|
-
ResourceSlotOptions, SlotInfo, SlotInfoTrait, SlotKind, SlotKindType,
|
|
48
|
-
SlotReleaseContext, SlotReservationContext, SlotSupplier,
|
|
49
|
-
SlotSupplierPermit, TunerBuilder, TunerHolder, TunerHolderOptions,
|
|
50
|
-
Worker, WorkerConfig, WorkerConfigBuilder, WorkerTuner,
|
|
51
|
-
WorkerVersioningStrategy, WorkflowErrorType, WorkflowSlotKind,
|
|
45
|
+
FixedSizeSlotSupplier, LocalActivitySlotKind, NamespaceCapabilities, NexusSlotKind, PollError,
|
|
46
|
+
PollerBehavior, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder,
|
|
47
|
+
ResourceBasedTuner, ResourceSlotOptions, SlotInfo, SlotInfoTrait, SlotKind, SlotKindType,
|
|
48
|
+
SlotMarkUsedContext, SlotReleaseContext, SlotReservationContext, SlotSupplier,
|
|
49
|
+
SlotSupplierOptions, SlotSupplierPermit, TunerBuilder, TunerHolder, TunerHolderOptions,
|
|
50
|
+
TunerHolderOptionsBuilder, Worker, WorkerConfig, WorkerConfigBuilder, WorkerTuner,
|
|
51
|
+
WorkerValidationError, WorkerVersioningStrategy, WorkflowErrorType, WorkflowSlotKind,
|
|
52
52
|
};
|
|
53
53
|
|
|
54
54
|
use crate::{
|
|
@@ -157,6 +157,11 @@ where
|
|
|
157
157
|
return match state.poller.poll().await {
|
|
158
158
|
Some(Ok((task, permit))) => {
|
|
159
159
|
if task == Default::default() {
|
|
160
|
+
if state.poller_was_shutdown {
|
|
161
|
+
// Server sent an empty response after we initiated
|
|
162
|
+
// shutdown — this is the graceful shutdown signal.
|
|
163
|
+
return None;
|
|
164
|
+
}
|
|
160
165
|
// We get the default proto in the event that the long poll
|
|
161
166
|
// times out.
|
|
162
167
|
debug!("Poll {} task timeout", T::task_name());
|
|
@@ -276,3 +281,93 @@ pub(crate) fn new_nexus_task_poller(
|
|
|
276
281
|
)
|
|
277
282
|
.into_stream()
|
|
278
283
|
}
|
|
284
|
+
|
|
285
|
+
#[cfg(test)]
|
|
286
|
+
mod tests {
|
|
287
|
+
use super::*;
|
|
288
|
+
use crate::{
|
|
289
|
+
abstractions::tests::fixed_size_permit_dealer, pollers::MockPermittedPollBuffer,
|
|
290
|
+
test_help::mock_poller, worker::ActivitySlotKind,
|
|
291
|
+
};
|
|
292
|
+
use futures_util::{StreamExt, pin_mut};
|
|
293
|
+
use std::sync::{
|
|
294
|
+
Arc,
|
|
295
|
+
atomic::{AtomicUsize, Ordering},
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
/// Verify that empty responses after shutdown are not treated as poll timeout and retried
|
|
299
|
+
/// indefinitely
|
|
300
|
+
#[tokio::test]
|
|
301
|
+
async fn empty_response_after_shutdown_terminates_stream() {
|
|
302
|
+
let poll_count = Arc::new(AtomicUsize::new(0));
|
|
303
|
+
let poll_count_clone = poll_count.clone();
|
|
304
|
+
|
|
305
|
+
let mut mock_poller = mock_poller();
|
|
306
|
+
mock_poller.expect_poll().returning(move || {
|
|
307
|
+
poll_count_clone.fetch_add(1, Ordering::SeqCst);
|
|
308
|
+
Some(Ok(PollActivityTaskQueueResponse::default()))
|
|
309
|
+
});
|
|
310
|
+
|
|
311
|
+
let sem = Arc::new(fixed_size_permit_dealer::<ActivitySlotKind>(10));
|
|
312
|
+
let shutdown_token = CancellationToken::new();
|
|
313
|
+
|
|
314
|
+
let stream = new_activity_task_poller(
|
|
315
|
+
Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
|
|
316
|
+
MetricsContext::no_op(),
|
|
317
|
+
shutdown_token.clone(),
|
|
318
|
+
);
|
|
319
|
+
pin_mut!(stream);
|
|
320
|
+
|
|
321
|
+
shutdown_token.cancel();
|
|
322
|
+
|
|
323
|
+
let result = tokio::time::timeout(std::time::Duration::from_secs(2), stream.next()).await;
|
|
324
|
+
assert!(
|
|
325
|
+
result.is_ok(),
|
|
326
|
+
"Stream should terminate promptly after shutdown, not hang"
|
|
327
|
+
);
|
|
328
|
+
assert!(
|
|
329
|
+
result.unwrap().is_none(),
|
|
330
|
+
"Stream should return None (terminated) on empty response after shutdown"
|
|
331
|
+
);
|
|
332
|
+
|
|
333
|
+
let total = poll_count.load(Ordering::SeqCst);
|
|
334
|
+
assert!(
|
|
335
|
+
total < 5,
|
|
336
|
+
"Expected stream to terminate quickly, but poller was called {total} times"
|
|
337
|
+
);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
#[tokio::test]
|
|
341
|
+
async fn empty_response_before_shutdown_retries() {
|
|
342
|
+
let mut mock_poller = mock_poller();
|
|
343
|
+
let call_count = Arc::new(AtomicUsize::new(0));
|
|
344
|
+
let call_count_clone = call_count.clone();
|
|
345
|
+
mock_poller.expect_poll().returning(move || {
|
|
346
|
+
let n = call_count_clone.fetch_add(1, Ordering::SeqCst);
|
|
347
|
+
if n < 2 {
|
|
348
|
+
Some(Ok(PollActivityTaskQueueResponse::default()))
|
|
349
|
+
} else {
|
|
350
|
+
None
|
|
351
|
+
}
|
|
352
|
+
});
|
|
353
|
+
|
|
354
|
+
let sem = Arc::new(fixed_size_permit_dealer::<ActivitySlotKind>(10));
|
|
355
|
+
let shutdown_token = CancellationToken::new();
|
|
356
|
+
|
|
357
|
+
let stream = new_activity_task_poller(
|
|
358
|
+
Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
|
|
359
|
+
MetricsContext::no_op(),
|
|
360
|
+
shutdown_token,
|
|
361
|
+
);
|
|
362
|
+
pin_mut!(stream);
|
|
363
|
+
|
|
364
|
+
// Without shutdown, empty responses should be skipped and the stream terminates
|
|
365
|
+
// only when the poller returns None.
|
|
366
|
+
let result = stream.next().await;
|
|
367
|
+
assert!(
|
|
368
|
+
result.is_none(),
|
|
369
|
+
"Stream should end when poller returns None"
|
|
370
|
+
);
|
|
371
|
+
assert_eq!(call_count.load(Ordering::SeqCst), 3);
|
|
372
|
+
}
|
|
373
|
+
}
|
|
@@ -2,8 +2,8 @@ use crate::{
|
|
|
2
2
|
abstractions::{ActiveCounter, MeteredPermitDealer, OwnedMeteredSemPermit, dbg_panic},
|
|
3
3
|
pollers::{self, Poller},
|
|
4
4
|
worker::{
|
|
5
|
-
ActivitySlotKind, NexusSlotKind, PollerBehavior, SlotKind,
|
|
6
|
-
WorkflowSlotKind,
|
|
5
|
+
ActivitySlotKind, NamespaceCapabilities, NexusSlotKind, PollerBehavior, SlotKind,
|
|
6
|
+
WFTPollerShared, WorkflowSlotKind,
|
|
7
7
|
client::{PollActivityOptions, PollOptions, PollWorkflowOptions, WorkerClient},
|
|
8
8
|
},
|
|
9
9
|
};
|
|
@@ -77,7 +77,7 @@ impl LongPollBuffer<PollWorkflowTaskQueueResponse, WorkflowSlotKind> {
|
|
|
77
77
|
num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
|
|
78
78
|
options: WorkflowTaskOptions,
|
|
79
79
|
last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
|
|
80
|
-
|
|
80
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
81
81
|
) -> Self {
|
|
82
82
|
let is_sticky = sticky_queue.is_some();
|
|
83
83
|
let poll_scaler = PollScaler::new(
|
|
@@ -140,7 +140,7 @@ impl LongPollBuffer<PollWorkflowTaskQueueResponse, WorkflowSlotKind> {
|
|
|
140
140
|
poll_scaler,
|
|
141
141
|
pre_permit_delay,
|
|
142
142
|
post_poll_fn,
|
|
143
|
-
|
|
143
|
+
capabilities,
|
|
144
144
|
)
|
|
145
145
|
}
|
|
146
146
|
}
|
|
@@ -156,7 +156,7 @@ impl LongPollBuffer<PollActivityTaskQueueResponse, ActivitySlotKind> {
|
|
|
156
156
|
num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
|
|
157
157
|
options: ActivityTaskOptions,
|
|
158
158
|
last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
|
|
159
|
-
|
|
159
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
160
160
|
) -> Self {
|
|
161
161
|
let pre_permit_delay = options
|
|
162
162
|
.max_worker_acts_per_second
|
|
@@ -209,7 +209,7 @@ impl LongPollBuffer<PollActivityTaskQueueResponse, ActivitySlotKind> {
|
|
|
209
209
|
poll_scaler,
|
|
210
210
|
pre_permit_delay,
|
|
211
211
|
None::<fn(&PollActivityTaskQueueResponse)>,
|
|
212
|
-
|
|
212
|
+
capabilities,
|
|
213
213
|
)
|
|
214
214
|
}
|
|
215
215
|
}
|
|
@@ -225,7 +225,7 @@ impl LongPollBuffer<PollNexusTaskQueueResponse, NexusSlotKind> {
|
|
|
225
225
|
num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
|
|
226
226
|
last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
|
|
227
227
|
send_heartbeat: bool,
|
|
228
|
-
|
|
228
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
229
229
|
) -> Self {
|
|
230
230
|
let no_retry = if matches!(poller_behavior, PollerBehavior::Autoscaling { .. }) {
|
|
231
231
|
Some(NoRetryOnMatching {
|
|
@@ -262,7 +262,7 @@ impl LongPollBuffer<PollNexusTaskQueueResponse, NexusSlotKind> {
|
|
|
262
262
|
),
|
|
263
263
|
None::<fn() -> BoxFuture<'static, ()>>,
|
|
264
264
|
None::<fn(&PollNexusTaskQueueResponse)>,
|
|
265
|
-
|
|
265
|
+
capabilities,
|
|
266
266
|
)
|
|
267
267
|
}
|
|
268
268
|
}
|
|
@@ -288,7 +288,7 @@ where
|
|
|
288
288
|
mut poll_scaler: PollScaler<F>,
|
|
289
289
|
pre_permit_delay: Option<impl Fn() -> DelayFut + Send + Sync + 'static>,
|
|
290
290
|
post_poll_fn: Option<impl Fn(&T) + Send + Sync + 'static>,
|
|
291
|
-
|
|
291
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
292
292
|
) -> Self
|
|
293
293
|
where
|
|
294
294
|
FT: Future<Output = pollers::Result<T>> + Send,
|
|
@@ -359,11 +359,9 @@ where
|
|
|
359
359
|
} else {
|
|
360
360
|
None
|
|
361
361
|
};
|
|
362
|
-
let
|
|
362
|
+
let capabilities = capabilities.clone();
|
|
363
363
|
let poll_task = tokio::spawn(async move {
|
|
364
|
-
let
|
|
365
|
-
|
|
366
|
-
let r = if graceful_shutdown.load(Ordering::Relaxed) {
|
|
364
|
+
let r = if capabilities.graceful_poll_shutdown() {
|
|
367
365
|
pf(timeout_override).await
|
|
368
366
|
} else {
|
|
369
367
|
let poll_interruptor = shutdown.cancelled().then(|_| async move {
|
|
@@ -383,10 +381,11 @@ where
|
|
|
383
381
|
}
|
|
384
382
|
let (should_forward, backoff_duration) = report_handle.poll_result(&r);
|
|
385
383
|
if let Some(duration) = backoff_duration {
|
|
386
|
-
// Apply backoff BEFORE dropping active_guard to prevent next poll from starting
|
|
384
|
+
// Apply backoff BEFORE dropping active_guard to prevent next poll from
|
|
385
|
+
// starting
|
|
387
386
|
tokio::select! {
|
|
388
387
|
_ = tokio::time::sleep(duration) => return,
|
|
389
|
-
_ =
|
|
388
|
+
_ = shutdown.cancelled() => (),
|
|
390
389
|
};
|
|
391
390
|
}
|
|
392
391
|
drop(active_guard);
|
|
@@ -853,7 +852,9 @@ mod tests {
|
|
|
853
852
|
wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
|
|
854
853
|
},
|
|
855
854
|
Arc::new(AtomicCell::new(None)),
|
|
856
|
-
Arc::new(
|
|
855
|
+
Arc::new(NamespaceCapabilities {
|
|
856
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
857
|
+
}),
|
|
857
858
|
);
|
|
858
859
|
|
|
859
860
|
// Poll a bunch of times, "interrupting" it each time, we should only actually have polled
|
|
@@ -910,7 +911,9 @@ mod tests {
|
|
|
910
911
|
wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))),
|
|
911
912
|
},
|
|
912
913
|
Arc::new(AtomicCell::new(None)),
|
|
913
|
-
Arc::new(
|
|
914
|
+
Arc::new(NamespaceCapabilities {
|
|
915
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
916
|
+
}),
|
|
914
917
|
);
|
|
915
918
|
|
|
916
919
|
// Should not see error, unwraps should get empty response
|
|
@@ -987,7 +990,9 @@ mod tests {
|
|
|
987
990
|
wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
|
|
988
991
|
},
|
|
989
992
|
Arc::new(AtomicCell::new(None)),
|
|
990
|
-
Arc::new(
|
|
993
|
+
Arc::new(NamespaceCapabilities {
|
|
994
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
995
|
+
}),
|
|
991
996
|
);
|
|
992
997
|
|
|
993
998
|
let first_task = pb.poll().await.expect("Should get first task");
|
|
@@ -1093,7 +1098,9 @@ mod tests {
|
|
|
1093
1098
|
wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
|
|
1094
1099
|
},
|
|
1095
1100
|
Arc::new(AtomicCell::new(None)),
|
|
1096
|
-
Arc::new(
|
|
1101
|
+
Arc::new(NamespaceCapabilities {
|
|
1102
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
1103
|
+
}),
|
|
1097
1104
|
));
|
|
1098
1105
|
|
|
1099
1106
|
// Trigger the first poll to initialize and get the scaling decision
|
|
@@ -1174,7 +1181,9 @@ mod tests {
|
|
|
1174
1181
|
wft_poller_shared: None,
|
|
1175
1182
|
},
|
|
1176
1183
|
Arc::new(AtomicCell::new(None)),
|
|
1177
|
-
Arc::new(
|
|
1184
|
+
Arc::new(NamespaceCapabilities {
|
|
1185
|
+
graceful_poll_shutdown: AtomicBool::new(graceful),
|
|
1186
|
+
}),
|
|
1178
1187
|
);
|
|
1179
1188
|
|
|
1180
1189
|
let first = pb.poll().await.unwrap().unwrap();
|
|
@@ -733,7 +733,7 @@ mod tests {
|
|
|
733
733
|
abstractions::tests::fixed_size_permit_dealer,
|
|
734
734
|
pollers::{ActivityTaskOptions, LongPollBuffer},
|
|
735
735
|
prost_dur,
|
|
736
|
-
worker::{PollerBehavior, client::mocks::mock_worker_client},
|
|
736
|
+
worker::{NamespaceCapabilities, PollerBehavior, client::mocks::mock_worker_client},
|
|
737
737
|
};
|
|
738
738
|
use crossbeam_utils::atomic::AtomicCell;
|
|
739
739
|
use temporalio_common::protos::coresdk::activity_result::ActivityExecutionResult;
|
|
@@ -781,7 +781,9 @@ mod tests {
|
|
|
781
781
|
max_tps: None,
|
|
782
782
|
},
|
|
783
783
|
Arc::new(AtomicCell::new(None)),
|
|
784
|
-
Arc::new(
|
|
784
|
+
Arc::new(NamespaceCapabilities {
|
|
785
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
786
|
+
}),
|
|
785
787
|
);
|
|
786
788
|
let atm = WorkerActivityTasks::new(
|
|
787
789
|
sem.clone(),
|
|
@@ -874,7 +876,9 @@ mod tests {
|
|
|
874
876
|
max_tps: None,
|
|
875
877
|
},
|
|
876
878
|
Arc::new(AtomicCell::new(None)),
|
|
877
|
-
Arc::new(
|
|
879
|
+
Arc::new(NamespaceCapabilities {
|
|
880
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
881
|
+
}),
|
|
878
882
|
);
|
|
879
883
|
let atm = WorkerActivityTasks::new(
|
|
880
884
|
sem.clone(),
|
|
@@ -949,7 +953,9 @@ mod tests {
|
|
|
949
953
|
max_tps: None,
|
|
950
954
|
},
|
|
951
955
|
Arc::new(AtomicCell::new(None)),
|
|
952
|
-
Arc::new(
|
|
956
|
+
Arc::new(NamespaceCapabilities {
|
|
957
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
958
|
+
}),
|
|
953
959
|
);
|
|
954
960
|
let atm = WorkerActivityTasks::new(
|
|
955
961
|
sem.clone(),
|
|
@@ -61,7 +61,7 @@ use anyhow::bail;
|
|
|
61
61
|
use crossbeam_utils::atomic::AtomicCell;
|
|
62
62
|
use futures_util::{StreamExt, stream};
|
|
63
63
|
use gethostname::gethostname;
|
|
64
|
-
use parking_lot::RwLock;
|
|
64
|
+
use parking_lot::{Mutex, RwLock};
|
|
65
65
|
use slot_provider::SlotProvider;
|
|
66
66
|
use std::{
|
|
67
67
|
any::Any,
|
|
@@ -415,9 +415,24 @@ pub struct Worker {
|
|
|
415
415
|
client_worker_registrator: Arc<ClientWorkerRegistrator>,
|
|
416
416
|
/// Status of the worker
|
|
417
417
|
status: Arc<RwLock<WorkerStatus>>,
|
|
418
|
-
///
|
|
419
|
-
///
|
|
420
|
-
|
|
418
|
+
/// Capabilities as returned by a describe namespace rpc. Not set until after validate() is
|
|
419
|
+
/// called.
|
|
420
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
421
|
+
/// Handle for the spawned ShutdownWorker RPC task, awaited during shutdown.
|
|
422
|
+
shutdown_rpc_handle: Mutex<Option<tokio::task::JoinHandle<()>>>,
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
/// Namespace capabilities discovered via `describe_namespace` during worker validation.
|
|
426
|
+
pub struct NamespaceCapabilities {
|
|
427
|
+
pub(crate) graceful_poll_shutdown: AtomicBool,
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
impl NamespaceCapabilities {
|
|
431
|
+
/// Returns true if the server supports graceful poll cancellation on shutdown, so pollers
|
|
432
|
+
/// can let in-flight polls complete rather than hard-killing them.
|
|
433
|
+
pub fn graceful_poll_shutdown(&self) -> bool {
|
|
434
|
+
self.graceful_poll_shutdown.load(Ordering::Relaxed)
|
|
435
|
+
}
|
|
421
436
|
}
|
|
422
437
|
|
|
423
438
|
struct AllPermitsTracker {
|
|
@@ -490,11 +505,12 @@ impl Worker {
|
|
|
490
505
|
memo_size_limit_error: api_limits.memo_size_limit_error,
|
|
491
506
|
})
|
|
492
507
|
});
|
|
493
|
-
if ns_info
|
|
494
|
-
|
|
495
|
-
.is_some_and(|caps| caps.worker_poll_complete_on_shutdown)
|
|
508
|
+
if let Some(caps) = ns_info.and_then(|ns| ns.capabilities)
|
|
509
|
+
&& caps.worker_poll_complete_on_shutdown
|
|
496
510
|
{
|
|
497
|
-
self.
|
|
511
|
+
self.capabilities
|
|
512
|
+
.graceful_poll_shutdown
|
|
513
|
+
.store(true, Ordering::Relaxed);
|
|
498
514
|
}
|
|
499
515
|
Ok(NamespaceInfo { limits })
|
|
500
516
|
}
|
|
@@ -616,7 +632,9 @@ impl Worker {
|
|
|
616
632
|
let wf_sticky_last_suc_poll_time = Arc::new(AtomicCell::new(None));
|
|
617
633
|
let act_last_suc_poll_time = Arc::new(AtomicCell::new(None));
|
|
618
634
|
let nexus_last_suc_poll_time = Arc::new(AtomicCell::new(None));
|
|
619
|
-
let
|
|
635
|
+
let capabilities = Arc::new(NamespaceCapabilities {
|
|
636
|
+
graceful_poll_shutdown: AtomicBool::new(false),
|
|
637
|
+
});
|
|
620
638
|
|
|
621
639
|
let nexus_slots = MeteredPermitDealer::new(
|
|
622
640
|
tuner.nexus_task_slot_supplier(),
|
|
@@ -637,7 +655,7 @@ impl Worker {
|
|
|
637
655
|
&wft_slots,
|
|
638
656
|
wf_last_suc_poll_time.clone(),
|
|
639
657
|
wf_sticky_last_suc_poll_time.clone(),
|
|
640
|
-
|
|
658
|
+
capabilities.clone(),
|
|
641
659
|
)
|
|
642
660
|
.boxed();
|
|
643
661
|
let stream = if !client.is_mock() {
|
|
@@ -667,7 +685,7 @@ impl Worker {
|
|
|
667
685
|
max_tps: config.max_task_queue_activities_per_second,
|
|
668
686
|
},
|
|
669
687
|
act_last_suc_poll_time.clone(),
|
|
670
|
-
|
|
688
|
+
capabilities.clone(),
|
|
671
689
|
);
|
|
672
690
|
Some(Box::from(ap) as BoxedActPoller)
|
|
673
691
|
} else {
|
|
@@ -685,7 +703,7 @@ impl Worker {
|
|
|
685
703
|
Some(move |np| np_metrics.record_num_pollers(np)),
|
|
686
704
|
nexus_last_suc_poll_time.clone(),
|
|
687
705
|
shared_namespace_worker,
|
|
688
|
-
|
|
706
|
+
capabilities.clone(),
|
|
689
707
|
)) as BoxedNexusPoller)
|
|
690
708
|
} else {
|
|
691
709
|
None
|
|
@@ -905,7 +923,8 @@ impl Worker {
|
|
|
905
923
|
nexus_mgr,
|
|
906
924
|
client_worker_registrator,
|
|
907
925
|
status: worker_status,
|
|
908
|
-
|
|
926
|
+
capabilities,
|
|
927
|
+
shutdown_rpc_handle: Mutex::new(None),
|
|
909
928
|
})
|
|
910
929
|
}
|
|
911
930
|
|
|
@@ -923,43 +942,12 @@ impl Worker {
|
|
|
923
942
|
/// [Worker::finalize_shutdown].
|
|
924
943
|
pub async fn shutdown(&self) {
|
|
925
944
|
self.initiate_shutdown();
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
let
|
|
930
|
-
|
|
931
|
-
.
|
|
932
|
-
.as_ref()
|
|
933
|
-
.map(|hm| hm.heartbeat_callback.clone()());
|
|
934
|
-
let sticky_name = self
|
|
935
|
-
.workflows
|
|
936
|
-
.as_ref()
|
|
937
|
-
.and_then(|wf| wf.get_sticky_queue_name())
|
|
938
|
-
.unwrap_or_default();
|
|
939
|
-
// This is a best effort call and we can still shutdown the worker if it fails
|
|
940
|
-
let task_queue_types = self.config.task_types.to_task_queue_types();
|
|
941
|
-
match self
|
|
942
|
-
.client
|
|
943
|
-
.shutdown_worker(
|
|
944
|
-
sticky_name,
|
|
945
|
-
self.config.task_queue.clone(),
|
|
946
|
-
task_queue_types,
|
|
947
|
-
heartbeat,
|
|
948
|
-
)
|
|
949
|
-
.await
|
|
950
|
-
{
|
|
951
|
-
Err(err)
|
|
952
|
-
if !matches!(
|
|
953
|
-
err.code(),
|
|
954
|
-
tonic::Code::Unimplemented | tonic::Code::Unavailable
|
|
955
|
-
) =>
|
|
956
|
-
{
|
|
957
|
-
warn!(
|
|
958
|
-
"shutdown_worker rpc errored during worker shutdown: {:?}",
|
|
959
|
-
err
|
|
960
|
-
);
|
|
961
|
-
}
|
|
962
|
-
_ => {}
|
|
945
|
+
|
|
946
|
+
// Ensure the ShutdownWorker RPC completes before waiting for polls to drain,
|
|
947
|
+
// otherwise graceful poll shutdown deadlocks.
|
|
948
|
+
let handle = self.shutdown_rpc_handle.lock().take();
|
|
949
|
+
if let Some(handle) = handle {
|
|
950
|
+
let _ = handle.await;
|
|
963
951
|
}
|
|
964
952
|
|
|
965
953
|
// We need to wait for all local activities to finish so no more workflow task heartbeats
|
|
@@ -1354,8 +1342,15 @@ impl Worker {
|
|
|
1354
1342
|
&self.config
|
|
1355
1343
|
}
|
|
1356
1344
|
|
|
1357
|
-
///
|
|
1358
|
-
|
|
1345
|
+
/// Returns the namespace capabilities discovered during [Worker::validate].
|
|
1346
|
+
pub fn get_namespace_capabilities(&self) -> &NamespaceCapabilities {
|
|
1347
|
+
&self.capabilities
|
|
1348
|
+
}
|
|
1349
|
+
|
|
1350
|
+
/// Initiate shutdown, including spawning the `ShutdownWorker` RPC so the server can complete
|
|
1351
|
+
/// in-flight polls. The RPC runs in a background task and is awaited in [Worker::shutdown].
|
|
1352
|
+
///
|
|
1353
|
+
/// You can then wait on `shutdown` or [Worker::finalize_shutdown].
|
|
1359
1354
|
pub fn initiate_shutdown(&self) {
|
|
1360
1355
|
if !self.shutdown_token.is_cancelled() {
|
|
1361
1356
|
info!(
|
|
@@ -1364,6 +1359,7 @@ impl Worker {
|
|
|
1364
1359
|
"Initiated shutdown",
|
|
1365
1360
|
);
|
|
1366
1361
|
}
|
|
1362
|
+
let already_initiated_shutdown = self.shutdown_token.is_cancelled();
|
|
1367
1363
|
self.shutdown_token.cancel();
|
|
1368
1364
|
{
|
|
1369
1365
|
*self.status.write() = WorkerStatus::ShuttingDown;
|
|
@@ -1398,6 +1394,47 @@ impl Worker {
|
|
|
1398
1394
|
la_mgr.workflows_have_shutdown();
|
|
1399
1395
|
}
|
|
1400
1396
|
}
|
|
1397
|
+
|
|
1398
|
+
// Spawn the ShutdownWorker RPC so the server can complete in-flight polls.
|
|
1399
|
+
// The handle is stored and awaited in shutdown() to ensure completion.
|
|
1400
|
+
let mut guard = self.shutdown_rpc_handle.lock();
|
|
1401
|
+
if guard.is_some() || already_initiated_shutdown {
|
|
1402
|
+
return;
|
|
1403
|
+
}
|
|
1404
|
+
|
|
1405
|
+
let client = self.client.clone();
|
|
1406
|
+
let sticky_name = self
|
|
1407
|
+
.workflows
|
|
1408
|
+
.as_ref()
|
|
1409
|
+
.and_then(|wf| wf.get_sticky_queue_name())
|
|
1410
|
+
.unwrap_or_default();
|
|
1411
|
+
let task_queue = self.config.task_queue.clone();
|
|
1412
|
+
let task_queue_types = self.config.task_types.to_task_queue_types();
|
|
1413
|
+
let heartbeat = self
|
|
1414
|
+
.client_worker_registrator
|
|
1415
|
+
.heartbeat_manager
|
|
1416
|
+
.as_ref()
|
|
1417
|
+
.map(|hm| hm.heartbeat_callback.clone()());
|
|
1418
|
+
let handle = tokio::spawn(async move {
|
|
1419
|
+
match client
|
|
1420
|
+
.shutdown_worker(sticky_name, task_queue, task_queue_types, heartbeat)
|
|
1421
|
+
.await
|
|
1422
|
+
{
|
|
1423
|
+
Err(err)
|
|
1424
|
+
if !matches!(
|
|
1425
|
+
err.code(),
|
|
1426
|
+
tonic::Code::Unimplemented | tonic::Code::Unavailable
|
|
1427
|
+
) =>
|
|
1428
|
+
{
|
|
1429
|
+
warn!(
|
|
1430
|
+
"shutdown_worker rpc errored during worker shutdown: {:?}",
|
|
1431
|
+
err
|
|
1432
|
+
);
|
|
1433
|
+
}
|
|
1434
|
+
_ => {}
|
|
1435
|
+
}
|
|
1436
|
+
});
|
|
1437
|
+
*guard = Some(handle);
|
|
1401
1438
|
}
|
|
1402
1439
|
|
|
1403
1440
|
/// Unique identifier for this worker instance.
|
|
@@ -4,12 +4,12 @@ use crate::{
|
|
|
4
4
|
pollers::{BoxedWFPoller, LongPollBuffer, Poller, WorkflowTaskOptions, WorkflowTaskPoller},
|
|
5
5
|
protosext::ValidPollWFTQResponse,
|
|
6
6
|
telemetry::metrics::{workflow_poller, workflow_sticky_poller},
|
|
7
|
-
worker::{WorkflowSlotKind, client::WorkerClient, wft_poller_behavior},
|
|
7
|
+
worker::{NamespaceCapabilities, WorkflowSlotKind, client::WorkerClient, wft_poller_behavior},
|
|
8
8
|
};
|
|
9
9
|
use crossbeam_utils::atomic::AtomicCell;
|
|
10
10
|
use futures_util::{Stream, stream};
|
|
11
11
|
use std::{
|
|
12
|
-
sync::{Arc, OnceLock
|
|
12
|
+
sync::{Arc, OnceLock},
|
|
13
13
|
time::SystemTime,
|
|
14
14
|
};
|
|
15
15
|
use temporalio_common::protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse;
|
|
@@ -26,7 +26,7 @@ pub(crate) fn make_wft_poller(
|
|
|
26
26
|
wft_slots: &MeteredPermitDealer<WorkflowSlotKind>,
|
|
27
27
|
last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
|
|
28
28
|
sticky_last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
|
|
29
|
-
|
|
29
|
+
capabilities: Arc<NamespaceCapabilities>,
|
|
30
30
|
) -> impl Stream<
|
|
31
31
|
Item = Result<
|
|
32
32
|
(
|
|
@@ -60,7 +60,7 @@ pub(crate) fn make_wft_poller(
|
|
|
60
60
|
wft_poller_shared: wft_poller_shared.clone(),
|
|
61
61
|
},
|
|
62
62
|
last_successful_poll_time,
|
|
63
|
-
|
|
63
|
+
capabilities.clone(),
|
|
64
64
|
);
|
|
65
65
|
let sticky_queue_poller = sticky_queue_name.as_ref().map(|sqn| {
|
|
66
66
|
let sticky_metrics = metrics.with_new_attrs([workflow_sticky_poller()]);
|
|
@@ -76,7 +76,7 @@ pub(crate) fn make_wft_poller(
|
|
|
76
76
|
}),
|
|
77
77
|
WorkflowTaskOptions { wft_poller_shared },
|
|
78
78
|
sticky_last_successful_poll_time,
|
|
79
|
-
|
|
79
|
+
capabilities,
|
|
80
80
|
)
|
|
81
81
|
});
|
|
82
82
|
let wf_task_poll_buffer = Box::new(WorkflowTaskPoller::new(
|
|
@@ -228,11 +228,11 @@ fn new_wft_poller(
|
|
|
228
228
|
warn!(error=?e, "Error while polling for workflow tasks");
|
|
229
229
|
Some((Err(e), (poller, metrics)))
|
|
230
230
|
}
|
|
231
|
-
// If poller returns None, it's dead, thus we also return None to terminate
|
|
232
|
-
// stream.
|
|
231
|
+
// If poller returns None, it's dead, thus we also return None to terminate
|
|
232
|
+
// this stream.
|
|
233
233
|
None => {
|
|
234
|
-
// Make sure we call the actual shutdown function here to propagate any
|
|
235
|
-
// inside the polling tasks as errors.
|
|
234
|
+
// Make sure we call the actual shutdown function here to propagate any
|
|
235
|
+
// panics inside the polling tasks as errors.
|
|
236
236
|
poller.shutdown_box().await;
|
|
237
237
|
None
|
|
238
238
|
}
|
|
@@ -281,6 +281,25 @@ mod tests {
|
|
|
281
281
|
assert_matches!(stream.next().await, None);
|
|
282
282
|
}
|
|
283
283
|
|
|
284
|
+
/// When the underlying poller returns None (indicating shutdown), the wrapping WFT stream
|
|
285
|
+
/// should also return None to terminate.
|
|
286
|
+
#[tokio::test]
|
|
287
|
+
async fn poller_returning_none_terminates_wft_stream() {
|
|
288
|
+
let mut mock_poller = mock_poller();
|
|
289
|
+
mock_poller.expect_poll().times(1).returning(|| None);
|
|
290
|
+
mock_poller.expect_shutdown().times(1).returning(|| ());
|
|
291
|
+
|
|
292
|
+
let sem = Arc::new(fixed_size_permit_dealer::<WorkflowSlotKind>(10));
|
|
293
|
+
|
|
294
|
+
let stream = new_wft_poller(
|
|
295
|
+
Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
|
|
296
|
+
MetricsContext::no_op(),
|
|
297
|
+
);
|
|
298
|
+
pin_mut!(stream);
|
|
299
|
+
|
|
300
|
+
assert_matches!(stream.next().await, None);
|
|
301
|
+
}
|
|
302
|
+
|
|
284
303
|
#[tokio::test]
|
|
285
304
|
async fn poll_errors_do_produce_responses() {
|
|
286
305
|
let mut mock_poller = mock_poller();
|
|
@@ -32,3 +32,8 @@ async fn grpc_message_too_large_test() {
|
|
|
32
32
|
async fn priority_values_sent_to_server() {
|
|
33
33
|
shared_tests::priority::priority_values_sent_to_server().await
|
|
34
34
|
}
|
|
35
|
+
|
|
36
|
+
#[tokio::test]
|
|
37
|
+
async fn shutdown_during_active_timer_activity_workflows() {
|
|
38
|
+
shared_tests::shutdown_during_active_timer_activity_workflows().await
|
|
39
|
+
}
|
|
@@ -1076,6 +1076,10 @@ pub(crate) fn integ_dev_server_config(
|
|
|
1076
1076
|
"frontend.WorkerHeartbeatsEnabled=true".to_owned(),
|
|
1077
1077
|
"--dynamic-config-value".to_owned(),
|
|
1078
1078
|
"frontend.ListWorkersEnabled=true".to_owned(),
|
|
1079
|
+
"--dynamic-config-value".to_owned(),
|
|
1080
|
+
"frontend.enableCancelWorkerPollsOnShutdown=true".to_owned(),
|
|
1081
|
+
"--dynamic-config-value".to_owned(),
|
|
1082
|
+
"matching.rps=12000".to_owned(),
|
|
1079
1083
|
"--search-attribute".to_string(),
|
|
1080
1084
|
format!("{SEARCH_ATTR_TXT}=Text"),
|
|
1081
1085
|
"--search-attribute".to_string(),
|
|
@@ -247,43 +247,67 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b
|
|
|
247
247
|
);
|
|
248
248
|
in_activity_checks(heartbeat, &start_time, &heartbeat_time);
|
|
249
249
|
acts_done.notify_one();
|
|
250
|
+
|
|
251
|
+
// Poll until the heartbeat reflects shutdown with the second WFT processed.
|
|
252
|
+
// The worker stays alive (join! waits for both futures) so heartbeats keep firing.
|
|
253
|
+
eventually(
|
|
254
|
+
|| {
|
|
255
|
+
let mut rc = raw_client.clone();
|
|
256
|
+
let ns = client.namespace().to_owned();
|
|
257
|
+
async move {
|
|
258
|
+
let workers_list = WorkflowService::list_workers(
|
|
259
|
+
&mut rc,
|
|
260
|
+
ListWorkersRequest {
|
|
261
|
+
namespace: ns,
|
|
262
|
+
page_size: 100,
|
|
263
|
+
next_page_token: Vec::new(),
|
|
264
|
+
query: String::new(),
|
|
265
|
+
}
|
|
266
|
+
.into_request(),
|
|
267
|
+
)
|
|
268
|
+
.await
|
|
269
|
+
.unwrap()
|
|
270
|
+
.into_inner();
|
|
271
|
+
#[allow(deprecated)]
|
|
272
|
+
let hb = workers_list
|
|
273
|
+
.workers_info
|
|
274
|
+
.iter()
|
|
275
|
+
.find_map(|wi| {
|
|
276
|
+
wi.worker_heartbeat.as_ref().filter(|hb| {
|
|
277
|
+
hb.worker_instance_key == worker_instance_key.to_string()
|
|
278
|
+
})
|
|
279
|
+
})
|
|
280
|
+
.unwrap()
|
|
281
|
+
.clone();
|
|
282
|
+
let tasks_done = hb
|
|
283
|
+
.workflow_task_slots_info
|
|
284
|
+
.as_ref()
|
|
285
|
+
.is_some_and(|s| s.total_processed_tasks >= 2);
|
|
286
|
+
let is_shutting_down = hb.status == WorkerStatus::ShuttingDown as i32;
|
|
287
|
+
if tasks_done && is_shutting_down {
|
|
288
|
+
Ok(hb)
|
|
289
|
+
} else {
|
|
290
|
+
Err(anyhow::anyhow!(
|
|
291
|
+
"Heartbeat not ready: tasks={}, shutting_down={}",
|
|
292
|
+
hb.workflow_task_slots_info
|
|
293
|
+
.as_ref()
|
|
294
|
+
.map_or(0, |s| s.total_processed_tasks),
|
|
295
|
+
is_shutting_down,
|
|
296
|
+
))
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
},
|
|
300
|
+
Duration::from_secs(5),
|
|
301
|
+
)
|
|
302
|
+
.await
|
|
303
|
+
.map(|hb| after_shutdown_checks(&hb, &wf_name, &start_time, &heartbeat_time))
|
|
304
|
+
.unwrap();
|
|
250
305
|
};
|
|
251
306
|
|
|
252
307
|
let runner = async move {
|
|
253
308
|
worker.run_until_done().await.unwrap();
|
|
254
309
|
};
|
|
255
310
|
tokio::join!(test_fut, runner);
|
|
256
|
-
|
|
257
|
-
let client = starter.get_client().await;
|
|
258
|
-
let mut raw_client = client.clone();
|
|
259
|
-
let workers_list = WorkflowService::list_workers(
|
|
260
|
-
&mut raw_client,
|
|
261
|
-
ListWorkersRequest {
|
|
262
|
-
namespace: client.namespace().to_owned(),
|
|
263
|
-
page_size: 100,
|
|
264
|
-
next_page_token: Vec::new(),
|
|
265
|
-
query: String::new(),
|
|
266
|
-
}
|
|
267
|
-
.into_request(),
|
|
268
|
-
)
|
|
269
|
-
.await
|
|
270
|
-
.unwrap()
|
|
271
|
-
.into_inner();
|
|
272
|
-
// Since list_workers finds all workers in the namespace, must find specific worker used in this
|
|
273
|
-
// test
|
|
274
|
-
let worker_info = workers_list
|
|
275
|
-
.workers_info
|
|
276
|
-
.iter()
|
|
277
|
-
.find(|worker_info| {
|
|
278
|
-
if let Some(hb) = worker_info.worker_heartbeat.as_ref() {
|
|
279
|
-
hb.worker_instance_key == worker_instance_key.to_string()
|
|
280
|
-
} else {
|
|
281
|
-
false
|
|
282
|
-
}
|
|
283
|
-
})
|
|
284
|
-
.unwrap();
|
|
285
|
-
let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap();
|
|
286
|
-
after_shutdown_checks(heartbeat, &wf_name, &start_time, &heartbeat_time);
|
|
287
311
|
}
|
|
288
312
|
|
|
289
313
|
// Tests that rely on Prometheus running in a docker container need to start
|
|
@@ -521,23 +545,7 @@ fn after_shutdown_checks(
|
|
|
521
545
|
));
|
|
522
546
|
|
|
523
547
|
let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap();
|
|
524
|
-
assert_eq!(workflow_task_slots.current_available_slots, 5);
|
|
525
|
-
assert_eq!(workflow_task_slots.current_used_slots, 1);
|
|
526
548
|
assert_eq!(workflow_task_slots.total_processed_tasks, 2);
|
|
527
|
-
assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed");
|
|
528
|
-
let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap();
|
|
529
|
-
assert_eq!(activity_task_slots.current_available_slots, 5);
|
|
530
|
-
assert_eq!(workflow_task_slots.current_used_slots, 1);
|
|
531
|
-
assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed");
|
|
532
|
-
assert_eq!(activity_task_slots.last_interval_processed_tasks, 1);
|
|
533
|
-
let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap();
|
|
534
|
-
assert_eq!(nexus_task_slots.current_available_slots, 0);
|
|
535
|
-
assert_eq!(nexus_task_slots.current_used_slots, 0);
|
|
536
|
-
assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed");
|
|
537
|
-
let local_activity_task_slots = heartbeat.local_activity_slots_info.clone().unwrap();
|
|
538
|
-
assert_eq!(local_activity_task_slots.current_available_slots, 100);
|
|
539
|
-
assert_eq!(local_activity_task_slots.current_used_slots, 0);
|
|
540
|
-
assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed");
|
|
541
549
|
|
|
542
550
|
let workflow_poller_info = heartbeat.workflow_poller_info.unwrap();
|
|
543
551
|
assert!(!workflow_poller_info.is_autoscaling);
|
|
@@ -559,7 +567,6 @@ fn after_shutdown_checks(
|
|
|
559
567
|
));
|
|
560
568
|
|
|
561
569
|
assert_eq!(heartbeat.total_sticky_cache_hit, 1);
|
|
562
|
-
assert_eq!(heartbeat.current_sticky_cache_size, 0);
|
|
563
570
|
assert_eq!(
|
|
564
571
|
heartbeat.plugins,
|
|
565
572
|
vec![
|
|
@@ -983,3 +983,8 @@ fn test_default_build_id() {
|
|
|
983
983
|
assert!(!o.deployment_options.version.build_id.is_empty());
|
|
984
984
|
assert_ne!(o.deployment_options.version.build_id, "undetermined");
|
|
985
985
|
}
|
|
986
|
+
|
|
987
|
+
#[tokio::test]
|
|
988
|
+
async fn shutdown_during_active_timer_activity_workflows() {
|
|
989
|
+
shared_tests::shutdown_during_active_timer_activity_workflows().await
|
|
990
|
+
}
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
//! Shared tests that are meant to be run against both local dev server and cloud
|
|
2
2
|
|
|
3
|
-
use crate::common::CoreWfStarter;
|
|
4
|
-
use std::
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
use crate::common::{CoreWfStarter, activity_functions::StdActivities};
|
|
4
|
+
use std::{
|
|
5
|
+
sync::{
|
|
6
|
+
Arc,
|
|
7
|
+
atomic::{AtomicBool, Ordering::Relaxed},
|
|
8
|
+
},
|
|
9
|
+
time::Duration,
|
|
10
|
+
};
|
|
11
|
+
use temporalio_client::{
|
|
12
|
+
UntypedWorkflow, WorkflowFetchHistoryOptions, WorkflowStartOptions, WorkflowTerminateOptions,
|
|
7
13
|
};
|
|
8
14
|
use temporalio_common::{
|
|
9
15
|
protos::temporal::api::{
|
|
@@ -15,7 +21,7 @@ use temporalio_common::{
|
|
|
15
21
|
worker::WorkerTaskTypes,
|
|
16
22
|
};
|
|
17
23
|
use temporalio_macros::{workflow, workflow_methods};
|
|
18
|
-
use temporalio_sdk::{WorkflowContext, WorkflowResult};
|
|
24
|
+
use temporalio_sdk::{ActivityOptions, WorkflowContext, WorkflowResult, WorkflowTermination};
|
|
19
25
|
|
|
20
26
|
pub(crate) mod priority;
|
|
21
27
|
|
|
@@ -92,3 +98,115 @@ pub(crate) fn is_oversize_grpc_event(
|
|
|
92
98
|
false
|
|
93
99
|
}
|
|
94
100
|
}
|
|
101
|
+
|
|
102
|
+
#[workflow]
|
|
103
|
+
#[derive(Default)]
|
|
104
|
+
struct ShutdownTimerActivityLoopWf;
|
|
105
|
+
|
|
106
|
+
#[workflow_methods]
|
|
107
|
+
impl ShutdownTimerActivityLoopWf {
|
|
108
|
+
#[run]
|
|
109
|
+
async fn run(ctx: &mut WorkflowContext<Self>) -> WorkflowResult<()> {
|
|
110
|
+
loop {
|
|
111
|
+
ctx.timer(Duration::from_millis(10)).await;
|
|
112
|
+
ctx.start_activity(
|
|
113
|
+
StdActivities::no_op,
|
|
114
|
+
(),
|
|
115
|
+
ActivityOptions {
|
|
116
|
+
start_to_close_timeout: Some(Duration::from_secs(10)),
|
|
117
|
+
..Default::default()
|
|
118
|
+
},
|
|
119
|
+
)
|
|
120
|
+
.await
|
|
121
|
+
.map_err(|e| WorkflowTermination::from(anyhow::Error::from(e)))?;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/// Starts 10 workflows that each run a tight timer+activity loop, then shuts down the worker
|
|
127
|
+
/// and verifies:
|
|
128
|
+
/// 1. Shutdown completes rapidly (< 5s)
|
|
129
|
+
/// 2. No workflow task failures or timeouts appear in any workflow's history
|
|
130
|
+
pub(crate) async fn shutdown_during_active_timer_activity_workflows() {
|
|
131
|
+
let wf_name = "shutdown_during_active_timer_activity_workflows";
|
|
132
|
+
let num_workflows = 10;
|
|
133
|
+
|
|
134
|
+
let mut starter =
|
|
135
|
+
if let Some(wfs) = CoreWfStarter::new_cloud_or_local(wf_name, ">=1.6.3-serverless").await {
|
|
136
|
+
wfs
|
|
137
|
+
} else {
|
|
138
|
+
return;
|
|
139
|
+
};
|
|
140
|
+
starter.sdk_config.register_activities(StdActivities);
|
|
141
|
+
let mut worker = starter.worker().await;
|
|
142
|
+
worker.register_workflow::<ShutdownTimerActivityLoopWf>();
|
|
143
|
+
|
|
144
|
+
let core = worker.core_worker();
|
|
145
|
+
core.validate().await.unwrap();
|
|
146
|
+
assert!(
|
|
147
|
+
core.get_namespace_capabilities().graceful_poll_shutdown(),
|
|
148
|
+
"Server must support graceful poll shutdown for this test"
|
|
149
|
+
);
|
|
150
|
+
|
|
151
|
+
let task_queue = starter.get_task_queue().to_owned();
|
|
152
|
+
let mut wf_ids = Vec::with_capacity(num_workflows);
|
|
153
|
+
for i in 0..num_workflows {
|
|
154
|
+
let wf_id = format!("{task_queue}-{i}");
|
|
155
|
+
worker
|
|
156
|
+
.submit_workflow(
|
|
157
|
+
ShutdownTimerActivityLoopWf::run,
|
|
158
|
+
(),
|
|
159
|
+
WorkflowStartOptions::new(task_queue.clone(), wf_id.clone()).build(),
|
|
160
|
+
)
|
|
161
|
+
.await
|
|
162
|
+
.unwrap();
|
|
163
|
+
wf_ids.push(wf_id);
|
|
164
|
+
}
|
|
165
|
+
// Don't wait for workflow completion — these loop forever
|
|
166
|
+
worker.fetch_results = false;
|
|
167
|
+
|
|
168
|
+
let shutdown_handle = worker.inner_mut().shutdown_handle();
|
|
169
|
+
let run_fut = async { worker.run_until_done().await.unwrap() };
|
|
170
|
+
|
|
171
|
+
let shutdown_fut = async {
|
|
172
|
+
// Let workflows run a few iterations
|
|
173
|
+
tokio::time::sleep(Duration::from_secs(2)).await;
|
|
174
|
+
shutdown_handle();
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
let shutdown_start = std::time::Instant::now();
|
|
178
|
+
tokio::join!(run_fut, shutdown_fut);
|
|
179
|
+
let shutdown_elapsed = shutdown_start.elapsed();
|
|
180
|
+
|
|
181
|
+
assert!(
|
|
182
|
+
shutdown_elapsed < Duration::from_secs(5),
|
|
183
|
+
"Worker shutdown took {shutdown_elapsed:?}, expected < 5s"
|
|
184
|
+
);
|
|
185
|
+
|
|
186
|
+
let client = starter.get_client().await;
|
|
187
|
+
for wf_id in &wf_ids {
|
|
188
|
+
client
|
|
189
|
+
.get_workflow_handle::<UntypedWorkflow>(wf_id)
|
|
190
|
+
.terminate(WorkflowTerminateOptions::default())
|
|
191
|
+
.await
|
|
192
|
+
.unwrap();
|
|
193
|
+
|
|
194
|
+
let history = client
|
|
195
|
+
.get_workflow_handle::<UntypedWorkflow>(wf_id)
|
|
196
|
+
.fetch_history(WorkflowFetchHistoryOptions::default())
|
|
197
|
+
.await
|
|
198
|
+
.unwrap();
|
|
199
|
+
let bad_events: Vec<_> = history
|
|
200
|
+
.events()
|
|
201
|
+
.iter()
|
|
202
|
+
.filter(|e| {
|
|
203
|
+
e.event_type() == EventType::WorkflowTaskFailed
|
|
204
|
+
|| e.event_type() == EventType::WorkflowTaskTimedOut
|
|
205
|
+
})
|
|
206
|
+
.collect();
|
|
207
|
+
assert!(
|
|
208
|
+
bad_events.is_empty(),
|
|
209
|
+
"Workflow {wf_id} had unexpected WFT failures/timeouts: {bad_events:?}"
|
|
210
|
+
);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
@@ -929,6 +929,7 @@ pub extern "C" fn temporal_core_worker_request_workflow_eviction(
|
|
|
929
929
|
#[unsafe(no_mangle)]
|
|
930
930
|
pub extern "C" fn temporal_core_worker_initiate_shutdown(worker: *mut Worker) {
|
|
931
931
|
let worker = unsafe { &*worker };
|
|
932
|
+
enter_sync!(worker.runtime);
|
|
932
933
|
worker.worker.as_ref().unwrap().initiate_shutdown();
|
|
933
934
|
}
|
|
934
935
|
|
package/src/worker.rs
CHANGED
|
@@ -339,6 +339,12 @@ pub fn worker_complete_nexus_task(
|
|
|
339
339
|
#[js_function]
|
|
340
340
|
pub fn worker_initiate_shutdown(worker: OpaqueInboundHandle<Worker>) -> BridgeResult<()> {
|
|
341
341
|
let worker_ref = worker.borrow()?;
|
|
342
|
+
|
|
343
|
+
// Core worker shutdown now spawns a Tokio task, so this sync Neon binding must
|
|
344
|
+
// enter Core's Tokio runtime before initiating shutdown.
|
|
345
|
+
let runtime = worker_ref.core_runtime.clone();
|
|
346
|
+
enter_sync!(runtime);
|
|
347
|
+
|
|
342
348
|
worker_ref.core_worker.initiate_shutdown();
|
|
343
349
|
Ok(())
|
|
344
350
|
}
|
|
@@ -868,7 +874,7 @@ mod custom_slot_supplier {
|
|
|
868
874
|
}
|
|
869
875
|
Err(err) => {
|
|
870
876
|
warn!("Error reserving slot: {err:?}");
|
|
871
|
-
tokio::time::sleep(std::time::Duration::
|
|
877
|
+
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
|
872
878
|
}
|
|
873
879
|
}
|
|
874
880
|
}
|