@temporalio/core-bridge 1.16.0 → 1.16.2

This diff compares the contents of two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@temporalio/core-bridge",
3
- "version": "1.16.0",
3
+ "version": "1.16.2",
4
4
  "description": "Temporal.io SDK Core<>Node bridge",
5
5
  "main": "index.js",
6
6
  "types": "lib/index.d.ts",
@@ -14,7 +14,7 @@
14
14
  "license": "MIT",
15
15
  "dependencies": {
16
16
  "@grpc/grpc-js": "^1.12.4",
17
- "@temporalio/common": "1.16.0"
17
+ "@temporalio/common": "1.16.2"
18
18
  },
19
19
  "devDependencies": {
20
20
  "arg": "^5.0.2",
@@ -1,6 +1,6 @@
1
1
  [env]
2
2
  # This temporarily overrides the version of the CLI used for integration tests, locally and in CI
3
- #CLI_VERSION_OVERRIDE = "v1.4.1-cloud-v1-29-0-139-2.0"
3
+ CLI_VERSION_OVERRIDE = "v1.6.3-serverless"
4
4
 
5
5
  [alias]
6
6
  # Not sure why --all-features doesn't work
@@ -13,7 +13,15 @@ use crate::{
13
13
  },
14
14
  };
15
15
  use futures_util::{stream, stream::StreamExt};
16
+ use std::sync::{
17
+ Arc,
18
+ atomic::{AtomicBool, Ordering},
19
+ };
16
20
  use std::{cell::RefCell, collections::HashMap, time::Duration};
21
+ use temporalio_common::protos::temporal::api::{
22
+ namespace::v1::{NamespaceInfo, namespace_info::Capabilities},
23
+ workflowservice::v1::DescribeNamespaceResponse,
24
+ };
17
25
  use temporalio_common::{
18
26
  protos::{
19
27
  canned_histories,
@@ -51,6 +59,7 @@ use temporalio_common::{
51
59
  },
52
60
  worker::WorkerTaskTypes,
53
61
  };
62
+ use tokio::sync::Notify;
54
63
  use tokio::sync::{Barrier, watch};
55
64
  use uuid::Uuid;
56
65
 
@@ -1209,3 +1218,106 @@ async fn nexus_start_operation_failure_converts_to_legacy_for_old_server(
1209
1218
  worker.shutdown().await;
1210
1219
  worker.finalize_shutdown().await;
1211
1220
  }
1221
+
1222
+ /// Verifies that `initiate_shutdown` sends the `ShutdownWorker` RPC so that the server can
1223
+ /// complete in-flight polls. Without this, graceful poll shutdown deadlocks: the SDK waits for
1224
+ /// polls to drain, but the server was never told to flush them.
1225
+ #[tokio::test]
1226
+ async fn graceful_shutdown_sends_shutdown_worker_rpc_during_initiate() {
1227
+ let shutdown_rpc_called = Arc::new(AtomicBool::new(false));
1228
+ let shutdown_rpc_called_clone = shutdown_rpc_called.clone();
1229
+ // When the shutdown_worker RPC fires, it signals polls to complete (simulating server
1230
+ // behavior where ShutdownWorker causes the server to return empty poll responses).
1231
+ let poll_releaser = Arc::new(Notify::new());
1232
+ let poll_releaser_for_rpc = poll_releaser.clone();
1233
+
1234
+ let mut mock_client = MockWorkerClient::new();
1235
+ mock_client
1236
+ .expect_capabilities()
1237
+ .returning(|| Some(*DEFAULT_TEST_CAPABILITIES));
1238
+ mock_client
1239
+ .expect_workers()
1240
+ .returning(|| DEFAULT_WORKERS_REGISTRY.clone());
1241
+ mock_client.expect_is_mock().returning(|| true);
1242
+ mock_client
1243
+ .expect_sdk_name_and_version()
1244
+ .returning(|| ("test-core".to_string(), "0.0.0".to_string()));
1245
+ mock_client
1246
+ .expect_identity()
1247
+ .returning(|| "test-identity".to_string());
1248
+ mock_client
1249
+ .expect_worker_grouping_key()
1250
+ .returning(Uuid::new_v4);
1251
+ mock_client
1252
+ .expect_worker_instance_key()
1253
+ .returning(Uuid::new_v4);
1254
+ mock_client
1255
+ .expect_set_heartbeat_client_fields()
1256
+ .returning(|hb| {
1257
+ hb.sdk_name = "test-core".to_string();
1258
+ hb.sdk_version = "0.0.0".to_string();
1259
+ hb.worker_identity = "test-identity".to_string();
1260
+ hb.heartbeat_time = Some(std::time::SystemTime::now().into());
1261
+ });
1262
+ // Return the worker_poll_complete_on_shutdown capability so validate() enables graceful mode
1263
+ mock_client.expect_describe_namespace().returning(move || {
1264
+ Ok(DescribeNamespaceResponse {
1265
+ namespace_info: Some(NamespaceInfo {
1266
+ capabilities: Some(Capabilities {
1267
+ worker_poll_complete_on_shutdown: true,
1268
+ ..Capabilities::default()
1269
+ }),
1270
+ ..NamespaceInfo::default()
1271
+ }),
1272
+ ..DescribeNamespaceResponse::default()
1273
+ })
1274
+ });
1275
+ // When shutdown_worker RPC is called, mark it and release polls
1276
+ mock_client
1277
+ .expect_shutdown_worker()
1278
+ .returning(move |_, _, _, _| {
1279
+ shutdown_rpc_called_clone.store(true, Ordering::SeqCst);
1280
+ poll_releaser_for_rpc.notify_waiters();
1281
+ Ok(ShutdownWorkerResponse {})
1282
+ });
1283
+ mock_client
1284
+ .expect_complete_workflow_task()
1285
+ .returning(|_| Ok(RespondWorkflowTaskCompletedResponse::default()));
1286
+
1287
+ // Polls block until shutdown_worker RPC releases them (simulating server holding polls
1288
+ // open until it receives the ShutdownWorker signal)
1289
+ let poll_releaser_for_stream = poll_releaser.clone();
1290
+ let stream = stream::unfold(poll_releaser_for_stream, |releaser| async move {
1291
+ releaser.notified().await;
1292
+ Some((
1293
+ Ok(PollWorkflowTaskQueueResponse::default().try_into().unwrap()),
1294
+ releaser,
1295
+ ))
1296
+ });
1297
+
1298
+ let mw = MockWorkerInputs::new(stream.boxed());
1299
+ let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw));
1300
+
1301
+ // validate() reads describe_namespace and sets capabilities.graceful_poll_shutdown = true
1302
+ worker.validate().await.unwrap();
1303
+
1304
+ let poll_fut = worker.poll_workflow_activation();
1305
+ let shutdown_fut = async {
1306
+ // initiate_shutdown must send the ShutdownWorker RPC, which releases the polls
1307
+ worker.initiate_shutdown();
1308
+ };
1309
+
1310
+ let (poll_result, _) = tokio::time::timeout(Duration::from_secs(5), async {
1311
+ tokio::join!(poll_fut, shutdown_fut)
1312
+ })
1313
+ .await
1314
+ .expect("Shutdown should complete within 5s -- if it hangs, the ShutdownWorker RPC was not sent during initiate_shutdown");
1315
+
1316
+ assert_matches!(poll_result.unwrap_err(), PollError::ShutDown);
1317
+ assert!(
1318
+ shutdown_rpc_called.load(Ordering::SeqCst),
1319
+ "ShutdownWorker RPC must be called during initiate_shutdown"
1320
+ );
1321
+
1322
+ worker.finalize_shutdown().await;
1323
+ }
@@ -42,13 +42,13 @@ pub use temporalio_common::protos::TaskToken;
42
42
  pub use url::Url;
43
43
  pub use worker::{
44
44
  ActivitySlotKind, CompleteActivityError, CompleteNexusError, CompleteWfError,
45
- FixedSizeSlotSupplier, LocalActivitySlotKind, NexusSlotKind, PollError, PollerBehavior,
46
- ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder, ResourceBasedTuner,
47
- ResourceSlotOptions, SlotInfo, SlotInfoTrait, SlotKind, SlotKindType, SlotMarkUsedContext,
48
- SlotReleaseContext, SlotReservationContext, SlotSupplier, SlotSupplierOptions,
49
- SlotSupplierPermit, TunerBuilder, TunerHolder, TunerHolderOptions, TunerHolderOptionsBuilder,
50
- Worker, WorkerConfig, WorkerConfigBuilder, WorkerTuner, WorkerValidationError,
51
- WorkerVersioningStrategy, WorkflowErrorType, WorkflowSlotKind,
45
+ FixedSizeSlotSupplier, LocalActivitySlotKind, NamespaceCapabilities, NexusSlotKind, PollError,
46
+ PollerBehavior, ResourceBasedSlotsOptions, ResourceBasedSlotsOptionsBuilder,
47
+ ResourceBasedTuner, ResourceSlotOptions, SlotInfo, SlotInfoTrait, SlotKind, SlotKindType,
48
+ SlotMarkUsedContext, SlotReleaseContext, SlotReservationContext, SlotSupplier,
49
+ SlotSupplierOptions, SlotSupplierPermit, TunerBuilder, TunerHolder, TunerHolderOptions,
50
+ TunerHolderOptionsBuilder, Worker, WorkerConfig, WorkerConfigBuilder, WorkerTuner,
51
+ WorkerValidationError, WorkerVersioningStrategy, WorkflowErrorType, WorkflowSlotKind,
52
52
  };
53
53
 
54
54
  use crate::{
@@ -157,6 +157,11 @@ where
157
157
  return match state.poller.poll().await {
158
158
  Some(Ok((task, permit))) => {
159
159
  if task == Default::default() {
160
+ if state.poller_was_shutdown {
161
+ // Server sent an empty response after we initiated
162
+ // shutdown — this is the graceful shutdown signal.
163
+ return None;
164
+ }
160
165
  // We get the default proto in the event that the long poll
161
166
  // times out.
162
167
  debug!("Poll {} task timeout", T::task_name());
@@ -276,3 +281,93 @@ pub(crate) fn new_nexus_task_poller(
276
281
  )
277
282
  .into_stream()
278
283
  }
284
+
285
+ #[cfg(test)]
286
+ mod tests {
287
+ use super::*;
288
+ use crate::{
289
+ abstractions::tests::fixed_size_permit_dealer, pollers::MockPermittedPollBuffer,
290
+ test_help::mock_poller, worker::ActivitySlotKind,
291
+ };
292
+ use futures_util::{StreamExt, pin_mut};
293
+ use std::sync::{
294
+ Arc,
295
+ atomic::{AtomicUsize, Ordering},
296
+ };
297
+
298
+ /// Verify that empty responses after shutdown are not treated as poll timeout and retried
299
+ /// indefinitely
300
+ #[tokio::test]
301
+ async fn empty_response_after_shutdown_terminates_stream() {
302
+ let poll_count = Arc::new(AtomicUsize::new(0));
303
+ let poll_count_clone = poll_count.clone();
304
+
305
+ let mut mock_poller = mock_poller();
306
+ mock_poller.expect_poll().returning(move || {
307
+ poll_count_clone.fetch_add(1, Ordering::SeqCst);
308
+ Some(Ok(PollActivityTaskQueueResponse::default()))
309
+ });
310
+
311
+ let sem = Arc::new(fixed_size_permit_dealer::<ActivitySlotKind>(10));
312
+ let shutdown_token = CancellationToken::new();
313
+
314
+ let stream = new_activity_task_poller(
315
+ Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
316
+ MetricsContext::no_op(),
317
+ shutdown_token.clone(),
318
+ );
319
+ pin_mut!(stream);
320
+
321
+ shutdown_token.cancel();
322
+
323
+ let result = tokio::time::timeout(std::time::Duration::from_secs(2), stream.next()).await;
324
+ assert!(
325
+ result.is_ok(),
326
+ "Stream should terminate promptly after shutdown, not hang"
327
+ );
328
+ assert!(
329
+ result.unwrap().is_none(),
330
+ "Stream should return None (terminated) on empty response after shutdown"
331
+ );
332
+
333
+ let total = poll_count.load(Ordering::SeqCst);
334
+ assert!(
335
+ total < 5,
336
+ "Expected stream to terminate quickly, but poller was called {total} times"
337
+ );
338
+ }
339
+
340
+ #[tokio::test]
341
+ async fn empty_response_before_shutdown_retries() {
342
+ let mut mock_poller = mock_poller();
343
+ let call_count = Arc::new(AtomicUsize::new(0));
344
+ let call_count_clone = call_count.clone();
345
+ mock_poller.expect_poll().returning(move || {
346
+ let n = call_count_clone.fetch_add(1, Ordering::SeqCst);
347
+ if n < 2 {
348
+ Some(Ok(PollActivityTaskQueueResponse::default()))
349
+ } else {
350
+ None
351
+ }
352
+ });
353
+
354
+ let sem = Arc::new(fixed_size_permit_dealer::<ActivitySlotKind>(10));
355
+ let shutdown_token = CancellationToken::new();
356
+
357
+ let stream = new_activity_task_poller(
358
+ Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
359
+ MetricsContext::no_op(),
360
+ shutdown_token,
361
+ );
362
+ pin_mut!(stream);
363
+
364
+ // Without shutdown, empty responses should be skipped and the stream terminates
365
+ // only when the poller returns None.
366
+ let result = stream.next().await;
367
+ assert!(
368
+ result.is_none(),
369
+ "Stream should end when poller returns None"
370
+ );
371
+ assert_eq!(call_count.load(Ordering::SeqCst), 3);
372
+ }
373
+ }
@@ -2,8 +2,8 @@ use crate::{
2
2
  abstractions::{ActiveCounter, MeteredPermitDealer, OwnedMeteredSemPermit, dbg_panic},
3
3
  pollers::{self, Poller},
4
4
  worker::{
5
- ActivitySlotKind, NexusSlotKind, PollerBehavior, SlotKind, WFTPollerShared,
6
- WorkflowSlotKind,
5
+ ActivitySlotKind, NamespaceCapabilities, NexusSlotKind, PollerBehavior, SlotKind,
6
+ WFTPollerShared, WorkflowSlotKind,
7
7
  client::{PollActivityOptions, PollOptions, PollWorkflowOptions, WorkerClient},
8
8
  },
9
9
  };
@@ -77,7 +77,7 @@ impl LongPollBuffer<PollWorkflowTaskQueueResponse, WorkflowSlotKind> {
77
77
  num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
78
78
  options: WorkflowTaskOptions,
79
79
  last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
80
- graceful_poll_shutdown: Arc<AtomicBool>,
80
+ capabilities: Arc<NamespaceCapabilities>,
81
81
  ) -> Self {
82
82
  let is_sticky = sticky_queue.is_some();
83
83
  let poll_scaler = PollScaler::new(
@@ -140,7 +140,7 @@ impl LongPollBuffer<PollWorkflowTaskQueueResponse, WorkflowSlotKind> {
140
140
  poll_scaler,
141
141
  pre_permit_delay,
142
142
  post_poll_fn,
143
- graceful_poll_shutdown,
143
+ capabilities,
144
144
  )
145
145
  }
146
146
  }
@@ -156,7 +156,7 @@ impl LongPollBuffer<PollActivityTaskQueueResponse, ActivitySlotKind> {
156
156
  num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
157
157
  options: ActivityTaskOptions,
158
158
  last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
159
- graceful_poll_shutdown: Arc<AtomicBool>,
159
+ capabilities: Arc<NamespaceCapabilities>,
160
160
  ) -> Self {
161
161
  let pre_permit_delay = options
162
162
  .max_worker_acts_per_second
@@ -209,7 +209,7 @@ impl LongPollBuffer<PollActivityTaskQueueResponse, ActivitySlotKind> {
209
209
  poll_scaler,
210
210
  pre_permit_delay,
211
211
  None::<fn(&PollActivityTaskQueueResponse)>,
212
- graceful_poll_shutdown,
212
+ capabilities,
213
213
  )
214
214
  }
215
215
  }
@@ -225,7 +225,7 @@ impl LongPollBuffer<PollNexusTaskQueueResponse, NexusSlotKind> {
225
225
  num_pollers_handler: Option<impl Fn(usize) + Send + Sync + 'static>,
226
226
  last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
227
227
  send_heartbeat: bool,
228
- graceful_poll_shutdown: Arc<AtomicBool>,
228
+ capabilities: Arc<NamespaceCapabilities>,
229
229
  ) -> Self {
230
230
  let no_retry = if matches!(poller_behavior, PollerBehavior::Autoscaling { .. }) {
231
231
  Some(NoRetryOnMatching {
@@ -262,7 +262,7 @@ impl LongPollBuffer<PollNexusTaskQueueResponse, NexusSlotKind> {
262
262
  ),
263
263
  None::<fn() -> BoxFuture<'static, ()>>,
264
264
  None::<fn(&PollNexusTaskQueueResponse)>,
265
- graceful_poll_shutdown,
265
+ capabilities,
266
266
  )
267
267
  }
268
268
  }
@@ -288,7 +288,7 @@ where
288
288
  mut poll_scaler: PollScaler<F>,
289
289
  pre_permit_delay: Option<impl Fn() -> DelayFut + Send + Sync + 'static>,
290
290
  post_poll_fn: Option<impl Fn(&T) + Send + Sync + 'static>,
291
- graceful_shutdown: Arc<AtomicBool>,
291
+ capabilities: Arc<NamespaceCapabilities>,
292
292
  ) -> Self
293
293
  where
294
294
  FT: Future<Output = pollers::Result<T>> + Send,
@@ -359,11 +359,9 @@ where
359
359
  } else {
360
360
  None
361
361
  };
362
- let graceful_shutdown = graceful_shutdown.clone();
362
+ let capabilities = capabilities.clone();
363
363
  let poll_task = tokio::spawn(async move {
364
- let shutdown_clone = shutdown.clone();
365
-
366
- let r = if graceful_shutdown.load(Ordering::Relaxed) {
364
+ let r = if capabilities.graceful_poll_shutdown() {
367
365
  pf(timeout_override).await
368
366
  } else {
369
367
  let poll_interruptor = shutdown.cancelled().then(|_| async move {
@@ -383,10 +381,11 @@ where
383
381
  }
384
382
  let (should_forward, backoff_duration) = report_handle.poll_result(&r);
385
383
  if let Some(duration) = backoff_duration {
386
- // Apply backoff BEFORE dropping active_guard to prevent next poll from starting
384
+ // Apply backoff BEFORE dropping active_guard to prevent next poll from
385
+ // starting
387
386
  tokio::select! {
388
387
  _ = tokio::time::sleep(duration) => return,
389
- _ = shutdown_clone.cancelled() => (),
388
+ _ = shutdown.cancelled() => (),
390
389
  };
391
390
  }
392
391
  drop(active_guard);
@@ -853,7 +852,9 @@ mod tests {
853
852
  wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
854
853
  },
855
854
  Arc::new(AtomicCell::new(None)),
856
- Arc::new(AtomicBool::new(false)),
855
+ Arc::new(NamespaceCapabilities {
856
+ graceful_poll_shutdown: AtomicBool::new(false),
857
+ }),
857
858
  );
858
859
 
859
860
  // Poll a bunch of times, "interrupting" it each time, we should only actually have polled
@@ -910,7 +911,9 @@ mod tests {
910
911
  wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(1)))),
911
912
  },
912
913
  Arc::new(AtomicCell::new(None)),
913
- Arc::new(AtomicBool::new(false)),
914
+ Arc::new(NamespaceCapabilities {
915
+ graceful_poll_shutdown: AtomicBool::new(false),
916
+ }),
914
917
  );
915
918
 
916
919
  // Should not see error, unwraps should get empty response
@@ -987,7 +990,9 @@ mod tests {
987
990
  wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
988
991
  },
989
992
  Arc::new(AtomicCell::new(None)),
990
- Arc::new(AtomicBool::new(false)),
993
+ Arc::new(NamespaceCapabilities {
994
+ graceful_poll_shutdown: AtomicBool::new(false),
995
+ }),
991
996
  );
992
997
 
993
998
  let first_task = pb.poll().await.expect("Should get first task");
@@ -1093,7 +1098,9 @@ mod tests {
1093
1098
  wft_poller_shared: Some(Arc::new(WFTPollerShared::new(Some(10)))),
1094
1099
  },
1095
1100
  Arc::new(AtomicCell::new(None)),
1096
- Arc::new(AtomicBool::new(false)),
1101
+ Arc::new(NamespaceCapabilities {
1102
+ graceful_poll_shutdown: AtomicBool::new(false),
1103
+ }),
1097
1104
  ));
1098
1105
 
1099
1106
  // Trigger the first poll to initialize and get the scaling decision
@@ -1174,7 +1181,9 @@ mod tests {
1174
1181
  wft_poller_shared: None,
1175
1182
  },
1176
1183
  Arc::new(AtomicCell::new(None)),
1177
- Arc::new(AtomicBool::new(graceful)),
1184
+ Arc::new(NamespaceCapabilities {
1185
+ graceful_poll_shutdown: AtomicBool::new(graceful),
1186
+ }),
1178
1187
  );
1179
1188
 
1180
1189
  let first = pb.poll().await.unwrap().unwrap();
@@ -733,7 +733,7 @@ mod tests {
733
733
  abstractions::tests::fixed_size_permit_dealer,
734
734
  pollers::{ActivityTaskOptions, LongPollBuffer},
735
735
  prost_dur,
736
- worker::{PollerBehavior, client::mocks::mock_worker_client},
736
+ worker::{NamespaceCapabilities, PollerBehavior, client::mocks::mock_worker_client},
737
737
  };
738
738
  use crossbeam_utils::atomic::AtomicCell;
739
739
  use temporalio_common::protos::coresdk::activity_result::ActivityExecutionResult;
@@ -781,7 +781,9 @@ mod tests {
781
781
  max_tps: None,
782
782
  },
783
783
  Arc::new(AtomicCell::new(None)),
784
- Arc::new(AtomicBool::new(false)),
784
+ Arc::new(NamespaceCapabilities {
785
+ graceful_poll_shutdown: AtomicBool::new(false),
786
+ }),
785
787
  );
786
788
  let atm = WorkerActivityTasks::new(
787
789
  sem.clone(),
@@ -874,7 +876,9 @@ mod tests {
874
876
  max_tps: None,
875
877
  },
876
878
  Arc::new(AtomicCell::new(None)),
877
- Arc::new(AtomicBool::new(false)),
879
+ Arc::new(NamespaceCapabilities {
880
+ graceful_poll_shutdown: AtomicBool::new(false),
881
+ }),
878
882
  );
879
883
  let atm = WorkerActivityTasks::new(
880
884
  sem.clone(),
@@ -949,7 +953,9 @@ mod tests {
949
953
  max_tps: None,
950
954
  },
951
955
  Arc::new(AtomicCell::new(None)),
952
- Arc::new(AtomicBool::new(false)),
956
+ Arc::new(NamespaceCapabilities {
957
+ graceful_poll_shutdown: AtomicBool::new(false),
958
+ }),
953
959
  );
954
960
  let atm = WorkerActivityTasks::new(
955
961
  sem.clone(),
@@ -61,7 +61,7 @@ use anyhow::bail;
61
61
  use crossbeam_utils::atomic::AtomicCell;
62
62
  use futures_util::{StreamExt, stream};
63
63
  use gethostname::gethostname;
64
- use parking_lot::RwLock;
64
+ use parking_lot::{Mutex, RwLock};
65
65
  use slot_provider::SlotProvider;
66
66
  use std::{
67
67
  any::Any,
@@ -415,9 +415,24 @@ pub struct Worker {
415
415
  client_worker_registrator: Arc<ClientWorkerRegistrator>,
416
416
  /// Status of the worker
417
417
  status: Arc<RwLock<WorkerStatus>>,
418
- /// Set during validate() when server supports graceful poll cancellation on shutdown.
419
- /// Shared with pollers so they can decide per-poll whether to hard-kill or wait.
420
- graceful_poll_shutdown: Arc<AtomicBool>,
418
+ /// Capabilities as returned by a describe namespace rpc. Not set until after validate() is
419
+ /// called.
420
+ capabilities: Arc<NamespaceCapabilities>,
421
+ /// Handle for the spawned ShutdownWorker RPC task, awaited during shutdown.
422
+ shutdown_rpc_handle: Mutex<Option<tokio::task::JoinHandle<()>>>,
423
+ }
424
+
425
+ /// Namespace capabilities discovered via `describe_namespace` during worker validation.
426
+ pub struct NamespaceCapabilities {
427
+ pub(crate) graceful_poll_shutdown: AtomicBool,
428
+ }
429
+
430
+ impl NamespaceCapabilities {
431
+ /// Returns true if the server supports graceful poll cancellation on shutdown, so pollers
432
+ /// can let in-flight polls complete rather than hard-killing them.
433
+ pub fn graceful_poll_shutdown(&self) -> bool {
434
+ self.graceful_poll_shutdown.load(Ordering::Relaxed)
435
+ }
421
436
  }
422
437
 
423
438
  struct AllPermitsTracker {
@@ -490,11 +505,12 @@ impl Worker {
490
505
  memo_size_limit_error: api_limits.memo_size_limit_error,
491
506
  })
492
507
  });
493
- if ns_info
494
- .and_then(|ns| ns.capabilities)
495
- .is_some_and(|caps| caps.worker_poll_complete_on_shutdown)
508
+ if let Some(caps) = ns_info.and_then(|ns| ns.capabilities)
509
+ && caps.worker_poll_complete_on_shutdown
496
510
  {
497
- self.graceful_poll_shutdown.store(true, Ordering::Relaxed);
511
+ self.capabilities
512
+ .graceful_poll_shutdown
513
+ .store(true, Ordering::Relaxed);
498
514
  }
499
515
  Ok(NamespaceInfo { limits })
500
516
  }
@@ -616,7 +632,9 @@ impl Worker {
616
632
  let wf_sticky_last_suc_poll_time = Arc::new(AtomicCell::new(None));
617
633
  let act_last_suc_poll_time = Arc::new(AtomicCell::new(None));
618
634
  let nexus_last_suc_poll_time = Arc::new(AtomicCell::new(None));
619
- let graceful_poll_shutdown = Arc::new(AtomicBool::new(false));
635
+ let capabilities = Arc::new(NamespaceCapabilities {
636
+ graceful_poll_shutdown: AtomicBool::new(false),
637
+ });
620
638
 
621
639
  let nexus_slots = MeteredPermitDealer::new(
622
640
  tuner.nexus_task_slot_supplier(),
@@ -637,7 +655,7 @@ impl Worker {
637
655
  &wft_slots,
638
656
  wf_last_suc_poll_time.clone(),
639
657
  wf_sticky_last_suc_poll_time.clone(),
640
- graceful_poll_shutdown.clone(),
658
+ capabilities.clone(),
641
659
  )
642
660
  .boxed();
643
661
  let stream = if !client.is_mock() {
@@ -667,7 +685,7 @@ impl Worker {
667
685
  max_tps: config.max_task_queue_activities_per_second,
668
686
  },
669
687
  act_last_suc_poll_time.clone(),
670
- graceful_poll_shutdown.clone(),
688
+ capabilities.clone(),
671
689
  );
672
690
  Some(Box::from(ap) as BoxedActPoller)
673
691
  } else {
@@ -685,7 +703,7 @@ impl Worker {
685
703
  Some(move |np| np_metrics.record_num_pollers(np)),
686
704
  nexus_last_suc_poll_time.clone(),
687
705
  shared_namespace_worker,
688
- graceful_poll_shutdown.clone(),
706
+ capabilities.clone(),
689
707
  )) as BoxedNexusPoller)
690
708
  } else {
691
709
  None
@@ -905,7 +923,8 @@ impl Worker {
905
923
  nexus_mgr,
906
924
  client_worker_registrator,
907
925
  status: worker_status,
908
- graceful_poll_shutdown,
926
+ capabilities,
927
+ shutdown_rpc_handle: Mutex::new(None),
909
928
  })
910
929
  }
911
930
 
@@ -923,43 +942,12 @@ impl Worker {
923
942
  /// [Worker::finalize_shutdown].
924
943
  pub async fn shutdown(&self) {
925
944
  self.initiate_shutdown();
926
- {
927
- *self.status.write() = WorkerStatus::ShuttingDown;
928
- }
929
- let heartbeat = self
930
- .client_worker_registrator
931
- .heartbeat_manager
932
- .as_ref()
933
- .map(|hm| hm.heartbeat_callback.clone()());
934
- let sticky_name = self
935
- .workflows
936
- .as_ref()
937
- .and_then(|wf| wf.get_sticky_queue_name())
938
- .unwrap_or_default();
939
- // This is a best effort call and we can still shutdown the worker if it fails
940
- let task_queue_types = self.config.task_types.to_task_queue_types();
941
- match self
942
- .client
943
- .shutdown_worker(
944
- sticky_name,
945
- self.config.task_queue.clone(),
946
- task_queue_types,
947
- heartbeat,
948
- )
949
- .await
950
- {
951
- Err(err)
952
- if !matches!(
953
- err.code(),
954
- tonic::Code::Unimplemented | tonic::Code::Unavailable
955
- ) =>
956
- {
957
- warn!(
958
- "shutdown_worker rpc errored during worker shutdown: {:?}",
959
- err
960
- );
961
- }
962
- _ => {}
945
+
946
+ // Ensure the ShutdownWorker RPC completes before waiting for polls to drain,
947
+ // otherwise graceful poll shutdown deadlocks.
948
+ let handle = self.shutdown_rpc_handle.lock().take();
949
+ if let Some(handle) = handle {
950
+ let _ = handle.await;
963
951
  }
964
952
 
965
953
  // We need to wait for all local activities to finish so no more workflow task heartbeats
@@ -1354,8 +1342,15 @@ impl Worker {
1354
1342
  &self.config
1355
1343
  }
1356
1344
 
1357
- /// Initiate shutdown. See [Worker::shutdown], this is just a sync version that starts the
1358
- /// process. You can then wait on `shutdown` or [Worker::finalize_shutdown].
1345
+ /// Returns the namespace capabilities discovered during [Worker::validate].
1346
+ pub fn get_namespace_capabilities(&self) -> &NamespaceCapabilities {
1347
+ &self.capabilities
1348
+ }
1349
+
1350
+ /// Initiate shutdown, including spawning the `ShutdownWorker` RPC so the server can complete
1351
+ /// in-flight polls. The RPC runs in a background task and is awaited in [Worker::shutdown].
1352
+ ///
1353
+ /// You can then wait on `shutdown` or [Worker::finalize_shutdown].
1359
1354
  pub fn initiate_shutdown(&self) {
1360
1355
  if !self.shutdown_token.is_cancelled() {
1361
1356
  info!(
@@ -1364,6 +1359,7 @@ impl Worker {
1364
1359
  "Initiated shutdown",
1365
1360
  );
1366
1361
  }
1362
+ let already_initiated_shutdown = self.shutdown_token.is_cancelled();
1367
1363
  self.shutdown_token.cancel();
1368
1364
  {
1369
1365
  *self.status.write() = WorkerStatus::ShuttingDown;
@@ -1398,6 +1394,47 @@ impl Worker {
1398
1394
  la_mgr.workflows_have_shutdown();
1399
1395
  }
1400
1396
  }
1397
+
1398
+ // Spawn the ShutdownWorker RPC so the server can complete in-flight polls.
1399
+ // The handle is stored and awaited in shutdown() to ensure completion.
1400
+ let mut guard = self.shutdown_rpc_handle.lock();
1401
+ if guard.is_some() || already_initiated_shutdown {
1402
+ return;
1403
+ }
1404
+
1405
+ let client = self.client.clone();
1406
+ let sticky_name = self
1407
+ .workflows
1408
+ .as_ref()
1409
+ .and_then(|wf| wf.get_sticky_queue_name())
1410
+ .unwrap_or_default();
1411
+ let task_queue = self.config.task_queue.clone();
1412
+ let task_queue_types = self.config.task_types.to_task_queue_types();
1413
+ let heartbeat = self
1414
+ .client_worker_registrator
1415
+ .heartbeat_manager
1416
+ .as_ref()
1417
+ .map(|hm| hm.heartbeat_callback.clone()());
1418
+ let handle = tokio::spawn(async move {
1419
+ match client
1420
+ .shutdown_worker(sticky_name, task_queue, task_queue_types, heartbeat)
1421
+ .await
1422
+ {
1423
+ Err(err)
1424
+ if !matches!(
1425
+ err.code(),
1426
+ tonic::Code::Unimplemented | tonic::Code::Unavailable
1427
+ ) =>
1428
+ {
1429
+ warn!(
1430
+ "shutdown_worker rpc errored during worker shutdown: {:?}",
1431
+ err
1432
+ );
1433
+ }
1434
+ _ => {}
1435
+ }
1436
+ });
1437
+ *guard = Some(handle);
1401
1438
  }
1402
1439
 
1403
1440
  /// Unique identifier for this worker instance.
@@ -4,12 +4,12 @@ use crate::{
4
4
  pollers::{BoxedWFPoller, LongPollBuffer, Poller, WorkflowTaskOptions, WorkflowTaskPoller},
5
5
  protosext::ValidPollWFTQResponse,
6
6
  telemetry::metrics::{workflow_poller, workflow_sticky_poller},
7
- worker::{WorkflowSlotKind, client::WorkerClient, wft_poller_behavior},
7
+ worker::{NamespaceCapabilities, WorkflowSlotKind, client::WorkerClient, wft_poller_behavior},
8
8
  };
9
9
  use crossbeam_utils::atomic::AtomicCell;
10
10
  use futures_util::{Stream, stream};
11
11
  use std::{
12
- sync::{Arc, OnceLock, atomic::AtomicBool},
12
+ sync::{Arc, OnceLock},
13
13
  time::SystemTime,
14
14
  };
15
15
  use temporalio_common::protos::temporal::api::workflowservice::v1::PollWorkflowTaskQueueResponse;
@@ -26,7 +26,7 @@ pub(crate) fn make_wft_poller(
26
26
  wft_slots: &MeteredPermitDealer<WorkflowSlotKind>,
27
27
  last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
28
28
  sticky_last_successful_poll_time: Arc<AtomicCell<Option<SystemTime>>>,
29
- graceful_poll_shutdown: Arc<AtomicBool>,
29
+ capabilities: Arc<NamespaceCapabilities>,
30
30
  ) -> impl Stream<
31
31
  Item = Result<
32
32
  (
@@ -60,7 +60,7 @@ pub(crate) fn make_wft_poller(
60
60
  wft_poller_shared: wft_poller_shared.clone(),
61
61
  },
62
62
  last_successful_poll_time,
63
- graceful_poll_shutdown.clone(),
63
+ capabilities.clone(),
64
64
  );
65
65
  let sticky_queue_poller = sticky_queue_name.as_ref().map(|sqn| {
66
66
  let sticky_metrics = metrics.with_new_attrs([workflow_sticky_poller()]);
@@ -76,7 +76,7 @@ pub(crate) fn make_wft_poller(
76
76
  }),
77
77
  WorkflowTaskOptions { wft_poller_shared },
78
78
  sticky_last_successful_poll_time,
79
- graceful_poll_shutdown,
79
+ capabilities,
80
80
  )
81
81
  });
82
82
  let wf_task_poll_buffer = Box::new(WorkflowTaskPoller::new(
@@ -228,11 +228,11 @@ fn new_wft_poller(
228
228
  warn!(error=?e, "Error while polling for workflow tasks");
229
229
  Some((Err(e), (poller, metrics)))
230
230
  }
231
- // If poller returns None, it's dead, thus we also return None to terminate this
232
- // stream.
231
+ // If poller returns None, it's dead, thus we also return None to terminate
232
+ // this stream.
233
233
  None => {
234
- // Make sure we call the actual shutdown function here to propagate any panics
235
- // inside the polling tasks as errors.
234
+ // Make sure we call the actual shutdown function here to propagate any
235
+ // panics inside the polling tasks as errors.
236
236
  poller.shutdown_box().await;
237
237
  None
238
238
  }
@@ -281,6 +281,25 @@ mod tests {
281
281
  assert_matches!(stream.next().await, None);
282
282
  }
283
283
 
284
+ /// When the underlying poller returns None (indicating shutdown), the wrapping WFT stream
285
+ /// should also return None to terminate.
286
+ #[tokio::test]
287
+ async fn poller_returning_none_terminates_wft_stream() {
288
+ let mut mock_poller = mock_poller();
289
+ mock_poller.expect_poll().times(1).returning(|| None);
290
+ mock_poller.expect_shutdown().times(1).returning(|| ());
291
+
292
+ let sem = Arc::new(fixed_size_permit_dealer::<WorkflowSlotKind>(10));
293
+
294
+ let stream = new_wft_poller(
295
+ Box::new(MockPermittedPollBuffer::new(sem, mock_poller)),
296
+ MetricsContext::no_op(),
297
+ );
298
+ pin_mut!(stream);
299
+
300
+ assert_matches!(stream.next().await, None);
301
+ }
302
+
284
303
  #[tokio::test]
285
304
  async fn poll_errors_do_produce_responses() {
286
305
  let mut mock_poller = mock_poller();
@@ -32,3 +32,8 @@ async fn grpc_message_too_large_test() {
32
32
  async fn priority_values_sent_to_server() {
33
33
  shared_tests::priority::priority_values_sent_to_server().await
34
34
  }
35
+
36
+ #[tokio::test]
37
+ async fn shutdown_during_active_timer_activity_workflows() {
38
+ shared_tests::shutdown_during_active_timer_activity_workflows().await
39
+ }
@@ -1076,6 +1076,10 @@ pub(crate) fn integ_dev_server_config(
1076
1076
  "frontend.WorkerHeartbeatsEnabled=true".to_owned(),
1077
1077
  "--dynamic-config-value".to_owned(),
1078
1078
  "frontend.ListWorkersEnabled=true".to_owned(),
1079
+ "--dynamic-config-value".to_owned(),
1080
+ "frontend.enableCancelWorkerPollsOnShutdown=true".to_owned(),
1081
+ "--dynamic-config-value".to_owned(),
1082
+ "matching.rps=12000".to_owned(),
1079
1083
  "--search-attribute".to_string(),
1080
1084
  format!("{SEARCH_ATTR_TXT}=Text"),
1081
1085
  "--search-attribute".to_string(),
@@ -247,43 +247,67 @@ async fn docker_worker_heartbeat_basic(#[values("otel", "prom", "no_metrics")] b
247
247
  );
248
248
  in_activity_checks(heartbeat, &start_time, &heartbeat_time);
249
249
  acts_done.notify_one();
250
+
251
+ // Poll until the heartbeat reflects shutdown with the second WFT processed.
252
+ // The worker stays alive (join! waits for both futures) so heartbeats keep firing.
253
+ eventually(
254
+ || {
255
+ let mut rc = raw_client.clone();
256
+ let ns = client.namespace().to_owned();
257
+ async move {
258
+ let workers_list = WorkflowService::list_workers(
259
+ &mut rc,
260
+ ListWorkersRequest {
261
+ namespace: ns,
262
+ page_size: 100,
263
+ next_page_token: Vec::new(),
264
+ query: String::new(),
265
+ }
266
+ .into_request(),
267
+ )
268
+ .await
269
+ .unwrap()
270
+ .into_inner();
271
+ #[allow(deprecated)]
272
+ let hb = workers_list
273
+ .workers_info
274
+ .iter()
275
+ .find_map(|wi| {
276
+ wi.worker_heartbeat.as_ref().filter(|hb| {
277
+ hb.worker_instance_key == worker_instance_key.to_string()
278
+ })
279
+ })
280
+ .unwrap()
281
+ .clone();
282
+ let tasks_done = hb
283
+ .workflow_task_slots_info
284
+ .as_ref()
285
+ .is_some_and(|s| s.total_processed_tasks >= 2);
286
+ let is_shutting_down = hb.status == WorkerStatus::ShuttingDown as i32;
287
+ if tasks_done && is_shutting_down {
288
+ Ok(hb)
289
+ } else {
290
+ Err(anyhow::anyhow!(
291
+ "Heartbeat not ready: tasks={}, shutting_down={}",
292
+ hb.workflow_task_slots_info
293
+ .as_ref()
294
+ .map_or(0, |s| s.total_processed_tasks),
295
+ is_shutting_down,
296
+ ))
297
+ }
298
+ }
299
+ },
300
+ Duration::from_secs(5),
301
+ )
302
+ .await
303
+ .map(|hb| after_shutdown_checks(&hb, &wf_name, &start_time, &heartbeat_time))
304
+ .unwrap();
250
305
  };
251
306
 
252
307
  let runner = async move {
253
308
  worker.run_until_done().await.unwrap();
254
309
  };
255
310
  tokio::join!(test_fut, runner);
256
-
257
- let client = starter.get_client().await;
258
- let mut raw_client = client.clone();
259
- let workers_list = WorkflowService::list_workers(
260
- &mut raw_client,
261
- ListWorkersRequest {
262
- namespace: client.namespace().to_owned(),
263
- page_size: 100,
264
- next_page_token: Vec::new(),
265
- query: String::new(),
266
- }
267
- .into_request(),
268
- )
269
- .await
270
- .unwrap()
271
- .into_inner();
272
- // Since list_workers finds all workers in the namespace, must find specific worker used in this
273
- // test
274
- let worker_info = workers_list
275
- .workers_info
276
- .iter()
277
- .find(|worker_info| {
278
- if let Some(hb) = worker_info.worker_heartbeat.as_ref() {
279
- hb.worker_instance_key == worker_instance_key.to_string()
280
- } else {
281
- false
282
- }
283
- })
284
- .unwrap();
285
- let heartbeat = worker_info.worker_heartbeat.as_ref().unwrap();
286
- after_shutdown_checks(heartbeat, &wf_name, &start_time, &heartbeat_time);
287
311
  }
288
312
 
289
313
  // Tests that rely on Prometheus running in a docker container need to start
@@ -521,23 +545,7 @@ fn after_shutdown_checks(
521
545
  ));
522
546
 
523
547
  let workflow_task_slots = heartbeat.workflow_task_slots_info.clone().unwrap();
524
- assert_eq!(workflow_task_slots.current_available_slots, 5);
525
- assert_eq!(workflow_task_slots.current_used_slots, 1);
526
548
  assert_eq!(workflow_task_slots.total_processed_tasks, 2);
527
- assert_eq!(workflow_task_slots.slot_supplier_kind, "Fixed");
528
- let activity_task_slots = heartbeat.activity_task_slots_info.clone().unwrap();
529
- assert_eq!(activity_task_slots.current_available_slots, 5);
530
- assert_eq!(workflow_task_slots.current_used_slots, 1);
531
- assert_eq!(activity_task_slots.slot_supplier_kind, "Fixed");
532
- assert_eq!(activity_task_slots.last_interval_processed_tasks, 1);
533
- let nexus_task_slots = heartbeat.nexus_task_slots_info.clone().unwrap();
534
- assert_eq!(nexus_task_slots.current_available_slots, 0);
535
- assert_eq!(nexus_task_slots.current_used_slots, 0);
536
- assert_eq!(nexus_task_slots.slot_supplier_kind, "Fixed");
537
- let local_activity_task_slots = heartbeat.local_activity_slots_info.clone().unwrap();
538
- assert_eq!(local_activity_task_slots.current_available_slots, 100);
539
- assert_eq!(local_activity_task_slots.current_used_slots, 0);
540
- assert_eq!(local_activity_task_slots.slot_supplier_kind, "Fixed");
541
549
 
542
550
  let workflow_poller_info = heartbeat.workflow_poller_info.unwrap();
543
551
  assert!(!workflow_poller_info.is_autoscaling);
@@ -559,7 +567,6 @@ fn after_shutdown_checks(
559
567
  ));
560
568
 
561
569
  assert_eq!(heartbeat.total_sticky_cache_hit, 1);
562
- assert_eq!(heartbeat.current_sticky_cache_size, 0);
563
570
  assert_eq!(
564
571
  heartbeat.plugins,
565
572
  vec![
@@ -983,3 +983,8 @@ fn test_default_build_id() {
983
983
  assert!(!o.deployment_options.version.build_id.is_empty());
984
984
  assert_ne!(o.deployment_options.version.build_id, "undetermined");
985
985
  }
986
+
987
+ #[tokio::test]
988
+ async fn shutdown_during_active_timer_activity_workflows() {
989
+ shared_tests::shutdown_during_active_timer_activity_workflows().await
990
+ }
@@ -1,9 +1,15 @@
1
1
  //! Shared tests that are meant to be run against both local dev server and cloud
2
2
 
3
- use crate::common::CoreWfStarter;
4
- use std::sync::{
5
- Arc,
6
- atomic::{AtomicBool, Ordering::Relaxed},
3
+ use crate::common::{CoreWfStarter, activity_functions::StdActivities};
4
+ use std::{
5
+ sync::{
6
+ Arc,
7
+ atomic::{AtomicBool, Ordering::Relaxed},
8
+ },
9
+ time::Duration,
10
+ };
11
+ use temporalio_client::{
12
+ UntypedWorkflow, WorkflowFetchHistoryOptions, WorkflowStartOptions, WorkflowTerminateOptions,
7
13
  };
8
14
  use temporalio_common::{
9
15
  protos::temporal::api::{
@@ -15,7 +21,7 @@ use temporalio_common::{
15
21
  worker::WorkerTaskTypes,
16
22
  };
17
23
  use temporalio_macros::{workflow, workflow_methods};
18
- use temporalio_sdk::{WorkflowContext, WorkflowResult};
24
+ use temporalio_sdk::{ActivityOptions, WorkflowContext, WorkflowResult, WorkflowTermination};
19
25
 
20
26
  pub(crate) mod priority;
21
27
 
@@ -92,3 +98,115 @@ pub(crate) fn is_oversize_grpc_event(
92
98
  false
93
99
  }
94
100
  }
101
+
102
+ #[workflow]
103
+ #[derive(Default)]
104
+ struct ShutdownTimerActivityLoopWf;
105
+
106
+ #[workflow_methods]
107
+ impl ShutdownTimerActivityLoopWf {
108
+ #[run]
109
+ async fn run(ctx: &mut WorkflowContext<Self>) -> WorkflowResult<()> {
110
+ loop {
111
+ ctx.timer(Duration::from_millis(10)).await;
112
+ ctx.start_activity(
113
+ StdActivities::no_op,
114
+ (),
115
+ ActivityOptions {
116
+ start_to_close_timeout: Some(Duration::from_secs(10)),
117
+ ..Default::default()
118
+ },
119
+ )
120
+ .await
121
+ .map_err(|e| WorkflowTermination::from(anyhow::Error::from(e)))?;
122
+ }
123
+ }
124
+ }
125
+
126
+ /// Starts 10 workflows that each run a tight timer+activity loop, then shuts down the worker
127
+ /// and verifies:
128
+ /// 1. Shutdown completes rapidly (< 5s)
129
+ /// 2. No workflow task failures or timeouts appear in any workflow's history
130
+ pub(crate) async fn shutdown_during_active_timer_activity_workflows() {
131
+ let wf_name = "shutdown_during_active_timer_activity_workflows";
132
+ let num_workflows = 10;
133
+
134
+ let mut starter =
135
+ if let Some(wfs) = CoreWfStarter::new_cloud_or_local(wf_name, ">=1.6.3-serverless").await {
136
+ wfs
137
+ } else {
138
+ return;
139
+ };
140
+ starter.sdk_config.register_activities(StdActivities);
141
+ let mut worker = starter.worker().await;
142
+ worker.register_workflow::<ShutdownTimerActivityLoopWf>();
143
+
144
+ let core = worker.core_worker();
145
+ core.validate().await.unwrap();
146
+ assert!(
147
+ core.get_namespace_capabilities().graceful_poll_shutdown(),
148
+ "Server must support graceful poll shutdown for this test"
149
+ );
150
+
151
+ let task_queue = starter.get_task_queue().to_owned();
152
+ let mut wf_ids = Vec::with_capacity(num_workflows);
153
+ for i in 0..num_workflows {
154
+ let wf_id = format!("{task_queue}-{i}");
155
+ worker
156
+ .submit_workflow(
157
+ ShutdownTimerActivityLoopWf::run,
158
+ (),
159
+ WorkflowStartOptions::new(task_queue.clone(), wf_id.clone()).build(),
160
+ )
161
+ .await
162
+ .unwrap();
163
+ wf_ids.push(wf_id);
164
+ }
165
+ // Don't wait for workflow completion — these loop forever
166
+ worker.fetch_results = false;
167
+
168
+ let shutdown_handle = worker.inner_mut().shutdown_handle();
169
+ let run_fut = async { worker.run_until_done().await.unwrap() };
170
+
171
+ let shutdown_fut = async {
172
+ // Let workflows run a few iterations
173
+ tokio::time::sleep(Duration::from_secs(2)).await;
174
+ shutdown_handle();
175
+ };
176
+
177
+ let shutdown_start = std::time::Instant::now();
178
+ tokio::join!(run_fut, shutdown_fut);
179
+ let shutdown_elapsed = shutdown_start.elapsed();
180
+
181
+ assert!(
182
+ shutdown_elapsed < Duration::from_secs(5),
183
+ "Worker shutdown took {shutdown_elapsed:?}, expected < 5s"
184
+ );
185
+
186
+ let client = starter.get_client().await;
187
+ for wf_id in &wf_ids {
188
+ client
189
+ .get_workflow_handle::<UntypedWorkflow>(wf_id)
190
+ .terminate(WorkflowTerminateOptions::default())
191
+ .await
192
+ .unwrap();
193
+
194
+ let history = client
195
+ .get_workflow_handle::<UntypedWorkflow>(wf_id)
196
+ .fetch_history(WorkflowFetchHistoryOptions::default())
197
+ .await
198
+ .unwrap();
199
+ let bad_events: Vec<_> = history
200
+ .events()
201
+ .iter()
202
+ .filter(|e| {
203
+ e.event_type() == EventType::WorkflowTaskFailed
204
+ || e.event_type() == EventType::WorkflowTaskTimedOut
205
+ })
206
+ .collect();
207
+ assert!(
208
+ bad_events.is_empty(),
209
+ "Workflow {wf_id} had unexpected WFT failures/timeouts: {bad_events:?}"
210
+ );
211
+ }
212
+ }
@@ -929,6 +929,7 @@ pub extern "C" fn temporal_core_worker_request_workflow_eviction(
929
929
  #[unsafe(no_mangle)]
930
930
  pub extern "C" fn temporal_core_worker_initiate_shutdown(worker: *mut Worker) {
931
931
  let worker = unsafe { &*worker };
932
+ enter_sync!(worker.runtime);
932
933
  worker.worker.as_ref().unwrap().initiate_shutdown();
933
934
  }
934
935
 
package/src/worker.rs CHANGED
@@ -339,6 +339,12 @@ pub fn worker_complete_nexus_task(
339
339
  #[js_function]
340
340
  pub fn worker_initiate_shutdown(worker: OpaqueInboundHandle<Worker>) -> BridgeResult<()> {
341
341
  let worker_ref = worker.borrow()?;
342
+
343
+ // Core worker shutdown now spawns a Tokio task, so this sync Neon binding must
344
+ // enter Core's Tokio runtime before initiating shutdown.
345
+ let runtime = worker_ref.core_runtime.clone();
346
+ enter_sync!(runtime);
347
+
342
348
  worker_ref.core_worker.initiate_shutdown();
343
349
  Ok(())
344
350
  }
@@ -868,7 +874,7 @@ mod custom_slot_supplier {
868
874
  }
869
875
  Err(err) => {
870
876
  warn!("Error reserving slot: {err:?}");
871
- tokio::time::sleep(std::time::Duration::from_millis(1000)).await;
877
+ tokio::time::sleep(std::time::Duration::from_secs(1)).await;
872
878
  }
873
879
  }
874
880
  }