@temporalio/core-bridge 0.20.0 → 0.21.0

This diff compares the contents of the two package versions as published to their public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the registry.
package/index.d.ts CHANGED
@@ -162,6 +162,7 @@ export declare function newReplayWorker(
   callback: WorkerCallback
 ): void;
 export declare function workerShutdown(worker: Worker, callback: VoidCallback): void;
+export declare function clientClose(client: Client): void;
 export declare function runtimeShutdown(runtime: Runtime, callback: VoidCallback): void;
 export declare function pollLogs(runtime: Runtime, callback: LogsCallback): void;
 export declare function workerPollWorkflowActivation(
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@temporalio/core-bridge",
-  "version": "0.20.0",
+  "version": "0.21.0",
   "description": "Temporal.io SDK Core<>Node bridge",
   "main": "index.js",
   "types": "index.d.ts",
@@ -20,7 +20,7 @@
   "license": "MIT",
   "dependencies": {
     "@opentelemetry/api": "^1.0.3",
-    "@temporalio/internal-non-workflow-common": "^0.20.0",
+    "@temporalio/internal-non-workflow-common": "^0.21.0",
     "arg": "^5.0.1",
     "cargo-cp-artifact": "^0.1.4",
     "which": "^2.0.2"
@@ -43,5 +43,5 @@
   "publishConfig": {
     "access": "public"
   },
-  "gitHead": "e6b7468a00c68efd4baebbf866cf0a28c150bb6b"
+  "gitHead": "eb5901f47e16f8c8fe36c1154d5176c5f3205efc"
 }
@@ -21,7 +21,8 @@ use temporal_sdk_core_protos::{
         history::v1::History,
         query::v1::WorkflowQuery,
         workflowservice::v1::{
-            RespondQueryTaskCompletedResponse, RespondWorkflowTaskCompletedResponse,
+            GetWorkflowExecutionHistoryResponse, RespondQueryTaskCompletedResponse,
+            RespondWorkflowTaskCompletedResponse,
         },
     },
 };
@@ -381,3 +382,108 @@ async fn legacy_query_after_complete(#[values(false, true)] full_history: bool)
 
     core.shutdown().await;
 }
+
+#[tokio::test]
+async fn query_cache_miss_causes_page_fetch_dont_reply_wft_too_early() {
+    let wfid = "fake_wf_id";
+    let query_resp = "response";
+    let t = canned_histories::single_timer("1");
+    let full_hist = t.get_full_history_info().unwrap();
+    let tasks = VecDeque::from(vec![{
+        // Create a partial task
+        let mut pr = hist_to_poll_resp(
+            &t,
+            wfid.to_owned(),
+            ResponseType::OneTask(2),
+            TEST_Q.to_string(),
+        );
+        pr.queries = HashMap::new();
+        pr.queries.insert(
+            "the-query".to_string(),
+            WorkflowQuery {
+                query_type: "query-type".to_string(),
+                query_args: Some(b"hi".into()),
+                header: None,
+            },
+        );
+        pr
+    }]);
+    let mut mock_client = mock_workflow_client();
+    mock_client
+        .expect_get_workflow_execution_history()
+        .returning(move |_, _, _| {
+            Ok(GetWorkflowExecutionHistoryResponse {
+                history: Some(full_hist.clone().into()),
+                ..Default::default()
+            })
+        });
+    mock_client
+        .expect_complete_workflow_task()
+        .times(1)
+        .returning(|resp| {
+            // Verify both the complete command and the query response are sent
+            assert_eq!(resp.commands.len(), 1);
+            assert_eq!(resp.query_responses.len(), 1);
+
+            Ok(RespondWorkflowTaskCompletedResponse::default())
+        });
+
+    let mut mock = MocksHolder::from_client_with_responses(mock_client, tasks, vec![]);
+    mock.worker_cfg(|wc| wc.max_cached_workflows = 10);
+    let core = mock_worker(mock);
+    let task = core.poll_workflow_activation().await.unwrap();
+    // The first task should *only* start the workflow. It should *not* have a query in it, which
+    // was the bug. Query should only appear after we have caught up on replay.
+    assert_matches!(
+        task.jobs.as_slice(),
+        [WorkflowActivationJob {
+            variant: Some(workflow_activation_job::Variant::StartWorkflow(_)),
+        }]
+    );
+    core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
+        task.run_id,
+        start_timer_cmd(1, Duration::from_secs(1)),
+    ))
+    .await
+    .unwrap();
+
+    let task = core.poll_workflow_activation().await.unwrap();
+    assert_matches!(
+        task.jobs.as_slice(),
+        [WorkflowActivationJob {
+            variant: Some(workflow_activation_job::Variant::FireTimer(_)),
+        }]
+    );
+    core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
+        task.run_id,
+        CompleteWorkflowExecution { result: None }.into(),
+    ))
+    .await
+    .unwrap();
+
+    // Now the query shall arrive
+    let task = core.poll_workflow_activation().await.unwrap();
+    assert_matches!(
+        task.jobs[0],
+        WorkflowActivationJob {
+            variant: Some(workflow_activation_job::Variant::QueryWorkflow(_)),
+        }
+    );
+    core.complete_workflow_activation(WorkflowActivationCompletion::from_cmd(
+        task.run_id,
+        QueryResult {
+            query_id: "the-query".to_string(),
+            variant: Some(
+                QuerySuccess {
+                    response: Some(query_resp.into()),
+                }
+                .into(),
+            ),
+        }
+        .into(),
+    ))
+    .await
+    .unwrap();
+
+    core.shutdown().await;
+}
@@ -35,7 +35,7 @@ use temporal_sdk_core_protos::{
     temporal::api::{
         enums::v1::{EventType, WorkflowTaskFailedCause},
         failure::v1::Failure,
-        history::v1::{history_event, TimerFiredEventAttributes},
+        history::v1::{history_event, History, TimerFiredEventAttributes},
         workflowservice::v1::{
             GetWorkflowExecutionHistoryResponse, RespondWorkflowTaskCompletedResponse,
         },
@@ -1694,3 +1694,46 @@ async fn tasks_from_completion_are_delivered() {
         .unwrap();
     core.shutdown().await;
 }
+
+#[tokio::test]
+async fn evict_missing_wf_during_poll_doesnt_eat_permit() {
+    let wfid = "fake_wf_id";
+    let mut t = TestHistoryBuilder::default();
+    t.add_by_type(EventType::WorkflowExecutionStarted);
+    t.add_full_wf_task();
+    t.add_we_signaled("sig", vec![]);
+    t.add_full_wf_task();
+    t.add_workflow_execution_completed();
+
+    let tasks = [hist_to_poll_resp(
+        &t,
+        wfid.to_owned(),
+        // Use a partial task so that we'll fetch history
+        ResponseType::OneTask(2),
+        TEST_Q.to_string(),
+    )];
+    let mut mock = mock_workflow_client();
+    mock.expect_get_workflow_execution_history()
+        .times(1)
+        .returning(move |_, _, _| {
+            Ok(GetWorkflowExecutionHistoryResponse {
+                // Empty history so we error applying it (no jobs)
+                history: Some(History { events: vec![] }),
+                raw_history: vec![],
+                next_page_token: vec![],
+                archived: false,
+            })
+        });
+    let mut mock = MocksHolder::from_client_with_responses(mock, tasks, []);
+    mock.worker_cfg(|wc| {
+        wc.max_cached_workflows = 1;
+        wc.max_outstanding_workflow_tasks = 1;
+    });
+    let core = mock_worker(mock);
+
+    // Should error because mock is out of work
+    assert_matches!(core.poll_workflow_activation().await, Err(_));
+    assert_eq!(core.available_wft_permits(), 1);
+
+    core.shutdown().await;
+}
@@ -14,7 +14,10 @@ use temporal_sdk_core_protos::{
         activity_task::{activity_task, ActivityCancelReason, ActivityTask, Cancel, Start},
         common::WorkflowExecution,
     },
-    temporal::api::enums::v1::TimeoutType,
+    temporal::api::{
+        enums::v1::TimeoutType,
+        failure::v1::{failure::FailureInfo, ApplicationFailureInfo},
+    },
 };
 use tokio::{
     sync::{
@@ -420,10 +423,13 @@ impl LocalActivityManager {
             LocalActivityExecutionResult::Failed(f) => {
                 if let Some(backoff_dur) = info.la_info.schedule_cmd.retry_policy.should_retry(
                     info.attempt as usize,
-                    &f.failure
-                        .as_ref()
-                        .map(|f| format!("{:?}", f))
-                        .unwrap_or_else(|| "".to_string()),
+                    f.failure.as_ref().map_or("", |f| match &f.failure_info {
+                        Some(FailureInfo::ApplicationFailureInfo(ApplicationFailureInfo {
+                            r#type,
+                            ..
+                        })) => r#type.as_str(),
+                        _ => "",
+                    }),
                 ) {
                     let will_use_timer =
                         backoff_dur > info.la_info.schedule_cmd.local_retry_threshold;
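Note on the change above: `should_retry` previously received a `Debug`-formatted dump of the whole failure, so a name listed in `non_retryable_error_types` (such as "TestError") could never match it; it now receives only the `ApplicationFailureInfo` `type` string. A minimal sketch of the comparison this enables, using simplified stand-in types rather than the `temporal_sdk_core_protos` definitions:

```rust
// Stand-in types for illustration only; the real ones live in temporal_sdk_core_protos.
struct ApplicationFailureInfo {
    r#type: String,
}

struct Failure {
    application_info: Option<ApplicationFailureInfo>,
}

struct RetryPolicy {
    non_retryable_error_types: Vec<String>,
}

impl RetryPolicy {
    /// Returns true if the failure's application error type is listed as non-retryable.
    fn is_non_retryable(&self, failure: Option<&Failure>) -> bool {
        // Extract only the application failure's `type` string. Comparing a Debug dump of
        // the whole failure (the old behavior) would never equal a bare type name.
        let err_type = failure
            .and_then(|f| f.application_info.as_ref())
            .map_or("", |i| i.r#type.as_str());
        self.non_retryable_error_types
            .iter()
            .any(|t| t.as_str() == err_type)
    }
}

fn main() {
    let policy = RetryPolicy {
        non_retryable_error_types: vec!["TestError".to_string()],
    };
    let failure = Failure {
        application_info: Some(ApplicationFailureInfo {
            r#type: "TestError".to_string(),
        }),
    };
    // A failure whose type is listed should not be retried.
    assert!(policy.is_non_retryable(Some(&failure)));
}
```

The `respects_non_retryable_error_types` test added further down exercises the real code path end to end.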
@@ -637,7 +643,9 @@ impl Drop for TimeoutBag {
 mod tests {
     use super::*;
     use crate::protosext::LACloseTimeouts;
-    use temporal_sdk_core_protos::coresdk::common::RetryPolicy;
+    use temporal_sdk_core_protos::{
+        coresdk::common::RetryPolicy, temporal::api::failure::v1::Failure,
+    };
     use tokio::{sync::mpsc::error::TryRecvError, task::yield_now};
 
     impl DispatchOrTimeoutLA {
@@ -785,6 +793,50 @@ mod tests {
         )
     }
 
+    #[tokio::test]
+    async fn respects_non_retryable_error_types() {
+        let lam = LocalActivityManager::test(1);
+        lam.enqueue([NewLocalAct {
+            schedule_cmd: ValidScheduleLA {
+                seq: 1,
+                activity_id: "1".to_string(),
+                attempt: 1,
+                retry_policy: RetryPolicy {
+                    initial_interval: Some(Duration::from_secs(1).into()),
+                    backoff_coefficient: 10.0,
+                    maximum_interval: Some(Duration::from_secs(10).into()),
+                    maximum_attempts: 10,
+                    non_retryable_error_types: vec!["TestError".to_string()],
+                },
+                local_retry_threshold: Duration::from_secs(5),
+                ..Default::default()
+            },
+            workflow_type: "".to_string(),
+            workflow_exec_info: Default::default(),
+            schedule_time: SystemTime::now(),
+        }
+        .into()]);
+
+        let next = lam.next_pending().await.unwrap().unwrap();
+        let tt = TaskToken(next.task_token);
+        let res = lam.complete(
+            &tt,
+            &LocalActivityExecutionResult::Failed(ActFail {
+                failure: Some(Failure {
+                    failure_info: Some(FailureInfo::ApplicationFailureInfo(
+                        ApplicationFailureInfo {
+                            r#type: "TestError".to_string(),
+                            non_retryable: false,
+                            ..Default::default()
+                        },
+                    )),
+                    ..Default::default()
+                }),
+            }),
+        );
+        assert_matches!(res, LACompleteAction::Report(_));
+    }
+
     #[tokio::test]
     async fn can_cancel_during_local_backoff() {
         let lam = LocalActivityManager::test(1);
@@ -65,6 +65,7 @@ use tracing_futures::Instrument;
 
 #[cfg(test)]
 use crate::worker::client::WorkerClient;
+use crate::workflow::workflow_tasks::EvictionRequestResult;
 
 /// A worker polls on a certain task queue
 pub struct Worker {
@@ -530,13 +531,18 @@ impl Worker {
         self.workflows_semaphore.add_permit();
     }
 
+    /// Request a workflow eviction. Returns true if we actually queued up a new eviction request.
     pub(crate) fn request_wf_eviction(
         &self,
         run_id: &str,
         message: impl Into<String>,
         reason: EvictionReason,
-    ) {
-        self.wft_manager.request_eviction(run_id, message, reason);
+    ) -> bool {
+        match self.wft_manager.request_eviction(run_id, message, reason) {
+            EvictionRequestResult::EvictionIssued(_) => true,
+            EvictionRequestResult::NotFound => false,
+            EvictionRequestResult::EvictionAlreadyOutstanding => false,
+        }
     }
 
     /// Sets a function to be called at the end of each activation completion
@@ -675,11 +681,16 @@ impl Worker {
             }
             NewWfTaskOutcome::Evict(e) => {
                 warn!(error=?e, run_id=%we.run_id, "Error while applying poll response to workflow");
-                self.request_wf_eviction(
+                let did_issue_eviction = self.request_wf_eviction(
                     &we.run_id,
                     format!("Error while applying poll response to workflow: {:?}", e),
                     e.evict_reason(),
                 );
+                // If we didn't actually need to issue an eviction, then return the WFT permit.
+                // EX: The workflow we tried to evict wasn't in the cache.
+                if !did_issue_eviction {
+                    self.return_workflow_task_permit();
+                }
                 None
             }
         })
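The hunk above ties eviction requests to permit accounting: the poll path holds a workflow-task permit, and when the eviction turns out to be a no-op (the run was never cached) the permit must be handed back or the worker slowly starves. The `evict_missing_wf_during_poll_doesnt_eat_permit` test earlier in the diff pins that invariant down. A rough sketch of the invariant, using a plain `tokio::sync::Semaphore` as a stand-in for core's permit bookkeeping:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Hypothetical stand-in for applying a poll response; it fails when the run is not
/// cached, mirroring the "tried to evict a workflow that wasn't in the cache" case.
fn apply_poll_response(run_is_cached: bool) -> Result<(), &'static str> {
    if run_is_cached {
        Ok(())
    } else {
        Err("workflow not in cache")
    }
}

#[tokio::main]
async fn main() {
    let wft_permits = Arc::new(Semaphore::new(1));

    // Acquire a permit before handling the poll, as the worker does.
    let permit = wft_permits.clone().acquire_owned().await.unwrap();

    if apply_poll_response(false).is_err() {
        // No eviction was issued (nothing was cached), so return the permit
        // right away instead of leaking it on the error path.
        drop(permit);
    }

    // Every permit is available again, matching `available_wft_permits() == 1`.
    assert_eq!(wft_permits.available_permits(), 1);
}
```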
@@ -12,6 +12,7 @@ use std::{
     collections::HashMap,
     fmt::Debug,
     ops::{Deref, DerefMut},
+    sync::Arc,
 };
 use temporal_sdk_core_protos::coresdk::workflow_activation::WorkflowActivation;
 
@@ -22,7 +23,7 @@ pub(crate) struct WorkflowConcurrencyManager {
 }
 
 struct ManagedRun {
-    wfm: Mutex<WorkflowManager>,
+    wfm: Arc<Mutex<WorkflowManager>>,
     wft: Option<OutstandingTask>,
     activation: Option<OutstandingActivation>,
     metrics: MetricsContext,
@@ -36,7 +37,7 @@ struct ManagedRun {
 impl ManagedRun {
     fn new(wfm: WorkflowManager, metrics: MetricsContext) -> Self {
         Self {
-            wfm: Mutex::new(wfm),
+            wfm: Arc::new(Mutex::new(wfm)),
             wft: None,
             activation: None,
             metrics,
@@ -266,16 +267,19 @@ impl WorkflowConcurrencyManager {
         F: for<'a> FnOnce(&'a mut WorkflowManager) -> BoxFuture<Result<Fout>>,
         Fout: Send + Debug,
     {
-        let readlock = self.runs.read();
-        let m = readlock
-            .get(run_id)
-            .ok_or_else(|| WFMachinesError::Fatal("Missing workflow machines".to_string()))?;
-        // This holds a non-async mutex across an await point which is technically a no-no, but
-        // we never access the machines for the same run simultaneously anyway. This should all
-        // get fixed with a generally different approach which moves the runs inside workers.
-        let mut wfm_mutex = m.wfm.lock();
-        let res = mutator(&mut wfm_mutex).await;
+        // TODO: Slightly less than ideal. We must avoid holding the read lock on the overall
+        // machine map while async-ly mutating the inner machine. So, we clone the inner ArcMutex.
+        // We should restructure things to avoid the top-level lock on the map.
+
+        let wfm = {
+            let readlock = self.runs.read();
+            let m = readlock
+                .get(run_id)
+                .ok_or_else(|| WFMachinesError::Fatal("Missing workflow machines".to_string()))?;
+            m.wfm.clone()
+        };
 
+        let res = mutator(&mut wfm.lock()).await;
         res
     }
 
@@ -321,6 +325,8 @@ impl WorkflowConcurrencyManager {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test_help::canned_histories;
+    use tokio::sync::Barrier;
 
     // We test mostly error paths here since the happy paths are well covered by the tests of the
     // core sdk itself, and setting up the fake data is onerous here. If we make the concurrency
@@ -342,4 +348,57 @@ mod tests {
         // Should whine that the machines have nothing to do (history empty)
         assert_matches!(res.unwrap_err(), WFMachinesError::Fatal { .. });
     }
+
+    /// This test makes sure that if we're stuck on an await within the machine mutator we don't
+    /// cause a deadlock if a write happens during that. This test will hang without proper
+    /// implementation.
+    #[tokio::test]
+    async fn aba_deadlock_prevented() {
+        let run_id = "some_run_id";
+        let timer_hist = canned_histories::single_timer("t");
+        let access_barr: &'static Barrier = Box::leak(Box::new(Barrier::new(2)));
+        let wft = timer_hist.get_history_info(1).unwrap();
+
+        let mgr = WorkflowConcurrencyManager::new();
+        mgr.create_or_update(
+            run_id,
+            wft.clone().into(),
+            "fake_wf_id",
+            "fake_namespace",
+            "fake_wf_type",
+            &Default::default(),
+        )
+        .await
+        .unwrap();
+        // Perform access which blocks
+        let access_fut = mgr.access(run_id, |_wfm| {
+            async {
+                // Wait to make sure access has started
+                access_barr.wait().await;
+                // Wait to make sure write has finished
+                access_barr.wait().await;
+                Ok(())
+            }
+            .boxed()
+        });
+        let write_fut = async {
+            // Wait to make sure access has started
+            access_barr.wait().await;
+            // Now try writing
+            mgr.create_or_update(
+                "different_run_id",
+                wft.clone().into(),
+                "fake_wf_id",
+                "fake_namespace",
+                "fake_wf_type",
+                &Default::default(),
+            )
+            .await
+            .unwrap();
+            // Indicate write has finished
+            access_barr.wait().await;
+        };
+        let (r1, _) = tokio::join!(access_fut, write_fut);
+        r1.unwrap();
+    }
 }
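The `aba_deadlock_prevented` test relies on the `Arc<Mutex<WorkflowManager>>` change earlier in this file: the map's read lock is now confined to the block that clones out the per-run Arc, so a concurrent `create_or_update` (which needs the write lock) can proceed while the mutator future is awaited. A minimal sketch of that locking pattern with stand-in types (std locks here, where the real code uses `parking_lot`):

```rust
use std::{
    collections::HashMap,
    sync::{Arc, Mutex, RwLock},
};

struct RunState {
    history_length: usize,
}

struct Manager {
    runs: RwLock<HashMap<String, Arc<Mutex<RunState>>>>,
}

impl Manager {
    async fn access(&self, run_id: &str) -> Result<usize, String> {
        // Scope the read lock: clone the Arc and release the map immediately.
        let run = {
            let map = self.runs.read().unwrap();
            map.get(run_id)
                .cloned()
                .ok_or_else(|| format!("missing run {run_id}"))?
        };
        // The map lock is already released here, so concurrent inserts (a new run
        // being created, say) cannot deadlock against this await point.
        tokio::task::yield_now().await;
        let mut state = run.lock().unwrap();
        state.history_length += 1;
        Ok(state.history_length)
    }
}

#[tokio::main]
async fn main() {
    let mgr = Manager {
        runs: RwLock::new(HashMap::from([(
            "run-1".to_string(),
            Arc::new(Mutex::new(RunState { history_length: 0 })),
        )])),
    };
    assert_eq!(mgr.access("run-1").await.unwrap(), 1);
}
```

As the TODO in the hunk notes, the per-run lock is still held across the await in sdk-core; only the map-wide lock is scoped.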
@@ -57,7 +57,7 @@ pub struct WorkflowTaskManager {
     pending_activations: PendingActivations,
     /// Holds activations which are purely query activations needed to respond to legacy queries.
     /// Activations may only be added here for runs which do not have other pending activations.
-    pending_legacy_queries: SegQueue<WorkflowActivation>,
+    pending_queries: SegQueue<WorkflowActivation>,
     /// Holds poll wft responses from the server that need to be applied
     ready_buffered_wft: SegQueue<ValidPollWFTQResponse>,
     /// Used to wake blocked workflow task polling
@@ -74,9 +74,8 @@ pub struct WorkflowTaskManager {
 #[derive(Clone, Debug)]
 pub(crate) struct OutstandingTask {
     pub info: WorkflowTaskInfo,
-    /// If set the outstanding task has query from the old `query` field which must be fulfilled
-    /// upon finishing replay
-    pub legacy_query: Option<QueryWorkflow>,
+    /// Set if the outstanding task has quer(ies) which must be fulfilled upon finishing replay
+    pub pending_queries: Vec<QueryWorkflow>,
     start_time: Instant,
 }
 
@@ -150,6 +149,13 @@ pub(crate) enum ActivationAction {
     RespondLegacyQuery { result: QueryResult },
 }
 
+#[derive(Debug, Eq, PartialEq, Hash)]
+pub(crate) enum EvictionRequestResult {
+    EvictionIssued(Option<u32>),
+    NotFound,
+    EvictionAlreadyOutstanding,
+}
+
 macro_rules! machine_mut {
     ($myself:ident, $run_id:ident, $clos:expr) => {{
         $myself
@@ -172,7 +178,7 @@ impl WorkflowTaskManager {
         Self {
             workflow_machines: WorkflowConcurrencyManager::new(),
             pending_activations: Default::default(),
-            pending_legacy_queries: Default::default(),
+            pending_queries: Default::default(),
             ready_buffered_wft: Default::default(),
             pending_activations_notifier,
             cache_manager: Mutex::new(WorkflowCacheManager::new(eviction_policy, metrics.clone())),
@@ -181,8 +187,8 @@ impl WorkflowTaskManager {
     }
 
     pub(crate) fn next_pending_activation(&self) -> Option<WorkflowActivation> {
-        // Dispatch pending legacy queries first
-        if let leg_q @ Some(_) = self.pending_legacy_queries.pop() {
+        // Dispatch pending queries first
+        if let leg_q @ Some(_) = self.pending_queries.pop() {
            return leg_q;
         }
         // It is important that we do not issue pending activations for any workflows which already
@@ -247,7 +253,7 @@ impl WorkflowTaskManager {
         run_id: &str,
         message: impl Into<String>,
         reason: EvictionReason,
-    ) -> Option<u32> {
+    ) -> EvictionRequestResult {
         if self.workflow_machines.exists(run_id) {
             if !self.activation_has_eviction(run_id) {
                 let message = message.into();
@@ -256,13 +262,17 @@ impl WorkflowTaskManager {
                 self.pending_activations
                     .notify_needs_eviction(run_id, message, reason);
                 self.pending_activations_notifier.notify_waiters();
+                EvictionRequestResult::EvictionIssued(
+                    self.workflow_machines
+                        .get_task(run_id)
+                        .map(|wt| wt.info.attempt),
+                )
+            } else {
+                EvictionRequestResult::EvictionAlreadyOutstanding
             }
-            self.workflow_machines
-                .get_task(run_id)
-                .map(|wt| wt.info.attempt)
         } else {
             warn!(%run_id, "Eviction requested for unknown run");
-            None
+            EvictionRequestResult::NotFound
         }
     }
 
@@ -304,9 +314,11 @@ impl WorkflowTaskManager {
             return NewWfTaskOutcome::TaskBuffered;
         };
 
+        let start_event_id = work.history.events.first().map(|e| e.event_id);
         debug!(
             task_token = %&work.task_token,
             history_length = %work.history.events.len(),
+            start_event_id = ?start_event_id,
             attempt = %work.attempt,
             run_id = %work.workflow_execution.run_id,
             "Applying new workflow task from server"
@@ -320,33 +332,45 @@ impl WorkflowTaskManager {
             .take()
             .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
 
-        let (info, mut next_activation) =
+        let (info, mut next_activation, mut pending_queries) =
             match self.instantiate_or_update_workflow(work, client).await {
-                Ok((info, next_activation)) => (info, next_activation),
+                Ok(res) => res,
                 Err(e) => {
                     return NewWfTaskOutcome::Evict(e);
                 }
             };
 
+        if !pending_queries.is_empty() && legacy_query.is_some() {
+            error!(
+                "Server issued both normal and legacy queries. This should not happen. Please \
+                 file a bug report."
+            );
+            return NewWfTaskOutcome::Evict(WorkflowUpdateError {
+                source: WFMachinesError::Fatal(
+                    "Server issued both normal and legacy query".to_string(),
+                ),
+                run_id: next_activation.run_id,
+            });
+        }
+
         // Immediately dispatch query activation if no other jobs
-        let legacy_query = if next_activation.jobs.is_empty() {
-            if let Some(lq) = legacy_query {
+        if let Some(lq) = legacy_query {
+            if next_activation.jobs.is_empty() {
                 debug!("Dispatching legacy query {}", &lq);
                 next_activation
                     .jobs
                     .push(workflow_activation_job::Variant::QueryWorkflow(lq).into());
+            } else {
+                pending_queries.push(lq);
             }
-            None
-        } else {
-            legacy_query
-        };
+        }
 
         self.workflow_machines
             .insert_wft(
                 &next_activation.run_id,
                 OutstandingTask {
                     info,
-                    legacy_query,
+                    pending_queries,
                     start_time: task_start_time,
                 },
             )
@@ -388,11 +412,11 @@ impl WorkflowTaskManager {
             return Ok(None);
         }
 
-        let (task_token, is_leg_query_task, start_time) =
+        let (task_token, has_pending_query, start_time) =
             if let Some(entry) = self.workflow_machines.get_task(run_id) {
                 (
                     entry.info.task_token.clone(),
-                    entry.legacy_query.is_some(),
+                    !entry.pending_queries.is_empty(),
                     entry.start_time,
                 )
             } else {
@@ -493,7 +517,7 @@ impl WorkflowTaskManager {
         let must_heartbeat = self
             .wait_for_local_acts_or_heartbeat(run_id, wft_heartbeat_deadline)
             .await;
-        let is_query_playback = is_leg_query_task && query_responses.is_empty();
+        let is_query_playback = has_pending_query && query_responses.is_empty();
 
         // We only actually want to send commands back to the server if there are no more
         // pending activations and we are caught up on replay. We don't want to complete a wft
@@ -559,9 +583,10 @@ impl WorkflowTaskManager {
             FailedActivationOutcome::ReportLegacyQueryFailure(tt)
         } else {
             // Blow up any cached data associated with the workflow
-            let should_report = self
-                .request_eviction(run_id, failstr, reason)
-                .map_or(true, |attempt| attempt <= 1);
+            let should_report = match self.request_eviction(run_id, failstr, reason) {
+                EvictionRequestResult::EvictionIssued(Some(attempt)) => attempt <= 1,
+                _ => false,
+            };
             if should_report {
                 FailedActivationOutcome::Report(tt)
             } else {
@@ -578,7 +603,8 @@ impl WorkflowTaskManager {
         &self,
         poll_wf_resp: ValidPollWFTQResponse,
         client: Arc<WorkerClientBag>,
-    ) -> Result<(WorkflowTaskInfo, WorkflowActivation), WorkflowUpdateError> {
+    ) -> Result<(WorkflowTaskInfo, WorkflowActivation, Vec<QueryWorkflow>), WorkflowUpdateError>
+    {
         let run_id = poll_wf_resp.workflow_execution.run_id.clone();
 
         let wft_info = WorkflowTaskInfo {
@@ -593,10 +619,12 @@ impl WorkflowTaskManager {
             .map(|ev| ev.event_id > 1)
             .unwrap_or_default();
 
+        let mut did_miss_cache = false;
         let page_token = if !self.workflow_machines.exists(&run_id) && poll_resp_is_incremental {
             debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
                    cache. Will fetch history");
             self.metrics.sticky_cache_miss();
+            did_miss_cache = true;
             NextPageToken::FetchFromStart
         } else {
             poll_wf_resp.next_page_token.into()
@@ -625,16 +653,26 @@ impl WorkflowTaskManager {
         .await
         {
             Ok(mut activation) => {
-                // If there are in-poll queries, insert jobs for those queries into the activation
+                // If there are in-poll queries, insert jobs for those queries into the activation,
+                // but only if we hit the cache. If we didn't, those queries will need to be dealt
+                // with once replay is over
+                let mut pending_queries = vec![];
                 if !poll_wf_resp.query_requests.is_empty() {
-                    let query_jobs = poll_wf_resp
-                        .query_requests
-                        .into_iter()
-                        .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
-                    activation.jobs.extend(query_jobs);
+                    if !did_miss_cache {
+                        let query_jobs = poll_wf_resp
+                            .query_requests
+                            .into_iter()
+                            .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
+                        activation.jobs.extend(query_jobs);
+                    } else {
+                        poll_wf_resp
+                            .query_requests
+                            .into_iter()
+                            .for_each(|q| pending_queries.push(q));
+                    }
                 }
 
-                Ok((wft_info, activation))
+                Ok((wft_info, activation, pending_queries))
             }
             Err(source) => Err(WorkflowUpdateError { source, run_id }),
         }
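This hunk carries the core of the cache-miss fix: queries delivered alongside a workflow task are attached to the activation only when the run was already cached; on a miss they are parked in `pending_queries` until replay catches up (the `query_cache_miss_causes_page_fetch_dont_reply_wft_too_early` test near the top of this diff covers it). A hypothetical distillation of that routing decision, with stand-in types rather than the sdk-core ones:

```rust
// Stand-in activation type for illustration; sdk-core uses protobuf-generated jobs.
#[derive(Default)]
struct Activation {
    jobs: Vec<String>,
}

/// Attach in-poll queries to the current activation only when the run was cached;
/// otherwise defer them until replay has caught up.
fn route_queries(
    did_miss_cache: bool,
    query_requests: Vec<String>,
    activation: &mut Activation,
    pending_queries: &mut Vec<String>,
) {
    if did_miss_cache {
        // Answering now would complete the workflow task before replay is done.
        pending_queries.extend(query_requests);
    } else {
        activation.jobs.extend(query_requests);
    }
}

fn main() {
    let mut activation = Activation::default();
    let mut pending = Vec::new();
    route_queries(true, vec!["the-query".into()], &mut activation, &mut pending);
    // On a cache miss the query is deferred, not dispatched.
    assert!(activation.jobs.is_empty());
    assert_eq!(pending, vec!["the-query".to_string()]);
}
```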
@@ -661,16 +699,18 @@ impl WorkflowTaskManager {
         // removed from the outstanding tasks map
         let retme = if !self.pending_activations.has_pending(run_id) {
             if !just_evicted {
-                // Check if there was a legacy query which must be fulfilled, and if there is create
-                // a new pending activation for it.
+                // Check if there was a pending query which must be fulfilled, and if there is
+                // create a new pending activation for it.
                 if let Some(ref mut ot) = &mut *self
                     .workflow_machines
                     .get_task_mut(run_id)
                     .expect("Machine must exist")
                 {
-                    if let Some(query) = ot.legacy_query.take() {
-                        let na = create_query_activation(run_id.to_string(), [query]);
-                        self.pending_legacy_queries.push(na);
+                    if !ot.pending_queries.is_empty() {
+                        for query in ot.pending_queries.drain(..) {
+                            let na = create_query_activation(run_id.to_string(), [query]);
+                            self.pending_queries.push(na);
+                        }
                         self.pending_activations_notifier.notify_waiters();
                         return false;
                     }
@@ -389,7 +389,7 @@ impl ActivityHalf {
         tokio::spawn(ACT_CANCEL_TOK.scope(ct, async move {
             let mut inputs = start.input;
             let arg = inputs.pop().unwrap_or_default();
-            let output = (&act_fn.act_func)(arg).await;
+            let output = (act_fn.act_func)(arg).await;
             let result = match output {
                 Ok(res) => ActivityExecutionResult::ok(res),
                 Err(err) => match err.downcast::<ActivityCancelledError>() {
@@ -1,5 +1,5 @@
 use std::time::Duration;
-use temporal_client::{WorkflowClientTrait, WorkflowOptions};
+use temporal_client::WorkflowOptions;
 use temporal_sdk::{WfContext, WfExitValue, WorkflowResult};
 use temporal_sdk_core_protos::coresdk::workflow_commands::ContinueAsNewWorkflowExecution;
 use temporal_sdk_core_test_utils::CoreWfStarter;
@@ -33,13 +31,31 @@ async fn continue_as_new_happy_path() {
         )
         .await
         .unwrap();
+    // The four additional runs
+    worker.incr_expected_run_count(4);
     worker.run_until_done().await.unwrap();
+}
 
-    // Terminate the continued workflow
-    starter
-        .get_client()
-        .await
-        .terminate_workflow_execution(wf_name.to_owned(), None)
-        .await
-        .unwrap();
+#[tokio::test]
+async fn continue_as_new_multiple_concurrent() {
+    let wf_name = "continue_as_new_multiple_concurrent";
+    let mut starter = CoreWfStarter::new(wf_name);
+    starter.max_cached_workflows(3).max_wft(3);
+    let mut worker = starter.worker().await;
+    worker.register_wf(wf_name.to_string(), continue_as_new_wf);
+
+    let wf_names = (1..=20).map(|i| format!("{}-{}", wf_name, i));
+    for name in wf_names.clone() {
+        worker
+            .submit_wf(
+                name.to_string(),
+                wf_name.to_string(),
+                vec![[1].into()],
+                WorkflowOptions::default(),
+            )
+            .await
+            .unwrap();
+    }
+    worker.incr_expected_run_count(20 * 4);
+    worker.run_until_done().await.unwrap();
 }
package/src/errors.rs CHANGED
@@ -10,6 +10,8 @@ pub static SHUTDOWN_ERROR: OnceCell<Root<JsFunction>> = OnceCell::new();
 pub static NO_WORKER_ERROR: OnceCell<Root<JsFunction>> = OnceCell::new();
 /// Something unexpected happened, considered fatal
 pub static UNEXPECTED_ERROR: OnceCell<Root<JsFunction>> = OnceCell::new();
+/// Used in different parts of the project to signal that something unexpected has happened
+pub static ILLEGAL_STATE_ERROR: OnceCell<Root<JsFunction>> = OnceCell::new();
 
 static ALREADY_REGISTERED_ERRORS: OnceCell<bool> = OnceCell::new();
 
@@ -70,9 +72,9 @@ pub fn register_errors(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     let res = ALREADY_REGISTERED_ERRORS.set(true);
     if res.is_err() {
         // Don't do anything if errors are already registered
-        return Ok(cx.undefined())
+        return Ok(cx.undefined());
     }
-
+
     let mapping = cx.argument::<JsObject>(0)?;
     let shutdown_error = mapping
         .get(&mut cx, "ShutdownError")?
@@ -90,11 +92,16 @@ pub fn register_errors(mut cx: FunctionContext) -> JsResult<JsUndefined> {
         .get(&mut cx, "UnexpectedError")?
         .downcast_or_throw::<JsFunction, FunctionContext>(&mut cx)?
         .root(&mut cx);
+    let illegal_state_error = mapping
+        .get(&mut cx, "IllegalStateError")?
+        .downcast_or_throw::<JsFunction, FunctionContext>(&mut cx)?
+        .root(&mut cx);
 
     TRANSPORT_ERROR.get_or_try_init(|| Ok(transport_error))?;
     SHUTDOWN_ERROR.get_or_try_init(|| Ok(shutdown_error))?;
     NO_WORKER_ERROR.get_or_try_init(|| Ok(no_worker_error))?;
     UNEXPECTED_ERROR.get_or_try_init(|| Ok(unexpected_error))?;
+    ILLEGAL_STATE_ERROR.get_or_try_init(|| Ok(illegal_state_error))?;
 
     Ok(cx.undefined())
 }
package/src/lib.rs CHANGED
@@ -8,6 +8,7 @@ use once_cell::sync::OnceCell;
 use opentelemetry::trace::{FutureExt, SpanContext, TraceContextExt};
 use prost::Message;
 use std::{
+    cell::RefCell,
     fmt::Display,
     future::Future,
     sync::Arc,
@@ -135,7 +136,7 @@ struct Client {
     core_client: Arc<RawClient>,
 }
 
-type BoxedClient = JsBox<Client>;
+type BoxedClient = JsBox<RefCell<Option<Client>>>;
 impl Finalize for Client {}
 
 /// Worker struct, hold a reference for the channel sender responsible for sending requests from
@@ -291,10 +292,10 @@ fn start_bridge_loop(event_queue: Arc<EventQueue>, receiver: &mut UnboundedRecei
             }
             Ok(client) => {
                 send_result(event_queue.clone(), callback, |cx| {
-                    Ok(cx.boxed(Client {
+                    Ok(cx.boxed(RefCell::new(Some(Client {
                         runtime,
                         core_client: Arc::new(client),
-                    }))
+                    }))))
                 });
             }
         }
@@ -590,15 +591,23 @@ fn worker_new(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     let callback = cx.argument::<JsFunction>(2)?;
 
     let config = worker_options.as_worker_config(&mut cx)?;
-
-    let request = Request::InitWorker {
-        client: client.core_client.clone(),
-        runtime: client.runtime.clone(),
-        config,
-        callback: callback.root(&mut cx),
-    };
-    if let Err(err) = client.runtime.sender.send(request) {
-        callback_with_unexpected_error(&mut cx, callback, err)?;
+    match &*client.borrow() {
+        None => {
+            callback_with_error(&mut cx, callback, move |cx| {
+                UNEXPECTED_ERROR.from_string(cx, "Tried to use closed Client".to_string())
+            })?;
+        }
+        Some(client) => {
+            let request = Request::InitWorker {
+                client: client.core_client.clone(),
+                runtime: client.runtime.clone(),
+                config,
+                callback: callback.root(&mut cx),
+            };
+            if let Err(err) = client.runtime.sender.send(request) {
+                callback_with_unexpected_error(&mut cx, callback, err)?;
+            };
+        }
     };
 
     Ok(cx.undefined())
@@ -783,13 +792,26 @@ fn worker_record_activity_heartbeat(mut cx: FunctionContext) -> JsResult<JsUndef
 fn worker_shutdown(mut cx: FunctionContext) -> JsResult<JsUndefined> {
     let worker = cx.argument::<BoxedWorker>(0)?;
     let callback = cx.argument::<JsFunction>(1)?;
-    match worker.runtime.sender.send(Request::ShutdownWorker {
+    if let Err(err) = worker.runtime.sender.send(Request::ShutdownWorker {
         worker: worker.core_worker.clone(),
         callback: callback.root(&mut cx),
     }) {
-        Err(err) => cx.throw_error(format!("{}", err)),
-        _ => Ok(cx.undefined()),
-    }
+        UNEXPECTED_ERROR
+            .from_error(&mut cx, err)
+            .and_then(|err| cx.throw(err))?;
+    };
+    Ok(cx.undefined())
+}
+
+/// Drop a reference to a Client, once all references are dropped, the Client will be closed.
+fn client_close(mut cx: FunctionContext) -> JsResult<JsUndefined> {
+    let client = cx.argument::<BoxedClient>(0)?;
+    if client.replace(None).is_none() {
+        ILLEGAL_STATE_ERROR
+            .from_error(&mut cx, "Client already closed")
+            .and_then(|err| cx.throw(err))?;
+    };
+    Ok(cx.undefined())
 }
 
 /// Convert Rust SystemTime into a JS array with 2 numbers (seconds, nanos)
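`client_close` above works because the boxed handle is now a `RefCell<Option<Client>>`: closing takes the value out, and any later use (or a second close) observes `None` and surfaces an error instead of touching a dropped client. A minimal sketch of that handle pattern in plain Rust, with no Neon types involved:

```rust
use std::cell::RefCell;

struct Client {
    address: String,
}

// Plain stand-in for the JsBox<RefCell<Option<Client>>> handle the bridge boxes for JS.
type ClientHandle = RefCell<Option<Client>>;

fn client_close(handle: &ClientHandle) -> Result<(), String> {
    // `replace(None)` both invalidates the handle and reports whether it was already
    // closed, mirroring the ILLEGAL_STATE_ERROR branch in lib.rs.
    match handle.replace(None) {
        Some(_client) => Ok(()), // dropped here; underlying resources are released
        None => Err("Client already closed".to_string()),
    }
}

fn with_client<R>(handle: &ClientHandle, f: impl FnOnce(&Client) -> R) -> Result<R, String> {
    match &*handle.borrow() {
        Some(client) => Ok(f(client)),
        None => Err("Tried to use closed Client".to_string()),
    }
}

fn main() {
    let handle: ClientHandle = RefCell::new(Some(Client {
        address: "localhost:7233".to_string(),
    }));
    assert!(with_client(&handle, |c| c.address.clone()).is_ok());
    assert!(client_close(&handle).is_ok());
    // Using or closing again now fails cleanly instead of reaching a freed client.
    assert!(with_client(&handle, |c| c.address.clone()).is_err());
    assert!(client_close(&handle).is_err());
}
```

In the bridge itself the same check backs both `worker_new` ("Tried to use closed Client") and `client_close` ("Client already closed").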
@@ -824,6 +846,7 @@ fn main(mut cx: ModuleContext) -> NeonResult<()> {
     cx.export_function("newWorker", worker_new)?;
     cx.export_function("newReplayWorker", replay_worker_new)?;
     cx.export_function("workerShutdown", worker_shutdown)?;
+    cx.export_function("clientClose", client_close)?;
     cx.export_function("runtimeShutdown", runtime_shutdown)?;
     cx.export_function("pollLogs", poll_logs)?;
     cx.export_function(