@temporalio/core-bridge 0.16.3 → 0.17.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.node CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@temporalio/core-bridge",
3
- "version": "0.16.3",
3
+ "version": "0.17.2",
4
4
  "description": "Temporal.io SDK Core<>Node bridge",
5
5
  "main": "index.node",
6
6
  "types": "index.d.ts",
@@ -19,7 +19,7 @@
19
19
  "license": "MIT",
20
20
  "dependencies": {
21
21
  "@opentelemetry/api": "^1.0.3",
22
- "@temporalio/common": "^0.16.0",
22
+ "@temporalio/common": "^0.17.2",
23
23
  "arg": "^5.0.1",
24
24
  "cargo-cp-artifact": "^0.1.4",
25
25
  "which": "^2.0.2"
@@ -40,5 +40,5 @@
40
40
  "publishConfig": {
41
41
  "access": "public"
42
42
  },
43
- "gitHead": "af54140a4da7e191c06627c5e06b92a3fd929bcc"
43
+ "gitHead": "2232465a4f9b0cade28f0c21c2d7856053678728"
44
44
  }
@@ -264,6 +264,7 @@ pub mod coresdk {
264
264
  }
265
265
 
266
266
  pub mod workflow_completion {
267
+ use crate::temporal::api::failure;
267
268
  tonic::include_proto!("coresdk.workflow_completion");
268
269
 
269
270
  impl wf_activation_completion::Status {
@@ -274,6 +275,12 @@ pub mod coresdk {
274
275
  }
275
276
  }
276
277
  }
278
+
279
+ impl From<failure::v1::Failure> for Failure {
280
+ fn from(f: failure::v1::Failure) -> Self {
281
+ Failure { failure: Some(f) }
282
+ }
283
+ }
277
284
  }
278
285
 
279
286
  pub mod child_workflow {
@@ -290,10 +290,17 @@ async fn legacy_query_failure_on_wft_failure() {
290
290
  core.shutdown().await;
291
291
  }
292
292
 
293
+ #[rstest::rstest]
293
294
  #[tokio::test]
294
- async fn legacy_query_with_full_history_after_complete() {
295
+ async fn legacy_query_after_complete(#[values(false, true)] full_history: bool) {
295
296
  let wfid = "fake_wf_id";
296
- let t = canned_histories::single_timer_wf_completes("1");
297
+ let t = if full_history {
298
+ canned_histories::single_timer_wf_completes("1")
299
+ } else {
300
+ let mut t = canned_histories::single_timer("1");
301
+ t.add_workflow_task_completed();
302
+ t
303
+ };
297
304
  let query_with_hist_task = {
298
305
  let mut pr = hist_to_poll_resp(
299
306
  &t,
@@ -1619,19 +1619,19 @@ async fn failing_wft_doesnt_eat_permit_forever() {
1619
1619
  t.add_by_type(EventType::WorkflowExecutionStarted);
1620
1620
  t.add_workflow_task_scheduled_and_started();
1621
1621
 
1622
- let failures = 5;
1623
- // One extra response for when we stop failing
1624
- let resps = (1..=(failures + 1)).map(|_| 1);
1625
1622
  let mock = MockServerGatewayApis::new();
1626
- let mut mock = single_hist_mock_sg("fake_wf_id", t, resps, mock, true);
1623
+ let mut mock = single_hist_mock_sg("fake_wf_id", t, [1, 1, 1], mock, true);
1627
1624
  mock.worker_cfg(TEST_Q, |cfg| {
1628
1625
  cfg.max_cached_workflows = 2;
1629
1626
  cfg.max_outstanding_workflow_tasks = 2;
1630
1627
  });
1628
+ let outstanding_mock_tasks = mock.outstanding_task_map.clone();
1631
1629
  let core = mock_core(mock);
1632
1630
 
1633
- // Spin failing the WFT to verify that we don't get stuck
1634
- for _ in 1..=failures {
1631
+ let mut run_id = "".to_string();
1632
+ // Fail twice, verifying the permit is returned each time. We cannot fail the same run more than twice in a row
1633
+ // because we purposefully time out rather than spamming.
1634
+ for _ in 1..=2 {
1635
1635
  let activation = core.poll_workflow_activation(TEST_Q).await.unwrap();
1636
1636
  // Issue a nonsense completion that will trigger a WFT failure
1637
1637
  core.complete_workflow_activation(WfActivationCompletion::from_cmd(
@@ -1648,12 +1648,91 @@ async fn failing_wft_doesnt_eat_permit_forever() {
1648
1648
  variant: Some(wf_activation_job::Variant::RemoveFromCache(_)),
1649
1649
  },]
1650
1650
  );
1651
+ run_id = activation.run_id.clone();
1651
1652
  core.complete_workflow_activation(WfActivationCompletion::empty(TEST_Q, activation.run_id))
1652
1653
  .await
1653
1654
  .unwrap();
1654
1655
  assert_eq!(core.outstanding_wfts(TEST_Q), 0);
1655
1656
  assert_eq!(core.available_wft_permits(TEST_Q), 2);
1656
1657
  }
1658
+ // We should be "out of work" because the mock service thinks we didn't complete the last task,
1659
+ // which we didn't, because we don't spam failures. The real server would eventually time out
1660
+ // the task. Mock doesn't understand that, so the WFT permit is released because eventually a
1661
+ // new one will be generated. We manually clear the mock's outstanding task list so the next
1662
+ // poll will work.
1663
+ outstanding_mock_tasks
1664
+ .unwrap()
1665
+ .write()
1666
+ .remove_by_left(&run_id);
1667
+ let activation = core.poll_workflow_activation(TEST_Q).await.unwrap();
1668
+ core.complete_workflow_activation(WfActivationCompletion::from_cmd(
1669
+ TEST_Q,
1670
+ activation.run_id,
1671
+ CompleteWorkflowExecution { result: None }.into(),
1672
+ ))
1673
+ .await
1674
+ .unwrap();
1675
+
1676
+ core.shutdown().await;
1677
+ }
1678
+
1679
+ #[tokio::test]
1680
+ async fn cache_miss_doesnt_eat_permit_forever() {
1681
+ let mut t = TestHistoryBuilder::default();
1682
+ t.add_by_type(EventType::WorkflowExecutionStarted);
1683
+ t.add_full_wf_task();
1684
+ t.add_we_signaled("sig", vec![]);
1685
+ t.add_full_wf_task();
1686
+ t.add_workflow_execution_completed();
1687
+
1688
+ let mut mh = MockPollCfg::from_resp_batches(
1689
+ "fake_wf_id",
1690
+ t,
1691
+ [
1692
+ ResponseType::ToTaskNum(1),
1693
+ ResponseType::OneTask(2),
1694
+ ResponseType::ToTaskNum(1),
1695
+ ResponseType::OneTask(2),
1696
+ ResponseType::ToTaskNum(1),
1697
+ ResponseType::OneTask(2),
1698
+ // Last one to complete successfully
1699
+ ResponseType::ToTaskNum(1),
1700
+ ],
1701
+ MockServerGatewayApis::new(),
1702
+ );
1703
+ mh.num_expected_fails = Some(3);
1704
+ mh.expect_fail_wft_matcher =
1705
+ Box::new(|_, cause, _| matches!(cause, WorkflowTaskFailedCause::ResetStickyTaskQueue));
1706
+ let mut mock = build_mock_pollers(mh);
1707
+ mock.worker_cfg(TEST_Q, |cfg| {
1708
+ cfg.max_outstanding_workflow_tasks = 2;
1709
+ });
1710
+ let core = mock_core(mock);
1711
+
1712
+ // Spin missing the cache to verify that we don't get stuck
1713
+ for _ in 1..=3 {
1714
+ // Start
1715
+ let activation = core.poll_workflow_activation(TEST_Q).await.unwrap();
1716
+ core.complete_workflow_activation(WfActivationCompletion::empty(TEST_Q, activation.run_id))
1717
+ .await
1718
+ .unwrap();
1719
+ // Evict
1720
+ let activation = core.poll_workflow_activation(TEST_Q).await.unwrap();
1721
+ assert_matches!(
1722
+ activation.jobs.as_slice(),
1723
+ [WfActivationJob {
1724
+ variant: Some(wf_activation_job::Variant::RemoveFromCache(_)),
1725
+ },]
1726
+ );
1727
+ core.complete_workflow_activation(WfActivationCompletion::empty(TEST_Q, activation.run_id))
1728
+ .await
1729
+ .unwrap();
1730
+ assert_eq!(core.outstanding_wfts(TEST_Q), 0);
1731
+ assert_eq!(core.available_wft_permits(TEST_Q), 2);
1732
+ // When we loop back up, the poll will trigger a cache miss. We should immediately respond to
1733
+ // that WFT with a failure, and then poll again, which will deliver the from-the-start
1734
+ // history.
1735
+ }
1657
1736
  let activation = core.poll_workflow_activation(TEST_Q).await.unwrap();
1658
1737
  core.complete_workflow_activation(WfActivationCompletion::from_cmd(
1659
1738
  TEST_Q,
@@ -1,6 +1,6 @@
1
1
  //! Error types exposed by public APIs
2
2
 
3
- use crate::{machines::WFMachinesError, task_token::TaskToken, WorkerLookupErr};
3
+ use crate::{machines::WFMachinesError, WorkerLookupErr};
4
4
  use temporal_sdk_core_protos::coresdk::{
5
5
  activity_result::ActivityResult, workflow_completion::WfActivationCompletion,
6
6
  };
@@ -11,9 +11,8 @@ pub(crate) struct WorkflowUpdateError {
11
11
  /// Underlying workflow error
12
12
  pub source: WFMachinesError,
13
13
  /// The run id of the erring workflow
14
+ #[allow(dead_code)] // Useful in debug output
14
15
  pub run_id: String,
15
- /// The task token associated with this update, if one existed yet.
16
- pub task_token: Option<TaskToken>,
17
16
  }
18
17
 
19
18
  impl From<WorkflowMissingError> for WorkflowUpdateError {
@@ -21,7 +20,6 @@ impl From<WorkflowMissingError> for WorkflowUpdateError {
21
20
  Self {
22
21
  source: WFMachinesError::Fatal("Workflow machines missing".to_string()),
23
22
  run_id: wme.run_id,
24
- task_token: None,
25
23
  }
26
24
  }
27
25
  }
@@ -7,8 +7,7 @@ use std::convert::{TryFrom, TryInto};
7
7
  use temporal_sdk_core_protos::{
8
8
  coresdk::{
9
9
  child_workflow::{
10
- self as wfr, child_workflow_result::Status as ChildWorkflowStatus,
11
- ChildWorkflowCancellationType, ChildWorkflowResult,
10
+ self as wfr, child_workflow_result::Status as ChildWorkflowStatus, ChildWorkflowResult,
12
11
  },
13
12
  common::Payload,
14
13
  workflow_activation::{
@@ -115,7 +114,6 @@ impl StartCommandCreated {
115
114
  StartEventRecorded::default(),
116
115
  SharedState {
117
116
  initiated_event_id,
118
- attrs: None, // Drop the attributes to avoid holding large payloads in memory
119
117
  ..state
120
118
  },
121
119
  )
@@ -303,9 +301,7 @@ pub(super) struct SharedState {
303
301
  workflow_id: String,
304
302
  run_id: String,
305
303
  workflow_type: String,
306
- cancellation_type: ChildWorkflowCancellationType,
307
304
  cancelled_before_sent: bool,
308
- attrs: Option<StartChildWorkflowExecution>,
309
305
  }
310
306
 
311
307
  /// Creates a new child workflow state machine and a command to start it on the server.
@@ -329,11 +325,6 @@ impl ChildWorkflowMachine {
329
325
  workflow_id: attribs.workflow_id.clone(),
330
326
  workflow_type: attribs.workflow_type.clone(),
331
327
  namespace: attribs.namespace.clone(),
332
- cancellation_type: ChildWorkflowCancellationType::from_i32(
333
- attribs.cancellation_type,
334
- )
335
- .unwrap(),
336
- attrs: Some(attribs.clone()),
337
328
  ..Default::default()
338
329
  },
339
330
  };
@@ -61,6 +61,10 @@ pub(crate) struct WorkflowMachines {
61
61
  /// Eventually, this number should reach the started id in the latest history update, but
62
62
  /// we must incrementally apply the history while communicating with lang.
63
63
  next_started_event_id: i64,
64
+ /// The event id of the most recent event processed. It's possible in some situations (e.g. legacy
65
+ /// queries) to receive a history with no new workflow tasks. If the last history we processed
66
+ /// also had no new tasks, we need a way to know not to apply the same events over again.
67
+ last_processed_event: i64,
64
68
  /// True if the workflow is replaying from history
65
69
  pub replaying: bool,
66
70
  /// Namespace this workflow exists in
@@ -120,7 +124,6 @@ struct CommandAndMachine {
120
124
 
121
125
  #[derive(Debug, Clone, Copy)]
122
126
  struct ChangeInfo {
123
- deprecated: bool,
124
127
  created_command: bool,
125
128
  }
126
129
 
@@ -196,6 +199,7 @@ impl WorkflowMachines {
196
199
  // In an ideal world one could say ..Default::default() here and it'd still work.
197
200
  current_started_event_id: 0,
198
201
  next_started_event_id: 0,
202
+ last_processed_event: 0,
199
203
  workflow_start_time: None,
200
204
  workflow_end_time: None,
201
205
  current_wf_time: None,
@@ -529,11 +533,16 @@ impl WorkflowMachines {
529
533
  }
530
534
 
531
535
  let last_handled_wft_started_id = self.current_started_event_id;
532
- let events = self
533
- .last_history_from_server
534
- .take_next_wft_sequence(last_handled_wft_started_id)
535
- .await
536
- .map_err(WFMachinesError::HistoryFetchingError)?;
536
+ let events = {
537
+ let mut evts = self
538
+ .last_history_from_server
539
+ .take_next_wft_sequence(last_handled_wft_started_id)
540
+ .await
541
+ .map_err(WFMachinesError::HistoryFetchingError)?;
542
+ // Do not re-process events we have already processed
543
+ evts.retain(|e| e.event_id > self.last_processed_event);
544
+ evts
545
+ };
537
546
 
538
547
  // We're caught up on reply if there are no new events to process
539
548
  // TODO: Probably this is unneeded if we evict whenever history is from non-sticky queue
@@ -564,23 +573,17 @@ impl WorkflowMachines {
564
573
 
565
574
  while let Some(event) = history.next() {
566
575
  let next_event = history.peek();
567
-
568
- if event.event_type == EventType::WorkflowTaskStarted as i32 && next_event.is_none() {
569
- self.handle_event(event, false)?;
570
- break;
571
- }
572
-
573
576
  self.handle_event(event, next_event.is_some())?;
577
+ self.last_processed_event = event.event_id;
574
578
  }
575
579
 
576
580
  // Scan through to the next WFT, searching for any patch markers, so that we can
577
581
  // pre-resolve them.
578
582
  for e in self.last_history_from_server.peek_next_wft_sequence() {
579
- if let Some((patch_id, deprecated)) = e.get_changed_marker_details() {
583
+ if let Some((patch_id, _)) = e.get_changed_marker_details() {
580
584
  self.encountered_change_markers.insert(
581
585
  patch_id.clone(),
582
586
  ChangeInfo {
583
- deprecated,
584
587
  created_command: false,
585
588
  },
586
589
  );
@@ -743,7 +746,6 @@ impl WorkflowMachines {
743
746
  self.encountered_change_markers.insert(
744
747
  attrs.patch_id,
745
748
  ChangeInfo {
746
- deprecated: attrs.deprecated,
747
749
  created_command: true,
748
750
  },
749
751
  );
@@ -342,6 +342,16 @@ impl TestHistoryBuilder {
342
342
  HistoryInfo::new_from_history(&self.events.clone().into(), None)
343
343
  }
344
344
 
345
+ pub(crate) fn get_one_wft(
346
+ &self,
347
+ from_wft_number: usize,
348
+ ) -> Result<HistoryInfo, HistoryInfoError> {
349
+ let mut histinfo =
350
+ HistoryInfo::new_from_history(&self.events.clone().into(), Some(from_wft_number))?;
351
+ histinfo.make_incremental();
352
+ Ok(histinfo)
353
+ }
354
+
345
355
  fn build_and_push_event(&mut self, event_type: EventType, attribs: Attributes) {
346
356
  self.current_event_id += 1;
347
357
  let evt = HistoryEvent {
@@ -39,6 +39,7 @@ impl HistoryInfo {
39
39
  return Err(HistoryInfoError::HistoryEndsUnexpectedly);
40
40
  }
41
41
 
42
+ let is_all_hist = to_wf_task_num.is_none();
42
43
  let to_wf_task_num = to_wf_task_num.unwrap_or(usize::MAX);
43
44
  let mut workflow_task_started_event_id = 0;
44
45
  let mut previous_started_event_id = 0;
@@ -83,7 +84,7 @@ impl HistoryInfo {
83
84
  }
84
85
 
85
86
  if next_event.is_none() {
86
- if event.is_final_wf_execution_event() {
87
+ if event.is_final_wf_execution_event() || is_all_hist {
87
88
  return Ok(Self {
88
89
  previous_started_event_id,
89
90
  workflow_task_started_event_id,
@@ -100,6 +101,18 @@ impl HistoryInfo {
100
101
  unreachable!()
101
102
  }
102
103
 
104
+ /// Remove events from the beginning of this history such that it looks like what would've been
105
+ /// delivered on a sticky queue where the previously started task was the one before the last
106
+ /// task in this history.
107
+ pub(crate) fn make_incremental(&mut self) {
108
+ let last_complete_ix = self
109
+ .events
110
+ .iter()
111
+ .rposition(|he| he.event_type() == EventType::WorkflowTaskCompleted)
112
+ .expect("Must be a WFT completed event in history");
113
+ self.events.drain(0..=last_complete_ix);
114
+ }
115
+
103
116
  pub(crate) fn events(&self) -> &[HistoryEvent] {
104
117
  &self.events
105
118
  }
@@ -135,4 +148,11 @@ mod tests {
135
148
  let history_info = t.get_history_info(2).unwrap();
136
149
  assert_eq!(8, history_info.events.len());
137
150
  }
151
+
152
+ #[test]
153
+ fn incremental_works() {
154
+ let t = canned_histories::single_timer("timer1");
155
+ let hi = t.get_one_wft(2).unwrap();
156
+ dbg!(hi.events);
157
+ }
138
158
  }
@@ -53,6 +53,10 @@ pub static NO_MORE_WORK_ERROR_MSG: &str = "No more work to do";
53
53
  #[derive(derive_more::From, Debug, Clone, Copy, Eq, PartialEq, Hash)]
54
54
  pub enum ResponseType {
55
55
  ToTaskNum(usize),
56
+ /// Returns just the history after the WFT-completed event of the provided task number - 1, through to
57
+ /// the next WFT started, simulating the incremental history for just the provided task number.
58
+ #[from(ignore)]
59
+ OneTask(usize),
56
60
  AllHistory,
57
61
  }
58
62
 
@@ -142,6 +146,7 @@ pub struct FakeWfResponses {
142
146
  pub struct MocksHolder<SG> {
143
147
  sg: SG,
144
148
  mock_pollers: HashMap<String, MockWorker>,
149
+ // bidirectional mapping of run id / task token
145
150
  pub outstanding_task_map: Option<Arc<RwLock<BiMap<String, TaskToken>>>>,
146
151
  }
147
152
 
@@ -377,16 +382,6 @@ pub fn build_mock_pollers(mut cfg: MockPollCfg) -> MocksHolder<MockServerGateway
377
382
  }
378
383
  }
379
384
 
380
- // TODO: Fix -- or not? Sticky invalidation could make this pointless anyway
381
- // Verify response batches only ever return longer histories (IE: Are sorted ascending)
382
- // assert!(
383
- // hist.response_batches
384
- // .as_slice()
385
- // .windows(2)
386
- // .all(|w| w[0] <= w[1]),
387
- // "response batches must have increasing wft numbers"
388
- // );
389
-
390
385
  if cfg.enforce_correct_number_of_polls {
391
386
  *correct_num_polls.get_or_insert(0) += hist.response_batches.len();
392
387
  }
@@ -495,6 +490,7 @@ pub fn hist_to_poll_resp(
495
490
  };
496
491
  let hist_info = match response_type {
497
492
  ResponseType::ToTaskNum(tn) => t.get_history_info(tn).unwrap(),
493
+ ResponseType::OneTask(tn) => t.get_one_wft(tn).unwrap(),
498
494
  ResponseType::AllHistory => t.get_full_history_info().unwrap(),
499
495
  };
500
496
  let batch = hist_info.events().to_vec();
@@ -329,20 +329,28 @@ impl Worker {
329
329
  completion: WfActivationCompletion,
330
330
  ) -> Result<(), CompleteWfError> {
331
331
  let wfstatus = completion.status;
332
- let did_complete_wft = match wfstatus {
332
+ let report_outcome = match wfstatus {
333
333
  Some(wf_activation_completion::Status::Successful(success)) => {
334
334
  self.wf_activation_success(&completion.run_id, success)
335
335
  .await
336
336
  }
337
+
337
338
  Some(wf_activation_completion::Status::Failed(failure)) => {
338
- self.wf_activation_failed(&completion.run_id, failure).await
339
+ self.wf_activation_failed(
340
+ &completion.run_id,
341
+ WorkflowTaskFailedCause::Unspecified,
342
+ failure,
343
+ )
344
+ .await
345
+ }
346
+ None => {
347
+ return Err(CompleteWfError::MalformedWorkflowCompletion {
348
+ reason: "Workflow completion had empty status field".to_owned(),
349
+ completion: None,
350
+ })
339
351
  }
340
- None => Err(CompleteWfError::MalformedWorkflowCompletion {
341
- reason: "Workflow completion had empty status field".to_owned(),
342
- completion: None,
343
- }),
344
352
  }?;
345
- self.after_workflow_activation(&completion.run_id, did_complete_wft);
353
+ self.after_workflow_activation(&completion.run_id, report_outcome);
346
354
  Ok(())
347
355
  }
348
356
 
@@ -488,6 +496,7 @@ impl Worker {
488
496
  }),
489
497
  )
490
498
  .await?;
499
+ self.return_workflow_task_permit();
491
500
  None
492
501
  }
493
502
  NewWfTaskOutcome::Evict(e) => {
@@ -508,7 +517,7 @@ impl Worker {
508
517
  &self,
509
518
  run_id: &str,
510
519
  success: workflow_completion::Success,
511
- ) -> Result<bool, CompleteWfError> {
520
+ ) -> Result<WFTReportOutcome, CompleteWfError> {
512
521
  // Convert to wf commands
513
522
  let cmds = success
514
523
  .commands
@@ -552,7 +561,10 @@ impl Worker {
552
561
  .await
553
562
  })
554
563
  .await?;
555
- Ok(true)
564
+ Ok(WFTReportOutcome {
565
+ reported_to_server: true,
566
+ failed: false,
567
+ })
556
568
  }
557
569
  Ok(Some(ServerCommandsWithWorkflowInfo {
558
570
  task_token,
@@ -562,9 +574,15 @@ impl Worker {
562
574
  self.server_gateway
563
575
  .respond_legacy_query(task_token, result)
564
576
  .await?;
565
- Ok(true)
577
+ Ok(WFTReportOutcome {
578
+ reported_to_server: true,
579
+ failed: false,
580
+ })
566
581
  }
567
- Ok(None) => Ok(false),
582
+ Ok(None) => Ok(WFTReportOutcome {
583
+ reported_to_server: false,
584
+ failed: false,
585
+ }),
568
586
  Err(update_err) => {
569
587
  // Automatically fail the workflow task in the event we couldn't update machines
570
588
  let fail_cause = if matches!(&update_err.source, WFMachinesError::Nondeterminism(_))
@@ -573,30 +591,13 @@ impl Worker {
573
591
  } else {
574
592
  WorkflowTaskFailedCause::Unspecified
575
593
  };
576
-
577
- warn!(run_id, error=?update_err, "Failing workflow task");
578
-
579
- if let Some(ref tt) = update_err.task_token {
580
- let wft_fail_str = format!("{:?}", update_err);
581
- self.handle_wft_reporting_errs(run_id, || async {
582
- self.server_gateway
583
- .fail_workflow_task(
584
- tt.clone(),
585
- fail_cause,
586
- Some(Failure::application_failure(wft_fail_str.clone(), false)),
587
- )
588
- .await
589
- })
590
- .await?;
591
- // We must evict the workflow since we've failed a WFT
592
- self.request_wf_eviction(
593
- run_id,
594
- format!("Workflow task failure: {}", wft_fail_str),
595
- );
596
- Ok(true)
597
- } else {
598
- Ok(false)
599
- }
594
+ let wft_fail_str = format!("{:?}", update_err);
595
+ self.wf_activation_failed(
596
+ run_id,
597
+ fail_cause,
598
+ Failure::application_failure(wft_fail_str.clone(), false).into(),
599
+ )
600
+ .await
600
601
  }
601
602
  }
602
603
  }
@@ -607,35 +608,46 @@ impl Worker {
607
608
  async fn wf_activation_failed(
608
609
  &self,
609
610
  run_id: &str,
611
+ cause: WorkflowTaskFailedCause,
610
612
  failure: workflow_completion::Failure,
611
- ) -> Result<bool, CompleteWfError> {
613
+ ) -> Result<WFTReportOutcome, CompleteWfError> {
612
614
  Ok(match self.wft_manager.failed_activation(run_id) {
613
615
  FailedActivationOutcome::Report(tt) => {
616
+ warn!(run_id, failure=?failure, "Failing workflow activation");
614
617
  self.handle_wft_reporting_errs(run_id, || async {
615
618
  self.server_gateway
616
- .fail_workflow_task(
617
- tt,
618
- WorkflowTaskFailedCause::Unspecified,
619
- failure.failure.map(Into::into),
620
- )
619
+ .fail_workflow_task(tt, cause, failure.failure.map(Into::into))
621
620
  .await
622
621
  })
623
622
  .await?;
624
- true
623
+ WFTReportOutcome {
624
+ reported_to_server: true,
625
+ failed: true,
626
+ }
625
627
  }
626
628
  FailedActivationOutcome::ReportLegacyQueryFailure(task_token) => {
629
+ warn!(run_id, failure=?failure, "Failing legacy query request");
627
630
  self.server_gateway
628
631
  .respond_legacy_query(task_token, legacy_query_failure(failure))
629
632
  .await?;
630
- true
633
+ WFTReportOutcome {
634
+ reported_to_server: true,
635
+ failed: true,
636
+ }
631
637
  }
632
- FailedActivationOutcome::NoReport => false,
638
+ FailedActivationOutcome::NoReport => WFTReportOutcome {
639
+ reported_to_server: false,
640
+ failed: true,
641
+ },
633
642
  })
634
643
  }
635
644
 
636
- fn after_workflow_activation(&self, run_id: &str, did_complete_wft: bool) {
645
+ fn after_workflow_activation(&self, run_id: &str, report_outcome: WFTReportOutcome) {
637
646
  self.wft_manager.after_wft_report(run_id);
638
- if did_complete_wft {
647
+ if report_outcome.reported_to_server || report_outcome.failed {
648
+ // If we failed the WFT but didn't report anything, we still want to release the WFT
649
+ // permit since the server will eventually time out the task and we've already evicted
650
+ // the run.
639
651
  self.return_workflow_task_permit();
640
652
  }
641
653
  self.wft_manager.on_activation_done(run_id);
@@ -717,6 +729,11 @@ impl WorkerConfig {
717
729
  }
718
730
  }
719
731
 
732
+ struct WFTReportOutcome {
733
+ reported_to_server: bool,
734
+ failed: bool,
735
+ }
736
+
720
737
  #[cfg(test)]
721
738
  mod tests {
722
739
  use super::*;
@@ -135,7 +135,7 @@ pub enum ActivationAction {
135
135
  }
136
136
 
137
137
  macro_rules! machine_mut {
138
- ($myself:ident, $run_id:ident, $task_token:ident, $clos:expr) => {{
138
+ ($myself:ident, $run_id:ident, $clos:expr) => {{
139
139
  $myself
140
140
  .workflow_machines
141
141
  .access($run_id, $clos)
@@ -143,7 +143,6 @@ macro_rules! machine_mut {
143
143
  .map_err(|source| WorkflowUpdateError {
144
144
  source,
145
145
  run_id: $run_id.to_owned(),
146
- task_token: Some($task_token.clone()),
147
146
  })
148
147
  }};
149
148
  }
@@ -256,6 +255,7 @@ impl WorkflowTaskManager {
256
255
  debug!(
257
256
  task_token = %&work.task_token,
258
257
  history_length = %work.history.events.len(),
258
+ attempt = %work.attempt,
259
259
  "Applying new workflow task from server"
260
260
  );
261
261
  let task_start_time = Instant::now();
@@ -325,19 +325,20 @@ impl WorkflowTaskManager {
325
325
  return Ok(None);
326
326
  }
327
327
 
328
- let task_token = if let Some(entry) = self.workflow_machines.get_task(run_id) {
329
- entry.info.task_token.clone()
330
- } else {
331
- if !self.activation_has_eviction(run_id) {
332
- // Don't bother warning if this was an eviction, since it's normal to issue
333
- // eviction activations without an associated workflow task in that case.
334
- warn!(
335
- run_id,
336
- "Attempted to complete activation for nonexistent run"
337
- );
338
- }
339
- return Ok(None);
340
- };
328
+ let (task_token, is_leg_query_task) =
329
+ if let Some(entry) = self.workflow_machines.get_task(run_id) {
330
+ (entry.info.task_token.clone(), entry.legacy_query.is_some())
331
+ } else {
332
+ if !self.activation_has_eviction(run_id) {
333
+ // Don't bother warning if this was an eviction, since it's normal to issue
334
+ // eviction activations without an associated workflow task in that case.
335
+ warn!(
336
+ run_id,
337
+ "Attempted to complete activation for run without associated workflow task"
338
+ );
339
+ }
340
+ return Ok(None);
341
+ };
341
342
 
342
343
  // If the only command in the activation is a legacy query response, that means we need
343
344
  // to respond differently than a typical activation.
@@ -364,7 +365,6 @@ impl WorkflowTaskManager {
364
365
  return Err(WorkflowUpdateError {
365
366
  source: WFMachinesError::Fatal("Legacy query activation response included other commands, this is not allowed and constitutes an error in the lang SDK".to_string()),
366
367
  run_id: run_id.to_string(),
367
- task_token: Some(task_token)
368
368
  });
369
369
  }
370
370
  query_responses.push(qr);
@@ -375,30 +375,32 @@ impl WorkflowTaskManager {
375
375
  }
376
376
 
377
377
  // Send commands from lang into the machines
378
- machine_mut!(self, run_id, task_token, |wfm: &mut WorkflowManager| {
378
+ machine_mut!(self, run_id, |wfm: &mut WorkflowManager| {
379
379
  wfm.push_commands(commands).boxed()
380
380
  })?;
381
381
  // Check if the workflow run needs another activation and queue it up if there is one
382
382
  // by pushing it into the pending activations list
383
- let next_activation = machine_mut!(
384
- self,
385
- run_id,
386
- task_token,
387
- move |mgr: &mut WorkflowManager| mgr.get_next_activation().boxed()
388
- )?;
383
+ let next_activation = machine_mut!(self, run_id, move |mgr: &mut WorkflowManager| mgr
384
+ .get_next_activation()
385
+ .boxed())?;
389
386
  if !next_activation.jobs.is_empty() {
390
387
  self.pending_activations.push(next_activation);
391
388
  let _ = self.pending_activations_notifier.send(true);
392
389
  }
393
390
  // We want to fetch the outgoing commands only after any new activation has been queued,
394
391
  // as doing so may have altered the outgoing commands.
395
- let server_cmds =
396
- machine_mut!(self, run_id, task_token, |wfm: &mut WorkflowManager| {
397
- async move { Ok(wfm.get_server_commands()) }.boxed()
398
- })?;
392
+ let server_cmds = machine_mut!(self, run_id, |wfm: &mut WorkflowManager| {
393
+ async move { Ok(wfm.get_server_commands()) }.boxed()
394
+ })?;
395
+ let is_query_playback = is_leg_query_task && query_responses.is_empty();
399
396
  // We only actually want to send commands back to the server if there are no more
400
- // pending activations and we are caught up on replay.
401
- if !self.pending_activations.has_pending(run_id) && !server_cmds.replaying {
397
+ // pending activations and we are caught up on replay. We don't want to complete a wft
398
+ // if we already saw the final event in the workflow, or if we are playing back for the
399
+ // express purpose of fulfilling a query
400
+ if !self.pending_activations.has_pending(run_id)
401
+ && !server_cmds.replaying
402
+ && !is_query_playback
403
+ {
402
404
  Some(ServerCommandsWithWorkflowInfo {
403
405
  task_token,
404
406
  action: ActivationAction::WftComplete {
@@ -448,7 +450,7 @@ impl WorkflowTaskManager {
448
450
  } else {
449
451
  // Blow up any cached data associated with the workflow
450
452
  let should_report = self
451
- .request_eviction(run_id, "Activation failed by lang")
453
+ .request_eviction(run_id, "Activation failed")
452
454
  .map_or(true, |attempt| attempt <= 1);
453
455
  if should_report {
454
456
  FailedActivationOutcome::Report(tt)
@@ -507,11 +509,7 @@ impl WorkflowTaskManager {
507
509
 
508
510
  Ok((wft_info, activation))
509
511
  }
510
- Err(source) => Err(WorkflowUpdateError {
511
- source,
512
- run_id,
513
- task_token: Some(wft_info.task_token),
514
- }),
512
+ Err(source) => Err(WorkflowUpdateError { source, run_id }),
515
513
  }
516
514
  }
517
515