@temporalio/core-bridge 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/Cargo.lock +255 -48
  2. package/package.json +4 -4
  3. package/releases/aarch64-apple-darwin/index.node +0 -0
  4. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  5. package/releases/x86_64-apple-darwin/index.node +0 -0
  6. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  7. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  8. package/sdk-core/.buildkite/pipeline.yml +1 -3
  9. package/sdk-core/.cargo/config.toml +5 -2
  10. package/sdk-core/.github/workflows/heavy.yml +28 -0
  11. package/sdk-core/Cargo.toml +1 -1
  12. package/sdk-core/README.md +9 -5
  13. package/sdk-core/client/src/lib.rs +211 -36
  14. package/sdk-core/client/src/raw.rs +1 -1
  15. package/sdk-core/client/src/retry.rs +32 -20
  16. package/sdk-core/core/Cargo.toml +23 -9
  17. package/sdk-core/core/src/abstractions.rs +11 -0
  18. package/sdk-core/core/src/core_tests/activity_tasks.rs +6 -5
  19. package/sdk-core/core/src/core_tests/local_activities.rs +263 -22
  20. package/sdk-core/core/src/core_tests/queries.rs +2 -2
  21. package/sdk-core/core/src/core_tests/workflow_tasks.rs +249 -5
  22. package/sdk-core/core/src/ephemeral_server/mod.rs +5 -6
  23. package/sdk-core/core/src/lib.rs +2 -0
  24. package/sdk-core/core/src/protosext/mod.rs +1 -1
  25. package/sdk-core/core/src/telemetry/log_export.rs +1 -1
  26. package/sdk-core/core/src/telemetry/mod.rs +23 -8
  27. package/sdk-core/core/src/test_help/mod.rs +8 -1
  28. package/sdk-core/core/src/worker/activities/local_activities.rs +259 -125
  29. package/sdk-core/core/src/worker/activities.rs +3 -2
  30. package/sdk-core/core/src/worker/mod.rs +53 -26
  31. package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  32. package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  33. package/sdk-core/core/src/worker/workflow/history_update.rs +835 -277
  34. package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +9 -17
  35. package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +3 -5
  36. package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +1 -2
  37. package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +3 -5
  38. package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +1 -2
  39. package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +1 -2
  40. package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +1 -2
  41. package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +73 -51
  42. package/sdk-core/core/src/worker/workflow/machines/mod.rs +3 -3
  43. package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +4 -4
  44. package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +1 -2
  45. package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +3 -5
  46. package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +6 -7
  47. package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  48. package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +4 -4
  49. package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
  50. package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +89 -58
  51. package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +4 -7
  52. package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +21 -9
  53. package/sdk-core/core/src/worker/workflow/managed_run.rs +1021 -360
  54. package/sdk-core/core/src/worker/workflow/mod.rs +306 -346
  55. package/sdk-core/core/src/worker/workflow/run_cache.rs +29 -53
  56. package/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  57. package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  58. package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +115 -0
  59. package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  60. package/sdk-core/core/src/worker/workflow/workflow_stream.rs +444 -714
  61. package/sdk-core/core-api/Cargo.toml +2 -0
  62. package/sdk-core/core-api/src/errors.rs +1 -34
  63. package/sdk-core/core-api/src/lib.rs +6 -2
  64. package/sdk-core/core-api/src/worker.rs +14 -1
  65. package/sdk-core/etc/deps.svg +115 -140
  66. package/sdk-core/etc/regen-depgraph.sh +5 -0
  67. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +6 -6
  68. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +7 -3
  69. package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  70. package/sdk-core/protos/api_upstream/Makefile +5 -5
  71. package/sdk-core/protos/api_upstream/build/go.mod +7 -0
  72. package/sdk-core/protos/api_upstream/build/go.sum +5 -0
  73. package/sdk-core/protos/api_upstream/build/tools.go +29 -0
  74. package/sdk-core/protos/api_upstream/go.mod +6 -0
  75. package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
  76. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +12 -19
  77. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +2 -2
  78. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
  79. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -2
  80. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
  81. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +3 -3
  82. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +20 -2
  83. package/sdk-core/protos/api_upstream/temporal/api/{update/v1/message.proto → enums/v1/interaction_type.proto} +11 -18
  84. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
  85. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
  86. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
  87. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
  88. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
  89. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +2 -13
  90. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
  91. package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
  92. package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
  93. package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
  94. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +13 -19
  95. package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +87 -0
  96. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -2
  97. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +2 -2
  98. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
  99. package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
  100. package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
  101. package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
  102. package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
  103. package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
  104. package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
  105. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +13 -8
  106. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  107. package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +2 -0
  108. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
  109. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
  110. package/sdk-core/sdk/Cargo.toml +4 -3
  111. package/sdk-core/sdk/src/lib.rs +87 -21
  112. package/sdk-core/sdk/src/workflow_future.rs +7 -12
  113. package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  114. package/sdk-core/sdk-core-protos/build.rs +36 -2
  115. package/sdk-core/sdk-core-protos/src/history_builder.rs +26 -19
  116. package/sdk-core/sdk-core-protos/src/history_info.rs +4 -0
  117. package/sdk-core/sdk-core-protos/src/lib.rs +78 -34
  118. package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  119. package/sdk-core/test-utils/Cargo.toml +3 -1
  120. package/sdk-core/test-utils/src/histfetch.rs +1 -1
  121. package/sdk-core/test-utils/src/lib.rs +50 -18
  122. package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  123. package/sdk-core/test-utils/src/workflows.rs +29 -0
  124. package/sdk-core/tests/fuzzy_workflow.rs +130 -0
  125. package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +114 -7
  126. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -2
  127. package/sdk-core/tests/integ_tests/metrics_tests.rs +1 -1
  128. package/sdk-core/tests/integ_tests/polling_tests.rs +1 -39
  129. package/sdk-core/tests/integ_tests/queries_tests.rs +2 -127
  130. package/sdk-core/tests/integ_tests/visibility_tests.rs +52 -5
  131. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +74 -1
  132. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +5 -13
  133. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +1 -1
  134. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +2 -10
  135. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +69 -197
  136. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +4 -28
  137. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  138. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +14 -14
  139. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -19
  140. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +3 -19
  141. package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +1 -1
  142. package/sdk-core/tests/integ_tests/workflow_tests.rs +5 -6
  143. package/sdk-core/tests/main.rs +2 -12
  144. package/sdk-core/tests/runner.rs +71 -34
  145. package/sdk-core/tests/wf_input_replay.rs +32 -0
  146. package/sdk-core/bridge-ffi/Cargo.toml +0 -24
  147. package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
  148. package/sdk-core/bridge-ffi/build.rs +0 -25
  149. package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
  150. package/sdk-core/bridge-ffi/src/lib.rs +0 -746
  151. package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
  152. package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
  153. package/sdk-core/sdk/src/conversions.rs +0 -8
@@ -1,13 +1,18 @@
1
1
  use crate::{
2
- abstractions::{MeteredSemaphore, OwnedMeteredSemPermit},
2
+ abstractions::{dbg_panic, MeteredSemaphore, OwnedMeteredSemPermit},
3
3
  protosext::ValidScheduleLA,
4
4
  retry_logic::RetryPolicyExt,
5
+ worker::workflow::HeartbeatTimeoutMsg,
5
6
  MetricsContext, TaskToken,
6
7
  };
7
- use parking_lot::Mutex;
8
+ use futures::{stream::BoxStream, Stream};
9
+ use futures_util::{future, future::AbortRegistration, stream, StreamExt};
10
+ use parking_lot::{Mutex, MutexGuard};
8
11
  use std::{
9
- collections::HashMap,
12
+ collections::{hash_map::Entry, HashMap},
10
13
  fmt::{Debug, Formatter},
14
+ pin::Pin,
15
+ task::{Context, Poll},
11
16
  time::{Duration, Instant, SystemTime},
12
17
  };
13
18
  use temporal_sdk_core_protos::{
@@ -25,6 +30,7 @@ use tokio::{
25
30
  task::JoinHandle,
26
31
  time::sleep,
27
32
  };
33
+ use tokio_stream::wrappers::UnboundedReceiverStream;
28
34
  use tokio_util::sync::CancellationToken;
29
35
 
30
36
  #[allow(clippy::large_enum_variant)] // Timeouts are relatively rare
@@ -49,6 +55,10 @@ pub(crate) struct LocalInFlightActInfo {
49
55
  }
50
56
 
51
57
  #[derive(Debug, Clone)]
58
+ #[cfg_attr(
59
+ feature = "save_wf_inputs",
60
+ derive(serde::Serialize, serde::Deserialize)
61
+ )]
52
62
  pub(crate) enum LocalActivityExecutionResult {
53
63
  Completed(Success),
54
64
  Failed(ActFail),
@@ -65,6 +75,10 @@ impl LocalActivityExecutionResult {
65
75
  }
66
76
 
67
77
  #[derive(Debug, Clone)]
78
+ #[cfg_attr(
79
+ feature = "save_wf_inputs",
80
+ derive(serde::Serialize, serde::Deserialize)
81
+ )]
68
82
  pub(crate) struct LocalActivityResolution {
69
83
  pub seq: u32,
70
84
  pub result: LocalActivityExecutionResult,
@@ -96,6 +110,12 @@ impl Debug for NewLocalAct {
96
110
  pub(crate) enum LocalActRequest {
97
111
  New(NewLocalAct),
98
112
  Cancel(ExecutingLAId),
113
+ CancelAllInRun(String),
114
+ StartHeartbeatTimeout {
115
+ send_on_elapse: HeartbeatTimeoutMsg,
116
+ deadline: Instant,
117
+ abort_reg: AbortRegistration,
118
+ },
99
119
  }
100
120
 
101
121
  #[derive(Debug, Clone, Eq, PartialEq, Hash)]
@@ -107,28 +127,39 @@ pub(crate) struct ExecutingLAId {
107
127
  pub(crate) struct LocalActivityManager {
108
128
  /// Just so we can provide activity tasks the same namespace as the worker
109
129
  namespace: String,
110
- /// Constrains number of currently executing local activities
111
- semaphore: MeteredSemaphore,
112
130
  /// Sink for new activity execution requests
113
131
  act_req_tx: UnboundedSender<NewOrRetry>,
114
132
  /// Cancels need a different queue since they should be taken first, and don't take a permit
115
133
  cancels_req_tx: UnboundedSender<CancelOrTimeout>,
134
+ /// For the emission of heartbeat timeouts, back into the workflow machines. This channel
135
+ /// needs to come in from above us, because we cannot rely on callers getting the next
136
+ /// activation as a way to deliver heartbeats.
137
+ heartbeat_timeout_tx: UnboundedSender<HeartbeatTimeoutMsg>,
116
138
  /// Wakes every time a complete is processed
117
139
  complete_notify: Notify,
140
+ /// Set once workflows have finished shutting down, and thus we know we will no longer receive
141
+ /// any requests to spawn new LAs
142
+ workflows_have_shut_down: CancellationToken,
118
143
 
119
144
  rcvs: tokio::sync::Mutex<RcvChans>,
120
145
  shutdown_complete_tok: CancellationToken,
121
146
  dat: Mutex<LAMData>,
122
147
  }
123
148
 
149
+ struct LocalActivityInfo {
150
+ task_token: TaskToken,
151
+ /// Tasks for the current backoff until the next retry, if any.
152
+ backing_off_task: Option<JoinHandle<()>>,
153
+ /// Tasks / info about timeouts associated with this LA. May be empty for very brief periods
154
+ /// while the LA id has been generated, but it has not yet been scheduled.
155
+ timeout_bag: Option<TimeoutBag>,
156
+ }
157
+
124
158
  struct LAMData {
159
+ /// Maps local activity identifiers to information about them
160
+ la_info: HashMap<ExecutingLAId, LocalActivityInfo>,
125
161
  /// Activities that have been issued to lang but not yet completed
126
162
  outstanding_activity_tasks: HashMap<TaskToken, LocalInFlightActInfo>,
127
- id_to_tt: HashMap<ExecutingLAId, TaskToken>,
128
- /// Tasks for activities which are currently backing off. May be used to cancel retrying them.
129
- backing_off_tasks: HashMap<ExecutingLAId, JoinHandle<()>>,
130
- /// Tasks for timing out activities which are currently in the queue or dispatched.
131
- timeout_tasks: HashMap<ExecutingLAId, TimeoutBag>,
132
163
  next_tt_num: u32,
133
164
  }
134
165
 
@@ -143,42 +174,46 @@ impl LocalActivityManager {
143
174
  pub(crate) fn new(
144
175
  max_concurrent: usize,
145
176
  namespace: String,
177
+ heartbeat_timeout_tx: UnboundedSender<HeartbeatTimeoutMsg>,
146
178
  metrics_context: MetricsContext,
147
179
  ) -> Self {
148
180
  let (act_req_tx, act_req_rx) = unbounded_channel();
149
181
  let (cancels_req_tx, cancels_req_rx) = unbounded_channel();
150
182
  let shutdown_complete_tok = CancellationToken::new();
183
+ let semaphore = MeteredSemaphore::new(
184
+ max_concurrent,
185
+ metrics_context,
186
+ MetricsContext::available_task_slots,
187
+ );
151
188
  Self {
152
189
  namespace,
153
- semaphore: MeteredSemaphore::new(
154
- max_concurrent,
155
- metrics_context,
156
- MetricsContext::available_task_slots,
157
- ),
190
+ rcvs: tokio::sync::Mutex::new(RcvChans::new(
191
+ act_req_rx,
192
+ semaphore,
193
+ cancels_req_rx,
194
+ shutdown_complete_tok.clone(),
195
+ )),
158
196
  act_req_tx,
159
197
  cancels_req_tx,
198
+ heartbeat_timeout_tx,
160
199
  complete_notify: Notify::new(),
161
- rcvs: tokio::sync::Mutex::new(RcvChans {
162
- act_req_rx,
163
- cancels_req_rx,
164
- shutdown: shutdown_complete_tok.clone(),
165
- }),
166
200
  shutdown_complete_tok,
167
201
  dat: Mutex::new(LAMData {
168
202
  outstanding_activity_tasks: Default::default(),
169
- id_to_tt: Default::default(),
170
- backing_off_tasks: Default::default(),
171
- timeout_tasks: Default::default(),
203
+ la_info: Default::default(),
172
204
  next_tt_num: 0,
173
205
  }),
206
+ workflows_have_shut_down: Default::default(),
174
207
  }
175
208
  }
176
209
 
177
210
  #[cfg(test)]
178
211
  fn test(max_concurrent: usize) -> Self {
212
+ let (hb_tx, _hb_rx) = unbounded_channel();
179
213
  Self::new(
180
214
  max_concurrent,
181
215
  "fake_ns".to_string(),
216
+ hb_tx,
182
217
  MetricsContext::no_op(),
183
218
  )
184
219
  }
@@ -190,76 +225,103 @@ impl LocalActivityManager {
190
225
 
191
226
  #[cfg(test)]
192
227
  fn num_in_backoff(&self) -> usize {
193
- self.dat.lock().backing_off_tasks.len()
228
+ self.dat
229
+ .lock()
230
+ .la_info
231
+ .values()
232
+ .filter(|lai| lai.backing_off_task.is_some())
233
+ .count()
194
234
  }
195
235
 
196
236
  pub(crate) fn enqueue(
197
237
  &self,
198
238
  reqs: impl IntoIterator<Item = LocalActRequest>,
199
239
  ) -> Vec<LocalActivityResolution> {
240
+ if self.workflows_have_shut_down.is_cancelled() {
241
+ dbg_panic!("Tried to enqueue local activity after workflows were shut down");
242
+ return vec![];
243
+ }
200
244
  let mut immediate_resolutions = vec![];
201
245
  for req in reqs {
202
- debug!(local_activity = ?req, "Queuing local activity");
203
246
  match req {
204
247
  LocalActRequest::New(act) => {
248
+ debug!(local_activity=?act, "Queuing local activity");
205
249
  let id = ExecutingLAId {
206
250
  run_id: act.workflow_exec_info.run_id.clone(),
207
251
  seq_num: act.schedule_cmd.seq,
208
252
  };
209
253
  let mut dlock = self.dat.lock();
210
- if dlock.id_to_tt.contains_key(&id) {
211
- // Do not queue local activities which are in fact already executing.
212
- // This can happen during evictions.
213
- debug!("Tried to queue already-executing local activity {:?}", &id);
214
- continue;
215
- }
216
- // Pre-generate and insert the task token now, before we may or may not dispatch
217
- // the activity, so we can enforce idempotency. Prevents two identical LAs
218
- // ending up in the queue at once.
219
254
  let tt = dlock.gen_next_token();
220
- dlock.id_to_tt.insert(id.clone(), tt);
221
-
222
- // Set up timeouts for the new activity
223
- match TimeoutBag::new(&act, self.cancels_req_tx.clone()) {
224
- Ok(tb) => {
225
- dlock.timeout_tasks.insert(id, tb);
226
-
227
- self.act_req_tx
228
- .send(NewOrRetry::New(act))
229
- .expect("Receive half of LA request channel cannot be dropped");
255
+ match dlock.la_info.entry(id) {
256
+ Entry::Occupied(o) => {
257
+ // Do not queue local activities which are in fact already executing.
258
+ // This can happen during evictions.
259
+ debug!(
260
+ "Tried to queue already-executing local activity {:?}",
261
+ o.key()
262
+ );
263
+ continue;
264
+ }
265
+ Entry::Vacant(ve) => {
266
+ // Insert the task token now, before we may or may not dispatch the
267
+ // activity, so we can enforce idempotency. Prevents two identical LAs
268
+ // ending up in the queue at once.
269
+ let lai = ve.insert(LocalActivityInfo {
270
+ task_token: tt,
271
+ backing_off_task: None,
272
+ timeout_bag: None,
273
+ });
274
+
275
+ // Set up timeouts for the new activity
276
+ match TimeoutBag::new(&act, self.cancels_req_tx.clone()) {
277
+ Ok(tb) => {
278
+ lai.timeout_bag = Some(tb);
279
+
280
+ self.act_req_tx.send(NewOrRetry::New(act)).expect(
281
+ "Receive half of LA request channel cannot be dropped",
282
+ );
283
+ }
284
+ Err(res) => immediate_resolutions.push(res),
285
+ }
230
286
  }
231
- Err(res) => immediate_resolutions.push(res),
232
287
  }
233
288
  }
289
+ LocalActRequest::StartHeartbeatTimeout {
290
+ send_on_elapse,
291
+ deadline,
292
+ abort_reg,
293
+ } => {
294
+ let chan = self.heartbeat_timeout_tx.clone();
295
+ tokio::spawn(future::Abortable::new(
296
+ async move {
297
+ tokio::time::sleep_until(deadline.into()).await;
298
+ let _ = chan.send(send_on_elapse);
299
+ },
300
+ abort_reg,
301
+ ));
302
+ }
234
303
  LocalActRequest::Cancel(id) => {
304
+ debug!(id=?id, "Cancelling local activity");
235
305
  let mut dlock = self.dat.lock();
236
-
237
- // First check if this ID is currently backing off, if so abort the backoff
238
- // task
239
- if let Some(t) = dlock.backing_off_tasks.remove(&id) {
240
- t.abort();
241
- immediate_resolutions.push(LocalActivityResolution {
242
- seq: id.seq_num,
243
- result: LocalActivityExecutionResult::Cancelled(
244
- Cancellation::from_details(None),
245
- ),
246
- runtime: Duration::from_secs(0),
247
- attempt: 0,
248
- backoff: None,
249
- original_schedule_time: None,
250
- });
251
- continue;
306
+ if let Some(lai) = dlock.la_info.get_mut(&id) {
307
+ if let Some(immediate_res) = self.cancel_one_la(id.seq_num, lai) {
308
+ immediate_resolutions.push(immediate_res);
309
+ }
252
310
  }
253
-
254
- if let Some(tt) = dlock.id_to_tt.get(&id) {
255
- self.cancels_req_tx
256
- .send(CancelOrTimeout::Cancel(ActivityTask {
257
- task_token: tt.0.clone(),
258
- variant: Some(activity_task::Variant::Cancel(Cancel {
259
- reason: ActivityCancelReason::Cancelled as i32,
260
- })),
261
- }))
262
- .expect("Receive half of LA cancel channel cannot be dropped");
311
+ }
312
+ LocalActRequest::CancelAllInRun(run_id) => {
313
+ debug!(run_id=%run_id, "Cancelling all local activities for run");
314
+ let mut dlock = self.dat.lock();
315
+ // Even if we've got 100k+ LAs this should only take a ms or two. Not worth
316
+ // adding another map to keep in sync.
317
+ let las_for_run = dlock
318
+ .la_info
319
+ .iter_mut()
320
+ .filter(|(id, _)| id.run_id == run_id);
321
+ for (laid, lainf) in las_for_run {
322
+ if let Some(immediate_res) = self.cancel_one_la(laid.seq_num, lainf) {
323
+ immediate_resolutions.push(immediate_res);
324
+ }
263
325
  }
264
326
  }
265
327
  }
@@ -270,7 +332,7 @@ impl LocalActivityManager {
270
332
  /// Returns the next pending local-activity related action, or None if shutdown has initiated
271
333
  /// and there are no more remaining actions to take.
272
334
  pub(crate) async fn next_pending(&self) -> Option<DispatchOrTimeoutLA> {
273
- let (new_or_retry, permit) = match self.rcvs.lock().await.next(&self.semaphore).await? {
335
+ let (new_or_retry, permit) = match self.rcvs.lock().await.next().await? {
274
336
  NewOrCancel::Cancel(c) => {
275
337
  return match c {
276
338
  CancelOrTimeout::Cancel(c) => Some(DispatchOrTimeoutLA::Dispatch(c)),
@@ -283,12 +345,13 @@ impl LocalActivityManager {
283
345
  let tt = self
284
346
  .dat
285
347
  .lock()
286
- .id_to_tt
348
+ .la_info
287
349
  .get(&ExecutingLAId {
288
350
  run_id: run_id.clone(),
289
351
  seq_num: resolution.seq,
290
352
  })
291
- .map(Clone::clone);
353
+ .as_ref()
354
+ .map(|lai| lai.task_token.clone());
292
355
  if let Some(task_token) = tt {
293
356
  self.complete(&task_token, &resolution.result);
294
357
  Some(ActivityTask {
@@ -323,18 +386,21 @@ impl LocalActivityManager {
323
386
  }
324
387
  NewOrRetry::Retry { in_flight, attempt } => (in_flight, attempt),
325
388
  };
326
- let orig = new_la.clone();
389
+ let la_info_for_in_flight_map = new_la.clone();
327
390
  let id = ExecutingLAId {
328
391
  run_id: new_la.workflow_exec_info.run_id.clone(),
329
392
  seq_num: new_la.schedule_cmd.seq,
330
393
  };
394
+ let orig_sched_time = new_la.schedule_cmd.original_schedule_time;
331
395
  let sa = new_la.schedule_cmd;
332
396
 
333
397
  let mut dat = self.dat.lock();
334
398
  // If this request originated from a local backoff task, clear the entry for it. We
335
399
  // don't await the handle because we know it must already be done, and there's no
336
400
  // meaningful value.
337
- dat.backing_off_tasks.remove(&id);
401
+ dat.la_info
402
+ .get_mut(&id)
403
+ .map(|lai| lai.backing_off_task.take());
338
404
 
339
405
  // If this task sat in the queue for too long, return a timeout for it instead
340
406
  if let Some(s2s) = sa.schedule_to_start_timeout.as_ref() {
@@ -348,30 +414,27 @@ impl LocalActivityManager {
348
414
  runtime: sat_for,
349
415
  attempt,
350
416
  backoff: None,
351
- original_schedule_time: Some(new_la.schedule_time),
417
+ original_schedule_time: orig_sched_time,
352
418
  },
353
419
  task: None,
354
420
  });
355
421
  }
356
422
  }
357
423
 
358
- let tt = dat
359
- .id_to_tt
360
- .get(&id)
361
- .expect("Task token must exist")
362
- .clone();
424
+ let la_info = dat.la_info.get_mut(&id).expect("Activity must exist");
425
+ let tt = la_info.task_token.clone();
426
+ if let Some(to) = la_info.timeout_bag.as_mut() {
427
+ to.mark_started();
428
+ }
363
429
  dat.outstanding_activity_tasks.insert(
364
430
  tt.clone(),
365
431
  LocalInFlightActInfo {
366
- la_info: orig,
432
+ la_info: la_info_for_in_flight_map,
367
433
  dispatch_time: Instant::now(),
368
434
  attempt,
369
435
  _permit: permit,
370
436
  },
371
437
  );
372
- if let Some(to) = dat.timeout_tasks.get_mut(&id) {
373
- to.mark_started();
374
- }
375
438
 
376
439
  let (schedule_to_close, start_to_close) = sa.close_timeouts.into_sched_and_start();
377
440
  Some(DispatchOrTimeoutLA::Dispatch(ActivityTask {
@@ -406,11 +469,23 @@ impl LocalActivityManager {
406
469
  ) -> LACompleteAction {
407
470
  let mut dlock = self.dat.lock();
408
471
  if let Some(info) = dlock.outstanding_activity_tasks.remove(task_token) {
472
+ if self.workflows_have_shut_down.is_cancelled() {
473
+ // If workflows are already shut down, the results of all this don't matter.
474
+ // Just say we're done if there's nothing outstanding any more.
475
+ self.set_shutdown_complete_if_ready(&mut dlock);
476
+ }
477
+
409
478
  let exec_id = ExecutingLAId {
410
479
  run_id: info.la_info.workflow_exec_info.run_id.clone(),
411
480
  seq_num: info.la_info.schedule_cmd.seq,
412
481
  };
413
- dlock.id_to_tt.remove(&exec_id);
482
+ let maybe_old_lai = dlock.la_info.remove(&exec_id);
483
+ if let Some(ref oldlai) = maybe_old_lai {
484
+ if let Some(ref bot) = oldlai.backing_off_task {
485
+ dbg_panic!("Just-resolved LA should not have backoff task");
486
+ bot.abort();
487
+ }
488
+ }
414
489
 
415
490
  match status {
416
491
  LocalActivityExecutionResult::Completed(_)
@@ -446,8 +521,6 @@ impl LocalActivityManager {
446
521
  }
447
522
  // Immediately create a new task token for the to-be-retried LA
448
523
  let tt = dlock.gen_next_token();
449
- dlock.id_to_tt.insert(exec_id.clone(), tt);
450
-
451
524
  // Send the retry request after waiting the backoff duration
452
525
  let send_chan = self.act_req_tx.clone();
453
526
  let jh = tokio::spawn(async move {
@@ -460,7 +533,14 @@ impl LocalActivityManager {
460
533
  })
461
534
  .expect("Receive half of LA request channel cannot be dropped");
462
535
  });
463
- dlock.backing_off_tasks.insert(exec_id, jh);
536
+ dlock.la_info.insert(
537
+ exec_id,
538
+ LocalActivityInfo {
539
+ task_token: tt,
540
+ backing_off_task: Some(jh),
541
+ timeout_bag: maybe_old_lai.and_then(|old| old.timeout_bag),
542
+ },
543
+ );
464
544
 
465
545
  LACompleteAction::WillBeRetried
466
546
  } else {
@@ -473,11 +553,53 @@ impl LocalActivityManager {
473
553
  }
474
554
  }
475
555
 
476
- pub(crate) async fn shutdown_and_wait_all_finished(&self) {
477
- while !self.dat.lock().outstanding_activity_tasks.is_empty() {
556
+ pub(crate) fn workflows_have_shutdown(&self) {
557
+ self.workflows_have_shut_down.cancel();
558
+ self.set_shutdown_complete_if_ready(&mut self.dat.lock());
559
+ }
560
+
561
+ pub(crate) async fn wait_all_outstanding_tasks_finished(&self) {
562
+ while !self.set_shutdown_complete_if_ready(&mut self.dat.lock()) {
478
563
  self.complete_notify.notified().await;
479
564
  }
480
- self.shutdown_complete_tok.cancel();
565
+ }
566
+
567
+ fn set_shutdown_complete_if_ready(&self, dlock: &mut MutexGuard<LAMData>) -> bool {
568
+ let nothing_outstanding = dlock.outstanding_activity_tasks.is_empty();
569
+ if nothing_outstanding {
570
+ self.shutdown_complete_tok.cancel();
571
+ }
572
+ nothing_outstanding
573
+ }
574
+
575
+ fn cancel_one_la(
576
+ &self,
577
+ seq: u32,
578
+ lai: &mut LocalActivityInfo,
579
+ ) -> Option<LocalActivityResolution> {
580
+ // First check if this ID is currently backing off, if so abort the backoff
581
+ // task
582
+ if let Some(t) = lai.backing_off_task.take() {
583
+ t.abort();
584
+ return Some(LocalActivityResolution {
585
+ seq,
586
+ result: LocalActivityExecutionResult::Cancelled(Cancellation::from_details(None)),
587
+ runtime: Duration::from_secs(0),
588
+ attempt: 0,
589
+ backoff: None,
590
+ original_schedule_time: None,
591
+ });
592
+ }
593
+
594
+ self.cancels_req_tx
595
+ .send(CancelOrTimeout::Cancel(ActivityTask {
596
+ task_token: lai.task_token.0.clone(),
597
+ variant: Some(activity_task::Variant::Cancel(Cancel {
598
+ reason: ActivityCancelReason::Cancelled as i32,
599
+ })),
600
+ }))
601
+ .expect("Receive half of LA cancel channel cannot be dropped");
602
+ None
481
603
  }
482
604
  }
483
605
 
@@ -521,32 +643,45 @@ enum NewOrCancel {
521
643
  Cancel(CancelOrTimeout),
522
644
  }
523
645
 
646
+ #[pin_project::pin_project]
524
647
  struct RcvChans {
525
- /// Activities that need to be executed by lang
526
- act_req_rx: UnboundedReceiver<NewOrRetry>,
527
- /// Cancels to send to lang or apply internally
528
- cancels_req_rx: UnboundedReceiver<CancelOrTimeout>,
529
- shutdown: CancellationToken,
648
+ #[pin]
649
+ inner: BoxStream<'static, NewOrCancel>,
530
650
  }
531
651
 
532
652
  impl RcvChans {
533
- async fn next(&mut self, new_sem: &MeteredSemaphore) -> Option<NewOrCancel> {
534
- tokio::select! {
535
- cancel = async { self.cancels_req_rx.recv().await } => {
536
- Some(NewOrCancel::Cancel(cancel.expect("Send halves of LA manager are not dropped")))
537
- }
538
- (maybe_new_or_retry, perm) = async {
539
- // Wait for a permit to take a task and forget it. Permits are removed until a
540
- // completion.
541
- let perm = new_sem.acquire_owned().await.expect("is never closed");
542
- (self.act_req_rx.recv().await, perm)
543
- } => Some(NewOrCancel::New(
544
- maybe_new_or_retry.expect("Send halves of LA manager are not dropped"), perm
545
- )),
546
- _ = self.shutdown.cancelled() => None
653
+ fn new(
654
+ new_reqs: UnboundedReceiver<NewOrRetry>,
655
+ new_sem: MeteredSemaphore,
656
+ cancels: UnboundedReceiver<CancelOrTimeout>,
657
+ shutdown_completed: CancellationToken,
658
+ ) -> Self {
659
+ let cancel_stream = UnboundedReceiverStream::new(cancels).map(NewOrCancel::Cancel);
660
+ let new_stream = UnboundedReceiverStream::new(new_reqs)
661
+ // Get a permit for each new activity request
662
+ .zip(stream::unfold(new_sem, |new_sem| async move {
663
+ let permit = new_sem
664
+ .acquire_owned()
665
+ .await
666
+ .expect("Local activity semaphore is never closed");
667
+ Some((permit, new_sem))
668
+ }))
669
+ .map(|(req, permit)| NewOrCancel::New(req, permit));
670
+ Self {
671
+ inner: tokio_stream::StreamExt::merge(cancel_stream, new_stream)
672
+ .take_until(async move { shutdown_completed.cancelled().await })
673
+ .boxed(),
547
674
  }
548
675
  }
549
676
  }
677
+ impl Stream for RcvChans {
678
+ type Item = NewOrCancel;
679
+
680
+ fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
681
+ let this = self.project();
682
+ this.inner.poll_next(cx)
683
+ }
684
+ }
550
685
 
551
686
  struct TimeoutBag {
552
687
  sched_to_close_handle: JoinHandle<()>,
@@ -567,17 +702,21 @@ impl TimeoutBag {
567
702
  let (schedule_to_close, start_to_close) =
568
703
  new_la.schedule_cmd.close_timeouts.into_sched_and_start();
569
704
 
705
+ let sched_time = new_la
706
+ .schedule_cmd
707
+ .original_schedule_time
708
+ .unwrap_or(new_la.schedule_time);
570
709
  let resolution = LocalActivityResolution {
571
710
  seq: new_la.schedule_cmd.seq,
572
711
  result: LocalActivityExecutionResult::timeout(TimeoutType::ScheduleToClose),
573
712
  runtime: Default::default(),
574
713
  attempt: new_la.schedule_cmd.attempt,
575
714
  backoff: None,
576
- original_schedule_time: Some(new_la.schedule_time),
715
+ original_schedule_time: new_la.schedule_cmd.original_schedule_time,
577
716
  };
578
717
  // Remove any time already elapsed since the scheduling time
579
718
  let schedule_to_close = schedule_to_close
580
- .map(|s2c| s2c.saturating_sub(new_la.schedule_time.elapsed().unwrap_or_default()));
719
+ .map(|s2c| s2c.saturating_sub(sched_time.elapsed().unwrap_or_default()));
581
720
  if let Some(ref s2c) = schedule_to_close {
582
721
  if s2c.is_zero() {
583
722
  return Err(resolution);
@@ -640,18 +779,19 @@ impl Drop for TimeoutBag {
640
779
  mod tests {
641
780
  use super::*;
642
781
  use crate::{prost_dur, protosext::LACloseTimeouts};
782
+ use futures_util::FutureExt;
643
783
  use temporal_sdk_core_protos::temporal::api::{
644
784
  common::v1::RetryPolicy,
645
785
  failure::v1::{failure::FailureInfo, ApplicationFailureInfo, Failure},
646
786
  };
647
- use tokio::{sync::mpsc::error::TryRecvError, task::yield_now};
787
+ use tokio::task::yield_now;
648
788
 
649
789
  impl DispatchOrTimeoutLA {
650
790
  fn unwrap(self) -> ActivityTask {
651
791
  match self {
652
792
  DispatchOrTimeoutLA::Dispatch(t) => t,
653
- DispatchOrTimeoutLA::Timeout { .. } => {
654
- panic!("Timeout returned when expected a task")
793
+ _ => {
794
+ panic!("Non-dispatched action returned")
655
795
  }
656
796
  }
657
797
  }
@@ -1026,18 +1166,12 @@ mod tests {
1026
1166
  lam.next_pending().await.unwrap().unwrap();
1027
1167
  assert_eq!(lam.num_outstanding(), 1);
1028
1168
  // There should be nothing else in the queue
1029
- assert_eq!(
1030
- lam.rcvs.lock().await.act_req_rx.try_recv().unwrap_err(),
1031
- TryRecvError::Empty
1032
- );
1169
+ assert!(lam.rcvs.lock().await.next().now_or_never().is_none());
1033
1170
 
1034
1171
  // Verify that if we now enqueue the same act again, after the task is outstanding, we still
1035
1172
  // don't add it.
1036
1173
  lam.enqueue([new_la.into()]);
1037
1174
  assert_eq!(lam.num_outstanding(), 1);
1038
- assert_eq!(
1039
- lam.rcvs.lock().await.act_req_rx.try_recv().unwrap_err(),
1040
- TryRecvError::Empty
1041
- );
1175
+ assert!(lam.rcvs.lock().await.next().now_or_never().is_none());
1042
1176
  }
1043
1177
  }
@@ -7,11 +7,12 @@ pub(crate) use local_activities::{
7
7
  LocalInFlightActInfo, NewLocalAct,
8
8
  };
9
9
 
10
- use crate::telemetry::metrics::eager;
11
10
  use crate::{
12
11
  abstractions::{MeteredSemaphore, OwnedMeteredSemPermit},
13
12
  pollers::BoxedActPoller,
14
- telemetry::metrics::{activity_type, activity_worker_type, workflow_type, MetricsContext},
13
+ telemetry::metrics::{
14
+ activity_type, activity_worker_type, eager, workflow_type, MetricsContext,
15
+ },
15
16
  worker::{
16
17
  activities::activity_heartbeat_manager::ActivityHeartbeatError, client::WorkerClient,
17
18
  },