@temporalio/core-bridge 1.6.0 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/Cargo.lock +520 -456
  2. package/lib/index.d.ts +8 -6
  3. package/lib/index.js.map +1 -1
  4. package/package.json +8 -3
  5. package/releases/aarch64-apple-darwin/index.node +0 -0
  6. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  7. package/releases/x86_64-apple-darwin/index.node +0 -0
  8. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  9. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  10. package/sdk-core/.buildkite/docker/Dockerfile +2 -2
  11. package/sdk-core/.buildkite/docker/docker-compose.yaml +1 -1
  12. package/sdk-core/.buildkite/pipeline.yml +1 -1
  13. package/sdk-core/.github/workflows/heavy.yml +1 -0
  14. package/sdk-core/README.md +13 -7
  15. package/sdk-core/client/src/lib.rs +27 -9
  16. package/sdk-core/client/src/metrics.rs +17 -8
  17. package/sdk-core/client/src/raw.rs +3 -3
  18. package/sdk-core/core/Cargo.toml +3 -4
  19. package/sdk-core/core/src/abstractions/take_cell.rs +28 -0
  20. package/sdk-core/core/src/abstractions.rs +197 -18
  21. package/sdk-core/core/src/core_tests/activity_tasks.rs +137 -45
  22. package/sdk-core/core/src/core_tests/child_workflows.rs +6 -5
  23. package/sdk-core/core/src/core_tests/determinism.rs +212 -2
  24. package/sdk-core/core/src/core_tests/local_activities.rs +183 -36
  25. package/sdk-core/core/src/core_tests/queries.rs +32 -14
  26. package/sdk-core/core/src/core_tests/workers.rs +8 -5
  27. package/sdk-core/core/src/core_tests/workflow_tasks.rs +340 -51
  28. package/sdk-core/core/src/ephemeral_server/mod.rs +110 -8
  29. package/sdk-core/core/src/internal_flags.rs +141 -0
  30. package/sdk-core/core/src/lib.rs +14 -9
  31. package/sdk-core/core/src/replay/mod.rs +16 -27
  32. package/sdk-core/core/src/telemetry/metrics.rs +69 -35
  33. package/sdk-core/core/src/telemetry/mod.rs +38 -14
  34. package/sdk-core/core/src/telemetry/prometheus_server.rs +19 -13
  35. package/sdk-core/core/src/test_help/mod.rs +65 -13
  36. package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +119 -160
  37. package/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
  38. package/sdk-core/core/src/worker/activities/local_activities.rs +122 -6
  39. package/sdk-core/core/src/worker/activities.rs +347 -173
  40. package/sdk-core/core/src/worker/client/mocks.rs +22 -2
  41. package/sdk-core/core/src/worker/client.rs +18 -2
  42. package/sdk-core/core/src/worker/mod.rs +137 -44
  43. package/sdk-core/core/src/worker/workflow/history_update.rs +132 -51
  44. package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +207 -166
  45. package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +6 -7
  46. package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +6 -7
  47. package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +157 -82
  48. package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +12 -12
  49. package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +6 -7
  50. package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +13 -15
  51. package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +170 -60
  52. package/sdk-core/core/src/worker/workflow/machines/mod.rs +24 -16
  53. package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +6 -8
  54. package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +320 -204
  55. package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +10 -13
  56. package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +15 -23
  57. package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +187 -46
  58. package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +237 -111
  59. package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +13 -13
  60. package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +10 -6
  61. package/sdk-core/core/src/worker/workflow/managed_run.rs +81 -62
  62. package/sdk-core/core/src/worker/workflow/mod.rs +341 -79
  63. package/sdk-core/core/src/worker/workflow/run_cache.rs +18 -11
  64. package/sdk-core/core/src/worker/workflow/wft_extraction.rs +15 -3
  65. package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +2 -0
  66. package/sdk-core/core/src/worker/workflow/workflow_stream.rs +75 -52
  67. package/sdk-core/core-api/Cargo.toml +0 -1
  68. package/sdk-core/core-api/src/lib.rs +13 -7
  69. package/sdk-core/core-api/src/telemetry.rs +4 -6
  70. package/sdk-core/core-api/src/worker.rs +5 -0
  71. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +80 -55
  72. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +22 -68
  73. package/sdk-core/histories/ends_empty_wft_complete.bin +0 -0
  74. package/sdk-core/histories/old_change_marker_format.bin +0 -0
  75. package/sdk-core/protos/api_upstream/.github/CODEOWNERS +2 -1
  76. package/sdk-core/protos/api_upstream/Makefile +1 -1
  77. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
  78. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
  79. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
  80. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
  81. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
  82. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
  83. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
  84. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
  85. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
  86. package/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
  87. package/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
  88. package/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
  89. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
  90. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  91. package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +7 -8
  92. package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +10 -7
  93. package/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +19 -30
  94. package/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
  95. package/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
  96. package/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +8 -0
  97. package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +65 -60
  98. package/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +85 -84
  99. package/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +9 -3
  100. package/sdk-core/sdk/Cargo.toml +1 -1
  101. package/sdk-core/sdk/src/lib.rs +21 -5
  102. package/sdk-core/sdk/src/workflow_context/options.rs +7 -1
  103. package/sdk-core/sdk/src/workflow_context.rs +24 -17
  104. package/sdk-core/sdk/src/workflow_future.rs +9 -3
  105. package/sdk-core/sdk-core-protos/src/history_builder.rs +114 -89
  106. package/sdk-core/sdk-core-protos/src/history_info.rs +6 -1
  107. package/sdk-core/sdk-core-protos/src/lib.rs +205 -64
  108. package/sdk-core/test-utils/src/canned_histories.rs +106 -296
  109. package/sdk-core/test-utils/src/lib.rs +32 -5
  110. package/sdk-core/tests/heavy_tests.rs +10 -43
  111. package/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
  112. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
  113. package/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
  114. package/sdk-core/tests/integ_tests/polling_tests.rs +3 -8
  115. package/sdk-core/tests/integ_tests/queries_tests.rs +4 -2
  116. package/sdk-core/tests/integ_tests/visibility_tests.rs +34 -23
  117. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +97 -81
  118. package/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
  119. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +1 -0
  120. package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +80 -3
  121. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +5 -1
  122. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +1 -0
  123. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +25 -3
  124. package/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +2 -4
  125. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +30 -0
  126. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +64 -0
  127. package/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
  128. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +4 -0
  129. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -1
  130. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +7 -2
  131. package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -7
  132. package/sdk-core/tests/integ_tests/workflow_tests.rs +8 -8
  133. package/sdk-core/tests/main.rs +16 -25
  134. package/sdk-core/tests/runner.rs +11 -9
  135. package/src/conversions.rs +14 -8
  136. package/src/runtime.rs +9 -8
  137. package/ts/index.ts +8 -6
  138. package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
@@ -1,4 +1,5 @@
1
1
  mod activity_heartbeat_manager;
2
+ mod activity_task_poller_stream;
2
3
  mod local_activities;
3
4
 
4
5
  pub(crate) use local_activities::{
@@ -8,26 +9,34 @@ pub(crate) use local_activities::{
8
9
  };
9
10
 
10
11
  use crate::{
11
- abstractions::{MeteredSemaphore, OwnedMeteredSemPermit},
12
+ abstractions::{
13
+ ClosableMeteredSemaphore, MeteredSemaphore, OwnedMeteredSemPermit,
14
+ TrackedOwnedMeteredSemPermit, UsedMeteredSemPermit,
15
+ },
12
16
  pollers::BoxedActPoller,
13
17
  telemetry::metrics::{
14
18
  activity_type, activity_worker_type, eager, workflow_type, MetricsContext,
15
19
  },
16
20
  worker::{
17
- activities::activity_heartbeat_manager::ActivityHeartbeatError, client::WorkerClient,
21
+ activities::{
22
+ activity_heartbeat_manager::ActivityHeartbeatError,
23
+ activity_task_poller_stream::new_activity_task_poller,
24
+ },
25
+ client::WorkerClient,
18
26
  },
19
27
  PollActivityError, TaskToken,
20
28
  };
21
29
  use activity_heartbeat_manager::ActivityHeartbeatManager;
22
30
  use dashmap::DashMap;
23
- use governor::{
24
- clock::DefaultClock,
25
- middleware::NoOpMiddleware,
26
- state::{InMemoryState, NotKeyed},
27
- Quota, RateLimiter,
31
+ use futures::{
32
+ stream,
33
+ stream::{BoxStream, PollNext},
34
+ Stream, StreamExt,
28
35
  };
36
+ use governor::{Quota, RateLimiter};
29
37
  use std::{
30
38
  convert::TryInto,
39
+ future,
31
40
  sync::Arc,
32
41
  time::{Duration, Instant},
33
42
  };
@@ -38,13 +47,23 @@ use temporal_sdk_core_protos::{
38
47
  ActivityHeartbeat,
39
48
  },
40
49
  temporal::api::{
41
- failure::v1::{failure::FailureInfo, CanceledFailureInfo, Failure},
50
+ failure::v1::{failure::FailureInfo, ApplicationFailureInfo, CanceledFailureInfo, Failure},
42
51
  workflowservice::v1::PollActivityTaskQueueResponse,
43
52
  },
44
53
  };
45
- use tokio::sync::Notify;
54
+ use tokio::{
55
+ join,
56
+ sync::{
57
+ mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
58
+ Mutex, Notify,
59
+ },
60
+ };
61
+ use tokio_stream::wrappers::UnboundedReceiverStream;
62
+ use tokio_util::sync::CancellationToken;
46
63
  use tracing::Span;
47
64
 
65
+ type OutstandingActMap = Arc<DashMap<TaskToken, RemoteInFlightActInfo>>;
66
+
48
67
  #[derive(Debug, derive_more::Constructor)]
49
68
  struct PendingActivityCancel {
50
69
  task_token: TaskToken,
@@ -68,17 +87,18 @@ struct RemoteInFlightActInfo {
68
87
  pub base: InFlightActInfo,
69
88
  /// Used to calculate aggregation delay between activity heartbeats.
70
89
  pub heartbeat_timeout: Option<prost_types::Duration>,
71
- /// Set to true if we have already issued a cancellation activation to lang for this activity
72
- pub issued_cancel_to_lang: bool,
90
+ /// Set if we have already issued a cancellation activation to lang for this activity, with
91
+ /// the original reason we issued the cancel.
92
+ pub issued_cancel_to_lang: Option<ActivityCancelReason>,
73
93
  /// Set to true if we have already learned from the server this activity doesn't exist. EX:
74
94
  /// we have learned from heartbeating and issued a cancel task, in which case we may simply
75
95
  /// discard the reply.
76
96
  pub known_not_found: bool,
77
97
  /// The permit from the max concurrent semaphore
78
- _permit: OwnedMeteredSemPermit,
98
+ _permit: UsedMeteredSemPermit,
79
99
  }
80
100
  impl RemoteInFlightActInfo {
81
- fn new(poll_resp: &PollActivityTaskQueueResponse, permit: OwnedMeteredSemPermit) -> Self {
101
+ fn new(poll_resp: &PollActivityTaskQueueResponse, permit: UsedMeteredSemPermit) -> Self {
82
102
  let wec = poll_resp.workflow_execution.clone().unwrap_or_default();
83
103
  Self {
84
104
  base: InFlightActInfo {
@@ -89,53 +109,51 @@ impl RemoteInFlightActInfo {
89
109
  start_time: Instant::now(),
90
110
  },
91
111
  heartbeat_timeout: poll_resp.heartbeat_timeout.clone(),
92
- issued_cancel_to_lang: false,
112
+ issued_cancel_to_lang: None,
93
113
  known_not_found: false,
94
114
  _permit: permit,
95
115
  }
96
116
  }
97
117
  }
98
118
 
99
- struct NonPollActBuffer {
100
- tx: async_channel::Sender<PermittedTqResp>,
101
- rx: async_channel::Receiver<PermittedTqResp>,
102
- }
103
- impl NonPollActBuffer {
104
- pub fn new() -> Self {
105
- let (tx, rx) = async_channel::unbounded();
106
- Self { tx, rx }
107
- }
108
-
109
- pub async fn next(&self) -> PermittedTqResp {
110
- self.rx.recv().await.expect("Send half cannot be dropped")
111
- }
112
- }
113
-
114
119
  pub(crate) struct WorkerActivityTasks {
120
+ /// Token used to signal the server task poller that shutdown is beginning
121
+ poller_shutdown_token: CancellationToken,
115
122
  /// Centralizes management of heartbeat issuing / throttling
116
123
  heartbeat_manager: ActivityHeartbeatManager,
124
+ /// Combined stream for any ActivityTask producing source (polls, eager activities,
125
+ /// cancellations)
126
+ activity_task_stream: Mutex<BoxStream<'static, Result<ActivityTask, PollActivityError>>>,
117
127
  /// Activities that have been issued to lang but not yet completed
118
- outstanding_activity_tasks: DashMap<TaskToken, RemoteInFlightActInfo>,
119
- /// Buffers activity task polling in the event we need to return a cancellation while a poll is
120
- /// ongoing.
121
- poller: BoxedActPoller,
122
- /// Holds activity tasks we have received by non-polling means. EX: In direct response to
123
- /// workflow task completion.
124
- non_poll_tasks: NonPollActBuffer,
125
- /// Ensures we stay at or below this worker's maximum concurrent activity limit
126
- activities_semaphore: Arc<MeteredSemaphore>,
127
- /// Enables per-worker rate-limiting of activity tasks
128
- ratelimiter: Option<RateLimiter<NotKeyed, InMemoryState, DefaultClock, NoOpMiddleware>>,
129
- /// Wakes every time an activity is removed from the outstanding map
130
- complete_notify: Notify,
128
+ outstanding_activity_tasks: OutstandingActMap,
129
+ /// Ensures we don't exceed this worker's maximum concurrent activity limit for activities. This
130
+ /// semaphore is used to limit eager activities but shares the same underlying
131
+ /// [MeteredSemaphore] that is used to limit the concurrency for non-eager activities.
132
+ eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
133
+ /// Holds activity tasks we have received in direct response to workflow task completion (a.k.a
134
+ /// eager activities). Tasks received in this stream hold a "tracked" permit that is issued by
135
+ /// the `eager_activities_semaphore`.
136
+ eager_activities_tx: UnboundedSender<TrackedPermittedTqResp>,
131
137
 
132
138
  metrics: MetricsContext,
133
139
 
134
140
  max_heartbeat_throttle_interval: Duration,
135
141
  default_heartbeat_throttle_interval: Duration,
142
+
143
+ /// Wakes every time an activity is removed from the outstanding map
144
+ complete_notify: Arc<Notify>,
145
+ /// Token to notify when poll returned a shutdown error
146
+ poll_returned_shutdown_token: CancellationToken,
147
+ }
148
+
149
+ #[derive(derive_more::From)]
150
+ enum ActivityTaskSource {
151
+ PendingCancel(PendingActivityCancel),
152
+ PendingStart(Result<(PermittedTqResp, bool), PollActivityError>),
136
153
  }
137
154
 
138
155
  impl WorkerActivityTasks {
156
+ #[allow(clippy::too_many_arguments)]
139
157
  pub(crate) fn new(
140
158
  max_activity_tasks: usize,
141
159
  max_worker_act_per_sec: Option<f64>,
@@ -144,91 +162,230 @@ impl WorkerActivityTasks {
144
162
  metrics: MetricsContext,
145
163
  max_heartbeat_throttle_interval: Duration,
146
164
  default_heartbeat_throttle_interval: Duration,
165
+ graceful_shutdown: Option<Duration>,
147
166
  ) -> Self {
148
- Self {
149
- heartbeat_manager: ActivityHeartbeatManager::new(client),
150
- outstanding_activity_tasks: Default::default(),
167
+ let semaphore = Arc::new(MeteredSemaphore::new(
168
+ max_activity_tasks,
169
+ metrics.with_new_attrs([activity_worker_type()]),
170
+ MetricsContext::available_task_slots,
171
+ ));
172
+ let poller_shutdown_token = CancellationToken::new();
173
+ let rate_limiter = max_worker_act_per_sec.and_then(|ps| {
174
+ Quota::with_period(Duration::from_secs_f64(ps.recip())).map(RateLimiter::direct)
175
+ });
176
+ let outstanding_activity_tasks = Arc::new(DashMap::new());
177
+ let server_poller_stream = new_activity_task_poller(
151
178
  poller,
152
- non_poll_tasks: NonPollActBuffer::new(),
153
- activities_semaphore: Arc::new(MeteredSemaphore::new(
154
- max_activity_tasks,
155
- metrics.with_new_attrs([activity_worker_type()]),
156
- MetricsContext::available_task_slots,
157
- )),
158
- ratelimiter: max_worker_act_per_sec.and_then(|ps| {
159
- Quota::with_period(Duration::from_secs_f64(ps.recip())).map(RateLimiter::direct)
160
- }),
161
- complete_notify: Notify::new(),
179
+ semaphore.clone(),
180
+ rate_limiter,
181
+ metrics.clone(),
182
+ poller_shutdown_token.clone(),
183
+ );
184
+ let (eager_activities_tx, eager_activities_rx) = unbounded_channel();
185
+ let eager_activities_semaphore = ClosableMeteredSemaphore::new_arc(semaphore);
186
+
187
+ let start_tasks_stream_complete = CancellationToken::new();
188
+ let starts_stream = Self::merge_start_task_sources(
189
+ eager_activities_rx,
190
+ server_poller_stream,
191
+ eager_activities_semaphore.clone(),
192
+ start_tasks_stream_complete.clone(),
193
+ );
194
+ let (cancels_tx, cancels_rx) = unbounded_channel();
195
+ let heartbeat_manager = ActivityHeartbeatManager::new(client, cancels_tx.clone());
196
+ let complete_notify = Arc::new(Notify::new());
197
+ let source_stream = stream::select_with_strategy(
198
+ UnboundedReceiverStream::new(cancels_rx).map(ActivityTaskSource::from),
199
+ starts_stream.map(ActivityTaskSource::from),
200
+ |_: &mut ()| PollNext::Left,
201
+ );
202
+ // Create a task stream composed of (in poll preference order):
203
+ // cancels_stream ------------------------------+--- activity_task_stream
204
+ // eager_activities_rx ---+--- starts_stream ---|
205
+ // server_poll_stream ---|
206
+ let activity_task_stream = Self::merge_source_streams(
207
+ source_stream,
208
+ outstanding_activity_tasks.clone(),
209
+ start_tasks_stream_complete,
210
+ complete_notify.clone(),
211
+ graceful_shutdown,
212
+ cancels_tx,
213
+ metrics.clone(),
214
+ );
215
+
216
+ Self {
217
+ poller_shutdown_token,
218
+ eager_activities_tx,
219
+ heartbeat_manager,
220
+ activity_task_stream: Mutex::new(activity_task_stream.boxed()),
221
+ eager_activities_semaphore,
222
+ complete_notify,
162
223
  metrics,
163
224
  max_heartbeat_throttle_interval,
164
225
  default_heartbeat_throttle_interval,
226
+ poll_returned_shutdown_token: CancellationToken::new(),
227
+ outstanding_activity_tasks,
165
228
  }
166
229
  }
167
230
 
168
- pub(crate) fn notify_shutdown(&self) {
169
- self.poller.notify_shutdown();
231
+ /// Merges the server poll and eager [ActivityTask] sources
232
+ fn merge_start_task_sources(
233
+ non_poll_tasks_rx: UnboundedReceiver<TrackedPermittedTqResp>,
234
+ poller_stream: impl Stream<Item = Result<PermittedTqResp, tonic::Status>>,
235
+ eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
236
+ on_complete_token: CancellationToken,
237
+ ) -> impl Stream<Item = Result<(PermittedTqResp, bool), PollActivityError>> {
238
+ let non_poll_stream = stream::unfold(
239
+ (non_poll_tasks_rx, eager_activities_semaphore),
240
+ |(mut non_poll_tasks_rx, eager_activities_semaphore)| async move {
241
+ loop {
242
+ tokio::select! {
243
+ biased;
244
+
245
+ task_opt = non_poll_tasks_rx.recv() => {
246
+ // Add is_eager true and wrap in Result
247
+ return task_opt.map(|task| (
248
+ Ok((PermittedTqResp{ permit: task.permit.into(), resp: task.resp },
249
+ true)),
250
+ (non_poll_tasks_rx, eager_activities_semaphore)));
251
+ }
252
+ _ = eager_activities_semaphore.close_complete() => {
253
+ // Once shutting down, we stop accepting eager activities
254
+ non_poll_tasks_rx.close();
255
+ continue;
256
+ }
257
+ }
258
+ }
259
+ },
260
+ );
261
+ // Add is_eager false
262
+ let poller_stream = poller_stream.map(|res| res.map(|task| (task, false)));
263
+
264
+ // Prefer eager activities over polling the server
265
+ stream::select_with_strategy(non_poll_stream, poller_stream, |_: &mut ()| PollNext::Left)
266
+ .map(|res| res.map_err(|err| err.into()))
267
+ // This map, chain, filter_map sequence is here to cancel the token when this stream ends.
268
+ .map(Some)
269
+ .chain(futures::stream::once(async move {
270
+ on_complete_token.cancel();
271
+ None
272
+ }))
273
+ .filter_map(future::ready)
170
274
  }
171
275
 
172
- /// Wait for all outstanding activity tasks to finish
173
- pub(crate) async fn wait_all_finished(&self) {
174
- while !self.outstanding_activity_tasks.is_empty() {
175
- self.complete_notify.notified().await
176
- }
276
+ /// Builds an [ActivityTask] stream for both cancellation tasks from cancels delivered from
277
+ /// heartbeats as well as new activity starts
278
+ fn merge_source_streams(
279
+ source_stream: impl Stream<Item = ActivityTaskSource>,
280
+ outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
281
+ start_tasks_stream_complete: CancellationToken,
282
+ complete_notify: Arc<Notify>,
283
+ grace_period: Option<Duration>,
284
+ cancels_tx: UnboundedSender<PendingActivityCancel>,
285
+ metrics: MetricsContext,
286
+ ) -> impl Stream<Item = Result<ActivityTask, PollActivityError>> {
287
+ let outstanding_tasks_clone = outstanding_tasks.clone();
288
+ source_stream
289
+ .filter_map(move |source| {
290
+ let outstanding_tasks = outstanding_tasks.clone();
291
+ let metrics = metrics.clone();
292
+ async move {
293
+ match source {
294
+ ActivityTaskSource::PendingCancel(next_pc) => {
295
+ // It's possible that activity has been completed and we no longer have
296
+ // an outstanding activity task. This is fine because it means that we
297
+ // no longer need to cancel this activity, so we'll just ignore such
298
+ // orphaned cancellations.
299
+ if let Some(mut details) =
300
+ outstanding_tasks.get_mut(&next_pc.task_token)
301
+ {
302
+ if details.issued_cancel_to_lang.is_some() {
303
+ // Don't double-issue cancellations
304
+ return None;
305
+ }
306
+
307
+ details.issued_cancel_to_lang = Some(next_pc.reason);
308
+ if next_pc.reason == ActivityCancelReason::NotFound {
309
+ details.known_not_found = true;
310
+ }
311
+ Some(Ok(ActivityTask::cancel_from_ids(
312
+ next_pc.task_token.0,
313
+ next_pc.reason,
314
+ )))
315
+ } else {
316
+ debug!(task_token = ?next_pc.task_token,
317
+ "Unknown activity task when issuing cancel");
318
+ // If we can't find the activity here, it's already been completed,
319
+ // in which case issuing a cancel again is pointless.
320
+ None
321
+ }
322
+ }
323
+ ActivityTaskSource::PendingStart(res) => {
324
+ Some(res.map(|(task, is_eager)| {
325
+ Self::about_to_issue_task(
326
+ outstanding_tasks,
327
+ task,
328
+ is_eager,
329
+ metrics,
330
+ )
331
+ }))
332
+ }
333
+ }
334
+ }
335
+ })
336
+ .take_until(async move {
337
+ start_tasks_stream_complete.cancelled().await;
338
+ // Issue cancels for any still-living act tasks after the grace period
339
+ let (grace_killer, stop_grace) = futures_util::future::abortable(async {
340
+ if let Some(gp) = grace_period {
341
+ // Make sure we've waited at least the grace period. This way if waiting for
342
+ // starts to finish took a while, we subtract that from the grace period.
343
+ tokio::time::sleep(gp).await;
344
+ for mapref in outstanding_tasks_clone.iter() {
345
+ let _ = cancels_tx.send(PendingActivityCancel::new(
346
+ mapref.key().clone(),
347
+ ActivityCancelReason::WorkerShutdown,
348
+ ));
349
+ }
350
+ }
351
+ });
352
+ join!(
353
+ async {
354
+ while !outstanding_tasks_clone.is_empty() {
355
+ complete_notify.notified().await
356
+ }
357
+ // If we were waiting for the grace period but everything already finished,
358
+ // we don't need to keep waiting.
359
+ stop_grace.abort();
360
+ },
361
+ grace_killer
362
+ )
363
+ })
177
364
  }
178
365
 
179
- pub(crate) async fn shutdown(self) {
180
- self.poller.shutdown_box().await;
366
+ pub(crate) fn initiate_shutdown(&self) {
367
+ self.poller_shutdown_token.cancel();
368
+ self.eager_activities_semaphore.close();
369
+ }
370
+
371
+ pub(crate) async fn shutdown(&self) {
372
+ self.initiate_shutdown();
373
+ self.poll_returned_shutdown_token.cancelled().await;
181
374
  self.heartbeat_manager.shutdown().await;
182
375
  }
183
376
 
184
- /// Wait until not at the outstanding activity limit, and then poll for an activity task.
377
+ /// Exclusive poll for activity tasks
185
378
  ///
186
- /// Returns `Ok(None)` if no activity is ready and the overall polling loop should be retried.
187
- pub(crate) async fn poll(&self) -> Result<Option<ActivityTask>, PollActivityError> {
188
- let poll_with_semaphore = async {
189
- // Acquire and subsequently forget a permit for an outstanding activity. When they are
190
- // completed, we must add a new permit to the semaphore, since holding the permit the
191
- // entire time lang does work would be a challenge.
192
- let perm = self
193
- .activities_semaphore
194
- .acquire_owned()
195
- .await
196
- .expect("outstanding activity semaphore not closed");
197
- if let Some(ref rl) = self.ratelimiter {
198
- rl.until_ready().await;
199
- }
200
- (self.poller.poll().await, perm)
201
- };
202
-
203
- tokio::select! {
204
- biased;
205
-
206
- cancel_task = self.next_pending_cancel_task() => {
207
- cancel_task
208
- }
209
- task = self.non_poll_tasks.next() => {
210
- Ok(Some(self.about_to_issue_task(task, true)))
211
- }
212
- (work, permit) = poll_with_semaphore => {
213
- match work {
214
- Some(Ok(work)) => {
215
- if work == PollActivityTaskQueueResponse::default() {
216
- // Timeout
217
- self.metrics.act_poll_timeout();
218
- return Ok(None)
219
- }
220
- let work = self.about_to_issue_task(PermittedTqResp {
221
- resp: work, permit
222
- }, false);
223
- Ok(Some(work))
224
- }
225
- None => {
226
- Err(PollActivityError::ShutDown)
227
- }
228
- Some(Err(e)) => Err(e.into())
229
- }
230
- }
231
- }
379
+ /// Polls the various task sources (server polls, eager activities, cancellations) while
380
+ /// respecting the provided rate limits and allowed concurrency. Returns
381
+ /// [PollActivityError::ShutDown] after shutdown is completed and all tasks sources are
382
+ /// depleted.
383
+ pub(crate) async fn poll(&self) -> Result<ActivityTask, PollActivityError> {
384
+ let mut poller_stream = self.activity_task_stream.lock().await;
385
+ poller_stream.next().await.unwrap_or_else(|| {
386
+ self.poll_returned_shutdown_token.cancel();
387
+ Err(PollActivityError::ShutDown)
388
+ })
232
389
  }
233
390
 
234
391
  pub(crate) async fn complete(
@@ -266,22 +423,40 @@ impl WorkerActivityTasks {
266
423
  .err()
267
424
  }
268
425
  aer::Status::Cancelled(ar::Cancellation { failure }) => {
269
- let details = if let Some(Failure {
270
- failure_info:
271
- Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo { details })),
272
- ..
273
- }) = failure
274
- {
275
- details
426
+ if matches!(
427
+ act_info.issued_cancel_to_lang,
428
+ Some(ActivityCancelReason::WorkerShutdown),
429
+ ) {
430
+ // We don't report cancels for graceful shutdown as failures, so we
431
+ // don't wait for the whole timeout to elapse, which is what would
432
+ // happen anyway.
433
+ client
434
+ .fail_activity_task(
435
+ task_token.clone(),
436
+ Some(worker_shutdown_failure()),
437
+ )
438
+ .await
439
+ .err()
276
440
  } else {
277
- warn!(task_token = ? task_token,
441
+ let details = if let Some(Failure {
442
+ failure_info:
443
+ Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo {
444
+ details,
445
+ })),
446
+ ..
447
+ }) = failure
448
+ {
449
+ details
450
+ } else {
451
+ warn!(task_token = ? task_token,
278
452
  "Expected activity cancelled status with CanceledFailureInfo");
279
- None
280
- };
281
- client
282
- .cancel_activity_task(task_token.clone(), details.map(Into::into))
283
- .await
284
- .err()
453
+ None
454
+ };
455
+ client
456
+ .cancel_activity_task(task_token.clone(), details.map(Into::into))
457
+ .await
458
+ .err()
459
+ }
285
460
  }
286
461
  };
287
462
 
@@ -338,48 +513,21 @@ impl WorkerActivityTasks {
338
513
  /// Returns a handle that the workflows management side can use to interact with this manager
339
514
  pub(crate) fn get_handle_for_workflows(&self) -> ActivitiesFromWFTsHandle {
340
515
  ActivitiesFromWFTsHandle {
341
- sem: self.activities_semaphore.clone(),
342
- tx: self.non_poll_tasks.tx.clone(),
516
+ sem: self.eager_activities_semaphore.clone(),
517
+ tx: self.eager_activities_tx.clone(),
343
518
  }
344
519
  }
345
520
 
346
- async fn next_pending_cancel_task(&self) -> Result<Option<ActivityTask>, PollActivityError> {
347
- let next_pc = self.heartbeat_manager.next_pending_cancel().await;
348
- // Issue cancellations for anything we noticed was cancelled during heartbeating
349
- if let Some(PendingActivityCancel { task_token, reason }) = next_pc {
350
- // It's possible that activity has been completed and we no longer have an
351
- // outstanding activity task. This is fine because it means that we no
352
- // longer need to cancel this activity, so we'll just ignore such orphaned
353
- // cancellations.
354
- if let Some(mut details) = self.outstanding_activity_tasks.get_mut(&task_token) {
355
- if details.issued_cancel_to_lang {
356
- // Don't double-issue cancellations
357
- return Ok(None);
358
- }
359
-
360
- details.issued_cancel_to_lang = true;
361
- if reason == ActivityCancelReason::NotFound {
362
- details.known_not_found = true;
363
- }
364
- Ok(Some(ActivityTask::cancel_from_ids(task_token.0, reason)))
365
- } else {
366
- debug!(task_token = ?task_token, "Unknown activity task when issuing cancel");
367
- // If we can't find the activity here, it's already been completed,
368
- // in which case issuing a cancel again is pointless.
369
- Ok(None)
370
- }
371
- } else {
372
- // The only situation where the next cancel would return none is if the manager
373
- // was dropped, which can only happen on shutdown.
374
- Err(PollActivityError::ShutDown)
375
- }
376
- }
377
-
378
- /// Called when there is a new act task about to be bubbled up out of the manager
379
- fn about_to_issue_task(&self, task: PermittedTqResp, is_eager: bool) -> ActivityTask {
521
+ /// Called when there is a new [ActivityTask] about to be bubbled up out of the poller
522
+ fn about_to_issue_task(
523
+ outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
524
+ task: PermittedTqResp,
525
+ is_eager: bool,
526
+ metrics: MetricsContext,
527
+ ) -> ActivityTask {
380
528
  if let Some(ref act_type) = task.resp.activity_type {
381
529
  if let Some(ref wf_type) = task.resp.workflow_type {
382
- self.metrics
530
+ metrics
383
531
  .with_new_attrs([
384
532
  activity_type(act_type.name.clone()),
385
533
  workflow_type(wf_type.name.clone()),
@@ -392,12 +540,12 @@ impl WorkerActivityTasks {
392
540
  // activity_type and workflow_type, we won't bother.
393
541
 
394
542
  if let Some(dur) = task.resp.sched_to_start() {
395
- self.metrics.act_sched_to_start_latency(dur);
543
+ metrics.act_sched_to_start_latency(dur);
396
544
  };
397
545
 
398
- self.outstanding_activity_tasks.insert(
546
+ outstanding_tasks.insert(
399
547
  task.resp.task_token.clone().into(),
400
- RemoteInFlightActInfo::new(&task.resp, task.permit),
548
+ RemoteInFlightActInfo::new(&task.resp, task.permit.into_used()),
401
549
  );
402
550
 
403
551
  ActivityTask::start_from_poll_resp(task.resp)
@@ -405,40 +553,65 @@ impl WorkerActivityTasks {
405
553
 
406
554
  #[cfg(test)]
407
555
  pub(crate) fn remaining_activity_capacity(&self) -> usize {
408
- self.activities_semaphore.available_permits()
556
+ self.eager_activities_semaphore.available_permits()
409
557
  }
410
558
  }
411
559
 
412
560
  /// Provides facilities for the workflow side of things to interact with the activity manager.
413
561
  /// Allows for the handling of activities returned by WFT completions.
414
562
  pub(crate) struct ActivitiesFromWFTsHandle {
415
- sem: Arc<MeteredSemaphore>,
416
- tx: async_channel::Sender<PermittedTqResp>,
563
+ sem: Arc<ClosableMeteredSemaphore>,
564
+ tx: UnboundedSender<TrackedPermittedTqResp>,
417
565
  }
418
566
 
419
567
  impl ActivitiesFromWFTsHandle {
420
568
  /// Returns a handle that can be used to reserve an activity slot. EX: When requesting eager
421
569
  /// dispatch of an activity to this worker upon workflow task completion
422
- pub(crate) fn reserve_slot(&self) -> Option<OwnedMeteredSemPermit> {
570
+ pub(crate) fn reserve_slot(&self) -> Option<TrackedOwnedMeteredSemPermit> {
571
+ // TODO: check if rate limit is not exceeded and count this reservation towards the rate limit
423
572
  self.sem.try_acquire_owned().ok()
424
573
  }
425
574
 
426
575
  /// Queue new activity tasks for dispatch received from non-polling sources (ex: eager returns
427
576
  /// from WFT completion)
428
- pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item = PermittedTqResp>) {
577
+ pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item = TrackedPermittedTqResp>) {
429
578
  for t in tasks.into_iter() {
430
579
  // Technically we should be reporting `activity_task_received` here, but for simplicity
431
580
  // and time insensitivity, that metric is tracked in `about_to_issue_task`.
432
- self.tx.try_send(t).expect("Receive half cannot be dropped");
581
+ self.tx.send(t).expect("Receive half cannot be dropped");
433
582
  }
434
583
  }
435
584
  }
436
585
 
586
+ #[derive(Debug)]
437
587
  pub(crate) struct PermittedTqResp {
438
588
  pub permit: OwnedMeteredSemPermit,
439
589
  pub resp: PollActivityTaskQueueResponse,
440
590
  }
441
591
 
592
+ #[derive(Debug)]
593
+ pub(crate) struct TrackedPermittedTqResp {
594
+ pub permit: TrackedOwnedMeteredSemPermit,
595
+ pub resp: PollActivityTaskQueueResponse,
596
+ }
597
+
598
+ fn worker_shutdown_failure() -> Failure {
599
+ Failure {
600
+ message: "Worker is shutting down and this activity did not complete in time".to_string(),
601
+ source: "".to_string(),
602
+ stack_trace: "".to_string(),
603
+ encoded_attributes: None,
604
+ cause: None,
605
+ failure_info: Some(FailureInfo::ApplicationFailureInfo(
606
+ ApplicationFailureInfo {
607
+ r#type: "WorkerShutdown".to_string(),
608
+ non_retryable: false,
609
+ details: None,
610
+ },
611
+ )),
612
+ }
613
+ }
614
+
442
615
  #[cfg(test)]
443
616
  mod tests {
444
617
  use super::*;
@@ -470,10 +643,11 @@ mod tests {
470
643
  MetricsContext::no_op(),
471
644
  Duration::from_secs(1),
472
645
  Duration::from_secs(1),
646
+ None,
473
647
  );
474
648
  let start = Instant::now();
475
- atm.poll().await.unwrap().unwrap();
476
- atm.poll().await.unwrap().unwrap();
649
+ atm.poll().await.unwrap();
650
+ atm.poll().await.unwrap();
477
651
  // At least half a second will have elapsed since we only allow 2 tasks per second.
478
652
  // With no ratelimit, even on a slow CI server with lots of load, this would typically take
479
653
  // low single digit ms or less.