@temporalio/core-bridge 1.6.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +520 -456
- package/lib/index.d.ts +8 -6
- package/lib/index.js.map +1 -1
- package/package.json +8 -3
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.buildkite/docker/Dockerfile +2 -2
- package/sdk-core/.buildkite/docker/docker-compose.yaml +1 -1
- package/sdk-core/.buildkite/pipeline.yml +1 -1
- package/sdk-core/.github/workflows/heavy.yml +1 -0
- package/sdk-core/README.md +13 -7
- package/sdk-core/client/src/lib.rs +27 -9
- package/sdk-core/client/src/metrics.rs +17 -8
- package/sdk-core/client/src/raw.rs +3 -3
- package/sdk-core/core/Cargo.toml +3 -4
- package/sdk-core/core/src/abstractions/take_cell.rs +28 -0
- package/sdk-core/core/src/abstractions.rs +197 -18
- package/sdk-core/core/src/core_tests/activity_tasks.rs +137 -45
- package/sdk-core/core/src/core_tests/child_workflows.rs +6 -5
- package/sdk-core/core/src/core_tests/determinism.rs +212 -2
- package/sdk-core/core/src/core_tests/local_activities.rs +183 -36
- package/sdk-core/core/src/core_tests/queries.rs +32 -14
- package/sdk-core/core/src/core_tests/workers.rs +8 -5
- package/sdk-core/core/src/core_tests/workflow_tasks.rs +340 -51
- package/sdk-core/core/src/ephemeral_server/mod.rs +110 -8
- package/sdk-core/core/src/internal_flags.rs +141 -0
- package/sdk-core/core/src/lib.rs +14 -9
- package/sdk-core/core/src/replay/mod.rs +16 -27
- package/sdk-core/core/src/telemetry/metrics.rs +69 -35
- package/sdk-core/core/src/telemetry/mod.rs +38 -14
- package/sdk-core/core/src/telemetry/prometheus_server.rs +19 -13
- package/sdk-core/core/src/test_help/mod.rs +65 -13
- package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +119 -160
- package/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
- package/sdk-core/core/src/worker/activities/local_activities.rs +122 -6
- package/sdk-core/core/src/worker/activities.rs +347 -173
- package/sdk-core/core/src/worker/client/mocks.rs +22 -2
- package/sdk-core/core/src/worker/client.rs +18 -2
- package/sdk-core/core/src/worker/mod.rs +137 -44
- package/sdk-core/core/src/worker/workflow/history_update.rs +132 -51
- package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +207 -166
- package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +6 -7
- package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +6 -7
- package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +157 -82
- package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +12 -12
- package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +6 -7
- package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +13 -15
- package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +170 -60
- package/sdk-core/core/src/worker/workflow/machines/mod.rs +24 -16
- package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +6 -8
- package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +320 -204
- package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +10 -13
- package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +15 -23
- package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +187 -46
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +237 -111
- package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +13 -13
- package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +10 -6
- package/sdk-core/core/src/worker/workflow/managed_run.rs +81 -62
- package/sdk-core/core/src/worker/workflow/mod.rs +341 -79
- package/sdk-core/core/src/worker/workflow/run_cache.rs +18 -11
- package/sdk-core/core/src/worker/workflow/wft_extraction.rs +15 -3
- package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +2 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream.rs +75 -52
- package/sdk-core/core-api/Cargo.toml +0 -1
- package/sdk-core/core-api/src/lib.rs +13 -7
- package/sdk-core/core-api/src/telemetry.rs +4 -6
- package/sdk-core/core-api/src/worker.rs +5 -0
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +80 -55
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +22 -68
- package/sdk-core/histories/ends_empty_wft_complete.bin +0 -0
- package/sdk-core/histories/old_change_marker_format.bin +0 -0
- package/sdk-core/protos/api_upstream/.github/CODEOWNERS +2 -1
- package/sdk-core/protos/api_upstream/Makefile +1 -1
- package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +5 -17
- package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +11 -0
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +1 -6
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +6 -6
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +5 -0
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +22 -6
- package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +48 -19
- package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -0
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +3 -0
- package/sdk-core/protos/api_upstream/temporal/api/{enums/v1/interaction_type.proto → protocol/v1/message.proto} +29 -11
- package/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
- package/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +111 -0
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +59 -28
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
- package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +7 -8
- package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +10 -7
- package/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +19 -30
- package/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
- package/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
- package/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +8 -0
- package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +65 -60
- package/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +85 -84
- package/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +9 -3
- package/sdk-core/sdk/Cargo.toml +1 -1
- package/sdk-core/sdk/src/lib.rs +21 -5
- package/sdk-core/sdk/src/workflow_context/options.rs +7 -1
- package/sdk-core/sdk/src/workflow_context.rs +24 -17
- package/sdk-core/sdk/src/workflow_future.rs +9 -3
- package/sdk-core/sdk-core-protos/src/history_builder.rs +114 -89
- package/sdk-core/sdk-core-protos/src/history_info.rs +6 -1
- package/sdk-core/sdk-core-protos/src/lib.rs +205 -64
- package/sdk-core/test-utils/src/canned_histories.rs +106 -296
- package/sdk-core/test-utils/src/lib.rs +32 -5
- package/sdk-core/tests/heavy_tests.rs +10 -43
- package/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
- package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -3
- package/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
- package/sdk-core/tests/integ_tests/polling_tests.rs +3 -8
- package/sdk-core/tests/integ_tests/queries_tests.rs +4 -2
- package/sdk-core/tests/integ_tests/visibility_tests.rs +34 -23
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +97 -81
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +80 -3
- package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +5 -1
- package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +25 -3
- package/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +2 -4
- package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +30 -0
- package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +64 -0
- package/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +4 -0
- package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -1
- package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +7 -2
- package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -7
- package/sdk-core/tests/integ_tests/workflow_tests.rs +8 -8
- package/sdk-core/tests/main.rs +16 -25
- package/sdk-core/tests/runner.rs +11 -9
- package/src/conversions.rs +14 -8
- package/src/runtime.rs +9 -8
- package/ts/index.ts +8 -6
- package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +0 -87
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
mod activity_heartbeat_manager;
|
|
2
|
+
mod activity_task_poller_stream;
|
|
2
3
|
mod local_activities;
|
|
3
4
|
|
|
4
5
|
pub(crate) use local_activities::{
|
|
@@ -8,26 +9,34 @@ pub(crate) use local_activities::{
|
|
|
8
9
|
};
|
|
9
10
|
|
|
10
11
|
use crate::{
|
|
11
|
-
abstractions::{
|
|
12
|
+
abstractions::{
|
|
13
|
+
ClosableMeteredSemaphore, MeteredSemaphore, OwnedMeteredSemPermit,
|
|
14
|
+
TrackedOwnedMeteredSemPermit, UsedMeteredSemPermit,
|
|
15
|
+
},
|
|
12
16
|
pollers::BoxedActPoller,
|
|
13
17
|
telemetry::metrics::{
|
|
14
18
|
activity_type, activity_worker_type, eager, workflow_type, MetricsContext,
|
|
15
19
|
},
|
|
16
20
|
worker::{
|
|
17
|
-
activities::
|
|
21
|
+
activities::{
|
|
22
|
+
activity_heartbeat_manager::ActivityHeartbeatError,
|
|
23
|
+
activity_task_poller_stream::new_activity_task_poller,
|
|
24
|
+
},
|
|
25
|
+
client::WorkerClient,
|
|
18
26
|
},
|
|
19
27
|
PollActivityError, TaskToken,
|
|
20
28
|
};
|
|
21
29
|
use activity_heartbeat_manager::ActivityHeartbeatManager;
|
|
22
30
|
use dashmap::DashMap;
|
|
23
|
-
use
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
Quota, RateLimiter,
|
|
31
|
+
use futures::{
|
|
32
|
+
stream,
|
|
33
|
+
stream::{BoxStream, PollNext},
|
|
34
|
+
Stream, StreamExt,
|
|
28
35
|
};
|
|
36
|
+
use governor::{Quota, RateLimiter};
|
|
29
37
|
use std::{
|
|
30
38
|
convert::TryInto,
|
|
39
|
+
future,
|
|
31
40
|
sync::Arc,
|
|
32
41
|
time::{Duration, Instant},
|
|
33
42
|
};
|
|
@@ -38,13 +47,23 @@ use temporal_sdk_core_protos::{
|
|
|
38
47
|
ActivityHeartbeat,
|
|
39
48
|
},
|
|
40
49
|
temporal::api::{
|
|
41
|
-
failure::v1::{failure::FailureInfo, CanceledFailureInfo, Failure},
|
|
50
|
+
failure::v1::{failure::FailureInfo, ApplicationFailureInfo, CanceledFailureInfo, Failure},
|
|
42
51
|
workflowservice::v1::PollActivityTaskQueueResponse,
|
|
43
52
|
},
|
|
44
53
|
};
|
|
45
|
-
use tokio::
|
|
54
|
+
use tokio::{
|
|
55
|
+
join,
|
|
56
|
+
sync::{
|
|
57
|
+
mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
|
|
58
|
+
Mutex, Notify,
|
|
59
|
+
},
|
|
60
|
+
};
|
|
61
|
+
use tokio_stream::wrappers::UnboundedReceiverStream;
|
|
62
|
+
use tokio_util::sync::CancellationToken;
|
|
46
63
|
use tracing::Span;
|
|
47
64
|
|
|
65
|
+
type OutstandingActMap = Arc<DashMap<TaskToken, RemoteInFlightActInfo>>;
|
|
66
|
+
|
|
48
67
|
#[derive(Debug, derive_more::Constructor)]
|
|
49
68
|
struct PendingActivityCancel {
|
|
50
69
|
task_token: TaskToken,
|
|
@@ -68,17 +87,18 @@ struct RemoteInFlightActInfo {
|
|
|
68
87
|
pub base: InFlightActInfo,
|
|
69
88
|
/// Used to calculate aggregation delay between activity heartbeats.
|
|
70
89
|
pub heartbeat_timeout: Option<prost_types::Duration>,
|
|
71
|
-
/// Set
|
|
72
|
-
|
|
90
|
+
/// Set if we have already issued a cancellation activation to lang for this activity, with
|
|
91
|
+
/// the original reason we issued the cancel.
|
|
92
|
+
pub issued_cancel_to_lang: Option<ActivityCancelReason>,
|
|
73
93
|
/// Set to true if we have already learned from the server this activity doesn't exist. EX:
|
|
74
94
|
/// we have learned from heartbeating and issued a cancel task, in which case we may simply
|
|
75
95
|
/// discard the reply.
|
|
76
96
|
pub known_not_found: bool,
|
|
77
97
|
/// The permit from the max concurrent semaphore
|
|
78
|
-
_permit:
|
|
98
|
+
_permit: UsedMeteredSemPermit,
|
|
79
99
|
}
|
|
80
100
|
impl RemoteInFlightActInfo {
|
|
81
|
-
fn new(poll_resp: &PollActivityTaskQueueResponse, permit:
|
|
101
|
+
fn new(poll_resp: &PollActivityTaskQueueResponse, permit: UsedMeteredSemPermit) -> Self {
|
|
82
102
|
let wec = poll_resp.workflow_execution.clone().unwrap_or_default();
|
|
83
103
|
Self {
|
|
84
104
|
base: InFlightActInfo {
|
|
@@ -89,53 +109,51 @@ impl RemoteInFlightActInfo {
|
|
|
89
109
|
start_time: Instant::now(),
|
|
90
110
|
},
|
|
91
111
|
heartbeat_timeout: poll_resp.heartbeat_timeout.clone(),
|
|
92
|
-
issued_cancel_to_lang:
|
|
112
|
+
issued_cancel_to_lang: None,
|
|
93
113
|
known_not_found: false,
|
|
94
114
|
_permit: permit,
|
|
95
115
|
}
|
|
96
116
|
}
|
|
97
117
|
}
|
|
98
118
|
|
|
99
|
-
struct NonPollActBuffer {
|
|
100
|
-
tx: async_channel::Sender<PermittedTqResp>,
|
|
101
|
-
rx: async_channel::Receiver<PermittedTqResp>,
|
|
102
|
-
}
|
|
103
|
-
impl NonPollActBuffer {
|
|
104
|
-
pub fn new() -> Self {
|
|
105
|
-
let (tx, rx) = async_channel::unbounded();
|
|
106
|
-
Self { tx, rx }
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
pub async fn next(&self) -> PermittedTqResp {
|
|
110
|
-
self.rx.recv().await.expect("Send half cannot be dropped")
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
|
|
114
119
|
pub(crate) struct WorkerActivityTasks {
|
|
120
|
+
/// Token used to signal the server task poller that shutdown is beginning
|
|
121
|
+
poller_shutdown_token: CancellationToken,
|
|
115
122
|
/// Centralizes management of heartbeat issuing / throttling
|
|
116
123
|
heartbeat_manager: ActivityHeartbeatManager,
|
|
124
|
+
/// Combined stream for any ActivityTask producing source (polls, eager activities,
|
|
125
|
+
/// cancellations)
|
|
126
|
+
activity_task_stream: Mutex<BoxStream<'static, Result<ActivityTask, PollActivityError>>>,
|
|
117
127
|
/// Activities that have been issued to lang but not yet completed
|
|
118
|
-
outstanding_activity_tasks:
|
|
119
|
-
///
|
|
120
|
-
///
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
/// workflow task completion.
|
|
124
|
-
|
|
125
|
-
///
|
|
126
|
-
|
|
127
|
-
/// Enables per-worker rate-limiting of activity tasks
|
|
128
|
-
ratelimiter: Option<RateLimiter<NotKeyed, InMemoryState, DefaultClock, NoOpMiddleware>>,
|
|
129
|
-
/// Wakes every time an activity is removed from the outstanding map
|
|
130
|
-
complete_notify: Notify,
|
|
128
|
+
outstanding_activity_tasks: OutstandingActMap,
|
|
129
|
+
/// Ensures we don't exceed this worker's maximum concurrent activity limit. This
|
|
130
|
+
/// semaphore is used to limit eager activities but shares the same underlying
|
|
131
|
+
/// [MeteredSemaphore] that is used to limit the concurrency for non-eager activities.
|
|
132
|
+
eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
|
|
133
|
+
/// Holds activity tasks we have received in direct response to workflow task completion (a.k.a
|
|
134
|
+
/// eager activities). Tasks received in this stream hold a "tracked" permit that is issued by
|
|
135
|
+
/// the `eager_activities_semaphore`.
|
|
136
|
+
eager_activities_tx: UnboundedSender<TrackedPermittedTqResp>,
|
|
131
137
|
|
|
132
138
|
metrics: MetricsContext,
|
|
133
139
|
|
|
134
140
|
max_heartbeat_throttle_interval: Duration,
|
|
135
141
|
default_heartbeat_throttle_interval: Duration,
|
|
142
|
+
|
|
143
|
+
/// Wakes every time an activity is removed from the outstanding map
|
|
144
|
+
complete_notify: Arc<Notify>,
|
|
145
|
+
/// Token to notify when poll returned a shutdown error
|
|
146
|
+
poll_returned_shutdown_token: CancellationToken,
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
#[derive(derive_more::From)]
|
|
150
|
+
enum ActivityTaskSource {
|
|
151
|
+
PendingCancel(PendingActivityCancel),
|
|
152
|
+
PendingStart(Result<(PermittedTqResp, bool), PollActivityError>),
|
|
136
153
|
}
|
|
137
154
|
|
|
138
155
|
impl WorkerActivityTasks {
|
|
156
|
+
#[allow(clippy::too_many_arguments)]
|
|
139
157
|
pub(crate) fn new(
|
|
140
158
|
max_activity_tasks: usize,
|
|
141
159
|
max_worker_act_per_sec: Option<f64>,
|
|
@@ -144,91 +162,230 @@ impl WorkerActivityTasks {
|
|
|
144
162
|
metrics: MetricsContext,
|
|
145
163
|
max_heartbeat_throttle_interval: Duration,
|
|
146
164
|
default_heartbeat_throttle_interval: Duration,
|
|
165
|
+
graceful_shutdown: Option<Duration>,
|
|
147
166
|
) -> Self {
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
167
|
+
let semaphore = Arc::new(MeteredSemaphore::new(
|
|
168
|
+
max_activity_tasks,
|
|
169
|
+
metrics.with_new_attrs([activity_worker_type()]),
|
|
170
|
+
MetricsContext::available_task_slots,
|
|
171
|
+
));
|
|
172
|
+
let poller_shutdown_token = CancellationToken::new();
|
|
173
|
+
let rate_limiter = max_worker_act_per_sec.and_then(|ps| {
|
|
174
|
+
Quota::with_period(Duration::from_secs_f64(ps.recip())).map(RateLimiter::direct)
|
|
175
|
+
});
|
|
176
|
+
let outstanding_activity_tasks = Arc::new(DashMap::new());
|
|
177
|
+
let server_poller_stream = new_activity_task_poller(
|
|
151
178
|
poller,
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
179
|
+
semaphore.clone(),
|
|
180
|
+
rate_limiter,
|
|
181
|
+
metrics.clone(),
|
|
182
|
+
poller_shutdown_token.clone(),
|
|
183
|
+
);
|
|
184
|
+
let (eager_activities_tx, eager_activities_rx) = unbounded_channel();
|
|
185
|
+
let eager_activities_semaphore = ClosableMeteredSemaphore::new_arc(semaphore);
|
|
186
|
+
|
|
187
|
+
let start_tasks_stream_complete = CancellationToken::new();
|
|
188
|
+
let starts_stream = Self::merge_start_task_sources(
|
|
189
|
+
eager_activities_rx,
|
|
190
|
+
server_poller_stream,
|
|
191
|
+
eager_activities_semaphore.clone(),
|
|
192
|
+
start_tasks_stream_complete.clone(),
|
|
193
|
+
);
|
|
194
|
+
let (cancels_tx, cancels_rx) = unbounded_channel();
|
|
195
|
+
let heartbeat_manager = ActivityHeartbeatManager::new(client, cancels_tx.clone());
|
|
196
|
+
let complete_notify = Arc::new(Notify::new());
|
|
197
|
+
let source_stream = stream::select_with_strategy(
|
|
198
|
+
UnboundedReceiverStream::new(cancels_rx).map(ActivityTaskSource::from),
|
|
199
|
+
starts_stream.map(ActivityTaskSource::from),
|
|
200
|
+
|_: &mut ()| PollNext::Left,
|
|
201
|
+
);
|
|
202
|
+
// Create a task stream composed of (in poll preference order):
|
|
203
|
+
// cancels_stream ------------------------------+--- activity_task_stream
|
|
204
|
+
// eager_activities_rx ---+--- starts_stream ---|
|
|
205
|
+
// server_poll_stream ---|
|
|
206
|
+
let activity_task_stream = Self::merge_source_streams(
|
|
207
|
+
source_stream,
|
|
208
|
+
outstanding_activity_tasks.clone(),
|
|
209
|
+
start_tasks_stream_complete,
|
|
210
|
+
complete_notify.clone(),
|
|
211
|
+
graceful_shutdown,
|
|
212
|
+
cancels_tx,
|
|
213
|
+
metrics.clone(),
|
|
214
|
+
);
|
|
215
|
+
|
|
216
|
+
Self {
|
|
217
|
+
poller_shutdown_token,
|
|
218
|
+
eager_activities_tx,
|
|
219
|
+
heartbeat_manager,
|
|
220
|
+
activity_task_stream: Mutex::new(activity_task_stream.boxed()),
|
|
221
|
+
eager_activities_semaphore,
|
|
222
|
+
complete_notify,
|
|
162
223
|
metrics,
|
|
163
224
|
max_heartbeat_throttle_interval,
|
|
164
225
|
default_heartbeat_throttle_interval,
|
|
226
|
+
poll_returned_shutdown_token: CancellationToken::new(),
|
|
227
|
+
outstanding_activity_tasks,
|
|
165
228
|
}
|
|
166
229
|
}
|
|
167
230
|
|
|
168
|
-
|
|
169
|
-
|
|
231
|
+
/// Merges the server poll and eager [ActivityTask] sources
|
|
232
|
+
fn merge_start_task_sources(
|
|
233
|
+
non_poll_tasks_rx: UnboundedReceiver<TrackedPermittedTqResp>,
|
|
234
|
+
poller_stream: impl Stream<Item = Result<PermittedTqResp, tonic::Status>>,
|
|
235
|
+
eager_activities_semaphore: Arc<ClosableMeteredSemaphore>,
|
|
236
|
+
on_complete_token: CancellationToken,
|
|
237
|
+
) -> impl Stream<Item = Result<(PermittedTqResp, bool), PollActivityError>> {
|
|
238
|
+
let non_poll_stream = stream::unfold(
|
|
239
|
+
(non_poll_tasks_rx, eager_activities_semaphore),
|
|
240
|
+
|(mut non_poll_tasks_rx, eager_activities_semaphore)| async move {
|
|
241
|
+
loop {
|
|
242
|
+
tokio::select! {
|
|
243
|
+
biased;
|
|
244
|
+
|
|
245
|
+
task_opt = non_poll_tasks_rx.recv() => {
|
|
246
|
+
// Add is_eager true and wrap in Result
|
|
247
|
+
return task_opt.map(|task| (
|
|
248
|
+
Ok((PermittedTqResp{ permit: task.permit.into(), resp: task.resp },
|
|
249
|
+
true)),
|
|
250
|
+
(non_poll_tasks_rx, eager_activities_semaphore)));
|
|
251
|
+
}
|
|
252
|
+
_ = eager_activities_semaphore.close_complete() => {
|
|
253
|
+
// Once shutting down, we stop accepting eager activities
|
|
254
|
+
non_poll_tasks_rx.close();
|
|
255
|
+
continue;
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
},
|
|
260
|
+
);
|
|
261
|
+
// Add is_eager false
|
|
262
|
+
let poller_stream = poller_stream.map(|res| res.map(|task| (task, false)));
|
|
263
|
+
|
|
264
|
+
// Prefer eager activities over polling the server
|
|
265
|
+
stream::select_with_strategy(non_poll_stream, poller_stream, |_: &mut ()| PollNext::Left)
|
|
266
|
+
.map(|res| res.map_err(|err| err.into()))
|
|
267
|
+
// This map, chain, filter_map sequence is here to cancel the token when this stream ends.
|
|
268
|
+
.map(Some)
|
|
269
|
+
.chain(futures::stream::once(async move {
|
|
270
|
+
on_complete_token.cancel();
|
|
271
|
+
None
|
|
272
|
+
}))
|
|
273
|
+
.filter_map(future::ready)
|
|
170
274
|
}
|
|
171
275
|
|
|
172
|
-
///
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
276
|
+
/// Builds an [ActivityTask] stream for both cancellation tasks from cancels delivered from
|
|
277
|
+
/// heartbeats as well as new activity starts
|
|
278
|
+
fn merge_source_streams(
|
|
279
|
+
source_stream: impl Stream<Item = ActivityTaskSource>,
|
|
280
|
+
outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
|
|
281
|
+
start_tasks_stream_complete: CancellationToken,
|
|
282
|
+
complete_notify: Arc<Notify>,
|
|
283
|
+
grace_period: Option<Duration>,
|
|
284
|
+
cancels_tx: UnboundedSender<PendingActivityCancel>,
|
|
285
|
+
metrics: MetricsContext,
|
|
286
|
+
) -> impl Stream<Item = Result<ActivityTask, PollActivityError>> {
|
|
287
|
+
let outstanding_tasks_clone = outstanding_tasks.clone();
|
|
288
|
+
source_stream
|
|
289
|
+
.filter_map(move |source| {
|
|
290
|
+
let outstanding_tasks = outstanding_tasks.clone();
|
|
291
|
+
let metrics = metrics.clone();
|
|
292
|
+
async move {
|
|
293
|
+
match source {
|
|
294
|
+
ActivityTaskSource::PendingCancel(next_pc) => {
|
|
295
|
+
// It's possible that activity has been completed and we no longer have
|
|
296
|
+
// an outstanding activity task. This is fine because it means that we
|
|
297
|
+
// no longer need to cancel this activity, so we'll just ignore such
|
|
298
|
+
// orphaned cancellations.
|
|
299
|
+
if let Some(mut details) =
|
|
300
|
+
outstanding_tasks.get_mut(&next_pc.task_token)
|
|
301
|
+
{
|
|
302
|
+
if details.issued_cancel_to_lang.is_some() {
|
|
303
|
+
// Don't double-issue cancellations
|
|
304
|
+
return None;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
details.issued_cancel_to_lang = Some(next_pc.reason);
|
|
308
|
+
if next_pc.reason == ActivityCancelReason::NotFound {
|
|
309
|
+
details.known_not_found = true;
|
|
310
|
+
}
|
|
311
|
+
Some(Ok(ActivityTask::cancel_from_ids(
|
|
312
|
+
next_pc.task_token.0,
|
|
313
|
+
next_pc.reason,
|
|
314
|
+
)))
|
|
315
|
+
} else {
|
|
316
|
+
debug!(task_token = ?next_pc.task_token,
|
|
317
|
+
"Unknown activity task when issuing cancel");
|
|
318
|
+
// If we can't find the activity here, it's already been completed,
|
|
319
|
+
// in which case issuing a cancel again is pointless.
|
|
320
|
+
None
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
ActivityTaskSource::PendingStart(res) => {
|
|
324
|
+
Some(res.map(|(task, is_eager)| {
|
|
325
|
+
Self::about_to_issue_task(
|
|
326
|
+
outstanding_tasks,
|
|
327
|
+
task,
|
|
328
|
+
is_eager,
|
|
329
|
+
metrics,
|
|
330
|
+
)
|
|
331
|
+
}))
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
})
|
|
336
|
+
.take_until(async move {
|
|
337
|
+
start_tasks_stream_complete.cancelled().await;
|
|
338
|
+
// Issue cancels for any still-living act tasks after the grace period
|
|
339
|
+
let (grace_killer, stop_grace) = futures_util::future::abortable(async {
|
|
340
|
+
if let Some(gp) = grace_period {
|
|
341
|
+
// Wait out the full grace period, counted from the moment the start sources
|
|
342
|
+
// finished. Time already spent waiting for starts to finish is not subtracted.
|
|
343
|
+
tokio::time::sleep(gp).await;
|
|
344
|
+
for mapref in outstanding_tasks_clone.iter() {
|
|
345
|
+
let _ = cancels_tx.send(PendingActivityCancel::new(
|
|
346
|
+
mapref.key().clone(),
|
|
347
|
+
ActivityCancelReason::WorkerShutdown,
|
|
348
|
+
));
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
});
|
|
352
|
+
join!(
|
|
353
|
+
async {
|
|
354
|
+
while !outstanding_tasks_clone.is_empty() {
|
|
355
|
+
complete_notify.notified().await
|
|
356
|
+
}
|
|
357
|
+
// If we were waiting for the grace period but everything already finished,
|
|
358
|
+
// we don't need to keep waiting.
|
|
359
|
+
stop_grace.abort();
|
|
360
|
+
},
|
|
361
|
+
grace_killer
|
|
362
|
+
)
|
|
363
|
+
})
|
|
177
364
|
}
|
|
178
365
|
|
|
179
|
-
pub(crate)
|
|
180
|
-
self.
|
|
366
|
+
pub(crate) fn initiate_shutdown(&self) {
|
|
367
|
+
self.poller_shutdown_token.cancel();
|
|
368
|
+
self.eager_activities_semaphore.close();
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
pub(crate) async fn shutdown(&self) {
|
|
372
|
+
self.initiate_shutdown();
|
|
373
|
+
self.poll_returned_shutdown_token.cancelled().await;
|
|
181
374
|
self.heartbeat_manager.shutdown().await;
|
|
182
375
|
}
|
|
183
376
|
|
|
184
|
-
///
|
|
377
|
+
/// Exclusive poll for activity tasks
|
|
185
378
|
///
|
|
186
|
-
///
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
.expect("outstanding activity semaphore not closed");
|
|
197
|
-
if let Some(ref rl) = self.ratelimiter {
|
|
198
|
-
rl.until_ready().await;
|
|
199
|
-
}
|
|
200
|
-
(self.poller.poll().await, perm)
|
|
201
|
-
};
|
|
202
|
-
|
|
203
|
-
tokio::select! {
|
|
204
|
-
biased;
|
|
205
|
-
|
|
206
|
-
cancel_task = self.next_pending_cancel_task() => {
|
|
207
|
-
cancel_task
|
|
208
|
-
}
|
|
209
|
-
task = self.non_poll_tasks.next() => {
|
|
210
|
-
Ok(Some(self.about_to_issue_task(task, true)))
|
|
211
|
-
}
|
|
212
|
-
(work, permit) = poll_with_semaphore => {
|
|
213
|
-
match work {
|
|
214
|
-
Some(Ok(work)) => {
|
|
215
|
-
if work == PollActivityTaskQueueResponse::default() {
|
|
216
|
-
// Timeout
|
|
217
|
-
self.metrics.act_poll_timeout();
|
|
218
|
-
return Ok(None)
|
|
219
|
-
}
|
|
220
|
-
let work = self.about_to_issue_task(PermittedTqResp {
|
|
221
|
-
resp: work, permit
|
|
222
|
-
}, false);
|
|
223
|
-
Ok(Some(work))
|
|
224
|
-
}
|
|
225
|
-
None => {
|
|
226
|
-
Err(PollActivityError::ShutDown)
|
|
227
|
-
}
|
|
228
|
-
Some(Err(e)) => Err(e.into())
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
}
|
|
379
|
+
/// Polls the various task sources (server polls, eager activities, cancellations) while
|
|
380
|
+
/// respecting the provided rate limits and allowed concurrency. Returns
|
|
381
|
+
/// [PollActivityError::ShutDown] after shutdown is completed and all task sources are
|
|
382
|
+
/// depleted.
|
|
383
|
+
pub(crate) async fn poll(&self) -> Result<ActivityTask, PollActivityError> {
|
|
384
|
+
let mut poller_stream = self.activity_task_stream.lock().await;
|
|
385
|
+
poller_stream.next().await.unwrap_or_else(|| {
|
|
386
|
+
self.poll_returned_shutdown_token.cancel();
|
|
387
|
+
Err(PollActivityError::ShutDown)
|
|
388
|
+
})
|
|
232
389
|
}
|
|
233
390
|
|
|
234
391
|
pub(crate) async fn complete(
|
|
@@ -266,22 +423,40 @@ impl WorkerActivityTasks {
|
|
|
266
423
|
.err()
|
|
267
424
|
}
|
|
268
425
|
aer::Status::Cancelled(ar::Cancellation { failure }) => {
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
426
|
+
if matches!(
|
|
427
|
+
act_info.issued_cancel_to_lang,
|
|
428
|
+
Some(ActivityCancelReason::WorkerShutdown),
|
|
429
|
+
) {
|
|
430
|
+
// We report cancels for graceful shutdown as failures, so we
|
|
431
|
+
// don't wait for the whole timeout to elapse, which is what would
|
|
432
|
+
// happen anyway.
|
|
433
|
+
client
|
|
434
|
+
.fail_activity_task(
|
|
435
|
+
task_token.clone(),
|
|
436
|
+
Some(worker_shutdown_failure()),
|
|
437
|
+
)
|
|
438
|
+
.await
|
|
439
|
+
.err()
|
|
276
440
|
} else {
|
|
277
|
-
|
|
441
|
+
let details = if let Some(Failure {
|
|
442
|
+
failure_info:
|
|
443
|
+
Some(FailureInfo::CanceledFailureInfo(CanceledFailureInfo {
|
|
444
|
+
details,
|
|
445
|
+
})),
|
|
446
|
+
..
|
|
447
|
+
}) = failure
|
|
448
|
+
{
|
|
449
|
+
details
|
|
450
|
+
} else {
|
|
451
|
+
warn!(task_token = ? task_token,
|
|
278
452
|
"Expected activity cancelled status with CanceledFailureInfo");
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
453
|
+
None
|
|
454
|
+
};
|
|
455
|
+
client
|
|
456
|
+
.cancel_activity_task(task_token.clone(), details.map(Into::into))
|
|
457
|
+
.await
|
|
458
|
+
.err()
|
|
459
|
+
}
|
|
285
460
|
}
|
|
286
461
|
};
|
|
287
462
|
|
|
@@ -338,48 +513,21 @@ impl WorkerActivityTasks {
|
|
|
338
513
|
/// Returns a handle that the workflows management side can use to interact with this manager
|
|
339
514
|
pub(crate) fn get_handle_for_workflows(&self) -> ActivitiesFromWFTsHandle {
|
|
340
515
|
ActivitiesFromWFTsHandle {
|
|
341
|
-
sem: self.
|
|
342
|
-
tx: self.
|
|
516
|
+
sem: self.eager_activities_semaphore.clone(),
|
|
517
|
+
tx: self.eager_activities_tx.clone(),
|
|
343
518
|
}
|
|
344
519
|
}
|
|
345
520
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
// cancellations.
|
|
354
|
-
if let Some(mut details) = self.outstanding_activity_tasks.get_mut(&task_token) {
|
|
355
|
-
if details.issued_cancel_to_lang {
|
|
356
|
-
// Don't double-issue cancellations
|
|
357
|
-
return Ok(None);
|
|
358
|
-
}
|
|
359
|
-
|
|
360
|
-
details.issued_cancel_to_lang = true;
|
|
361
|
-
if reason == ActivityCancelReason::NotFound {
|
|
362
|
-
details.known_not_found = true;
|
|
363
|
-
}
|
|
364
|
-
Ok(Some(ActivityTask::cancel_from_ids(task_token.0, reason)))
|
|
365
|
-
} else {
|
|
366
|
-
debug!(task_token = ?task_token, "Unknown activity task when issuing cancel");
|
|
367
|
-
// If we can't find the activity here, it's already been completed,
|
|
368
|
-
// in which case issuing a cancel again is pointless.
|
|
369
|
-
Ok(None)
|
|
370
|
-
}
|
|
371
|
-
} else {
|
|
372
|
-
// The only situation where the next cancel would return none is if the manager
|
|
373
|
-
// was dropped, which can only happen on shutdown.
|
|
374
|
-
Err(PollActivityError::ShutDown)
|
|
375
|
-
}
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
/// Called when there is a new act task about to be bubbled up out of the manager
|
|
379
|
-
fn about_to_issue_task(&self, task: PermittedTqResp, is_eager: bool) -> ActivityTask {
|
|
521
|
+
/// Called when there is a new [ActivityTask] about to be bubbled up out of the poller
|
|
522
|
+
fn about_to_issue_task(
|
|
523
|
+
outstanding_tasks: Arc<DashMap<TaskToken, RemoteInFlightActInfo>>,
|
|
524
|
+
task: PermittedTqResp,
|
|
525
|
+
is_eager: bool,
|
|
526
|
+
metrics: MetricsContext,
|
|
527
|
+
) -> ActivityTask {
|
|
380
528
|
if let Some(ref act_type) = task.resp.activity_type {
|
|
381
529
|
if let Some(ref wf_type) = task.resp.workflow_type {
|
|
382
|
-
|
|
530
|
+
metrics
|
|
383
531
|
.with_new_attrs([
|
|
384
532
|
activity_type(act_type.name.clone()),
|
|
385
533
|
workflow_type(wf_type.name.clone()),
|
|
@@ -392,12 +540,12 @@ impl WorkerActivityTasks {
|
|
|
392
540
|
// activity_type and workflow_type, we won't bother.
|
|
393
541
|
|
|
394
542
|
if let Some(dur) = task.resp.sched_to_start() {
|
|
395
|
-
|
|
543
|
+
metrics.act_sched_to_start_latency(dur);
|
|
396
544
|
};
|
|
397
545
|
|
|
398
|
-
|
|
546
|
+
outstanding_tasks.insert(
|
|
399
547
|
task.resp.task_token.clone().into(),
|
|
400
|
-
RemoteInFlightActInfo::new(&task.resp, task.permit),
|
|
548
|
+
RemoteInFlightActInfo::new(&task.resp, task.permit.into_used()),
|
|
401
549
|
);
|
|
402
550
|
|
|
403
551
|
ActivityTask::start_from_poll_resp(task.resp)
|
|
@@ -405,40 +553,65 @@ impl WorkerActivityTasks {
|
|
|
405
553
|
|
|
406
554
|
#[cfg(test)]
|
|
407
555
|
pub(crate) fn remaining_activity_capacity(&self) -> usize {
|
|
408
|
-
self.
|
|
556
|
+
self.eager_activities_semaphore.available_permits()
|
|
409
557
|
}
|
|
410
558
|
}
|
|
411
559
|
|
|
412
560
|
/// Provides facilities for the workflow side of things to interact with the activity manager.
|
|
413
561
|
/// Allows for the handling of activities returned by WFT completions.
|
|
414
562
|
pub(crate) struct ActivitiesFromWFTsHandle {
|
|
415
|
-
sem: Arc<
|
|
416
|
-
tx:
|
|
563
|
+
sem: Arc<ClosableMeteredSemaphore>,
|
|
564
|
+
tx: UnboundedSender<TrackedPermittedTqResp>,
|
|
417
565
|
}
|
|
418
566
|
|
|
419
567
|
impl ActivitiesFromWFTsHandle {
|
|
420
568
|
/// Returns a handle that can be used to reserve an activity slot. EX: When requesting eager
|
|
421
569
|
/// dispatch of an activity to this worker upon workflow task completion
|
|
422
|
-
pub(crate) fn reserve_slot(&self) -> Option<
|
|
570
|
+
pub(crate) fn reserve_slot(&self) -> Option<TrackedOwnedMeteredSemPermit> {
|
|
571
|
+
// TODO: check if rate limit is not exceeded and count this reservation towards the rate limit
|
|
423
572
|
self.sem.try_acquire_owned().ok()
|
|
424
573
|
}
|
|
425
574
|
|
|
426
575
|
/// Queue new activity tasks for dispatch received from non-polling sources (ex: eager returns
|
|
427
576
|
/// from WFT completion)
|
|
428
|
-
pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item =
|
|
577
|
+
pub(crate) fn add_tasks(&self, tasks: impl IntoIterator<Item = TrackedPermittedTqResp>) {
|
|
429
578
|
for t in tasks.into_iter() {
|
|
430
579
|
// Technically we should be reporting `activity_task_received` here, but for simplicity
|
|
431
580
|
// and time insensitivity, that metric is tracked in `about_to_issue_task`.
|
|
432
|
-
self.tx.
|
|
581
|
+
self.tx.send(t).expect("Receive half cannot be dropped");
|
|
433
582
|
}
|
|
434
583
|
}
|
|
435
584
|
}
|
|
436
585
|
|
|
586
|
+
#[derive(Debug)]
|
|
437
587
|
pub(crate) struct PermittedTqResp {
|
|
438
588
|
pub permit: OwnedMeteredSemPermit,
|
|
439
589
|
pub resp: PollActivityTaskQueueResponse,
|
|
440
590
|
}
|
|
441
591
|
|
|
592
|
+
#[derive(Debug)]
|
|
593
|
+
pub(crate) struct TrackedPermittedTqResp {
|
|
594
|
+
pub permit: TrackedOwnedMeteredSemPermit,
|
|
595
|
+
pub resp: PollActivityTaskQueueResponse,
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
fn worker_shutdown_failure() -> Failure {
|
|
599
|
+
Failure {
|
|
600
|
+
message: "Worker is shutting down and this activity did not complete in time".to_string(),
|
|
601
|
+
source: "".to_string(),
|
|
602
|
+
stack_trace: "".to_string(),
|
|
603
|
+
encoded_attributes: None,
|
|
604
|
+
cause: None,
|
|
605
|
+
failure_info: Some(FailureInfo::ApplicationFailureInfo(
|
|
606
|
+
ApplicationFailureInfo {
|
|
607
|
+
r#type: "WorkerShutdown".to_string(),
|
|
608
|
+
non_retryable: false,
|
|
609
|
+
details: None,
|
|
610
|
+
},
|
|
611
|
+
)),
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
|
|
442
615
|
#[cfg(test)]
|
|
443
616
|
mod tests {
|
|
444
617
|
use super::*;
|
|
@@ -470,10 +643,11 @@ mod tests {
|
|
|
470
643
|
MetricsContext::no_op(),
|
|
471
644
|
Duration::from_secs(1),
|
|
472
645
|
Duration::from_secs(1),
|
|
646
|
+
None,
|
|
473
647
|
);
|
|
474
648
|
let start = Instant::now();
|
|
475
|
-
atm.poll().await.unwrap()
|
|
476
|
-
atm.poll().await.unwrap()
|
|
649
|
+
atm.poll().await.unwrap();
|
|
650
|
+
atm.poll().await.unwrap();
|
|
477
651
|
// At least half a second will have elapsed since we only allow 2 tasks per second.
|
|
478
652
|
// With no ratelimit, even on a slow CI server with lots of load, this would typically take
|
|
479
653
|
// low single digit ms or less.
|