@temporalio/core-bridge 1.12.0 → 1.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +64 -119
- package/Cargo.toml +1 -1
- package/index.js +3 -2
- package/package.json +3 -3
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.cargo/config.toml +1 -2
- package/sdk-core/.github/workflows/per-pr.yml +2 -0
- package/sdk-core/AGENTS.md +7 -0
- package/sdk-core/Cargo.toml +9 -5
- package/sdk-core/README.md +6 -5
- package/sdk-core/client/Cargo.toml +3 -2
- package/sdk-core/client/src/lib.rs +17 -8
- package/sdk-core/client/src/metrics.rs +57 -23
- package/sdk-core/client/src/raw.rs +33 -15
- package/sdk-core/core/Cargo.toml +11 -9
- package/sdk-core/core/benches/workflow_replay.rs +114 -15
- package/sdk-core/core/src/core_tests/activity_tasks.rs +18 -18
- package/sdk-core/core/src/core_tests/child_workflows.rs +4 -4
- package/sdk-core/core/src/core_tests/determinism.rs +6 -6
- package/sdk-core/core/src/core_tests/local_activities.rs +20 -20
- package/sdk-core/core/src/core_tests/mod.rs +40 -5
- package/sdk-core/core/src/core_tests/queries.rs +25 -16
- package/sdk-core/core/src/core_tests/replay_flag.rs +3 -3
- package/sdk-core/core/src/core_tests/updates.rs +3 -3
- package/sdk-core/core/src/core_tests/workers.rs +9 -7
- package/sdk-core/core/src/core_tests/workflow_tasks.rs +40 -42
- package/sdk-core/core/src/ephemeral_server/mod.rs +1 -19
- package/sdk-core/core/src/lib.rs +10 -1
- package/sdk-core/core/src/pollers/poll_buffer.rs +2 -2
- package/sdk-core/core/src/replay/mod.rs +3 -3
- package/sdk-core/core/src/telemetry/metrics.rs +306 -152
- package/sdk-core/core/src/telemetry/mod.rs +11 -4
- package/sdk-core/core/src/telemetry/otel.rs +134 -131
- package/sdk-core/core/src/telemetry/prometheus_meter.rs +885 -0
- package/sdk-core/core/src/telemetry/prometheus_server.rs +48 -28
- package/sdk-core/core/src/test_help/mod.rs +27 -12
- package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +7 -7
- package/sdk-core/core/src/worker/activities.rs +4 -4
- package/sdk-core/core/src/worker/client/mocks.rs +10 -3
- package/sdk-core/core/src/worker/client.rs +68 -5
- package/sdk-core/core/src/worker/heartbeat.rs +229 -0
- package/sdk-core/core/src/worker/mod.rs +35 -14
- package/sdk-core/core/src/worker/tuner/resource_based.rs +4 -4
- package/sdk-core/core/src/worker/workflow/history_update.rs +71 -19
- package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +1 -1
- package/sdk-core/core/src/worker/workflow/machines/nexus_operation_state_machine.rs +31 -48
- package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +3 -3
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +4 -1
- package/sdk-core/core/src/worker/workflow/managed_run.rs +1 -1
- package/sdk-core/core/src/worker/workflow/mod.rs +15 -15
- package/sdk-core/core-api/Cargo.toml +2 -2
- package/sdk-core/core-api/src/envconfig.rs +204 -99
- package/sdk-core/core-api/src/lib.rs +9 -0
- package/sdk-core/core-api/src/telemetry/metrics.rs +548 -100
- package/sdk-core/core-api/src/worker.rs +11 -5
- package/sdk-core/core-c-bridge/Cargo.toml +49 -0
- package/sdk-core/core-c-bridge/build.rs +26 -0
- package/sdk-core/core-c-bridge/include/temporal-sdk-core-c-bridge.h +817 -0
- package/sdk-core/core-c-bridge/src/client.rs +679 -0
- package/sdk-core/core-c-bridge/src/lib.rs +245 -0
- package/sdk-core/core-c-bridge/src/metric.rs +682 -0
- package/sdk-core/core-c-bridge/src/random.rs +61 -0
- package/sdk-core/core-c-bridge/src/runtime.rs +445 -0
- package/sdk-core/core-c-bridge/src/testing.rs +282 -0
- package/sdk-core/core-c-bridge/src/tests/context.rs +644 -0
- package/sdk-core/core-c-bridge/src/tests/mod.rs +178 -0
- package/sdk-core/core-c-bridge/src/tests/utils.rs +108 -0
- package/sdk-core/core-c-bridge/src/worker.rs +1069 -0
- package/sdk-core/etc/deps.svg +64 -64
- package/sdk-core/sdk/src/activity_context.rs +6 -4
- package/sdk-core/sdk/src/lib.rs +49 -27
- package/sdk-core/sdk/src/workflow_future.rs +18 -25
- package/sdk-core/sdk-core-protos/protos/api_upstream/README.md +4 -0
- package/sdk-core/sdk-core-protos/protos/api_upstream/buf.yaml +0 -2
- package/sdk-core/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json +630 -83
- package/sdk-core/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml +632 -78
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/batch/v1/message.proto +4 -4
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/command/v1/message.proto +6 -4
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto +2 -2
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto +32 -2
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/common.proto +10 -1
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/deployment.proto +26 -0
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +2 -0
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/reset.proto +4 -4
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/history/v1/message.proto +47 -31
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/nexus/v1/message.proto +4 -4
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/schedule/v1/message.proto +7 -1
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto +134 -0
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/workflow/v1/message.proto +14 -11
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +148 -37
- package/sdk-core/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +21 -0
- package/sdk-core/sdk-core-protos/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +4 -4
- package/sdk-core/sdk-core-protos/src/history_builder.rs +9 -5
- package/sdk-core/sdk-core-protos/src/lib.rs +96 -6
- package/sdk-core/test-utils/src/lib.rs +11 -3
- package/sdk-core/tests/cloud_tests.rs +3 -3
- package/sdk-core/tests/heavy_tests.rs +11 -3
- package/sdk-core/tests/integ_tests/client_tests.rs +12 -13
- package/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +1 -1
- package/sdk-core/tests/integ_tests/metrics_tests.rs +188 -83
- package/sdk-core/tests/integ_tests/polling_tests.rs +1 -1
- package/sdk-core/tests/integ_tests/queries_tests.rs +56 -40
- package/sdk-core/tests/integ_tests/update_tests.rs +2 -7
- package/sdk-core/tests/integ_tests/worker_tests.rs +3 -4
- package/sdk-core/tests/integ_tests/worker_versioning_tests.rs +3 -7
- package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +3 -5
- package/sdk-core/tests/integ_tests/workflow_tests/nexus.rs +24 -17
- package/src/client.rs +6 -0
- package/src/metrics.rs +6 -6
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
use crate::WorkerClient;
|
|
2
|
+
use crate::abstractions::dbg_panic;
|
|
3
|
+
use gethostname::gethostname;
|
|
4
|
+
use parking_lot::Mutex;
|
|
5
|
+
use prost_types::Duration as PbDuration;
|
|
6
|
+
use std::sync::{Arc, OnceLock};
|
|
7
|
+
use std::time::{Duration, SystemTime};
|
|
8
|
+
use temporal_sdk_core_api::worker::WorkerConfig;
|
|
9
|
+
use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo};
|
|
10
|
+
use tokio::sync::Notify;
|
|
11
|
+
use tokio::task::JoinHandle;
|
|
12
|
+
use tokio::time::MissedTickBehavior;
|
|
13
|
+
use uuid::Uuid;
|
|
14
|
+
|
|
15
|
+
pub(crate) type HeartbeatFn = Box<dyn Fn() -> Option<WorkerHeartbeat> + Send + Sync>;
|
|
16
|
+
|
|
17
|
+
/// Owns the background task that periodically sends worker heartbeats to the
/// server. Constructed once per worker (when a heartbeat fn slot is supplied)
/// and torn down via [`WorkerHeartbeatManager::shutdown`].
pub(crate) struct WorkerHeartbeatManager {
    // Handle to the spawned heartbeat loop; aborted on shutdown so the loop
    // does not outlive the worker.
    heartbeat_handle: JoinHandle<()>,
}
|
|
20
|
+
|
|
21
|
+
impl WorkerHeartbeatManager {
|
|
22
|
+
pub(crate) fn new(
|
|
23
|
+
config: WorkerConfig,
|
|
24
|
+
identity: String,
|
|
25
|
+
heartbeat_fn: Arc<OnceLock<HeartbeatFn>>,
|
|
26
|
+
client: Arc<dyn WorkerClient>,
|
|
27
|
+
) -> Self {
|
|
28
|
+
let sdk_name_and_ver = client.sdk_name_and_version();
|
|
29
|
+
let reset_notify = Arc::new(Notify::new());
|
|
30
|
+
let data = Arc::new(Mutex::new(WorkerHeartbeatData::new(
|
|
31
|
+
config,
|
|
32
|
+
identity,
|
|
33
|
+
sdk_name_and_ver,
|
|
34
|
+
reset_notify.clone(),
|
|
35
|
+
)));
|
|
36
|
+
let data_clone = data.clone();
|
|
37
|
+
|
|
38
|
+
let heartbeat_handle = tokio::spawn(async move {
|
|
39
|
+
let mut ticker = tokio::time::interval(data_clone.lock().heartbeat_interval);
|
|
40
|
+
ticker.set_missed_tick_behavior(MissedTickBehavior::Delay);
|
|
41
|
+
loop {
|
|
42
|
+
tokio::select! {
|
|
43
|
+
_ = ticker.tick() => {
|
|
44
|
+
let heartbeat = if let Some(heartbeat) = data_clone.lock().capture_heartbeat_if_needed() {
|
|
45
|
+
heartbeat
|
|
46
|
+
} else {
|
|
47
|
+
continue
|
|
48
|
+
};
|
|
49
|
+
if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await {
|
|
50
|
+
if matches!(
|
|
51
|
+
e.code(),
|
|
52
|
+
tonic::Code::Unimplemented
|
|
53
|
+
) {
|
|
54
|
+
return;
|
|
55
|
+
}
|
|
56
|
+
warn!(error=?e, "Network error while sending worker heartbeat");
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
_ = reset_notify.notified() => {
|
|
60
|
+
ticker.reset();
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
let data_clone = data.clone();
|
|
67
|
+
if heartbeat_fn
|
|
68
|
+
.set(Box::new(move || {
|
|
69
|
+
data_clone.lock().capture_heartbeat_if_needed()
|
|
70
|
+
}))
|
|
71
|
+
.is_err()
|
|
72
|
+
{
|
|
73
|
+
dbg_panic!(
|
|
74
|
+
"Failed to set heartbeat_fn, heartbeat_fn should only be set once, when a singular WorkerHeartbeatInfo is created"
|
|
75
|
+
);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
Self { heartbeat_handle }
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
pub(crate) fn shutdown(&self) {
|
|
82
|
+
self.heartbeat_handle.abort()
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/// Mutable state backing worker heartbeat capture. Shared (behind a mutex)
/// between the periodic reporting loop and the on-demand heartbeat closure.
#[derive(Debug, Clone)]
struct WorkerHeartbeatData {
    // Unique key (UUID) identifying this worker instance for the lifetime of
    // the process.
    worker_instance_key: String,
    worker_identity: String,
    // Host name / process id reported with each heartbeat.
    host_info: WorkerHostInfo,
    // Time of the last captured heartbeat; used both for the reported
    // heartbeat_time and to compute elapsed_since_last_heartbeat. `None`
    // until the first capture.
    heartbeat_time: Option<SystemTime>,
    task_queue: String,
    /// SDK name
    sdk_name: String,
    /// SDK version
    sdk_version: String,
    /// Worker start time
    start_time: SystemTime,
    // Target interval between heartbeats, from worker config.
    heartbeat_interval: Duration,
    // Notified after each capture so the periodic ticker resets its timer.
    reset_notify: Arc<Notify>,
}
|
|
103
|
+
|
|
104
|
+
impl WorkerHeartbeatData {
|
|
105
|
+
fn new(
|
|
106
|
+
worker_config: WorkerConfig,
|
|
107
|
+
worker_identity: String,
|
|
108
|
+
sdk_name_and_ver: (String, String),
|
|
109
|
+
reset_notify: Arc<Notify>,
|
|
110
|
+
) -> Self {
|
|
111
|
+
Self {
|
|
112
|
+
worker_identity,
|
|
113
|
+
host_info: WorkerHostInfo {
|
|
114
|
+
host_name: gethostname().to_string_lossy().to_string(),
|
|
115
|
+
process_id: std::process::id().to_string(),
|
|
116
|
+
..Default::default()
|
|
117
|
+
},
|
|
118
|
+
sdk_name: sdk_name_and_ver.0,
|
|
119
|
+
sdk_version: sdk_name_and_ver.1,
|
|
120
|
+
task_queue: worker_config.task_queue.clone(),
|
|
121
|
+
start_time: SystemTime::now(),
|
|
122
|
+
heartbeat_time: None,
|
|
123
|
+
worker_instance_key: Uuid::new_v4().to_string(),
|
|
124
|
+
heartbeat_interval: worker_config.heartbeat_interval,
|
|
125
|
+
reset_notify,
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
fn capture_heartbeat_if_needed(&mut self) -> Option<WorkerHeartbeat> {
|
|
130
|
+
let now = SystemTime::now();
|
|
131
|
+
let elapsed_since_last_heartbeat = if let Some(heartbeat_time) = self.heartbeat_time {
|
|
132
|
+
let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO);
|
|
133
|
+
|
|
134
|
+
// Only send poll data if it's nearly been a full interval since this data has been sent
|
|
135
|
+
// In this case, "nearly" is 90% of the interval
|
|
136
|
+
if dur.as_secs_f64() < 0.9 * self.heartbeat_interval.as_secs_f64() {
|
|
137
|
+
return None;
|
|
138
|
+
}
|
|
139
|
+
Some(PbDuration {
|
|
140
|
+
seconds: dur.as_secs() as i64,
|
|
141
|
+
nanos: dur.subsec_nanos() as i32,
|
|
142
|
+
})
|
|
143
|
+
} else {
|
|
144
|
+
None
|
|
145
|
+
};
|
|
146
|
+
|
|
147
|
+
self.heartbeat_time = Some(now);
|
|
148
|
+
|
|
149
|
+
self.reset_notify.notify_one();
|
|
150
|
+
|
|
151
|
+
Some(WorkerHeartbeat {
|
|
152
|
+
worker_instance_key: self.worker_instance_key.clone(),
|
|
153
|
+
worker_identity: self.worker_identity.clone(),
|
|
154
|
+
host_info: Some(self.host_info.clone()),
|
|
155
|
+
task_queue: self.task_queue.clone(),
|
|
156
|
+
sdk_name: self.sdk_name.clone(),
|
|
157
|
+
sdk_version: self.sdk_version.clone(),
|
|
158
|
+
status: 0,
|
|
159
|
+
start_time: Some(self.start_time.into()),
|
|
160
|
+
heartbeat_time: Some(SystemTime::now().into()),
|
|
161
|
+
elapsed_since_last_heartbeat,
|
|
162
|
+
..Default::default()
|
|
163
|
+
})
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_help::WorkerExt;
    use crate::test_help::test_worker_cfg;
    use crate::worker;
    use crate::worker::client::mocks::mock_worker_client;
    use std::sync::Arc;
    use std::time::Duration;
    use temporal_sdk_core_api::worker::PollerBehavior;
    use temporal_sdk_core_protos::temporal::api::workflowservice::v1::RecordWorkerHeartbeatResponse;

    // Verifies that exactly two heartbeats are recorded: one triggered
    // manually via the heartbeat fn right after worker construction, and one
    // from the periodic timer. The intermediate manual call must return None
    // because <90% of the interval has elapsed (throttling behavior).
    #[tokio::test]
    async fn worker_heartbeat() {
        let mut mock = mock_worker_client();
        mock.expect_record_worker_heartbeat()
            .times(2)
            .returning(move |heartbeat| {
                // Validate the heartbeat payload fields produced by
                // WorkerHeartbeatData::capture_heartbeat_if_needed. The
                // sdk name/version values come from the mock client's
                // sdk_name_and_version(). -- TODO confirm mock returns
                // ("test-core", "0.0.0")
                let host_info = heartbeat.host_info.clone().unwrap();
                assert_eq!("test-identity", heartbeat.worker_identity);
                assert!(!heartbeat.worker_instance_key.is_empty());
                assert_eq!(
                    host_info.host_name,
                    gethostname::gethostname().to_string_lossy().to_string()
                );
                assert_eq!(host_info.process_id, std::process::id().to_string());
                assert_eq!(heartbeat.sdk_name, "test-core");
                assert_eq!(heartbeat.sdk_version, "0.0.0");
                assert_eq!(heartbeat.task_queue, "q");
                assert!(heartbeat.heartbeat_time.is_some());
                assert!(heartbeat.start_time.is_some());

                Ok(RecordWorkerHeartbeatResponse {})
            });

        // 200ms interval keeps the test fast while still exercising the
        // 90%-of-interval throttle below.
        let config = test_worker_cfg()
            .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize))
            .max_outstanding_activities(1_usize)
            .heartbeat_interval(Duration::from_millis(200))
            .build()
            .unwrap();

        let heartbeat_fn = Arc::new(OnceLock::new());
        let client = Arc::new(mock);
        let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_fn.clone()));
        // First (manual) heartbeat capture -> first recorded heartbeat.
        heartbeat_fn.get().unwrap()();

        // heartbeat timer fires once
        advance_time(Duration::from_millis(300)).await;
        // it hasn't been >90% of the interval since the last heartbeat, so no data should be returned here
        assert_eq!(None, heartbeat_fn.get().unwrap()());
        // heartbeat timer fires once
        advance_time(Duration::from_millis(300)).await;

        worker.drain_activity_poller_and_shutdown().await;
    }

    // Pause tokio's clock, jump it forward by `dur`, then resume real time.
    // Pausing ensures the interval timer observes the jump deterministically.
    async fn advance_time(dur: Duration) {
        tokio::time::pause();
        tokio::time::advance(dur).await;
        tokio::time::resume();
    }
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
mod activities;
|
|
2
2
|
pub(crate) mod client;
|
|
3
|
+
mod heartbeat;
|
|
3
4
|
mod nexus;
|
|
4
5
|
mod slot_provider;
|
|
5
6
|
pub(crate) mod tuner;
|
|
@@ -19,6 +20,7 @@ pub(crate) use activities::{
|
|
|
19
20
|
pub(crate) use wft_poller::WFTPollerShared;
|
|
20
21
|
pub(crate) use workflow::LEGACY_QUERY_ID;
|
|
21
22
|
|
|
23
|
+
use crate::worker::heartbeat::{HeartbeatFn, WorkerHeartbeatManager};
|
|
22
24
|
use crate::{
|
|
23
25
|
ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait,
|
|
24
26
|
abstractions::{MeteredPermitDealer, PermitDealerContextData, dbg_panic},
|
|
@@ -41,10 +43,15 @@ use crate::{
|
|
|
41
43
|
},
|
|
42
44
|
},
|
|
43
45
|
};
|
|
46
|
+
use crate::{
|
|
47
|
+
pollers::{ActivityTaskOptions, LongPollBuffer},
|
|
48
|
+
worker::workflow::wft_poller,
|
|
49
|
+
};
|
|
44
50
|
use activities::WorkerActivityTasks;
|
|
45
51
|
use futures_util::{StreamExt, stream};
|
|
46
52
|
use parking_lot::Mutex;
|
|
47
53
|
use slot_provider::SlotProvider;
|
|
54
|
+
use std::sync::OnceLock;
|
|
48
55
|
use std::{
|
|
49
56
|
convert::TryInto,
|
|
50
57
|
future,
|
|
@@ -77,11 +84,6 @@ use temporal_sdk_core_protos::{
|
|
|
77
84
|
use tokio::sync::{mpsc::unbounded_channel, watch};
|
|
78
85
|
use tokio_stream::wrappers::UnboundedReceiverStream;
|
|
79
86
|
use tokio_util::sync::CancellationToken;
|
|
80
|
-
|
|
81
|
-
use crate::{
|
|
82
|
-
pollers::{ActivityTaskOptions, LongPollBuffer},
|
|
83
|
-
worker::workflow::wft_poller,
|
|
84
|
-
};
|
|
85
87
|
#[cfg(test)]
|
|
86
88
|
use {
|
|
87
89
|
crate::{
|
|
@@ -119,6 +121,8 @@ pub struct Worker {
|
|
|
119
121
|
local_activities_complete: Arc<AtomicBool>,
|
|
120
122
|
/// Used to track all permits have been released
|
|
121
123
|
all_permits_tracker: tokio::sync::Mutex<AllPermitsTracker>,
|
|
124
|
+
/// Used to shutdown the worker heartbeat task
|
|
125
|
+
worker_heartbeat: Option<WorkerHeartbeatManager>,
|
|
122
126
|
}
|
|
123
127
|
|
|
124
128
|
struct AllPermitsTracker {
|
|
@@ -271,6 +275,7 @@ impl Worker {
|
|
|
271
275
|
sticky_queue_name: Option<String>,
|
|
272
276
|
client: Arc<dyn WorkerClient>,
|
|
273
277
|
telem_instance: Option<&TelemetryInstance>,
|
|
278
|
+
heartbeat_fn: Option<Arc<OnceLock<HeartbeatFn>>>,
|
|
274
279
|
) -> Self {
|
|
275
280
|
info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker");
|
|
276
281
|
|
|
@@ -280,6 +285,7 @@ impl Worker {
|
|
|
280
285
|
client,
|
|
281
286
|
TaskPollers::Real,
|
|
282
287
|
telem_instance,
|
|
288
|
+
heartbeat_fn,
|
|
283
289
|
)
|
|
284
290
|
}
|
|
285
291
|
|
|
@@ -297,7 +303,7 @@ impl Worker {
|
|
|
297
303
|
|
|
298
304
|
#[cfg(test)]
|
|
299
305
|
pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self {
|
|
300
|
-
Self::new(config, None, Arc::new(client), None)
|
|
306
|
+
Self::new(config, None, Arc::new(client), None, None)
|
|
301
307
|
}
|
|
302
308
|
|
|
303
309
|
pub(crate) fn new_with_pollers(
|
|
@@ -306,6 +312,7 @@ impl Worker {
|
|
|
306
312
|
client: Arc<dyn WorkerClient>,
|
|
307
313
|
task_pollers: TaskPollers,
|
|
308
314
|
telem_instance: Option<&TelemetryInstance>,
|
|
315
|
+
heartbeat_fn: Option<Arc<OnceLock<HeartbeatFn>>>,
|
|
309
316
|
) -> Self {
|
|
310
317
|
let (metrics, meter) = if let Some(ti) = telem_instance {
|
|
311
318
|
(
|
|
@@ -325,7 +332,7 @@ impl Worker {
|
|
|
325
332
|
let shutdown_token = CancellationToken::new();
|
|
326
333
|
let slot_context_data = Arc::new(PermitDealerContextData {
|
|
327
334
|
task_queue: config.task_queue.clone(),
|
|
328
|
-
worker_identity:
|
|
335
|
+
worker_identity: client.get_identity(),
|
|
329
336
|
worker_deployment_version: config.computed_deployment_version(),
|
|
330
337
|
});
|
|
331
338
|
let wft_slots = MeteredPermitDealer::new(
|
|
@@ -437,17 +444,17 @@ impl Worker {
|
|
|
437
444
|
};
|
|
438
445
|
|
|
439
446
|
let (hb_tx, hb_rx) = unbounded_channel();
|
|
440
|
-
let
|
|
447
|
+
let la_permit_dealer = MeteredPermitDealer::new(
|
|
441
448
|
tuner.local_activity_slot_supplier(),
|
|
442
449
|
metrics.with_new_attrs([local_activity_worker_type()]),
|
|
443
450
|
None,
|
|
444
|
-
slot_context_data,
|
|
451
|
+
slot_context_data.clone(),
|
|
445
452
|
meter.clone(),
|
|
446
453
|
);
|
|
447
|
-
let la_permits =
|
|
454
|
+
let la_permits = la_permit_dealer.get_extant_count_rcv();
|
|
448
455
|
let local_act_mgr = Arc::new(LocalActivityManager::new(
|
|
449
456
|
config.namespace.clone(),
|
|
450
|
-
|
|
457
|
+
la_permit_dealer,
|
|
451
458
|
hb_tx,
|
|
452
459
|
metrics.clone(),
|
|
453
460
|
));
|
|
@@ -484,6 +491,16 @@ impl Worker {
|
|
|
484
491
|
);
|
|
485
492
|
let worker_key = Mutex::new(client.workers().register(Box::new(provider)));
|
|
486
493
|
let sdk_name_and_ver = client.sdk_name_and_version();
|
|
494
|
+
|
|
495
|
+
let worker_heartbeat = heartbeat_fn.map(|heartbeat_fn| {
|
|
496
|
+
WorkerHeartbeatManager::new(
|
|
497
|
+
config.clone(),
|
|
498
|
+
client.get_identity(),
|
|
499
|
+
heartbeat_fn,
|
|
500
|
+
client.clone(),
|
|
501
|
+
)
|
|
502
|
+
});
|
|
503
|
+
|
|
487
504
|
Self {
|
|
488
505
|
worker_key,
|
|
489
506
|
client: client.clone(),
|
|
@@ -540,6 +557,7 @@ impl Worker {
|
|
|
540
557
|
la_permits,
|
|
541
558
|
}),
|
|
542
559
|
nexus_mgr,
|
|
560
|
+
worker_heartbeat,
|
|
543
561
|
}
|
|
544
562
|
}
|
|
545
563
|
|
|
@@ -584,6 +602,9 @@ impl Worker {
|
|
|
584
602
|
dbg_panic!("Waiting for all slot permits to release took too long!");
|
|
585
603
|
}
|
|
586
604
|
}
|
|
605
|
+
if let Some(heartbeat) = self.worker_heartbeat.as_ref() {
|
|
606
|
+
heartbeat.shutdown();
|
|
607
|
+
}
|
|
587
608
|
}
|
|
588
609
|
|
|
589
610
|
/// Finish shutting down by consuming the background pollers and freeing all resources
|
|
@@ -883,7 +904,7 @@ mod tests {
|
|
|
883
904
|
use crate::{
|
|
884
905
|
advance_fut,
|
|
885
906
|
test_help::test_worker_cfg,
|
|
886
|
-
worker::client::mocks::{
|
|
907
|
+
worker::client::mocks::{mock_manual_worker_client, mock_worker_client},
|
|
887
908
|
};
|
|
888
909
|
use futures_util::FutureExt;
|
|
889
910
|
use temporal_sdk_core_api::worker::PollerBehavior;
|
|
@@ -891,7 +912,7 @@ mod tests {
|
|
|
891
912
|
|
|
892
913
|
#[tokio::test]
|
|
893
914
|
async fn activity_timeouts_maintain_permit() {
|
|
894
|
-
let mut mock_client =
|
|
915
|
+
let mut mock_client = mock_worker_client();
|
|
895
916
|
mock_client
|
|
896
917
|
.expect_poll_activity_task()
|
|
897
918
|
.returning(|_, _| Ok(PollActivityTaskQueueResponse::default()));
|
|
@@ -913,7 +934,7 @@ mod tests {
|
|
|
913
934
|
async fn activity_errs_dont_eat_permits() {
|
|
914
935
|
// Return one error followed by simulating waiting on the poll, otherwise the poller will
|
|
915
936
|
// loop very fast and be in some indeterminate state.
|
|
916
|
-
let mut mock_client =
|
|
937
|
+
let mut mock_client = mock_manual_worker_client();
|
|
917
938
|
mock_client
|
|
918
939
|
.expect_poll_activity_task()
|
|
919
940
|
.returning(|_, _| async { Err(tonic::Status::internal("ahhh")) }.boxed())
|
|
@@ -190,10 +190,10 @@ struct PidControllers {
|
|
|
190
190
|
}
|
|
191
191
|
struct MetricInstruments {
|
|
192
192
|
attribs: MetricAttributes,
|
|
193
|
-
mem_usage:
|
|
194
|
-
cpu_usage:
|
|
195
|
-
mem_pid_output:
|
|
196
|
-
cpu_pid_output:
|
|
193
|
+
mem_usage: GaugeF64,
|
|
194
|
+
cpu_usage: GaugeF64,
|
|
195
|
+
mem_pid_output: GaugeF64,
|
|
196
|
+
cpu_pid_output: GaugeF64,
|
|
197
197
|
}
|
|
198
198
|
#[derive(Clone, Copy, Default)]
|
|
199
199
|
struct LastMetricVals {
|
|
@@ -686,6 +686,7 @@ impl NextWFTSeqEndIndex {
|
|
|
686
686
|
}
|
|
687
687
|
|
|
688
688
|
/// Discovers the index of the last event in next WFT sequence within the passed-in slice
|
|
689
|
+
/// For more on workflow task chunking, see arch_docs/workflow_task_chunking.md
|
|
689
690
|
fn find_end_index_of_next_wft_seq(
|
|
690
691
|
events: &[HistoryEvent],
|
|
691
692
|
from_event_id: i64,
|
|
@@ -718,11 +719,6 @@ fn find_end_index_of_next_wft_seq(
|
|
|
718
719
|
return NextWFTSeqEndIndex::Complete(last_index);
|
|
719
720
|
}
|
|
720
721
|
|
|
721
|
-
// TODO: Emergency undo for boundary calculation change. Remove if no problems after a bit.
|
|
722
|
-
if std::env::var("TEMPORAL_NO_WFT_BOUNDARY_CHANGE").is_ok() {
|
|
723
|
-
saw_command = false;
|
|
724
|
-
}
|
|
725
|
-
|
|
726
722
|
if e.event_type() == EventType::WorkflowTaskStarted {
|
|
727
723
|
wft_started_event_id_to_index.push((e.event_id, ix));
|
|
728
724
|
if let Some(next_event) = events.get(ix + 1) {
|
|
@@ -737,6 +733,9 @@ fn find_end_index_of_next_wft_seq(
|
|
|
737
733
|
| EventType::WorkflowExecutionTerminated
|
|
738
734
|
| EventType::WorkflowExecutionCanceled
|
|
739
735
|
) {
|
|
736
|
+
// Since we're skipping this WFT, we don't want to include it in the vec used
|
|
737
|
+
// for update accepted sequencing lookups.
|
|
738
|
+
wft_started_event_id_to_index.pop();
|
|
740
739
|
continue;
|
|
741
740
|
} else if next_event_type == EventType::WorkflowTaskCompleted {
|
|
742
741
|
if let Some(next_next_event) = events.get(ix + 2) {
|
|
@@ -761,7 +760,12 @@ fn find_end_index_of_next_wft_seq(
|
|
|
761
760
|
),
|
|
762
761
|
) = next_next_event.attributes
|
|
763
762
|
{
|
|
764
|
-
// Find index of closest WFT started before sequencing id
|
|
763
|
+
// Find index of closest unskipped WFT started before sequencing id.
|
|
764
|
+
// The fact that the WFT wasn't skipped is important. If it was, we
|
|
765
|
+
// need to avoid stopping at that point even though that's where the
|
|
766
|
+
// update was sequenced. If we did, we'll fail to actually include
|
|
767
|
+
// the update accepted event and therefore fail to generate the
|
|
768
|
+
// request to run the update handler on replay.
|
|
765
769
|
if let Some(ret_ix) = wft_started_event_id_to_index
|
|
766
770
|
.iter()
|
|
767
771
|
.rev()
|
|
@@ -803,7 +807,7 @@ mod tests {
|
|
|
803
807
|
use crate::{
|
|
804
808
|
replay::{HistoryInfo, TestHistoryBuilder},
|
|
805
809
|
test_help::{MockPollCfg, ResponseType, canned_histories, hist_to_poll_resp, mock_sdk_cfg},
|
|
806
|
-
worker::client::mocks::
|
|
810
|
+
worker::client::mocks::mock_worker_client,
|
|
807
811
|
};
|
|
808
812
|
use futures_util::{StreamExt, TryStreamExt};
|
|
809
813
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
@@ -965,7 +969,7 @@ mod tests {
|
|
|
965
969
|
let wft_started = hinfo.workflow_task_started_event_id();
|
|
966
970
|
let full_hist = hinfo.into_events();
|
|
967
971
|
let initial_hist = full_hist.chunks(chunk_size).next().unwrap().to_vec();
|
|
968
|
-
let mut mock_client =
|
|
972
|
+
let mut mock_client = mock_worker_client();
|
|
969
973
|
|
|
970
974
|
let mut npt = 1;
|
|
971
975
|
mock_client
|
|
@@ -1162,7 +1166,7 @@ mod tests {
|
|
|
1162
1166
|
// Chop off the last event, which is WFT started, which server doesn't return in get
|
|
1163
1167
|
// history
|
|
1164
1168
|
history_from_get.history.as_mut().map(|h| h.events.pop());
|
|
1165
|
-
let mut mock_client =
|
|
1169
|
+
let mut mock_client = mock_worker_client();
|
|
1166
1170
|
mock_client
|
|
1167
1171
|
.expect_get_workflow_execution_history()
|
|
1168
1172
|
.returning(move |_, _, _| Ok(history_from_get.clone()));
|
|
@@ -1220,7 +1224,7 @@ mod tests {
|
|
|
1220
1224
|
let partial_task = timer_hist.get_one_wft(2).unwrap();
|
|
1221
1225
|
let prev_started_wft_id = partial_task.previous_started_event_id();
|
|
1222
1226
|
let wft_started_id = partial_task.workflow_task_started_event_id();
|
|
1223
|
-
let mut mock_client =
|
|
1227
|
+
let mut mock_client = mock_worker_client();
|
|
1224
1228
|
mock_client
|
|
1225
1229
|
.expect_get_workflow_execution_history()
|
|
1226
1230
|
.returning(move |_, _, _| Ok(Default::default()));
|
|
@@ -1247,7 +1251,7 @@ mod tests {
|
|
|
1247
1251
|
let wft_started_id = partial_task.workflow_task_started_event_id();
|
|
1248
1252
|
let full_resp: GetWorkflowExecutionHistoryResponse =
|
|
1249
1253
|
timer_hist.get_full_history_info().unwrap().into();
|
|
1250
|
-
let mut mock_client =
|
|
1254
|
+
let mut mock_client = mock_worker_client();
|
|
1251
1255
|
mock_client
|
|
1252
1256
|
.expect_get_workflow_execution_history()
|
|
1253
1257
|
.returning(move |_, _, _| {
|
|
@@ -1296,7 +1300,7 @@ mod tests {
|
|
|
1296
1300
|
timer_hist.get_full_history_info().unwrap().into();
|
|
1297
1301
|
full_resp_with_npt.next_page_token = vec![1];
|
|
1298
1302
|
|
|
1299
|
-
let mut mock_client =
|
|
1303
|
+
let mut mock_client = mock_worker_client();
|
|
1300
1304
|
mock_client
|
|
1301
1305
|
.expect_get_workflow_execution_history()
|
|
1302
1306
|
.returning(move |_, _, _| Ok(full_resp_with_npt.clone()))
|
|
@@ -1375,7 +1379,7 @@ mod tests {
|
|
|
1375
1379
|
resp_1.next_page_token = vec![1];
|
|
1376
1380
|
resp_1.history.as_mut().unwrap().events.truncate(4);
|
|
1377
1381
|
|
|
1378
|
-
let mut mock_client =
|
|
1382
|
+
let mut mock_client = mock_worker_client();
|
|
1379
1383
|
mock_client
|
|
1380
1384
|
.expect_get_workflow_execution_history()
|
|
1381
1385
|
.returning(move |_, _, _| Ok(resp_1.clone()))
|
|
@@ -1486,7 +1490,7 @@ mod tests {
|
|
|
1486
1490
|
t.add_we_signaled("hi", vec![]);
|
|
1487
1491
|
t.add_workflow_task_scheduled_and_started();
|
|
1488
1492
|
|
|
1489
|
-
let mut mock_client =
|
|
1493
|
+
let mut mock_client = mock_worker_client();
|
|
1490
1494
|
|
|
1491
1495
|
let events: Vec<HistoryEvent> = t.get_full_history_info().unwrap().into_events();
|
|
1492
1496
|
let first_event = events[0].clone();
|
|
@@ -1602,7 +1606,7 @@ mod tests {
|
|
|
1602
1606
|
let events: Vec<HistoryEvent> = t.get_full_history_info().unwrap().into_events();
|
|
1603
1607
|
let first_event = events[0].clone();
|
|
1604
1608
|
|
|
1605
|
-
let mut mock_client =
|
|
1609
|
+
let mut mock_client = mock_worker_client();
|
|
1606
1610
|
|
|
1607
1611
|
for (i, event) in events.into_iter().enumerate() {
|
|
1608
1612
|
// Add an empty page
|
|
@@ -1722,7 +1726,7 @@ mod tests {
|
|
|
1722
1726
|
t.get_full_history_info().unwrap().into();
|
|
1723
1727
|
resp_1.next_page_token = vec![2];
|
|
1724
1728
|
|
|
1725
|
-
let mut mock_client =
|
|
1729
|
+
let mut mock_client = mock_worker_client();
|
|
1726
1730
|
mock_client
|
|
1727
1731
|
.expect_get_workflow_execution_history()
|
|
1728
1732
|
.returning(move |_, _, _| Ok(resp_1.clone()))
|
|
@@ -1765,7 +1769,7 @@ mod tests {
|
|
|
1765
1769
|
let workflow_task = t.get_full_history_info().unwrap();
|
|
1766
1770
|
let prev_started_wft_id = workflow_task.previous_started_event_id();
|
|
1767
1771
|
let wft_started_id = workflow_task.workflow_task_started_event_id();
|
|
1768
|
-
let mock_client =
|
|
1772
|
+
let mock_client = mock_worker_client();
|
|
1769
1773
|
let mut paginator = HistoryPaginator::new(
|
|
1770
1774
|
workflow_task.into(),
|
|
1771
1775
|
prev_started_wft_id,
|
|
@@ -1802,7 +1806,7 @@ mod tests {
|
|
|
1802
1806
|
let full_resp: GetWorkflowExecutionHistoryResponse =
|
|
1803
1807
|
t.get_full_history_info().unwrap().into();
|
|
1804
1808
|
|
|
1805
|
-
let mut mock_client =
|
|
1809
|
+
let mut mock_client = mock_worker_client();
|
|
1806
1810
|
mock_client
|
|
1807
1811
|
.expect_get_workflow_execution_history()
|
|
1808
1812
|
.returning(move |_, _, _| Ok(full_resp.clone()))
|
|
@@ -1839,7 +1843,7 @@ mod tests {
|
|
|
1839
1843
|
let incremental_task =
|
|
1840
1844
|
hist_to_poll_resp(&t, "wfid".to_owned(), ResponseType::OneTask(3)).resp;
|
|
1841
1845
|
|
|
1842
|
-
let mut mock_client =
|
|
1846
|
+
let mut mock_client = mock_worker_client();
|
|
1843
1847
|
let mut one_task_resp: GetWorkflowExecutionHistoryResponse =
|
|
1844
1848
|
t.get_history_info(1).unwrap().into();
|
|
1845
1849
|
one_task_resp.next_page_token = vec![1];
|
|
@@ -1877,4 +1881,52 @@ mod tests {
|
|
|
1877
1881
|
let seq = next_check_peek(&mut update, 7);
|
|
1878
1882
|
assert_eq!(seq.last().unwrap().event_id, 11);
|
|
1879
1883
|
}
|
|
1884
|
+
|
|
1885
|
+
#[tokio::test]
|
|
1886
|
+
async fn wft_fail_on_first_task_with_update() {
|
|
1887
|
+
let mut t = TestHistoryBuilder::default();
|
|
1888
|
+
t.add_by_type(EventType::WorkflowExecutionStarted);
|
|
1889
|
+
t.add_workflow_task_scheduled_and_started();
|
|
1890
|
+
t.add_workflow_task_failed_with_failure(
|
|
1891
|
+
WorkflowTaskFailedCause::Unspecified,
|
|
1892
|
+
Default::default(),
|
|
1893
|
+
);
|
|
1894
|
+
t.add_full_wf_task();
|
|
1895
|
+
let accept_id = t.add_update_accepted("1", "upd");
|
|
1896
|
+
let timer_id = t.add_timer_started("1".to_string());
|
|
1897
|
+
t.add_update_completed(accept_id);
|
|
1898
|
+
t.add_timer_fired(timer_id, "1".to_string());
|
|
1899
|
+
t.add_full_wf_task();
|
|
1900
|
+
|
|
1901
|
+
let mut update = t.as_history_update();
|
|
1902
|
+
let seq = next_check_peek(&mut update, 0);
|
|
1903
|
+
// In this case, we expect to see up to the task with update, since the task failure
|
|
1904
|
+
// should be skipped. This means that the peek of the _next_ task will include the update
|
|
1905
|
+
// and thus properly synthesize the update request with the first activation.
|
|
1906
|
+
assert_eq!(seq.len(), 6);
|
|
1907
|
+
let seq = next_check_peek(&mut update, 6);
|
|
1908
|
+
assert_eq!(seq.len(), 7);
|
|
1909
|
+
}
|
|
1910
|
+
|
|
1911
|
+
#[test]
|
|
1912
|
+
fn update_accepted_after_empty_wft() {
|
|
1913
|
+
let mut t = TestHistoryBuilder::default();
|
|
1914
|
+
t.add_by_type(EventType::WorkflowExecutionStarted);
|
|
1915
|
+
t.add_full_wf_task();
|
|
1916
|
+
t.add_full_wf_task();
|
|
1917
|
+
let accept_id = t.add_update_accepted("1", "upd");
|
|
1918
|
+
let timer_id = t.add_timer_started("1".to_string());
|
|
1919
|
+
t.add_update_completed(accept_id);
|
|
1920
|
+
t.add_timer_fired(timer_id, "1".to_string());
|
|
1921
|
+
t.add_full_wf_task();
|
|
1922
|
+
|
|
1923
|
+
let mut update = t.as_history_update();
|
|
1924
|
+
let seq = next_check_peek(&mut update, 0);
|
|
1925
|
+
// unlike the case with a wft failure, here the first task should not extend through to
|
|
1926
|
+
// the update, because here the first empty WFT happened with _just_ the workflow init,
|
|
1927
|
+
// not also with the update.
|
|
1928
|
+
assert_eq!(seq.len(), 3);
|
|
1929
|
+
let seq = next_check_peek(&mut update, 3);
|
|
1930
|
+
assert_eq!(seq.len(), 3);
|
|
1931
|
+
}
|
|
1880
1932
|
}
|
|
@@ -69,10 +69,9 @@ pub(super) fn new_external_cancel(
|
|
|
69
69
|
namespace: workflow_execution.namespace,
|
|
70
70
|
workflow_id: workflow_execution.workflow_id,
|
|
71
71
|
run_id: workflow_execution.run_id,
|
|
72
|
-
// Apparently this is effectively deprecated at this point
|
|
73
|
-
control: "".to_string(),
|
|
74
72
|
child_workflow_only: only_child,
|
|
75
73
|
reason,
|
|
74
|
+
..Default::default()
|
|
76
75
|
},
|
|
77
76
|
);
|
|
78
77
|
NewMachineWithCommand {
|