@temporalio/core-bridge 0.23.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +118 -15
- package/Cargo.toml +2 -1
- package/LICENSE.md +1 -1
- package/README.md +1 -1
- package/index.d.ts +47 -18
- package/package.json +7 -7
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.buildkite/docker/docker-compose.yaml +4 -2
- package/sdk-core/ARCHITECTURE.md +9 -7
- package/sdk-core/README.md +5 -1
- package/sdk-core/arch_docs/diagrams/workflow_internals.svg +1 -0
- package/sdk-core/bridge-ffi/src/wrappers.rs +0 -3
- package/sdk-core/client/src/lib.rs +26 -8
- package/sdk-core/client/src/raw.rs +166 -54
- package/sdk-core/client/src/retry.rs +9 -4
- package/sdk-core/client/src/workflow_handle/mod.rs +4 -2
- package/sdk-core/core/Cargo.toml +2 -0
- package/sdk-core/core/src/abstractions.rs +137 -16
- package/sdk-core/core/src/core_tests/activity_tasks.rs +258 -63
- package/sdk-core/core/src/core_tests/child_workflows.rs +1 -2
- package/sdk-core/core/src/core_tests/determinism.rs +2 -2
- package/sdk-core/core/src/core_tests/local_activities.rs +8 -7
- package/sdk-core/core/src/core_tests/queries.rs +146 -60
- package/sdk-core/core/src/core_tests/replay_flag.rs +1 -1
- package/sdk-core/core/src/core_tests/workers.rs +39 -23
- package/sdk-core/core/src/core_tests/workflow_cancels.rs +1 -1
- package/sdk-core/core/src/core_tests/workflow_tasks.rs +387 -280
- package/sdk-core/core/src/lib.rs +6 -4
- package/sdk-core/core/src/pollers/poll_buffer.rs +16 -10
- package/sdk-core/core/src/protosext/mod.rs +6 -6
- package/sdk-core/core/src/retry_logic.rs +1 -1
- package/sdk-core/core/src/telemetry/metrics.rs +21 -7
- package/sdk-core/core/src/telemetry/mod.rs +18 -4
- package/sdk-core/core/src/test_help/mod.rs +341 -109
- package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +18 -9
- package/sdk-core/core/src/worker/activities/local_activities.rs +19 -16
- package/sdk-core/core/src/worker/activities.rs +156 -29
- package/sdk-core/core/src/worker/client.rs +1 -0
- package/sdk-core/core/src/worker/mod.rs +132 -659
- package/sdk-core/core/src/{workflow → worker/workflow}/bridge.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/driven_workflow.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/history_update.rs +16 -2
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/activity_state_machine.rs +39 -4
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/cancel_external_state_machine.rs +5 -2
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/cancel_workflow_state_machine.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/child_workflow_state_machine.rs +2 -4
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/complete_workflow_state_machine.rs +0 -0
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/continue_as_new_workflow_state_machine.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/fail_workflow_state_machine.rs +0 -0
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/local_activity_state_machine.rs +2 -5
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/mod.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/mutable_side_effect_state_machine.rs +0 -0
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/patch_state_machine.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/side_effect_state_machine.rs +0 -0
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/signal_external_state_machine.rs +4 -2
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/timer_state_machine.rs +1 -2
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/transition_coverage.rs +1 -1
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/upsert_search_attributes_state_machine.rs +5 -7
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/workflow_machines/local_acts.rs +2 -2
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/workflow_machines.rs +40 -16
- package/sdk-core/core/src/{workflow → worker/workflow}/machines/workflow_task_state_machine.rs +0 -0
- package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +198 -0
- package/sdk-core/core/src/worker/workflow/managed_run.rs +627 -0
- package/sdk-core/core/src/worker/workflow/mod.rs +1115 -0
- package/sdk-core/core/src/worker/workflow/run_cache.rs +143 -0
- package/sdk-core/core/src/worker/workflow/wft_poller.rs +88 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream.rs +936 -0
- package/sdk-core/core-api/src/errors.rs +3 -10
- package/sdk-core/core-api/src/lib.rs +2 -1
- package/sdk-core/core-api/src/worker.rs +26 -2
- package/sdk-core/etc/dynamic-config.yaml +2 -0
- package/sdk-core/integ-with-otel.sh +1 -1
- package/sdk-core/protos/api_upstream/Makefile +4 -4
- package/sdk-core/protos/api_upstream/api-linter.yaml +2 -0
- package/sdk-core/protos/api_upstream/buf.yaml +8 -9
- package/sdk-core/protos/api_upstream/temporal/api/cluster/v1/message.proto +83 -0
- package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +7 -1
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/cluster.proto +40 -0
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +3 -0
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +3 -1
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +60 -0
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +3 -0
- package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +32 -4
- package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +69 -19
- package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +13 -0
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +163 -0
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +97 -0
- package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +300 -0
- package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +25 -0
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +180 -3
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +53 -3
- package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +2 -2
- package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +6 -5
- package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -1
- package/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +2 -1
- package/sdk-core/protos/local/temporal/sdk/core/common/common.proto +0 -64
- package/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +2 -1
- package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +11 -8
- package/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +30 -25
- package/sdk-core/sdk/src/activity_context.rs +12 -5
- package/sdk-core/sdk/src/app_data.rs +37 -0
- package/sdk-core/sdk/src/lib.rs +76 -43
- package/sdk-core/sdk/src/workflow_context/options.rs +8 -6
- package/sdk-core/sdk/src/workflow_context.rs +14 -19
- package/sdk-core/sdk/src/workflow_future.rs +11 -6
- package/sdk-core/sdk-core-protos/src/history_builder.rs +19 -5
- package/sdk-core/sdk-core-protos/src/history_info.rs +11 -6
- package/sdk-core/sdk-core-protos/src/lib.rs +74 -176
- package/sdk-core/test-utils/src/lib.rs +85 -72
- package/sdk-core/tests/integ_tests/heartbeat_tests.rs +11 -9
- package/sdk-core/tests/integ_tests/polling_tests.rs +12 -0
- package/sdk-core/tests/integ_tests/queries_tests.rs +39 -22
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +49 -4
- package/sdk-core/tests/integ_tests/workflow_tests/appdata_propagation.rs +61 -0
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +1 -1
- package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +74 -13
- package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +19 -0
- package/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -1
- package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +6 -3
- package/sdk-core/tests/integ_tests/workflow_tests.rs +10 -23
- package/sdk-core/tests/load_tests.rs +8 -3
- package/sdk-core/tests/main.rs +2 -1
- package/src/conversions.rs +47 -39
- package/src/errors.rs +10 -21
- package/src/lib.rs +342 -325
- package/sdk-core/core/src/pending_activations.rs +0 -173
- package/sdk-core/core/src/worker/wft_delivery.rs +0 -81
- package/sdk-core/core/src/workflow/mod.rs +0 -478
- package/sdk-core/core/src/workflow/workflow_tasks/cache_manager.rs +0 -194
- package/sdk-core/core/src/workflow/workflow_tasks/concurrency_manager.rs +0 -418
- package/sdk-core/core/src/workflow/workflow_tasks/mod.rs +0 -989
|
@@ -1,989 +0,0 @@
|
|
|
1
|
-
//! Management of workflow tasks
|
|
2
|
-
|
|
3
|
-
mod cache_manager;
|
|
4
|
-
mod concurrency_manager;
|
|
5
|
-
|
|
6
|
-
use crate::{
|
|
7
|
-
pending_activations::PendingActivations,
|
|
8
|
-
protosext::{ValidPollWFTQResponse, WorkflowActivationExt},
|
|
9
|
-
telemetry::metrics::MetricsContext,
|
|
10
|
-
worker::{client::WorkerClientBag, LocalActRequest, LocalActivityResolution},
|
|
11
|
-
workflow::{
|
|
12
|
-
history_update::NextPageToken,
|
|
13
|
-
machines::WFMachinesError,
|
|
14
|
-
workflow_tasks::{
|
|
15
|
-
cache_manager::WorkflowCacheManager, concurrency_manager::WorkflowConcurrencyManager,
|
|
16
|
-
},
|
|
17
|
-
HistoryPaginator, HistoryUpdate, LocalResolution, WFCommand, WorkflowCachingPolicy,
|
|
18
|
-
WorkflowManager, LEGACY_QUERY_ID,
|
|
19
|
-
},
|
|
20
|
-
};
|
|
21
|
-
use crossbeam::queue::SegQueue;
|
|
22
|
-
use futures::FutureExt;
|
|
23
|
-
use parking_lot::Mutex;
|
|
24
|
-
use std::{
|
|
25
|
-
fmt::Debug,
|
|
26
|
-
future::Future,
|
|
27
|
-
ops::Add,
|
|
28
|
-
sync::Arc,
|
|
29
|
-
time::{Duration, Instant},
|
|
30
|
-
};
|
|
31
|
-
use temporal_sdk_core_protos::{
|
|
32
|
-
coresdk::{
|
|
33
|
-
workflow_activation::{
|
|
34
|
-
create_query_activation, query_to_job, remove_from_cache::EvictionReason,
|
|
35
|
-
workflow_activation_job, QueryWorkflow, WorkflowActivation,
|
|
36
|
-
},
|
|
37
|
-
workflow_commands::QueryResult,
|
|
38
|
-
},
|
|
39
|
-
temporal::api::command::v1::Command as ProtoCommand,
|
|
40
|
-
TaskToken,
|
|
41
|
-
};
|
|
42
|
-
use tokio::{sync::Notify, time::timeout_at};
|
|
43
|
-
|
|
44
|
-
/// What percentage of a WFT timeout we are willing to wait before sending a WFT heartbeat when
|
|
45
|
-
/// necessary.
|
|
46
|
-
const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
|
|
47
|
-
|
|
48
|
-
/// Centralizes concerns related to applying new workflow tasks and reporting the activations they
|
|
49
|
-
/// produce.
|
|
50
|
-
///
|
|
51
|
-
/// It is intentionally free of any interactions with the server client to promote testability
|
|
52
|
-
pub struct WorkflowTaskManager {
|
|
53
|
-
/// Manages threadsafe access to workflow machine instances
|
|
54
|
-
workflow_machines: WorkflowConcurrencyManager,
|
|
55
|
-
/// Workflows may generate new activations immediately upon completion (ex: while replaying, or
|
|
56
|
-
/// when cancelling an activity in try-cancel/abandon mode), or for other reasons such as a
|
|
57
|
-
/// requested eviction. They queue here.
|
|
58
|
-
pending_activations: PendingActivations,
|
|
59
|
-
/// Holds activations which are purely query activations needed to respond to legacy queries.
|
|
60
|
-
/// Activations may only be added here for runs which do not have other pending activations.
|
|
61
|
-
pending_queries: SegQueue<WorkflowActivation>,
|
|
62
|
-
/// Holds poll wft responses from the server that need to be applied
|
|
63
|
-
ready_buffered_wft: SegQueue<ValidPollWFTQResponse>,
|
|
64
|
-
/// Used to wake blocked workflow task polling
|
|
65
|
-
pending_activations_notifier: Arc<Notify>,
|
|
66
|
-
/// Lock guarded cache manager, which is the authority for limit-based workflow machine eviction
|
|
67
|
-
/// from the cache.
|
|
68
|
-
// TODO: Also should be moved inside concurrency manager, but there is some complexity around
|
|
69
|
-
// how inserts to it happen that requires a little thought (or a custom LRU impl)
|
|
70
|
-
cache_manager: Mutex<WorkflowCacheManager>,
|
|
71
|
-
|
|
72
|
-
metrics: MetricsContext,
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
#[derive(Clone, Debug)]
|
|
76
|
-
pub(crate) struct OutstandingTask {
|
|
77
|
-
pub info: WorkflowTaskInfo,
|
|
78
|
-
/// Set if the outstanding task has quer(ies) which must be fulfilled upon finishing replay
|
|
79
|
-
pub pending_queries: Vec<QueryWorkflow>,
|
|
80
|
-
start_time: Instant,
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
#[derive(Copy, Clone, Debug)]
|
|
84
|
-
pub(crate) enum OutstandingActivation {
|
|
85
|
-
/// A normal activation with a joblist
|
|
86
|
-
Normal {
|
|
87
|
-
/// True if there is an eviction in the joblist
|
|
88
|
-
contains_eviction: bool,
|
|
89
|
-
/// Number of jobs in the activation
|
|
90
|
-
num_jobs: usize,
|
|
91
|
-
},
|
|
92
|
-
/// An activation for a legacy query
|
|
93
|
-
LegacyQuery,
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
impl OutstandingActivation {
|
|
97
|
-
const fn has_only_eviction(self) -> bool {
|
|
98
|
-
matches!(
|
|
99
|
-
self,
|
|
100
|
-
OutstandingActivation::Normal {
|
|
101
|
-
contains_eviction: true,
|
|
102
|
-
num_jobs: nj
|
|
103
|
-
}
|
|
104
|
-
if nj == 1)
|
|
105
|
-
}
|
|
106
|
-
const fn has_eviction(self) -> bool {
|
|
107
|
-
matches!(
|
|
108
|
-
self,
|
|
109
|
-
OutstandingActivation::Normal {
|
|
110
|
-
contains_eviction: true,
|
|
111
|
-
..
|
|
112
|
-
}
|
|
113
|
-
)
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
/// Contains important information about a given workflow task that we need to memorize while
|
|
118
|
-
/// lang handles it.
|
|
119
|
-
#[derive(Clone, Debug)]
|
|
120
|
-
pub struct WorkflowTaskInfo {
|
|
121
|
-
pub task_token: TaskToken,
|
|
122
|
-
pub attempt: u32,
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
#[derive(Debug, derive_more::From)]
|
|
126
|
-
pub(crate) enum NewWfTaskOutcome {
|
|
127
|
-
/// A new activation for the workflow should be issued to lang
|
|
128
|
-
IssueActivation(WorkflowActivation),
|
|
129
|
-
/// The poll loop should be restarted, there is nothing to do
|
|
130
|
-
TaskBuffered,
|
|
131
|
-
/// The workflow task should be auto-completed with an empty command list, as it must be replied
|
|
132
|
-
/// to but there is no meaningful work for lang to do.
|
|
133
|
-
Autocomplete,
|
|
134
|
-
/// The workflow task ran into problems while being applied and we must now evict the workflow
|
|
135
|
-
Evict(WorkflowUpdateError),
|
|
136
|
-
/// No action should be taken. Possibly we are waiting for local activities to complete
|
|
137
|
-
LocalActsOutstanding,
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
#[derive(Debug)]
|
|
141
|
-
pub enum FailedActivationOutcome {
|
|
142
|
-
NoReport,
|
|
143
|
-
Report(TaskToken),
|
|
144
|
-
ReportLegacyQueryFailure(TaskToken),
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
#[derive(Debug)]
|
|
148
|
-
pub(crate) struct ServerCommandsWithWorkflowInfo {
|
|
149
|
-
pub task_token: TaskToken,
|
|
150
|
-
pub action: ActivationAction,
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#[derive(Debug)]
|
|
154
|
-
pub(crate) enum ActivationAction {
|
|
155
|
-
/// We should respond that the workflow task is complete
|
|
156
|
-
WftComplete {
|
|
157
|
-
commands: Vec<ProtoCommand>,
|
|
158
|
-
query_responses: Vec<QueryResult>,
|
|
159
|
-
force_new_wft: bool,
|
|
160
|
-
},
|
|
161
|
-
/// We should respond to a legacy query request
|
|
162
|
-
RespondLegacyQuery { result: QueryResult },
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
#[derive(Debug, Eq, PartialEq, Hash)]
|
|
166
|
-
pub(crate) enum EvictionRequestResult {
|
|
167
|
-
EvictionRequested(Option<u32>),
|
|
168
|
-
NotFound,
|
|
169
|
-
EvictionAlreadyRequested(Option<u32>),
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
macro_rules! machine_mut {
|
|
173
|
-
($myself:ident, $run_id:ident, $clos:expr) => {{
|
|
174
|
-
$myself
|
|
175
|
-
.workflow_machines
|
|
176
|
-
.access($run_id, $clos)
|
|
177
|
-
.await
|
|
178
|
-
.map_err(|source| WorkflowUpdateError {
|
|
179
|
-
source,
|
|
180
|
-
run_id: $run_id.to_owned(),
|
|
181
|
-
})
|
|
182
|
-
}};
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
impl WorkflowTaskManager {
|
|
186
|
-
pub(crate) fn new(
|
|
187
|
-
pending_activations_notifier: Arc<Notify>,
|
|
188
|
-
eviction_policy: WorkflowCachingPolicy,
|
|
189
|
-
metrics: MetricsContext,
|
|
190
|
-
) -> Self {
|
|
191
|
-
Self {
|
|
192
|
-
workflow_machines: WorkflowConcurrencyManager::new(),
|
|
193
|
-
pending_activations: Default::default(),
|
|
194
|
-
pending_queries: Default::default(),
|
|
195
|
-
ready_buffered_wft: Default::default(),
|
|
196
|
-
pending_activations_notifier,
|
|
197
|
-
cache_manager: Mutex::new(WorkflowCacheManager::new(eviction_policy, metrics.clone())),
|
|
198
|
-
metrics,
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
/// Returns number of currently cached workflows
|
|
203
|
-
pub fn cached_workflows(&self) -> usize {
|
|
204
|
-
self.workflow_machines.cached_workflows()
|
|
205
|
-
}
|
|
206
|
-
|
|
207
|
-
/// Resolves once there is either capacity in the cache, or there are no pending evictions.
|
|
208
|
-
/// Inversely: Waits while there are pending evictions and the cache is full.
|
|
209
|
-
/// Waiting while there are no pending evictions must be avoided because it would block forever,
|
|
210
|
-
/// since there is no way for the cache size to be reduced.
|
|
211
|
-
pub fn wait_for_cache_capacity(&self) -> Option<impl Future<Output = ()> + '_> {
|
|
212
|
-
let are_no_pending_evictions = || {
|
|
213
|
-
!self.pending_activations.is_some_eviction()
|
|
214
|
-
&& !self.workflow_machines.are_outstanding_evictions()
|
|
215
|
-
};
|
|
216
|
-
if !are_no_pending_evictions() {
|
|
217
|
-
let wait_fut = {
|
|
218
|
-
self.cache_manager
|
|
219
|
-
.lock()
|
|
220
|
-
.wait_for_capacity(are_no_pending_evictions)?
|
|
221
|
-
};
|
|
222
|
-
return Some(wait_fut);
|
|
223
|
-
}
|
|
224
|
-
None
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
/// Add a new run (as just received from polling) to the cache. If doing so would overflow the
|
|
228
|
-
/// cache, an eviction is queued to make room and the passed-in task is buffered and `None` is
|
|
229
|
-
/// returned.
|
|
230
|
-
///
|
|
231
|
-
/// If the task is for a run already in the cache, the poll response is returned right away
|
|
232
|
-
/// and should be issued.
|
|
233
|
-
pub async fn add_new_run_to_cache(
|
|
234
|
-
&self,
|
|
235
|
-
poll_resp: ValidPollWFTQResponse,
|
|
236
|
-
) -> Option<ValidPollWFTQResponse> {
|
|
237
|
-
let run_id = &poll_resp.workflow_execution.run_id;
|
|
238
|
-
let maybe_evicted = self.cache_manager.lock().insert(run_id);
|
|
239
|
-
|
|
240
|
-
if let Some(evicted_run_id) = maybe_evicted {
|
|
241
|
-
self.request_eviction(
|
|
242
|
-
&evicted_run_id,
|
|
243
|
-
"Workflow cache full",
|
|
244
|
-
EvictionReason::CacheFull,
|
|
245
|
-
);
|
|
246
|
-
debug!(run_id=%poll_resp.workflow_execution.run_id,
|
|
247
|
-
"Received a WFT for a new run while at the cache limit. Buffering the task.");
|
|
248
|
-
// Buffer the task
|
|
249
|
-
if let Some(not_buffered) = self
|
|
250
|
-
.workflow_machines
|
|
251
|
-
.buffer_resp_if_outstanding_work(poll_resp)
|
|
252
|
-
{
|
|
253
|
-
self.make_buffered_poll_ready(not_buffered);
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
return None;
|
|
257
|
-
}
|
|
258
|
-
|
|
259
|
-
Some(poll_resp)
|
|
260
|
-
}
|
|
261
|
-
|
|
262
|
-
pub(crate) fn next_pending_activation(&self) -> Option<WorkflowActivation> {
|
|
263
|
-
// Dispatch pending queries first
|
|
264
|
-
if let leg_q @ Some(_) = self.pending_queries.pop() {
|
|
265
|
-
return leg_q;
|
|
266
|
-
}
|
|
267
|
-
// It is important that we do not issue pending activations for any workflows which already
|
|
268
|
-
// have an outstanding activation. If we did, it can result in races where an in-progress
|
|
269
|
-
// completion may appear to be the last in a task (no more pending activations) because
|
|
270
|
-
// concurrently a poll happened to dequeue the pending activation at the right time.
|
|
271
|
-
// NOTE: This all goes away with the handles-per-workflow poll approach.
|
|
272
|
-
let maybe_act = self
|
|
273
|
-
.pending_activations
|
|
274
|
-
.pop_first_matching(|rid| self.workflow_machines.get_activation(rid).is_none());
|
|
275
|
-
if let Some(pending_info) = maybe_act {
|
|
276
|
-
if let Ok(act) = self
|
|
277
|
-
.workflow_machines
|
|
278
|
-
.access_sync(&pending_info.run_id, |wfm| wfm.machines.get_wf_activation())
|
|
279
|
-
.and_then(|mut act| {
|
|
280
|
-
// Only evict workflows after all other pending work is complete.
|
|
281
|
-
if act.jobs.is_empty() {
|
|
282
|
-
if let Some(reason) = pending_info.needs_eviction {
|
|
283
|
-
act.append_evict_job(reason);
|
|
284
|
-
}
|
|
285
|
-
}
|
|
286
|
-
if !act.jobs.is_empty() {
|
|
287
|
-
self.insert_outstanding_activation(&act)?;
|
|
288
|
-
self.cache_manager.lock().touch(&act.run_id);
|
|
289
|
-
Ok(Some(act))
|
|
290
|
-
} else {
|
|
291
|
-
// If for whatever reason we triggered a pending activation but there wasn't
|
|
292
|
-
// actually any work to be done, just ignore that.
|
|
293
|
-
Ok(None)
|
|
294
|
-
}
|
|
295
|
-
})
|
|
296
|
-
{
|
|
297
|
-
act
|
|
298
|
-
} else {
|
|
299
|
-
self.request_eviction(
|
|
300
|
-
&pending_info.run_id,
|
|
301
|
-
"Tried to apply pending activation for missing run",
|
|
302
|
-
EvictionReason::Fatal,
|
|
303
|
-
);
|
|
304
|
-
// Continue trying to return a valid pending activation
|
|
305
|
-
self.next_pending_activation()
|
|
306
|
-
}
|
|
307
|
-
} else {
|
|
308
|
-
None
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
pub(crate) fn next_buffered_poll(&self) -> Option<ValidPollWFTQResponse> {
|
|
313
|
-
self.ready_buffered_wft.pop()
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
pub(crate) fn outstanding_wft(&self) -> usize {
|
|
317
|
-
self.workflow_machines.outstanding_wft()
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
/// Returns the event id of the most recently processed event for the provided run id.
|
|
321
|
-
pub(crate) fn most_recently_processed_event(
|
|
322
|
-
&self,
|
|
323
|
-
run_id: &str,
|
|
324
|
-
) -> Result<i64, WorkflowMissingError> {
|
|
325
|
-
self.workflow_machines
|
|
326
|
-
.access_sync(run_id, |wfm| wfm.machines.last_processed_event)
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
/// Request a workflow eviction. This will queue up an activation to evict the workflow from
|
|
330
|
-
/// the lang side. Workflow will not *actually* be evicted until lang replies to that activation
|
|
331
|
-
///
|
|
332
|
-
/// Returns, if found, the number of attempts on the current workflow task
|
|
333
|
-
pub(crate) fn request_eviction(
|
|
334
|
-
&self,
|
|
335
|
-
run_id: &str,
|
|
336
|
-
message: impl Into<String>,
|
|
337
|
-
reason: EvictionReason,
|
|
338
|
-
) -> EvictionRequestResult {
|
|
339
|
-
if self.workflow_machines.exists(run_id) {
|
|
340
|
-
let attempts = self
|
|
341
|
-
.workflow_machines
|
|
342
|
-
.get_task(run_id)
|
|
343
|
-
.map(|wt| wt.info.attempt);
|
|
344
|
-
if !self.activation_has_eviction(run_id) {
|
|
345
|
-
let message = message.into();
|
|
346
|
-
debug!(%run_id, %message, "Eviction requested");
|
|
347
|
-
// Queue up an eviction activation
|
|
348
|
-
self.pending_activations
|
|
349
|
-
.notify_needs_eviction(run_id, message, reason);
|
|
350
|
-
self.pending_activations_notifier.notify_waiters();
|
|
351
|
-
EvictionRequestResult::EvictionRequested(attempts)
|
|
352
|
-
} else {
|
|
353
|
-
EvictionRequestResult::EvictionAlreadyRequested(attempts)
|
|
354
|
-
}
|
|
355
|
-
} else {
|
|
356
|
-
warn!(%run_id, "Eviction requested for unknown run");
|
|
357
|
-
EvictionRequestResult::NotFound
|
|
358
|
-
}
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
/// Evict a workflow from the cache by its run id. Any existing pending activations will be
|
|
362
|
-
/// destroyed, and any outstanding activations invalidated.
|
|
363
|
-
fn evict_run(&self, run_id: &str) {
|
|
364
|
-
debug!(run_id=%run_id, "Evicting run");
|
|
365
|
-
|
|
366
|
-
self.cache_manager.lock().remove(run_id);
|
|
367
|
-
let maybe_buffered = self.workflow_machines.evict(run_id);
|
|
368
|
-
self.pending_activations.remove_all_with_run_id(run_id);
|
|
369
|
-
|
|
370
|
-
if let Some(buffered) = maybe_buffered {
|
|
371
|
-
// If we just evicted something and there was a buffered poll response for the workflow,
|
|
372
|
-
// it is now ready to be produced by the next poll. (Not immediate next, since, ignoring
|
|
373
|
-
// other workflows, the next poll will be the eviction we just produced. Buffered polls
|
|
374
|
-
// always are popped after pending activations)
|
|
375
|
-
self.make_buffered_poll_ready(buffered);
|
|
376
|
-
}
|
|
377
|
-
}
|
|
378
|
-
|
|
379
|
-
/// Given a validated poll response from the server, prepare an activation (if there is one) to
|
|
380
|
-
/// be sent to lang.
|
|
381
|
-
///
|
|
382
|
-
/// The new activation is immediately considered to be an outstanding workflow task - so it is
|
|
383
|
-
/// expected that new activations will be dispatched to lang right away.
|
|
384
|
-
pub(crate) async fn apply_new_poll_resp(
|
|
385
|
-
&self,
|
|
386
|
-
work: ValidPollWFTQResponse,
|
|
387
|
-
client: Arc<WorkerClientBag>,
|
|
388
|
-
) -> NewWfTaskOutcome {
|
|
389
|
-
let mut work = if let Some(w) = self.workflow_machines.buffer_resp_if_outstanding_work(work)
|
|
390
|
-
{
|
|
391
|
-
w
|
|
392
|
-
} else {
|
|
393
|
-
return NewWfTaskOutcome::TaskBuffered;
|
|
394
|
-
};
|
|
395
|
-
|
|
396
|
-
let start_event_id = work.history.events.first().map(|e| e.event_id);
|
|
397
|
-
debug!(
|
|
398
|
-
task_token = %&work.task_token,
|
|
399
|
-
history_length = %work.history.events.len(),
|
|
400
|
-
start_event_id = ?start_event_id,
|
|
401
|
-
attempt = %work.attempt,
|
|
402
|
-
run_id = %work.workflow_execution.run_id,
|
|
403
|
-
"Applying new workflow task from server"
|
|
404
|
-
);
|
|
405
|
-
let task_start_time = Instant::now();
|
|
406
|
-
|
|
407
|
-
// Check if there is a legacy query we either need to immediately issue an activation for
|
|
408
|
-
// (if there is no more replay work to do) or we need to store for later answering.
|
|
409
|
-
let legacy_query = work
|
|
410
|
-
.legacy_query
|
|
411
|
-
.take()
|
|
412
|
-
.map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
|
|
413
|
-
|
|
414
|
-
let (info, mut next_activation, mut pending_queries) =
|
|
415
|
-
match self.instantiate_or_update_workflow(work, client).await {
|
|
416
|
-
Ok(res) => res,
|
|
417
|
-
Err(e) => {
|
|
418
|
-
return NewWfTaskOutcome::Evict(e);
|
|
419
|
-
}
|
|
420
|
-
};
|
|
421
|
-
|
|
422
|
-
if !pending_queries.is_empty() && legacy_query.is_some() {
|
|
423
|
-
error!(
|
|
424
|
-
"Server issued both normal and legacy queries. This should not happen. Please \
|
|
425
|
-
file a bug report."
|
|
426
|
-
);
|
|
427
|
-
return NewWfTaskOutcome::Evict(WorkflowUpdateError {
|
|
428
|
-
source: WFMachinesError::Fatal(
|
|
429
|
-
"Server issued both normal and legacy query".to_string(),
|
|
430
|
-
),
|
|
431
|
-
run_id: next_activation.run_id,
|
|
432
|
-
});
|
|
433
|
-
}
|
|
434
|
-
|
|
435
|
-
// Immediately dispatch query activation if no other jobs
|
|
436
|
-
if let Some(lq) = legacy_query {
|
|
437
|
-
if next_activation.jobs.is_empty() {
|
|
438
|
-
debug!("Dispatching legacy query {}", &lq);
|
|
439
|
-
next_activation
|
|
440
|
-
.jobs
|
|
441
|
-
.push(workflow_activation_job::Variant::QueryWorkflow(lq).into());
|
|
442
|
-
} else {
|
|
443
|
-
pending_queries.push(lq);
|
|
444
|
-
}
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
self.workflow_machines
|
|
448
|
-
.insert_wft(
|
|
449
|
-
&next_activation.run_id,
|
|
450
|
-
OutstandingTask {
|
|
451
|
-
info,
|
|
452
|
-
pending_queries,
|
|
453
|
-
start_time: task_start_time,
|
|
454
|
-
},
|
|
455
|
-
)
|
|
456
|
-
.expect("Workflow machines must exist, we just created/updated them");
|
|
457
|
-
|
|
458
|
-
if next_activation.jobs.is_empty() {
|
|
459
|
-
let outstanding_las = self
|
|
460
|
-
.workflow_machines
|
|
461
|
-
.access_sync(&next_activation.run_id, |wfm| {
|
|
462
|
-
wfm.machines.outstanding_local_activity_count()
|
|
463
|
-
})
|
|
464
|
-
.expect("Workflow machines must exist, we just created/updated them");
|
|
465
|
-
if outstanding_las > 0 {
|
|
466
|
-
// If there are outstanding local activities, we don't want to autocomplete the
|
|
467
|
-
// workflow task. We want to give them a chance to complete. If they take longer
|
|
468
|
-
// than the WFT timeout, we will force a new WFT just before the timeout.
|
|
469
|
-
NewWfTaskOutcome::LocalActsOutstanding
|
|
470
|
-
} else {
|
|
471
|
-
NewWfTaskOutcome::Autocomplete
|
|
472
|
-
}
|
|
473
|
-
} else {
|
|
474
|
-
if let Err(wme) = self.insert_outstanding_activation(&next_activation) {
|
|
475
|
-
return NewWfTaskOutcome::Evict(wme.into());
|
|
476
|
-
}
|
|
477
|
-
NewWfTaskOutcome::IssueActivation(next_activation)
|
|
478
|
-
}
|
|
479
|
-
}
|
|
480
|
-
|
|
481
|
-
/// Record a successful activation. Returns (if any) commands that should be reported to the
|
|
482
|
-
/// server as part of wft completion
|
|
483
|
-
pub(crate) async fn successful_activation(
|
|
484
|
-
&self,
|
|
485
|
-
run_id: &str,
|
|
486
|
-
mut commands: Vec<WFCommand>,
|
|
487
|
-
local_activity_request_sink: impl FnOnce(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>,
|
|
488
|
-
) -> Result<Option<ServerCommandsWithWorkflowInfo>, WorkflowUpdateError> {
|
|
489
|
-
// There used to be code here that would return right away if the run reply had no commands
|
|
490
|
-
// and the activation that was just completed only had an eviction in it. That was bad
|
|
491
|
-
// because we wouldn't have yet sent any previously buffered commands since there was a
|
|
492
|
-
// pending activation (the eviction) and then we would *skip* doing anything with them here,
|
|
493
|
-
// because there were no new commands. In general it seems best to avoid short-circuiting
|
|
494
|
-
// here.
|
|
495
|
-
|
|
496
|
-
let activation_was_only_eviction = self.activation_has_only_eviction(run_id);
|
|
497
|
-
let (task_token, has_pending_query, start_time) =
|
|
498
|
-
if let Some(entry) = self.workflow_machines.get_task(run_id) {
|
|
499
|
-
(
|
|
500
|
-
entry.info.task_token.clone(),
|
|
501
|
-
!entry.pending_queries.is_empty(),
|
|
502
|
-
entry.start_time,
|
|
503
|
-
)
|
|
504
|
-
} else {
|
|
505
|
-
if !activation_was_only_eviction {
|
|
506
|
-
// Don't bother warning if this was an eviction, since it's normal to issue
|
|
507
|
-
// eviction activations without an associated workflow task in that case.
|
|
508
|
-
warn!(
|
|
509
|
-
run_id,
|
|
510
|
-
"Attempted to complete activation for run without associated workflow task"
|
|
511
|
-
);
|
|
512
|
-
}
|
|
513
|
-
return Ok(None);
|
|
514
|
-
};
|
|
515
|
-
|
|
516
|
-
// If the only command from the activation is a legacy query response, that means we need
|
|
517
|
-
// to respond differently than a typical activation.
|
|
518
|
-
let ret = if matches!(&commands.as_slice(),
|
|
519
|
-
&[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
|
|
520
|
-
{
|
|
521
|
-
let qr = match commands.remove(0) {
|
|
522
|
-
WFCommand::QueryResponse(qr) => qr,
|
|
523
|
-
_ => unreachable!("We just verified this is the only command"),
|
|
524
|
-
};
|
|
525
|
-
Some(ServerCommandsWithWorkflowInfo {
|
|
526
|
-
task_token,
|
|
527
|
-
action: ActivationAction::RespondLegacyQuery { result: qr },
|
|
528
|
-
})
|
|
529
|
-
} else {
|
|
530
|
-
// First strip out query responses from other commands that actually affect machines
|
|
531
|
-
// Would be prettier with `drain_filter`
|
|
532
|
-
let mut i = 0;
|
|
533
|
-
let mut query_responses = vec![];
|
|
534
|
-
while i < commands.len() {
|
|
535
|
-
if matches!(commands[i], WFCommand::QueryResponse(_)) {
|
|
536
|
-
if let WFCommand::QueryResponse(qr) = commands.remove(i) {
|
|
537
|
-
if qr.query_id == LEGACY_QUERY_ID {
|
|
538
|
-
return Err(WorkflowUpdateError {
|
|
539
|
-
source: WFMachinesError::Fatal(
|
|
540
|
-
"Legacy query activation response included other commands, \
|
|
541
|
-
this is not allowed and constitutes an error in the lang SDK"
|
|
542
|
-
.to_string(),
|
|
543
|
-
),
|
|
544
|
-
run_id: run_id.to_string(),
|
|
545
|
-
});
|
|
546
|
-
}
|
|
547
|
-
query_responses.push(qr);
|
|
548
|
-
}
|
|
549
|
-
} else {
|
|
550
|
-
i += 1;
|
|
551
|
-
}
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
let activation_was_eviction = self.activation_has_eviction(run_id);
|
|
555
|
-
let (are_pending, server_cmds, local_activities, wft_timeout) = machine_mut!(
|
|
556
|
-
self,
|
|
557
|
-
run_id,
|
|
558
|
-
|wfm: &mut WorkflowManager| {
|
|
559
|
-
async move {
|
|
560
|
-
// Send commands from lang into the machines then check if the workflow run
|
|
561
|
-
// needs another activation and mark it if so
|
|
562
|
-
wfm.push_commands(commands).await?;
|
|
563
|
-
// Don't bother applying the next task if we're evicting at the end of
|
|
564
|
-
// this activation
|
|
565
|
-
let are_pending = if !activation_was_eviction {
|
|
566
|
-
wfm.apply_next_task_if_ready().await?
|
|
567
|
-
} else {
|
|
568
|
-
false
|
|
569
|
-
};
|
|
570
|
-
// We want to fetch the outgoing commands only after a next WFT may have
|
|
571
|
-
// been applied, as outgoing server commands may be affected.
|
|
572
|
-
let outgoing_cmds = wfm.get_server_commands();
|
|
573
|
-
let new_local_acts = wfm.drain_queued_local_activities();
|
|
574
|
-
|
|
575
|
-
let wft_timeout: Duration = wfm
|
|
576
|
-
.machines
|
|
577
|
-
.get_started_info()
|
|
578
|
-
.and_then(|attrs| attrs.workflow_task_timeout)
|
|
579
|
-
.ok_or_else(|| {
|
|
580
|
-
WFMachinesError::Fatal(
|
|
581
|
-
"Workflow's start attribs were missing a well formed task timeout"
|
|
582
|
-
.to_string(),
|
|
583
|
-
)
|
|
584
|
-
})?;
|
|
585
|
-
|
|
586
|
-
Ok((are_pending, outgoing_cmds, new_local_acts, wft_timeout))
|
|
587
|
-
}
|
|
588
|
-
.boxed()
|
|
589
|
-
}
|
|
590
|
-
)?;
|
|
591
|
-
|
|
592
|
-
if are_pending {
|
|
593
|
-
self.needs_activation(run_id);
|
|
594
|
-
}
|
|
595
|
-
let immediate_resolutions = local_activity_request_sink(local_activities);
|
|
596
|
-
for resolution in immediate_resolutions {
|
|
597
|
-
self.notify_of_local_result(run_id, LocalResolution::LocalActivity(resolution))
|
|
598
|
-
.await?;
|
|
599
|
-
}
|
|
600
|
-
|
|
601
|
-
// The heartbeat deadline is 80% of the WFT timeout
|
|
602
|
-
let wft_heartbeat_deadline =
|
|
603
|
-
start_time.add(wft_timeout.mul_f32(WFT_HEARTBEAT_TIMEOUT_FRACTION));
|
|
604
|
-
// Wait on local activities to resolve if there are any, or for the WFT timeout to
|
|
605
|
-
// be about to expire, in which case we will need to send a WFT heartbeat.
|
|
606
|
-
let must_heartbeat = self
|
|
607
|
-
.wait_for_local_acts_or_heartbeat(run_id, wft_heartbeat_deadline)
|
|
608
|
-
.await;
|
|
609
|
-
let has_query_responses = !query_responses.is_empty();
|
|
610
|
-
let is_query_playback = has_pending_query && !has_query_responses;
|
|
611
|
-
|
|
612
|
-
// We only actually want to send commands back to the server if there are no more
|
|
613
|
-
// pending activations and we are caught up on replay. We don't want to complete a wft
|
|
614
|
-
// if we already saw the final event in the workflow, or if we are playing back for the
|
|
615
|
-
// express purpose of fulfilling a query. If the activation we sent was *only* an
|
|
616
|
-
// eviction, and there were no commands produced during iteration, don't send that
|
|
617
|
-
// either.
|
|
618
|
-
let no_commands_and_evicting =
|
|
619
|
-
server_cmds.commands.is_empty() && activation_was_only_eviction;
|
|
620
|
-
let to_be_sent = ServerCommandsWithWorkflowInfo {
|
|
621
|
-
task_token,
|
|
622
|
-
action: ActivationAction::WftComplete {
|
|
623
|
-
// TODO: Don't force if also sending complete execution cmd
|
|
624
|
-
force_new_wft: must_heartbeat,
|
|
625
|
-
commands: server_cmds.commands,
|
|
626
|
-
query_responses,
|
|
627
|
-
},
|
|
628
|
-
};
|
|
629
|
-
let should_respond = !(self.pending_activations.has_pending(run_id)
|
|
630
|
-
|| server_cmds.replaying
|
|
631
|
-
|| is_query_playback
|
|
632
|
-
|| no_commands_and_evicting);
|
|
633
|
-
if should_respond || has_query_responses {
|
|
634
|
-
Some(to_be_sent)
|
|
635
|
-
} else {
|
|
636
|
-
None
|
|
637
|
-
}
|
|
638
|
-
};
|
|
639
|
-
Ok(ret)
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
/// Record that an activation failed, returns enum that indicates if failure should be reported
|
|
643
|
-
/// to the server
|
|
644
|
-
pub(crate) fn failed_activation(
|
|
645
|
-
&self,
|
|
646
|
-
run_id: &str,
|
|
647
|
-
reason: EvictionReason,
|
|
648
|
-
failstr: String,
|
|
649
|
-
) -> FailedActivationOutcome {
|
|
650
|
-
let tt = if let Some(tt) = self
|
|
651
|
-
.workflow_machines
|
|
652
|
-
.get_task(run_id)
|
|
653
|
-
.map(|t| t.info.task_token.clone())
|
|
654
|
-
{
|
|
655
|
-
tt
|
|
656
|
-
} else {
|
|
657
|
-
warn!(
|
|
658
|
-
"No info for workflow with run id {} found when trying to fail activation",
|
|
659
|
-
run_id
|
|
660
|
-
);
|
|
661
|
-
return FailedActivationOutcome::NoReport;
|
|
662
|
-
};
|
|
663
|
-
if let Some(m) = self.workflow_machines.run_metrics(run_id) {
|
|
664
|
-
m.wf_task_failed();
|
|
665
|
-
}
|
|
666
|
-
// If the outstanding activation is a legacy query task, report that we need to fail it
|
|
667
|
-
if let Some(OutstandingActivation::LegacyQuery) =
|
|
668
|
-
self.workflow_machines.get_activation(run_id)
|
|
669
|
-
{
|
|
670
|
-
FailedActivationOutcome::ReportLegacyQueryFailure(tt)
|
|
671
|
-
} else {
|
|
672
|
-
// Blow up any cached data associated with the workflow
|
|
673
|
-
let should_report = match self.request_eviction(run_id, failstr, reason) {
|
|
674
|
-
EvictionRequestResult::EvictionRequested(Some(attempt))
|
|
675
|
-
| EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
|
|
676
|
-
_ => false,
|
|
677
|
-
};
|
|
678
|
-
if should_report {
|
|
679
|
-
FailedActivationOutcome::Report(tt)
|
|
680
|
-
} else {
|
|
681
|
-
FailedActivationOutcome::NoReport
|
|
682
|
-
}
|
|
683
|
-
}
|
|
684
|
-
}
|
|
685
|
-
|
|
686
|
-
/// Will create a new workflow manager if needed for the workflow activation, if not, it will
|
|
687
|
-
/// feed the existing manager the updated history we received from the server.
|
|
688
|
-
///
|
|
689
|
-
/// Returns the next workflow activation and some info about it, if an activation is needed.
|
|
690
|
-
async fn instantiate_or_update_workflow(
|
|
691
|
-
&self,
|
|
692
|
-
poll_wf_resp: ValidPollWFTQResponse,
|
|
693
|
-
client: Arc<WorkerClientBag>,
|
|
694
|
-
) -> Result<(WorkflowTaskInfo, WorkflowActivation, Vec<QueryWorkflow>), WorkflowUpdateError>
|
|
695
|
-
{
|
|
696
|
-
let run_id = poll_wf_resp.workflow_execution.run_id.clone();
|
|
697
|
-
|
|
698
|
-
let wft_info = WorkflowTaskInfo {
|
|
699
|
-
attempt: poll_wf_resp.attempt,
|
|
700
|
-
task_token: poll_wf_resp.task_token,
|
|
701
|
-
};
|
|
702
|
-
|
|
703
|
-
let poll_resp_is_incremental = poll_wf_resp
|
|
704
|
-
.history
|
|
705
|
-
.events
|
|
706
|
-
.get(0)
|
|
707
|
-
.map(|ev| ev.event_id > 1)
|
|
708
|
-
.unwrap_or_default();
|
|
709
|
-
let poll_resp_is_incremental =
|
|
710
|
-
poll_resp_is_incremental || poll_wf_resp.history.events.is_empty();
|
|
711
|
-
|
|
712
|
-
let mut did_miss_cache = !poll_resp_is_incremental;
|
|
713
|
-
|
|
714
|
-
let page_token = if !self.workflow_machines.exists(&run_id) && poll_resp_is_incremental {
|
|
715
|
-
debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
|
|
716
|
-
cache. Will fetch history");
|
|
717
|
-
self.metrics.sticky_cache_miss();
|
|
718
|
-
did_miss_cache = true;
|
|
719
|
-
NextPageToken::FetchFromStart
|
|
720
|
-
} else {
|
|
721
|
-
poll_wf_resp.next_page_token.into()
|
|
722
|
-
};
|
|
723
|
-
let history_update = HistoryUpdate::new(
|
|
724
|
-
HistoryPaginator::new(
|
|
725
|
-
poll_wf_resp.history,
|
|
726
|
-
poll_wf_resp.workflow_execution.workflow_id.clone(),
|
|
727
|
-
poll_wf_resp.workflow_execution.run_id,
|
|
728
|
-
page_token,
|
|
729
|
-
client.clone(),
|
|
730
|
-
),
|
|
731
|
-
poll_wf_resp.previous_started_event_id,
|
|
732
|
-
);
|
|
733
|
-
|
|
734
|
-
match self
|
|
735
|
-
.workflow_machines
|
|
736
|
-
.create_or_update(
|
|
737
|
-
&run_id,
|
|
738
|
-
history_update,
|
|
739
|
-
&poll_wf_resp.workflow_execution.workflow_id,
|
|
740
|
-
client.namespace(),
|
|
741
|
-
&poll_wf_resp.workflow_type,
|
|
742
|
-
&self.metrics,
|
|
743
|
-
)
|
|
744
|
-
.await
|
|
745
|
-
{
|
|
746
|
-
Ok(mut activation) => {
|
|
747
|
-
// If there are in-poll queries, insert jobs for those queries into the activation,
|
|
748
|
-
// but only if we hit the cache. If we didn't, those queries will need to be dealt
|
|
749
|
-
// with once replay is over
|
|
750
|
-
let mut pending_queries = vec![];
|
|
751
|
-
if !poll_wf_resp.query_requests.is_empty() {
|
|
752
|
-
if !did_miss_cache {
|
|
753
|
-
let query_jobs = poll_wf_resp
|
|
754
|
-
.query_requests
|
|
755
|
-
.into_iter()
|
|
756
|
-
.map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
|
|
757
|
-
activation.jobs.extend(query_jobs);
|
|
758
|
-
} else {
|
|
759
|
-
poll_wf_resp
|
|
760
|
-
.query_requests
|
|
761
|
-
.into_iter()
|
|
762
|
-
.for_each(|q| pending_queries.push(q));
|
|
763
|
-
}
|
|
764
|
-
}
|
|
765
|
-
|
|
766
|
-
Ok((wft_info, activation, pending_queries))
|
|
767
|
-
}
|
|
768
|
-
Err(source) => Err(WorkflowUpdateError { source, run_id }),
|
|
769
|
-
}
|
|
770
|
-
}
|
|
771
|
-
|
|
772
|
-
/// Called after every workflow activation completion or failure, updates outstanding task
|
|
773
|
-
/// status & issues evictions if required. It is important this is called *after* potentially
|
|
774
|
-
/// reporting a successful WFT to server, as some replies (task not found) may require an
|
|
775
|
-
/// eviction, which could be avoided if this is called too early.
|
|
776
|
-
///
|
|
777
|
-
/// Returns true if WFT was marked completed internally
|
|
778
|
-
pub(crate) fn after_wft_report(&self, run_id: &str, reported_wft_to_server: bool) -> bool {
|
|
779
|
-
let mut just_evicted = false;
|
|
780
|
-
|
|
781
|
-
if self
|
|
782
|
-
.workflow_machines
|
|
783
|
-
.get_activation(run_id)
|
|
784
|
-
.map(|a| a.has_eviction())
|
|
785
|
-
.unwrap_or_default()
|
|
786
|
-
{
|
|
787
|
-
self.evict_run(run_id);
|
|
788
|
-
just_evicted = true;
|
|
789
|
-
};
|
|
790
|
-
|
|
791
|
-
// Workflows with no more pending activations (IE: They have completed a WFT) must be
|
|
792
|
-
// removed from the outstanding tasks map
|
|
793
|
-
if !self.pending_activations.has_pending(run_id) && !just_evicted {
|
|
794
|
-
if let Some(ref mut ot) = &mut *self
|
|
795
|
-
.workflow_machines
|
|
796
|
-
.get_task_mut(run_id)
|
|
797
|
-
.expect("Machine must exist")
|
|
798
|
-
{
|
|
799
|
-
// Check if there was a pending query which must be fulfilled, and if there is
|
|
800
|
-
// create a new pending activation for it.
|
|
801
|
-
if !ot.pending_queries.is_empty() {
|
|
802
|
-
for query in ot.pending_queries.drain(..) {
|
|
803
|
-
let na = create_query_activation(run_id.to_string(), [query]);
|
|
804
|
-
self.pending_queries.push(na);
|
|
805
|
-
}
|
|
806
|
-
self.pending_activations_notifier.notify_waiters();
|
|
807
|
-
return false;
|
|
808
|
-
}
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
// Evict run id if cache is full. Non-sticky will always evict.
|
|
812
|
-
let maybe_evicted = self.cache_manager.lock().insert(run_id);
|
|
813
|
-
if let Some(evicted_run_id) = maybe_evicted {
|
|
814
|
-
self.request_eviction(
|
|
815
|
-
&evicted_run_id,
|
|
816
|
-
"Workflow cache full",
|
|
817
|
-
EvictionReason::CacheFull,
|
|
818
|
-
);
|
|
819
|
-
}
|
|
820
|
-
|
|
821
|
-
// If there was a buffered poll response from the server, it is now ready to
|
|
822
|
-
// be handled.
|
|
823
|
-
if let Some(buffd) = self.workflow_machines.take_buffered_poll(run_id) {
|
|
824
|
-
self.make_buffered_poll_ready(buffd);
|
|
825
|
-
}
|
|
826
|
-
}
|
|
827
|
-
|
|
828
|
-
// If we reported to server, we always want to mark it complete.
|
|
829
|
-
let wft_marked_complete = self
|
|
830
|
-
.workflow_machines
|
|
831
|
-
.complete_wft(run_id, reported_wft_to_server)
|
|
832
|
-
.is_some();
|
|
833
|
-
self.on_activation_done(run_id);
|
|
834
|
-
wft_marked_complete
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
/// Must be called after *every* activation is replied to, regardless of whether or not we
|
|
838
|
-
/// had some issue reporting it to server or anything else. This upholds the invariant that
|
|
839
|
-
/// every activation we issue to lang has exactly one reply.
|
|
840
|
-
///
|
|
841
|
-
/// Any subsequent action that needs to be taken will be created as a new activation
|
|
842
|
-
fn on_activation_done(&self, run_id: &str) {
|
|
843
|
-
self.workflow_machines.delete_activation(run_id);
|
|
844
|
-
// It's important to use `notify_one` here to avoid possible races where we're waiting
|
|
845
|
-
// on a cache slot and fail to realize pending activations must be issued before a slot
|
|
846
|
-
// will free up.
|
|
847
|
-
self.pending_activations_notifier.notify_one();
|
|
848
|
-
}
|
|
849
|
-
|
|
850
|
-
/// Let a workflow know that something we've been waiting locally on has resolved, like a local
|
|
851
|
-
/// activity or side effect
|
|
852
|
-
#[instrument(level = "debug", skip(self, resolved))]
|
|
853
|
-
pub(crate) async fn notify_of_local_result(
|
|
854
|
-
&self,
|
|
855
|
-
run_id: &str,
|
|
856
|
-
resolved: LocalResolution,
|
|
857
|
-
) -> Result<(), WorkflowUpdateError> {
|
|
858
|
-
let result_was_important = self
|
|
859
|
-
.workflow_machines
|
|
860
|
-
.access_sync(run_id, |wfm: &mut WorkflowManager| {
|
|
861
|
-
wfm.notify_of_local_result(resolved)
|
|
862
|
-
})?
|
|
863
|
-
.map_err(|wfme| WorkflowUpdateError {
|
|
864
|
-
source: wfme,
|
|
865
|
-
run_id: run_id.to_string(),
|
|
866
|
-
})?;
|
|
867
|
-
|
|
868
|
-
if result_was_important {
|
|
869
|
-
self.needs_activation(run_id);
|
|
870
|
-
}
|
|
871
|
-
Ok(())
|
|
872
|
-
}
|
|
873
|
-
|
|
874
|
-
/// Queues a previously buffered poll response onto `ready_buffered_wft` so it can be handled.
fn make_buffered_poll_ready(&self, buffd: ValidPollWFTQResponse) {
    self.ready_buffered_wft.push(buffd);
}
|
|
877
|
-
|
|
878
|
-
fn insert_outstanding_activation(
|
|
879
|
-
&self,
|
|
880
|
-
act: &WorkflowActivation,
|
|
881
|
-
) -> Result<(), WorkflowMissingError> {
|
|
882
|
-
let act_type = if act.is_legacy_query() {
|
|
883
|
-
OutstandingActivation::LegacyQuery
|
|
884
|
-
} else {
|
|
885
|
-
OutstandingActivation::Normal {
|
|
886
|
-
contains_eviction: act.eviction_index().is_some(),
|
|
887
|
-
num_jobs: act.jobs.len(),
|
|
888
|
-
}
|
|
889
|
-
};
|
|
890
|
-
match self
|
|
891
|
-
.workflow_machines
|
|
892
|
-
.insert_activation(&act.run_id, act_type)
|
|
893
|
-
{
|
|
894
|
-
Ok(None) => Ok(()),
|
|
895
|
-
Ok(Some(previous)) => {
|
|
896
|
-
// This is a panic because we have screwed up core logic if this is violated. It
|
|
897
|
-
// must be upheld.
|
|
898
|
-
panic!(
|
|
899
|
-
"Attempted to insert a new outstanding activation {}, but there already was \
|
|
900
|
-
one outstanding: {:?}",
|
|
901
|
-
act, previous
|
|
902
|
-
);
|
|
903
|
-
}
|
|
904
|
-
Err(e) => Err(e),
|
|
905
|
-
}
|
|
906
|
-
}
|
|
907
|
-
|
|
908
|
-
fn activation_has_only_eviction(&self, run_id: &str) -> bool {
|
|
909
|
-
self.workflow_machines
|
|
910
|
-
.get_activation(run_id)
|
|
911
|
-
.map(OutstandingActivation::has_only_eviction)
|
|
912
|
-
.unwrap_or_default()
|
|
913
|
-
}
|
|
914
|
-
|
|
915
|
-
fn activation_has_eviction(&self, run_id: &str) -> bool {
|
|
916
|
-
self.workflow_machines
|
|
917
|
-
.get_activation(run_id)
|
|
918
|
-
.map(OutstandingActivation::has_eviction)
|
|
919
|
-
.unwrap_or_default()
|
|
920
|
-
}
|
|
921
|
-
|
|
922
|
-
fn needs_activation(&self, run_id: &str) {
|
|
923
|
-
self.pending_activations.notify_needs_activation(run_id);
|
|
924
|
-
self.pending_activations_notifier.notify_waiters();
|
|
925
|
-
}
|
|
926
|
-
|
|
927
|
-
/// Wait for either all local activities to resolve, or for 80% of the WFT timeout, in which
|
|
928
|
-
/// case we will "heartbeat" by completing the WFT, even if there are no commands to send.
|
|
929
|
-
///
|
|
930
|
-
/// Returns true if we must heartbeat
|
|
931
|
-
async fn wait_for_local_acts_or_heartbeat(
|
|
932
|
-
&self,
|
|
933
|
-
run_id: &str,
|
|
934
|
-
wft_heartbeat_deadline: Instant,
|
|
935
|
-
) -> bool {
|
|
936
|
-
loop {
|
|
937
|
-
let la_count = self
|
|
938
|
-
.workflow_machines
|
|
939
|
-
.access_sync(run_id, |wfm| {
|
|
940
|
-
wfm.machines.outstanding_local_activity_count()
|
|
941
|
-
})
|
|
942
|
-
.expect("Workflow cannot go missing while we are waiting on LAs");
|
|
943
|
-
if la_count == 0 {
|
|
944
|
-
return false;
|
|
945
|
-
} else if Instant::now() >= wft_heartbeat_deadline {
|
|
946
|
-
// We must heartbeat b/c there are still pending local activities
|
|
947
|
-
return true;
|
|
948
|
-
}
|
|
949
|
-
// Since an LA resolution always results in a new pending activation, we can wait on
|
|
950
|
-
// notifications of that to re-check if they're all resolved.
|
|
951
|
-
let _ = timeout_at(
|
|
952
|
-
wft_heartbeat_deadline.into(),
|
|
953
|
-
self.pending_activations_notifier.notified(),
|
|
954
|
-
)
|
|
955
|
-
.await;
|
|
956
|
-
}
|
|
957
|
-
}
|
|
958
|
-
}
|
|
959
|
-
|
|
960
|
-
#[derive(Debug)]
|
|
961
|
-
pub(crate) struct WorkflowUpdateError {
|
|
962
|
-
/// Underlying workflow error
|
|
963
|
-
pub source: WFMachinesError,
|
|
964
|
-
/// The run id of the erring workflow
|
|
965
|
-
#[allow(dead_code)] // Useful in debug output
|
|
966
|
-
pub run_id: String,
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
impl WorkflowUpdateError {
|
|
970
|
-
pub fn evict_reason(&self) -> EvictionReason {
|
|
971
|
-
self.source.evict_reason()
|
|
972
|
-
}
|
|
973
|
-
}
|
|
974
|
-
|
|
975
|
-
impl From<WorkflowMissingError> for WorkflowUpdateError {
|
|
976
|
-
fn from(wme: WorkflowMissingError) -> Self {
|
|
977
|
-
Self {
|
|
978
|
-
source: WFMachinesError::Fatal("Workflow machines missing".to_string()),
|
|
979
|
-
run_id: wme.run_id,
|
|
980
|
-
}
|
|
981
|
-
}
|
|
982
|
-
}
|
|
983
|
-
|
|
984
|
-
/// Signals that the workflow machines for a run were expected to be in the cache but were not
/// found there.
#[derive(Debug)]
pub(crate) struct WorkflowMissingError {
    /// Run id of the workflow whose machines were missing
    pub run_id: String,
}
|