@temporalio/core-bridge 1.5.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +255 -48
- package/package.json +4 -4
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.buildkite/pipeline.yml +1 -3
- package/sdk-core/.cargo/config.toml +5 -2
- package/sdk-core/.github/workflows/heavy.yml +28 -0
- package/sdk-core/Cargo.toml +1 -1
- package/sdk-core/README.md +9 -5
- package/sdk-core/client/src/lib.rs +211 -36
- package/sdk-core/client/src/raw.rs +1 -1
- package/sdk-core/client/src/retry.rs +32 -20
- package/sdk-core/core/Cargo.toml +23 -9
- package/sdk-core/core/src/abstractions.rs +11 -0
- package/sdk-core/core/src/core_tests/activity_tasks.rs +6 -5
- package/sdk-core/core/src/core_tests/local_activities.rs +263 -22
- package/sdk-core/core/src/core_tests/queries.rs +2 -2
- package/sdk-core/core/src/core_tests/workflow_tasks.rs +249 -5
- package/sdk-core/core/src/ephemeral_server/mod.rs +5 -6
- package/sdk-core/core/src/lib.rs +2 -0
- package/sdk-core/core/src/protosext/mod.rs +1 -1
- package/sdk-core/core/src/telemetry/log_export.rs +1 -1
- package/sdk-core/core/src/telemetry/mod.rs +23 -8
- package/sdk-core/core/src/test_help/mod.rs +8 -1
- package/sdk-core/core/src/worker/activities/local_activities.rs +259 -125
- package/sdk-core/core/src/worker/activities.rs +3 -2
- package/sdk-core/core/src/worker/mod.rs +53 -26
- package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
- package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
- package/sdk-core/core/src/worker/workflow/history_update.rs +835 -277
- package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +9 -17
- package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +3 -5
- package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +3 -5
- package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +73 -51
- package/sdk-core/core/src/worker/workflow/machines/mod.rs +3 -3
- package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +4 -4
- package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +1 -2
- package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +3 -5
- package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +6 -7
- package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
- package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +4 -4
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +89 -58
- package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +4 -7
- package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +21 -9
- package/sdk-core/core/src/worker/workflow/managed_run.rs +1021 -360
- package/sdk-core/core/src/worker/workflow/mod.rs +306 -346
- package/sdk-core/core/src/worker/workflow/run_cache.rs +29 -53
- package/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
- package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
- package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +115 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream.rs +444 -714
- package/sdk-core/core-api/Cargo.toml +2 -0
- package/sdk-core/core-api/src/errors.rs +1 -34
- package/sdk-core/core-api/src/lib.rs +6 -2
- package/sdk-core/core-api/src/worker.rs +14 -1
- package/sdk-core/etc/deps.svg +115 -140
- package/sdk-core/etc/regen-depgraph.sh +5 -0
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +6 -6
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +7 -3
- package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
- package/sdk-core/protos/api_upstream/Makefile +5 -5
- package/sdk-core/protos/api_upstream/build/go.mod +7 -0
- package/sdk-core/protos/api_upstream/build/go.sum +5 -0
- package/sdk-core/protos/api_upstream/build/tools.go +29 -0
- package/sdk-core/protos/api_upstream/go.mod +6 -0
- package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
- package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +12 -19
- package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +3 -3
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +20 -2
- package/sdk-core/protos/api_upstream/temporal/api/{update/v1/message.proto → enums/v1/interaction_type.proto} +11 -18
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +2 -13
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +13 -19
- package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +87 -0
- package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +13 -8
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
- package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +2 -0
- package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
- package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
- package/sdk-core/sdk/Cargo.toml +4 -3
- package/sdk-core/sdk/src/lib.rs +87 -21
- package/sdk-core/sdk/src/workflow_future.rs +7 -12
- package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
- package/sdk-core/sdk-core-protos/build.rs +36 -2
- package/sdk-core/sdk-core-protos/src/history_builder.rs +26 -19
- package/sdk-core/sdk-core-protos/src/history_info.rs +4 -0
- package/sdk-core/sdk-core-protos/src/lib.rs +78 -34
- package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
- package/sdk-core/test-utils/Cargo.toml +3 -1
- package/sdk-core/test-utils/src/histfetch.rs +1 -1
- package/sdk-core/test-utils/src/lib.rs +50 -18
- package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
- package/sdk-core/test-utils/src/workflows.rs +29 -0
- package/sdk-core/tests/fuzzy_workflow.rs +130 -0
- package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +114 -7
- package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -2
- package/sdk-core/tests/integ_tests/metrics_tests.rs +1 -1
- package/sdk-core/tests/integ_tests/polling_tests.rs +1 -39
- package/sdk-core/tests/integ_tests/queries_tests.rs +2 -127
- package/sdk-core/tests/integ_tests/visibility_tests.rs +52 -5
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +74 -1
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +5 -13
- package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +1 -1
- package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +2 -10
- package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +69 -197
- package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +4 -28
- package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
- package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +14 -14
- package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -19
- package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +3 -19
- package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +1 -1
- package/sdk-core/tests/integ_tests/workflow_tests.rs +5 -6
- package/sdk-core/tests/main.rs +2 -12
- package/sdk-core/tests/runner.rs +71 -34
- package/sdk-core/tests/wf_input_replay.rs +32 -0
- package/sdk-core/bridge-ffi/Cargo.toml +0 -24
- package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
- package/sdk-core/bridge-ffi/build.rs +0 -25
- package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
- package/sdk-core/bridge-ffi/src/lib.rs +0 -746
- package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
- package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
- package/sdk-core/sdk/src/conversions.rs +0 -8
|
@@ -1,250 +1,179 @@
|
|
|
1
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
2
|
+
mod saved_wf_inputs;
|
|
3
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
4
|
+
mod tonic_status_serde;
|
|
5
|
+
|
|
6
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
7
|
+
pub use saved_wf_inputs::replay_wf_state_inputs;
|
|
8
|
+
|
|
1
9
|
use crate::{
|
|
2
|
-
abstractions::
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
10
|
+
abstractions::dbg_panic,
|
|
11
|
+
worker::workflow::{
|
|
12
|
+
managed_run::RunUpdateAct,
|
|
13
|
+
run_cache::RunCache,
|
|
14
|
+
wft_extraction::{HistfetchRC, HistoryFetchReq, WFTExtractorOutput},
|
|
15
|
+
*,
|
|
8
16
|
},
|
|
9
17
|
MetricsContext,
|
|
10
18
|
};
|
|
11
19
|
use futures::{stream, stream::PollNext, Stream, StreamExt};
|
|
12
|
-
use std::{collections::VecDeque, fmt::Debug, future, sync::Arc
|
|
13
|
-
use temporal_sdk_core_api::errors::
|
|
20
|
+
use std::{collections::VecDeque, fmt::Debug, future, sync::Arc};
|
|
21
|
+
use temporal_sdk_core_api::errors::PollWfError;
|
|
14
22
|
use temporal_sdk_core_protos::{
|
|
15
|
-
coresdk::
|
|
16
|
-
|
|
17
|
-
create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
|
|
18
|
-
workflow_activation_job,
|
|
19
|
-
},
|
|
20
|
-
workflow_completion::Failure,
|
|
21
|
-
},
|
|
22
|
-
temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure as TFailure},
|
|
23
|
+
coresdk::workflow_activation::remove_from_cache::EvictionReason,
|
|
24
|
+
temporal::api::enums::v1::WorkflowTaskFailedCause,
|
|
23
25
|
};
|
|
24
|
-
use tokio::sync::{mpsc::unbounded_channel, oneshot};
|
|
25
|
-
use tokio_stream::wrappers::UnboundedReceiverStream;
|
|
26
26
|
use tokio_util::sync::CancellationToken;
|
|
27
27
|
use tracing::{Level, Span};
|
|
28
28
|
|
|
29
|
-
/// This struct holds all the state needed for tracking
|
|
30
|
-
/// and
|
|
29
|
+
/// This struct holds all the state needed for tracking the state of currently cached workflow runs
|
|
30
|
+
/// and directs all actions which affect them. It is ultimately the top-level arbiter of nearly
|
|
31
|
+
/// everything important relating to workflow state.
|
|
31
32
|
///
|
|
32
33
|
/// See [WFStream::build] for more
|
|
33
|
-
pub(
|
|
34
|
+
pub(super) struct WFStream {
|
|
34
35
|
runs: RunCache,
|
|
35
36
|
/// Buffered polls for new runs which need a cache slot to open up before we can handle them
|
|
36
37
|
buffered_polls_need_cache_slot: VecDeque<PermittedWFT>,
|
|
38
|
+
/// Is filled with runs that we decided need to have their history fetched during state
|
|
39
|
+
/// manipulation. Must be drained after handling each input.
|
|
40
|
+
runs_needing_fetching: VecDeque<HistoryFetchReq>,
|
|
37
41
|
|
|
38
|
-
|
|
39
|
-
client: Arc<dyn WorkerClient>,
|
|
40
|
-
|
|
41
|
-
/// Ensures we stay at or below this worker's maximum concurrent workflow task limit
|
|
42
|
-
wft_semaphore: MeteredSemaphore,
|
|
42
|
+
history_fetch_refcounter: Arc<HistfetchRC>,
|
|
43
43
|
shutdown_token: CancellationToken,
|
|
44
44
|
ignore_evicts_on_shutdown: bool,
|
|
45
45
|
|
|
46
46
|
metrics: MetricsContext,
|
|
47
|
-
}
|
|
48
|
-
impl WFStream {
|
|
49
|
-
fn record_span_fields(&mut self, run_id: &str, span: &Span) {
|
|
50
|
-
if let Some(run_handle) = self.runs.get_mut(run_id) {
|
|
51
|
-
if let Some(spid) = span.id() {
|
|
52
|
-
if run_handle.recorded_span_ids.contains(&spid) {
|
|
53
|
-
return;
|
|
54
|
-
}
|
|
55
|
-
run_handle.recorded_span_ids.insert(spid);
|
|
56
|
-
|
|
57
|
-
if let Some(wid) = run_handle.wft.as_ref().map(|wft| &wft.info.wf_id) {
|
|
58
|
-
span.record("workflow_id", wid.as_str());
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
47
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
enum WFStreamInput {
|
|
68
|
-
NewWft(PermittedWFT),
|
|
69
|
-
Local(LocalInput),
|
|
70
|
-
/// The stream given to us which represents the poller (or a mock) terminated.
|
|
71
|
-
PollerDead,
|
|
72
|
-
/// The stream given to us which represents the poller (or a mock) encountered a non-retryable
|
|
73
|
-
/// error while polling
|
|
74
|
-
PollerError(tonic::Status),
|
|
75
|
-
}
|
|
76
|
-
impl From<RunUpdateResponse> for WFStreamInput {
|
|
77
|
-
fn from(r: RunUpdateResponse) -> Self {
|
|
78
|
-
WFStreamInput::Local(LocalInput {
|
|
79
|
-
input: LocalInputs::RunUpdateResponse(r.kind),
|
|
80
|
-
span: r.span,
|
|
81
|
-
})
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
/// A non-poller-received input to the [WFStream]
|
|
85
|
-
#[derive(derive_more::DebugCustom)]
|
|
86
|
-
#[debug(fmt = "LocalInput {{ {:?} }}", input)]
|
|
87
|
-
pub(super) struct LocalInput {
|
|
88
|
-
pub input: LocalInputs,
|
|
89
|
-
pub span: Span,
|
|
90
|
-
}
|
|
91
|
-
/// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
|
|
92
|
-
/// new polls.
|
|
93
|
-
#[derive(Debug, derive_more::From)]
|
|
94
|
-
pub(super) enum LocalInputs {
|
|
95
|
-
Completion(WFActCompleteMsg),
|
|
96
|
-
LocalResolution(LocalResolutionMsg),
|
|
97
|
-
PostActivation(PostActivationMsg),
|
|
98
|
-
RunUpdateResponse(RunUpdateResponseKind),
|
|
99
|
-
RequestEviction(RequestEvictMsg),
|
|
100
|
-
GetStateInfo(GetStateInfoMsg),
|
|
101
|
-
}
|
|
102
|
-
impl LocalInputs {
|
|
103
|
-
fn run_id(&self) -> Option<&str> {
|
|
104
|
-
Some(match self {
|
|
105
|
-
LocalInputs::Completion(c) => c.completion.run_id(),
|
|
106
|
-
LocalInputs::LocalResolution(lr) => &lr.run_id,
|
|
107
|
-
LocalInputs::PostActivation(pa) => &pa.run_id,
|
|
108
|
-
LocalInputs::RunUpdateResponse(rur) => rur.run_id(),
|
|
109
|
-
LocalInputs::RequestEviction(re) => &re.run_id,
|
|
110
|
-
LocalInputs::GetStateInfo(_) => return None,
|
|
111
|
-
})
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
#[derive(Debug, derive_more::From)]
|
|
115
|
-
#[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
|
|
116
|
-
enum ExternalPollerInputs {
|
|
117
|
-
NewWft(PermittedWFT),
|
|
118
|
-
PollerDead,
|
|
119
|
-
PollerError(tonic::Status),
|
|
48
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
49
|
+
wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
|
|
120
50
|
}
|
|
121
|
-
impl From<ExternalPollerInputs> for WFStreamInput {
|
|
122
|
-
fn from(l: ExternalPollerInputs) -> Self {
|
|
123
|
-
match l {
|
|
124
|
-
ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
|
|
125
|
-
ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
|
|
126
|
-
ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
}
|
|
130
|
-
|
|
131
51
|
impl WFStream {
|
|
132
52
|
/// Constructs workflow state management and returns a stream which outputs activations.
|
|
133
53
|
///
|
|
134
|
-
/// * `
|
|
135
|
-
///
|
|
136
|
-
/// come down.
|
|
54
|
+
/// * `wft_stream` is a stream of validated poll responses and fetched history pages as returned
|
|
55
|
+
/// by a poller (or mock), via [WFTExtractor].
|
|
137
56
|
/// * `local_rx` is a stream of actions that workflow state needs to see. Things like
|
|
138
|
-
///
|
|
57
|
+
/// completions, local activities finishing, etc. See [LocalInputs].
|
|
58
|
+
/// * `local_activity_request_sink` is used to handle outgoing requests to start or cancel
|
|
59
|
+
/// local activities, and may return resolutions that need to be handled immediately.
|
|
139
60
|
///
|
|
140
|
-
///
|
|
141
|
-
///
|
|
142
|
-
///
|
|
61
|
+
/// The stream inputs are combined into a stream of [WFActStreamInput]s. The stream processor
|
|
62
|
+
/// then takes action on those inputs, mutating the [WFStream] state, and then may yield
|
|
63
|
+
/// activations.
|
|
143
64
|
///
|
|
144
|
-
///
|
|
145
|
-
///
|
|
146
|
-
///
|
|
65
|
+
/// Importantly, nothing async happens while actually mutating state. This means all changes to
|
|
66
|
+
/// all workflow state can be represented purely via the stream of inputs, plus the
|
|
67
|
+
/// calls/retvals from the LA request sink, which is the last unfortunate bit of impurity in
|
|
68
|
+
/// the design. Eliminating it would be nice, so that all inputs come from the passed-in streams
|
|
69
|
+
/// and all outputs flow from the return stream, but it's difficult to do so since it would
|
|
70
|
+
/// require "pausing" in-progress changes to a run while sending & waiting for response from
|
|
71
|
+
/// local activity management. Likely the best option would be to move the pure state info
|
|
72
|
+
/// needed to determine immediate responses into LA state machines themselves (out of the LA
|
|
73
|
+
/// manager), which is a quite substantial change.
|
|
147
74
|
pub(super) fn build(
|
|
148
75
|
basics: WorkflowBasics,
|
|
149
|
-
|
|
76
|
+
wft_stream: impl Stream<Item = Result<WFTExtractorOutput, tonic::Status>> + Send + 'static,
|
|
150
77
|
local_rx: impl Stream<Item = LocalInput> + Send + 'static,
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
+ Send
|
|
154
|
-
+ Sync
|
|
155
|
-
+ 'static,
|
|
156
|
-
) -> impl Stream<Item = Result<ActivationOrAuto, PollWfError>> {
|
|
157
|
-
let wft_semaphore = MeteredSemaphore::new(
|
|
158
|
-
basics.max_outstanding_wfts,
|
|
159
|
-
basics.metrics.with_new_attrs([workflow_worker_type()]),
|
|
160
|
-
MetricsContext::available_task_slots,
|
|
161
|
-
);
|
|
162
|
-
let wft_sem_clone = wft_semaphore.clone();
|
|
163
|
-
let proceeder = stream::unfold(wft_sem_clone, |sem| async move {
|
|
164
|
-
Some((sem.acquire_owned().await.unwrap(), sem))
|
|
165
|
-
});
|
|
166
|
-
let poller_wfts = stream_when_allowed(external_wfts, proceeder);
|
|
167
|
-
let (run_update_tx, run_update_rx) = unbounded_channel();
|
|
168
|
-
let local_rx = stream::select(
|
|
169
|
-
local_rx.map(Into::into),
|
|
170
|
-
UnboundedReceiverStream::new(run_update_rx).map(Into::into),
|
|
171
|
-
);
|
|
78
|
+
local_activity_request_sink: impl LocalActivityRequestSink,
|
|
79
|
+
) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
|
|
172
80
|
let all_inputs = stream::select_with_strategy(
|
|
173
|
-
local_rx,
|
|
174
|
-
|
|
175
|
-
.map(
|
|
176
|
-
Ok(wft) => ExternalPollerInputs::NewWft(PermittedWFT { wft, permit }),
|
|
177
|
-
Err(e) => ExternalPollerInputs::PollerError(e),
|
|
178
|
-
})
|
|
81
|
+
local_rx.map(Into::into),
|
|
82
|
+
wft_stream
|
|
83
|
+
.map(Into::into)
|
|
179
84
|
.chain(stream::once(async { ExternalPollerInputs::PollerDead }))
|
|
180
85
|
.map(Into::into)
|
|
181
86
|
.boxed(),
|
|
182
87
|
// Priority always goes to the local stream
|
|
183
88
|
|_: &mut ()| PollNext::Left,
|
|
184
89
|
);
|
|
90
|
+
Self::build_internal(all_inputs, basics, local_activity_request_sink)
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
fn build_internal(
|
|
94
|
+
all_inputs: impl Stream<Item = WFStreamInput>,
|
|
95
|
+
basics: WorkflowBasics,
|
|
96
|
+
local_activity_request_sink: impl LocalActivityRequestSink,
|
|
97
|
+
) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
|
|
185
98
|
let mut state = WFStream {
|
|
186
99
|
buffered_polls_need_cache_slot: Default::default(),
|
|
187
100
|
runs: RunCache::new(
|
|
188
101
|
basics.max_cached_workflows,
|
|
189
102
|
basics.namespace.clone(),
|
|
190
|
-
run_update_tx,
|
|
191
103
|
Arc::new(local_activity_request_sink),
|
|
192
104
|
basics.metrics.clone(),
|
|
193
105
|
),
|
|
194
|
-
client,
|
|
195
|
-
wft_semaphore,
|
|
196
106
|
shutdown_token: basics.shutdown_token,
|
|
197
107
|
ignore_evicts_on_shutdown: basics.ignore_evicts_on_shutdown,
|
|
198
108
|
metrics: basics.metrics,
|
|
109
|
+
runs_needing_fetching: Default::default(),
|
|
110
|
+
history_fetch_refcounter: Arc::new(HistfetchRC {}),
|
|
111
|
+
|
|
112
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
113
|
+
wf_state_inputs: basics.wf_state_inputs,
|
|
199
114
|
};
|
|
200
115
|
all_inputs
|
|
201
|
-
.map(move |action| {
|
|
116
|
+
.map(move |action: WFStreamInput| {
|
|
202
117
|
let span = span!(Level::DEBUG, "new_stream_input", action=?action);
|
|
203
118
|
let _span_g = span.enter();
|
|
204
119
|
|
|
205
|
-
|
|
120
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
121
|
+
let maybe_write = state.prep_input(&action);
|
|
122
|
+
|
|
123
|
+
let mut activations = vec![];
|
|
124
|
+
let maybe_act = match action {
|
|
206
125
|
WFStreamInput::NewWft(pwft) => {
|
|
207
|
-
debug!(run_id=%pwft.
|
|
208
|
-
state.instantiate_or_update(pwft)
|
|
209
|
-
None
|
|
126
|
+
debug!(run_id=%pwft.work.execution.run_id, "New WFT");
|
|
127
|
+
state.instantiate_or_update(pwft)
|
|
210
128
|
}
|
|
211
129
|
WFStreamInput::Local(local_input) => {
|
|
212
130
|
let _span_g = local_input.span.enter();
|
|
213
131
|
if let Some(rid) = local_input.input.run_id() {
|
|
214
|
-
state.
|
|
132
|
+
if let Some(rh) = state.runs.get_mut(rid) {
|
|
133
|
+
rh.record_span_fields(&local_input.span);
|
|
134
|
+
}
|
|
215
135
|
}
|
|
216
136
|
match local_input.input {
|
|
217
|
-
LocalInputs::RunUpdateResponse(resp) => {
|
|
218
|
-
state.process_run_update_response(resp)
|
|
219
|
-
}
|
|
220
137
|
LocalInputs::Completion(completion) => {
|
|
221
|
-
|
|
222
|
-
|
|
138
|
+
activations.extend(
|
|
139
|
+
state.process_completion(NewOrFetchedComplete::New(completion)),
|
|
140
|
+
);
|
|
141
|
+
None // completions can return more than one activation
|
|
142
|
+
}
|
|
143
|
+
LocalInputs::FetchedPageCompletion { paginator, update } => {
|
|
144
|
+
activations.extend(state.process_completion(
|
|
145
|
+
NewOrFetchedComplete::Fetched(update, paginator),
|
|
146
|
+
));
|
|
147
|
+
None // completions can return more than one activation
|
|
223
148
|
}
|
|
224
149
|
LocalInputs::PostActivation(report) => {
|
|
225
|
-
state.process_post_activation(report)
|
|
226
|
-
None
|
|
150
|
+
state.process_post_activation(report)
|
|
227
151
|
}
|
|
228
|
-
LocalInputs::LocalResolution(res) =>
|
|
229
|
-
|
|
230
|
-
|
|
152
|
+
LocalInputs::LocalResolution(res) => state.local_resolution(res),
|
|
153
|
+
LocalInputs::HeartbeatTimeout(hbt) => {
|
|
154
|
+
state.process_heartbeat_timeout(hbt)
|
|
231
155
|
}
|
|
232
156
|
LocalInputs::RequestEviction(evict) => {
|
|
233
|
-
state.request_eviction(evict)
|
|
234
|
-
None
|
|
157
|
+
state.request_eviction(evict).into_run_update_resp()
|
|
235
158
|
}
|
|
236
159
|
LocalInputs::GetStateInfo(gsi) => {
|
|
237
160
|
let _ = gsi.response_tx.send(WorkflowStateInfo {
|
|
238
161
|
cached_workflows: state.runs.len(),
|
|
239
162
|
outstanding_wft: state.outstanding_wfts(),
|
|
240
|
-
available_wft_permits: state.wft_semaphore.available_permits(),
|
|
241
163
|
});
|
|
242
164
|
None
|
|
243
165
|
}
|
|
244
166
|
}
|
|
245
167
|
}
|
|
168
|
+
WFStreamInput::FailedFetch { run_id, err } => state
|
|
169
|
+
.request_eviction(RequestEvictMsg {
|
|
170
|
+
run_id,
|
|
171
|
+
message: format!("Fetching history failed: {err:?}"),
|
|
172
|
+
reason: EvictionReason::PaginationOrHistoryFetch,
|
|
173
|
+
})
|
|
174
|
+
.into_run_update_resp(),
|
|
246
175
|
WFStreamInput::PollerDead => {
|
|
247
|
-
debug!("WFT poller died,
|
|
176
|
+
debug!("WFT poller died, beginning shutdown");
|
|
248
177
|
state.shutdown_token.cancel();
|
|
249
178
|
None
|
|
250
179
|
}
|
|
@@ -254,457 +183,226 @@ impl WFStream {
|
|
|
254
183
|
}
|
|
255
184
|
};
|
|
256
185
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
186
|
+
activations.extend(maybe_act.into_iter());
|
|
187
|
+
activations.extend(state.reconcile_buffered());
|
|
188
|
+
|
|
189
|
+
// Always flush *after* actually handling the input, as this allows LA sink
|
|
190
|
+
// responses to be recorded before the input, so they can be read and buffered to be
|
|
191
|
+
// replayed during the handling of the input itself.
|
|
192
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
193
|
+
if let Some(write) = maybe_write {
|
|
194
|
+
state.flush_write(write);
|
|
263
195
|
}
|
|
264
|
-
|
|
196
|
+
|
|
265
197
|
if state.shutdown_done() {
|
|
198
|
+
info!("Workflow shutdown is done");
|
|
266
199
|
return Err(PollWfError::ShutDown);
|
|
267
200
|
}
|
|
268
201
|
|
|
269
|
-
Ok(
|
|
202
|
+
Ok(WFStreamOutput {
|
|
203
|
+
activations: activations.into(),
|
|
204
|
+
fetch_histories: std::mem::take(&mut state.runs_needing_fetching),
|
|
205
|
+
})
|
|
270
206
|
})
|
|
271
|
-
.
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
Err(e) => {
|
|
276
|
-
if !matches!(e, PollWfError::ShutDown) {
|
|
277
|
-
error!(
|
|
207
|
+
.inspect(|o| {
|
|
208
|
+
if let Some(e) = o.as_ref().err() {
|
|
209
|
+
if !matches!(e, PollWfError::ShutDown) {
|
|
210
|
+
error!(
|
|
278
211
|
"Workflow processing encountered fatal error and must shut down {:?}",
|
|
279
212
|
e
|
|
280
|
-
|
|
281
|
-
}
|
|
282
|
-
Some(Err(e))
|
|
213
|
+
);
|
|
283
214
|
}
|
|
284
|
-
}
|
|
215
|
+
}
|
|
285
216
|
})
|
|
286
217
|
// Stop the stream once we have shut down
|
|
287
218
|
.take_while(|o| future::ready(!matches!(o, Err(PollWfError::ShutDown))))
|
|
288
219
|
}
|
|
289
220
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
match
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
.get_mut(&resp.run_id)
|
|
300
|
-
.expect("Workflow must exist, it just sent us an update response");
|
|
301
|
-
run_handle.have_seen_terminal_event = resp.have_seen_terminal_event;
|
|
302
|
-
run_handle.more_pending_work = resp.more_pending_work;
|
|
303
|
-
run_handle.last_action_acked = true;
|
|
304
|
-
run_handle.most_recently_processed_event_number =
|
|
305
|
-
resp.most_recently_processed_event_number;
|
|
306
|
-
|
|
307
|
-
let r = match resp.outgoing_activation {
|
|
308
|
-
Some(ActivationOrAuto::LangActivation(mut activation)) => {
|
|
309
|
-
if resp.in_response_to_wft {
|
|
310
|
-
let wft = run_handle
|
|
311
|
-
.wft
|
|
312
|
-
.as_mut()
|
|
313
|
-
.expect("WFT must exist for run just updated with one");
|
|
314
|
-
// If there are in-poll queries, insert jobs for those queries into the
|
|
315
|
-
// activation, but only if we hit the cache. If we didn't, those queries
|
|
316
|
-
// will need to be dealt with once replay is over
|
|
317
|
-
if wft.hit_cache {
|
|
318
|
-
put_queries_in_act(&mut activation, wft);
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
if activation.jobs.is_empty() {
|
|
323
|
-
dbg_panic!("Should not send lang activation with no jobs");
|
|
324
|
-
}
|
|
325
|
-
Some(ActivationOrAuto::LangActivation(activation))
|
|
326
|
-
}
|
|
327
|
-
Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
|
|
328
|
-
if let Some(wft) = run_handle.wft.as_mut() {
|
|
329
|
-
put_queries_in_act(&mut act, wft);
|
|
330
|
-
Some(ActivationOrAuto::LangActivation(act))
|
|
331
|
-
} else {
|
|
332
|
-
dbg_panic!("Ready for queries but no WFT!");
|
|
333
|
-
None
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
|
|
337
|
-
None => {
|
|
338
|
-
// If the response indicates there is no activation to send yet but there
|
|
339
|
-
// is more pending work, we should check again.
|
|
340
|
-
if run_handle.more_pending_work {
|
|
341
|
-
run_handle.check_more_activations();
|
|
342
|
-
None
|
|
343
|
-
} else if let Some(reason) = run_handle.trying_to_evict.as_ref() {
|
|
344
|
-
// If a run update came back and had nothing to do, but we're trying to
|
|
345
|
-
// evict, just do that now as long as there's no other outstanding work.
|
|
346
|
-
if run_handle.activation.is_none() && !run_handle.more_pending_work {
|
|
347
|
-
let mut evict_act = create_evict_activation(
|
|
348
|
-
resp.run_id,
|
|
349
|
-
reason.message.clone(),
|
|
350
|
-
reason.reason,
|
|
351
|
-
);
|
|
352
|
-
evict_act.history_length =
|
|
353
|
-
run_handle.most_recently_processed_event_number as u32;
|
|
354
|
-
Some(ActivationOrAuto::LangActivation(evict_act))
|
|
355
|
-
} else {
|
|
356
|
-
None
|
|
357
|
-
}
|
|
358
|
-
} else {
|
|
359
|
-
None
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
};
|
|
363
|
-
if let Some(f) = resp.fulfillable_complete.take() {
|
|
364
|
-
f.fulfill();
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// After each run update, check if it's ready to handle any buffered poll
|
|
368
|
-
if matches!(&r, Some(ActivationOrAuto::Autocomplete { .. }) | None)
|
|
369
|
-
&& !run_handle.has_any_pending_work(false, true)
|
|
370
|
-
{
|
|
371
|
-
if let Some(bufft) = run_handle.buffered_resp.take() {
|
|
372
|
-
self.instantiate_or_update(bufft);
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
r
|
|
376
|
-
}
|
|
377
|
-
RunUpdateResponseKind::Fail(fail) => {
|
|
378
|
-
if let Some(r) = self.runs.get_mut(&fail.run_id) {
|
|
379
|
-
r.last_action_acked = true;
|
|
380
|
-
}
|
|
381
|
-
|
|
382
|
-
if let Some(resp_chan) = fail.completion_resp {
|
|
383
|
-
// Automatically fail the workflow task in the event we couldn't update machines
|
|
384
|
-
let fail_cause = if matches!(&fail.err, WFMachinesError::Nondeterminism(_)) {
|
|
385
|
-
WorkflowTaskFailedCause::NonDeterministicError
|
|
386
|
-
} else {
|
|
387
|
-
WorkflowTaskFailedCause::Unspecified
|
|
388
|
-
};
|
|
389
|
-
let wft_fail_str = format!("{:?}", fail.err);
|
|
390
|
-
self.failed_completion(
|
|
391
|
-
fail.run_id,
|
|
392
|
-
fail_cause,
|
|
393
|
-
fail.err.evict_reason(),
|
|
394
|
-
TFailure::application_failure(wft_fail_str, false).into(),
|
|
395
|
-
resp_chan,
|
|
396
|
-
);
|
|
397
|
-
} else {
|
|
398
|
-
// TODO: This should probably also fail workflow tasks, but that wasn't
|
|
399
|
-
// implemented pre-refactor either.
|
|
400
|
-
warn!(error=?fail.err, run_id=%fail.run_id, "Error while updating workflow");
|
|
401
|
-
self.request_eviction(RequestEvictMsg {
|
|
402
|
-
run_id: fail.run_id,
|
|
403
|
-
message: format!("Error while updating workflow: {:?}", fail.err),
|
|
404
|
-
reason: fail.err.evict_reason(),
|
|
405
|
-
});
|
|
406
|
-
}
|
|
407
|
-
None
|
|
221
|
+
/// Instantiate or update run machines with a new WFT
|
|
222
|
+
#[instrument(skip(self, pwft)
|
|
223
|
+
fields(run_id=%pwft.work.execution.run_id,
|
|
224
|
+
workflow_id=%pwft.work.execution.workflow_id))]
|
|
225
|
+
fn instantiate_or_update(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
|
|
226
|
+
match self._instantiate_or_update(pwft) {
|
|
227
|
+
Err(histfetch) => {
|
|
228
|
+
self.runs_needing_fetching.push_back(histfetch);
|
|
229
|
+
Default::default()
|
|
408
230
|
}
|
|
231
|
+
Ok(r) => r,
|
|
409
232
|
}
|
|
410
233
|
}
|
|
411
234
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
235
|
+
fn _instantiate_or_update(
|
|
236
|
+
&mut self,
|
|
237
|
+
pwft: PermittedWFT,
|
|
238
|
+
) -> Result<RunUpdateAct, HistoryFetchReq> {
|
|
239
|
+
// If the run already exists, possibly buffer the work and return early if we can't handle
|
|
240
|
+
// it yet.
|
|
241
|
+
let pwft = if let Some(rh) = self.runs.get_mut(&pwft.work.execution.run_id) {
|
|
242
|
+
if let Some(w) = rh.buffer_wft_if_outstanding_work(pwft) {
|
|
243
|
+
w
|
|
244
|
+
} else {
|
|
245
|
+
return Ok(None);
|
|
246
|
+
}
|
|
418
247
|
} else {
|
|
419
|
-
|
|
248
|
+
pwft
|
|
420
249
|
};
|
|
421
250
|
|
|
422
|
-
let run_id = work.
|
|
251
|
+
let run_id = pwft.work.execution.run_id.clone();
|
|
423
252
|
// If our cache is full and this WFT is for an unseen run we must first evict a run before
|
|
424
253
|
// we can deal with this task. So, buffer the task in that case.
|
|
425
254
|
if !self.runs.has_run(&run_id) && self.runs.is_full() {
|
|
426
|
-
self.buffer_resp_on_full_cache(
|
|
427
|
-
return;
|
|
255
|
+
self.buffer_resp_on_full_cache(pwft);
|
|
256
|
+
return Ok(None);
|
|
428
257
|
}
|
|
429
258
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
history_length = %work.history.events.len(),
|
|
435
|
-
start_event_id = ?start_event_id,
|
|
436
|
-
has_legacy_query = %work.legacy_query.is_some(),
|
|
437
|
-
attempt = %work.attempt,
|
|
438
|
-
"Applying new workflow task from server"
|
|
439
|
-
);
|
|
440
|
-
|
|
441
|
-
let wft_info = WorkflowTaskInfo {
|
|
442
|
-
attempt: work.attempt,
|
|
443
|
-
task_token: work.task_token,
|
|
444
|
-
wf_id: work.workflow_execution.workflow_id.clone(),
|
|
445
|
-
};
|
|
446
|
-
let poll_resp_is_incremental = work
|
|
447
|
-
.history
|
|
448
|
-
.events
|
|
449
|
-
.get(0)
|
|
450
|
-
.map(|ev| ev.event_id > 1)
|
|
451
|
-
.unwrap_or_default();
|
|
452
|
-
let poll_resp_is_incremental = poll_resp_is_incremental || work.history.events.is_empty();
|
|
453
|
-
|
|
454
|
-
let mut did_miss_cache = !poll_resp_is_incremental;
|
|
455
|
-
|
|
456
|
-
let page_token = if !self.runs.has_run(&run_id) && poll_resp_is_incremental {
|
|
259
|
+
// This check can't really be lifted up higher since we could EX: See it's in the cache,
|
|
260
|
+
// not fetch more history, send the task, see cache is full, buffer it, then evict that
|
|
261
|
+
// run, and now we still have a cache miss.
|
|
262
|
+
if !self.runs.has_run(&run_id) && pwft.work.is_incremental() {
|
|
457
263
|
debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
|
|
458
264
|
cache. Will fetch history");
|
|
459
265
|
self.metrics.sticky_cache_miss();
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
};
|
|
465
|
-
let history_update = HistoryUpdate::new(
|
|
466
|
-
HistoryPaginator::new(
|
|
467
|
-
work.history,
|
|
468
|
-
work.workflow_execution.workflow_id.clone(),
|
|
469
|
-
run_id.clone(),
|
|
470
|
-
page_token,
|
|
471
|
-
self.client.clone(),
|
|
472
|
-
),
|
|
473
|
-
work.previous_started_event_id,
|
|
474
|
-
);
|
|
475
|
-
let legacy_query_from_poll = work
|
|
476
|
-
.legacy_query
|
|
477
|
-
.take()
|
|
478
|
-
.map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
|
|
479
|
-
|
|
480
|
-
let mut pending_queries = work.query_requests.into_iter().collect::<Vec<_>>();
|
|
481
|
-
if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
|
|
482
|
-
error!(
|
|
483
|
-
"Server issued both normal and legacy queries. This should not happen. Please \
|
|
484
|
-
file a bug report."
|
|
485
|
-
);
|
|
486
|
-
self.request_eviction(RequestEvictMsg {
|
|
487
|
-
run_id,
|
|
488
|
-
message: "Server issued both normal and legacy query".to_string(),
|
|
489
|
-
reason: EvictionReason::Fatal,
|
|
490
|
-
});
|
|
491
|
-
return;
|
|
492
|
-
}
|
|
493
|
-
if let Some(lq) = legacy_query_from_poll {
|
|
494
|
-
pending_queries.push(lq);
|
|
266
|
+
return Err(HistoryFetchReq::Full(
|
|
267
|
+
CacheMissFetchReq { original_wft: pwft },
|
|
268
|
+
self.history_fetch_refcounter.clone(),
|
|
269
|
+
));
|
|
495
270
|
}
|
|
496
271
|
|
|
497
|
-
let
|
|
498
|
-
|
|
499
|
-
&run_id,
|
|
500
|
-
&work.workflow_execution.workflow_id,
|
|
501
|
-
&work.workflow_type,
|
|
502
|
-
history_update,
|
|
503
|
-
start_time,
|
|
504
|
-
);
|
|
505
|
-
run_handle.wft = Some(OutstandingTask {
|
|
506
|
-
info: wft_info,
|
|
507
|
-
hit_cache: !did_miss_cache,
|
|
508
|
-
pending_queries,
|
|
509
|
-
start_time,
|
|
510
|
-
permit,
|
|
511
|
-
})
|
|
272
|
+
let rur = self.runs.instantiate_or_update(pwft);
|
|
273
|
+
Ok(rur)
|
|
512
274
|
}
|
|
513
275
|
|
|
514
|
-
fn process_completion(&mut self, complete:
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
276
|
+
fn process_completion(&mut self, complete: NewOrFetchedComplete) -> Vec<ActivationOrAuto> {
|
|
277
|
+
let rh = if let Some(rh) = self.runs.get_mut(complete.run_id()) {
|
|
278
|
+
rh
|
|
279
|
+
} else {
|
|
280
|
+
dbg_panic!("Run missing during completion {:?}", complete);
|
|
281
|
+
return vec![];
|
|
282
|
+
};
|
|
283
|
+
let mut acts: Vec<_> = match complete {
|
|
284
|
+
NewOrFetchedComplete::New(complete) => match complete.completion {
|
|
285
|
+
ValidatedCompletion::Success { commands, .. } => {
|
|
286
|
+
match rh.successful_completion(commands, complete.response_tx) {
|
|
287
|
+
Ok(acts) => acts,
|
|
288
|
+
Err(npr) => {
|
|
289
|
+
self.runs_needing_fetching
|
|
290
|
+
.push_back(HistoryFetchReq::NextPage(
|
|
291
|
+
npr,
|
|
292
|
+
self.history_fetch_refcounter.clone(),
|
|
293
|
+
));
|
|
294
|
+
None
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
ValidatedCompletion::Fail { failure, .. } => rh.failed_completion(
|
|
522
299
|
WorkflowTaskFailedCause::Unspecified,
|
|
523
300
|
EvictionReason::LangFail,
|
|
524
301
|
failure,
|
|
525
302
|
complete.response_tx,
|
|
526
|
-
)
|
|
303
|
+
),
|
|
304
|
+
},
|
|
305
|
+
NewOrFetchedComplete::Fetched(update, paginator) => {
|
|
306
|
+
rh.fetched_page_completion(update, paginator)
|
|
527
307
|
}
|
|
528
308
|
}
|
|
309
|
+
.into_iter()
|
|
310
|
+
.collect();
|
|
529
311
|
// Always queue evictions after completion when we have a zero-size cache
|
|
530
312
|
if self.runs.cache_capacity() == 0 {
|
|
531
|
-
self.request_eviction_of_lru_run()
|
|
313
|
+
acts.extend(self.request_eviction_of_lru_run().into_run_update_resp())
|
|
532
314
|
}
|
|
315
|
+
acts
|
|
533
316
|
}
|
|
534
317
|
|
|
535
|
-
fn
|
|
536
|
-
&
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
(
|
|
545
|
-
entry.info.task_token.clone(),
|
|
546
|
-
!entry.pending_queries.is_empty(),
|
|
547
|
-
entry.start_time,
|
|
548
|
-
)
|
|
549
|
-
} else {
|
|
550
|
-
if !activation_was_only_eviction {
|
|
551
|
-
// Not an error if this was an eviction, since it's normal to issue eviction
|
|
552
|
-
// activations without an associated workflow task in that case.
|
|
553
|
-
dbg_panic!(
|
|
554
|
-
"Attempted to complete activation for run {} without associated workflow task",
|
|
318
|
+
fn process_post_activation(&mut self, report: PostActivationMsg) -> RunUpdateAct {
|
|
319
|
+
let run_id = &report.run_id;
|
|
320
|
+
let wft_from_complete = report.wft_from_complete;
|
|
321
|
+
if let Some((wft, _)) = &wft_from_complete {
|
|
322
|
+
if &wft.execution.run_id != run_id {
|
|
323
|
+
dbg_panic!(
|
|
324
|
+
"Server returned a WFT on completion for a different run ({}) than the \
|
|
325
|
+
one being completed ({}). This is a server bug.",
|
|
326
|
+
wft.execution.run_id,
|
|
555
327
|
run_id
|
|
556
|
-
|
|
557
|
-
}
|
|
558
|
-
self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
|
|
559
|
-
return;
|
|
560
|
-
};
|
|
561
|
-
|
|
562
|
-
// If the only command from the activation is a legacy query response, that means we need
|
|
563
|
-
// to respond differently than a typical activation.
|
|
564
|
-
if matches!(&commands.as_slice(),
|
|
565
|
-
&[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
|
|
566
|
-
{
|
|
567
|
-
let qr = match commands.remove(0) {
|
|
568
|
-
WFCommand::QueryResponse(qr) => qr,
|
|
569
|
-
_ => unreachable!("We just verified this is the only command"),
|
|
570
|
-
};
|
|
571
|
-
self.reply_to_complete(
|
|
572
|
-
&run_id,
|
|
573
|
-
ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
|
|
574
|
-
task_token,
|
|
575
|
-
action: ActivationAction::RespondLegacyQuery {
|
|
576
|
-
result: Box::new(qr),
|
|
577
|
-
},
|
|
578
|
-
}),
|
|
579
|
-
resp_chan,
|
|
580
|
-
);
|
|
581
|
-
} else {
|
|
582
|
-
// First strip out query responses from other commands that actually affect machines
|
|
583
|
-
// Would be prettier with `drain_filter`
|
|
584
|
-
let mut i = 0;
|
|
585
|
-
let mut query_responses = vec![];
|
|
586
|
-
while i < commands.len() {
|
|
587
|
-
if matches!(commands[i], WFCommand::QueryResponse(_)) {
|
|
588
|
-
if let WFCommand::QueryResponse(qr) = commands.remove(i) {
|
|
589
|
-
query_responses.push(qr);
|
|
590
|
-
}
|
|
591
|
-
} else {
|
|
592
|
-
i += 1;
|
|
593
|
-
}
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
let activation_was_eviction = self.activation_has_eviction(&run_id);
|
|
597
|
-
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
598
|
-
rh.send_completion(RunActivationCompletion {
|
|
599
|
-
task_token,
|
|
600
|
-
start_time,
|
|
601
|
-
commands,
|
|
602
|
-
activation_was_eviction,
|
|
603
|
-
activation_was_only_eviction,
|
|
604
|
-
has_pending_query,
|
|
605
|
-
query_responses,
|
|
606
|
-
resp_chan: Some(resp_chan),
|
|
607
|
-
});
|
|
608
|
-
} else {
|
|
609
|
-
dbg_panic!("Run {} missing during completion", run_id);
|
|
328
|
+
);
|
|
610
329
|
}
|
|
611
|
-
};
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
fn failed_completion(
|
|
615
|
-
&mut self,
|
|
616
|
-
run_id: String,
|
|
617
|
-
cause: WorkflowTaskFailedCause,
|
|
618
|
-
reason: EvictionReason,
|
|
619
|
-
failure: Failure,
|
|
620
|
-
resp_chan: oneshot::Sender<ActivationCompleteResult>,
|
|
621
|
-
) {
|
|
622
|
-
let tt = if let Some(tt) = self.get_task(&run_id).map(|t| t.info.task_token.clone()) {
|
|
623
|
-
tt
|
|
624
|
-
} else {
|
|
625
|
-
dbg_panic!(
|
|
626
|
-
"No workflow task for run id {} found when trying to fail activation",
|
|
627
|
-
run_id
|
|
628
|
-
);
|
|
629
|
-
self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
|
|
630
|
-
return;
|
|
631
|
-
};
|
|
632
|
-
|
|
633
|
-
if let Some(m) = self.run_metrics(&run_id) {
|
|
634
|
-
m.wf_task_failed();
|
|
635
330
|
}
|
|
636
|
-
let message = format!("Workflow activation completion failed: {:?}", &failure);
|
|
637
|
-
// Blow up any cached data associated with the workflow
|
|
638
|
-
let should_report = match self.request_eviction(RequestEvictMsg {
|
|
639
|
-
run_id: run_id.clone(),
|
|
640
|
-
message,
|
|
641
|
-
reason,
|
|
642
|
-
}) {
|
|
643
|
-
EvictionRequestResult::EvictionRequested(Some(attempt))
|
|
644
|
-
| EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
|
|
645
|
-
_ => false,
|
|
646
|
-
};
|
|
647
|
-
// If the outstanding WFT is a legacy query task, report that we need to fail it
|
|
648
|
-
let outcome = if self
|
|
649
|
-
.runs
|
|
650
|
-
.get(&run_id)
|
|
651
|
-
.map(|rh| rh.pending_work_is_legacy_query())
|
|
652
|
-
.unwrap_or_default()
|
|
653
|
-
{
|
|
654
|
-
ActivationCompleteOutcome::ReportWFTFail(
|
|
655
|
-
FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
|
|
656
|
-
)
|
|
657
|
-
} else if should_report {
|
|
658
|
-
ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
|
|
659
|
-
tt, cause, failure,
|
|
660
|
-
))
|
|
661
|
-
} else {
|
|
662
|
-
ActivationCompleteOutcome::DoNothing
|
|
663
|
-
};
|
|
664
|
-
self.reply_to_complete(&run_id, outcome, resp_chan);
|
|
665
|
-
}
|
|
666
331
|
|
|
667
|
-
|
|
668
|
-
let run_id = &report.run_id;
|
|
332
|
+
let mut res = None;
|
|
669
333
|
|
|
670
334
|
// If we reported to server, we always want to mark it complete.
|
|
671
|
-
let maybe_t = self.complete_wft(run_id, report.
|
|
335
|
+
let maybe_t = self.complete_wft(run_id, report.wft_report_status);
|
|
336
|
+
// Delete the activation
|
|
337
|
+
let activation = self
|
|
338
|
+
.runs
|
|
339
|
+
.get_mut(run_id)
|
|
340
|
+
.and_then(|rh| rh.delete_activation());
|
|
341
|
+
|
|
342
|
+
// Evict the run if the activation contained an eviction
|
|
343
|
+
let mut applied_buffered_poll_for_this_run = false;
|
|
344
|
+
if activation.map(|a| a.has_eviction()).unwrap_or_default() {
|
|
345
|
+
debug!(run_id=%run_id, "Evicting run");
|
|
346
|
+
|
|
347
|
+
if let Some(mut rh) = self.runs.remove(run_id) {
|
|
348
|
+
if let Some(buff) = rh.take_buffered_wft() {
|
|
349
|
+
// Don't try to apply a buffered poll for this run if we just got a new WFT
|
|
350
|
+
// from completing, because by definition that buffered poll is now an
|
|
351
|
+
// out-of-date WFT.
|
|
352
|
+
if wft_from_complete.is_none() {
|
|
353
|
+
res = self.instantiate_or_update(buff);
|
|
354
|
+
applied_buffered_poll_for_this_run = true;
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
}
|
|
672
358
|
|
|
673
|
-
|
|
674
|
-
.
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
359
|
+
// Attempt to apply a buffered poll for some *other* run, if we didn't have a wft
|
|
360
|
+
// from complete or a buffered poll for *this* run.
|
|
361
|
+
if wft_from_complete.is_none() && !applied_buffered_poll_for_this_run {
|
|
362
|
+
if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
|
|
363
|
+
res = self.instantiate_or_update(buff);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
679
366
|
};
|
|
680
367
|
|
|
681
|
-
if let Some(wft) =
|
|
682
|
-
debug!(run_id=%wft.
|
|
368
|
+
if let Some((wft, pag)) = wft_from_complete {
|
|
369
|
+
debug!(run_id=%wft.execution.run_id, "New WFT from completion");
|
|
683
370
|
if let Some(t) = maybe_t {
|
|
684
|
-
self.instantiate_or_update(PermittedWFT {
|
|
685
|
-
wft,
|
|
371
|
+
res = self.instantiate_or_update(PermittedWFT {
|
|
372
|
+
work: wft,
|
|
686
373
|
permit: t.permit,
|
|
687
|
-
|
|
374
|
+
paginator: pag,
|
|
375
|
+
});
|
|
688
376
|
}
|
|
689
377
|
}
|
|
690
378
|
|
|
691
|
-
if
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
379
|
+
if res.is_none() {
|
|
380
|
+
if let Some(rh) = self.runs.get_mut(run_id) {
|
|
381
|
+
// Attempt to produce the next activation if needed
|
|
382
|
+
res = rh.check_more_activations();
|
|
383
|
+
}
|
|
696
384
|
}
|
|
385
|
+
res
|
|
697
386
|
}
|
|
698
387
|
|
|
699
|
-
fn local_resolution(&mut self, msg: LocalResolutionMsg) {
|
|
388
|
+
fn local_resolution(&mut self, msg: LocalResolutionMsg) -> RunUpdateAct {
|
|
700
389
|
let run_id = msg.run_id;
|
|
701
390
|
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
702
|
-
rh.
|
|
391
|
+
rh.local_resolution(msg.res)
|
|
703
392
|
} else {
|
|
704
393
|
// It isn't an explicit error if the machine is missing when a local activity resolves.
|
|
705
394
|
// This can happen if an activity reports a timeout after we stopped caring about it.
|
|
706
395
|
debug!(run_id = %run_id,
|
|
707
396
|
"Tried to resolve a local activity for a run we are no longer tracking");
|
|
397
|
+
None
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
fn process_heartbeat_timeout(&mut self, run_id: String) -> RunUpdateAct {
|
|
402
|
+
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
403
|
+
rh.heartbeat_timeout()
|
|
404
|
+
} else {
|
|
405
|
+
None
|
|
708
406
|
}
|
|
709
407
|
}
|
|
710
408
|
|
|
@@ -712,17 +410,8 @@ impl WFStream {
|
|
|
712
410
|
/// activation to evict the workflow from the lang side. Workflow will not *actually* be evicted
|
|
713
411
|
/// until lang replies to that activation
|
|
714
412
|
fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
|
|
715
|
-
let activation_has_eviction = self.activation_has_eviction(&info.run_id);
|
|
716
413
|
if let Some(rh) = self.runs.get_mut(&info.run_id) {
|
|
717
|
-
|
|
718
|
-
if !activation_has_eviction && rh.trying_to_evict.is_none() {
|
|
719
|
-
debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
|
|
720
|
-
rh.trying_to_evict = Some(info);
|
|
721
|
-
rh.check_more_activations();
|
|
722
|
-
EvictionRequestResult::EvictionRequested(attempts)
|
|
723
|
-
} else {
|
|
724
|
-
EvictionRequestResult::EvictionAlreadyRequested(attempts)
|
|
725
|
-
}
|
|
414
|
+
rh.request_eviction(info)
|
|
726
415
|
} else {
|
|
727
416
|
debug!(run_id=%info.run_id, "Eviction requested for unknown run");
|
|
728
417
|
EvictionRequestResult::NotFound
|
|
@@ -743,36 +432,10 @@ impl WFStream {
|
|
|
743
432
|
}
|
|
744
433
|
}
|
|
745
434
|
|
|
746
|
-
/// Evict a workflow from the cache by its run id. Any existing pending activations will be
|
|
747
|
-
/// destroyed, and any outstanding activations invalidated.
|
|
748
|
-
fn evict_run(&mut self, run_id: &str) {
|
|
749
|
-
debug!(run_id=%run_id, "Evicting run");
|
|
750
|
-
|
|
751
|
-
let mut did_take_buff = false;
|
|
752
|
-
// Now it can safely be deleted, it'll get recreated once the un-buffered poll is handled if
|
|
753
|
-
// there was one.
|
|
754
|
-
if let Some(mut rh) = self.runs.remove(run_id) {
|
|
755
|
-
rh.handle.abort();
|
|
756
|
-
|
|
757
|
-
if let Some(buff) = rh.buffered_resp.take() {
|
|
758
|
-
self.instantiate_or_update(buff);
|
|
759
|
-
did_take_buff = true;
|
|
760
|
-
}
|
|
761
|
-
}
|
|
762
|
-
|
|
763
|
-
if !did_take_buff {
|
|
764
|
-
// If there wasn't a buffered poll, there might be one for a different run which needs
|
|
765
|
-
// a free cache slot, and now there is.
|
|
766
|
-
if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
|
|
767
|
-
self.instantiate_or_update(buff);
|
|
768
|
-
}
|
|
769
|
-
}
|
|
770
|
-
}
|
|
771
|
-
|
|
772
435
|
fn complete_wft(
|
|
773
436
|
&mut self,
|
|
774
437
|
run_id: &str,
|
|
775
|
-
|
|
438
|
+
wft_report_status: WFTReportStatus,
|
|
776
439
|
) -> Option<OutstandingTask> {
|
|
777
440
|
// If the WFT completion wasn't sent to the server, but we did see the final event, we still
|
|
778
441
|
// want to clear the workflow task. This can really only happen in replay testing, where we
|
|
@@ -782,9 +445,9 @@ impl WFStream {
|
|
|
782
445
|
let saw_final = self
|
|
783
446
|
.runs
|
|
784
447
|
.get(run_id)
|
|
785
|
-
.map(|r| r.have_seen_terminal_event)
|
|
448
|
+
.map(|r| r.have_seen_terminal_event())
|
|
786
449
|
.unwrap_or_default();
|
|
787
|
-
if !saw_final && !
|
|
450
|
+
if !saw_final && matches!(wft_report_status, WFTReportStatus::NotReported) {
|
|
788
451
|
return None;
|
|
789
452
|
}
|
|
790
453
|
|
|
@@ -792,60 +455,26 @@ impl WFStream {
|
|
|
792
455
|
// Can't mark the WFT complete if there are pending queries, as doing so would destroy
|
|
793
456
|
// them.
|
|
794
457
|
if rh
|
|
795
|
-
.wft
|
|
796
|
-
.as_ref()
|
|
458
|
+
.wft()
|
|
797
459
|
.map(|wft| !wft.pending_queries.is_empty())
|
|
798
460
|
.unwrap_or_default()
|
|
799
461
|
{
|
|
800
462
|
return None;
|
|
801
463
|
}
|
|
802
464
|
|
|
803
|
-
|
|
804
|
-
let retme = rh.wft.take();
|
|
805
|
-
if let Some(ot) = &retme {
|
|
806
|
-
if let Some(m) = self.run_metrics(run_id) {
|
|
807
|
-
m.wf_task_latency(ot.start_time.elapsed());
|
|
808
|
-
}
|
|
809
|
-
}
|
|
810
|
-
retme
|
|
465
|
+
rh.mark_wft_complete(wft_report_status)
|
|
811
466
|
} else {
|
|
812
467
|
None
|
|
813
468
|
}
|
|
814
469
|
}
|
|
815
470
|
|
|
816
|
-
/// Stores some work if there is any outstanding WFT or activation for the run. If there was
|
|
817
|
-
/// not, returns the work back out inside the option.
|
|
818
|
-
fn buffer_resp_if_outstanding_work(&mut self, work: PermittedWFT) -> Option<PermittedWFT> {
|
|
819
|
-
let run_id = &work.wft.workflow_execution.run_id;
|
|
820
|
-
if let Some(mut run) = self.runs.get_mut(run_id) {
|
|
821
|
-
let about_to_issue_evict = run.trying_to_evict.is_some() && !run.last_action_acked;
|
|
822
|
-
let has_wft = run.wft.is_some();
|
|
823
|
-
let has_activation = run.activation.is_some();
|
|
824
|
-
if has_wft
|
|
825
|
-
|| has_activation
|
|
826
|
-
|| about_to_issue_evict
|
|
827
|
-
|| run.more_pending_work
|
|
828
|
-
|| !run.last_action_acked
|
|
829
|
-
{
|
|
830
|
-
debug!(run_id = %run_id, run = ?run,
|
|
831
|
-
"Got new WFT for a run with outstanding work, buffering it");
|
|
832
|
-
run.buffered_resp = Some(work);
|
|
833
|
-
None
|
|
834
|
-
} else {
|
|
835
|
-
Some(work)
|
|
836
|
-
}
|
|
837
|
-
} else {
|
|
838
|
-
Some(work)
|
|
839
|
-
}
|
|
840
|
-
}
|
|
841
|
-
|
|
842
471
|
fn buffer_resp_on_full_cache(&mut self, work: PermittedWFT) {
|
|
843
|
-
debug!(run_id=%work.
|
|
472
|
+
debug!(run_id=%work.work.execution.run_id, "Buffering WFT because cache is full");
|
|
844
473
|
// If there's already a buffered poll for the run, replace it.
|
|
845
474
|
if let Some(rh) = self
|
|
846
475
|
.buffered_polls_need_cache_slot
|
|
847
476
|
.iter_mut()
|
|
848
|
-
.find(|w| w.
|
|
477
|
+
.find(|w| w.work.execution.run_id == work.work.execution.run_id)
|
|
849
478
|
{
|
|
850
479
|
*rh = work;
|
|
851
480
|
} else {
|
|
@@ -856,7 +485,7 @@ impl WFStream {
|
|
|
856
485
|
|
|
857
486
|
/// Makes sure we have enough pending evictions to fulfill the needs of buffered WFTs who are
|
|
858
487
|
/// waiting on a cache slot
|
|
859
|
-
fn reconcile_buffered(&mut self) {
|
|
488
|
+
fn reconcile_buffered(&mut self) -> Vec<ActivationOrAuto> {
|
|
860
489
|
// We must ensure that there are at least as many pending evictions as there are tasks
|
|
861
490
|
// that we might need to un-buffer (skipping runs which already have buffered tasks for
|
|
862
491
|
// themselves)
|
|
@@ -865,121 +494,222 @@ impl WFStream {
|
|
|
865
494
|
let num_existing_evictions = self
|
|
866
495
|
.runs
|
|
867
496
|
.runs_lru_order()
|
|
868
|
-
.filter(|(_, h)| h.
|
|
497
|
+
.filter(|(_, h)| h.is_trying_to_evict())
|
|
869
498
|
.count();
|
|
870
499
|
let mut num_evicts_needed = num_in_buff.saturating_sub(num_existing_evictions);
|
|
871
500
|
for (rid, handle) in self.runs.runs_lru_order() {
|
|
872
501
|
if num_evicts_needed == 0 {
|
|
873
502
|
break;
|
|
874
503
|
}
|
|
875
|
-
if handle.
|
|
504
|
+
if !handle.has_buffered_wft() {
|
|
876
505
|
num_evicts_needed -= 1;
|
|
877
506
|
evict_these.push(rid.to_string());
|
|
878
507
|
}
|
|
879
508
|
}
|
|
509
|
+
let mut acts = vec![];
|
|
880
510
|
for run_id in evict_these {
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
511
|
+
acts.extend(
|
|
512
|
+
self.request_eviction(RequestEvictMsg {
|
|
513
|
+
run_id,
|
|
514
|
+
message: "Workflow cache full".to_string(),
|
|
515
|
+
reason: EvictionReason::CacheFull,
|
|
516
|
+
})
|
|
517
|
+
.into_run_update_resp(),
|
|
518
|
+
);
|
|
886
519
|
}
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
fn reply_to_complete(
|
|
890
|
-
&self,
|
|
891
|
-
run_id: &str,
|
|
892
|
-
outcome: ActivationCompleteOutcome,
|
|
893
|
-
chan: oneshot::Sender<ActivationCompleteResult>,
|
|
894
|
-
) {
|
|
895
|
-
let most_recently_processed_event = self
|
|
896
|
-
.runs
|
|
897
|
-
.peek(run_id)
|
|
898
|
-
.map(|rh| rh.most_recently_processed_event_number)
|
|
899
|
-
.unwrap_or_default();
|
|
900
|
-
chan.send(ActivationCompleteResult {
|
|
901
|
-
most_recently_processed_event,
|
|
902
|
-
outcome,
|
|
903
|
-
})
|
|
904
|
-
.expect("Rcv half of activation reply not dropped");
|
|
520
|
+
acts
|
|
905
521
|
}
|
|
906
522
|
|
|
907
523
|
fn shutdown_done(&self) -> bool {
|
|
908
|
-
|
|
909
|
-
.
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
524
|
+
if self.shutdown_token.is_cancelled() {
|
|
525
|
+
if Arc::strong_count(&self.history_fetch_refcounter) > 1 {
|
|
526
|
+
// Don't exit if there are outstanding fetch requests
|
|
527
|
+
return false;
|
|
528
|
+
}
|
|
529
|
+
let all_runs_ready = self
|
|
530
|
+
.runs
|
|
531
|
+
.handles()
|
|
532
|
+
.all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
|
|
533
|
+
if all_runs_ready {
|
|
534
|
+
return true;
|
|
535
|
+
}
|
|
917
536
|
}
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
fn get_task(&mut self, run_id: &str) -> Option<&OutstandingTask> {
|
|
921
|
-
self.runs.get(run_id).and_then(|rh| rh.wft.as_ref())
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
fn get_activation(&mut self, run_id: &str) -> Option<&OutstandingActivation> {
|
|
925
|
-
self.runs.get(run_id).and_then(|rh| rh.activation.as_ref())
|
|
926
|
-
}
|
|
927
|
-
|
|
928
|
-
fn run_metrics(&mut self, run_id: &str) -> Option<&MetricsContext> {
|
|
929
|
-
self.runs.get(run_id).map(|r| &r.metrics)
|
|
930
|
-
}
|
|
931
|
-
|
|
932
|
-
fn activation_has_only_eviction(&mut self, run_id: &str) -> bool {
|
|
933
|
-
self.runs
|
|
934
|
-
.get(run_id)
|
|
935
|
-
.and_then(|rh| rh.activation)
|
|
936
|
-
.map(OutstandingActivation::has_only_eviction)
|
|
937
|
-
.unwrap_or_default()
|
|
938
|
-
}
|
|
939
|
-
|
|
940
|
-
fn activation_has_eviction(&mut self, run_id: &str) -> bool {
|
|
941
|
-
self.runs
|
|
942
|
-
.get(run_id)
|
|
943
|
-
.and_then(|rh| rh.activation)
|
|
944
|
-
.map(OutstandingActivation::has_eviction)
|
|
945
|
-
.unwrap_or_default()
|
|
537
|
+
false
|
|
946
538
|
}
|
|
947
539
|
|
|
948
540
|
fn outstanding_wfts(&self) -> usize {
|
|
949
|
-
self.runs.handles().filter(|r| r.wft.is_some()).count()
|
|
541
|
+
self.runs.handles().filter(|r| r.wft().is_some()).count()
|
|
950
542
|
}
|
|
951
543
|
|
|
952
544
|
// Useful when debugging
|
|
953
545
|
#[allow(dead_code)]
|
|
954
546
|
fn info_dump(&self, run_id: &str) {
|
|
955
547
|
if let Some(r) = self.runs.peek(run_id) {
|
|
956
|
-
info!(run_id, wft=?r.wft, activation=?r.activation
|
|
957
|
-
|
|
958
|
-
|
|
548
|
+
info!(run_id, wft=?r.wft(), activation=?r.activation(),
|
|
549
|
+
buffered_wft=r.has_buffered_wft(),
|
|
550
|
+
trying_to_evict=r.is_trying_to_evict(), more_work=r.more_pending_work());
|
|
959
551
|
} else {
|
|
960
552
|
info!(run_id, "Run not found");
|
|
961
553
|
}
|
|
962
554
|
}
|
|
963
555
|
}
|
|
964
556
|
|
|
965
|
-
///
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
557
|
+
/// All possible inputs to the [WFStream]
|
|
558
|
+
#[derive(derive_more::From, Debug)]
|
|
559
|
+
#[cfg_attr(
|
|
560
|
+
feature = "save_wf_inputs",
|
|
561
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
562
|
+
)]
|
|
563
|
+
enum WFStreamInput {
|
|
564
|
+
NewWft(PermittedWFT),
|
|
565
|
+
Local(LocalInput),
|
|
566
|
+
/// The stream given to us which represents the poller (or a mock) terminated.
|
|
567
|
+
PollerDead,
|
|
568
|
+
/// The stream given to us which represents the poller (or a mock) encountered a non-retryable
|
|
569
|
+
/// error while polling
|
|
570
|
+
PollerError(
|
|
571
|
+
#[cfg_attr(
|
|
572
|
+
feature = "save_wf_inputs",
|
|
573
|
+
serde(with = "tonic_status_serde::SerdeStatus")
|
|
574
|
+
)]
|
|
575
|
+
tonic::Status,
|
|
576
|
+
),
|
|
577
|
+
FailedFetch {
|
|
578
|
+
run_id: String,
|
|
579
|
+
#[cfg_attr(
|
|
580
|
+
feature = "save_wf_inputs",
|
|
581
|
+
serde(with = "tonic_status_serde::SerdeStatus")
|
|
582
|
+
)]
|
|
583
|
+
err: tonic::Status,
|
|
584
|
+
},
|
|
585
|
+
}
|
|
971
586
|
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
587
|
+
/// A non-poller-received input to the [WFStream]
|
|
588
|
+
#[derive(derive_more::DebugCustom)]
|
|
589
|
+
#[cfg_attr(
|
|
590
|
+
feature = "save_wf_inputs",
|
|
591
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
592
|
+
)]
|
|
593
|
+
#[debug(fmt = "LocalInput {{ {input:?} }}")]
|
|
594
|
+
pub(super) struct LocalInput {
|
|
595
|
+
pub input: LocalInputs,
|
|
596
|
+
#[cfg_attr(feature = "save_wf_inputs", serde(skip, default = "Span::current"))]
|
|
597
|
+
pub span: Span,
|
|
598
|
+
}
|
|
599
|
+
impl From<HeartbeatTimeoutMsg> for LocalInput {
|
|
600
|
+
fn from(hb: HeartbeatTimeoutMsg) -> Self {
|
|
601
|
+
Self {
|
|
602
|
+
input: LocalInputs::HeartbeatTimeout(hb.run_id),
|
|
603
|
+
span: hb.span,
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
/// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
|
|
608
|
+
/// new polls.
|
|
609
|
+
#[derive(Debug, derive_more::From)]
|
|
610
|
+
#[cfg_attr(
|
|
611
|
+
feature = "save_wf_inputs",
|
|
612
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
613
|
+
)]
|
|
614
|
+
pub(super) enum LocalInputs {
|
|
615
|
+
Completion(WFActCompleteMsg),
|
|
616
|
+
FetchedPageCompletion {
|
|
617
|
+
paginator: HistoryPaginator,
|
|
618
|
+
update: HistoryUpdate,
|
|
619
|
+
},
|
|
620
|
+
LocalResolution(LocalResolutionMsg),
|
|
621
|
+
PostActivation(PostActivationMsg),
|
|
622
|
+
RequestEviction(RequestEvictMsg),
|
|
623
|
+
HeartbeatTimeout(String),
|
|
624
|
+
#[cfg_attr(feature = "save_wf_inputs", serde(skip))]
|
|
625
|
+
GetStateInfo(GetStateInfoMsg),
|
|
626
|
+
}
|
|
627
|
+
impl LocalInputs {
|
|
628
|
+
fn run_id(&self) -> Option<&str> {
|
|
629
|
+
Some(match self {
|
|
630
|
+
LocalInputs::Completion(c) => c.completion.run_id(),
|
|
631
|
+
LocalInputs::FetchedPageCompletion { paginator, .. } => &paginator.run_id,
|
|
632
|
+
LocalInputs::LocalResolution(lr) => &lr.run_id,
|
|
633
|
+
LocalInputs::PostActivation(pa) => &pa.run_id,
|
|
634
|
+
LocalInputs::RequestEviction(re) => &re.run_id,
|
|
635
|
+
LocalInputs::HeartbeatTimeout(hb) => hb,
|
|
636
|
+
LocalInputs::GetStateInfo(_) => return None,
|
|
637
|
+
})
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
#[derive(Debug)]
|
|
641
|
+
#[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
|
|
642
|
+
enum ExternalPollerInputs {
|
|
643
|
+
NewWft(PermittedWFT),
|
|
644
|
+
PollerDead,
|
|
645
|
+
PollerError(tonic::Status),
|
|
646
|
+
FetchedUpdate(PermittedWFT),
|
|
647
|
+
NextPage {
|
|
648
|
+
paginator: HistoryPaginator,
|
|
649
|
+
update: HistoryUpdate,
|
|
650
|
+
span: Span,
|
|
651
|
+
},
|
|
652
|
+
FailedFetch {
|
|
653
|
+
run_id: String,
|
|
654
|
+
err: tonic::Status,
|
|
655
|
+
},
|
|
656
|
+
}
|
|
657
|
+
impl From<ExternalPollerInputs> for WFStreamInput {
|
|
658
|
+
fn from(l: ExternalPollerInputs) -> Self {
|
|
659
|
+
match l {
|
|
660
|
+
ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
|
|
661
|
+
ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
|
|
662
|
+
ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
|
|
663
|
+
ExternalPollerInputs::FetchedUpdate(wft) => WFStreamInput::NewWft(wft),
|
|
664
|
+
ExternalPollerInputs::FailedFetch { run_id, err } => {
|
|
665
|
+
WFStreamInput::FailedFetch { run_id, err }
|
|
666
|
+
}
|
|
667
|
+
ExternalPollerInputs::NextPage {
|
|
668
|
+
paginator,
|
|
669
|
+
update,
|
|
670
|
+
span,
|
|
671
|
+
} => WFStreamInput::Local(LocalInput {
|
|
672
|
+
input: LocalInputs::FetchedPageCompletion { paginator, update },
|
|
673
|
+
span,
|
|
674
|
+
}),
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
}
|
|
678
|
+
impl From<Result<WFTExtractorOutput, tonic::Status>> for ExternalPollerInputs {
|
|
679
|
+
fn from(v: Result<WFTExtractorOutput, tonic::Status>) -> Self {
|
|
680
|
+
match v {
|
|
681
|
+
Ok(WFTExtractorOutput::NewWFT(pwft)) => ExternalPollerInputs::NewWft(pwft),
|
|
682
|
+
Ok(WFTExtractorOutput::FetchResult(updated_wft, _)) => {
|
|
683
|
+
ExternalPollerInputs::FetchedUpdate(updated_wft)
|
|
684
|
+
}
|
|
685
|
+
Ok(WFTExtractorOutput::NextPage {
|
|
686
|
+
paginator,
|
|
687
|
+
update,
|
|
688
|
+
span,
|
|
689
|
+
rc: _rc,
|
|
690
|
+
}) => ExternalPollerInputs::NextPage {
|
|
691
|
+
paginator,
|
|
692
|
+
update,
|
|
693
|
+
span,
|
|
694
|
+
},
|
|
695
|
+
Ok(WFTExtractorOutput::FailedFetch { run_id, err }) => {
|
|
696
|
+
ExternalPollerInputs::FailedFetch { run_id, err }
|
|
697
|
+
}
|
|
698
|
+
Ok(WFTExtractorOutput::PollerDead) => ExternalPollerInputs::PollerDead,
|
|
699
|
+
Err(e) => ExternalPollerInputs::PollerError(e),
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
#[derive(Debug)]
|
|
704
|
+
enum NewOrFetchedComplete {
|
|
705
|
+
New(WFActCompleteMsg),
|
|
706
|
+
Fetched(HistoryUpdate, HistoryPaginator),
|
|
707
|
+
}
|
|
708
|
+
impl NewOrFetchedComplete {
|
|
709
|
+
fn run_id(&self) -> &str {
|
|
710
|
+
match self {
|
|
711
|
+
NewOrFetchedComplete::New(c) => c.completion.run_id(),
|
|
712
|
+
NewOrFetchedComplete::Fetched(_, p) => &p.run_id,
|
|
713
|
+
}
|
|
977
714
|
}
|
|
978
|
-
|
|
979
|
-
debug!(queries=?wft.pending_queries, "Dispatching queries");
|
|
980
|
-
let query_jobs = wft
|
|
981
|
-
.pending_queries
|
|
982
|
-
.drain(..)
|
|
983
|
-
.map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
|
|
984
|
-
act.jobs.extend(query_jobs);
|
|
985
715
|
}
|