@temporalio/core-bridge 1.5.2 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +304 -112
- package/lib/index.d.ts +8 -6
- package/lib/index.js.map +1 -1
- package/package.json +9 -4
- package/releases/aarch64-apple-darwin/index.node +0 -0
- package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
- package/releases/x86_64-apple-darwin/index.node +0 -0
- package/releases/x86_64-pc-windows-msvc/index.node +0 -0
- package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
- package/sdk-core/.buildkite/docker/Dockerfile +2 -2
- package/sdk-core/.buildkite/docker/docker-compose.yaml +1 -1
- package/sdk-core/.buildkite/pipeline.yml +2 -4
- package/sdk-core/.cargo/config.toml +5 -2
- package/sdk-core/.github/workflows/heavy.yml +29 -0
- package/sdk-core/Cargo.toml +1 -1
- package/sdk-core/README.md +20 -10
- package/sdk-core/client/src/lib.rs +215 -39
- package/sdk-core/client/src/metrics.rs +17 -8
- package/sdk-core/client/src/raw.rs +4 -4
- package/sdk-core/client/src/retry.rs +32 -20
- package/sdk-core/core/Cargo.toml +25 -12
- package/sdk-core/core/src/abstractions/take_cell.rs +28 -0
- package/sdk-core/core/src/abstractions.rs +204 -14
- package/sdk-core/core/src/core_tests/activity_tasks.rs +143 -50
- package/sdk-core/core/src/core_tests/child_workflows.rs +6 -5
- package/sdk-core/core/src/core_tests/determinism.rs +165 -2
- package/sdk-core/core/src/core_tests/local_activities.rs +431 -43
- package/sdk-core/core/src/core_tests/queries.rs +34 -16
- package/sdk-core/core/src/core_tests/workers.rs +8 -5
- package/sdk-core/core/src/core_tests/workflow_tasks.rs +588 -55
- package/sdk-core/core/src/ephemeral_server/mod.rs +113 -12
- package/sdk-core/core/src/internal_flags.rs +155 -0
- package/sdk-core/core/src/lib.rs +16 -9
- package/sdk-core/core/src/protosext/mod.rs +1 -1
- package/sdk-core/core/src/replay/mod.rs +16 -27
- package/sdk-core/core/src/telemetry/log_export.rs +1 -1
- package/sdk-core/core/src/telemetry/metrics.rs +69 -35
- package/sdk-core/core/src/telemetry/mod.rs +60 -21
- package/sdk-core/core/src/telemetry/prometheus_server.rs +19 -13
- package/sdk-core/core/src/test_help/mod.rs +73 -14
- package/sdk-core/core/src/worker/activities/activity_heartbeat_manager.rs +119 -160
- package/sdk-core/core/src/worker/activities/activity_task_poller_stream.rs +89 -0
- package/sdk-core/core/src/worker/activities/local_activities.rs +379 -129
- package/sdk-core/core/src/worker/activities.rs +350 -175
- package/sdk-core/core/src/worker/client/mocks.rs +22 -2
- package/sdk-core/core/src/worker/client.rs +18 -2
- package/sdk-core/core/src/worker/mod.rs +183 -64
- package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
- package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
- package/sdk-core/core/src/worker/workflow/history_update.rs +916 -277
- package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +216 -183
- package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +9 -12
- package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +7 -9
- package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +160 -87
- package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +13 -14
- package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +7 -9
- package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +14 -17
- package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +242 -110
- package/sdk-core/core/src/worker/workflow/machines/mod.rs +27 -19
- package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +9 -11
- package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +321 -206
- package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +13 -18
- package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +20 -29
- package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
- package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +257 -51
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
- package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +310 -150
- package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +17 -20
- package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +31 -15
- package/sdk-core/core/src/worker/workflow/managed_run.rs +1052 -380
- package/sdk-core/core/src/worker/workflow/mod.rs +598 -390
- package/sdk-core/core/src/worker/workflow/run_cache.rs +40 -57
- package/sdk-core/core/src/worker/workflow/wft_extraction.rs +137 -0
- package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
- package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +117 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
- package/sdk-core/core/src/worker/workflow/workflow_stream.rs +469 -718
- package/sdk-core/core-api/Cargo.toml +2 -1
- package/sdk-core/core-api/src/errors.rs +1 -34
- package/sdk-core/core-api/src/lib.rs +19 -9
- package/sdk-core/core-api/src/telemetry.rs +4 -6
- package/sdk-core/core-api/src/worker.rs +19 -1
- package/sdk-core/etc/deps.svg +115 -140
- package/sdk-core/etc/regen-depgraph.sh +5 -0
- package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +86 -61
- package/sdk-core/fsm/rustfsm_trait/src/lib.rs +29 -71
- package/sdk-core/histories/ends_empty_wft_complete.bin +0 -0
- package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
- package/sdk-core/histories/old_change_marker_format.bin +0 -0
- package/sdk-core/protos/api_upstream/.github/CODEOWNERS +2 -1
- package/sdk-core/protos/api_upstream/Makefile +6 -6
- package/sdk-core/protos/api_upstream/build/go.mod +7 -0
- package/sdk-core/protos/api_upstream/build/go.sum +5 -0
- package/sdk-core/protos/api_upstream/build/tools.go +29 -0
- package/sdk-core/protos/api_upstream/go.mod +6 -0
- package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
- package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +7 -26
- package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +13 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -7
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +8 -8
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +25 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +24 -19
- package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +49 -26
- package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +4 -2
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +5 -2
- package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/protocol/v1/message.proto +57 -0
- package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/sdk/v1/task_complete_metadata.proto +63 -0
- package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/update/v1/message.proto +71 -6
- package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +64 -28
- package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +4 -4
- package/sdk-core/protos/local/temporal/sdk/core/activity_result/activity_result.proto +7 -8
- package/sdk-core/protos/local/temporal/sdk/core/activity_task/activity_task.proto +10 -7
- package/sdk-core/protos/local/temporal/sdk/core/child_workflow/child_workflow.proto +19 -30
- package/sdk-core/protos/local/temporal/sdk/core/common/common.proto +1 -0
- package/sdk-core/protos/local/temporal/sdk/core/core_interface.proto +1 -0
- package/sdk-core/protos/local/temporal/sdk/core/external_data/external_data.proto +8 -0
- package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +67 -60
- package/sdk-core/protos/local/temporal/sdk/core/workflow_commands/workflow_commands.proto +85 -84
- package/sdk-core/protos/local/temporal/sdk/core/workflow_completion/workflow_completion.proto +9 -3
- package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
- package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
- package/sdk-core/sdk/Cargo.toml +5 -4
- package/sdk-core/sdk/src/lib.rs +108 -26
- package/sdk-core/sdk/src/workflow_context/options.rs +7 -1
- package/sdk-core/sdk/src/workflow_context.rs +24 -17
- package/sdk-core/sdk/src/workflow_future.rs +16 -15
- package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
- package/sdk-core/sdk-core-protos/build.rs +36 -2
- package/sdk-core/sdk-core-protos/src/history_builder.rs +138 -106
- package/sdk-core/sdk-core-protos/src/history_info.rs +10 -1
- package/sdk-core/sdk-core-protos/src/lib.rs +272 -87
- package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
- package/sdk-core/test-utils/Cargo.toml +3 -1
- package/sdk-core/test-utils/src/canned_histories.rs +106 -296
- package/sdk-core/test-utils/src/histfetch.rs +1 -1
- package/sdk-core/test-utils/src/lib.rs +82 -23
- package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
- package/sdk-core/test-utils/src/workflows.rs +29 -0
- package/sdk-core/tests/fuzzy_workflow.rs +130 -0
- package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +125 -51
- package/sdk-core/tests/integ_tests/ephemeral_server_tests.rs +25 -3
- package/sdk-core/tests/integ_tests/heartbeat_tests.rs +10 -5
- package/sdk-core/tests/integ_tests/metrics_tests.rs +218 -16
- package/sdk-core/tests/integ_tests/polling_tests.rs +4 -47
- package/sdk-core/tests/integ_tests/queries_tests.rs +5 -128
- package/sdk-core/tests/integ_tests/visibility_tests.rs +83 -25
- package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +161 -72
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_external.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +6 -13
- package/sdk-core/tests/integ_tests/workflow_tests/child_workflows.rs +80 -3
- package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +6 -2
- package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +3 -10
- package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +94 -200
- package/sdk-core/tests/integ_tests/workflow_tests/modify_wf_properties.rs +2 -4
- package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +34 -28
- package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +76 -7
- package/sdk-core/tests/integ_tests/workflow_tests/resets.rs +1 -0
- package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +18 -14
- package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +6 -20
- package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +10 -21
- package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +7 -8
- package/sdk-core/tests/integ_tests/workflow_tests.rs +13 -14
- package/sdk-core/tests/main.rs +3 -13
- package/sdk-core/tests/runner.rs +75 -36
- package/sdk-core/tests/wf_input_replay.rs +32 -0
- package/src/conversions.rs +14 -8
- package/src/runtime.rs +9 -8
- package/ts/index.ts +8 -6
- package/sdk-core/bridge-ffi/Cargo.toml +0 -24
- package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
- package/sdk-core/bridge-ffi/build.rs +0 -25
- package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
- package/sdk-core/bridge-ffi/src/lib.rs +0 -746
- package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
- package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
- package/sdk-core/sdk/src/conversions.rs +0 -8
|
@@ -1,250 +1,182 @@
|
|
|
1
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
2
|
+
mod saved_wf_inputs;
|
|
3
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
4
|
+
mod tonic_status_serde;
|
|
5
|
+
|
|
6
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
7
|
+
pub use saved_wf_inputs::replay_wf_state_inputs;
|
|
8
|
+
|
|
1
9
|
use crate::{
|
|
2
|
-
abstractions::
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
10
|
+
abstractions::dbg_panic,
|
|
11
|
+
worker::workflow::{
|
|
12
|
+
managed_run::RunUpdateAct,
|
|
13
|
+
run_cache::RunCache,
|
|
14
|
+
wft_extraction::{HistfetchRC, HistoryFetchReq, WFTExtractorOutput},
|
|
15
|
+
*,
|
|
8
16
|
},
|
|
9
17
|
MetricsContext,
|
|
10
18
|
};
|
|
11
19
|
use futures::{stream, stream::PollNext, Stream, StreamExt};
|
|
12
|
-
use std::{collections::VecDeque, fmt::Debug, future, sync::Arc
|
|
13
|
-
use temporal_sdk_core_api::errors::
|
|
14
|
-
use temporal_sdk_core_protos::
|
|
15
|
-
coresdk::{
|
|
16
|
-
workflow_activation::{
|
|
17
|
-
create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
|
|
18
|
-
workflow_activation_job,
|
|
19
|
-
},
|
|
20
|
-
workflow_completion::Failure,
|
|
21
|
-
},
|
|
22
|
-
temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure as TFailure},
|
|
23
|
-
};
|
|
24
|
-
use tokio::sync::{mpsc::unbounded_channel, oneshot};
|
|
25
|
-
use tokio_stream::wrappers::UnboundedReceiverStream;
|
|
20
|
+
use std::{collections::VecDeque, fmt::Debug, future, sync::Arc};
|
|
21
|
+
use temporal_sdk_core_api::errors::PollWfError;
|
|
22
|
+
use temporal_sdk_core_protos::coresdk::workflow_activation::remove_from_cache::EvictionReason;
|
|
26
23
|
use tokio_util::sync::CancellationToken;
|
|
27
24
|
use tracing::{Level, Span};
|
|
28
25
|
|
|
29
|
-
/// This struct holds all the state needed for tracking
|
|
30
|
-
/// and
|
|
26
|
+
/// This struct holds all the state needed for tracking the state of currently cached workflow runs
|
|
27
|
+
/// and directs all actions which affect them. It is ultimately the top-level arbiter of nearly
|
|
28
|
+
/// everything important relating to workflow state.
|
|
31
29
|
///
|
|
32
30
|
/// See [WFStream::build] for more
|
|
33
|
-
pub(
|
|
31
|
+
pub(super) struct WFStream {
|
|
34
32
|
runs: RunCache,
|
|
35
33
|
/// Buffered polls for new runs which need a cache slot to open up before we can handle them
|
|
36
34
|
buffered_polls_need_cache_slot: VecDeque<PermittedWFT>,
|
|
35
|
+
/// Is filled with runs that we decided need to have their history fetched during state
|
|
36
|
+
/// manipulation. Must be drained after handling each input.
|
|
37
|
+
runs_needing_fetching: VecDeque<HistoryFetchReq>,
|
|
37
38
|
|
|
38
|
-
|
|
39
|
-
client: Arc<dyn WorkerClient>,
|
|
40
|
-
|
|
41
|
-
/// Ensures we stay at or below this worker's maximum concurrent workflow task limit
|
|
42
|
-
wft_semaphore: MeteredSemaphore,
|
|
39
|
+
history_fetch_refcounter: Arc<HistfetchRC>,
|
|
43
40
|
shutdown_token: CancellationToken,
|
|
44
41
|
ignore_evicts_on_shutdown: bool,
|
|
45
42
|
|
|
46
43
|
metrics: MetricsContext,
|
|
47
|
-
}
|
|
48
|
-
impl WFStream {
|
|
49
|
-
fn record_span_fields(&mut self, run_id: &str, span: &Span) {
|
|
50
|
-
if let Some(run_handle) = self.runs.get_mut(run_id) {
|
|
51
|
-
if let Some(spid) = span.id() {
|
|
52
|
-
if run_handle.recorded_span_ids.contains(&spid) {
|
|
53
|
-
return;
|
|
54
|
-
}
|
|
55
|
-
run_handle.recorded_span_ids.insert(spid);
|
|
56
|
-
|
|
57
|
-
if let Some(wid) = run_handle.wft.as_ref().map(|wft| &wft.info.wf_id) {
|
|
58
|
-
span.record("workflow_id", wid.as_str());
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
44
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
enum WFStreamInput {
|
|
68
|
-
NewWft(PermittedWFT),
|
|
69
|
-
Local(LocalInput),
|
|
70
|
-
/// The stream given to us which represents the poller (or a mock) terminated.
|
|
71
|
-
PollerDead,
|
|
72
|
-
/// The stream given to us which represents the poller (or a mock) encountered a non-retryable
|
|
73
|
-
/// error while polling
|
|
74
|
-
PollerError(tonic::Status),
|
|
75
|
-
}
|
|
76
|
-
impl From<RunUpdateResponse> for WFStreamInput {
|
|
77
|
-
fn from(r: RunUpdateResponse) -> Self {
|
|
78
|
-
WFStreamInput::Local(LocalInput {
|
|
79
|
-
input: LocalInputs::RunUpdateResponse(r.kind),
|
|
80
|
-
span: r.span,
|
|
81
|
-
})
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
/// A non-poller-received input to the [WFStream]
|
|
85
|
-
#[derive(derive_more::DebugCustom)]
|
|
86
|
-
#[debug(fmt = "LocalInput {{ {:?} }}", input)]
|
|
87
|
-
pub(super) struct LocalInput {
|
|
88
|
-
pub input: LocalInputs,
|
|
89
|
-
pub span: Span,
|
|
90
|
-
}
|
|
91
|
-
/// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
|
|
92
|
-
/// new polls.
|
|
93
|
-
#[derive(Debug, derive_more::From)]
|
|
94
|
-
pub(super) enum LocalInputs {
|
|
95
|
-
Completion(WFActCompleteMsg),
|
|
96
|
-
LocalResolution(LocalResolutionMsg),
|
|
97
|
-
PostActivation(PostActivationMsg),
|
|
98
|
-
RunUpdateResponse(RunUpdateResponseKind),
|
|
99
|
-
RequestEviction(RequestEvictMsg),
|
|
100
|
-
GetStateInfo(GetStateInfoMsg),
|
|
101
|
-
}
|
|
102
|
-
impl LocalInputs {
|
|
103
|
-
fn run_id(&self) -> Option<&str> {
|
|
104
|
-
Some(match self {
|
|
105
|
-
LocalInputs::Completion(c) => c.completion.run_id(),
|
|
106
|
-
LocalInputs::LocalResolution(lr) => &lr.run_id,
|
|
107
|
-
LocalInputs::PostActivation(pa) => &pa.run_id,
|
|
108
|
-
LocalInputs::RunUpdateResponse(rur) => rur.run_id(),
|
|
109
|
-
LocalInputs::RequestEviction(re) => &re.run_id,
|
|
110
|
-
LocalInputs::GetStateInfo(_) => return None,
|
|
111
|
-
})
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
#[derive(Debug, derive_more::From)]
|
|
115
|
-
#[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
|
|
116
|
-
enum ExternalPollerInputs {
|
|
117
|
-
NewWft(PermittedWFT),
|
|
118
|
-
PollerDead,
|
|
119
|
-
PollerError(tonic::Status),
|
|
120
|
-
}
|
|
121
|
-
impl From<ExternalPollerInputs> for WFStreamInput {
|
|
122
|
-
fn from(l: ExternalPollerInputs) -> Self {
|
|
123
|
-
match l {
|
|
124
|
-
ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
|
|
125
|
-
ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
|
|
126
|
-
ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
|
|
127
|
-
}
|
|
128
|
-
}
|
|
45
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
46
|
+
wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
|
|
129
47
|
}
|
|
130
|
-
|
|
131
48
|
impl WFStream {
|
|
132
49
|
/// Constructs workflow state management and returns a stream which outputs activations.
|
|
133
50
|
///
|
|
134
|
-
/// * `
|
|
135
|
-
///
|
|
136
|
-
/// come down.
|
|
51
|
+
/// * `wft_stream` is a stream of validated poll responses and fetched history pages as returned
|
|
52
|
+
/// by a poller (or mock), via [WFTExtractor].
|
|
137
53
|
/// * `local_rx` is a stream of actions that workflow state needs to see. Things like
|
|
138
|
-
///
|
|
54
|
+
/// completions, local activities finishing, etc. See [LocalInputs].
|
|
55
|
+
/// * `local_activity_request_sink` is used to handle outgoing requests to start or cancel
|
|
56
|
+
/// local activities, and may return resolutions that need to be handled immediately.
|
|
139
57
|
///
|
|
140
|
-
///
|
|
141
|
-
///
|
|
142
|
-
///
|
|
58
|
+
/// The stream inputs are combined into a stream of [WFActStreamInput]s. The stream processor
|
|
59
|
+
/// then takes action on those inputs, mutating the [WFStream] state, and then may yield
|
|
60
|
+
/// activations.
|
|
143
61
|
///
|
|
144
|
-
///
|
|
145
|
-
///
|
|
146
|
-
///
|
|
62
|
+
/// Importantly, nothing async happens while actually mutating state. This means all changes to
|
|
63
|
+
/// all workflow state can be represented purely via the stream of inputs, plus the
|
|
64
|
+
/// calls/retvals from the LA request sink, which is the last unfortunate bit of impurity in
|
|
65
|
+
/// the design. Eliminating it would be nice, so that all inputs come from the passed-in streams
|
|
66
|
+
/// and all outputs flow from the return stream, but it's difficult to do so since it would
|
|
67
|
+
/// require "pausing" in-progress changes to a run while sending & waiting for response from
|
|
68
|
+
/// local activity management. Likely the best option would be to move the pure state info
|
|
69
|
+
/// needed to determine immediate responses into LA state machines themselves (out of the LA
|
|
70
|
+
/// manager), which is a quite substantial change.
|
|
147
71
|
pub(super) fn build(
|
|
148
72
|
basics: WorkflowBasics,
|
|
149
|
-
|
|
73
|
+
wft_stream: impl Stream<Item = Result<WFTExtractorOutput, tonic::Status>> + Send + 'static,
|
|
150
74
|
local_rx: impl Stream<Item = LocalInput> + Send + 'static,
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
+ Send
|
|
154
|
-
+ Sync
|
|
155
|
-
+ 'static,
|
|
156
|
-
) -> impl Stream<Item = Result<ActivationOrAuto, PollWfError>> {
|
|
157
|
-
let wft_semaphore = MeteredSemaphore::new(
|
|
158
|
-
basics.max_outstanding_wfts,
|
|
159
|
-
basics.metrics.with_new_attrs([workflow_worker_type()]),
|
|
160
|
-
MetricsContext::available_task_slots,
|
|
161
|
-
);
|
|
162
|
-
let wft_sem_clone = wft_semaphore.clone();
|
|
163
|
-
let proceeder = stream::unfold(wft_sem_clone, |sem| async move {
|
|
164
|
-
Some((sem.acquire_owned().await.unwrap(), sem))
|
|
165
|
-
});
|
|
166
|
-
let poller_wfts = stream_when_allowed(external_wfts, proceeder);
|
|
167
|
-
let (run_update_tx, run_update_rx) = unbounded_channel();
|
|
168
|
-
let local_rx = stream::select(
|
|
169
|
-
local_rx.map(Into::into),
|
|
170
|
-
UnboundedReceiverStream::new(run_update_rx).map(Into::into),
|
|
171
|
-
);
|
|
75
|
+
local_activity_request_sink: impl LocalActivityRequestSink,
|
|
76
|
+
) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
|
|
172
77
|
let all_inputs = stream::select_with_strategy(
|
|
173
|
-
local_rx,
|
|
174
|
-
|
|
175
|
-
.map(
|
|
176
|
-
Ok(wft) => ExternalPollerInputs::NewWft(PermittedWFT { wft, permit }),
|
|
177
|
-
Err(e) => ExternalPollerInputs::PollerError(e),
|
|
178
|
-
})
|
|
78
|
+
local_rx.map(Into::into),
|
|
79
|
+
wft_stream
|
|
80
|
+
.map(Into::into)
|
|
179
81
|
.chain(stream::once(async { ExternalPollerInputs::PollerDead }))
|
|
180
82
|
.map(Into::into)
|
|
181
83
|
.boxed(),
|
|
182
84
|
// Priority always goes to the local stream
|
|
183
85
|
|_: &mut ()| PollNext::Left,
|
|
184
86
|
);
|
|
87
|
+
Self::build_internal(all_inputs, basics, local_activity_request_sink)
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
fn build_internal(
|
|
91
|
+
all_inputs: impl Stream<Item = WFStreamInput>,
|
|
92
|
+
basics: WorkflowBasics,
|
|
93
|
+
local_activity_request_sink: impl LocalActivityRequestSink,
|
|
94
|
+
) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
|
|
185
95
|
let mut state = WFStream {
|
|
186
96
|
buffered_polls_need_cache_slot: Default::default(),
|
|
187
97
|
runs: RunCache::new(
|
|
188
98
|
basics.max_cached_workflows,
|
|
189
99
|
basics.namespace.clone(),
|
|
190
|
-
|
|
191
|
-
|
|
100
|
+
basics.server_capabilities.clone(),
|
|
101
|
+
local_activity_request_sink,
|
|
192
102
|
basics.metrics.clone(),
|
|
193
103
|
),
|
|
194
|
-
client,
|
|
195
|
-
wft_semaphore,
|
|
196
104
|
shutdown_token: basics.shutdown_token,
|
|
197
105
|
ignore_evicts_on_shutdown: basics.ignore_evicts_on_shutdown,
|
|
198
106
|
metrics: basics.metrics,
|
|
107
|
+
runs_needing_fetching: Default::default(),
|
|
108
|
+
history_fetch_refcounter: Arc::new(HistfetchRC {}),
|
|
109
|
+
|
|
110
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
111
|
+
wf_state_inputs: basics.wf_state_inputs,
|
|
199
112
|
};
|
|
200
113
|
all_inputs
|
|
201
|
-
.map(move |action| {
|
|
114
|
+
.map(move |action: WFStreamInput| {
|
|
202
115
|
let span = span!(Level::DEBUG, "new_stream_input", action=?action);
|
|
203
116
|
let _span_g = span.enter();
|
|
204
117
|
|
|
205
|
-
|
|
118
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
119
|
+
let maybe_write = state.prep_input(&action);
|
|
120
|
+
|
|
121
|
+
let mut activations = vec![];
|
|
122
|
+
let maybe_act = match action {
|
|
206
123
|
WFStreamInput::NewWft(pwft) => {
|
|
207
|
-
debug!(run_id=%pwft.
|
|
208
|
-
state.instantiate_or_update(pwft)
|
|
209
|
-
None
|
|
124
|
+
debug!(run_id=%pwft.work.execution.run_id, "New WFT");
|
|
125
|
+
state.instantiate_or_update(pwft)
|
|
210
126
|
}
|
|
211
127
|
WFStreamInput::Local(local_input) => {
|
|
212
128
|
let _span_g = local_input.span.enter();
|
|
213
129
|
if let Some(rid) = local_input.input.run_id() {
|
|
214
|
-
state.
|
|
130
|
+
if let Some(rh) = state.runs.get_mut(rid) {
|
|
131
|
+
rh.record_span_fields(&local_input.span);
|
|
132
|
+
}
|
|
215
133
|
}
|
|
216
134
|
match local_input.input {
|
|
217
|
-
LocalInputs::RunUpdateResponse(resp) => {
|
|
218
|
-
state.process_run_update_response(resp)
|
|
219
|
-
}
|
|
220
135
|
LocalInputs::Completion(completion) => {
|
|
221
|
-
|
|
222
|
-
|
|
136
|
+
activations.extend(
|
|
137
|
+
state.process_completion(NewOrFetchedComplete::New(completion)),
|
|
138
|
+
);
|
|
139
|
+
None // completions can return more than one activation
|
|
140
|
+
}
|
|
141
|
+
LocalInputs::FetchedPageCompletion { paginator, update } => {
|
|
142
|
+
activations.extend(state.process_completion(
|
|
143
|
+
NewOrFetchedComplete::Fetched(update, paginator),
|
|
144
|
+
));
|
|
145
|
+
None // completions can return more than one activation
|
|
223
146
|
}
|
|
224
147
|
LocalInputs::PostActivation(report) => {
|
|
225
|
-
state.process_post_activation(report)
|
|
226
|
-
None
|
|
148
|
+
state.process_post_activation(report)
|
|
227
149
|
}
|
|
228
|
-
LocalInputs::LocalResolution(res) =>
|
|
229
|
-
|
|
230
|
-
|
|
150
|
+
LocalInputs::LocalResolution(res) => state.local_resolution(res),
|
|
151
|
+
LocalInputs::HeartbeatTimeout(hbt) => {
|
|
152
|
+
state.process_heartbeat_timeout(hbt)
|
|
231
153
|
}
|
|
232
154
|
LocalInputs::RequestEviction(evict) => {
|
|
233
|
-
state.request_eviction(evict)
|
|
234
|
-
None
|
|
155
|
+
state.request_eviction(evict).into_run_update_resp()
|
|
235
156
|
}
|
|
236
157
|
LocalInputs::GetStateInfo(gsi) => {
|
|
237
158
|
let _ = gsi.response_tx.send(WorkflowStateInfo {
|
|
238
159
|
cached_workflows: state.runs.len(),
|
|
239
160
|
outstanding_wft: state.outstanding_wfts(),
|
|
240
|
-
available_wft_permits: state.wft_semaphore.available_permits(),
|
|
241
161
|
});
|
|
242
162
|
None
|
|
243
163
|
}
|
|
244
164
|
}
|
|
245
165
|
}
|
|
166
|
+
WFStreamInput::FailedFetch {
|
|
167
|
+
run_id,
|
|
168
|
+
err,
|
|
169
|
+
auto_reply_fail_tt,
|
|
170
|
+
} => state
|
|
171
|
+
.request_eviction(RequestEvictMsg {
|
|
172
|
+
run_id,
|
|
173
|
+
message: format!("Fetching history failed: {err:?}"),
|
|
174
|
+
reason: EvictionReason::PaginationOrHistoryFetch,
|
|
175
|
+
auto_reply_fail_tt,
|
|
176
|
+
})
|
|
177
|
+
.into_run_update_resp(),
|
|
246
178
|
WFStreamInput::PollerDead => {
|
|
247
|
-
debug!("WFT poller died,
|
|
179
|
+
debug!("WFT poller died, beginning shutdown");
|
|
248
180
|
state.shutdown_token.cancel();
|
|
249
181
|
None
|
|
250
182
|
}
|
|
@@ -254,457 +186,228 @@ impl WFStream {
|
|
|
254
186
|
}
|
|
255
187
|
};
|
|
256
188
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
189
|
+
activations.extend(maybe_act.into_iter());
|
|
190
|
+
activations.extend(state.reconcile_buffered());
|
|
191
|
+
|
|
192
|
+
// Always flush *after* actually handling the input, as this allows LA sink
|
|
193
|
+
// responses to be recorded before the input, so they can be read and buffered to be
|
|
194
|
+
// replayed during the handling of the input itself.
|
|
195
|
+
#[cfg(feature = "save_wf_inputs")]
|
|
196
|
+
if let Some(write) = maybe_write {
|
|
197
|
+
state.flush_write(write);
|
|
263
198
|
}
|
|
264
|
-
|
|
199
|
+
|
|
265
200
|
if state.shutdown_done() {
|
|
201
|
+
info!("Workflow shutdown is done");
|
|
266
202
|
return Err(PollWfError::ShutDown);
|
|
267
203
|
}
|
|
268
204
|
|
|
269
|
-
Ok(
|
|
205
|
+
Ok(WFStreamOutput {
|
|
206
|
+
activations: activations.into(),
|
|
207
|
+
fetch_histories: std::mem::take(&mut state.runs_needing_fetching),
|
|
208
|
+
})
|
|
270
209
|
})
|
|
271
|
-
.
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
Err(e) => {
|
|
276
|
-
if !matches!(e, PollWfError::ShutDown) {
|
|
277
|
-
error!(
|
|
210
|
+
.inspect(|o| {
|
|
211
|
+
if let Some(e) = o.as_ref().err() {
|
|
212
|
+
if !matches!(e, PollWfError::ShutDown) {
|
|
213
|
+
error!(
|
|
278
214
|
"Workflow processing encountered fatal error and must shut down {:?}",
|
|
279
215
|
e
|
|
280
|
-
|
|
281
|
-
}
|
|
282
|
-
Some(Err(e))
|
|
216
|
+
);
|
|
283
217
|
}
|
|
284
|
-
}
|
|
218
|
+
}
|
|
285
219
|
})
|
|
286
220
|
// Stop the stream once we have shut down
|
|
287
221
|
.take_while(|o| future::ready(!matches!(o, Err(PollWfError::ShutDown))))
|
|
288
222
|
}
|
|
289
223
|
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
match
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
.get_mut(&resp.run_id)
|
|
300
|
-
.expect("Workflow must exist, it just sent us an update response");
|
|
301
|
-
run_handle.have_seen_terminal_event = resp.have_seen_terminal_event;
|
|
302
|
-
run_handle.more_pending_work = resp.more_pending_work;
|
|
303
|
-
run_handle.last_action_acked = true;
|
|
304
|
-
run_handle.most_recently_processed_event_number =
|
|
305
|
-
resp.most_recently_processed_event_number;
|
|
306
|
-
|
|
307
|
-
let r = match resp.outgoing_activation {
|
|
308
|
-
Some(ActivationOrAuto::LangActivation(mut activation)) => {
|
|
309
|
-
if resp.in_response_to_wft {
|
|
310
|
-
let wft = run_handle
|
|
311
|
-
.wft
|
|
312
|
-
.as_mut()
|
|
313
|
-
.expect("WFT must exist for run just updated with one");
|
|
314
|
-
// If there are in-poll queries, insert jobs for those queries into the
|
|
315
|
-
// activation, but only if we hit the cache. If we didn't, those queries
|
|
316
|
-
// will need to be dealt with once replay is over
|
|
317
|
-
if wft.hit_cache {
|
|
318
|
-
put_queries_in_act(&mut activation, wft);
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
if activation.jobs.is_empty() {
|
|
323
|
-
dbg_panic!("Should not send lang activation with no jobs");
|
|
324
|
-
}
|
|
325
|
-
Some(ActivationOrAuto::LangActivation(activation))
|
|
326
|
-
}
|
|
327
|
-
Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
|
|
328
|
-
if let Some(wft) = run_handle.wft.as_mut() {
|
|
329
|
-
put_queries_in_act(&mut act, wft);
|
|
330
|
-
Some(ActivationOrAuto::LangActivation(act))
|
|
331
|
-
} else {
|
|
332
|
-
dbg_panic!("Ready for queries but no WFT!");
|
|
333
|
-
None
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
|
|
337
|
-
None => {
|
|
338
|
-
// If the response indicates there is no activation to send yet but there
|
|
339
|
-
// is more pending work, we should check again.
|
|
340
|
-
if run_handle.more_pending_work {
|
|
341
|
-
run_handle.check_more_activations();
|
|
342
|
-
None
|
|
343
|
-
} else if let Some(reason) = run_handle.trying_to_evict.as_ref() {
|
|
344
|
-
// If a run update came back and had nothing to do, but we're trying to
|
|
345
|
-
// evict, just do that now as long as there's no other outstanding work.
|
|
346
|
-
if run_handle.activation.is_none() && !run_handle.more_pending_work {
|
|
347
|
-
let mut evict_act = create_evict_activation(
|
|
348
|
-
resp.run_id,
|
|
349
|
-
reason.message.clone(),
|
|
350
|
-
reason.reason,
|
|
351
|
-
);
|
|
352
|
-
evict_act.history_length =
|
|
353
|
-
run_handle.most_recently_processed_event_number as u32;
|
|
354
|
-
Some(ActivationOrAuto::LangActivation(evict_act))
|
|
355
|
-
} else {
|
|
356
|
-
None
|
|
357
|
-
}
|
|
358
|
-
} else {
|
|
359
|
-
None
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
};
|
|
363
|
-
if let Some(f) = resp.fulfillable_complete.take() {
|
|
364
|
-
f.fulfill();
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// After each run update, check if it's ready to handle any buffered poll
|
|
368
|
-
if matches!(&r, Some(ActivationOrAuto::Autocomplete { .. }) | None)
|
|
369
|
-
&& !run_handle.has_any_pending_work(false, true)
|
|
370
|
-
{
|
|
371
|
-
if let Some(bufft) = run_handle.buffered_resp.take() {
|
|
372
|
-
self.instantiate_or_update(bufft);
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
r
|
|
376
|
-
}
|
|
377
|
-
RunUpdateResponseKind::Fail(fail) => {
|
|
378
|
-
if let Some(r) = self.runs.get_mut(&fail.run_id) {
|
|
379
|
-
r.last_action_acked = true;
|
|
380
|
-
}
|
|
381
|
-
|
|
382
|
-
if let Some(resp_chan) = fail.completion_resp {
|
|
383
|
-
// Automatically fail the workflow task in the event we couldn't update machines
|
|
384
|
-
let fail_cause = if matches!(&fail.err, WFMachinesError::Nondeterminism(_)) {
|
|
385
|
-
WorkflowTaskFailedCause::NonDeterministicError
|
|
386
|
-
} else {
|
|
387
|
-
WorkflowTaskFailedCause::Unspecified
|
|
388
|
-
};
|
|
389
|
-
let wft_fail_str = format!("{:?}", fail.err);
|
|
390
|
-
self.failed_completion(
|
|
391
|
-
fail.run_id,
|
|
392
|
-
fail_cause,
|
|
393
|
-
fail.err.evict_reason(),
|
|
394
|
-
TFailure::application_failure(wft_fail_str, false).into(),
|
|
395
|
-
resp_chan,
|
|
396
|
-
);
|
|
397
|
-
} else {
|
|
398
|
-
// TODO: This should probably also fail workflow tasks, but that wasn't
|
|
399
|
-
// implemented pre-refactor either.
|
|
400
|
-
warn!(error=?fail.err, run_id=%fail.run_id, "Error while updating workflow");
|
|
401
|
-
self.request_eviction(RequestEvictMsg {
|
|
402
|
-
run_id: fail.run_id,
|
|
403
|
-
message: format!("Error while updating workflow: {:?}", fail.err),
|
|
404
|
-
reason: fail.err.evict_reason(),
|
|
405
|
-
});
|
|
406
|
-
}
|
|
407
|
-
None
|
|
224
|
+
/// Instantiate or update run machines with a new WFT
|
|
225
|
+
#[instrument(skip(self, pwft)
|
|
226
|
+
fields(run_id=%pwft.work.execution.run_id,
|
|
227
|
+
workflow_id=%pwft.work.execution.workflow_id))]
|
|
228
|
+
fn instantiate_or_update(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
|
|
229
|
+
match self._instantiate_or_update(pwft) {
|
|
230
|
+
Err(histfetch) => {
|
|
231
|
+
self.runs_needing_fetching.push_back(histfetch);
|
|
232
|
+
Default::default()
|
|
408
233
|
}
|
|
234
|
+
Ok(r) => r,
|
|
409
235
|
}
|
|
410
236
|
}
|
|
411
237
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
238
|
+
fn _instantiate_or_update(
|
|
239
|
+
&mut self,
|
|
240
|
+
pwft: PermittedWFT,
|
|
241
|
+
) -> Result<RunUpdateAct, HistoryFetchReq> {
|
|
242
|
+
// If the run already exists, possibly buffer the work and return early if we can't handle
|
|
243
|
+
// it yet.
|
|
244
|
+
let pwft = if let Some(rh) = self.runs.get_mut(&pwft.work.execution.run_id) {
|
|
245
|
+
if let Some(w) = rh.buffer_wft_if_outstanding_work(pwft) {
|
|
246
|
+
w
|
|
247
|
+
} else {
|
|
248
|
+
return Ok(None);
|
|
249
|
+
}
|
|
418
250
|
} else {
|
|
419
|
-
|
|
251
|
+
pwft
|
|
420
252
|
};
|
|
421
253
|
|
|
422
|
-
let run_id = work.
|
|
254
|
+
let run_id = pwft.work.execution.run_id.clone();
|
|
423
255
|
// If our cache is full and this WFT is for an unseen run we must first evict a run before
|
|
424
256
|
// we can deal with this task. So, buffer the task in that case.
|
|
425
257
|
if !self.runs.has_run(&run_id) && self.runs.is_full() {
|
|
426
|
-
self.buffer_resp_on_full_cache(
|
|
427
|
-
return;
|
|
258
|
+
self.buffer_resp_on_full_cache(pwft);
|
|
259
|
+
return Ok(None);
|
|
428
260
|
}
|
|
429
261
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
history_length = %work.history.events.len(),
|
|
435
|
-
start_event_id = ?start_event_id,
|
|
436
|
-
has_legacy_query = %work.legacy_query.is_some(),
|
|
437
|
-
attempt = %work.attempt,
|
|
438
|
-
"Applying new workflow task from server"
|
|
439
|
-
);
|
|
440
|
-
|
|
441
|
-
let wft_info = WorkflowTaskInfo {
|
|
442
|
-
attempt: work.attempt,
|
|
443
|
-
task_token: work.task_token,
|
|
444
|
-
wf_id: work.workflow_execution.workflow_id.clone(),
|
|
445
|
-
};
|
|
446
|
-
let poll_resp_is_incremental = work
|
|
447
|
-
.history
|
|
448
|
-
.events
|
|
449
|
-
.get(0)
|
|
450
|
-
.map(|ev| ev.event_id > 1)
|
|
451
|
-
.unwrap_or_default();
|
|
452
|
-
let poll_resp_is_incremental = poll_resp_is_incremental || work.history.events.is_empty();
|
|
453
|
-
|
|
454
|
-
let mut did_miss_cache = !poll_resp_is_incremental;
|
|
455
|
-
|
|
456
|
-
let page_token = if !self.runs.has_run(&run_id) && poll_resp_is_incremental {
|
|
262
|
+
// This check can't really be lifted up higher since we could EX: See it's in the cache,
|
|
263
|
+
// not fetch more history, send the task, see cache is full, buffer it, then evict that
|
|
264
|
+
// run, and now we still have a cache miss.
|
|
265
|
+
if !self.runs.has_run(&run_id) && pwft.work.is_incremental() {
|
|
457
266
|
debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
|
|
458
267
|
cache. Will fetch history");
|
|
459
268
|
self.metrics.sticky_cache_miss();
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
};
|
|
465
|
-
let history_update = HistoryUpdate::new(
|
|
466
|
-
HistoryPaginator::new(
|
|
467
|
-
work.history,
|
|
468
|
-
work.workflow_execution.workflow_id.clone(),
|
|
469
|
-
run_id.clone(),
|
|
470
|
-
page_token,
|
|
471
|
-
self.client.clone(),
|
|
472
|
-
),
|
|
473
|
-
work.previous_started_event_id,
|
|
474
|
-
);
|
|
475
|
-
let legacy_query_from_poll = work
|
|
476
|
-
.legacy_query
|
|
477
|
-
.take()
|
|
478
|
-
.map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
|
|
479
|
-
|
|
480
|
-
let mut pending_queries = work.query_requests.into_iter().collect::<Vec<_>>();
|
|
481
|
-
if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
|
|
482
|
-
error!(
|
|
483
|
-
"Server issued both normal and legacy queries. This should not happen. Please \
|
|
484
|
-
file a bug report."
|
|
485
|
-
);
|
|
486
|
-
self.request_eviction(RequestEvictMsg {
|
|
487
|
-
run_id,
|
|
488
|
-
message: "Server issued both normal and legacy query".to_string(),
|
|
489
|
-
reason: EvictionReason::Fatal,
|
|
490
|
-
});
|
|
491
|
-
return;
|
|
492
|
-
}
|
|
493
|
-
if let Some(lq) = legacy_query_from_poll {
|
|
494
|
-
pending_queries.push(lq);
|
|
269
|
+
return Err(HistoryFetchReq::Full(
|
|
270
|
+
CacheMissFetchReq { original_wft: pwft },
|
|
271
|
+
self.history_fetch_refcounter.clone(),
|
|
272
|
+
));
|
|
495
273
|
}
|
|
496
274
|
|
|
497
|
-
let
|
|
498
|
-
|
|
499
|
-
&run_id,
|
|
500
|
-
&work.workflow_execution.workflow_id,
|
|
501
|
-
&work.workflow_type,
|
|
502
|
-
history_update,
|
|
503
|
-
start_time,
|
|
504
|
-
);
|
|
505
|
-
run_handle.wft = Some(OutstandingTask {
|
|
506
|
-
info: wft_info,
|
|
507
|
-
hit_cache: !did_miss_cache,
|
|
508
|
-
pending_queries,
|
|
509
|
-
start_time,
|
|
510
|
-
permit,
|
|
511
|
-
})
|
|
275
|
+
let rur = self.runs.instantiate_or_update(pwft);
|
|
276
|
+
Ok(rur)
|
|
512
277
|
}
|
|
513
278
|
|
|
514
|
-
fn process_completion(&mut self, complete:
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
279
|
+
fn process_completion(&mut self, complete: NewOrFetchedComplete) -> Vec<ActivationOrAuto> {
|
|
280
|
+
let rh = if let Some(rh) = self.runs.get_mut(complete.run_id()) {
|
|
281
|
+
rh
|
|
282
|
+
} else {
|
|
283
|
+
dbg_panic!("Run missing during completion {:?}", complete);
|
|
284
|
+
return vec![];
|
|
285
|
+
};
|
|
286
|
+
let mut acts: Vec<_> = match complete {
|
|
287
|
+
NewOrFetchedComplete::New(complete) => match complete.completion {
|
|
288
|
+
ValidatedCompletion::Success {
|
|
289
|
+
commands,
|
|
290
|
+
used_flags,
|
|
291
|
+
..
|
|
292
|
+
} => match rh.successful_completion(commands, used_flags, complete.response_tx) {
|
|
293
|
+
Ok(acts) => acts,
|
|
294
|
+
Err(npr) => {
|
|
295
|
+
self.runs_needing_fetching
|
|
296
|
+
.push_back(HistoryFetchReq::NextPage(
|
|
297
|
+
npr,
|
|
298
|
+
self.history_fetch_refcounter.clone(),
|
|
299
|
+
));
|
|
300
|
+
None
|
|
301
|
+
}
|
|
302
|
+
},
|
|
303
|
+
ValidatedCompletion::Fail { failure, .. } => rh.failed_completion(
|
|
304
|
+
failure.force_cause(),
|
|
523
305
|
EvictionReason::LangFail,
|
|
524
306
|
failure,
|
|
525
307
|
complete.response_tx,
|
|
526
|
-
)
|
|
308
|
+
),
|
|
309
|
+
},
|
|
310
|
+
NewOrFetchedComplete::Fetched(update, paginator) => {
|
|
311
|
+
rh.fetched_page_completion(update, paginator)
|
|
527
312
|
}
|
|
528
313
|
}
|
|
314
|
+
.into_iter()
|
|
315
|
+
.collect();
|
|
529
316
|
// Always queue evictions after completion when we have a zero-size cache
|
|
530
317
|
if self.runs.cache_capacity() == 0 {
|
|
531
|
-
self.request_eviction_of_lru_run()
|
|
318
|
+
acts.extend(self.request_eviction_of_lru_run().into_run_update_resp())
|
|
532
319
|
}
|
|
320
|
+
acts
|
|
533
321
|
}
|
|
534
322
|
|
|
535
|
-
fn
|
|
536
|
-
&
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
(
|
|
545
|
-
entry.info.task_token.clone(),
|
|
546
|
-
!entry.pending_queries.is_empty(),
|
|
547
|
-
entry.start_time,
|
|
548
|
-
)
|
|
549
|
-
} else {
|
|
550
|
-
if !activation_was_only_eviction {
|
|
551
|
-
// Not an error if this was an eviction, since it's normal to issue eviction
|
|
552
|
-
// activations without an associated workflow task in that case.
|
|
553
|
-
dbg_panic!(
|
|
554
|
-
"Attempted to complete activation for run {} without associated workflow task",
|
|
323
|
+
fn process_post_activation(&mut self, report: PostActivationMsg) -> RunUpdateAct {
|
|
324
|
+
let run_id = &report.run_id;
|
|
325
|
+
let wft_from_complete = report.wft_from_complete;
|
|
326
|
+
if let Some((wft, _)) = &wft_from_complete {
|
|
327
|
+
if &wft.execution.run_id != run_id {
|
|
328
|
+
dbg_panic!(
|
|
329
|
+
"Server returned a WFT on completion for a different run ({}) than the \
|
|
330
|
+
one being completed ({}). This is a server bug.",
|
|
331
|
+
wft.execution.run_id,
|
|
555
332
|
run_id
|
|
556
|
-
|
|
557
|
-
}
|
|
558
|
-
self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
|
|
559
|
-
return;
|
|
560
|
-
};
|
|
561
|
-
|
|
562
|
-
// If the only command from the activation is a legacy query response, that means we need
|
|
563
|
-
// to respond differently than a typical activation.
|
|
564
|
-
if matches!(&commands.as_slice(),
|
|
565
|
-
&[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
|
|
566
|
-
{
|
|
567
|
-
let qr = match commands.remove(0) {
|
|
568
|
-
WFCommand::QueryResponse(qr) => qr,
|
|
569
|
-
_ => unreachable!("We just verified this is the only command"),
|
|
570
|
-
};
|
|
571
|
-
self.reply_to_complete(
|
|
572
|
-
&run_id,
|
|
573
|
-
ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
|
|
574
|
-
task_token,
|
|
575
|
-
action: ActivationAction::RespondLegacyQuery {
|
|
576
|
-
result: Box::new(qr),
|
|
577
|
-
},
|
|
578
|
-
}),
|
|
579
|
-
resp_chan,
|
|
580
|
-
);
|
|
581
|
-
} else {
|
|
582
|
-
// First strip out query responses from other commands that actually affect machines
|
|
583
|
-
// Would be prettier with `drain_filter`
|
|
584
|
-
let mut i = 0;
|
|
585
|
-
let mut query_responses = vec![];
|
|
586
|
-
while i < commands.len() {
|
|
587
|
-
if matches!(commands[i], WFCommand::QueryResponse(_)) {
|
|
588
|
-
if let WFCommand::QueryResponse(qr) = commands.remove(i) {
|
|
589
|
-
query_responses.push(qr);
|
|
590
|
-
}
|
|
591
|
-
} else {
|
|
592
|
-
i += 1;
|
|
593
|
-
}
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
let activation_was_eviction = self.activation_has_eviction(&run_id);
|
|
597
|
-
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
598
|
-
rh.send_completion(RunActivationCompletion {
|
|
599
|
-
task_token,
|
|
600
|
-
start_time,
|
|
601
|
-
commands,
|
|
602
|
-
activation_was_eviction,
|
|
603
|
-
activation_was_only_eviction,
|
|
604
|
-
has_pending_query,
|
|
605
|
-
query_responses,
|
|
606
|
-
resp_chan: Some(resp_chan),
|
|
607
|
-
});
|
|
608
|
-
} else {
|
|
609
|
-
dbg_panic!("Run {} missing during completion", run_id);
|
|
333
|
+
);
|
|
610
334
|
}
|
|
611
|
-
};
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
fn failed_completion(
|
|
615
|
-
&mut self,
|
|
616
|
-
run_id: String,
|
|
617
|
-
cause: WorkflowTaskFailedCause,
|
|
618
|
-
reason: EvictionReason,
|
|
619
|
-
failure: Failure,
|
|
620
|
-
resp_chan: oneshot::Sender<ActivationCompleteResult>,
|
|
621
|
-
) {
|
|
622
|
-
let tt = if let Some(tt) = self.get_task(&run_id).map(|t| t.info.task_token.clone()) {
|
|
623
|
-
tt
|
|
624
|
-
} else {
|
|
625
|
-
dbg_panic!(
|
|
626
|
-
"No workflow task for run id {} found when trying to fail activation",
|
|
627
|
-
run_id
|
|
628
|
-
);
|
|
629
|
-
self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
|
|
630
|
-
return;
|
|
631
|
-
};
|
|
632
|
-
|
|
633
|
-
if let Some(m) = self.run_metrics(&run_id) {
|
|
634
|
-
m.wf_task_failed();
|
|
635
335
|
}
|
|
636
|
-
let message = format!("Workflow activation completion failed: {:?}", &failure);
|
|
637
|
-
// Blow up any cached data associated with the workflow
|
|
638
|
-
let should_report = match self.request_eviction(RequestEvictMsg {
|
|
639
|
-
run_id: run_id.clone(),
|
|
640
|
-
message,
|
|
641
|
-
reason,
|
|
642
|
-
}) {
|
|
643
|
-
EvictionRequestResult::EvictionRequested(Some(attempt))
|
|
644
|
-
| EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
|
|
645
|
-
_ => false,
|
|
646
|
-
};
|
|
647
|
-
// If the outstanding WFT is a legacy query task, report that we need to fail it
|
|
648
|
-
let outcome = if self
|
|
649
|
-
.runs
|
|
650
|
-
.get(&run_id)
|
|
651
|
-
.map(|rh| rh.pending_work_is_legacy_query())
|
|
652
|
-
.unwrap_or_default()
|
|
653
|
-
{
|
|
654
|
-
ActivationCompleteOutcome::ReportWFTFail(
|
|
655
|
-
FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
|
|
656
|
-
)
|
|
657
|
-
} else if should_report {
|
|
658
|
-
ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
|
|
659
|
-
tt, cause, failure,
|
|
660
|
-
))
|
|
661
|
-
} else {
|
|
662
|
-
ActivationCompleteOutcome::DoNothing
|
|
663
|
-
};
|
|
664
|
-
self.reply_to_complete(&run_id, outcome, resp_chan);
|
|
665
|
-
}
|
|
666
336
|
|
|
667
|
-
|
|
668
|
-
let run_id = &report.run_id;
|
|
337
|
+
let mut res = None;
|
|
669
338
|
|
|
670
339
|
// If we reported to server, we always want to mark it complete.
|
|
671
|
-
let maybe_t = self.complete_wft(run_id, report.
|
|
340
|
+
let maybe_t = self.complete_wft(run_id, report.wft_report_status);
|
|
341
|
+
// Delete the activation
|
|
342
|
+
let activation = self
|
|
343
|
+
.runs
|
|
344
|
+
.get_mut(run_id)
|
|
345
|
+
.and_then(|rh| rh.delete_activation());
|
|
346
|
+
|
|
347
|
+
// Evict the run if the activation contained an eviction
|
|
348
|
+
let mut applied_buffered_poll_for_this_run = false;
|
|
349
|
+
if activation.map(|a| a.has_eviction()).unwrap_or_default() {
|
|
350
|
+
debug!(run_id=%run_id, "Evicting run");
|
|
351
|
+
|
|
352
|
+
if let Some(mut rh) = self.runs.remove(run_id) {
|
|
353
|
+
if let Some(buff) = rh.take_buffered_wft() {
|
|
354
|
+
// Don't try to apply a buffered poll for this run if we just got a new WFT
|
|
355
|
+
// from completing, because by definition that buffered poll is now an
|
|
356
|
+
// out-of-date WFT.
|
|
357
|
+
if wft_from_complete.is_none() {
|
|
358
|
+
res = self.instantiate_or_update(buff);
|
|
359
|
+
applied_buffered_poll_for_this_run = true;
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
}
|
|
672
363
|
|
|
673
|
-
|
|
674
|
-
.
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
364
|
+
// Attempt to apply a buffered poll for some *other* run, if we didn't have a wft
|
|
365
|
+
// from complete or a buffered poll for *this* run.
|
|
366
|
+
if wft_from_complete.is_none() && !applied_buffered_poll_for_this_run {
|
|
367
|
+
if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
|
|
368
|
+
res = self.instantiate_or_update(buff);
|
|
369
|
+
}
|
|
370
|
+
}
|
|
679
371
|
};
|
|
680
372
|
|
|
681
|
-
if let Some(wft) =
|
|
682
|
-
debug!(run_id=%wft.
|
|
373
|
+
if let Some((wft, pag)) = wft_from_complete {
|
|
374
|
+
debug!(run_id=%wft.execution.run_id, "New WFT from completion");
|
|
683
375
|
if let Some(t) = maybe_t {
|
|
684
|
-
self.instantiate_or_update(PermittedWFT {
|
|
685
|
-
wft,
|
|
376
|
+
res = self.instantiate_or_update(PermittedWFT {
|
|
377
|
+
work: wft,
|
|
686
378
|
permit: t.permit,
|
|
687
|
-
|
|
379
|
+
paginator: pag,
|
|
380
|
+
});
|
|
688
381
|
}
|
|
689
382
|
}
|
|
690
383
|
|
|
691
|
-
if
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
384
|
+
if res.is_none() {
|
|
385
|
+
if let Some(rh) = self.runs.get_mut(run_id) {
|
|
386
|
+
// Attempt to produce the next activation if needed
|
|
387
|
+
res = rh.check_more_activations();
|
|
388
|
+
}
|
|
696
389
|
}
|
|
390
|
+
res
|
|
697
391
|
}
|
|
698
392
|
|
|
699
|
-
fn local_resolution(&mut self, msg: LocalResolutionMsg) {
|
|
393
|
+
fn local_resolution(&mut self, msg: LocalResolutionMsg) -> RunUpdateAct {
|
|
700
394
|
let run_id = msg.run_id;
|
|
701
395
|
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
702
|
-
rh.
|
|
396
|
+
rh.local_resolution(msg.res)
|
|
703
397
|
} else {
|
|
704
398
|
// It isn't an explicit error if the machine is missing when a local activity resolves.
|
|
705
399
|
// This can happen if an activity reports a timeout after we stopped caring about it.
|
|
706
400
|
debug!(run_id = %run_id,
|
|
707
401
|
"Tried to resolve a local activity for a run we are no longer tracking");
|
|
402
|
+
None
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
fn process_heartbeat_timeout(&mut self, run_id: String) -> RunUpdateAct {
|
|
407
|
+
if let Some(rh) = self.runs.get_mut(&run_id) {
|
|
408
|
+
rh.heartbeat_timeout()
|
|
409
|
+
} else {
|
|
410
|
+
None
|
|
708
411
|
}
|
|
709
412
|
}
|
|
710
413
|
|
|
@@ -712,17 +415,8 @@ impl WFStream {
|
|
|
712
415
|
/// activation to evict the workflow from the lang side. Workflow will not *actually* be evicted
|
|
713
416
|
/// until lang replies to that activation
|
|
714
417
|
fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
|
|
715
|
-
let activation_has_eviction = self.activation_has_eviction(&info.run_id);
|
|
716
418
|
if let Some(rh) = self.runs.get_mut(&info.run_id) {
|
|
717
|
-
|
|
718
|
-
if !activation_has_eviction && rh.trying_to_evict.is_none() {
|
|
719
|
-
debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
|
|
720
|
-
rh.trying_to_evict = Some(info);
|
|
721
|
-
rh.check_more_activations();
|
|
722
|
-
EvictionRequestResult::EvictionRequested(attempts)
|
|
723
|
-
} else {
|
|
724
|
-
EvictionRequestResult::EvictionAlreadyRequested(attempts)
|
|
725
|
-
}
|
|
419
|
+
rh.request_eviction(info)
|
|
726
420
|
} else {
|
|
727
421
|
debug!(run_id=%info.run_id, "Eviction requested for unknown run");
|
|
728
422
|
EvictionRequestResult::NotFound
|
|
@@ -736,6 +430,7 @@ impl WFStream {
|
|
|
736
430
|
run_id,
|
|
737
431
|
message: "Workflow cache full".to_string(),
|
|
738
432
|
reason: EvictionReason::CacheFull,
|
|
433
|
+
auto_reply_fail_tt: None,
|
|
739
434
|
})
|
|
740
435
|
} else {
|
|
741
436
|
// This branch shouldn't really be possible
|
|
@@ -743,36 +438,10 @@ impl WFStream {
|
|
|
743
438
|
}
|
|
744
439
|
}
|
|
745
440
|
|
|
746
|
-
/// Evict a workflow from the cache by its run id. Any existing pending activations will be
|
|
747
|
-
/// destroyed, and any outstanding activations invalidated.
|
|
748
|
-
fn evict_run(&mut self, run_id: &str) {
|
|
749
|
-
debug!(run_id=%run_id, "Evicting run");
|
|
750
|
-
|
|
751
|
-
let mut did_take_buff = false;
|
|
752
|
-
// Now it can safely be deleted, it'll get recreated once the un-buffered poll is handled if
|
|
753
|
-
// there was one.
|
|
754
|
-
if let Some(mut rh) = self.runs.remove(run_id) {
|
|
755
|
-
rh.handle.abort();
|
|
756
|
-
|
|
757
|
-
if let Some(buff) = rh.buffered_resp.take() {
|
|
758
|
-
self.instantiate_or_update(buff);
|
|
759
|
-
did_take_buff = true;
|
|
760
|
-
}
|
|
761
|
-
}
|
|
762
|
-
|
|
763
|
-
if !did_take_buff {
|
|
764
|
-
// If there wasn't a buffered poll, there might be one for a different run which needs
|
|
765
|
-
// a free cache slot, and now there is.
|
|
766
|
-
if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
|
|
767
|
-
self.instantiate_or_update(buff);
|
|
768
|
-
}
|
|
769
|
-
}
|
|
770
|
-
}
|
|
771
|
-
|
|
772
441
|
fn complete_wft(
|
|
773
442
|
&mut self,
|
|
774
443
|
run_id: &str,
|
|
775
|
-
|
|
444
|
+
wft_report_status: WFTReportStatus,
|
|
776
445
|
) -> Option<OutstandingTask> {
|
|
777
446
|
// If the WFT completion wasn't sent to the server, but we did see the final event, we still
|
|
778
447
|
// want to clear the workflow task. This can really only happen in replay testing, where we
|
|
@@ -782,9 +451,9 @@ impl WFStream {
|
|
|
782
451
|
let saw_final = self
|
|
783
452
|
.runs
|
|
784
453
|
.get(run_id)
|
|
785
|
-
.map(|r| r.have_seen_terminal_event)
|
|
454
|
+
.map(|r| r.have_seen_terminal_event())
|
|
786
455
|
.unwrap_or_default();
|
|
787
|
-
if !saw_final && !
|
|
456
|
+
if !saw_final && matches!(wft_report_status, WFTReportStatus::NotReported) {
|
|
788
457
|
return None;
|
|
789
458
|
}
|
|
790
459
|
|
|
@@ -792,60 +461,26 @@ impl WFStream {
|
|
|
792
461
|
// Can't mark the WFT complete if there are pending queries, as doing so would destroy
|
|
793
462
|
// them.
|
|
794
463
|
if rh
|
|
795
|
-
.wft
|
|
796
|
-
.as_ref()
|
|
464
|
+
.wft()
|
|
797
465
|
.map(|wft| !wft.pending_queries.is_empty())
|
|
798
466
|
.unwrap_or_default()
|
|
799
467
|
{
|
|
800
468
|
return None;
|
|
801
469
|
}
|
|
802
470
|
|
|
803
|
-
|
|
804
|
-
let retme = rh.wft.take();
|
|
805
|
-
if let Some(ot) = &retme {
|
|
806
|
-
if let Some(m) = self.run_metrics(run_id) {
|
|
807
|
-
m.wf_task_latency(ot.start_time.elapsed());
|
|
808
|
-
}
|
|
809
|
-
}
|
|
810
|
-
retme
|
|
471
|
+
rh.mark_wft_complete(wft_report_status)
|
|
811
472
|
} else {
|
|
812
473
|
None
|
|
813
474
|
}
|
|
814
475
|
}
|
|
815
476
|
|
|
816
|
-
/// Stores some work if there is any outstanding WFT or activation for the run. If there was
|
|
817
|
-
/// not, returns the work back out inside the option.
|
|
818
|
-
fn buffer_resp_if_outstanding_work(&mut self, work: PermittedWFT) -> Option<PermittedWFT> {
|
|
819
|
-
let run_id = &work.wft.workflow_execution.run_id;
|
|
820
|
-
if let Some(mut run) = self.runs.get_mut(run_id) {
|
|
821
|
-
let about_to_issue_evict = run.trying_to_evict.is_some() && !run.last_action_acked;
|
|
822
|
-
let has_wft = run.wft.is_some();
|
|
823
|
-
let has_activation = run.activation.is_some();
|
|
824
|
-
if has_wft
|
|
825
|
-
|| has_activation
|
|
826
|
-
|| about_to_issue_evict
|
|
827
|
-
|| run.more_pending_work
|
|
828
|
-
|| !run.last_action_acked
|
|
829
|
-
{
|
|
830
|
-
debug!(run_id = %run_id, run = ?run,
|
|
831
|
-
"Got new WFT for a run with outstanding work, buffering it");
|
|
832
|
-
run.buffered_resp = Some(work);
|
|
833
|
-
None
|
|
834
|
-
} else {
|
|
835
|
-
Some(work)
|
|
836
|
-
}
|
|
837
|
-
} else {
|
|
838
|
-
Some(work)
|
|
839
|
-
}
|
|
840
|
-
}
|
|
841
|
-
|
|
842
477
|
fn buffer_resp_on_full_cache(&mut self, work: PermittedWFT) {
|
|
843
|
-
debug!(run_id=%work.
|
|
478
|
+
debug!(run_id=%work.work.execution.run_id, "Buffering WFT because cache is full");
|
|
844
479
|
// If there's already a buffered poll for the run, replace it.
|
|
845
480
|
if let Some(rh) = self
|
|
846
481
|
.buffered_polls_need_cache_slot
|
|
847
482
|
.iter_mut()
|
|
848
|
-
.find(|w| w.
|
|
483
|
+
.find(|w| w.work.execution.run_id == work.work.execution.run_id)
|
|
849
484
|
{
|
|
850
485
|
*rh = work;
|
|
851
486
|
} else {
|
|
@@ -856,7 +491,7 @@ impl WFStream {
|
|
|
856
491
|
|
|
857
492
|
/// Makes sure we have enough pending evictions to fulfill the needs of buffered WFTs who are
|
|
858
493
|
/// waiting on a cache slot
|
|
859
|
-
fn reconcile_buffered(&mut self) {
|
|
494
|
+
fn reconcile_buffered(&mut self) -> Vec<ActivationOrAuto> {
|
|
860
495
|
// We must ensure that there are at least as many pending evictions as there are tasks
|
|
861
496
|
// that we might need to un-buffer (skipping runs which already have buffered tasks for
|
|
862
497
|
// themselves)
|
|
@@ -865,121 +500,237 @@ impl WFStream {
|
|
|
865
500
|
let num_existing_evictions = self
|
|
866
501
|
.runs
|
|
867
502
|
.runs_lru_order()
|
|
868
|
-
.filter(|(_, h)| h.
|
|
503
|
+
.filter(|(_, h)| h.is_trying_to_evict())
|
|
869
504
|
.count();
|
|
870
505
|
let mut num_evicts_needed = num_in_buff.saturating_sub(num_existing_evictions);
|
|
871
506
|
for (rid, handle) in self.runs.runs_lru_order() {
|
|
872
507
|
if num_evicts_needed == 0 {
|
|
873
508
|
break;
|
|
874
509
|
}
|
|
875
|
-
if handle.
|
|
510
|
+
if !handle.has_buffered_wft() {
|
|
876
511
|
num_evicts_needed -= 1;
|
|
877
512
|
evict_these.push(rid.to_string());
|
|
878
513
|
}
|
|
879
514
|
}
|
|
515
|
+
let mut acts = vec![];
|
|
880
516
|
for run_id in evict_these {
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
517
|
+
acts.extend(
|
|
518
|
+
self.request_eviction(RequestEvictMsg {
|
|
519
|
+
run_id,
|
|
520
|
+
message: "Workflow cache full".to_string(),
|
|
521
|
+
reason: EvictionReason::CacheFull,
|
|
522
|
+
auto_reply_fail_tt: None,
|
|
523
|
+
})
|
|
524
|
+
.into_run_update_resp(),
|
|
525
|
+
);
|
|
886
526
|
}
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
fn reply_to_complete(
|
|
890
|
-
&self,
|
|
891
|
-
run_id: &str,
|
|
892
|
-
outcome: ActivationCompleteOutcome,
|
|
893
|
-
chan: oneshot::Sender<ActivationCompleteResult>,
|
|
894
|
-
) {
|
|
895
|
-
let most_recently_processed_event = self
|
|
896
|
-
.runs
|
|
897
|
-
.peek(run_id)
|
|
898
|
-
.map(|rh| rh.most_recently_processed_event_number)
|
|
899
|
-
.unwrap_or_default();
|
|
900
|
-
chan.send(ActivationCompleteResult {
|
|
901
|
-
most_recently_processed_event,
|
|
902
|
-
outcome,
|
|
903
|
-
})
|
|
904
|
-
.expect("Rcv half of activation reply not dropped");
|
|
527
|
+
acts
|
|
905
528
|
}
|
|
906
529
|
|
|
907
530
|
fn shutdown_done(&self) -> bool {
|
|
908
|
-
|
|
909
|
-
.
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
531
|
+
if self.shutdown_token.is_cancelled() {
|
|
532
|
+
if Arc::strong_count(&self.history_fetch_refcounter) > 1 {
|
|
533
|
+
// Don't exit if there are outstanding fetch requests
|
|
534
|
+
return false;
|
|
535
|
+
}
|
|
536
|
+
let all_runs_ready = self
|
|
537
|
+
.runs
|
|
538
|
+
.handles()
|
|
539
|
+
.all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
|
|
540
|
+
if all_runs_ready {
|
|
541
|
+
return true;
|
|
542
|
+
}
|
|
917
543
|
}
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
fn get_task(&mut self, run_id: &str) -> Option<&OutstandingTask> {
|
|
921
|
-
self.runs.get(run_id).and_then(|rh| rh.wft.as_ref())
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
fn get_activation(&mut self, run_id: &str) -> Option<&OutstandingActivation> {
|
|
925
|
-
self.runs.get(run_id).and_then(|rh| rh.activation.as_ref())
|
|
926
|
-
}
|
|
927
|
-
|
|
928
|
-
fn run_metrics(&mut self, run_id: &str) -> Option<&MetricsContext> {
|
|
929
|
-
self.runs.get(run_id).map(|r| &r.metrics)
|
|
930
|
-
}
|
|
931
|
-
|
|
932
|
-
fn activation_has_only_eviction(&mut self, run_id: &str) -> bool {
|
|
933
|
-
self.runs
|
|
934
|
-
.get(run_id)
|
|
935
|
-
.and_then(|rh| rh.activation)
|
|
936
|
-
.map(OutstandingActivation::has_only_eviction)
|
|
937
|
-
.unwrap_or_default()
|
|
938
|
-
}
|
|
939
|
-
|
|
940
|
-
fn activation_has_eviction(&mut self, run_id: &str) -> bool {
|
|
941
|
-
self.runs
|
|
942
|
-
.get(run_id)
|
|
943
|
-
.and_then(|rh| rh.activation)
|
|
944
|
-
.map(OutstandingActivation::has_eviction)
|
|
945
|
-
.unwrap_or_default()
|
|
544
|
+
false
|
|
946
545
|
}
|
|
947
546
|
|
|
948
547
|
fn outstanding_wfts(&self) -> usize {
|
|
949
|
-
self.runs.handles().filter(|r| r.wft.is_some()).count()
|
|
548
|
+
self.runs.handles().filter(|r| r.wft().is_some()).count()
|
|
950
549
|
}
|
|
951
550
|
|
|
952
551
|
// Useful when debugging
|
|
953
552
|
#[allow(dead_code)]
|
|
954
553
|
fn info_dump(&self, run_id: &str) {
|
|
955
554
|
if let Some(r) = self.runs.peek(run_id) {
|
|
956
|
-
info!(run_id, wft=?r.wft, activation=?r.activation
|
|
957
|
-
|
|
958
|
-
|
|
555
|
+
info!(run_id, wft=?r.wft(), activation=?r.activation(),
|
|
556
|
+
buffered_wft=r.has_buffered_wft(),
|
|
557
|
+
trying_to_evict=r.is_trying_to_evict(), more_work=r.more_pending_work());
|
|
959
558
|
} else {
|
|
960
559
|
info!(run_id, "Run not found");
|
|
961
560
|
}
|
|
962
561
|
}
|
|
963
562
|
}
|
|
964
563
|
|
|
965
|
-
///
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
564
|
+
/// All possible inputs to the [WFStream]
|
|
565
|
+
#[derive(derive_more::From, Debug)]
|
|
566
|
+
#[cfg_attr(
|
|
567
|
+
feature = "save_wf_inputs",
|
|
568
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
569
|
+
)]
|
|
570
|
+
enum WFStreamInput {
|
|
571
|
+
NewWft(PermittedWFT),
|
|
572
|
+
Local(LocalInput),
|
|
573
|
+
/// The stream given to us which represents the poller (or a mock) terminated.
|
|
574
|
+
PollerDead,
|
|
575
|
+
/// The stream given to us which represents the poller (or a mock) encountered a non-retryable
|
|
576
|
+
/// error while polling
|
|
577
|
+
PollerError(
|
|
578
|
+
#[cfg_attr(
|
|
579
|
+
feature = "save_wf_inputs",
|
|
580
|
+
serde(with = "tonic_status_serde::SerdeStatus")
|
|
581
|
+
)]
|
|
582
|
+
tonic::Status,
|
|
583
|
+
),
|
|
584
|
+
FailedFetch {
|
|
585
|
+
run_id: String,
|
|
586
|
+
#[cfg_attr(
|
|
587
|
+
feature = "save_wf_inputs",
|
|
588
|
+
serde(with = "tonic_status_serde::SerdeStatus")
|
|
589
|
+
)]
|
|
590
|
+
err: tonic::Status,
|
|
591
|
+
auto_reply_fail_tt: Option<TaskToken>,
|
|
592
|
+
},
|
|
593
|
+
}
|
|
971
594
|
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
595
|
+
/// A non-poller-received input to the [WFStream]
|
|
596
|
+
#[derive(derive_more::DebugCustom)]
|
|
597
|
+
#[cfg_attr(
|
|
598
|
+
feature = "save_wf_inputs",
|
|
599
|
+
derive(serde::Serialize, serde::Deserialize)
|
|
600
|
+
)]
|
|
601
|
+
#[debug(fmt = "LocalInput {{ {input:?} }}")]
|
|
602
|
+
pub(super) struct LocalInput {
|
|
603
|
+
pub input: LocalInputs,
|
|
604
|
+
#[cfg_attr(feature = "save_wf_inputs", serde(skip, default = "Span::current"))]
|
|
605
|
+
pub span: Span,
|
|
606
|
+
}
|
|
607
|
+
impl From<HeartbeatTimeoutMsg> for LocalInput {
|
|
608
|
+
fn from(hb: HeartbeatTimeoutMsg) -> Self {
|
|
609
|
+
Self {
|
|
610
|
+
input: LocalInputs::HeartbeatTimeout(hb.run_id),
|
|
611
|
+
span: hb.span,
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
}
|
|
/// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
/// new polls.
#[derive(Debug, derive_more::From)]
#[cfg_attr(
    feature = "save_wf_inputs",
    derive(serde::Serialize, serde::Deserialize)
)]
pub(super) enum LocalInputs {
    /// A workflow activation completed.
    Completion(WFActCompleteMsg),
    /// A requested history page finished fetching for an existing run.
    FetchedPageCompletion {
        paginator: HistoryPaginator,
        update: HistoryUpdate,
    },
    /// A local activity resolved.
    LocalResolution(LocalResolutionMsg),
    /// Post-activation-processing work for a run.
    PostActivation(PostActivationMsg),
    /// A request to evict a run from the cache.
    RequestEviction(RequestEvictMsg),
    /// A heartbeat timeout fired for the contained run id.
    HeartbeatTimeout(String),
    /// State-introspection request; carries a response channel, so it cannot be serialized
    /// and is skipped when saving inputs.
    #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
    GetStateInfo(GetStateInfoMsg),
}
|
|
635
|
+
impl LocalInputs {
|
|
636
|
+
fn run_id(&self) -> Option<&str> {
|
|
637
|
+
Some(match self {
|
|
638
|
+
LocalInputs::Completion(c) => c.completion.run_id(),
|
|
639
|
+
LocalInputs::FetchedPageCompletion { paginator, .. } => &paginator.run_id,
|
|
640
|
+
LocalInputs::LocalResolution(lr) => &lr.run_id,
|
|
641
|
+
LocalInputs::PostActivation(pa) => &pa.run_id,
|
|
642
|
+
LocalInputs::RequestEviction(re) => &re.run_id,
|
|
643
|
+
LocalInputs::HeartbeatTimeout(hb) => hb,
|
|
644
|
+
LocalInputs::GetStateInfo(_) => return None,
|
|
645
|
+
})
|
|
646
|
+
}
|
|
647
|
+
}
|
|
/// Inputs produced by the poller/extractor side, before being merged into the
/// unified [WFStreamInput] (see the `From` impl below).
#[derive(Debug)]
#[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
enum ExternalPollerInputs {
    /// A brand-new workflow task from the poller.
    NewWft(PermittedWFT),
    /// The poll stream terminated.
    PollerDead,
    /// The poll stream hit a non-retryable error.
    PollerError(tonic::Status),
    /// A WFT whose full history fetch completed; routed to the stream as `NewWft`.
    FetchedUpdate(PermittedWFT),
    /// A single additional page of history arrived for an existing run.
    NextPage {
        paginator: HistoryPaginator,
        update: HistoryUpdate,
        span: Span,
    },
    /// A history fetch failed for the given run.
    FailedFetch {
        run_id: String,
        err: tonic::Status,
        auto_reply_fail_tt: Option<TaskToken>,
    },
}
|
|
666
|
+
impl From<ExternalPollerInputs> for WFStreamInput {
|
|
667
|
+
fn from(l: ExternalPollerInputs) -> Self {
|
|
668
|
+
match l {
|
|
669
|
+
ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
|
|
670
|
+
ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
|
|
671
|
+
ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
|
|
672
|
+
ExternalPollerInputs::FetchedUpdate(wft) => WFStreamInput::NewWft(wft),
|
|
673
|
+
ExternalPollerInputs::FailedFetch {
|
|
674
|
+
run_id,
|
|
675
|
+
err,
|
|
676
|
+
auto_reply_fail_tt,
|
|
677
|
+
} => WFStreamInput::FailedFetch {
|
|
678
|
+
run_id,
|
|
679
|
+
err,
|
|
680
|
+
auto_reply_fail_tt,
|
|
681
|
+
},
|
|
682
|
+
ExternalPollerInputs::NextPage {
|
|
683
|
+
paginator,
|
|
684
|
+
update,
|
|
685
|
+
span,
|
|
686
|
+
} => WFStreamInput::Local(LocalInput {
|
|
687
|
+
input: LocalInputs::FetchedPageCompletion { paginator, update },
|
|
688
|
+
span,
|
|
689
|
+
}),
|
|
690
|
+
}
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
impl From<Result<WFTExtractorOutput, tonic::Status>> for ExternalPollerInputs {
|
|
694
|
+
fn from(v: Result<WFTExtractorOutput, tonic::Status>) -> Self {
|
|
695
|
+
match v {
|
|
696
|
+
Ok(WFTExtractorOutput::NewWFT(pwft)) => ExternalPollerInputs::NewWft(pwft),
|
|
697
|
+
Ok(WFTExtractorOutput::FetchResult(updated_wft, _)) => {
|
|
698
|
+
ExternalPollerInputs::FetchedUpdate(updated_wft)
|
|
699
|
+
}
|
|
700
|
+
Ok(WFTExtractorOutput::NextPage {
|
|
701
|
+
paginator,
|
|
702
|
+
update,
|
|
703
|
+
span,
|
|
704
|
+
rc: _rc,
|
|
705
|
+
}) => ExternalPollerInputs::NextPage {
|
|
706
|
+
paginator,
|
|
707
|
+
update,
|
|
708
|
+
span,
|
|
709
|
+
},
|
|
710
|
+
Ok(WFTExtractorOutput::FailedFetch {
|
|
711
|
+
run_id,
|
|
712
|
+
err,
|
|
713
|
+
auto_reply_fail_tt,
|
|
714
|
+
}) => ExternalPollerInputs::FailedFetch {
|
|
715
|
+
run_id,
|
|
716
|
+
err,
|
|
717
|
+
auto_reply_fail_tt,
|
|
718
|
+
},
|
|
719
|
+
Ok(WFTExtractorOutput::PollerDead) => ExternalPollerInputs::PollerDead,
|
|
720
|
+
Err(e) => ExternalPollerInputs::PollerError(e),
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
}
|
|
/// A workflow activation completion, distinguishing one received directly from lang from
/// one reconstructed out of a fetched history page.
#[derive(Debug)]
enum NewOrFetchedComplete {
    /// A directly-received completion message.
    New(WFActCompleteMsg),
    /// A completion driven by a fetched history update and its paginator.
    Fetched(HistoryUpdate, HistoryPaginator),
}
|
|
729
|
+
impl NewOrFetchedComplete {
|
|
730
|
+
fn run_id(&self) -> &str {
|
|
731
|
+
match self {
|
|
732
|
+
NewOrFetchedComplete::New(c) => c.completion.run_id(),
|
|
733
|
+
NewOrFetchedComplete::Fetched(_, p) => &p.run_id,
|
|
734
|
+
}
|
|
977
735
|
}
|
|
978
|
-
|
|
979
|
-
debug!(queries=?wft.pending_queries, "Dispatching queries");
|
|
980
|
-
let query_jobs = wft
|
|
981
|
-
.pending_queries
|
|
982
|
-
.drain(..)
|
|
983
|
-
.map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
|
|
984
|
-
act.jobs.extend(query_jobs);
|
|
985
736
|
}
|