@temporalio/core-bridge 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/Cargo.lock +255 -48
  2. package/package.json +4 -4
  3. package/releases/aarch64-apple-darwin/index.node +0 -0
  4. package/releases/aarch64-unknown-linux-gnu/index.node +0 -0
  5. package/releases/x86_64-apple-darwin/index.node +0 -0
  6. package/releases/x86_64-pc-windows-msvc/index.node +0 -0
  7. package/releases/x86_64-unknown-linux-gnu/index.node +0 -0
  8. package/sdk-core/.buildkite/pipeline.yml +1 -3
  9. package/sdk-core/.cargo/config.toml +5 -2
  10. package/sdk-core/.github/workflows/heavy.yml +28 -0
  11. package/sdk-core/Cargo.toml +1 -1
  12. package/sdk-core/README.md +9 -5
  13. package/sdk-core/client/src/lib.rs +211 -36
  14. package/sdk-core/client/src/raw.rs +1 -1
  15. package/sdk-core/client/src/retry.rs +32 -20
  16. package/sdk-core/core/Cargo.toml +23 -9
  17. package/sdk-core/core/src/abstractions.rs +11 -0
  18. package/sdk-core/core/src/core_tests/activity_tasks.rs +6 -5
  19. package/sdk-core/core/src/core_tests/local_activities.rs +263 -22
  20. package/sdk-core/core/src/core_tests/queries.rs +2 -2
  21. package/sdk-core/core/src/core_tests/workflow_tasks.rs +249 -5
  22. package/sdk-core/core/src/ephemeral_server/mod.rs +5 -6
  23. package/sdk-core/core/src/lib.rs +2 -0
  24. package/sdk-core/core/src/protosext/mod.rs +1 -1
  25. package/sdk-core/core/src/telemetry/log_export.rs +1 -1
  26. package/sdk-core/core/src/telemetry/mod.rs +23 -8
  27. package/sdk-core/core/src/test_help/mod.rs +8 -1
  28. package/sdk-core/core/src/worker/activities/local_activities.rs +259 -125
  29. package/sdk-core/core/src/worker/activities.rs +3 -2
  30. package/sdk-core/core/src/worker/mod.rs +53 -26
  31. package/sdk-core/core/src/worker/workflow/bridge.rs +1 -3
  32. package/sdk-core/core/src/worker/workflow/driven_workflow.rs +3 -5
  33. package/sdk-core/core/src/worker/workflow/history_update.rs +835 -277
  34. package/sdk-core/core/src/worker/workflow/machines/activity_state_machine.rs +9 -17
  35. package/sdk-core/core/src/worker/workflow/machines/cancel_external_state_machine.rs +3 -5
  36. package/sdk-core/core/src/worker/workflow/machines/cancel_workflow_state_machine.rs +1 -2
  37. package/sdk-core/core/src/worker/workflow/machines/child_workflow_state_machine.rs +3 -5
  38. package/sdk-core/core/src/worker/workflow/machines/complete_workflow_state_machine.rs +1 -2
  39. package/sdk-core/core/src/worker/workflow/machines/continue_as_new_workflow_state_machine.rs +1 -2
  40. package/sdk-core/core/src/worker/workflow/machines/fail_workflow_state_machine.rs +1 -2
  41. package/sdk-core/core/src/worker/workflow/machines/local_activity_state_machine.rs +73 -51
  42. package/sdk-core/core/src/worker/workflow/machines/mod.rs +3 -3
  43. package/sdk-core/core/src/worker/workflow/machines/modify_workflow_properties_state_machine.rs +4 -4
  44. package/sdk-core/core/src/worker/workflow/machines/patch_state_machine.rs +1 -2
  45. package/sdk-core/core/src/worker/workflow/machines/signal_external_state_machine.rs +3 -5
  46. package/sdk-core/core/src/worker/workflow/machines/timer_state_machine.rs +6 -7
  47. package/sdk-core/core/src/worker/workflow/machines/transition_coverage.rs +2 -2
  48. package/sdk-core/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +4 -4
  49. package/sdk-core/core/src/worker/workflow/machines/workflow_machines/local_acts.rs +6 -17
  50. package/sdk-core/core/src/worker/workflow/machines/workflow_machines.rs +89 -58
  51. package/sdk-core/core/src/worker/workflow/machines/workflow_task_state_machine.rs +4 -7
  52. package/sdk-core/core/src/worker/workflow/managed_run/managed_wf_test.rs +21 -9
  53. package/sdk-core/core/src/worker/workflow/managed_run.rs +1021 -360
  54. package/sdk-core/core/src/worker/workflow/mod.rs +306 -346
  55. package/sdk-core/core/src/worker/workflow/run_cache.rs +29 -53
  56. package/sdk-core/core/src/worker/workflow/wft_extraction.rs +125 -0
  57. package/sdk-core/core/src/worker/workflow/wft_poller.rs +1 -4
  58. package/sdk-core/core/src/worker/workflow/workflow_stream/saved_wf_inputs.rs +115 -0
  59. package/sdk-core/core/src/worker/workflow/workflow_stream/tonic_status_serde.rs +24 -0
  60. package/sdk-core/core/src/worker/workflow/workflow_stream.rs +444 -714
  61. package/sdk-core/core-api/Cargo.toml +2 -0
  62. package/sdk-core/core-api/src/errors.rs +1 -34
  63. package/sdk-core/core-api/src/lib.rs +6 -2
  64. package/sdk-core/core-api/src/worker.rs +14 -1
  65. package/sdk-core/etc/deps.svg +115 -140
  66. package/sdk-core/etc/regen-depgraph.sh +5 -0
  67. package/sdk-core/fsm/rustfsm_procmacro/src/lib.rs +6 -6
  68. package/sdk-core/fsm/rustfsm_trait/src/lib.rs +7 -3
  69. package/sdk-core/histories/evict_while_la_running_no_interference-16_history.bin +0 -0
  70. package/sdk-core/protos/api_upstream/Makefile +5 -5
  71. package/sdk-core/protos/api_upstream/build/go.mod +7 -0
  72. package/sdk-core/protos/api_upstream/build/go.sum +5 -0
  73. package/sdk-core/protos/api_upstream/build/tools.go +29 -0
  74. package/sdk-core/protos/api_upstream/go.mod +6 -0
  75. package/sdk-core/protos/api_upstream/temporal/api/batch/v1/message.proto +9 -2
  76. package/sdk-core/protos/api_upstream/temporal/api/command/v1/message.proto +12 -19
  77. package/sdk-core/protos/api_upstream/temporal/api/common/v1/message.proto +2 -2
  78. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/batch_operation.proto +3 -2
  79. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/command_type.proto +3 -2
  80. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/common.proto +3 -2
  81. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/event_type.proto +3 -3
  82. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +20 -2
  83. package/sdk-core/protos/api_upstream/temporal/api/{update/v1/message.proto → enums/v1/interaction_type.proto} +11 -18
  84. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/namespace.proto +2 -2
  85. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/query.proto +2 -2
  86. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/reset.proto +2 -2
  87. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/schedule.proto +2 -2
  88. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/task_queue.proto +2 -2
  89. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/update.proto +2 -13
  90. package/sdk-core/protos/api_upstream/temporal/api/enums/v1/workflow.proto +2 -2
  91. package/sdk-core/protos/api_upstream/temporal/api/errordetails/v1/message.proto +2 -2
  92. package/sdk-core/protos/api_upstream/temporal/api/failure/v1/message.proto +2 -2
  93. package/sdk-core/protos/api_upstream/temporal/api/filter/v1/message.proto +2 -2
  94. package/sdk-core/protos/api_upstream/temporal/api/history/v1/message.proto +13 -19
  95. package/sdk-core/protos/api_upstream/temporal/api/interaction/v1/message.proto +87 -0
  96. package/sdk-core/protos/api_upstream/temporal/api/namespace/v1/message.proto +2 -2
  97. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/request_response.proto +2 -2
  98. package/sdk-core/protos/api_upstream/temporal/api/operatorservice/v1/service.proto +2 -2
  99. package/sdk-core/protos/api_upstream/temporal/api/query/v1/message.proto +2 -2
  100. package/sdk-core/protos/api_upstream/temporal/api/replication/v1/message.proto +2 -2
  101. package/sdk-core/protos/api_upstream/temporal/api/schedule/v1/message.proto +2 -2
  102. package/sdk-core/protos/api_upstream/temporal/api/taskqueue/v1/message.proto +2 -2
  103. package/sdk-core/protos/api_upstream/temporal/api/version/v1/message.proto +2 -2
  104. package/sdk-core/protos/api_upstream/temporal/api/workflow/v1/message.proto +2 -2
  105. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +13 -8
  106. package/sdk-core/protos/api_upstream/temporal/api/workflowservice/v1/service.proto +2 -2
  107. package/sdk-core/protos/local/temporal/sdk/core/workflow_activation/workflow_activation.proto +2 -0
  108. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/request_response.proto +2 -2
  109. package/sdk-core/protos/testsrv_upstream/temporal/api/testservice/v1/service.proto +2 -2
  110. package/sdk-core/sdk/Cargo.toml +4 -3
  111. package/sdk-core/sdk/src/lib.rs +87 -21
  112. package/sdk-core/sdk/src/workflow_future.rs +7 -12
  113. package/sdk-core/sdk-core-protos/Cargo.toml +5 -2
  114. package/sdk-core/sdk-core-protos/build.rs +36 -2
  115. package/sdk-core/sdk-core-protos/src/history_builder.rs +26 -19
  116. package/sdk-core/sdk-core-protos/src/history_info.rs +4 -0
  117. package/sdk-core/sdk-core-protos/src/lib.rs +78 -34
  118. package/sdk-core/sdk-core-protos/src/task_token.rs +12 -2
  119. package/sdk-core/test-utils/Cargo.toml +3 -1
  120. package/sdk-core/test-utils/src/histfetch.rs +1 -1
  121. package/sdk-core/test-utils/src/lib.rs +50 -18
  122. package/sdk-core/test-utils/src/wf_input_saver.rs +50 -0
  123. package/sdk-core/test-utils/src/workflows.rs +29 -0
  124. package/sdk-core/tests/fuzzy_workflow.rs +130 -0
  125. package/sdk-core/tests/{load_tests.rs → heavy_tests.rs} +114 -7
  126. package/sdk-core/tests/integ_tests/heartbeat_tests.rs +5 -2
  127. package/sdk-core/tests/integ_tests/metrics_tests.rs +1 -1
  128. package/sdk-core/tests/integ_tests/polling_tests.rs +1 -39
  129. package/sdk-core/tests/integ_tests/queries_tests.rs +2 -127
  130. package/sdk-core/tests/integ_tests/visibility_tests.rs +52 -5
  131. package/sdk-core/tests/integ_tests/workflow_tests/activities.rs +74 -1
  132. package/sdk-core/tests/integ_tests/workflow_tests/cancel_wf.rs +5 -13
  133. package/sdk-core/tests/integ_tests/workflow_tests/continue_as_new.rs +1 -1
  134. package/sdk-core/tests/integ_tests/workflow_tests/determinism.rs +2 -10
  135. package/sdk-core/tests/integ_tests/workflow_tests/local_activities.rs +69 -197
  136. package/sdk-core/tests/integ_tests/workflow_tests/patches.rs +4 -28
  137. package/sdk-core/tests/integ_tests/workflow_tests/replay.rs +12 -7
  138. package/sdk-core/tests/integ_tests/workflow_tests/signals.rs +14 -14
  139. package/sdk-core/tests/integ_tests/workflow_tests/stickyness.rs +3 -19
  140. package/sdk-core/tests/integ_tests/workflow_tests/timers.rs +3 -19
  141. package/sdk-core/tests/integ_tests/workflow_tests/upsert_search_attrs.rs +1 -1
  142. package/sdk-core/tests/integ_tests/workflow_tests.rs +5 -6
  143. package/sdk-core/tests/main.rs +2 -12
  144. package/sdk-core/tests/runner.rs +71 -34
  145. package/sdk-core/tests/wf_input_replay.rs +32 -0
  146. package/sdk-core/bridge-ffi/Cargo.toml +0 -24
  147. package/sdk-core/bridge-ffi/LICENSE.txt +0 -23
  148. package/sdk-core/bridge-ffi/build.rs +0 -25
  149. package/sdk-core/bridge-ffi/include/sdk-core-bridge.h +0 -224
  150. package/sdk-core/bridge-ffi/src/lib.rs +0 -746
  151. package/sdk-core/bridge-ffi/src/wrappers.rs +0 -221
  152. package/sdk-core/protos/local/temporal/sdk/core/bridge/bridge.proto +0 -210
  153. package/sdk-core/sdk/src/conversions.rs +0 -8
@@ -1,250 +1,179 @@
1
+ #[cfg(feature = "save_wf_inputs")]
2
+ mod saved_wf_inputs;
3
+ #[cfg(feature = "save_wf_inputs")]
4
+ mod tonic_status_serde;
5
+
6
+ #[cfg(feature = "save_wf_inputs")]
7
+ pub use saved_wf_inputs::replay_wf_state_inputs;
8
+
1
9
  use crate::{
2
- abstractions::{dbg_panic, stream_when_allowed, MeteredSemaphore},
3
- protosext::ValidPollWFTQResponse,
4
- telemetry::metrics::workflow_worker_type,
5
- worker::{
6
- workflow::{history_update::NextPageToken, run_cache::RunCache, *},
7
- LocalActRequest, LocalActivityResolution, LEGACY_QUERY_ID,
10
+ abstractions::dbg_panic,
11
+ worker::workflow::{
12
+ managed_run::RunUpdateAct,
13
+ run_cache::RunCache,
14
+ wft_extraction::{HistfetchRC, HistoryFetchReq, WFTExtractorOutput},
15
+ *,
8
16
  },
9
17
  MetricsContext,
10
18
  };
11
19
  use futures::{stream, stream::PollNext, Stream, StreamExt};
12
- use std::{collections::VecDeque, fmt::Debug, future, sync::Arc, time::Instant};
13
- use temporal_sdk_core_api::errors::{PollWfError, WFMachinesError};
20
+ use std::{collections::VecDeque, fmt::Debug, future, sync::Arc};
21
+ use temporal_sdk_core_api::errors::PollWfError;
14
22
  use temporal_sdk_core_protos::{
15
- coresdk::{
16
- workflow_activation::{
17
- create_evict_activation, query_to_job, remove_from_cache::EvictionReason,
18
- workflow_activation_job,
19
- },
20
- workflow_completion::Failure,
21
- },
22
- temporal::api::{enums::v1::WorkflowTaskFailedCause, failure::v1::Failure as TFailure},
23
+ coresdk::workflow_activation::remove_from_cache::EvictionReason,
24
+ temporal::api::enums::v1::WorkflowTaskFailedCause,
23
25
  };
24
- use tokio::sync::{mpsc::unbounded_channel, oneshot};
25
- use tokio_stream::wrappers::UnboundedReceiverStream;
26
26
  use tokio_util::sync::CancellationToken;
27
27
  use tracing::{Level, Span};
28
28
 
29
- /// This struct holds all the state needed for tracking what workflow runs are currently cached
30
- /// and how WFTs should be dispatched to them, etc.
29
+ /// This struct holds all the state needed for tracking the state of currently cached workflow runs
30
+ /// and directs all actions which affect them. It is ultimately the top-level arbiter of nearly
31
+ /// everything important relating to workflow state.
31
32
  ///
32
33
  /// See [WFStream::build] for more
33
- pub(crate) struct WFStream {
34
+ pub(super) struct WFStream {
34
35
  runs: RunCache,
35
36
  /// Buffered polls for new runs which need a cache slot to open up before we can handle them
36
37
  buffered_polls_need_cache_slot: VecDeque<PermittedWFT>,
38
+ /// Is filled with runs that we decided need to have their history fetched during state
39
+ /// manipulation. Must be drained after handling each input.
40
+ runs_needing_fetching: VecDeque<HistoryFetchReq>,
37
41
 
38
- /// Client for accessing server for history pagination etc.
39
- client: Arc<dyn WorkerClient>,
40
-
41
- /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
42
- wft_semaphore: MeteredSemaphore,
42
+ history_fetch_refcounter: Arc<HistfetchRC>,
43
43
  shutdown_token: CancellationToken,
44
44
  ignore_evicts_on_shutdown: bool,
45
45
 
46
46
  metrics: MetricsContext,
47
- }
48
- impl WFStream {
49
- fn record_span_fields(&mut self, run_id: &str, span: &Span) {
50
- if let Some(run_handle) = self.runs.get_mut(run_id) {
51
- if let Some(spid) = span.id() {
52
- if run_handle.recorded_span_ids.contains(&spid) {
53
- return;
54
- }
55
- run_handle.recorded_span_ids.insert(spid);
56
-
57
- if let Some(wid) = run_handle.wft.as_ref().map(|wft| &wft.info.wf_id) {
58
- span.record("workflow_id", wid.as_str());
59
- }
60
- }
61
- }
62
- }
63
- }
64
47
 
65
- /// All possible inputs to the [WFStream]
66
- #[derive(derive_more::From, Debug)]
67
- enum WFStreamInput {
68
- NewWft(PermittedWFT),
69
- Local(LocalInput),
70
- /// The stream given to us which represents the poller (or a mock) terminated.
71
- PollerDead,
72
- /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
73
- /// error while polling
74
- PollerError(tonic::Status),
75
- }
76
- impl From<RunUpdateResponse> for WFStreamInput {
77
- fn from(r: RunUpdateResponse) -> Self {
78
- WFStreamInput::Local(LocalInput {
79
- input: LocalInputs::RunUpdateResponse(r.kind),
80
- span: r.span,
81
- })
82
- }
83
- }
84
- /// A non-poller-received input to the [WFStream]
85
- #[derive(derive_more::DebugCustom)]
86
- #[debug(fmt = "LocalInput {{ {:?} }}", input)]
87
- pub(super) struct LocalInput {
88
- pub input: LocalInputs,
89
- pub span: Span,
90
- }
91
- /// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
92
- /// new polls.
93
- #[derive(Debug, derive_more::From)]
94
- pub(super) enum LocalInputs {
95
- Completion(WFActCompleteMsg),
96
- LocalResolution(LocalResolutionMsg),
97
- PostActivation(PostActivationMsg),
98
- RunUpdateResponse(RunUpdateResponseKind),
99
- RequestEviction(RequestEvictMsg),
100
- GetStateInfo(GetStateInfoMsg),
101
- }
102
- impl LocalInputs {
103
- fn run_id(&self) -> Option<&str> {
104
- Some(match self {
105
- LocalInputs::Completion(c) => c.completion.run_id(),
106
- LocalInputs::LocalResolution(lr) => &lr.run_id,
107
- LocalInputs::PostActivation(pa) => &pa.run_id,
108
- LocalInputs::RunUpdateResponse(rur) => rur.run_id(),
109
- LocalInputs::RequestEviction(re) => &re.run_id,
110
- LocalInputs::GetStateInfo(_) => return None,
111
- })
112
- }
113
- }
114
- #[derive(Debug, derive_more::From)]
115
- #[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
116
- enum ExternalPollerInputs {
117
- NewWft(PermittedWFT),
118
- PollerDead,
119
- PollerError(tonic::Status),
48
+ #[cfg(feature = "save_wf_inputs")]
49
+ wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
120
50
  }
121
- impl From<ExternalPollerInputs> for WFStreamInput {
122
- fn from(l: ExternalPollerInputs) -> Self {
123
- match l {
124
- ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
125
- ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
126
- ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
127
- }
128
- }
129
- }
130
-
131
51
  impl WFStream {
132
52
  /// Constructs workflow state management and returns a stream which outputs activations.
133
53
  ///
134
- /// * `external_wfts` is a stream of validated poll responses as returned by a poller (or mock)
135
- /// * `wfts_from_complete` is the recv side of a channel that new WFTs from completions should
136
- /// come down.
54
+ /// * `wft_stream` is a stream of validated poll responses and fetched history pages as returned
55
+ /// by a poller (or mock), via [WFTExtractor].
137
56
  /// * `local_rx` is a stream of actions that workflow state needs to see. Things like
138
- /// completions, local activities finishing, etc. See [LocalInputs].
57
+ /// completions, local activities finishing, etc. See [LocalInputs].
58
+ /// * `local_activity_request_sink` is used to handle outgoing requests to start or cancel
59
+ /// local activities, and may return resolutions that need to be handled immediately.
139
60
  ///
140
- /// These inputs are combined, along with an internal feedback channel for run-specific updates,
141
- /// to form the inputs to a stream of [WFActStreamInput]s. The stream processor then takes
142
- /// action on those inputs, and then may yield activations.
61
+ /// The stream inputs are combined into a stream of [WFActStreamInput]s. The stream processor
62
+ /// then takes action on those inputs, mutating the [WFStream] state, and then may yield
63
+ /// activations.
143
64
  ///
144
- /// Updating runs may need to do async work like fetching additional history. In order to
145
- /// facilitate this, each run lives in its own task which is communicated with by sending
146
- /// [RunAction]s and receiving [RunUpdateResponse]s via its [ManagedRunHandle].
65
+ /// Importantly, nothing async happens while actually mutating state. This means all changes to
66
+ /// all workflow state can be represented purely via the stream of inputs, plus the
67
+ /// calls/retvals from the LA request sink, which is the last unfortunate bit of impurity in
68
+ /// the design. Eliminating it would be nice, so that all inputs come from the passed-in streams
69
+ /// and all outputs flow from the return stream, but it's difficult to do so since it would
70
+ /// require "pausing" in-progress changes to a run while sending & waiting for response from
71
+ /// local activity management. Likely the best option would be to move the pure state info
72
+ /// needed to determine immediate responses into LA state machines themselves (out of the LA
73
+ /// manager), which is a quite substantial change.
147
74
  pub(super) fn build(
148
75
  basics: WorkflowBasics,
149
- external_wfts: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
76
+ wft_stream: impl Stream<Item = Result<WFTExtractorOutput, tonic::Status>> + Send + 'static,
150
77
  local_rx: impl Stream<Item = LocalInput> + Send + 'static,
151
- client: Arc<dyn WorkerClient>,
152
- local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
153
- + Send
154
- + Sync
155
- + 'static,
156
- ) -> impl Stream<Item = Result<ActivationOrAuto, PollWfError>> {
157
- let wft_semaphore = MeteredSemaphore::new(
158
- basics.max_outstanding_wfts,
159
- basics.metrics.with_new_attrs([workflow_worker_type()]),
160
- MetricsContext::available_task_slots,
161
- );
162
- let wft_sem_clone = wft_semaphore.clone();
163
- let proceeder = stream::unfold(wft_sem_clone, |sem| async move {
164
- Some((sem.acquire_owned().await.unwrap(), sem))
165
- });
166
- let poller_wfts = stream_when_allowed(external_wfts, proceeder);
167
- let (run_update_tx, run_update_rx) = unbounded_channel();
168
- let local_rx = stream::select(
169
- local_rx.map(Into::into),
170
- UnboundedReceiverStream::new(run_update_rx).map(Into::into),
171
- );
78
+ local_activity_request_sink: impl LocalActivityRequestSink,
79
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
172
80
  let all_inputs = stream::select_with_strategy(
173
- local_rx,
174
- poller_wfts
175
- .map(|(wft, permit)| match wft {
176
- Ok(wft) => ExternalPollerInputs::NewWft(PermittedWFT { wft, permit }),
177
- Err(e) => ExternalPollerInputs::PollerError(e),
178
- })
81
+ local_rx.map(Into::into),
82
+ wft_stream
83
+ .map(Into::into)
179
84
  .chain(stream::once(async { ExternalPollerInputs::PollerDead }))
180
85
  .map(Into::into)
181
86
  .boxed(),
182
87
  // Priority always goes to the local stream
183
88
  |_: &mut ()| PollNext::Left,
184
89
  );
90
+ Self::build_internal(all_inputs, basics, local_activity_request_sink)
91
+ }
92
+
93
+ fn build_internal(
94
+ all_inputs: impl Stream<Item = WFStreamInput>,
95
+ basics: WorkflowBasics,
96
+ local_activity_request_sink: impl LocalActivityRequestSink,
97
+ ) -> impl Stream<Item = Result<WFStreamOutput, PollWfError>> {
185
98
  let mut state = WFStream {
186
99
  buffered_polls_need_cache_slot: Default::default(),
187
100
  runs: RunCache::new(
188
101
  basics.max_cached_workflows,
189
102
  basics.namespace.clone(),
190
- run_update_tx,
191
103
  Arc::new(local_activity_request_sink),
192
104
  basics.metrics.clone(),
193
105
  ),
194
- client,
195
- wft_semaphore,
196
106
  shutdown_token: basics.shutdown_token,
197
107
  ignore_evicts_on_shutdown: basics.ignore_evicts_on_shutdown,
198
108
  metrics: basics.metrics,
109
+ runs_needing_fetching: Default::default(),
110
+ history_fetch_refcounter: Arc::new(HistfetchRC {}),
111
+
112
+ #[cfg(feature = "save_wf_inputs")]
113
+ wf_state_inputs: basics.wf_state_inputs,
199
114
  };
200
115
  all_inputs
201
- .map(move |action| {
116
+ .map(move |action: WFStreamInput| {
202
117
  let span = span!(Level::DEBUG, "new_stream_input", action=?action);
203
118
  let _span_g = span.enter();
204
119
 
205
- let maybe_activation = match action {
120
+ #[cfg(feature = "save_wf_inputs")]
121
+ let maybe_write = state.prep_input(&action);
122
+
123
+ let mut activations = vec![];
124
+ let maybe_act = match action {
206
125
  WFStreamInput::NewWft(pwft) => {
207
- debug!(run_id=%pwft.wft.workflow_execution.run_id, "New WFT");
208
- state.instantiate_or_update(pwft);
209
- None
126
+ debug!(run_id=%pwft.work.execution.run_id, "New WFT");
127
+ state.instantiate_or_update(pwft)
210
128
  }
211
129
  WFStreamInput::Local(local_input) => {
212
130
  let _span_g = local_input.span.enter();
213
131
  if let Some(rid) = local_input.input.run_id() {
214
- state.record_span_fields(rid, &local_input.span);
132
+ if let Some(rh) = state.runs.get_mut(rid) {
133
+ rh.record_span_fields(&local_input.span);
134
+ }
215
135
  }
216
136
  match local_input.input {
217
- LocalInputs::RunUpdateResponse(resp) => {
218
- state.process_run_update_response(resp)
219
- }
220
137
  LocalInputs::Completion(completion) => {
221
- state.process_completion(completion);
222
- None
138
+ activations.extend(
139
+ state.process_completion(NewOrFetchedComplete::New(completion)),
140
+ );
141
+ None // completions can return more than one activation
142
+ }
143
+ LocalInputs::FetchedPageCompletion { paginator, update } => {
144
+ activations.extend(state.process_completion(
145
+ NewOrFetchedComplete::Fetched(update, paginator),
146
+ ));
147
+ None // completions can return more than one activation
223
148
  }
224
149
  LocalInputs::PostActivation(report) => {
225
- state.process_post_activation(report);
226
- None
150
+ state.process_post_activation(report)
227
151
  }
228
- LocalInputs::LocalResolution(res) => {
229
- state.local_resolution(res);
230
- None
152
+ LocalInputs::LocalResolution(res) => state.local_resolution(res),
153
+ LocalInputs::HeartbeatTimeout(hbt) => {
154
+ state.process_heartbeat_timeout(hbt)
231
155
  }
232
156
  LocalInputs::RequestEviction(evict) => {
233
- state.request_eviction(evict);
234
- None
157
+ state.request_eviction(evict).into_run_update_resp()
235
158
  }
236
159
  LocalInputs::GetStateInfo(gsi) => {
237
160
  let _ = gsi.response_tx.send(WorkflowStateInfo {
238
161
  cached_workflows: state.runs.len(),
239
162
  outstanding_wft: state.outstanding_wfts(),
240
- available_wft_permits: state.wft_semaphore.available_permits(),
241
163
  });
242
164
  None
243
165
  }
244
166
  }
245
167
  }
168
+ WFStreamInput::FailedFetch { run_id, err } => state
169
+ .request_eviction(RequestEvictMsg {
170
+ run_id,
171
+ message: format!("Fetching history failed: {err:?}"),
172
+ reason: EvictionReason::PaginationOrHistoryFetch,
173
+ })
174
+ .into_run_update_resp(),
246
175
  WFStreamInput::PollerDead => {
247
- debug!("WFT poller died, shutting down");
176
+ debug!("WFT poller died, beginning shutdown");
248
177
  state.shutdown_token.cancel();
249
178
  None
250
179
  }
@@ -254,457 +183,226 @@ impl WFStream {
254
183
  }
255
184
  };
256
185
 
257
- if let Some(ref act) = maybe_activation {
258
- if let Some(run_handle) = state.runs.get_mut(act.run_id()) {
259
- run_handle.insert_outstanding_activation(act);
260
- } else {
261
- dbg_panic!("Tried to insert activation for missing run!");
262
- }
186
+ activations.extend(maybe_act.into_iter());
187
+ activations.extend(state.reconcile_buffered());
188
+
189
+ // Always flush *after* actually handling the input, as this allows LA sink
190
+ // responses to be recorded before the input, so they can be read and buffered to be
191
+ // replayed during the handling of the input itself.
192
+ #[cfg(feature = "save_wf_inputs")]
193
+ if let Some(write) = maybe_write {
194
+ state.flush_write(write);
263
195
  }
264
- state.reconcile_buffered();
196
+
265
197
  if state.shutdown_done() {
198
+ info!("Workflow shutdown is done");
266
199
  return Err(PollWfError::ShutDown);
267
200
  }
268
201
 
269
- Ok(maybe_activation)
202
+ Ok(WFStreamOutput {
203
+ activations: activations.into(),
204
+ fetch_histories: std::mem::take(&mut state.runs_needing_fetching),
205
+ })
270
206
  })
271
- .filter_map(|o| {
272
- future::ready(match o {
273
- Ok(None) => None,
274
- Ok(Some(v)) => Some(Ok(v)),
275
- Err(e) => {
276
- if !matches!(e, PollWfError::ShutDown) {
277
- error!(
207
+ .inspect(|o| {
208
+ if let Some(e) = o.as_ref().err() {
209
+ if !matches!(e, PollWfError::ShutDown) {
210
+ error!(
278
211
  "Workflow processing encountered fatal error and must shut down {:?}",
279
212
  e
280
- );
281
- }
282
- Some(Err(e))
213
+ );
283
214
  }
284
- })
215
+ }
285
216
  })
286
217
  // Stop the stream once we have shut down
287
218
  .take_while(|o| future::ready(!matches!(o, Err(PollWfError::ShutDown))))
288
219
  }
289
220
 
290
- fn process_run_update_response(
291
- &mut self,
292
- resp: RunUpdateResponseKind,
293
- ) -> Option<ActivationOrAuto> {
294
- debug!(resp=%resp, "Processing run update response from machines");
295
- match resp {
296
- RunUpdateResponseKind::Good(mut resp) => {
297
- let run_handle = self
298
- .runs
299
- .get_mut(&resp.run_id)
300
- .expect("Workflow must exist, it just sent us an update response");
301
- run_handle.have_seen_terminal_event = resp.have_seen_terminal_event;
302
- run_handle.more_pending_work = resp.more_pending_work;
303
- run_handle.last_action_acked = true;
304
- run_handle.most_recently_processed_event_number =
305
- resp.most_recently_processed_event_number;
306
-
307
- let r = match resp.outgoing_activation {
308
- Some(ActivationOrAuto::LangActivation(mut activation)) => {
309
- if resp.in_response_to_wft {
310
- let wft = run_handle
311
- .wft
312
- .as_mut()
313
- .expect("WFT must exist for run just updated with one");
314
- // If there are in-poll queries, insert jobs for those queries into the
315
- // activation, but only if we hit the cache. If we didn't, those queries
316
- // will need to be dealt with once replay is over
317
- if wft.hit_cache {
318
- put_queries_in_act(&mut activation, wft);
319
- }
320
- }
321
-
322
- if activation.jobs.is_empty() {
323
- dbg_panic!("Should not send lang activation with no jobs");
324
- }
325
- Some(ActivationOrAuto::LangActivation(activation))
326
- }
327
- Some(ActivationOrAuto::ReadyForQueries(mut act)) => {
328
- if let Some(wft) = run_handle.wft.as_mut() {
329
- put_queries_in_act(&mut act, wft);
330
- Some(ActivationOrAuto::LangActivation(act))
331
- } else {
332
- dbg_panic!("Ready for queries but no WFT!");
333
- None
334
- }
335
- }
336
- a @ Some(ActivationOrAuto::Autocomplete { .. }) => a,
337
- None => {
338
- // If the response indicates there is no activation to send yet but there
339
- // is more pending work, we should check again.
340
- if run_handle.more_pending_work {
341
- run_handle.check_more_activations();
342
- None
343
- } else if let Some(reason) = run_handle.trying_to_evict.as_ref() {
344
- // If a run update came back and had nothing to do, but we're trying to
345
- // evict, just do that now as long as there's no other outstanding work.
346
- if run_handle.activation.is_none() && !run_handle.more_pending_work {
347
- let mut evict_act = create_evict_activation(
348
- resp.run_id,
349
- reason.message.clone(),
350
- reason.reason,
351
- );
352
- evict_act.history_length =
353
- run_handle.most_recently_processed_event_number as u32;
354
- Some(ActivationOrAuto::LangActivation(evict_act))
355
- } else {
356
- None
357
- }
358
- } else {
359
- None
360
- }
361
- }
362
- };
363
- if let Some(f) = resp.fulfillable_complete.take() {
364
- f.fulfill();
365
- }
366
-
367
- // After each run update, check if it's ready to handle any buffered poll
368
- if matches!(&r, Some(ActivationOrAuto::Autocomplete { .. }) | None)
369
- && !run_handle.has_any_pending_work(false, true)
370
- {
371
- if let Some(bufft) = run_handle.buffered_resp.take() {
372
- self.instantiate_or_update(bufft);
373
- }
374
- }
375
- r
376
- }
377
- RunUpdateResponseKind::Fail(fail) => {
378
- if let Some(r) = self.runs.get_mut(&fail.run_id) {
379
- r.last_action_acked = true;
380
- }
381
-
382
- if let Some(resp_chan) = fail.completion_resp {
383
- // Automatically fail the workflow task in the event we couldn't update machines
384
- let fail_cause = if matches!(&fail.err, WFMachinesError::Nondeterminism(_)) {
385
- WorkflowTaskFailedCause::NonDeterministicError
386
- } else {
387
- WorkflowTaskFailedCause::Unspecified
388
- };
389
- let wft_fail_str = format!("{:?}", fail.err);
390
- self.failed_completion(
391
- fail.run_id,
392
- fail_cause,
393
- fail.err.evict_reason(),
394
- TFailure::application_failure(wft_fail_str, false).into(),
395
- resp_chan,
396
- );
397
- } else {
398
- // TODO: This should probably also fail workflow tasks, but that wasn't
399
- // implemented pre-refactor either.
400
- warn!(error=?fail.err, run_id=%fail.run_id, "Error while updating workflow");
401
- self.request_eviction(RequestEvictMsg {
402
- run_id: fail.run_id,
403
- message: format!("Error while updating workflow: {:?}", fail.err),
404
- reason: fail.err.evict_reason(),
405
- });
406
- }
407
- None
221
+ /// Instantiate or update run machines with a new WFT
222
+ #[instrument(skip(self, pwft)
223
+ fields(run_id=%pwft.work.execution.run_id,
224
+ workflow_id=%pwft.work.execution.workflow_id))]
225
+ fn instantiate_or_update(&mut self, pwft: PermittedWFT) -> RunUpdateAct {
226
+ match self._instantiate_or_update(pwft) {
227
+ Err(histfetch) => {
228
+ self.runs_needing_fetching.push_back(histfetch);
229
+ Default::default()
408
230
  }
231
+ Ok(r) => r,
409
232
  }
410
233
  }
411
234
 
412
- #[instrument(skip(self, pwft),
413
- fields(run_id=%pwft.wft.workflow_execution.run_id,
414
- workflow_id=%pwft.wft.workflow_execution.workflow_id))]
415
- fn instantiate_or_update(&mut self, pwft: PermittedWFT) {
416
- let (mut work, permit) = if let Some(w) = self.buffer_resp_if_outstanding_work(pwft) {
417
- (w.wft, w.permit)
235
+ fn _instantiate_or_update(
236
+ &mut self,
237
+ pwft: PermittedWFT,
238
+ ) -> Result<RunUpdateAct, HistoryFetchReq> {
239
+ // If the run already exists, possibly buffer the work and return early if we can't handle
240
+ // it yet.
241
+ let pwft = if let Some(rh) = self.runs.get_mut(&pwft.work.execution.run_id) {
242
+ if let Some(w) = rh.buffer_wft_if_outstanding_work(pwft) {
243
+ w
244
+ } else {
245
+ return Ok(None);
246
+ }
418
247
  } else {
419
- return;
248
+ pwft
420
249
  };
421
250
 
422
- let run_id = work.workflow_execution.run_id.clone();
251
+ let run_id = pwft.work.execution.run_id.clone();
423
252
  // If our cache is full and this WFT is for an unseen run we must first evict a run before
424
253
  // we can deal with this task. So, buffer the task in that case.
425
254
  if !self.runs.has_run(&run_id) && self.runs.is_full() {
426
- self.buffer_resp_on_full_cache(PermittedWFT { wft: work, permit });
427
- return;
255
+ self.buffer_resp_on_full_cache(pwft);
256
+ return Ok(None);
428
257
  }
429
258
 
430
- let start_event_id = work.history.events.first().map(|e| e.event_id);
431
- debug!(
432
- run_id = %run_id,
433
- task_token = %&work.task_token,
434
- history_length = %work.history.events.len(),
435
- start_event_id = ?start_event_id,
436
- has_legacy_query = %work.legacy_query.is_some(),
437
- attempt = %work.attempt,
438
- "Applying new workflow task from server"
439
- );
440
-
441
- let wft_info = WorkflowTaskInfo {
442
- attempt: work.attempt,
443
- task_token: work.task_token,
444
- wf_id: work.workflow_execution.workflow_id.clone(),
445
- };
446
- let poll_resp_is_incremental = work
447
- .history
448
- .events
449
- .get(0)
450
- .map(|ev| ev.event_id > 1)
451
- .unwrap_or_default();
452
- let poll_resp_is_incremental = poll_resp_is_incremental || work.history.events.is_empty();
453
-
454
- let mut did_miss_cache = !poll_resp_is_incremental;
455
-
456
- let page_token = if !self.runs.has_run(&run_id) && poll_resp_is_incremental {
259
+ // This check can't really be lifted up higher since we could EX: See it's in the cache,
260
+ // not fetch more history, send the task, see cache is full, buffer it, then evict that
261
+ // run, and now we still have a cache miss.
262
+ if !self.runs.has_run(&run_id) && pwft.work.is_incremental() {
457
263
  debug!(run_id=?run_id, "Workflow task has partial history, but workflow is not in \
458
264
  cache. Will fetch history");
459
265
  self.metrics.sticky_cache_miss();
460
- did_miss_cache = true;
461
- NextPageToken::FetchFromStart
462
- } else {
463
- work.next_page_token.into()
464
- };
465
- let history_update = HistoryUpdate::new(
466
- HistoryPaginator::new(
467
- work.history,
468
- work.workflow_execution.workflow_id.clone(),
469
- run_id.clone(),
470
- page_token,
471
- self.client.clone(),
472
- ),
473
- work.previous_started_event_id,
474
- );
475
- let legacy_query_from_poll = work
476
- .legacy_query
477
- .take()
478
- .map(|q| query_to_job(LEGACY_QUERY_ID.to_string(), q));
479
-
480
- let mut pending_queries = work.query_requests.into_iter().collect::<Vec<_>>();
481
- if !pending_queries.is_empty() && legacy_query_from_poll.is_some() {
482
- error!(
483
- "Server issued both normal and legacy queries. This should not happen. Please \
484
- file a bug report."
485
- );
486
- self.request_eviction(RequestEvictMsg {
487
- run_id,
488
- message: "Server issued both normal and legacy query".to_string(),
489
- reason: EvictionReason::Fatal,
490
- });
491
- return;
492
- }
493
- if let Some(lq) = legacy_query_from_poll {
494
- pending_queries.push(lq);
266
+ return Err(HistoryFetchReq::Full(
267
+ CacheMissFetchReq { original_wft: pwft },
268
+ self.history_fetch_refcounter.clone(),
269
+ ));
495
270
  }
496
271
 
497
- let start_time = Instant::now();
498
- let run_handle = self.runs.instantiate_or_update(
499
- &run_id,
500
- &work.workflow_execution.workflow_id,
501
- &work.workflow_type,
502
- history_update,
503
- start_time,
504
- );
505
- run_handle.wft = Some(OutstandingTask {
506
- info: wft_info,
507
- hit_cache: !did_miss_cache,
508
- pending_queries,
509
- start_time,
510
- permit,
511
- })
272
+ let rur = self.runs.instantiate_or_update(pwft);
273
+ Ok(rur)
512
274
  }
513
275
 
514
- fn process_completion(&mut self, complete: WFActCompleteMsg) {
515
- match complete.completion {
516
- ValidatedCompletion::Success { run_id, commands } => {
517
- self.successful_completion(run_id, commands, complete.response_tx);
518
- }
519
- ValidatedCompletion::Fail { run_id, failure } => {
520
- self.failed_completion(
521
- run_id,
276
+ fn process_completion(&mut self, complete: NewOrFetchedComplete) -> Vec<ActivationOrAuto> {
277
+ let rh = if let Some(rh) = self.runs.get_mut(complete.run_id()) {
278
+ rh
279
+ } else {
280
+ dbg_panic!("Run missing during completion {:?}", complete);
281
+ return vec![];
282
+ };
283
+ let mut acts: Vec<_> = match complete {
284
+ NewOrFetchedComplete::New(complete) => match complete.completion {
285
+ ValidatedCompletion::Success { commands, .. } => {
286
+ match rh.successful_completion(commands, complete.response_tx) {
287
+ Ok(acts) => acts,
288
+ Err(npr) => {
289
+ self.runs_needing_fetching
290
+ .push_back(HistoryFetchReq::NextPage(
291
+ npr,
292
+ self.history_fetch_refcounter.clone(),
293
+ ));
294
+ None
295
+ }
296
+ }
297
+ }
298
+ ValidatedCompletion::Fail { failure, .. } => rh.failed_completion(
522
299
  WorkflowTaskFailedCause::Unspecified,
523
300
  EvictionReason::LangFail,
524
301
  failure,
525
302
  complete.response_tx,
526
- );
303
+ ),
304
+ },
305
+ NewOrFetchedComplete::Fetched(update, paginator) => {
306
+ rh.fetched_page_completion(update, paginator)
527
307
  }
528
308
  }
309
+ .into_iter()
310
+ .collect();
529
311
  // Always queue evictions after completion when we have a zero-size cache
530
312
  if self.runs.cache_capacity() == 0 {
531
- self.request_eviction_of_lru_run();
313
+ acts.extend(self.request_eviction_of_lru_run().into_run_update_resp())
532
314
  }
315
+ acts
533
316
  }
534
317
 
535
- fn successful_completion(
536
- &mut self,
537
- run_id: String,
538
- mut commands: Vec<WFCommand>,
539
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
540
- ) {
541
- let activation_was_only_eviction = self.activation_has_only_eviction(&run_id);
542
- let (task_token, has_pending_query, start_time) =
543
- if let Some(entry) = self.get_task(&run_id) {
544
- (
545
- entry.info.task_token.clone(),
546
- !entry.pending_queries.is_empty(),
547
- entry.start_time,
548
- )
549
- } else {
550
- if !activation_was_only_eviction {
551
- // Not an error if this was an eviction, since it's normal to issue eviction
552
- // activations without an associated workflow task in that case.
553
- dbg_panic!(
554
- "Attempted to complete activation for run {} without associated workflow task",
318
+ fn process_post_activation(&mut self, report: PostActivationMsg) -> RunUpdateAct {
319
+ let run_id = &report.run_id;
320
+ let wft_from_complete = report.wft_from_complete;
321
+ if let Some((wft, _)) = &wft_from_complete {
322
+ if &wft.execution.run_id != run_id {
323
+ dbg_panic!(
324
+ "Server returned a WFT on completion for a different run ({}) than the \
325
+ one being completed ({}). This is a server bug.",
326
+ wft.execution.run_id,
555
327
  run_id
556
- );
557
- }
558
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
559
- return;
560
- };
561
-
562
- // If the only command from the activation is a legacy query response, that means we need
563
- // to respond differently than a typical activation.
564
- if matches!(&commands.as_slice(),
565
- &[WFCommand::QueryResponse(qr)] if qr.query_id == LEGACY_QUERY_ID)
566
- {
567
- let qr = match commands.remove(0) {
568
- WFCommand::QueryResponse(qr) => qr,
569
- _ => unreachable!("We just verified this is the only command"),
570
- };
571
- self.reply_to_complete(
572
- &run_id,
573
- ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
574
- task_token,
575
- action: ActivationAction::RespondLegacyQuery {
576
- result: Box::new(qr),
577
- },
578
- }),
579
- resp_chan,
580
- );
581
- } else {
582
- // First strip out query responses from other commands that actually affect machines
583
- // Would be prettier with `drain_filter`
584
- let mut i = 0;
585
- let mut query_responses = vec![];
586
- while i < commands.len() {
587
- if matches!(commands[i], WFCommand::QueryResponse(_)) {
588
- if let WFCommand::QueryResponse(qr) = commands.remove(i) {
589
- query_responses.push(qr);
590
- }
591
- } else {
592
- i += 1;
593
- }
594
- }
595
-
596
- let activation_was_eviction = self.activation_has_eviction(&run_id);
597
- if let Some(rh) = self.runs.get_mut(&run_id) {
598
- rh.send_completion(RunActivationCompletion {
599
- task_token,
600
- start_time,
601
- commands,
602
- activation_was_eviction,
603
- activation_was_only_eviction,
604
- has_pending_query,
605
- query_responses,
606
- resp_chan: Some(resp_chan),
607
- });
608
- } else {
609
- dbg_panic!("Run {} missing during completion", run_id);
328
+ );
610
329
  }
611
- };
612
- }
613
-
614
- fn failed_completion(
615
- &mut self,
616
- run_id: String,
617
- cause: WorkflowTaskFailedCause,
618
- reason: EvictionReason,
619
- failure: Failure,
620
- resp_chan: oneshot::Sender<ActivationCompleteResult>,
621
- ) {
622
- let tt = if let Some(tt) = self.get_task(&run_id).map(|t| t.info.task_token.clone()) {
623
- tt
624
- } else {
625
- dbg_panic!(
626
- "No workflow task for run id {} found when trying to fail activation",
627
- run_id
628
- );
629
- self.reply_to_complete(&run_id, ActivationCompleteOutcome::DoNothing, resp_chan);
630
- return;
631
- };
632
-
633
- if let Some(m) = self.run_metrics(&run_id) {
634
- m.wf_task_failed();
635
330
  }
636
- let message = format!("Workflow activation completion failed: {:?}", &failure);
637
- // Blow up any cached data associated with the workflow
638
- let should_report = match self.request_eviction(RequestEvictMsg {
639
- run_id: run_id.clone(),
640
- message,
641
- reason,
642
- }) {
643
- EvictionRequestResult::EvictionRequested(Some(attempt))
644
- | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => attempt <= 1,
645
- _ => false,
646
- };
647
- // If the outstanding WFT is a legacy query task, report that we need to fail it
648
- let outcome = if self
649
- .runs
650
- .get(&run_id)
651
- .map(|rh| rh.pending_work_is_legacy_query())
652
- .unwrap_or_default()
653
- {
654
- ActivationCompleteOutcome::ReportWFTFail(
655
- FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
656
- )
657
- } else if should_report {
658
- ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
659
- tt, cause, failure,
660
- ))
661
- } else {
662
- ActivationCompleteOutcome::DoNothing
663
- };
664
- self.reply_to_complete(&run_id, outcome, resp_chan);
665
- }
666
331
 
667
- fn process_post_activation(&mut self, report: PostActivationMsg) {
668
- let run_id = &report.run_id;
332
+ let mut res = None;
669
333
 
670
334
  // If we reported to server, we always want to mark it complete.
671
- let maybe_t = self.complete_wft(run_id, report.reported_wft_to_server);
335
+ let maybe_t = self.complete_wft(run_id, report.wft_report_status);
336
+ // Delete the activation
337
+ let activation = self
338
+ .runs
339
+ .get_mut(run_id)
340
+ .and_then(|rh| rh.delete_activation());
341
+
342
+ // Evict the run if the activation contained an eviction
343
+ let mut applied_buffered_poll_for_this_run = false;
344
+ if activation.map(|a| a.has_eviction()).unwrap_or_default() {
345
+ debug!(run_id=%run_id, "Evicting run");
346
+
347
+ if let Some(mut rh) = self.runs.remove(run_id) {
348
+ if let Some(buff) = rh.take_buffered_wft() {
349
+ // Don't try to apply a buffered poll for this run if we just got a new WFT
350
+ // from completing, because by definition that buffered poll is now an
351
+ // out-of-date WFT.
352
+ if wft_from_complete.is_none() {
353
+ res = self.instantiate_or_update(buff);
354
+ applied_buffered_poll_for_this_run = true;
355
+ }
356
+ }
357
+ }
672
358
 
673
- if self
674
- .get_activation(run_id)
675
- .map(|a| a.has_eviction())
676
- .unwrap_or_default()
677
- {
678
- self.evict_run(run_id);
359
+ // Attempt to apply a buffered poll for some *other* run, if we didn't have a wft
360
+ // from complete or a buffered poll for *this* run.
361
+ if wft_from_complete.is_none() && !applied_buffered_poll_for_this_run {
362
+ if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
363
+ res = self.instantiate_or_update(buff);
364
+ }
365
+ }
679
366
  };
680
367
 
681
- if let Some(wft) = report.wft_from_complete {
682
- debug!(run_id=%wft.workflow_execution.run_id, "New WFT from completion");
368
+ if let Some((wft, pag)) = wft_from_complete {
369
+ debug!(run_id=%wft.execution.run_id, "New WFT from completion");
683
370
  if let Some(t) = maybe_t {
684
- self.instantiate_or_update(PermittedWFT {
685
- wft,
371
+ res = self.instantiate_or_update(PermittedWFT {
372
+ work: wft,
686
373
  permit: t.permit,
687
- })
374
+ paginator: pag,
375
+ });
688
376
  }
689
377
  }
690
378
 
691
- if let Some(rh) = self.runs.get_mut(run_id) {
692
- // Delete the activation
693
- rh.activation.take();
694
- // Attempt to produce the next activation if needed
695
- rh.check_more_activations();
379
+ if res.is_none() {
380
+ if let Some(rh) = self.runs.get_mut(run_id) {
381
+ // Attempt to produce the next activation if needed
382
+ res = rh.check_more_activations();
383
+ }
696
384
  }
385
+ res
697
386
  }
698
387
 
699
- fn local_resolution(&mut self, msg: LocalResolutionMsg) {
388
+ fn local_resolution(&mut self, msg: LocalResolutionMsg) -> RunUpdateAct {
700
389
  let run_id = msg.run_id;
701
390
  if let Some(rh) = self.runs.get_mut(&run_id) {
702
- rh.send_local_resolution(msg.res)
391
+ rh.local_resolution(msg.res)
703
392
  } else {
704
393
  // It isn't an explicit error if the machine is missing when a local activity resolves.
705
394
  // This can happen if an activity reports a timeout after we stopped caring about it.
706
395
  debug!(run_id = %run_id,
707
396
  "Tried to resolve a local activity for a run we are no longer tracking");
397
+ None
398
+ }
399
+ }
400
+
401
+ fn process_heartbeat_timeout(&mut self, run_id: String) -> RunUpdateAct {
402
+ if let Some(rh) = self.runs.get_mut(&run_id) {
403
+ rh.heartbeat_timeout()
404
+ } else {
405
+ None
708
406
  }
709
407
  }
710
408
 
@@ -712,17 +410,8 @@ impl WFStream {
712
410
  /// activation to evict the workflow from the lang side. Workflow will not *actually* be evicted
713
411
  /// until lang replies to that activation
714
412
  fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
715
- let activation_has_eviction = self.activation_has_eviction(&info.run_id);
716
413
  if let Some(rh) = self.runs.get_mut(&info.run_id) {
717
- let attempts = rh.wft.as_ref().map(|wt| wt.info.attempt);
718
- if !activation_has_eviction && rh.trying_to_evict.is_none() {
719
- debug!(run_id=%info.run_id, reason=%info.message, "Eviction requested");
720
- rh.trying_to_evict = Some(info);
721
- rh.check_more_activations();
722
- EvictionRequestResult::EvictionRequested(attempts)
723
- } else {
724
- EvictionRequestResult::EvictionAlreadyRequested(attempts)
725
- }
414
+ rh.request_eviction(info)
726
415
  } else {
727
416
  debug!(run_id=%info.run_id, "Eviction requested for unknown run");
728
417
  EvictionRequestResult::NotFound
@@ -743,36 +432,10 @@ impl WFStream {
743
432
  }
744
433
  }
745
434
 
746
- /// Evict a workflow from the cache by its run id. Any existing pending activations will be
747
- /// destroyed, and any outstanding activations invalidated.
748
- fn evict_run(&mut self, run_id: &str) {
749
- debug!(run_id=%run_id, "Evicting run");
750
-
751
- let mut did_take_buff = false;
752
- // Now it can safely be deleted, it'll get recreated once the un-buffered poll is handled if
753
- // there was one.
754
- if let Some(mut rh) = self.runs.remove(run_id) {
755
- rh.handle.abort();
756
-
757
- if let Some(buff) = rh.buffered_resp.take() {
758
- self.instantiate_or_update(buff);
759
- did_take_buff = true;
760
- }
761
- }
762
-
763
- if !did_take_buff {
764
- // If there wasn't a buffered poll, there might be one for a different run which needs
765
- // a free cache slot, and now there is.
766
- if let Some(buff) = self.buffered_polls_need_cache_slot.pop_front() {
767
- self.instantiate_or_update(buff);
768
- }
769
- }
770
- }
771
-
772
435
  fn complete_wft(
773
436
  &mut self,
774
437
  run_id: &str,
775
- reported_wft_to_server: bool,
438
+ wft_report_status: WFTReportStatus,
776
439
  ) -> Option<OutstandingTask> {
777
440
  // If the WFT completion wasn't sent to the server, but we did see the final event, we still
778
441
  // want to clear the workflow task. This can really only happen in replay testing, where we
@@ -782,9 +445,9 @@ impl WFStream {
782
445
  let saw_final = self
783
446
  .runs
784
447
  .get(run_id)
785
- .map(|r| r.have_seen_terminal_event)
448
+ .map(|r| r.have_seen_terminal_event())
786
449
  .unwrap_or_default();
787
- if !saw_final && !reported_wft_to_server {
450
+ if !saw_final && matches!(wft_report_status, WFTReportStatus::NotReported) {
788
451
  return None;
789
452
  }
790
453
 
@@ -792,60 +455,26 @@ impl WFStream {
792
455
  // Can't mark the WFT complete if there are pending queries, as doing so would destroy
793
456
  // them.
794
457
  if rh
795
- .wft
796
- .as_ref()
458
+ .wft()
797
459
  .map(|wft| !wft.pending_queries.is_empty())
798
460
  .unwrap_or_default()
799
461
  {
800
462
  return None;
801
463
  }
802
464
 
803
- debug!("Marking WFT completed");
804
- let retme = rh.wft.take();
805
- if let Some(ot) = &retme {
806
- if let Some(m) = self.run_metrics(run_id) {
807
- m.wf_task_latency(ot.start_time.elapsed());
808
- }
809
- }
810
- retme
465
+ rh.mark_wft_complete(wft_report_status)
811
466
  } else {
812
467
  None
813
468
  }
814
469
  }
815
470
 
816
- /// Stores some work if there is any outstanding WFT or activation for the run. If there was
817
- /// not, returns the work back out inside the option.
818
- fn buffer_resp_if_outstanding_work(&mut self, work: PermittedWFT) -> Option<PermittedWFT> {
819
- let run_id = &work.wft.workflow_execution.run_id;
820
- if let Some(mut run) = self.runs.get_mut(run_id) {
821
- let about_to_issue_evict = run.trying_to_evict.is_some() && !run.last_action_acked;
822
- let has_wft = run.wft.is_some();
823
- let has_activation = run.activation.is_some();
824
- if has_wft
825
- || has_activation
826
- || about_to_issue_evict
827
- || run.more_pending_work
828
- || !run.last_action_acked
829
- {
830
- debug!(run_id = %run_id, run = ?run,
831
- "Got new WFT for a run with outstanding work, buffering it");
832
- run.buffered_resp = Some(work);
833
- None
834
- } else {
835
- Some(work)
836
- }
837
- } else {
838
- Some(work)
839
- }
840
- }
841
-
842
471
  fn buffer_resp_on_full_cache(&mut self, work: PermittedWFT) {
843
- debug!(run_id=%work.wft.workflow_execution.run_id, "Buffering WFT because cache is full");
472
+ debug!(run_id=%work.work.execution.run_id, "Buffering WFT because cache is full");
844
473
  // If there's already a buffered poll for the run, replace it.
845
474
  if let Some(rh) = self
846
475
  .buffered_polls_need_cache_slot
847
476
  .iter_mut()
848
- .find(|w| w.wft.workflow_execution.run_id == work.wft.workflow_execution.run_id)
477
+ .find(|w| w.work.execution.run_id == work.work.execution.run_id)
849
478
  {
850
479
  *rh = work;
851
480
  } else {
@@ -856,7 +485,7 @@ impl WFStream {
856
485
 
857
486
  /// Makes sure we have enough pending evictions to fulfill the needs of buffered WFTs who are
858
487
  /// waiting on a cache slot
859
- fn reconcile_buffered(&mut self) {
488
+ fn reconcile_buffered(&mut self) -> Vec<ActivationOrAuto> {
860
489
  // We must ensure that there are at least as many pending evictions as there are tasks
861
490
  // that we might need to un-buffer (skipping runs which already have buffered tasks for
862
491
  // themselves)
@@ -865,121 +494,222 @@ impl WFStream {
865
494
  let num_existing_evictions = self
866
495
  .runs
867
496
  .runs_lru_order()
868
- .filter(|(_, h)| h.trying_to_evict.is_some())
497
+ .filter(|(_, h)| h.is_trying_to_evict())
869
498
  .count();
870
499
  let mut num_evicts_needed = num_in_buff.saturating_sub(num_existing_evictions);
871
500
  for (rid, handle) in self.runs.runs_lru_order() {
872
501
  if num_evicts_needed == 0 {
873
502
  break;
874
503
  }
875
- if handle.buffered_resp.is_none() {
504
+ if !handle.has_buffered_wft() {
876
505
  num_evicts_needed -= 1;
877
506
  evict_these.push(rid.to_string());
878
507
  }
879
508
  }
509
+ let mut acts = vec![];
880
510
  for run_id in evict_these {
881
- self.request_eviction(RequestEvictMsg {
882
- run_id,
883
- message: "Workflow cache full".to_string(),
884
- reason: EvictionReason::CacheFull,
885
- });
511
+ acts.extend(
512
+ self.request_eviction(RequestEvictMsg {
513
+ run_id,
514
+ message: "Workflow cache full".to_string(),
515
+ reason: EvictionReason::CacheFull,
516
+ })
517
+ .into_run_update_resp(),
518
+ );
886
519
  }
887
- }
888
-
889
- fn reply_to_complete(
890
- &self,
891
- run_id: &str,
892
- outcome: ActivationCompleteOutcome,
893
- chan: oneshot::Sender<ActivationCompleteResult>,
894
- ) {
895
- let most_recently_processed_event = self
896
- .runs
897
- .peek(run_id)
898
- .map(|rh| rh.most_recently_processed_event_number)
899
- .unwrap_or_default();
900
- chan.send(ActivationCompleteResult {
901
- most_recently_processed_event,
902
- outcome,
903
- })
904
- .expect("Rcv half of activation reply not dropped");
520
+ acts
905
521
  }
906
522
 
907
523
  fn shutdown_done(&self) -> bool {
908
- let all_runs_ready = self
909
- .runs
910
- .handles()
911
- .all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
912
- if self.shutdown_token.is_cancelled() && all_runs_ready {
913
- info!("Workflow shutdown is done");
914
- true
915
- } else {
916
- false
524
+ if self.shutdown_token.is_cancelled() {
525
+ if Arc::strong_count(&self.history_fetch_refcounter) > 1 {
526
+ // Don't exit if there are outstanding fetch requests
527
+ return false;
528
+ }
529
+ let all_runs_ready = self
530
+ .runs
531
+ .handles()
532
+ .all(|r| !r.has_any_pending_work(self.ignore_evicts_on_shutdown, false));
533
+ if all_runs_ready {
534
+ return true;
535
+ }
917
536
  }
918
- }
919
-
920
- fn get_task(&mut self, run_id: &str) -> Option<&OutstandingTask> {
921
- self.runs.get(run_id).and_then(|rh| rh.wft.as_ref())
922
- }
923
-
924
- fn get_activation(&mut self, run_id: &str) -> Option<&OutstandingActivation> {
925
- self.runs.get(run_id).and_then(|rh| rh.activation.as_ref())
926
- }
927
-
928
- fn run_metrics(&mut self, run_id: &str) -> Option<&MetricsContext> {
929
- self.runs.get(run_id).map(|r| &r.metrics)
930
- }
931
-
932
- fn activation_has_only_eviction(&mut self, run_id: &str) -> bool {
933
- self.runs
934
- .get(run_id)
935
- .and_then(|rh| rh.activation)
936
- .map(OutstandingActivation::has_only_eviction)
937
- .unwrap_or_default()
938
- }
939
-
940
- fn activation_has_eviction(&mut self, run_id: &str) -> bool {
941
- self.runs
942
- .get(run_id)
943
- .and_then(|rh| rh.activation)
944
- .map(OutstandingActivation::has_eviction)
945
- .unwrap_or_default()
537
+ false
946
538
  }
947
539
 
948
540
  fn outstanding_wfts(&self) -> usize {
949
- self.runs.handles().filter(|r| r.wft.is_some()).count()
541
+ self.runs.handles().filter(|r| r.wft().is_some()).count()
950
542
  }
951
543
 
952
544
  // Useful when debugging
953
545
  #[allow(dead_code)]
954
546
  fn info_dump(&self, run_id: &str) {
955
547
  if let Some(r) = self.runs.peek(run_id) {
956
- info!(run_id, wft=?r.wft, activation=?r.activation, buffered=r.buffered_resp.is_some(),
957
- trying_to_evict=r.trying_to_evict.is_some(), more_work=r.more_pending_work,
958
- last_action_acked=r.last_action_acked);
548
+ info!(run_id, wft=?r.wft(), activation=?r.activation(),
549
+ buffered_wft=r.has_buffered_wft(),
550
+ trying_to_evict=r.is_trying_to_evict(), more_work=r.more_pending_work());
959
551
  } else {
960
552
  info!(run_id, "Run not found");
961
553
  }
962
554
  }
963
555
  }
964
556
 
965
- /// Drains pending queries from the workflow task and appends them to the activation's jobs
966
- fn put_queries_in_act(act: &mut WorkflowActivation, wft: &mut OutstandingTask) {
967
- // Nothing to do if there are no pending queries
968
- if wft.pending_queries.is_empty() {
969
- return;
970
- }
557
+ /// All possible inputs to the [WFStream]
558
+ #[derive(derive_more::From, Debug)]
559
+ #[cfg_attr(
560
+ feature = "save_wf_inputs",
561
+ derive(serde::Serialize, serde::Deserialize)
562
+ )]
563
+ enum WFStreamInput {
564
+ NewWft(PermittedWFT),
565
+ Local(LocalInput),
566
+ /// The stream given to us which represents the poller (or a mock) terminated.
567
+ PollerDead,
568
+ /// The stream given to us which represents the poller (or a mock) encountered a non-retryable
569
+ /// error while polling
570
+ PollerError(
571
+ #[cfg_attr(
572
+ feature = "save_wf_inputs",
573
+ serde(with = "tonic_status_serde::SerdeStatus")
574
+ )]
575
+ tonic::Status,
576
+ ),
577
+ FailedFetch {
578
+ run_id: String,
579
+ #[cfg_attr(
580
+ feature = "save_wf_inputs",
581
+ serde(with = "tonic_status_serde::SerdeStatus")
582
+ )]
583
+ err: tonic::Status,
584
+ },
585
+ }
971
586
 
972
- let has_legacy = wft.has_pending_legacy_query();
973
- // Cannot dispatch legacy query if there are any other jobs - which can happen if, ex, a local
974
- // activity resolves while we've gotten a legacy query after heartbeating.
975
- if has_legacy && !act.jobs.is_empty() {
976
- return;
587
+ /// A non-poller-received input to the [WFStream]
588
+ #[derive(derive_more::DebugCustom)]
589
+ #[cfg_attr(
590
+ feature = "save_wf_inputs",
591
+ derive(serde::Serialize, serde::Deserialize)
592
+ )]
593
+ #[debug(fmt = "LocalInput {{ {input:?} }}")]
594
+ pub(super) struct LocalInput {
595
+ pub input: LocalInputs,
596
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip, default = "Span::current"))]
597
+ pub span: Span,
598
+ }
599
+ impl From<HeartbeatTimeoutMsg> for LocalInput {
600
+ fn from(hb: HeartbeatTimeoutMsg) -> Self {
601
+ Self {
602
+ input: LocalInputs::HeartbeatTimeout(hb.run_id),
603
+ span: hb.span,
604
+ }
605
+ }
606
+ }
607
+ /// Everything that _isn't_ a poll which may affect workflow state. Always higher priority than
608
+ /// new polls.
609
+ #[derive(Debug, derive_more::From)]
610
+ #[cfg_attr(
611
+ feature = "save_wf_inputs",
612
+ derive(serde::Serialize, serde::Deserialize)
613
+ )]
614
+ pub(super) enum LocalInputs {
615
+ Completion(WFActCompleteMsg),
616
+ FetchedPageCompletion {
617
+ paginator: HistoryPaginator,
618
+ update: HistoryUpdate,
619
+ },
620
+ LocalResolution(LocalResolutionMsg),
621
+ PostActivation(PostActivationMsg),
622
+ RequestEviction(RequestEvictMsg),
623
+ HeartbeatTimeout(String),
624
+ #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
625
+ GetStateInfo(GetStateInfoMsg),
626
+ }
627
+ impl LocalInputs {
628
+ fn run_id(&self) -> Option<&str> {
629
+ Some(match self {
630
+ LocalInputs::Completion(c) => c.completion.run_id(),
631
+ LocalInputs::FetchedPageCompletion { paginator, .. } => &paginator.run_id,
632
+ LocalInputs::LocalResolution(lr) => &lr.run_id,
633
+ LocalInputs::PostActivation(pa) => &pa.run_id,
634
+ LocalInputs::RequestEviction(re) => &re.run_id,
635
+ LocalInputs::HeartbeatTimeout(hb) => hb,
636
+ LocalInputs::GetStateInfo(_) => return None,
637
+ })
638
+ }
639
+ }
640
+ #[derive(Debug)]
641
+ #[allow(clippy::large_enum_variant)] // PollerDead only ever gets used once, so not important.
642
+ enum ExternalPollerInputs {
643
+ NewWft(PermittedWFT),
644
+ PollerDead,
645
+ PollerError(tonic::Status),
646
+ FetchedUpdate(PermittedWFT),
647
+ NextPage {
648
+ paginator: HistoryPaginator,
649
+ update: HistoryUpdate,
650
+ span: Span,
651
+ },
652
+ FailedFetch {
653
+ run_id: String,
654
+ err: tonic::Status,
655
+ },
656
+ }
657
+ impl From<ExternalPollerInputs> for WFStreamInput {
658
+ fn from(l: ExternalPollerInputs) -> Self {
659
+ match l {
660
+ ExternalPollerInputs::NewWft(v) => WFStreamInput::NewWft(v),
661
+ ExternalPollerInputs::PollerDead => WFStreamInput::PollerDead,
662
+ ExternalPollerInputs::PollerError(e) => WFStreamInput::PollerError(e),
663
+ ExternalPollerInputs::FetchedUpdate(wft) => WFStreamInput::NewWft(wft),
664
+ ExternalPollerInputs::FailedFetch { run_id, err } => {
665
+ WFStreamInput::FailedFetch { run_id, err }
666
+ }
667
+ ExternalPollerInputs::NextPage {
668
+ paginator,
669
+ update,
670
+ span,
671
+ } => WFStreamInput::Local(LocalInput {
672
+ input: LocalInputs::FetchedPageCompletion { paginator, update },
673
+ span,
674
+ }),
675
+ }
676
+ }
677
+ }
678
+ impl From<Result<WFTExtractorOutput, tonic::Status>> for ExternalPollerInputs {
679
+ fn from(v: Result<WFTExtractorOutput, tonic::Status>) -> Self {
680
+ match v {
681
+ Ok(WFTExtractorOutput::NewWFT(pwft)) => ExternalPollerInputs::NewWft(pwft),
682
+ Ok(WFTExtractorOutput::FetchResult(updated_wft, _)) => {
683
+ ExternalPollerInputs::FetchedUpdate(updated_wft)
684
+ }
685
+ Ok(WFTExtractorOutput::NextPage {
686
+ paginator,
687
+ update,
688
+ span,
689
+ rc: _rc,
690
+ }) => ExternalPollerInputs::NextPage {
691
+ paginator,
692
+ update,
693
+ span,
694
+ },
695
+ Ok(WFTExtractorOutput::FailedFetch { run_id, err }) => {
696
+ ExternalPollerInputs::FailedFetch { run_id, err }
697
+ }
698
+ Ok(WFTExtractorOutput::PollerDead) => ExternalPollerInputs::PollerDead,
699
+ Err(e) => ExternalPollerInputs::PollerError(e),
700
+ }
701
+ }
702
+ }
703
+ #[derive(Debug)]
704
+ enum NewOrFetchedComplete {
705
+ New(WFActCompleteMsg),
706
+ Fetched(HistoryUpdate, HistoryPaginator),
707
+ }
708
+ impl NewOrFetchedComplete {
709
+ fn run_id(&self) -> &str {
710
+ match self {
711
+ NewOrFetchedComplete::New(c) => c.completion.run_id(),
712
+ NewOrFetchedComplete::Fetched(_, p) => &p.run_id,
713
+ }
977
714
  }
978
-
979
- debug!(queries=?wft.pending_queries, "Dispatching queries");
980
- let query_jobs = wft
981
- .pending_queries
982
- .drain(..)
983
- .map(|q| workflow_activation_job::Variant::QueryWorkflow(q).into());
984
- act.jobs.extend(query_jobs);
985
715
  }