npm - @temporalio/core-bridge - Versions diffs - 1.5.2 → 1.6.0 - Mend

@temporalio/core-bridge 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

package/sdk-core/core/src/worker/workflow/mod.rs CHANGED Viewed

@@ -8,36 +8,43 @@ mod history_update;
 mod machines;
 mod managed_run;
 mod run_cache;
+mod wft_extraction;
 pub(crate) mod wft_poller;
 mod workflow_stream;
+#[cfg(feature = "save_wf_inputs")]
+pub use workflow_stream::replay_wf_state_inputs;
 pub(crate) use bridge::WorkflowBridge;
 pub(crate) use driven_workflow::{DrivenWorkflow, WorkflowFetcher};
-pub(crate) use history_update::{HistoryPaginator, HistoryUpdate};
-pub(crate) use machines::WFMachinesError;
+pub(crate) use history_update::HistoryUpdate;
 #[cfg(test)]
 pub(crate) use managed_run::ManagedWFFunc;
 use crate::{
-    abstractions::OwnedMeteredSemPermit,
-    protosext::{legacy_query_failure, ValidPollWFTQResponse, WorkflowActivationExt},
-    telemetry::VecDisplayer,
+    abstractions::{stream_when_allowed, MeteredSemaphore, OwnedMeteredSemPermit},
+    protosext::{legacy_query_failure, ValidPollWFTQResponse},
+    telemetry::{metrics::workflow_worker_type, VecDisplayer},
     worker::{
-        activities::{ActivitiesFromWFTsHandle, PermittedTqResp},
+        activities::{ActivitiesFromWFTsHandle, LocalActivityManager, PermittedTqResp},
         client::{WorkerClient, WorkflowTaskCompletion},
         workflow::{
-            managed_run::{ManagedRun, WorkflowManager},
+            history_update::HistoryPaginator,
+            managed_run::RunUpdateAct,
+            wft_extraction::{HistoryFetchReq, WFTExtractor},
             wft_poller::validate_wft,
             workflow_stream::{LocalInput, LocalInputs, WFStream},
         },
-        LocalActRequest, LocalActivityResolution,
+        LocalActRequest, LocalActivityExecutionResult, LocalActivityResolution,
     },
     MetricsContext,
 };
 use futures::{stream::BoxStream, Stream, StreamExt};
+use futures_util::stream;
+use prost_types::TimestampError;
 use std::{
-    collections::HashSet,
-    fmt::{Debug, Display, Formatter},
+    collections::VecDeque,
+    fmt::Debug,
     future::Future,
     ops::DerefMut,
     result,
@@ -59,8 +66,9 @@ use temporal_sdk_core_protos::{
     },
     temporal::api::{
         command::v1::{command::Attributes, Command as ProtoCommand, Command},
-        common::v1::{Memo, RetryPolicy, SearchAttributes},
+        common::v1::{Memo, RetryPolicy, SearchAttributes, WorkflowExecution},
         enums::v1::WorkflowTaskFailedCause,
+        query::v1::WorkflowQuery,
         taskqueue::v1::StickyExecutionAttributes,
         workflowservice::v1::PollActivityTaskQueueResponse,
     },
@@ -68,7 +76,7 @@ use temporal_sdk_core_protos::{
 };
 use tokio::{
     sync::{
-        mpsc::{unbounded_channel, UnboundedSender},
+        mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender},
         oneshot,
     },
     task,
@@ -79,6 +87,9 @@ use tokio_util::sync::CancellationToken;
 use tracing::Span;
 pub(crate) const LEGACY_QUERY_ID: &str = "legacy_query";
+/// The fraction of a WFT timeout we are willing to wait before sending a WFT heartbeat when
+/// necessary.
+const WFT_HEARTBEAT_TIMEOUT_FRACTION: f32 = 0.8;
 const MAX_EAGER_ACTIVITY_RESERVATIONS_PER_WORKFLOW_TASK: usize = 3;
 type Result<T, E = WFMachinesError> = result::Result<T, E>;
@@ -100,9 +111,11 @@ pub(crate) struct Workflows {
     sticky_attrs: Option<StickyExecutionAttributes>,
     /// If set, can be used to reserve activity task slots for eager-return of new activity tasks.
     activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
+    /// Ensures we stay at or below this worker's maximum concurrent workflow task limit
+    wft_semaphore: MeteredSemaphore,
 }
-pub(super) struct WorkflowBasics {
+pub(crate) struct WorkflowBasics {
     pub max_cached_workflows: usize,
     pub max_outstanding_wfts: usize,
     pub shutdown_token: CancellationToken,
@@ -110,6 +123,9 @@ pub(super) struct WorkflowBasics {
     pub namespace: String,
     pub task_queue: String,
     pub ignore_evicts_on_shutdown: bool,
+    pub fetching_concurrency: usize,
+    #[cfg(feature = "save_wf_inputs")]
+    pub wf_state_inputs: Option<UnboundedSender<Vec<u8>>>,
 }
 impl Workflows {
@@ -118,20 +134,38 @@ impl Workflows {
         sticky_attrs: Option<StickyExecutionAttributes>,
         client: Arc<dyn WorkerClient>,
         wft_stream: impl Stream<Item = Result<ValidPollWFTQResponse, tonic::Status>> + Send + 'static,
-        local_activity_request_sink: impl Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution>
-            + Send
-            + Sync
-            + 'static,
+        local_activity_request_sink: impl LocalActivityRequestSink,
+        heartbeat_timeout_rx: UnboundedReceiver<HeartbeatTimeoutMsg>,
         activity_tasks_handle: Option<ActivitiesFromWFTsHandle>,
     ) -> Self {
         let (local_tx, local_rx) = unbounded_channel();
+        let (fetch_tx, fetch_rx) = unbounded_channel();
         let shutdown_tok = basics.shutdown_token.clone();
         let task_queue = basics.task_queue.clone();
-        let mut stream = WFStream::build(
-            basics,
+        let wft_semaphore = MeteredSemaphore::new(
+            basics.max_outstanding_wfts,
+            basics.metrics.with_new_attrs([workflow_worker_type()]),
+            MetricsContext::available_task_slots,
+        );
+        // Only allow polling of the new WFT stream if there are available task slots
+        let proceeder = stream::unfold(wft_semaphore.clone(), |sem| async move {
+            Some((sem.acquire_owned().await.unwrap(), sem))
+        });
+        let wft_stream = stream_when_allowed(wft_stream, proceeder);
+        let extracted_wft_stream = WFTExtractor::build(
+            client.clone(),
+            basics.fetching_concurrency,
             wft_stream,
+            UnboundedReceiverStream::new(fetch_rx),
+        );
+        let locals_stream = stream::select(
             UnboundedReceiverStream::new(local_rx),
-            client.clone(),
+            UnboundedReceiverStream::new(heartbeat_timeout_rx).map(Into::into),
+        );
+        let mut stream = WFStream::build(
+            basics,
+            extracted_wft_stream,
+            locals_stream,
             local_activity_request_sink,
         );
         let (activation_tx, activation_rx) = unbounded_channel();
@@ -152,10 +186,24 @@ impl Workflows {
             if !do_poll {
                 return;
             }
-            while let Some(act) = stream.next().await {
-                activation_tx
-                    .send(act)
-                    .expect("Activation processor channel not dropped");
+            while let Some(output) = stream.next().await {
+                match output {
+                    Ok(o) => {
+                        for fetchreq in o.fetch_histories {
+                            fetch_tx
+                                .send(fetchreq)
+                                .expect("Fetch channel must not be dropped");
+                        }
+                        for act in o.activations {
+                            activation_tx
+                                .send(Ok(act))
+                                .expect("Activation processor channel not dropped");
+                        }
+                    }
+                    Err(e) => activation_tx
+                        .send(Err(e))
+                        .expect("Activation processor channel not dropped"),
+                }
             }
         });
         Self {
@@ -169,12 +217,13 @@ impl Workflows {
             client,
             sticky_attrs,
             activity_tasks_handle,
+            wft_semaphore,
         }
     }
     pub async fn next_workflow_activation(&self) -> Result<WorkflowActivation, PollWfError> {
         loop {
-            let r = {
+            let al = {
                 let mut lock = self.activation_stream.lock().await;
                 let (ref mut stream, ref mut beginner) = lock.deref_mut();
                 if let Some(beginner) = beginner.take() {
@@ -182,8 +231,8 @@ impl Workflows {
                 }
                 stream.next().await.unwrap_or(Err(PollWfError::ShutDown))?
             };
-            Span::current().record("run_id", r.run_id());
-            match r {
+            Span::current().record("run_id", al.run_id());
+            match al {
                 ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
                     debug!(activation=%act, "Sending activation to lang");
                     break Ok(act);
@@ -202,7 +251,7 @@ impl Workflows {
     /// Queue an activation completion for processing, returning a future that will resolve with
     /// the outcome of that completion. See [ActivationCompletedOutcome].
     ///
-    /// Returns the most-recently-processed event number for the run
+    /// Returns the most-recently-processed event number for the run.
     pub async fn activation_completed(
         &self,
         completion: WorkflowActivationCompletion,
@@ -213,7 +262,7 @@ impl Workflows {
         let (tx, rx) = oneshot::channel();
         let was_sent = self.send_local(WFActCompleteMsg {
             completion,
-            response_tx: tx,
+            response_tx: Some(tx),
         });
         if !was_sent {
             if is_empty_completion {
@@ -230,7 +279,7 @@ impl Workflows {
             .await
             .expect("Send half of activation complete response not dropped");
         let mut wft_from_complete = None;
-        let reported_wft_to_server = match completion_outcome.outcome {
+        let wft_report_status = match completion_outcome.outcome {
             ActivationCompleteOutcome::ReportWFTSuccess(report) => match report {
                 ServerCommandsWithWorkflowInfo {
                     task_token,
@@ -273,14 +322,14 @@ impl Workflows {
                         Ok(())
                     })
                     .await;
-                    true
+                    WFTReportStatus::Reported
                 }
                 ServerCommandsWithWorkflowInfo {
                     task_token,
                     action: ActivationAction::RespondLegacyQuery { result },
                 } => {
                     self.respond_legacy_query(task_token, *result).await;
-                    true
+                    WFTReportStatus::Reported
                 }
             },
             ActivationCompleteOutcome::ReportWFTFail(outcome) => match outcome {
@@ -292,22 +341,39 @@ impl Workflows {
                             .await
                     })
                     .await;
-                    true
+                    WFTReportStatus::Reported
                 }
                 FailedActivationWFTReport::ReportLegacyQueryFailure(task_token, failure) => {
                     warn!(run_id=%run_id, failure=?failure, "Failing legacy query request");
                     self.respond_legacy_query(task_token, legacy_query_failure(failure))
                         .await;
-                    true
+                    WFTReportStatus::Reported
                 }
             },
-            ActivationCompleteOutcome::DoNothing => false,
+            ActivationCompleteOutcome::WFTFailedDontReport => WFTReportStatus::DropWft,
+            ActivationCompleteOutcome::DoNothing => WFTReportStatus::NotReported,
+        };
+        let maybe_pwft = if let Some(wft) = wft_from_complete {
+            match HistoryPaginator::from_poll(wft, self.client.clone()).await {
+                Ok((paginator, pwft)) => Some((pwft, paginator)),
+                Err(e) => {
+                    self.request_eviction(
+                        &run_id,
+                        format!("Failed to paginate workflow task from completion: {e:?}"),
+                        EvictionReason::Fatal,
+                    );
+                    None
+                }
+            }
+        } else {
+            None
         };
         self.post_activation(PostActivationMsg {
             run_id,
-            reported_wft_to_server,
-            wft_from_complete,
+            wft_report_status,
+            wft_from_complete: maybe_pwft,
         });
         Ok(completion_outcome.most_recently_processed_event)
@@ -342,12 +408,16 @@ impl Workflows {
         async move { rx.await.ok() }
     }
+    pub fn available_wft_permits(&self) -> usize {
+        self.wft_semaphore.available_permits()
+    }
     pub async fn shutdown(&self) -> Result<(), JoinError> {
         let maybe_jh = self.processing_task.lock().await.take();
         if let Some(jh) = maybe_jh {
             // This acts as a final wake up in case the stream is still alive and wouldn't otherwise
             // receive another message. It allows it to shut itself down.
-            let _ = self.get_state_info();
+            let _ = self.get_state_info().await;
             jh.await
         } else {
             Ok(())
@@ -393,7 +463,11 @@ impl Workflows {
     /// successfully.
     fn send_local(&self, msg: impl Into<LocalInputs>) -> bool {
         let msg = msg.into();
-        let print_err = !matches!(msg, LocalInputs::GetStateInfo(_));
+        let print_err = match &msg {
+            LocalInputs::GetStateInfo(_) => false,
+            LocalInputs::LocalResolution(lr) if lr.res.is_la_cancel_confirmation() => false,
+            _ => true,
+        };
         if let Err(e) = self.local_tx.send(LocalInput {
             input: msg,
             span: Span::current(),
@@ -509,186 +583,30 @@ impl Workflows {
     }
 }
-/// Manages access to a specific workflow run, and contains various bookkeeping information that the
-/// [WFStream] may need to access quickly.
-#[derive(derive_more::DebugCustom)]
-#[debug(
-    fmt = "ManagedRunHandle {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
-           have_seen_terminal_event: {}, most_recently_processed_event: {}, more_pending_work: {}, \
-           trying_to_evict: {}, last_action_acked: {} }}",
-    wft,
-    activation,
-    buffered_resp,
-    have_seen_terminal_event,
-    most_recently_processed_event_number,
-    more_pending_work,
-    "trying_to_evict.is_some()",
-    last_action_acked
+/// Returned when a cache miss happens and we need to fetch history from the beginning to
+/// replay a run
+#[derive(Debug, derive_more::Display)]
+#[display(
+    fmt = "CacheMissFetchReq(run_id: {})",
+    "original_wft.work.execution.run_id"
 )]
-struct ManagedRunHandle {
-    /// If set, the WFT this run is currently/will be processing.
-    wft: Option<OutstandingTask>,
-    /// An outstanding activation to lang
-    activation: Option<OutstandingActivation>,
-    /// If set, it indicates there is a buffered poll response from the server that applies to this
-    /// run. This can happen when lang takes too long to complete a task and the task times out, for
-    /// example. Upon next completion, the buffered response will be removed and can be made ready
-    /// to be returned from polling
-    buffered_resp: Option<PermittedWFT>,
-    /// True if this machine has seen an event which ends the execution
-    have_seen_terminal_event: bool,
-    /// The most recently processed event id this machine has seen. 0 means it has seen nothing.
-    most_recently_processed_event_number: usize,
-    /// Is set true when the machines indicate that there is additional known work to be processed
-    more_pending_work: bool,
-    /// Is set if an eviction has been requested for this run
-    trying_to_evict: Option<RequestEvictMsg>,
-    /// Set to true if the last action we tried to take to this run has been processed (ie: the
-    /// [RunUpdateResponse] for it has been seen.
-    last_action_acked: bool,
-    /// For sending work to the machines
-    run_actions_tx: UnboundedSender<RunAction>,
-    /// Handle to the task where the actual machines live
-    handle: JoinHandle<()>,
-    /// We track if we have recorded useful debugging values onto a certain span yet, to overcome
-    /// duplicating field values. Remove this once https://github.com/tokio-rs/tracing/issues/2334
-    /// is fixed.
-    recorded_span_ids: HashSet<tracing::Id>,
-    metrics: MetricsContext,
+#[must_use]
+struct CacheMissFetchReq {
+    original_wft: PermittedWFT,
+}
+/// Bubbled up from inside workflow state if we're trying to apply the next workflow task but it
+/// isn't in memory
+#[derive(Debug)]
+#[must_use]
+struct NextPageReq {
+    paginator: HistoryPaginator,
+    span: Span,
 }
-impl ManagedRunHandle {
-    fn new(
-        wfm: WorkflowManager,
-        activations_tx: UnboundedSender<RunUpdateResponse>,
-        local_activity_request_sink: LocalActivityRequestSink,
-        metrics: MetricsContext,
-    ) -> Self {
-        let (run_actions_tx, run_actions_rx) = unbounded_channel();
-        let managed = ManagedRun::new(wfm, activations_tx, local_activity_request_sink);
-        let handle = tokio::task::spawn(managed.run(run_actions_rx));
-        Self {
-            wft: None,
-            activation: None,
-            buffered_resp: None,
-            have_seen_terminal_event: false,
-            most_recently_processed_event_number: 0,
-            more_pending_work: false,
-            trying_to_evict: None,
-            last_action_acked: true,
-            run_actions_tx,
-            handle,
-            recorded_span_ids: Default::default(),
-            metrics,
-        }
-    }
-    fn incoming_wft(&mut self, wft: NewIncomingWFT) {
-        if self.wft.is_some() {
-            error!("Trying to send a new WFT for a run which already has one!");
-        }
-        self.send_run_action(RunActions::NewIncomingWFT(wft));
-    }
-    fn check_more_activations(&mut self) {
-        // No point in checking for more activations if we have not acked the last update, or
-        // if there's already an outstanding activation.
-        if self.last_action_acked && self.activation.is_none() {
-            self.send_run_action(RunActions::CheckMoreWork {
-                want_to_evict: self.trying_to_evict.clone(),
-                has_pending_queries: self
-                    .wft
-                    .as_ref()
-                    .map(|wft| !wft.pending_queries.is_empty())
-                    .unwrap_or_default(),
-                has_wft: self.wft.is_some(),
-            });
-        }
-    }
-    fn send_completion(&mut self, c: RunActivationCompletion) {
-        self.send_run_action(RunActions::ActivationCompletion(c));
-    }
-    fn send_local_resolution(&mut self, r: LocalResolution) {
-        self.send_run_action(RunActions::LocalResolution(r));
-    }
-    fn insert_outstanding_activation(&mut self, act: &ActivationOrAuto) {
-        let act_type = match &act {
-            ActivationOrAuto::LangActivation(act) | ActivationOrAuto::ReadyForQueries(act) => {
-                if act.is_legacy_query() {
-                    OutstandingActivation::LegacyQuery
-                } else {
-                    OutstandingActivation::Normal {
-                        contains_eviction: act.eviction_index().is_some(),
-                        num_jobs: act.jobs.len(),
-                    }
-                }
-            }
-            ActivationOrAuto::Autocomplete { .. } => OutstandingActivation::Autocomplete,
-        };
-        if let Some(old_act) = self.activation {
-            // This is a panic because we have screwed up core logic if this is violated. It must be
-            // upheld.
-            panic!(
-                "Attempted to insert a new outstanding activation {:?}, but there already was \
-                 one outstanding: {:?}",
-                act, old_act
-            );
-        }
-        self.activation = Some(act_type);
-    }
-    fn send_run_action(&mut self, action: RunActions) {
-        self.last_action_acked = false;
-        self.run_actions_tx
-            .send(RunAction {
-                action,
-                trace_span: Span::current(),
-            })
-            .expect("Receive half of run actions not dropped");
-    }
-    /// Returns true if the managed run has any form of pending work
-    /// If `ignore_evicts` is true, pending evictions do not count as pending work.
-    /// If `ignore_buffered` is true, buffered workflow tasks do not count as pending work.
-    fn has_any_pending_work(&self, ignore_evicts: bool, ignore_buffered: bool) -> bool {
-        let evict_work = if ignore_evicts {
-            false
-        } else {
-            self.trying_to_evict.is_some()
-        };
-        let act_work = if ignore_evicts {
-            if let Some(ref act) = self.activation {
-                !act.has_only_eviction()
-            } else {
-                false
-            }
-        } else {
-            self.activation.is_some()
-        };
-        let buffered = if ignore_buffered {
-            false
-        } else {
-            self.buffered_resp.is_some()
-        };
-        self.wft.is_some()
-            || buffered
-            || !self.last_action_acked
-            || self.more_pending_work
-            || act_work
-            || evict_work
-    }
-    /// Returns true if the handle is currently processing a WFT which contains a legacy query.
-    fn pending_work_is_legacy_query(&self) -> bool {
-        // Either we know because there is a pending legacy query, or it's already been drained and
-        // sent as an activation.
-        matches!(self.activation, Some(OutstandingActivation::LegacyQuery))
-            || self
-                .wft
-                .as_ref()
-                .map(|t| t.has_pending_legacy_query())
-                .unwrap_or_default()
-    }
+#[derive(Debug)]
+struct WFStreamOutput {
+    activations: VecDeque<ActivationOrAuto>,
+    fetch_histories: VecDeque<HistoryFetchReq>,
 }
 #[derive(Debug, derive_more::Display)]
@@ -697,6 +615,7 @@ enum ActivationOrAuto {
     /// This type should only be filled with an empty activation which is ready to have queries
     /// inserted into the joblist
     ReadyForQueries(WorkflowActivation),
+    #[display(fmt = "Autocomplete(run_id={run_id})")]
     Autocomplete {
         run_id: String,
     },
@@ -711,11 +630,48 @@ impl ActivationOrAuto {
     }
 }
+/// A processed WFT which has been validated and had a history update extracted from it
 #[derive(derive_more::DebugCustom)]
-#[debug(fmt = "PermittedWft {{ {:?} }}", wft)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+#[debug(fmt = "PermittedWft({work:?})")]
 pub(crate) struct PermittedWFT {
-    wft: ValidPollWFTQResponse,
+    work: PreparedWFT,
+    #[cfg_attr(
+        feature = "save_wf_inputs",
+        serde(skip, default = "OwnedMeteredSemPermit::fake_deserialized")
+    )]
     permit: OwnedMeteredSemPermit,
+    #[cfg_attr(
+        feature = "save_wf_inputs",
+        serde(skip, default = "HistoryPaginator::fake_deserialized")
+    )]
+    paginator: HistoryPaginator,
+}
+#[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+struct PreparedWFT {
+    task_token: TaskToken,
+    attempt: u32,
+    execution: WorkflowExecution,
+    workflow_type: String,
+    legacy_query: Option<WorkflowQuery>,
+    query_requests: Vec<QueryWorkflow>,
+    update: HistoryUpdate,
+}
+impl PreparedWFT {
+    /// Returns true if the contained history update is incremental (IE: expects to hit a cached
+    /// workflow)
+    pub fn is_incremental(&self) -> bool {
+        let start_event_id = self.update.first_event_id();
+        let poll_resp_is_incremental = start_event_id.map(|eid| eid > 1).unwrap_or_default();
+        poll_resp_is_incremental || start_event_id.is_none()
+    }
 }
 #[derive(Debug)]
@@ -811,44 +767,74 @@ pub(crate) enum ActivationAction {
     RespondLegacyQuery { result: Box<QueryResult> },
 }
-#[derive(Debug, Eq, PartialEq, Hash)]
-pub(crate) enum EvictionRequestResult {
-    EvictionRequested(Option<u32>),
+#[derive(Debug)]
+enum EvictionRequestResult {
+    EvictionRequested(Option<u32>, RunUpdateAct),
     NotFound,
     EvictionAlreadyRequested(Option<u32>),
 }
+impl EvictionRequestResult {
+    fn into_run_update_resp(self) -> RunUpdateAct {
+        match self {
+            EvictionRequestResult::EvictionRequested(_, resp) => resp,
+            EvictionRequestResult::NotFound
+            | EvictionRequestResult::EvictionAlreadyRequested(_) => None,
+        }
+    }
+}
 #[derive(Debug)]
 #[allow(dead_code)] // Not always used in non-test
 pub(crate) struct WorkflowStateInfo {
     pub cached_workflows: usize,
     pub outstanding_wft: usize,
-    pub available_wft_permits: usize,
 }
 #[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 struct WFActCompleteMsg {
     completion: ValidatedCompletion,
-    response_tx: oneshot::Sender<ActivationCompleteResult>,
+    #[cfg_attr(feature = "save_wf_inputs", serde(skip))]
+    response_tx: Option<oneshot::Sender<ActivationCompleteResult>>,
 }
 #[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 struct LocalResolutionMsg {
     run_id: String,
     res: LocalResolution,
 }
 #[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 struct PostActivationMsg {
     run_id: String,
-    reported_wft_to_server: bool,
-    wft_from_complete: Option<ValidPollWFTQResponse>,
+    wft_report_status: WFTReportStatus,
+    wft_from_complete: Option<(PreparedWFT, HistoryPaginator)>,
 }
 #[derive(Debug, Clone)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 struct RequestEvictMsg {
     run_id: String,
     message: String,
     reason: EvictionReason,
 }
 #[derive(Debug)]
+pub(crate) struct HeartbeatTimeoutMsg {
+    pub(crate) run_id: String,
+    pub(crate) span: Span,
+}
+#[derive(Debug)]
 struct GetStateInfoMsg {
     response_tx: oneshot::Sender<WorkflowStateInfo>,
 }
@@ -869,16 +855,24 @@ enum ActivationCompleteOutcome {
     ReportWFTFail(FailedActivationWFTReport),
     /// There's nothing to do right now. EX: The workflow needs to keep replaying.
     DoNothing,
+    /// The workflow task failed, but we shouldn't report it. EX: We have failed 2 or more attempts
+    /// in a row.
+    WFTFailedDontReport,
 }
-#[derive(Debug)]
-struct FulfillableActivationComplete {
-    result: ActivationCompleteResult,
-    resp_chan: oneshot::Sender<ActivationCompleteResult>,
-}
-impl FulfillableActivationComplete {
-    fn fulfill(self) {
-        let _ = self.resp_chan.send(self.result);
-    }
+/// Whether or not completion of a WFT was reported to the server.
+#[derive(Debug, Copy, Clone)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
+enum WFTReportStatus {
+    Reported,
+    /// The WFT completion was not reported when finishing the activation, because there's still
+    /// work to be done. EX: Running LAs.
+    NotReported,
+    /// We didn't report, but we want to clear the outstanding workflow task anyway. See
+    /// [ActivationCompleteOutcome::WFTFailedDontReport]
+    DropWft,
 }
 fn validate_completion(
@@ -908,8 +902,7 @@ fn validate_completion(
                     reason: format!(
                         "Workflow completion had a legacy query response along with other \
                          commands. This is not allowed and constitutes an error in the \
-                         lang SDK. Commands: {:?}",
-                        commands
+                         lang SDK. Commands: {commands:?}"
                     ),
                     run_id: completion.run_id,
                 });
@@ -934,6 +927,10 @@ fn validate_completion(
 }
 #[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 #[allow(clippy::large_enum_variant)]
 enum ValidatedCompletion {
     Success {
@@ -955,112 +952,6 @@ impl ValidatedCompletion {
     }
 }
-/// Input to run tasks, sent to [ManagedRun]s via [ManagedRunHandle]s
-#[derive(Debug)]
-struct RunAction {
-    action: RunActions,
-    trace_span: Span,
-}
-#[derive(Debug)]
-#[allow(clippy::large_enum_variant)]
-enum RunActions {
-    NewIncomingWFT(NewIncomingWFT),
-    ActivationCompletion(RunActivationCompletion),
-    CheckMoreWork {
-        want_to_evict: Option<RequestEvictMsg>,
-        has_pending_queries: bool,
-        has_wft: bool,
-    },
-    LocalResolution(LocalResolution),
-    HeartbeatTimeout,
-}
-#[derive(Debug)]
-struct NewIncomingWFT {
-    /// This field is only populated if the machines already exist. Otherwise the machines
-    /// are instantiated with the workflow history.
-    history_update: Option<HistoryUpdate>,
-    /// Wft start time
-    start_time: Instant,
-}
-#[derive(Debug)]
-struct RunActivationCompletion {
-    task_token: TaskToken,
-    start_time: Instant,
-    commands: Vec<WFCommand>,
-    activation_was_eviction: bool,
-    activation_was_only_eviction: bool,
-    has_pending_query: bool,
-    query_responses: Vec<QueryResult>,
-    /// Used to notify the worker when the completion is done processing and the completion can
-    /// unblock. Must always be `Some` when initialized.
-    resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
-}
-/// A response from a [ManagedRun] held by a [ManagedRunHandle]
-#[derive(Debug)]
-struct RunUpdateResponse {
-    kind: RunUpdateResponseKind,
-    span: Span,
-}
-#[derive(Debug, derive_more::Display)]
-#[allow(clippy::large_enum_variant)]
-enum RunUpdateResponseKind {
-    Good(GoodRunUpdate),
-    Fail(FailRunUpdate),
-}
-impl RunUpdateResponseKind {
-    pub(crate) fn run_id(&self) -> &str {
-        match self {
-            RunUpdateResponseKind::Good(g) => &g.run_id,
-            RunUpdateResponseKind::Fail(f) => &f.run_id,
-        }
-    }
-}
-#[derive(Debug)]
-struct GoodRunUpdate {
-    run_id: String,
-    outgoing_activation: Option<ActivationOrAuto>,
-    fulfillable_complete: Option<FulfillableActivationComplete>,
-    have_seen_terminal_event: bool,
-    /// Is true if there are more jobs that need to be sent to lang
-    more_pending_work: bool,
-    most_recently_processed_event_number: usize,
-    /// Is true if this update was in response to a new WFT
-    in_response_to_wft: bool,
-}
-impl Display for GoodRunUpdate {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "GoodRunUpdate(run_id: {}, outgoing_activation: {}, more_pending_work: {})",
-            self.run_id,
-            if let Some(og) = self.outgoing_activation.as_ref() {
-                format!("{}", og)
-            } else {
-                "None".to_string()
-            },
-            self.more_pending_work
-        )
-    }
-}
-#[derive(Debug)]
-pub(crate) struct FailRunUpdate {
-    run_id: String,
-    err: WFMachinesError,
-    /// This is populated if the run update failed while processing a completion - and thus we
-    /// must respond down it when handling the failure.
-    completion_resp: Option<oneshot::Sender<ActivationCompleteResult>>,
-}
-impl Display for FailRunUpdate {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "FailRunUpdate(run_id: {}, error: {:?})",
-            self.run_id, self.err
-        )
-    }
-}
 #[derive(Debug)]
 pub struct OutgoingServerCommands {
     pub commands: Vec<ProtoCommand>,
@@ -1068,9 +959,22 @@ pub struct OutgoingServerCommands {
 }
 #[derive(Debug)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 pub(crate) enum LocalResolution {
     LocalActivity(LocalActivityResolution),
 }
+impl LocalResolution {
+    pub fn is_la_cancel_confirmation(&self) -> bool {
+        match self {
+            LocalResolution::LocalActivity(lar) => {
+                matches!(lar.result, LocalActivityExecutionResult::Cancelled(_))
+            }
+        }
+    }
+}
 #[derive(thiserror::Error, Debug, derive_more::From)]
 #[error("Lang provided workflow command with empty variant")]
@@ -1079,6 +983,10 @@ pub struct EmptyWorkflowCommandErr;
 /// [DrivenWorkflow]s respond with these when called, to indicate what they want to do next.
 /// EX: Create a new timer, complete the workflow, etc.
 #[derive(Debug, derive_more::From, derive_more::Display)]
+#[cfg_attr(
+    feature = "save_wf_inputs",
+    derive(serde::Serialize, serde::Deserialize)
+)]
 #[allow(clippy::large_enum_variant)]
 pub enum WFCommand {
     /// Returned when we need to wait for the lang sdk to send us something
@@ -1171,12 +1079,9 @@ pub struct WorkflowStartedInfo {
     retry_policy: Option<RetryPolicy>,
 }
-type LocalActivityRequestSink =
-    Arc<dyn Fn(Vec<LocalActRequest>) -> Vec<LocalActivityResolution> + Send + Sync>;
 /// Wraps outgoing activation job protos with some internal details core might care about
 #[derive(Debug, derive_more::Display)]
-#[display(fmt = "{}", variant)]
+#[display(fmt = "{variant}")]
 struct OutgoingJob {
     variant: workflow_activation_job::Variant,
     /// Since LA resolutions are not distinguished from non-LA resolutions as far as lang is
@@ -1198,3 +1103,58 @@ impl From<OutgoingJob> for WorkflowActivationJob {
         }
     }
 }
+/// Errors thrown inside of workflow machines
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum WFMachinesError {
+    #[error("Nondeterminism error: {0}")]
+    Nondeterminism(String),
+    #[error("Fatal error in workflow machines: {0}")]
+    Fatal(String),
+}
+impl WFMachinesError {
+    pub fn evict_reason(&self) -> EvictionReason {
+        match self {
+            WFMachinesError::Nondeterminism(_) => EvictionReason::Nondeterminism,
+            WFMachinesError::Fatal(_) => EvictionReason::Fatal,
+        }
+    }
+}
+impl From<TimestampError> for WFMachinesError {
+    fn from(_: TimestampError) -> Self {
+        Self::Fatal("Could not decode timestamp".to_string())
+    }
+}
+pub(crate) trait LocalActivityRequestSink: Send + Sync + 'static {
+    fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution>;
+}
+#[derive(derive_more::Constructor)]
+pub(super) struct LAReqSink {
+    lam: Arc<LocalActivityManager>,
+    /// If we're recording WF inputs, we also need to store immediate resolutions so they're
+    /// available on replay.
+    #[allow(dead_code)] // sometimes appears unused due to feature flagging
+    recorder: Option<UnboundedSender<Vec<u8>>>,
+}
+impl LocalActivityRequestSink for LAReqSink {
+    fn sink_reqs(&self, reqs: Vec<LocalActRequest>) -> Vec<LocalActivityResolution> {
+        if reqs.is_empty() {
+            return vec![];
+        }
+        #[allow(clippy::let_and_return)] // When feature is off clippy doesn't like this
+        let res = self.lam.enqueue(reqs);
+        // We always save when there are any reqs, even if the response might be empty, so that
+        // calls/responses are 1:1
+        #[cfg(feature = "save_wf_inputs")]
+        self.write_req(&res);
+        res
+    }
+}