npm - @temporalio/core-bridge - Versions diffs - 1.8.6 → 1.9.0 - Mend

@temporalio/core-bridge 1.8.6 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213)

package/sdk-core/core/src/worker/workflow/managed_run.rs CHANGED

@@ -1,21 +1,17 @@
-#[cfg(test)]
-mod managed_wf_test;
-#[cfg(test)]
-pub(crate) use managed_wf_test::ManagedWFFunc;
 use crate::{
     abstractions::dbg_panic,
-    protosext::WorkflowActivationExt,
+    protosext::{protocol_messages::IncomingProtocolMessage, WorkflowActivationExt},
     worker::{
         workflow::{
-            history_update::HistoryPaginator, machines::WorkflowMachines, ActivationAction,
-            ActivationCompleteOutcome, ActivationCompleteResult, ActivationOrAuto,
-            EvictionRequestResult, FailedActivationWFTReport, HeartbeatTimeoutMsg, HistoryUpdate,
-            LocalActivityRequestSink, LocalResolution, NextPageReq, OutgoingServerCommands,
-            OutstandingActivation, OutstandingTask, PermittedWFT, RequestEvictMsg, RunBasics,
+            history_update::HistoryPaginator,
+            machines::{MachinesWFTResponseContent, WorkflowMachines},
+            ActivationAction, ActivationCompleteOutcome, ActivationCompleteResult,
+            ActivationOrAuto, BufferedTasks, DrivenWorkflow, EvictionRequestResult,
+            FailedActivationWFTReport, HeartbeatTimeoutMsg, HistoryUpdate,
+            LocalActivityRequestSink, LocalResolution, NextPageReq, OutstandingActivation,
+            OutstandingTask, PermittedWFT, RequestEvictMsg, RunBasics,
             ServerCommandsWithWorkflowInfo, WFCommand, WFMachinesError, WFTReportStatus,
-            WorkflowBridge, WorkflowTaskInfo, WFT_HEARTBEAT_TIMEOUT_FRACTION,
+            WorkflowTaskInfo, WFT_HEARTBEAT_TIMEOUT_FRACTION,
         },
         LocalActRequest, LEGACY_QUERY_ID,
     },
@@ -24,6 +20,7 @@ use crate::{
 use futures_util::future::AbortHandle;
 use std::{
     collections::HashSet,
+    mem,
     ops::Add,
     rc::Rc,
     sync::mpsc::Sender,
@@ -51,11 +48,11 @@ pub(super) type RunUpdateAct = Option<ActivationOrAuto>;
 /// remain that way.
 #[derive(derive_more::DebugCustom)]
 #[debug(
-    fmt = "ManagedRun {{ wft: {:?}, activation: {:?}, buffered_resp: {:?} \
+    fmt = "ManagedRun {{ wft: {:?}, activation: {:?}, task_buffer: {:?} \
            trying_to_evict: {} }}",
     wft,
     activation,
-    buffered_resp,
+    task_buffer,
     "trying_to_evict.is_some()"
 )]
 pub(super) struct ManagedRun {
@@ -76,11 +73,14 @@ pub(super) struct ManagedRun {
     wft: Option<OutstandingTask>,
     /// An outstanding activation to lang
     activation: Option<OutstandingActivation>,
-    /// If set, it indicates there is a buffered poll response from the server that applies to this
-    /// run. This can happen when lang takes too long to complete a task and the task times out, for
-    /// example. Upon next completion, the buffered response will be removed and can be made ready
-    /// to be returned from polling
-    buffered_resp: Option<PermittedWFT>,
+    /// Contains buffered poll responses from the server that apply to this run. This can happen
+    /// when:
+    ///   * Lang takes too long to complete a task and the task times out
+    ///   * Many queries are submitted concurrently and reach this worker (in this case, multiple
+    ///     tasks can be outstanding)
+    ///   * Multiple speculative tasks (ex: for updates) may also exist at once (but only the
+    ///     latest one will matter).
+    task_buffer: BufferedTasks,
     /// Is set if an eviction has been requested for this run
     trying_to_evict: Option<RequestEvictMsg>,
@@ -96,24 +96,27 @@ pub(super) struct ManagedRun {
 impl ManagedRun {
     pub(super) fn new(
         basics: RunBasics,
+        wft: PermittedWFT,
         local_activity_request_sink: Rc<dyn LocalActivityRequestSink>,
-    ) -> Self {
+    ) -> (Self, RunUpdateAct) {
         let metrics = basics.metrics.clone();
         let wfm = WorkflowManager::new(basics);
-        Self {
+        let mut me = Self {
             wfm,
             local_activity_request_sink,
             waiting_on_la: None,
             am_broken: false,
             wft: None,
             activation: None,
-            buffered_resp: None,
+            task_buffer: Default::default(),
             trying_to_evict: None,
             recorded_span_ids: Default::default(),
             metrics,
             paginator: None,
             completion_waiting_on_page_fetch: None,
-        }
+        };
+        let rua = me.incoming_wft(wft);
+        (me, rua)
     }
     /// Returns true if there are pending jobs that need to be sent to lang.
@@ -162,10 +165,10 @@ impl ManagedRun {
         let work = pwft.work;
         debug!(
-            run_id = %work.execution.run_id,
             task_token = %&work.task_token,
             update = ?work.update,
             has_legacy_query = %work.legacy_query.is_some(),
+            messages = ?work.messages,
             attempt = %work.attempt,
             "Applying new workflow task from server"
         );
@@ -192,6 +195,7 @@ impl ManagedRun {
                 complete_resp_chan: None,
             });
         }
+        let was_legacy_query = legacy_query_from_poll.is_some();
         if let Some(lq) = legacy_query_from_poll {
             pending_queries.push(lq);
         }
@@ -204,10 +208,20 @@ impl ManagedRun {
             permit: pwft.permit,
         });
+        if was_legacy_query
+            && work.update.wft_started_id == 0
+            && work.update.previous_wft_started_id < self.wfm.machines.get_last_wft_started_id()
+        {
+            return Ok(Some(ActivationOrAuto::AutoFail {
+                run_id: self.run_id().to_string(),
+                machines_err: WFMachinesError::Fatal("Query expired".to_string()),
+            }));
+        }
         // The update field is only populated in the event we hit the cache
         let activation = if work.update.is_real() {
             self.metrics.sticky_cache_hit();
-            self.wfm.feed_history_from_server(work.update)?
+            self.wfm.new_work_from_server(work.update, work.messages)?
         } else {
             let r = self.wfm.get_next_activation()?;
             if r.jobs.is_empty() {
@@ -266,10 +280,16 @@ impl ManagedRun {
         let retme = self.wft.take();
         // Only record latency metrics if we genuinely reported to server
-        if matches!(report_status, WFTReportStatus::Reported) {
+        if let WFTReportStatus::Reported {
+            reset_last_started_to,
+        } = report_status
+        {
             if let Some(ot) = &retme {
                 self.metrics.wf_task_latency(ot.start_time.elapsed());
             }
+            if let Some(id) = reset_last_started_to {
+                self.wfm.machines.reset_last_started_id(id);
+            }
             // Tell the LA manager that we're done with the WFT
             self.local_activity_request_sink.sink_reqs(vec![
                 LocalActRequest::IndicateWorkflowTaskCompleted(self.wfm.machines.run_id.clone()),
@@ -403,17 +423,18 @@ impl ManagedRun {
         } else {
             // First strip out query responses from other commands that actually affect machines
             // Would be prettier with `drain_filter`
-            let mut i = 0;
             let mut query_responses = vec![];
-            while i < commands.len() {
-                if matches!(commands[i], WFCommand::QueryResponse(_)) {
-                    if let WFCommand::QueryResponse(qr) = commands.remove(i) {
+            commands = std::mem::take(&mut commands)
+                .into_iter()
+                .filter_map(|x| {
+                    if let WFCommand::QueryResponse(qr) = x {
                         query_responses.push(qr);
+                        None
+                    } else {
+                        Some(x)
                     }
-                } else {
-                    i += 1;
-                }
-            }
+                })
+                .collect();
             if activation_was_only_eviction && !commands.is_empty() {
                 dbg_panic!("Reply to an eviction only containing an eviction included commands");
@@ -491,14 +512,15 @@ impl ManagedRun {
     }
     /// Called whenever either core or lang cannot complete a workflow activation. EX: Nondeterminism
-    /// or user code threw/panicked, respectively. The `cause` and `reason` fields are determined
-    /// inside core always. The `failure` field may come from lang. `resp_chan` will be used to
-    /// unblock the completion call when everything we need to do to fulfill it has happened.
+    /// or user code threw/panicked. The `cause` and `reason` fields are determined inside core
+    /// always. The `failure` field may come from lang. `resp_chan` will be used to unblock the
+    /// completion call when everything we need to do to fulfill it has happened.
     pub(super) fn failed_completion(
         &mut self,
         cause: WorkflowTaskFailedCause,
         reason: EvictionReason,
         failure: workflow_completion::Failure,
+        is_auto_fail: bool,
         resp_chan: Option<oneshot::Sender<ActivationCompleteResult>>,
     ) -> RunUpdateAct {
         let tt = if let Some(tt) = self.wft.as_ref().map(|t| t.info.task_token.clone()) {
@@ -514,24 +536,39 @@ impl ManagedRun {
         self.metrics.wf_task_failed();
         let message = format!("Workflow activation completion failed: {:?}", &failure);
-        // Blow up any cached data associated with the workflow
-        let evict_req_outcome = self.request_eviction(RequestEvictMsg {
-            run_id: self.run_id().to_string(),
-            message,
-            reason,
-            auto_reply_fail_tt: None,
-        });
-        let should_report = match &evict_req_outcome {
-            EvictionRequestResult::EvictionRequested(Some(attempt), _)
-            | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => *attempt <= 1,
-            _ => false,
+        // We don't want to fail queries that could otherwise be retried
+        let is_no_report_query_fail = self.pending_work_is_legacy_query()
+            && is_auto_fail
+            && matches!(
+                reason,
+                EvictionReason::Unspecified | EvictionReason::PaginationOrHistoryFetch
+            );
+        let (should_report, rur) = if is_no_report_query_fail {
+            (false, None)
+        } else {
+            // Blow up any cached data associated with the workflow
+            let evict_req_outcome = self.request_eviction(RequestEvictMsg {
+                run_id: self.run_id().to_string(),
+                message,
+                reason,
+                auto_reply_fail_tt: None,
+            });
+            let should_report = match &evict_req_outcome {
+                EvictionRequestResult::EvictionRequested(Some(attempt), _)
+                | EvictionRequestResult::EvictionAlreadyRequested(Some(attempt)) => *attempt <= 1,
+                _ => false,
+            };
+            let rur = evict_req_outcome.into_run_update_resp();
+            (should_report, rur)
         };
-        let rur = evict_req_outcome.into_run_update_resp();
-        // If the outstanding WFT is a legacy query task, report that we need to fail it
         let outcome = if self.pending_work_is_legacy_query() {
-            ActivationCompleteOutcome::ReportWFTFail(
-                FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
-            )
+            if is_no_report_query_fail {
+                ActivationCompleteOutcome::WFTFailedDontReport
+            } else {
+                ActivationCompleteOutcome::ReportWFTFail(
+                    FailedActivationWFTReport::ReportLegacyQueryFailure(tt, failure),
+                )
+            }
         } else if should_report {
             ActivationCompleteOutcome::ReportWFTFail(FailedActivationWFTReport::Report(
                 tt, cause, failure,
@@ -543,17 +580,30 @@ impl ManagedRun {
         rur
     }
-    /// Delete the currently tracked workflow activation and return it, if any. Should be called
-    /// after the processing of the activation completion, and WFT reporting.
-    pub(super) fn delete_activation(
+    /// Must be called after the processing of the activation completion and WFT reporting.
+    ///
+    /// It will delete the currently tracked workflow activation if there is one and `pred`
+    /// evaluates to true. In the event the activation was an eviction, the bool part of the return
+    /// tuple is true. The [BufferedTasks] part will contain any buffered tasks that may still exist
+    /// and need to be instantiated into a new instance of the run, if a `wft_from_complete` was
+    /// provided, it will supersede any real WFTs in the buffer as by definition those are now
+    /// out-of-date.
+    pub(super) fn finish_activation(
         &mut self,
         pred: impl FnOnce(&OutstandingActivation) -> bool,
-    ) -> Option<OutstandingActivation> {
-        if self.activation().map(pred).unwrap_or_default() {
-            self.activation.take()
+    ) -> (bool, BufferedTasks) {
+        let evict = if self.activation().map(pred).unwrap_or_default() {
+            let act = self.activation.take();
+            act.map(|a| a.has_eviction()).unwrap_or_default()
         } else {
-            None
-        }
+            false
+        };
+        let buffered = if evict {
+            mem::take(&mut self.task_buffer)
+        } else {
+            Default::default()
+        };
+        (evict, buffered)
     }
     /// Called when local activities resolve
@@ -570,7 +620,7 @@ impl ManagedRun {
     fn _process_completion(
         &mut self,
         completion: RunActivationCompletion,
-        new_update: Option<HistoryUpdate>,
+        update_from_new_page: Option<HistoryUpdate>,
     ) -> Result<Option<FulfillableActivationComplete>, RunUpdateErr> {
         let data = CompletionDataForWFT {
             task_token: completion.task_token,
@@ -595,8 +645,7 @@ impl ManagedRun {
             // Send commands from lang into the machines then check if the workflow run needs
             // another activation and mark it if so
             self.wfm.push_commands_and_iterate(completion.commands)?;
-            // If there was a new update included as part of the completion, apply it.
-            if let Some(update) = new_update {
+            if let Some(update) = update_from_new_page {
                 self.wfm.feed_history_from_new_page(update)?;
             }
             // Don't bother applying the next task if we're evicting at the end of this activation
@@ -683,7 +732,7 @@ impl ManagedRun {
         } else {
             None
         };
-        self.update_to_acts(Ok(maybe_act).map(Into::into))
+        self.update_to_acts(Ok(maybe_act.into()))
     }
     /// Returns `true` if autocompletion should be issued, which will actually cause us to end up
     /// in [completion] again, at which point we'll start a new heartbeat timeout, which will
@@ -726,7 +775,7 @@ impl ManagedRun {
         let buffered = if ignore_buffered {
             false
         } else {
-            self.buffered_resp.is_some()
+            self.task_buffer.has_tasks()
         };
         trace!(wft=self.wft.is_some(), buffered=?buffered, more_work=?self.more_pending_work(),
                act_work, evict_work, "Does run have pending work?");
@@ -740,12 +789,11 @@ impl ManagedRun {
         work: PermittedWFT,
     ) -> Option<PermittedWFT> {
         let about_to_issue_evict = self.trying_to_evict.is_some();
-        let has_wft = self.wft().is_some();
         let has_activation = self.activation().is_some();
-        if has_wft || has_activation || about_to_issue_evict || self.more_pending_work() {
+        if has_activation || about_to_issue_evict || self.more_pending_work() {
             debug!(run_id = %self.run_id(),
-                   "Got new WFT for a run with outstanding work, buffering it");
-            self.buffered_resp = Some(work);
+                   "Got new WFT for a run with outstanding work, buffering it act: {:?} wft: {:?} about to evict: {:?}", &self.activation(), &self.wft, about_to_issue_evict);
+            self.task_buffer.buffer(work);
             None
         } else {
             Some(work)
@@ -754,12 +802,7 @@ impl ManagedRun {
     /// Returns true if there is a buffered workflow task for this run.
     pub(super) fn has_buffered_wft(&self) -> bool {
-        self.buffered_resp.is_some()
-    }
-    /// Removes and returns the buffered workflow task, if any.
-    pub(super) fn take_buffered_wft(&mut self) -> Option<PermittedWFT> {
-        self.buffered_resp.take()
+        self.task_buffer.has_tasks()
     }
     pub(super) fn request_eviction(&mut self, info: RequestEvictMsg) -> EvictionRequestResult {
@@ -776,6 +819,7 @@ impl ManagedRun {
                 WorkflowTaskFailedCause::Unspecified,
                 info.reason,
                 Failure::application_failure(info.message, false).into(),
+                true,
                 c.resp_chan,
             );
             return EvictionRequestResult::EvictionRequested(attempts, run_upd);
@@ -799,6 +843,7 @@ impl ManagedRun {
             }
             self.recorded_span_ids.insert(spid);
+            span.record("run_id", self.run_id());
             if let Some(wid) = self.wft().map(|wft| &wft.info.wf_id) {
                 span.record("workflow_id", wid.as_str());
             }
@@ -868,11 +913,11 @@ impl ManagedRun {
                 }
                 match r {
-                    // After each run update, check if it's ready to handle any buffered poll
+                    // After each run update, check if it's ready to handle any buffered task
                     None | Some(ActivationOrAuto::Autocomplete { .. })
                         if !self.has_any_pending_work(false, true) =>
                     {
-                        if let Some(bufft) = self.buffered_resp.take() {
+                        if let Some(bufft) = self.task_buffer.get_next_wft() {
                             self.incoming_wft(bufft)
                         } else {
                             None
@@ -899,6 +944,7 @@ impl ManagedRun {
                         fail_cause,
                         fail.source.evict_reason(),
                         Failure::application_failure(wft_fail_str, false).into(),
+                        true,
                         Some(resp_chan),
                     )
                 } else {
@@ -946,18 +992,16 @@ impl ManagedRun {
         data: CompletionDataForWFT,
         due_to_heartbeat_timeout: bool,
     ) -> FulfillableActivationComplete {
-        let mut outgoing_cmds = self.wfm.get_server_commands();
-        if data.activation_was_only_eviction && !outgoing_cmds.commands.is_empty() {
-            if self.am_broken {
-                // If we broke there could be commands in the pipe that we didn't get a chance to
-                // handle properly during replay, just wipe them all out.
-                outgoing_cmds.commands = vec![];
-            } else {
-                dbg_panic!(
-                "There should not be any outgoing commands when preparing a completion response \
-                 if the activation was only an eviction. This is an SDK bug."
-                );
-            }
+        let mut machines_wft_response = self.wfm.prepare_for_wft_response();
+        if data.activation_was_only_eviction
+            && (machines_wft_response.commands().peek().is_some()
+                || machines_wft_response.has_messages())
+            && !self.am_broken
+        {
+            dbg_panic!(
+                "There should not be any outgoing commands or messages when preparing a completion \
+                 response if the activation was only an eviction. This is an SDK bug."
+            );
         }
         let query_responses = data.query_responses;
@@ -970,34 +1014,50 @@ impl ManagedRun {
         // saw the final event in the workflow, or if we are playing back for the express purpose of
         // fulfilling a query. If the activation we sent was *only* an eviction, don't send that
         // either.
-        let should_respond = !(self.wfm.machines.has_pending_jobs()
-            || outgoing_cmds.replaying
+        let should_respond = !(machines_wft_response.has_pending_jobs
+            || machines_wft_response.replaying
             || is_query_playback
             || data.activation_was_only_eviction
-            || self.wfm.machines.have_seen_terminal_event);
+            || machines_wft_response.have_seen_terminal_event);
         // If there are pending LA resolutions, and we're responding to a query here,
         // we want to make sure to force a new task, as otherwise once we tell lang about
         // the LA resolution there wouldn't be any task to reply to with the result of iterating
         // the workflow.
-        if has_query_responses && self.wfm.machines.has_pending_la_resolutions() {
+        if has_query_responses && machines_wft_response.have_pending_la_resolutions {
             force_new_wft = true;
         }
         let outcome = if should_respond || has_query_responses {
+            // If we broke there could be commands or messages in the pipe that we didn't
+            // get a chance to handle properly during replay. Don't send them.
+            let (commands, messages) = if self.am_broken && data.activation_was_only_eviction {
+                (vec![], vec![])
+            } else {
+                (
+                    machines_wft_response.commands().collect(),
+                    machines_wft_response.messages(),
+                )
+            };
             ActivationCompleteOutcome::ReportWFTSuccess(ServerCommandsWithWorkflowInfo {
                 task_token: data.task_token,
                 action: ActivationAction::WftComplete {
                     force_new_wft,
-                    commands: outgoing_cmds.commands,
+                    commands,
+                    messages,
                     query_responses,
-                    sdk_metadata: self.wfm.machines.get_metadata_for_wft_complete(),
+                    sdk_metadata: machines_wft_response.metadata_for_complete(),
                 },
             })
         } else {
             ActivationCompleteOutcome::DoNothing
         };
         FulfillableActivationComplete {
-            result: self.build_activation_complete_result(outcome),
+            result: ActivationCompleteResult {
+                outcome,
+                most_recently_processed_event: machines_wft_response.last_processed_event as usize,
+                replaying: machines_wft_response.replaying,
+            },
             resp_chan,
         }
     }
@@ -1023,7 +1083,12 @@ impl ManagedRun {
     ) {
         if let Some(chan) = chan {
             if chan
-                .send(self.build_activation_complete_result(outcome))
+                .send(ActivationCompleteResult {
+                    outcome,
+                    most_recently_processed_event: self.most_recently_processed_event_number()
+                        as usize,
+                    replaying: self.wfm.machines.replaying,
+                })
                 .is_err()
             {
                 let warnstr = "The workflow task completer went missing! This likely indicates an \
@@ -1040,17 +1105,6 @@ impl ManagedRun {
         }
     }
-    fn build_activation_complete_result(
-        &self,
-        outcome: ActivationCompleteOutcome,
-    ) -> ActivationCompleteResult {
-        ActivationCompleteResult {
-            outcome,
-            most_recently_processed_event: self.most_recently_processed_event_number() as usize,
-            replaying: self.wfm.machines.replaying,
-        }
-    }
     /// Returns true if the handle is currently processing a WFT which contains a legacy query.
     fn pending_work_is_legacy_query(&self) -> bool {
         // Either we know because there is a pending legacy query, or it's already been drained and
@@ -1160,33 +1214,30 @@ impl WorkflowManager {
     /// Create a new workflow manager given workflow history and execution info as would be found
     /// in [PollWorkflowTaskQueueResponse]
     fn new(basics: RunBasics) -> Self {
-        let (wfb, cmd_sink) = WorkflowBridge::new();
-        let state_machines = WorkflowMachines::new(basics, Box::new(wfb).into());
+        let (wfb, cmd_sink) = DrivenWorkflow::new();
+        let state_machines = WorkflowMachines::new(basics, wfb);
         Self {
             machines: state_machines,
             command_sink: Some(cmd_sink),
         }
     }
-    #[cfg(test)]
-    const fn new_from_machines(workflow_machines: WorkflowMachines) -> Self {
-        Self {
-            machines: workflow_machines,
-            command_sink: None,
-        }
-    }
-    /// Given history that was just obtained from the server, pipe it into this workflow's machines.
+    /// Given info that was just obtained from a new WFT from server, pipe it into this workflow's
+    /// machines.
     ///
     /// Should only be called when a workflow has caught up on replay (or is just beginning). It
     /// will return a workflow activation if one is needed.
-    fn feed_history_from_server(&mut self, update: HistoryUpdate) -> Result<WorkflowActivation> {
-        self.machines.new_history_from_server(update)?;
+    fn new_work_from_server(
+        &mut self,
+        update: HistoryUpdate,
+        messages: Vec<IncomingProtocolMessage>,
+    ) -> Result<WorkflowActivation> {
+        self.machines.new_work_from_server(update, messages)?;
         self.get_next_activation()
     }
     /// Update the machines with some events from fetching another page of history. Does *not*
-    /// attempt to pull the next activation, unlike [Self::feed_history_from_server].
+    /// attempt to pull the next activation, unlike [Self::new_work_from_server].
     fn feed_history_from_new_page(&mut self, update: HistoryUpdate) -> Result<()> {
         self.machines.new_history_from_server(update)
     }
@@ -1243,14 +1294,10 @@ impl WorkflowManager {
         Ok(self.machines.has_pending_jobs())
     }
-    /// Typically called after [get_next_activation], use this to retrieve commands to be sent to
-    /// the server which have been generated by the machines. Does *not* drain those commands.
-    /// See [WorkflowMachines::get_commands].
-    fn get_server_commands(&self) -> OutgoingServerCommands {
-        OutgoingServerCommands {
-            commands: self.machines.get_commands(),
-            replaying: self.machines.replaying,
-        }
+    /// Must be called when we're ready to respond to a WFT, after catching up on replay and
+    /// handling all activation completions from lang.
+    fn prepare_for_wft_response(&mut self) -> MachinesWFTResponseContent {
+        self.machines.prepare_for_wft_response()
     }
     /// Remove and return all queued local activities. Once this is called, they need to be