npm - @vellumai/assistant - Versions diffs - 0.8.7 → 0.8.8-dev.202606052332.17fc8ea - Mend

@vellumai/assistant 0.8.7 → 0.8.8-dev.202606052332.17fc8ea

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (570) hide show

package/src/agent/loop.ts CHANGED Viewed

@@ -1,45 +1,32 @@
 import * as Sentry from "@sentry/node";
 import type { LLMCallSite } from "../config/schemas/llm.js";
+import { stripInjectionsForCompaction } from "../context/strip-injections.js";
 import {
   estimatePromptTokensRaw,
+  estimatePromptTokensWithTools,
   estimateToolsTokens,
   getCalibrationProviderKey,
 } from "../context/token-estimator.js";
-import { calculateMaxToolResultChars } from "../context/tool-result-truncation.js";
 import type { ContextWindowResult } from "../context/window-manager.js";
 import type { ToolActivityMetadata } from "../daemon/message-types/web-activity.js";
-import { defaultCompactionTerminal } from "../plugins/defaults/compaction/terminal.js";
-import { defaultEmptyResponseTerminal } from "../plugins/defaults/empty-response/terminal.js";
-import { defaultTokenEstimateTerminal } from "../plugins/defaults/token-estimate/terminal.js";
-import { defaultToolErrorTerminal } from "../plugins/defaults/tool-error/terminal.js";
-import { defaultToolResultTruncateTerminal } from "../plugins/defaults/tool-result-truncate/terminal.js";
-import type {
-  ToolResultTruncateArgs,
-  ToolResultTruncateResult,
-} from "../plugins/defaults/tool-result-truncate/types.js";
-import { DEFAULT_TIMEOUTS, runPipeline } from "../plugins/pipeline.js";
-import { getMiddlewaresFor } from "../plugins/registry.js";
-import type {
-  CompactionArgs,
-  CompactionCircuitEvent,
-  CompactionResult,
-  EmptyResponseArgs,
-  EmptyResponseDecision,
-  EstimateArgs,
-  EstimateResult,
-  LLMCallArgs,
-  LLMCallResult,
-  ToolErrorArgs,
-  ToolErrorDecision,
-  TurnContext,
-} from "../plugins/types.js";
-import { PluginTimeoutError } from "../plugins/types.js";
+import { HOOKS } from "../plugin-api/constants.js";
+import type { PostToolUseContext, StopContext } from "../plugin-api/types.js";
+import {
+  DEFAULT_COMPACTION_PLUGIN_NAME,
+  defaultCompact,
+} from "../plugins/defaults/compaction/compact.js";
+import type { PostCompactionHookInput } from "../plugins/defaults/memory-retrieval/hooks/post-compact.js";
+import { runHook } from "../plugins/pipeline.js";
+import type { CompactionCircuitEvent, TurnContext } from "../plugins/types.js";
+import { PluginExecutionError } from "../plugins/types.js";
 import { normalizeThinkingConfigForWire } from "../providers/thinking-config.js";
 import type {
   ContentBlock,
   Message,
   Provider,
+  ProviderResponse,
+  SendMessageOptions,
   ToolDefinition,
   ToolResultContent,
 } from "../providers/types.js";
@@ -48,7 +35,7 @@ import {
   applyStreamingSubstitution,
   applySubstitutions,
 } from "../tools/sensitive-output-placeholders.js";
-import { AssistantError, ErrorCode, ProviderError } from "../util/errors.js";
+import { ProviderError } from "../util/errors.js";
 import { getLogger } from "../util/logger.js";
 import { isRetryableNetworkError } from "../util/retry.js";
 import { CompactionCircuit } from "./compaction-circuit.js";
@@ -95,17 +82,28 @@ export type ExitReason = "handoff" | "budget";
 export type CheckpointDecision = "continue" | ExitReason;
-/**
- * Result of {@link AgentLoop.run}.
- *
- * `exitReason` carries the reason the loop paused at a checkpoint so the
- * orchestrator reads the loop's own signal instead of inferring it from
- * callback side-effects. It is `null` whenever the loop reached a terminal
- * stop (completion, error, abort, or a tool-requested yield-to-user).
- */
+/** Result of {@link AgentLoop.run}. */
 export interface AgentLoopRunResult {
+  /** Full conversation history after the run, including everything appended this run. */
   history: Message[];
+  /**
+   * Reason the loop paused at a checkpoint, or `null` on a terminal stop
+   * (completion, error, abort, or a tool-requested yield-to-user).
+   */
   exitReason: ExitReason | null;
+  /**
+   * Whether the loop produced at least one new assistant message this run —
+   * the forward-progress signal for the ordering-error retry gate and the
+   * overflow convergence fold (immune to in-loop compaction shrinking history
+   * below a pre-run length).
+   */
+  appendedNewMessages: boolean;
+  /**
+   * Slice of `history` appended this run, measured from the loop's input or
+   * from the compacted base when it compacts in place. The loop owns this
+   * boundary, so it cannot desync the way an externally-held index can.
+   */
+  newMessages: Message[];
 }
 /**
@@ -129,8 +127,6 @@ export interface AgentLoopRunResult {
 export type AgentLoopExitReason =
   /** `if (signal?.aborted) break;` at the top of the loop. */
   | "aborted_pre_call"
-  /** Empty assistant response after the configured retry budget. */
-  | "empty_response_exhausted"
   /** Assistant message has no tool-use blocks (or no tool executor). */
   | "no_tool_calls"
   /** Signal aborted while building the user-side tool-results message. */
@@ -209,6 +205,14 @@ export type AgentEvent =
       approvalReason?: string;
       riskThreshold?: string;
       activityMetadata?: ToolActivityMetadata;
+      /**
+       * Set when the loop synthesizes this result for a tool_use that never
+       * executed (a "Cancelled by user" block on abort). The daemon still
+       * captures it into `pendingToolResults` and forwards it to the client,
+       * but skips the side effects that assume the tool ran — marking the
+       * workspace dirty and emitting a post-tool "thinking" activity state.
+       */
+      cancelled?: boolean;
     }
   | { type: "tool_use_preview_start"; toolUseId: string; toolName: string }
   | {
@@ -243,7 +247,7 @@ export type AgentEvent =
   | { type: "error"; error: Error }
   | {
       /**
-       * Emitted when the `llmCall` pipeline throws — i.e. the provider
+       * Emitted when the provider call throws — i.e. the provider
        * rejected the request before returning a usable response. Carries
        * the loop-level raw request we attempted to send (messages, tools,
        * system prompt, provider-agnostic config) plus the thrown error.
@@ -295,6 +299,42 @@ export type AgentEvent =
        */
       type: "context_compacting";
     }
+  | {
+      /**
+       * Emitted after the loop's inline mid-loop compaction pipeline runs,
+       * immediately before re-injection — whether or not the pipeline actually
+       * compacted. The daemon's event dispatcher always commits `basis` (the
+       * stripped pre-compaction history) as the conversation's durable message
+       * state, so re-injection ({@link MidLoopCompaction.postCompactionHook}) re-applies
+       * injections onto the stripped base rather than stacking on top of the
+       * still-injected messages. When `result.compacted` is set it
+       * additionally commits the durable compaction result (DB-record fields,
+       * graph-memory side effects, SSE) and flips the per-turn re-injection
+       * guards on the handler state.
+       *
+       * Treated as a critical event: a failed durable commit re-throws so the
+       * turn aborts rather than re-injecting against half-applied state.
+       *
+       * `basis` is the stripped pre-compaction history the summary was built
+       * from; the dispatcher uses it to project Slack provenance onto the
+       * compacted result.
+       */
+      type: "compaction_completed";
+      result: ContextWindowResult;
+      basis: Message[];
+    }
+  | {
+      /**
+       * Emitted right after the loop strips runtime injections from the
+       * running history, before the compaction pipeline runs. The daemon's
+       * event dispatcher records the history-stripped marker — a Conversation
+       * DB-record field read back at load time to strip embedded injection
+       * prefixes from pre-strip messages. Best-effort: a transient marker
+       * write must not abort the turn, so unlike `compaction_completed` this
+       * event is not treated as critical.
+       */
+      type: "history_stripped";
+    }
   /**
    * Circuit-breaker transitions emitted when auto-compaction is paused
    * (`compaction_circuit_open`, after three consecutive summary-LLM
@@ -324,8 +364,7 @@ const DEFAULT_CONFIG: AgentLoopConfig = {
   minTurnIntervalMs: 150,
 };
-const MAX_CONSECUTIVE_ERROR_NUDGES = 3;
-const MAX_EMPTY_RESPONSE_RETRIES = 1;
+const MAX_STOP_CONTINUE_RETRIES = 1;
 const MAX_TOKENS_STOP_REASONS = new Set([
   "length",
   "max_output_tokens",
@@ -346,12 +385,11 @@ export function isMaxTokensStopReason(
  * {@link AgentLoop.run}); this helper is the fallback used only by unit
  * tests that construct `AgentLoop` directly without an orchestrator.
  *
- * When the orchestrator-supplied context is present, {@link resolveLoopTurnContext}
- * is used instead of this helper so the pipeline sees the real
- * `conversationId`, trust, and `contextWindowManager`. In the fallback path
- * the returned context is still useful for pipeline logging: `requestId`
- * surfaces in every structured record, and `turnIndex` reflects the
- * current tool-use iteration.
+ * When the orchestrator-supplied context is present it is used directly so the
+ * pipeline sees the real `conversationId`, trust, and `contextWindowManager`.
+ * In the fallback path the returned context is still useful for pipeline
+ * logging: `requestId` surfaces in every structured record, and `turnIndex`
+ * reflects the current tool-use iteration.
  */
 function buildLoopTurnContext(
   requestId: string | undefined,
@@ -371,29 +409,6 @@ function buildLoopTurnContext(
   };
 }
-/**
- * Produce a `TurnContext` for a pipeline call inside {@link AgentLoop.run}.
- *
- * When the orchestrator supplied a `turnContext`, clone it and overwrite
- * `requestId` + `turnIndex` with the loop-scoped values so plugin log
- * records correctly attribute the call to the current tool-use iteration
- * while preserving the real `conversationId`, trust context, and
- * `contextWindowManager` the orchestrator assembled for the turn. Without
- * an orchestrator context (unit tests that instantiate `AgentLoop` with no
- * `turnContext`), fall back to {@link buildLoopTurnContext}'s synthesized
- * placeholder.
- */
-function resolveLoopTurnContext(
-  base: TurnContext | undefined,
-  requestId: string | undefined,
-  turnIndex: number,
-): TurnContext {
-  if (base) {
-    return { ...base, requestId: requestId ?? base.requestId, turnIndex };
-  }
-  return buildLoopTurnContext(requestId, turnIndex);
-}
 /**
  * User-config HTTP status codes that should never page the on-call: billing
  * exhaustion (402), invalid credentials (401), and forbidden/plan-gated (403).
@@ -437,28 +452,25 @@ export interface ResolvedSystemPrompt {
 }
 /**
- * Orchestrator-supplied hooks the loop invokes when the mid-loop budget gate
+ * Orchestrator-supplied hook the loop invokes when the mid-loop budget gate
  * trips and inline compaction runs. The loop owns the trigger, the
- * `compaction` pipeline call, the result interpretation (circuit-breaker
- * bookkeeping + the exhaustion decision), and the inline continue; these hooks
- * bridge the durable / injection state the loop is intentionally blind to.
- * Durable persistence ({@link applyResult}) and re-injection
- * ({@link reinject}) remain orchestrator-supplied for now and are expected to
- * move into the loop in a future change.
+ * compaction call, the result interpretation (circuit-breaker
+ * bookkeeping + the exhaustion decision), and the inline continue; this hook
+ * bridges the injection state the loop is intentionally blind to. Durable
+ * persistence is signalled out-of-band via the `history_stripped` (marker)
+ * and `compaction_completed` (basis commit + successful summary) {@link
+ * AgentEvent}s; the {@link MidLoopCompaction.postCompactionHook} is
+ * orchestrator-supplied, and its inputs migrate loop-ward as the loop
+ * subsumes the re-injection ceremony.
  */
 export interface MidLoopCompaction {
-  /** Strip runtime injections, commit stripped messages, and resolve pipeline options. */
-  prepare: (history: Message[]) => {
-    rawHistory: Message[];
-    options: CompactionArgs["options"];
-  };
-  /** Commit a successful compaction result to durable state. */
-  applyResult: (
-    result: ContextWindowResult,
-    rawHistory: Message[],
-  ) => Promise<void>;
-  /** Re-apply runtime injections and return the history to continue from. */
-  reinject: () => Promise<Message[]>;
+  /**
+   * Re-apply runtime injections onto the post-compaction history and return
+   * the history to continue from. The loop supplies its own working state via
+   * {@link PostCompactionHookInput} so the hook re-injects from that rather
+   * than reading it back from orchestrator state.
+   */
+  postCompactionHook: (input: PostCompactionHookInput) => Promise<Message[]>;
 }
 export interface AgentLoopRunOptions {
@@ -518,21 +530,12 @@ export interface AgentLoopRunOptions {
 /**
  * Callback shape the loop uses to execute a tool invocation.
- *
- * The trailing `turnContext` is optional so in-process tests that wire the
- * callback without an orchestrator keep working. Production sites (the
- * `Conversation`'s `createToolExecutor`) forward the supplied context into
- * `ToolExecutor.execute` so the `toolExecute` pipeline sees the orchestrator's
- * real conversation identity/trust/contextWindowManager instead of the
- * synthesized placeholder `ToolExecutor` would otherwise build from the
- * `ToolContext` alone.
  */
 export type LoopToolExecutor = (
   name: string,
   input: Record<string, unknown>,
   onOutput?: (chunk: string) => void,
   toolUseId?: string,
-  turnContext?: TurnContext,
 ) => Promise<{
   content: string;
   isError: boolean;
@@ -624,10 +627,9 @@ export class AgentLoop {
    * Resolve the tool definitions sent to the provider for the given turn.
    *
    * Mirrors the logic of {@link getToolTokenBudget} but returns the tool
-   * array itself — callers that need to thread the tool set into a plugin
-   * pipeline (e.g. `tokenEstimate`, where the pipeline's args include
-   * `tools`) use this rather than re-implementing the dynamic-vs-static
-   * resolver fork.
+   * array itself — callers that need to thread the tool set into the token
+   * estimate (`estimatePromptTokensWithTools`, whose args include `tools`)
+   * use this rather than re-implementing the dynamic-vs-static resolver fork.
    */
   getResolvedTools(history?: Message[]): ToolDefinition[] {
     return history && this.resolveTools
@@ -648,28 +650,15 @@ export class AgentLoop {
   }
   /**
-   * Estimate total prompt tokens for `history` via the `tokenEstimate`
-   * pipeline. Args are shallow-frozen so a mutating middleware cannot strip
-   * context from the loop's live `history`.
+   * Calibrated prompt-token estimate for `history`, including the
+   * resolved-tool budget for the turn.
    */
-  private estimateTokens(
-    history: Message[],
-    turnContext: TurnContext,
-  ): Promise<EstimateResult> {
-    return runPipeline<EstimateArgs, EstimateResult>(
-      "tokenEstimate",
-      getMiddlewaresFor("tokenEstimate"),
-      defaultTokenEstimateTerminal,
-      {
-        history: Object.freeze([...history]) as Message[],
-        systemPrompt: this.systemPrompt,
-        tools: Object.freeze([
-          ...this.getResolvedTools(history),
-        ]) as ToolDefinition[],
-        providerName: getCalibrationProviderKey(this.provider),
-      },
-      turnContext,
-      DEFAULT_TIMEOUTS.tokenEstimate,
+  private estimateTokens(history: Message[]): number {
+    return estimatePromptTokensWithTools(
+      history,
+      this.systemPrompt,
+      this.getResolvedTools(history),
+      getCalibrationProviderKey(this.provider),
     );
   }
@@ -688,15 +677,7 @@ export class AgentLoop {
     onEvent: (event: AgentEvent) => void | Promise<void>,
   ): Promise<void> {
     try {
-      await this.compactionCircuit.recordOutcome(
-        {
-          currentRequestId: turnContext.requestId,
-          currentTurnTrustContext: turnContext.trust,
-          turnCount: turnContext.turnIndex,
-        },
-        summaryFailed,
-        onEvent,
-      );
+      await this.compactionCircuit.recordOutcome(summaryFailed, onEvent);
     } catch (recordError) {
       log.error(
         { err: recordError, requestId: turnContext.requestId },
@@ -708,11 +689,10 @@ export class AgentLoop {
   /**
    * Compact the running history in place when the mid-loop budget gate trips.
    *
-   * Runs the `compaction` pipeline natively (like {@link estimateTokens}) on
-   * the stripped history, then re-applies injections via the supplied hooks.
-   * Returns the history to continue from, or `null` when the compactor timed
-   * out or exhausted its retry budget so the caller yields
-   * `exitReason = "budget"` and the orchestrator escalates.
+   * Calls the default compaction plugin on the stripped history, then
+   * re-applies injections via the supplied hook. Returns the history to
+   * continue from, or `null` when the compactor exhausted its retry budget so
+   * the caller yields `exitReason = "budget"` and the orchestrator escalates.
    */
   private async compact(
     history: Message[],
@@ -720,32 +700,37 @@ export class AgentLoop {
     compaction: MidLoopCompaction,
     signal: AbortSignal | undefined,
     onEvent: (event: AgentEvent) => void | Promise<void>,
+    overrideProfile: string | null,
   ): Promise<Message[] | null> {
     await onEvent({ type: "context_compacting" });
-    const { rawHistory, options } = compaction.prepare(history);
-    let result: CompactionResult;
-    try {
-      result = await runPipeline<CompactionArgs, CompactionResult>(
-        "compaction",
-        getMiddlewaresFor("compaction"),
-        (args) => defaultCompactionTerminal(args, turnContext),
-        { messages: rawHistory, signal, options },
-        turnContext,
-        DEFAULT_TIMEOUTS.compaction,
+    // Strip runtime injections so the compactor summarizes the raw persistent
+    // messages.
+    const rawHistory = stripInjectionsForCompaction(history);
+    // Record the history-stripped marker right after stripping, before the
+    // pipeline runs.
+    await onEvent({ type: "history_stripped" });
+    const manager = turnContext.contextWindowManager;
+    if (manager == null) {
+      throw new PluginExecutionError(
+        "default-compaction: turnContext.contextWindowManager is missing — orchestrator must attach it before invoking compaction",
+        DEFAULT_COMPACTION_PLUGIN_NAME,
       );
-    } catch (error) {
-      if (error instanceof PluginTimeoutError) {
-        // A timeout counts as a compaction failure against the circuit breaker.
-        await this.recordCompactionOutcome(turnContext, true, onEvent);
-        return null;
-      }
-      throw error;
     }
-    // `CompactionResult` is intentionally `unknown` at the plugin boundary so
-    // plugin consumers don't import the window manager; the loop ran the
-    // pipeline, so it interprets the concrete result here.
-    const compactResult = result as ContextWindowResult;
-    // `force: true` bypasses the cooldown/threshold gates, but early returns
+    // The mid-loop budget gate is reached only when this turn decides to
+    // compact in place, so pass `force: true` to bypass the auto-threshold check.
+    // `actorTrustClass` comes from the turn context (the actor whose turn
+    // triggered compaction) so the compactor's image manifest excludes
+    // guardian-only attachments for untrusted actors. `overrideProfile` is the
+    // turn's resolved inference-profile override for the summary call.
+    const compactResult = await defaultCompact({
+      manager,
+      messages: rawHistory,
+      signal,
+      force: true,
+      actorTrustClass: turnContext.trust.trustClass,
+      overrideProfile,
+    });
+    // `force: true` bypasses the auto-threshold gate, but early returns
     // for "no eligible messages" / "insufficient messages" still leave
     // `summaryFailed` undefined. Only record an outcome when the summary LLM
     // actually ran.
@@ -756,13 +741,25 @@ export class AgentLoop {
         onEvent,
       );
     }
-    if (compactResult.compacted) {
-      await compaction.applyResult(compactResult, rawHistory);
-    }
+    // Emit unconditionally: the dispatcher commits the stripped `basis` as the
+    // durable message base whether or not the pipeline compacted (re-injection
+    // reads it), and runs the durable compaction commit only when
+    // `result.compacted`.
+    await onEvent({
+      type: "compaction_completed",
+      result: compactResult,
+      basis: rawHistory,
+    });
     if (compactResult.exhausted ?? false) {
       return null;
     }
-    return compaction.reinject();
+    // Re-inject onto the same base the `compaction_completed` dispatch commits:
+    // the compacted messages when the pipeline compacted, the stripped
+    // pre-compaction history otherwise.
+    return compaction.postCompactionHook({
+      history: compactResult.compacted ? compactResult.messages : rawHistory,
+      turnContext,
+    });
   }
   async run(
@@ -783,26 +780,36 @@ export class AgentLoop {
       mutableLatestUserMessage,
     } = options ?? {};
     let history = [...messages];
+    // Index into `history` where this run's appended output begins. It starts
+    // after the input and resets to the compacted base whenever the loop
+    // compacts in place, so `history.slice(newMessagesStart)` is always exactly
+    // what the loop produced since the last (re-injected) base.
+    let newMessagesStart = history.length;
     let producedVisibleTextThisRun = false;
     let toolUseTurns = 0;
-    let consecutiveErrorTurns = 0;
-    let emptyResponseRetries = 0;
+    let stopContinueRetries = 0;
     let lastLlmCallTime = 0;
     let exitReason: ExitReason | null = null;
+    let appendedNewMessages = false;
     const rlog = requestId ? log.child({ requestId }) : log;
+    // Resolve the inference-profile override that applies right now. The
+    // optional resolver lets a turn observe a confirmed mid-turn profile switch
+    // before the next model call; absent a resolver the turn-start value holds.
+    const resolveEffectiveOverrideProfile = (): string | undefined =>
+      resolveOverrideProfile ? resolveOverrideProfile() : overrideProfile;
     // Per-run substitution map for sensitive output placeholders.
     // Bindings are accumulated from tool results; placeholders are
     // resolved in streamed deltas and final assistant message text.
     const substitutionMap = new Map<string, string>();
     let streamingPending = "";
-    // Idempotency guard for `emitExit`. Used so the throw path in the
-    // empty-response branch can stamp its reason ("empty_response_exhausted")
-    // before throwing — the catch handler that observes the rethrow will
-    // then attempt to stamp "error" and harmlessly no-op, preserving the
-    // more specific reason. Also defends against accidental future
-    // double-emits if a new break site is added without checking this.
+    // Idempotency guard for `emitExit`: the first reason stamped wins. A break
+    // site that stamps a specific reason before unwinding into the catch
+    // handler keeps that reason instead of the generic "error", and the guard
+    // also defends against accidental double-emits if a new break site is
+    // added without checking this.
     let exitReasonEmitted = false;
     const emitExit = async (reason: AgentLoopExitReason): Promise<void> => {
       if (exitReasonEmitted) return;
@@ -923,12 +930,8 @@ export class AgentLoop {
         // `activeProfile` and any call-site named profile. Threading it on
         // every send (rather than once at construction) keeps subagents that
         // share an `AgentLoop` instance but ought to inherit a different
-        // profile correct — and matches how `callSite` is plumbed. The
-        // optional resolver lets a turn observe an explicitly confirmed
-        // profile-session switch before the next model call.
-        const effectiveOverrideProfile = resolveOverrideProfile
-          ? resolveOverrideProfile()
-          : overrideProfile;
+        // profile correct — and matches how `callSite` is plumbed.
+        const effectiveOverrideProfile = resolveEffectiveOverrideProfile();
         if (effectiveOverrideProfile) {
           providerConfig.overrideProfile = effectiveOverrideProfile;
         }
@@ -974,95 +977,76 @@ export class AgentLoop {
           stripOldMediaBlocks(history),
         );
-        // Wrap the provider call in the `llmCall` pipeline so middleware
-        // contributed by plugins may observe, rewrite, short-circuit, or
-        // post-process every LLM request. The terminal below is the real
-        // `provider.sendMessage(...)` call; middleware reach it by calling
-        // `next(args)`. The default `defaultLlmCallPlugin` contributes a
-        // passthrough middleware that forwards to `next(args)` — it
-        // registers at module load and sits at the outermost onion layer,
-        // so it must yield to keep user-registered `llmCall` middleware
-        // reachable. Timeout is `null` (`DEFAULT_TIMEOUTS.llmCall`) — the
-        // provider layer already enforces its own HTTP-level budgets.
-        //
-        // The `onEvent` wrapping is kept inside `args.options` so substitution
-        // and streaming behavior exactly match the pre-pipeline call site.
-        const llmCallArgs: LLMCallArgs = {
-          provider: this.provider,
-          messages: providerHistory,
-          options: {
-            tools: currentTools.length > 0 ? currentTools : undefined,
-            systemPrompt: turnSystemPrompt,
-            config: providerConfig,
-            onEvent: (event) => {
-              if (event.type === "text_delta") {
-                // Apply sensitive-output placeholder substitution (chunk-safe)
-                if (substitutionMap.size > 0) {
-                  const combined = streamingPending + event.text;
-                  const { emit, pending } = applyStreamingSubstitution(
-                    combined,
-                    substitutionMap,
-                  );
-                  streamingPending = pending;
-                  if (emit.length > 0) {
-                    onEvent({ type: "text_delta", text: emit });
-                  }
-                } else {
-                  onEvent({ type: "text_delta", text: event.text });
+        // The `onEvent` wrapping below applies sensitive-output placeholder
+        // substitution to streamed text while forwarding every other event
+        // type through unchanged.
+        const providerOptions: SendMessageOptions = {
+          tools: currentTools.length > 0 ? currentTools : undefined,
+          systemPrompt: turnSystemPrompt,
+          config: providerConfig,
+          onEvent: (event) => {
+            if (event.type === "text_delta") {
+              // Apply sensitive-output placeholder substitution (chunk-safe)
+              if (substitutionMap.size > 0) {
+                const combined = streamingPending + event.text;
+                const { emit, pending } = applyStreamingSubstitution(
+                  combined,
+                  substitutionMap,
+                );
+                streamingPending = pending;
+                if (emit.length > 0) {
+                  onEvent({ type: "text_delta", text: emit });
                 }
-              } else if (event.type === "thinking_delta") {
-                onEvent({ type: "thinking_delta", thinking: event.thinking });
-              } else if (event.type === "tool_use_preview_start") {
-                onEvent({
-                  type: "tool_use_preview_start",
-                  toolUseId: event.toolUseId,
-                  toolName: event.toolName,
-                });
-              } else if (event.type === "input_json_delta") {
-                onEvent({
-                  type: "input_json_delta",
-                  toolName: event.toolName,
-                  toolUseId: event.toolUseId,
-                  accumulatedJson: event.accumulatedJson,
-                });
-              } else if (event.type === "server_tool_start") {
-                onEvent({
-                  type: "server_tool_start",
-                  name: event.name,
-                  toolUseId: event.toolUseId,
-                  input: event.input,
-                });
-              } else if (event.type === "server_tool_complete") {
-                onEvent({
-                  type: "server_tool_complete",
-                  toolUseId: event.toolUseId,
-                  isError: event.isError,
-                  ...(event.content ? { content: event.content } : {}),
-                  ...(event.resolvedInput
-                    ? { resolvedInput: event.resolvedInput }
-                    : {}),
-                  ...(event.errorCode ? { errorCode: event.errorCode } : {}),
-                  ...(event.errorMessage
-                    ? { errorMessage: event.errorMessage }
-                    : {}),
-                });
+              } else {
+                onEvent({ type: "text_delta", text: event.text });
               }
-            },
-            signal,
+            } else if (event.type === "thinking_delta") {
+              onEvent({ type: "thinking_delta", thinking: event.thinking });
+            } else if (event.type === "tool_use_preview_start") {
+              onEvent({
+                type: "tool_use_preview_start",
+                toolUseId: event.toolUseId,
+                toolName: event.toolName,
+              });
+            } else if (event.type === "input_json_delta") {
+              onEvent({
+                type: "input_json_delta",
+                toolName: event.toolName,
+                toolUseId: event.toolUseId,
+                accumulatedJson: event.accumulatedJson,
+              });
+            } else if (event.type === "server_tool_start") {
+              onEvent({
+                type: "server_tool_start",
+                name: event.name,
+                toolUseId: event.toolUseId,
+                input: event.input,
+              });
+            } else if (event.type === "server_tool_complete") {
+              onEvent({
+                type: "server_tool_complete",
+                toolUseId: event.toolUseId,
+                isError: event.isError,
+                ...(event.content ? { content: event.content } : {}),
+                ...(event.resolvedInput
+                  ? { resolvedInput: event.resolvedInput }
+                  : {}),
+                ...(event.errorCode ? { errorCode: event.errorCode } : {}),
+                ...(event.errorMessage
+                  ? { errorMessage: event.errorMessage }
+                  : {}),
+              });
+            }
           },
+          signal,
         };
-        // Per-turn pipeline context. When the orchestrator threaded a full
-        // `turnContext` into `run()`, use it (overwriting `turnIndex` with
-        // the loop-scoped tool-use iteration) so middleware sees the real
-        // conversation identity, trust, and `contextWindowManager`. The
-        // synthesized fallback is only reached by standalone unit-test
-        // instantiations that never plumb a context through.
-        const turnCtx = resolveLoopTurnContext(
-          turnContext,
-          requestId,
-          toolUseTurns,
-        );
+        // Per-turn pipeline context. Real call sites thread a full
+        // `turnContext` into `run()` and it is used directly; standalone
+        // unit-test instantiations that never plumb a context through fall
+        // back to a synthesized placeholder scoped to the tool-use iteration.
+        const turnCtx =
+          turnContext ?? buildLoopTurnContext(requestId, toolUseTurns);
         // Announce the LLM-call boundary so downstream handlers (the
         // daemon's persistence pipeline) can reserve an empty assistant row
@@ -1085,15 +1069,11 @@ export class AgentLoop {
         // `llm_request_logs` row, then re-throw so the existing outer catch
         // continues to handle abort sync, Sentry capture, the `error` event,
         // and the loop break unchanged.
-        let response: LLMCallResult;
+        let response: ProviderResponse;
         try {
-          response = await runPipeline<LLMCallArgs, LLMCallResult>(
-            "llmCall",
-            getMiddlewaresFor("llmCall"),
-            (args) => args.provider.sendMessage(args.messages, args.options),
-            llmCallArgs,
-            turnCtx,
-            DEFAULT_TIMEOUTS.llmCall,
+          response = await this.provider.sendMessage(
+            providerHistory,
+            providerOptions,
           );
         } catch (llmCallError) {
           // Skip recording on abort — the user cancelled the request and
@@ -1111,10 +1091,10 @@ export class AgentLoop {
             // misrepresent both.
             const rawRequest = {
               provider: this.provider.name,
-              messages: llmCallArgs.messages,
-              tools: llmCallArgs.options?.tools,
-              systemPrompt: llmCallArgs.options?.systemPrompt,
-              config: llmCallArgs.options?.config,
+              messages: providerHistory,
+              tools: providerOptions.tools,
+              systemPrompt: providerOptions.systemPrompt,
+              config: providerOptions.config,
             };
             onEvent({
               type: "provider_error",
@@ -1203,6 +1183,7 @@ export class AgentLoop {
             "LLM response reached output token limit",
           );
           history.push(safeAssistantMessage);
+          appendedNewMessages = true;
           await onEvent({
             type: "max_tokens_reached",
             stopReason: response.stopReason,
@@ -1215,126 +1196,65 @@ export class AgentLoop {
           break;
         }
-        // Detect empty responses: no user-visible text and no tool calls.
-        // This can happen when the model fails to produce output after
-        // receiving a large tool result. Retry once with a nudge before
-        // the message is persisted.
-        //
-        // Only nudge when the model hasn't already delivered text to the user
-        // earlier in this tool-use chain. If a prior assistant turn in history
-        // contained visible text (e.g. the model said its piece before calling
-        // a side-effect tool like `remember`), an empty follow-up is the model
-        // correctly ending its turn — nudging would mislead it into thinking
-        // its earlier text didn't land and cause a verbatim re-send.
-        //
-        // Note: we check ANY prior assistant turn from this run()
-        // invocation, not just the most recent one. In multi-step tool-use
-        // chains (say-something → call-tool → call-another-tool → end),
-        // the "say-something" text lives on an earlier assistant turn while
-        // the most recent assistant turn is a pure tool_use with no text.
-        // Restricting the check to the most recent assistant turn would
-        // falsely nudge in that case and trigger a duplicate re-send of
-        // text the user already saw.
-        //
-        // Scope the scan to messages appended during this run() call only.
-        // Assistant text from prior conversation turns (earlier run()
-        // invocations passed in via `messages`) must NOT suppress the
-        // nudge — those turns completed long ago and have no bearing on
-        // whether the current tool-use chain has delivered text yet.
-        //
-        // The actual decision (nudge vs. accept vs. error) is delegated to
-        // the `emptyResponse` plugin pipeline. The pipeline returns a
-        // decision; the loop carries out the side-effect (pushing the nudge
-        // or surfacing the error). See `plugins/defaults/empty-response/register.ts`
-        // for the default decision logic.
+        // The model's "stop" moment: a response with no tool calls is about to
+        // yield to the user. The `stop` hook (below) decides whether to accept
+        // the turn or re-query with a follow-up; `priorAssistantHadVisibleText`
+        // gates the ops log for the post-tool empty case.
         const hasVisibleText = response.content.some(
           (block) => block.type === "text" && block.text.trim().length > 0,
         );
-        // Track whether the model produced visible text earlier in this
-        // run() invocation. Run-scoped rather than derived from `history` so
-        // it survives inline compaction rewriting the message array: an empty
-        // completion after a compaction must not be nudged into re-sending
-        // text the user already saw.
         const priorAssistantHadVisibleText = producedVisibleTextThisRun;
         if (hasVisibleText) {
           producedVisibleTextThisRun = true;
         }
-        const emptyResponseArgs: EmptyResponseArgs = {
-          responseContent: response.content,
-          toolUseBlocksLength: toolUseBlocks.length,
-          toolUseTurns,
-          emptyResponseRetries,
-          maxEmptyResponseRetries: MAX_EMPTY_RESPONSE_RETRIES,
-          priorAssistantHadVisibleText,
-          stopReason: response.stopReason,
-        };
-        const emptyResponseCtx = resolveLoopTurnContext(
-          turnContext,
-          requestId,
-          toolUseTurns,
-        );
-        const emptyResponseDecision: EmptyResponseDecision = await runPipeline(
-          "emptyResponse",
-          getMiddlewaresFor("emptyResponse"),
-          async (args) => defaultEmptyResponseTerminal(args),
-          emptyResponseArgs,
-          emptyResponseCtx,
-          DEFAULT_TIMEOUTS.emptyResponse,
-        );
-        if (emptyResponseDecision.action === "nudge") {
-          // Fall back to the canonical nudge text if the plugin returned
-          // `action: "nudge"` but forgot `nudgeText`. Keeps a misbehaving
-          // plugin from silently breaking the loop invariant that the
-          // model sees a coherent prompt.
-          const nudgeText =
-            emptyResponseDecision.nudgeText ??
-            "<system_notice>Your previous response was empty. You must respond to the user with a summary of what you found or did. Do not use any tools — just respond with text.</system_notice>";
-          emptyResponseRetries++;
-          rlog.warn(
-            { turn: toolUseTurns, retry: emptyResponseRetries },
-            "Model returned empty response after tool results — retrying",
-          );
-          history.push({
-            role: "user",
-            content: [{ type: "text", text: nudgeText }],
-          });
-          continue;
-        }
-        if (emptyResponseDecision.action === "error") {
-          rlog.error(
-            { turn: toolUseTurns, retries: emptyResponseRetries },
-            "emptyResponse pipeline requested error surface",
-          );
-          // Stamp the specific exit reason *before* throwing. The catch
-          // handler below will see the rethrown error and attempt to stamp
-          // "error" — guarded by `exitReasonEmitted`, that becomes a no-op
-          // and the more specific reason wins.
-          await emitExit("empty_response_exhausted");
-          throw new AssistantError(
-            "Model returned empty response after tool results",
-            ErrorCode.INTERNAL_ERROR,
-          );
-        }
+        if (toolUseBlocks.length === 0) {
+          // The model stopped requesting tools — the run's stop boundary. The
+          // `stop` hook decides whether to let the turn end or re-query with a
+          // follow-up turn. It receives the full history and, when it asks to
+          // continue, appends the follow-up turn itself.
+          const stopCtx: StopContext = {
+            conversationId: turnCtx.conversationId,
+            messages: [...history],
+            responseContent: response.content,
+            stopReason: response.stopReason,
+            decision: "stop",
+            logger: rlog,
+          };
+          const finalStopCtx = await runHook(HOOKS.STOP, stopCtx);
+          if (finalStopCtx.decision === "continue") {
+            // The loop owns the retry budget: a hook always asks to continue
+            // when a nudge is warranted, and the loop stops anyway once the
+            // budget is spent. This bounds the hook-driven re-query loop.
+            if (stopContinueRetries < MAX_STOP_CONTINUE_RETRIES) {
+              stopContinueRetries++;
+              rlog.warn(
+                { turn: toolUseTurns, retry: stopContinueRetries },
+                "Model returned empty response after tool results — retrying",
+              );
+              history = finalStopCtx.messages;
+              continue;
+            }
-        // action === "accept" — fall through. Emit a dedicated log line for
-        // the specific "empty turn after tool results, retries exhausted"
-        // case so ops dashboards that grep on this line keep working.
-        if (
-          !hasVisibleText &&
-          toolUseBlocks.length === 0 &&
-          toolUseTurns > 0 &&
-          !priorAssistantHadVisibleText
-        ) {
-          rlog.error(
-            { turn: toolUseTurns, retries: emptyResponseRetries },
-            "Model returned empty response after tool results — retries exhausted",
-          );
+            // Budget spent — accept the empty turn. Emit a dedicated log line
+            // for the post-tool empty case so ops dashboards that grep on it
+            // keep working.
+            if (
+              !hasVisibleText &&
+              toolUseTurns > 0 &&
+              !priorAssistantHadVisibleText
+            ) {
+              rlog.error(
+                { turn: toolUseTurns, retries: stopContinueRetries },
+                "Model returned empty response after tool results — retries exhausted",
+              );
+            }
+          }
         }
         history.push(assistantMessage);
+        appendedNewMessages = true;
         await onEvent({ type: "message_complete", message: assistantMessage });
@@ -1364,6 +1284,15 @@ export class AgentLoop {
             }),
           );
           history.push({ role: "user", content: cancelledBlocks });
+          for (const toolUse of toolUseBlocks) {
+            await onEvent({
+              type: "tool_result",
+              toolUseId: toolUse.id,
+              content: "Cancelled by user",
+              isError: true,
+              cancelled: true,
+            });
+          }
           await emitExit("aborted_post_response");
           break;
         }
@@ -1393,14 +1322,6 @@ export class AgentLoop {
                 });
               },
               toolUse.id,
-              // Forward the loop's resolved `TurnContext` through the
-              // executor callback so `ToolExecutor.execute` can thread the
-              // real orchestrator context into the `toolExecute` pipeline.
-              // Standalone tests that don't wire a `turnContext` into
-              // `AgentLoop.run()` pass `undefined` here and the executor
-              // falls back to the synthesized placeholder — preserving the
-              // existing unit-test behavior.
-              turnCtx,
             );
             return { toolUse, result };
@@ -1464,60 +1385,39 @@ export class AgentLoop {
           }),
         );
-        // Pre-emptively truncate oversized tool results to prevent context
-        // overflow. The work is delegated to the `toolResultTruncate`
-        // plugin pipeline so downstream plugins can swap in a smarter
-        // truncation strategy (e.g. a summariser) while the default
-        // middleware preserves the historical tail-drop behaviour.
+        // Run the `post-tool-use` hook once per tool result, after the tool
+        // returns and before the result joins the provider-bound history.
+        // The default tool-result-truncate plugin tail-drops oversized output
+        // to fit the context window; user hooks can swap in a smarter strategy
+        // (e.g. a summariser) or observe results for side effects.
         const contextWindowTokens =
           resolveContextWindow?.().maxInputTokens ??
           this.config.maxInputTokens ??
           180_000;
-        const maxChars = calculateMaxToolResultChars(contextWindowTokens);
-        const truncateMiddlewares = getMiddlewaresFor("toolResultTruncate");
-        let truncatedCount = 0;
-        const truncatedBlocks: ContentBlock[] = [];
+        const resultBlocks: ContentBlock[] = [];
+        const additionalContextBlocks: ContentBlock[] = [];
         for (const block of rawResultBlocks) {
           if (block.type !== "tool_result") {
-            truncatedBlocks.push(block);
+            resultBlocks.push(block);
             continue;
           }
-          const toolBlock = block as ToolResultContent;
-          if (
-            typeof toolBlock.content !== "string" ||
-            toolBlock.content.length <= maxChars
-          ) {
-            truncatedBlocks.push(block);
-            continue;
-          }
-          const pipelineResult = await runPipeline<
-            ToolResultTruncateArgs,
-            ToolResultTruncateResult
-          >(
-            "toolResultTruncate",
-            truncateMiddlewares,
-            async (args) => defaultToolResultTruncateTerminal(args),
-            { content: toolBlock.content, maxChars },
-            turnCtx,
-            DEFAULT_TIMEOUTS.toolResultTruncate,
-          );
-          if (pipelineResult.truncated) {
-            truncatedCount++;
-            truncatedBlocks.push({
-              ...toolBlock,
-              content: pipelineResult.content,
+          const postToolUseCtx: PostToolUseContext = {
+            conversationId: turnCtx.conversationId,
+            toolResponse: block as ToolResultContent,
+            messages: history,
+            maxInputTokens: contextWindowTokens,
+            logger: rlog,
+          };
+          const finalCtx = await runHook(HOOKS.POST_TOOL_USE, postToolUseCtx);
+          resultBlocks.push(finalCtx.toolResponse);
+          if (finalCtx.additionalContext !== undefined) {
+            additionalContextBlocks.push({
+              type: "text",
+              text: finalCtx.additionalContext,
             });
-          } else {
-            truncatedBlocks.push(block);
           }
         }
-        const resultBlocks = truncatedBlocks;
-        if (truncatedCount > 0) {
-          log.warn(
-            `Truncated ${truncatedCount} oversized tool result(s) to prevent context overflow`,
-          );
-        }
         // Emit tool_result events AFTER truncation so downstream consumers
         // (e.g. session persistence) receive the truncated content.
@@ -1569,54 +1469,15 @@ export class AgentLoop {
         toolUseTurns++;
-        // When any tool returned an error, nudge the LLM to retry with
-        // corrected parameters instead of ending its turn. Skip the nudge
-        // after MAX_CONSECUTIVE_ERROR_NUDGES consecutive error turns
-        // (the error is likely unrecoverable at that point). The nudge
-        // decision is delegated to the `toolError` plugin pipeline so user
-        // plugins can change the text, observe the event, or suppress it.
-        const hasToolError = toolResults.some(({ result }) => result.isError);
-        if (hasToolError) {
-          consecutiveErrorTurns++;
-        } else {
-          consecutiveErrorTurns = 0;
-        }
-        const toolErrorArgs: ToolErrorArgs = {
-          hasToolError,
-          consecutiveErrorTurns,
-          maxConsecutiveErrorNudges: MAX_CONSECUTIVE_ERROR_NUDGES,
-        };
-        const toolErrorCtx: TurnContext = resolveLoopTurnContext(
-          turnContext,
-          requestId,
-          toolUseTurns - 1,
-        );
-        const toolErrorDecision = await runPipeline<
-          ToolErrorArgs,
-          ToolErrorDecision
-        >(
-          "toolError",
-          getMiddlewaresFor("toolError"),
-          // Terminal: the canonical nudge decision. The default plugin's
-          // middleware is a passthrough (so later-registered user plugins
-          // aren't shadowed), so this terminal is what actually produces
-          // the decision when no user plugin overrides it. Wiring the
-          // decision here also ensures the nudge fires for direct
-          // AgentLoop callers (tests, benchmarks) that skip
-          // `bootstrapPlugins()` and therefore never register the default.
-          async (args) => defaultToolErrorTerminal(args),
-          toolErrorArgs,
-          toolErrorCtx,
-          DEFAULT_TIMEOUTS.toolError,
-        );
-        if (toolErrorDecision.action === "nudge") {
-          resultBlocks.push({
-            type: "text",
-            text: toolErrorDecision.nudgeText,
-          });
-        }
+        // Append any guidance a post-tool-use hook surfaced via
+        // `additionalContext` (e.g. tool-error retry coaching) as separate
+        // blocks. They join the provider-bound history below but were not part
+        // of the tool_result events emitted above, so the model sees the
+        // guidance while the client-facing and persisted tool output stay the
+        // tool's actual result.
+        resultBlocks.push(...additionalContextBlocks);
-        // Add tool results as a user message and continue the loop
+        // Add tool results as a user message and continue the loop.
         history.push({ role: "user", content: resultBlocks });
         // Invoke checkpoint callback after tool results are in history.
@@ -1659,7 +1520,7 @@ export class AgentLoop {
           );
           const midLoopThreshold =
             preflightBudget * MID_LOOP_YIELD_THRESHOLD_RATIO;
-          const estimated = await this.estimateTokens(history, turnCtx);
+          const estimated = this.estimateTokens(history);
           if (estimated > midLoopThreshold) {
             if (compaction) {
               rlog.info(
@@ -1672,9 +1533,13 @@ export class AgentLoop {
                 compaction,
                 signal,
                 onEvent,
+                resolveEffectiveOverrideProfile() ?? null,
               );
               if (compacted) {
                 history = compacted;
+                // The compacted, re-injected array is the new base; output
+                // produced after this point is what the orchestrator persists.
+                newMessagesStart = history.length;
                 continue;
               }
             }
@@ -1701,6 +1566,15 @@ export class AgentLoop {
               }),
             );
             history.push({ role: "user", content: cancelledBlocks });
+            for (const toolUse of toolUseBlocks) {
+              await onEvent({
+                type: "tool_result",
+                toolUseId: toolUse.id,
+                content: "Cancelled by user",
+                isError: true,
+                cancelled: true,
+              });
+            }
           }
           await emitExit("aborted_via_error");
           break;
@@ -1714,11 +1588,9 @@ export class AgentLoop {
           Sentry.captureException(err);
         }
         onEvent({ type: "error", error: err });
-        // Catch-block fallback. If the rethrow came from the
-        // empty-response throw path above, `emitExit("error")` no-ops
-        // because `emitExit("empty_response_exhausted")` already ran
-        // before the throw. Otherwise, this is the genuine
-        // unhandled-error exit.
+        // Catch-block fallback. A break site that stamped a more specific
+        // reason before unwinding here keeps it; the guard makes this a no-op.
+        // Otherwise this is the genuine unhandled-error exit.
         await emitExit("error");
         break;
       }
@@ -1733,7 +1605,12 @@ export class AgentLoop {
       "Agent loop exited",
     );
-    return { history, exitReason };
+    return {
+      history,
+      exitReason,
+      appendedNewMessages,
+      newMessages: history.slice(newMessagesStart),
+    };
   }
 }