npm - @vellumai/assistant - Versions diffs - 0.8.7 → 0.8.8-dev.202606052332.17fc8ea - Mend

@vellumai/assistant 0.8.7 → 0.8.8-dev.202606052332.17fc8ea

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (570) hide show

package/src/daemon/conversation-agent-loop.ts CHANGED Viewed

@@ -7,8 +7,6 @@
  * runAgentLoop method here via the AgentLoopConversationContext interface.
  */
-import { join } from "node:path";
 import { v4 as uuid } from "uuid";
 import { optimizeImageForTransport } from "../agent/image-optimize.js";
@@ -46,10 +44,13 @@ import {
 } from "../context/post-turn-tool-result-truncation.js";
 import {
   estimatePromptTokens,
+  estimatePromptTokensWithTools,
   getCalibrationProviderKey,
 } from "../context/token-estimator.js";
-import type { ContextWindowManager } from "../context/window-manager.js";
-import { getDocumentsForConversation } from "../documents/document-store.js";
+import type {
+  ContextWindowCompactOptions,
+  ContextWindowManager,
+} from "../context/window-manager.js";
 import type { ToolProfiler } from "../events/tool-profiling-listener.js";
 import { writeRelationshipState } from "../home/relationship-state-writer.js";
 import {
@@ -57,9 +58,9 @@ import {
   setSentryConversationContext,
 } from "../instrument.js";
 import { commitAppTurnChanges } from "../memory/app-git-service.js";
-import { getApp, listAppFiles, resolveAppDir } from "../memory/app-store.js";
 import { enqueueAutoAnalysisOnCompaction } from "../memory/auto-analysis-enqueue.js";
 import {
+  addMessage,
   deleteMessageById,
   getConversation,
   getConversationOriginChannel,
@@ -68,77 +69,40 @@ import {
   getLastUserTimestampBefore,
   getMessageById,
   provenanceFromTrustContext,
-  setConversationHistoryStrippedAt,
-  setLastNotifiedInferenceProfile,
   updateConversationContextWindow,
   updateConversationSlackContextWatermark,
+  updateMessageMetadata,
 } from "../memory/conversation-crud.js";
 import { getResolvedConversationDirPath } from "../memory/conversation-directories.js";
 import { syncMessageToDisk } from "../memory/conversation-disk-view.js";
-import {
-  isReplaceableTitle,
-  queueRegenerateConversationTitle,
-} from "../memory/conversation-title-service.js";
+import { isReplaceableTitle } from "../memory/conversation-title-service.js";
 import { isBackgroundConversationType } from "../memory/conversation-types.js";
 import type { ConversationGraphMemory } from "../memory/graph/conversation-graph-memory.js";
 import {
   backfillMessageIdOnLogs,
   recordSyntheticAgentErrorMessageLog,
 } from "../memory/llm-request-log-store.js";
-import { recordMemoryRecallLog } from "../memory/memory-recall-log-store.js";
 import { enqueueMemoryRetrospectiveOnCompaction } from "../memory/memory-retrospective-enqueue.js";
-import { PKB_WORKSPACE_SCOPE } from "../memory/pkb/types.js";
-import type { QdrantSparseVector } from "../memory/qdrant-client.js";
-import {
-  readMemoryV2StaticContent,
-  shouldExposePersonalMemory,
-} from "../memory/v2/static-context.js";
 import type { PermissionPrompter } from "../permissions/prompter.js";
 import { HOOKS } from "../plugin-api/constants.js";
 import type { UserPromptSubmitContext } from "../plugin-api/types.js";
-import { defaultCompactionTerminal } from "../plugins/defaults/compaction/terminal.js";
+import { defaultCompact } from "../plugins/defaults/compaction/compact.js";
 import { deepRepairHistory } from "../plugins/defaults/history-repair/terminal.js";
-import {
-  asDefaultGraphPayload,
-  type DefaultMemoryRetrievalDeps,
-  type GraphMemoryPayload,
-  runDefaultMemoryRetrieval,
-} from "../plugins/defaults/memory-retrieval/register.js";
-import { defaultPersistenceTerminal } from "../plugins/defaults/persistence/terminal.js";
-import { defaultTitleGenerateTerminal } from "../plugins/defaults/title-generate/terminal.js";
-import { defaultTokenEstimateTerminal } from "../plugins/defaults/token-estimate/terminal.js";
-import { DEFAULT_TIMEOUTS, runHook, runPipeline } from "../plugins/pipeline.js";
-import { getMiddlewaresFor } from "../plugins/registry.js";
-import type {
-  CompactionArgs,
-  CompactionResult,
-  EstimateArgs,
-  EstimateResult,
-  MemoryArgs,
-  MemoryResult,
-  OverflowReduceArgs,
-  OverflowReduceResult,
-  PersistAddResult,
-  PersistArgs,
-  PersistResult,
-  TurnContext as PluginTurnContext,
-} from "../plugins/types.js";
-import { PluginExecutionError, PluginTimeoutError } from "../plugins/types.js";
-import type {
-  ContentBlock,
-  Message,
-  ToolDefinition,
-} from "../providers/types.js";
+import postCompactReinject from "../plugins/defaults/memory-retrieval/hooks/post-compact.js";
+import userPromptSubmitMemoryRetrieval, {
+  type MemoryRetrievalHookContext,
+} from "../plugins/defaults/memory-retrieval/hooks/user-prompt-submit-temp.js";
+import { runHook } from "../plugins/pipeline.js";
+import type { TurnContext as PluginTurnContext } from "../plugins/types.js";
+import type { ContentBlock, Message } from "../providers/types.js";
 import type { Provider } from "../providers/types.js";
 import { resolveActorTrust } from "../runtime/actor-trust-resolver.js";
 import { broadcastMessage } from "../runtime/assistant-event-hub.js";
 import { DAEMON_INTERNAL_ASSISTANT_ID } from "../runtime/assistant-scope.js";
 import { publishConversationMessagesChanged } from "../runtime/sync/resource-sync-events.js";
-import { redactSecrets } from "../security/secret-scanner.js";
 import { getSubagentManager } from "../subagent/index.js";
 import type { UsageActor } from "../usage/actors.js";
 import { getLogger } from "../util/logger.js";
-import { getWorkspaceDir } from "../util/platform.js";
 import { timeAgo } from "../util/time.js";
 import { truncate } from "../util/truncate.js";
 import { getWorkspaceGitService } from "../workspace/git-service.js";
@@ -147,7 +111,6 @@ import {
   type AssistantAttachmentDraft,
   cleanAssistantContent,
 } from "./assistant-attachments.js";
-import { cleanupBootstrapAfterTurnThreshold } from "./bootstrap-turn-cleanup.js";
 import { resolveOverflowAction } from "./context-overflow-policy.js";
 import {
   createInitialReducerState,
@@ -158,6 +121,8 @@ import {
   createEventHandlerState,
   dispatchAgentEvent,
   type EventHandlerDeps,
+  finalizePendingToolResultRow,
+  markHistoryStrippedBestEffort,
 } from "./conversation-agent-loop-handlers.js";
 import {
   approveHostAttachmentRead,
@@ -173,7 +138,6 @@ import { raceWithTimeout } from "./conversation-media-retry.js";
 import type { MessageQueue } from "./conversation-queue-manager.js";
 import type { QueueDrainReason } from "./conversation-queue-manager.js";
 import type {
-  ActiveSurfaceContext,
   ChannelCapabilities,
   InboundActorContext,
   InjectionMode,
@@ -182,8 +146,6 @@ import {
   applyRuntimeInjections,
   buildSubagentStatusBlock,
   buildUnifiedTurnContextBlock,
-  findLastInjectedNowContent,
-  getPkbAutoInjectList,
   getSlackCompactionWatermarkForPrefix,
   inboundActorContextFromTrust,
   inboundActorContextFromTrustContext,
@@ -194,7 +156,6 @@ import {
 } from "./conversation-runtime-assembly.js";
 import type { SkillProjectionCache } from "./conversation-skill-tools.js";
 import { markSurfaceCompleted } from "./conversation-surfaces.js";
-import { resolveTrustClass } from "./conversation-tool-setup.js";
 import { recordUsage } from "./conversation-usage.js";
 import {
   formatTurnTimestamp,
@@ -203,45 +164,27 @@ import {
 import { getDiskPressureStatus } from "./disk-pressure-guard.js";
 import { classifyDiskPressureTurnPolicy } from "./disk-pressure-policy.js";
 import type {
-  DynamicPageSurfaceData,
   ServerMessage,
   SurfaceData,
   SurfaceType,
   UsageStats,
 } from "./message-protocol.js";
-import type { MemoryRecalled } from "./message-types/memory.js";
 import type { ConfirmationStateChanged } from "./message-types/messages.js";
-import { conversationMetadataSyncTag } from "./message-types/sync.js";
+import {
+  type OverflowReduceArgs,
+  runOverflowReductionLoop,
+} from "./overflow-reduction-loop.js";
 import { parseActualTokensFromError } from "./parse-actual-tokens-from-error.js";
+import {
+  persistUnsendableImageDowngrades,
+  UNSENDABLE_IMAGE_NOTE,
+} from "./persist-unsendable-image.js";
 import type { TraceEmitter } from "./trace-emitter.js";
-import type { TrustContext } from "./trust-context.js";
+import { resolveTrustClass, type TrustContext } from "./trust-context.js";
 import { stripHistoricalWebSearchResults } from "./web-search-history.js";
 const log = getLogger("conversation-agent-loop");
-/**
- * Best-effort persistence of the history-stripped marker after an
- * injection-strip event (compaction / overflow recovery). The marker is a
- * durability hint, not turn-critical state — a transient SQLite write failure
- * (SQLITE_BUSY, disk-full, read-only FS) must not abort the turn. Logs a
- * warning and continues on failure, preserving the long-standing non-fatal
- * contract for this metadata write.
- */
-function markHistoryStrippedBestEffort(
-  conversationId: string,
-  strippedAt: number,
-  logger: ReturnType<typeof getLogger>,
-): void {
-  try {
-    setConversationHistoryStrippedAt(conversationId, strippedAt);
-  } catch (err) {
-    logger.warn(
-      { err },
-      "Failed to persist history-stripped marker after compaction strip (non-fatal)",
-    );
-  }
-}
 const DISK_PRESSURE_ERROR_CODE = "DISK_SPACE_CRITICAL" as const;
 const DISK_PRESSURE_ERROR_CATEGORY = "disk_pressure";
@@ -270,12 +213,12 @@ function formatDiskPressureBlockedMessage(): string {
 // ── Plugin pipeline helpers ──────────────────────────────────────────
 //
 // Canonical {@link PluginTurnContext} builder threaded into every
-// `runPipeline` call inside `runAgentLoopImpl`. The orchestrator composes
+// `runHook` call inside `runAgentLoopImpl`. The orchestrator composes
 // the context on demand at each call site from ambient state rather than
 // carrying a persistent `TurnContext` instance across the turn.
 /**
- * Synthetic fallback trust context used when the orchestrator fires a pipeline
+ * Synthetic fallback trust context used when the orchestrator fires a hook
  * before the per-turn trust snapshot has been captured (e.g. invocations that
  * bypass `processMessage` / `drainQueue`). We bias to `unknown` rather than
  * `guardian` so a missing snapshot cannot accidentally grant elevated trust
@@ -287,14 +230,14 @@ const FALLBACK_TURN_TRUST: TrustContext = {
 };
 /**
- * Build the {@link TurnContext} passed to {@link runPipeline}.
+ * Build the {@link TurnContext} passed to {@link runHook}.
  *
- * Canonical source of truth for every pipeline call site inside the agent
- * loop. Every `runPipeline` invocation in `runAgentLoopImpl` (and in the
+ * Canonical source of truth for every hook call site inside the agent
+ * loop. Every `runHook` invocation in `runAgentLoopImpl` (and in the
  * handlers that share its ambient state) must route through this helper
  * rather than constructing a `TurnContext` literal inline — this keeps
  * `turnIndex`, trust resolution, and the `contextWindowManager` attachment
- * consistent across pipeline slots, which in turn keeps structured logs
+ * consistent across hooks, which in turn keeps structured logs
  * filtered by `conversationId`/`turnIndex` coherent across hook call sites.
  *
  * Behavior:
@@ -306,9 +249,9 @@ const FALLBACK_TURN_TRUST: TrustContext = {
  *   level context, then {@link FALLBACK_TURN_TRUST}. The cascade matches
  *   the one inside the orchestrator's inline injection assembly so
  *   middleware reads the same trust class the runtime sees.
- * - `contextWindowManager` is attached unconditionally. Pipelines that
- *   don't need it can ignore it; the default compaction plugin reads it
- *   via the typed optional field on `TurnContext`.
+ * - `contextWindowManager` is attached unconditionally. Hooks that
+ *   don't need it can ignore it; it remains available via the typed
+ *   optional field on `TurnContext`.
  */
 function buildPluginTurnContext(
   ctx: AgentLoopConversationContext,
@@ -322,9 +265,23 @@ function buildPluginTurnContext(
     turnIndex: ctx.turnCount,
     trust,
     contextWindowManager: ctx.contextWindowManager,
+    callSite: ctx.currentCallSite,
   };
 }
+/**
+ * Trust class of the actor whose turn is in progress, for the compactor's
+ * image manifest filter. Prefers the turn-start snapshot
+ * ({@link AgentLoopConversationContext.currentTurnTrustContext}) over the live
+ * trust context so compaction running in a later tool iteration can't pick up
+ * a concurrent request's actor.
+ *
+ * The live `trustContext` is consulted only when the snapshot is
+ * null/undefined. Returns `undefined` when neither context is set, so callers
+ * must treat the trust class as optional.
+ */
+function resolveTurnActorTrustClass(
+  ctx: AgentLoopConversationContext,
+): TrustContext["trustClass"] | undefined {
+  return (ctx.currentTurnTrustContext ?? ctx.trustContext)?.trustClass;
+}
 // ── Context Interface ────────────────────────────────────────────────
 /**
@@ -352,9 +309,18 @@ export interface AssistantSurface {
 export interface AgentLoopConversationContext {
   readonly conversationId: string;
   messages: Message[];
-  processing: boolean;
+  isProcessing(): boolean;
+  setProcessing(value: boolean): void;
   abortController: AbortController | null;
   currentRequestId?: string;
+  /**
+   * The {@link LLMCallSite} of the in-flight turn, set at turn start from
+   * `options?.callSite ?? "mainAgent"`. Read by {@link buildPluginTurnContext}
+   * so pipeline/injector plugins can tell the main reply apart from
+   * background agent-loop work (compaction, subagents, …) on this same
+   * conversation. Per-turn mutable, mirroring {@link currentRequestId}.
+   */
+  currentCallSite?: LLMCallSite;
   readonly agentLoop: AgentLoop;
   readonly provider: Provider;
@@ -397,8 +363,6 @@ export interface AgentLoopConversationContext {
   currentTurnSurfaces: AssistantSurface[];
   workingDir: string;
-  workspaceTopLevelContext: string | null;
-  workspaceTopLevelDirty: boolean;
   channelCapabilities?: ChannelCapabilities;
   /** Per-turn snapshot of trustContext, frozen at message-processing start. */
   currentTurnTrustContext?: TrustContext;
@@ -424,8 +388,6 @@ export interface AgentLoopConversationContext {
   /** Task-run scope for the current turn. Cleared at turn end so queued/drained turns don't inherit it. */
   taskRunId?: string;
   assistantId?: string;
-  voiceCallControlPrompt?: string;
-  transportHints?: string[];
   clientTimezone?: string;
   readonly coreToolNames: Set<string>;
@@ -500,7 +462,6 @@ export interface AgentLoopConversationContext {
   getWorkspaceGitService?: (workspaceDir: string) => GitServiceInitializer;
   commitTurnChanges?: typeof commitTurnChanges;
-  refreshWorkspaceTopLevelContextIfNeeded(): void;
   markWorkspaceTopLevelDirty(): void;
   getQueueDepth(): number;
   hasQueuedMessages(): boolean;
@@ -561,6 +522,13 @@ export async function runAgentLoopImpl(
   });
   let yieldedForHandoff = false;
   let yieldedForBudget = false;
+  // Whether the most recent agent-loop run produced at least one new assistant
+  // message — the loop's own forward-progress signal, used by the ordering
+  // retry gate and the overflow convergence fold.
+  let lastRunAppendedNewMessages = false;
+  // The messages the most recent agent-loop run appended on top of its base —
+  // the loop's own new-output boundary, persisted as this turn's new messages.
+  let lastRunNewMessages: Message[] = [];
   let pendingCheckpointYield: "budget" | "handoff" | null = null;
   // Captured when the auto_compress_latest_turn rerun yields at the mid-loop
   // budget checkpoint. SSE emission happens immediately at the detection site;
@@ -579,6 +547,9 @@ export async function runAgentLoopImpl(
   // `resolveCallSiteConfig`, picking up any user overrides under
   // `llm.callSites.mainAgent` (falling back to `llm.default` when absent).
   const turnCallSite: LLMCallSite = options?.callSite ?? "mainAgent";
+  // Expose the turn's call site to plugin pipeline/injector contexts (read by
+  // buildPluginTurnContext) so plugins can scope behaviour to the main reply.
+  ctx.currentCallSite = turnCallSite;
   // Read the conversation row once for both the override-profile derivation
   // below and the title-replaceability check at turn start. Later reads in
@@ -792,10 +763,6 @@ export async function runAgentLoopImpl(
         : null,
     },
   );
-  const diskPressureContext =
-    diskPressureDecision.action === "allow-cleanup-mode"
-      ? { cleanupModeActive: true }
-      : null;
   ctx.diskPressureCleanupModeActive =
     diskPressureDecision.action === "allow-cleanup-mode";
@@ -898,55 +865,6 @@ export async function runAgentLoopImpl(
       }
     }
-    // Generate title early — the user message alone is sufficient context.
-    // Firing before the main LLM call removes the delay of waiting for the
-    // full assistant response. The second-pass regeneration at turn 3 will
-    // refine the title with more context.
-    // No abort signal — title generation should complete even if the user
-    // cancels the response, since the user message is already persisted.
-    // Deferred via setTimeout so the main agent loop LLM call enqueues
-    // first, avoiding rate-limit slot contention on strict configs.
-    if (isReplaceableTitle(turnStartConversation?.title ?? null)) {
-      // TurnContext routed through the canonical builder so the pipeline's
-      // log record reports the same `conversationId`/`turnIndex` shape as
-      // every other slot in this turn. Title generation does not depend on
-      // the context-window manager attached by the builder, but sharing the
-      // builder keeps the invariant enforced in one place.
-      const titlePipelineCtx = buildPluginTurnContext(ctx, reqId);
-      const titleArgs = {
-        conversationId: ctx.conversationId,
-        provider: ctx.provider,
-        userMessage: options?.titleText ?? content,
-        onTitleUpdated: (title: string) => {
-          onEvent({
-            type: "conversation_title_updated",
-            conversationId: ctx.conversationId,
-            title,
-          });
-          onEvent({
-            type: "sync_changed",
-            tags: [conversationMetadataSyncTag(ctx.conversationId)],
-          });
-        },
-      };
-      setTimeout(() => {
-        runPipeline(
-          "titleGenerate",
-          getMiddlewaresFor("titleGenerate"),
-          defaultTitleGenerateTerminal,
-          titleArgs,
-          titlePipelineCtx,
-          DEFAULT_TIMEOUTS.titleGenerate,
-        ).catch((err) => {
-          // Fire-and-forget — keep previous non-propagating semantics.
-          // queueGenerateConversationTitle already swallows internal
-          // errors; this catch covers pipeline-layer errors (timeouts,
-          // middleware throws) without surfacing them to the agent loop.
-          rlog.warn({ err }, "titleGenerate pipeline failed (non-fatal)");
-        });
-      }, 0);
-    }
     const isFirstMessage = ctx.messages.length === 1;
     // Promote a pending post-compaction re-inject signal (e.g. from `/compact`)
     // into `compactedThisTurn` so NOW.md / PKB / v2 static blocks land on this
@@ -954,7 +872,6 @@ export async function runAgentLoopImpl(
     // so this fires exactly once per `/compact` event.
     const consumedPostCompactReinject = ctx.pendingPostCompactReinject;
     ctx.pendingPostCompactReinject = false;
-    let shouldInjectWorkspace = isFirstMessage || consumedPostCompactReinject;
     let compactedThisTurn = consumedPostCompactReinject;
     let slackCompactedThisTurn = false;
     const isSlackConversation = ctx.channelCapabilities?.channel === "slack";
@@ -1092,70 +1009,32 @@ export async function runAgentLoopImpl(
     // Skip auto-compaction while the circuit breaker is open. Force paths
     // and user-initiated /compact bypass this check.
     const autoCompactAllowed =
-      !(await ctx.agentLoop.compactionCircuit.isOpen(ctx));
+      !(await ctx.agentLoop.compactionCircuit.isOpen());
     if (compactCheck.needed && autoCompactAllowed) {
       ctx.emitActivityState("thinking", "context_compacting", {
         requestId: reqId,
       });
     }
-    const compactionOptions = {
-      lastCompactedAt: ctx.contextCompactedAt ?? undefined,
-      precomputedEstimate: compactCheck.estimatedTokens,
-      conversationOriginChannel:
-        getConversationOriginChannel(ctx.conversationId) ?? undefined,
-      overrideProfile: resolveCurrentOverrideProfile() ?? null,
-      actorTrustClass: ctx.trustContext?.trustClass,
-    };
     let compacted: Awaited<
       ReturnType<typeof ctx.contextWindowManager.maybeCompact>
     > | null = null;
     if (autoCompactAllowed) {
-      try {
-        compacted = (await runPipeline<CompactionArgs, CompactionResult>(
-          "compaction",
-          getMiddlewaresFor("compaction"),
-          (args) =>
-            defaultCompactionTerminal(args, buildPluginTurnContext(ctx, reqId)),
-          {
-            messages: messagesForStartOfTurnCompaction,
-            signal: abortController.signal,
-            options: compactionOptions,
-          },
-          buildPluginTurnContext(ctx, reqId),
-          DEFAULT_TIMEOUTS.compaction,
-        )) as Awaited<ReturnType<typeof ctx.contextWindowManager.maybeCompact>>;
-      } catch (err) {
-        if (err instanceof PluginTimeoutError) {
-          // Pipeline exceeded its budget. Record the failure so the circuit
-          // breaker tracks consecutive timeouts (it trips after three),
-          // then degrade gracefully by skipping compaction this turn —
-          // the turn proceeds with the un-compacted history rather than
-          // hard-failing. The inner summary call has been aborted by the
-          // runner's signal-linking, so updateSummary's local fallback
-          // also ran before this catch block is reached.
-          rlog.warn(
-            { err, phase: "start-of-turn-compaction" },
-            "Compaction pipeline timed out — skipping compaction this turn",
-          );
-          await ctx.agentLoop.compactionCircuit.recordOutcome(
-            ctx,
-            true,
-            onEvent,
-          );
-          compacted = null;
-        } else {
-          throw err;
-        }
-      }
+      compacted = await defaultCompact({
+        manager: ctx.contextWindowManager,
+        messages: messagesForStartOfTurnCompaction,
+        signal: abortController.signal,
+        precomputedEstimate: compactCheck.estimatedTokens,
+        overrideProfile: resolveCurrentOverrideProfile() ?? null,
+        actorTrustClass: resolveTurnActorTrustClass(ctx),
+      });
     }
     // Only track circuit-breaker state when a summary LLM call actually ran.
     // `summaryFailed` is `undefined` on early returns (compaction disabled,
-    // below threshold, cooldown active, no eligible messages, truncation-only
+    // below threshold, no eligible messages, truncation-only
     // path) — treating those as "successful" compactions would silently reset
     // the 3-strike counter and break the invariant.
     if (compacted && compacted.summaryFailed !== undefined) {
       await ctx.agentLoop.compactionCircuit.recordOutcome(
-        ctx,
         compacted.summaryFailed,
         onEvent,
       );
@@ -1165,7 +1044,6 @@ export async function runAgentLoopImpl(
         compacted,
         messagesForStartOfTurnCompaction,
       );
-      shouldInjectWorkspace = true;
       if (compacted.compactedPersistedMessages > 0) {
         compactedThisTurn = true;
       }
@@ -1203,213 +1081,10 @@ export async function runAgentLoopImpl(
       }
     };
-    let runMessages = ctx.messages;
-    // Memory retrieval pipeline — fetches PKB, NOW.md, and memory-graph
-    // outputs through a single `memoryRetrieval` pipeline. Plugins may
-    // replace the terminal behavior by registering a middleware that
-    // short-circuits with its own `MemoryResult`; the default terminal
-    // below runs `runDefaultMemoryRetrieval` which reproduces the prior
-    // in-lined behavior (PKB/NOW reads + gated graph call).
-    const isTrustedActor = resolveTrustClass(ctx.trustContext) === "guardian";
-    // Canonical builder — pulls trust from per-turn snapshot, then
-    // conversation-level, then the synthetic fallback. Memory retrieval
-    // does not need the context-window handle the builder attaches, but
-    // keeping every call site on one helper is load-bearing for log
-    // coherence across pipeline slots.
-    const memoryPluginTurnCtx = buildPluginTurnContext(ctx, reqId);
-    const memoryArgs: MemoryArgs = {
-      conversationId: ctx.conversationId,
-      trustContext: ctx.trustContext,
-      turnIndex: ctx.turnCount,
-      // Pass the abort signal via `args` (not `deps`) so the pipeline
-      // runner's `linkAbortSignal` can swap it for a signal linked to the
-      // pipeline's internal controller — on a plugin-set timeout or
-      // external cancel, the linked signal aborts and `prepareMemory`
-      // stops mutating graph state / emitting events after the pipeline
-      // has already errored.
-      signal: abortController.signal,
-    };
-    const memoryDeps: DefaultMemoryRetrievalDeps = {
-      messages: ctx.messages,
-      graphMemory: ctx.graphMemory,
-      config: getConfig(),
-      onEvent,
-      isTrustedActor,
-    };
-    const memoryResult: MemoryResult = await runPipeline(
-      "memoryRetrieval",
-      getMiddlewaresFor("memoryRetrieval"),
-      (args) => runDefaultMemoryRetrieval(args, memoryDeps),
-      memoryArgs,
-      memoryPluginTurnCtx,
-      DEFAULT_TIMEOUTS.memoryRetrieval,
-    );
-    // Consume the memory-graph block when the default retriever emitted
-    // one. Custom plugins that substitute their own blocks without the
-    // default discriminator are expected to handle their own side effects
-    // (event emission, metric persistence) inside their middleware; this
-    // block short-circuits to the original no-op behavior in that case.
-    const defaultGraphPayload: GraphMemoryPayload | null =
-      asDefaultGraphPayload(memoryResult.memoryGraphBlocks);
-    let pkbQueryVector: number[] | undefined;
-    let pkbSparseVector: QdrantSparseVector | undefined;
-    if (defaultGraphPayload) {
-      const graphResult = defaultGraphPayload.result;
-      runMessages = graphResult.runMessages;
-      // Select dense+sparse as a matched pair so RRF fusion combines two
-      // signals aligned to the same query text:
-      //   1. Context-load with a user query: user-query dense + user-query
-      //      sparse — the cleanest pairing.
-      //   2. Otherwise (context-load without a user query, or per-turn):
-      //      whatever `queryVector` / `sparseVector` the retriever produced,
-      //      which are themselves co-aligned (both summary-derived in
-      //      context-load, both user-last-message-derived in per-turn).
-      // Never pair a user-query dense with a summary-aligned sparse.
-      if (graphResult.userQueryVector) {
-        pkbQueryVector = graphResult.userQueryVector;
-        pkbSparseVector = graphResult.userQuerySparseVector;
-      } else {
-        pkbQueryVector = graphResult.queryVector;
-        pkbSparseVector = graphResult.sparseVector;
-      }
-      // Persist the injected block text in message metadata so it survives
-      // conversation reloads (eviction, restart, fork). loadFromDb re-injects
-      // from metadata. Routed through the `persistence` pipeline so plugins
-      // can observe or override metadata updates alongside add/delete.
-      if (graphResult.injectedBlockText) {
-        try {
-          await runPipeline<PersistArgs, PersistResult>(
-            "persistence",
-            getMiddlewaresFor("persistence"),
-            defaultPersistenceTerminal,
-            {
-              op: "update",
-              messageId: userMessageId,
-              updates: {
-                memoryInjectedBlock: graphResult.injectedBlockText,
-              },
-            },
-            buildPluginTurnContext(ctx, reqId),
-            DEFAULT_TIMEOUTS.persistence,
-          );
-        } catch (err) {
-          rlog.warn(
-            { err },
-            "Failed to persist memory injection to metadata (non-fatal)",
-          );
-        }
-      }
-      const m = graphResult.metrics;
-      try {
-        recordMemoryRecallLog({
-          conversationId: ctx.conversationId,
-          enabled: true,
-          degraded: false,
-          provider: m?.embeddingProvider ?? undefined,
-          model: m?.embeddingModel ?? undefined,
-          semanticHits: m?.semanticHits ?? 0,
-          mergedCount: m?.mergedCount ?? 0,
-          selectedCount: m?.selectedCount ?? 0,
-          tier1Count: m?.tier1Count ?? 0,
-          tier2Count: m?.tier2Count ?? 0,
-          hybridSearchLatencyMs: m?.hybridSearchLatencyMs ?? 0,
-          sparseVectorUsed: m?.sparseVectorUsed ?? false,
-          injectedTokens: graphResult.injectedTokens,
-          latencyMs: graphResult.latencyMs,
-          topCandidatesJson: (m?.topCandidates ?? []).map((c) => ({
-            key: c.nodeId,
-            type: c.type,
-            kind: "graph",
-            finalScore: c.score,
-            semantic: c.semanticSimilarity,
-            recency: c.recencyBoost,
-          })),
-          injectedText: graphResult.injectedBlockText ?? undefined,
-          reason: `graph:${graphResult.mode}`,
-          queryContext: m?.queryContext ?? undefined,
-        });
-      } catch (err) {
-        log.warn({ err }, "Failed to persist memory recall log (non-fatal)");
-      }
-      if (m) {
-        const memoryRecalledEvent: MemoryRecalled = {
-          type: "memory_recalled",
-          provider: m.embeddingProvider ?? "unknown",
-          model: m.embeddingModel ?? "unknown",
-          semanticHits: m.semanticHits,
-          mergedCount: m.mergedCount,
-          selectedCount: m.selectedCount,
-          tier1Count: m.tier1Count,
-          tier2Count: m.tier2Count,
-          hybridSearchLatencyMs: m.hybridSearchLatencyMs,
-          sparseVectorUsed: m.sparseVectorUsed,
-          injectedTokens: graphResult.injectedTokens,
-          latencyMs: graphResult.latencyMs,
-          topCandidates: m.topCandidates.map((c) => ({
-            key: c.nodeId,
-            type: c.type,
-            kind: "graph",
-            finalScore: c.score,
-            semantic: c.semanticSimilarity,
-            recency: c.recencyBoost,
-          })),
-        };
-        onEvent(memoryRecalledEvent);
-      }
-    }
-    // Build active surface context
-    let activeSurface: ActiveSurfaceContext | null = null;
-    if (ctx.currentActiveSurfaceId) {
-      const stored = ctx.surfaceState.get(ctx.currentActiveSurfaceId);
-      if (stored && stored.surfaceType === "dynamic_page") {
-        const data = stored.data as DynamicPageSurfaceData;
-        activeSurface = {
-          surfaceId: ctx.currentActiveSurfaceId,
-          html: data.html,
-          currentPage: ctx.currentPage,
-        };
-        if (data.appId) {
-          const app = getApp(data.appId);
-          if (app) {
-            activeSurface.appId = app.id;
-            activeSurface.appName = app.name;
-            activeSurface.appDirName = resolveAppDir(app.id).dirName;
-            activeSurface.appSchemaJson = app.schemaJson;
-            activeSurface.appFiles = listAppFiles(app.id);
-            if (app.pages && Object.keys(app.pages).length > 0) {
-              activeSurface.appPages = app.pages;
-            }
-          }
-        }
-      }
-    }
-    // Query active documents for this conversation so the injector chain
-    // can surface them to the assistant (prevents duplicate document_create
-    // calls when existing documents should be targeted with document_update).
-    const conversationDocs = getDocumentsForConversation(ctx.conversationId);
-    const activeDocuments =
-      conversationDocs.length > 0
-        ? conversationDocs.map((d) => ({
-            surfaceId: d.surfaceId,
-            title: d.title,
-            wordCount: d.wordCount,
-            updatedAt: d.updatedAt,
-          }))
-        : null;
-    ctx.refreshWorkspaceTopLevelContextIfNeeded();
-    // Compute fresh turn timestamp for date grounding.
-    // Absolute "now" is always anchored to assistant host clock, while local
-    // date semantics prefer configured user timezone, then device timezones.
+    // Resolve the turn's timezone cascade up front. It depends only on config
+    // and the inbound request — never on retrieval output — so it can be
+    // settled before context assembly. Local date semantics prefer the
+    // configured user timezone, then device timezones, then the host clock.
     const hostTimeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
     const timezoneContext = resolveTurnTimezoneContext({
       configuredUserTimeZone: config.ui.userTimezone ?? null,
@@ -1417,9 +1092,6 @@ export async function runAgentLoopImpl(
       detectedTimezone: config.ui.detectedTimezone ?? null,
       hostTimeZone,
     });
-    const timestamp = formatTurnTimestamp({
-      timeZone: timezoneContext.effectiveTimezone,
-    });
     // Resolve the inbound actor context for the unified <turn_context> block.
     // When the conversation carries enough identity info, use the unified
@@ -1443,8 +1115,10 @@ export async function runAgentLoopImpl(
       }
     }
-    // Build unified turn context block that replaces the separate temporal,
-    // channel, interface, and actor context blocks.
+    // Resolve the channel/interface labels and the guardian flag for this
+    // turn. These derive only from the captured turn context and the resolved
+    // actor trust class — never from retrieval — so they settle before context
+    // assembly.
     const interfaceName =
       capturedTurnInterfaceContext.userMessageInterface ?? undefined;
     const channelName =
@@ -1489,9 +1163,54 @@ export async function runAgentLoopImpl(
       });
       const label = profileEntry?.label ?? effectiveProfileKey;
       modelProfileStr = resolved.model ? `${label} (${resolved.model})` : label;
-      setLastNotifiedInferenceProfile(ctx.conversationId, effectiveProfileKey);
+      // Record the notification for persistence on delivery rather than here:
+      // the model only "learns" the profile once it receives this turn
+      // context, signalled by the first `message_complete`. Persisting inline
+      // would mark the profile notified even if the turn is cancelled or fails
+      // before the model ever sees the notice.
+      state.pendingNotifiedInferenceProfile = effectiveProfileKey;
     }
+    // Memory retrieval — fetches PKB, NOW.md, and memory-graph outputs and
+    // persists the retrieval's own side effects (injected-block metadata,
+    // recall log, `memory_recalled` event). Runs at the early "prompt
+    // submitted, before context assembly" moment because its outputs feed the
+    // injection and overflow-reduction transforms below. It is shaped as the
+    // `user-prompt-submit-temp` hook handler but invoked directly for now: it
+    // must run early, while the canonical late `user-prompt-submit` hook
+    // (history repair, title) runs after those transforms, so the two cannot
+    // share a fire site until compaction is cleared from the gap between them.
+    const isTrustedActor = resolveTrustClass(ctx.trustContext) === "guardian";
+    const memoryCtx: MemoryRetrievalHookContext = {
+      graphMemory: ctx.graphMemory,
+      config: getConfig(),
+      onEvent,
+      isTrustedActor,
+      conversationId: ctx.conversationId,
+      userMessageId,
+      logger: rlog,
+      // An external cancel aborts `prepareMemory` instead of letting it run
+      // to completion after the turn has already been torn down.
+      signal: abortController.signal,
+      latestMessages: ctx.messages,
+    };
+    await userPromptSubmitMemoryRetrieval(memoryCtx);
+    // The retriever owns its side effects (injected-block metadata, recall
+    // log, `memory_recalled` event) and records the dense/sparse PKB query
+    // pair on the graph handle for the PKB-reminder injector to read back; the
+    // loop only reuses the injected message list downstream.
+    let runMessages = memoryCtx.latestMessages;
+    // Capture wall-clock "now" at its point of use, after the blocking memory
+    // retrieval, so the injected `<turn_context>` timestamp reflects current
+    // time rather than the moment the turn began.
+    const timestamp = formatTurnTimestamp({
+      timeZone: timezoneContext.effectiveTimezone,
+    });
+    // Build unified turn context block that replaces the separate temporal,
+    // channel, interface, and actor context blocks.
     const baseTurnContext = {
       timestamp,
       interfaceName,
@@ -1513,64 +1232,6 @@ export async function runAgentLoopImpl(
     // The `remember` tool handles scratchpad-style memory writes directly to the graph.
-    // Personal-memory trust gate: PKB, NOW.md, and v2 static blocks all
-    // hold private user content. Block exposure to non-guardian actors
-    // arriving over a remote channel; internal/local flows pass through.
-    // See `shouldExposePersonalMemory` for the threat model.
-    const personalMemoryAllowed = shouldExposePersonalMemory({
-      sourceChannel: ctx.trustContext?.sourceChannel,
-      isTrustedActor,
-    });
-    // Inject NOW.md and PKB content only on the first turn (or after
-    // compaction re-strips them).  Old injections persist in history and
-    // are never stripped on normal turns — this preserves the cached prefix.
-    // PKB/NOW content is sourced from the `memoryRetrieval` pipeline above
-    // so plugins can override either source without touching the agent loop.
-    // NOW.md injection can be disabled via `memory.retrieval.scratchpadInjection.enabled`.
-    const scratchpadInjectionEnabled =
-      getConfig().memory.retrieval.scratchpadInjection.enabled;
-    const currentNowContent =
-      personalMemoryAllowed && scratchpadInjectionEnabled
-        ? memoryResult.nowContent
-        : null;
-    const shouldInjectNowAndPkb = isFirstMessage || compactedThisTurn;
-    const nowScratchpad = shouldInjectNowAndPkb ? currentNowContent : null;
-    const currentPkbContent = personalMemoryAllowed
-      ? memoryResult.pkbContent
-      : null;
-    const pkbContext = shouldInjectNowAndPkb ? currentPkbContent : null;
-    const pkbActive = currentPkbContent !== null;
-    // V2 static memory block (essentials/threads/recent/buffer).
-    // `currentMemoryV2Static` is the trust-gated content reused by every
-    // re-injection path — it stays non-null on non-full-mode turns so
-    // that mid-turn reducer compaction (which strips the prior `<info>`
-    // block) can restore the freshest content. `memoryV2Static` is the
-    // first-turn / post-compaction cadence-gated value for initial
-    // injection only. `readMemoryV2StaticContent` self-gates on the v2
-    // flag + config and returns null when v2 is off.
-    const currentMemoryV2Static = personalMemoryAllowed
-      ? readMemoryV2StaticContent()
-      : null;
-    const memoryV2Static = shouldInjectNowAndPkb ? currentMemoryV2Static : null;
-    // PKB relevance-hint inputs. Resolved once per turn and reused across
-    // re-injections so post-compaction rebuilds pick up fresh hints against
-    // the updated conversation history.
-    const pkbRoot = pkbActive ? join(getWorkspaceDir(), "pkb") : undefined;
-    const pkbAutoInjectList = pkbRoot
-      ? getPkbAutoInjectList(pkbRoot)
-      : undefined;
-    // Pass `ctx` directly — `PkbContextConversation` is structural and
-    // `getInContextPkbPaths` re-reads `conversation.messages` on each call,
-    // so post-compaction re-injects see the updated history.
-    const pkbConversation = pkbActive ? ctx : undefined;
-    // PKB points live under a single workspace sentinel scope.
-    // See `PKB_WORKSPACE_SCOPE` for why.
-    const pkbScopeId = pkbActive ? PKB_WORKSPACE_SCOPE : undefined;
     // Subagent status injection — gives the parent LLM visibility into active/completed children.
     // Skipped when this conversation IS a subagent (no nesting) or has no children.
     const subagentStatusBlock = ctx.isSubagent
@@ -1625,21 +1286,14 @@ export async function runAgentLoopImpl(
         )
       : null;
-    // Guards the chronological-transcript override on re-injection after
-    // the reducer compacts `ctx.messages`. The captured transcript is the
-    // full persisted history; blindly replaying it on every re-inject would
-    // overwrite the reducer's compacted messages and undo compaction. Flip
-    // to `true` after any compaction so subsequent re-injections fall back
-    // to the reduced `ctx.messages`.
-    let reducerCompacted = compactedThisTurn;
-    // memory-v3-live: route the turn's `<memory>` block to the v3 injector.
-    // When on, runtime assembly suppresses v2's `<memory>` injection (only
-    // when the v3 injector actually produced a block — otherwise v2 stays as a
-    // fallback) and the provider anchors its long-TTL cache breakpoint on the
-    // most recent STABLE user message, since the latest user message now
-    // carries the volatile per-turn memory block. Flag off → bit-for-bit
-    // identical to today's v2 path.
+    state.reducerCompacted = compactedThisTurn;
+    // memory-v3-live: when on, the provider anchors its long-TTL cache
+    // breakpoint on the most recent STABLE user message, since the latest user
+    // message now carries the volatile per-turn `<memory>` block the v3
+    // injector emits. The matching v2-suppression strip is owned by
+    // `applyRuntimeInjections`, which reads the same flag itself. Flag off →
+    // bit-for-bit identical to today's v2 path.
     const memoryV3Live = isAssistantFeatureFlagEnabled(
       "memory-v3-live",
       getConfig(),
@@ -1647,29 +1301,7 @@ export async function runAgentLoopImpl(
     // Shared injection options — reused whenever we need to re-inject after reduction.
     const injectionOpts = {
-      suppressV2MemoryForV3: memoryV3Live,
-      diskPressureContext,
-      activeSurface,
-      activeDocuments,
-      workspaceTopLevelContext: shouldInjectWorkspace
-        ? ctx.workspaceTopLevelContext
-        : null,
-      channelCapabilities: ctx.channelCapabilities ?? null,
-      channelCommandContext: ctx.commandIntent ?? null,
       unifiedTurnContext: unifiedTurnContextStr,
-      pkbContext,
-      pkbActive,
-      pkbQueryVector,
-      pkbSparseVector,
-      pkbScopeId,
-      pkbConversation,
-      pkbAutoInjectList,
-      pkbRoot,
-      pkbWorkingDir: pkbActive ? ctx.workingDir : undefined,
-      memoryV2Static,
-      nowScratchpad,
-      voiceCallControlPrompt: ctx.voiceCallControlPrompt ?? null,
-      transportHints: ctx.transportHints ?? null,
       isNonInteractive: !isInteractiveResolved,
       isBackgroundConversation: isBackgroundConversationType(
         turnStartConversation?.conversationType,
@@ -1689,7 +1321,7 @@ export async function runAgentLoopImpl(
     const injection = await applyRuntimeInjections(runMessages, {
       ...injectionOpts,
-      slackChronologicalMessages: reducerCompacted
+      slackChronologicalMessages: state.reducerCompacted
         ? null
         : injectionOpts.slackChronologicalMessages,
       mode: currentInjectionMode,
@@ -1735,18 +1367,7 @@ export async function runAgentLoopImpl(
           metadataUpdates.memoryV2StaticBlock =
             injection.blocks.memoryV2StaticBlock;
         }
-        await runPipeline<PersistArgs, PersistResult>(
-          "persistence",
-          getMiddlewaresFor("persistence"),
-          defaultPersistenceTerminal,
-          {
-            op: "update",
-            messageId: userMessageId,
-            updates: metadataUpdates,
-          },
-          buildPluginTurnContext(ctx, reqId),
-          DEFAULT_TIMEOUTS.persistence,
-        );
+        updateMessageMetadata(userMessageId, metadataUpdates);
       } catch (err) {
         rlog.warn({ err }, "Failed to persist injection metadata (non-fatal)");
       }
@@ -1762,51 +1383,18 @@ export async function runAgentLoopImpl(
     let reducerState: ReducerState | undefined;
     const toolTokenBudget = ctx.agentLoop.getToolTokenBudget(runMessages);
-    // Canonical calibration key — passed to the `tokenEstimate` pipeline for
-    // every preflight/mid-loop estimate, the overflow reducer config, and the
-    // convergence-path `estimatePromptTokens` call. Matches the key recorded
-    // by `handleUsage` for wrapper providers (OpenRouter routing to
-    // Anthropic → key is `"anthropic"`).
+    // Canonical calibration key — used by the preflight estimate, the
+    // overflow reducer config, and the convergence-path `estimatePromptTokens`
+    // call. Matches the key recorded by `handleUsage` for wrapper providers
+    // (OpenRouter routing to Anthropic → key is `"anthropic"`).
     const estimationProviderName = getCalibrationProviderKey(ctx.provider);
-    // Shared `TurnContext` for every `tokenEstimate` pipeline invocation in
-    // this turn. The pipeline is the extension point for plugins that want
-    // to substitute an alternate estimator (e.g. provider-native tokenization)
-    // without touching orchestrator code.
-    //
-    // Routed through the canonical builder — `turnIndex` is `ctx.turnCount`,
-    // trust cascades through per-turn/conversation-level/fallback, and the
-    // context-window handle rides along so any middleware that wants to
-    // reuse the manager (e.g. to compute compaction-aware estimates) can.
-    const pipelineTurnCtx = buildPluginTurnContext(ctx, reqId);
-    const runTokenEstimatePipeline = (
-      history: Message[],
-    ): Promise<EstimateResult> =>
-      runPipeline<EstimateArgs, EstimateResult>(
-        "tokenEstimate",
-        getMiddlewaresFor("tokenEstimate"),
-        defaultTokenEstimateTerminal,
-        {
-          // Shallow-frozen copies so a misbehaving middleware that mutates
-          // `args.history` or `args.tools` in place (e.g. trims the array
-          // before calling next) can't silently strip prompt context from
-          // the orchestrator's live `runMessages` / resolved-tools arrays.
-          // TypeScript `readonly` on `EstimateArgs` does not prevent
-          // `push`/`splice` at runtime; the frozen wrapper throws in strict
-          // mode and isolates any mutation attempts from the call-site state.
-          history: Object.freeze([...history]) as Message[],
-          systemPrompt: ctx.systemPrompt,
-          tools: Object.freeze([
-            ...ctx.agentLoop.getResolvedTools(history),
-          ]) as ToolDefinition[],
-          providerName: estimationProviderName,
-        },
-        pipelineTurnCtx,
-        DEFAULT_TIMEOUTS.tokenEstimate,
-      );
-    const preflightTokens = await runTokenEstimatePipeline(runMessages);
+    const preflightTokens = estimatePromptTokensWithTools(
+      runMessages,
+      ctx.systemPrompt,
+      ctx.agentLoop.getResolvedTools(runMessages),
+      estimationProviderName,
+    );
     if (overflowRecovery.enabled && preflightTokens > preflightBudget) {
       rlog.warn(
@@ -1818,16 +1406,12 @@ export async function runAgentLoopImpl(
         "Preflight budget exceeded — running overflow reducer before provider call",
       );
-      // Overflow reduction runs through the plugin pipeline. The default
-      // middleware (`default-overflow-reduce`, registered at bootstrap)
-      // contains the historical tier loop — forced compaction → tool-result
-      // truncation → media stubbing → injection downgrade — plus the
-      // re-inject/re-estimate convergence check. The callbacks below are
-      // the orchestrator-specific side effects that the plugin coordinates
-      // per iteration (activity emission, compaction application, runtime
-      // injection reassembly, token re-estimation). Registered plugins that
-      // wrap the `overflowReduce` slot see each iteration through their own
-      // middleware `next` callback.
+      // `runOverflowReductionLoop` drives the tier loop — forced compaction →
+      // tool-result truncation → media stubbing → injection downgrade — plus
+      // the re-inject/re-estimate convergence check. The callbacks below are
+      // the orchestrator-specific side effects it coordinates per iteration
+      // (activity emission, compaction application, runtime injection
+      // reassembly, token re-estimation).
       const messagesForPreflightOverflowReduction =
         slackChronologicalContext?.messages ?? ctx.messages;
       const overflowArgs: OverflowReduceArgs = {
@@ -1841,72 +1425,18 @@ export async function runAgentLoopImpl(
         maxAttempts: resolveCurrentContextBudget().overflowRecovery.maxAttempts,
         abortSignal: abortController.signal,
         compactFn: async (msgs, signal, opts) => {
-          // Route the reducer's forced-compaction tier through the
-          // `compaction` pipeline so registered plugins observe these
-          // invocations. Without this, custom compaction middleware only
-          // sees the three orchestrator-owned call sites and misses the
-          // reducer-initiated forced compactions entirely.
-          //
-          // Pipeline timeouts must be caught locally — a `PluginTimeoutError`
-          // bubbling out of here would abort the overflow-reducer tier loop
-          // entirely, skipping fallback tiers (tool-result truncation, media
-          // stubbing, injection downgrade) and bypassing circuit-breaker
-          // bookkeeping. On timeout, record the failure and return a
-          // `compacted: false` result so the reducer falls through to the
-          // next tier.
-          try {
-            return (await runPipeline<CompactionArgs, CompactionResult>(
-              "compaction",
-              getMiddlewaresFor("compaction"),
-              (args) =>
-                defaultCompactionTerminal(
-                  args,
-                  buildPluginTurnContext(ctx, reqId),
-                ),
-              {
-                messages: msgs,
-                signal,
-                options: {
-                  ...(opts ?? {}),
-                  overrideProfile: resolveCurrentOverrideProfile() ?? null,
-                  actorTrustClass: ctx.trustContext?.trustClass,
-                },
-              },
-              buildPluginTurnContext(ctx, reqId),
-              DEFAULT_TIMEOUTS.compaction,
-            )) as Awaited<
-              ReturnType<typeof ctx.contextWindowManager.maybeCompact>
-            >;
-          } catch (err) {
-            if (err instanceof PluginTimeoutError) {
-              rlog.warn(
-                { err, phase: "overflow-reducer-forced-compaction" },
-                "Compaction pipeline timed out — falling through to next reducer tier",
-              );
-              await ctx.agentLoop.compactionCircuit.recordOutcome(
-                ctx,
-                true,
-                onEvent,
-              );
-              return {
-                messages: msgs,
-                compacted: false,
-                previousEstimatedInputTokens: 0,
-                estimatedInputTokens: 0,
-                maxInputTokens: 0,
-                thresholdTokens: 0,
-                compactedMessages: 0,
-                compactedPersistedMessages: 0,
-                summaryCalls: 0,
-                summaryInputTokens: 0,
-                summaryOutputTokens: 0,
-                summaryModel: "",
-                summaryText: "",
-                reason: "compaction pipeline timed out",
-              };
-            }
-            throw err;
-          }
+          // Delegate the reducer's forced-compaction tier to the default
+          // compaction plugin, overlaying the turn's resolved inference
+          // profile and actor trust class onto the reducer-supplied options.
+          const reducerOptions = (opts ?? {}) as ContextWindowCompactOptions;
+          return defaultCompact({
+            manager: ctx.contextWindowManager,
+            messages: msgs,
+            signal,
+            ...reducerOptions,
+            overrideProfile: resolveCurrentOverrideProfile() ?? null,
+            actorTrustClass: resolveTurnActorTrustClass(ctx),
+          });
         },
         emitActivityState: () => {
           ctx.emitActivityState("thinking", "context_compacting", {
@@ -1925,14 +1455,12 @@ export async function runAgentLoopImpl(
           // breaker.
           if (result.summaryFailed !== undefined) {
             await ctx.agentLoop.compactionCircuit.recordOutcome(
-              ctx,
               result.summaryFailed,
               onEvent,
             );
           }
           if (result.compacted) {
             await applySuccessfulCompaction(result, compactedBasis);
-            shouldInjectWorkspace = true;
           }
         },
         reinjectForMode: async (
@@ -1943,27 +1471,25 @@ export async function runAgentLoopImpl(
         ) => {
           // Mirror the pre-PR-23 behavior: `ctx.messages` must track the
           // reducer's latest output before re-injection runs, because other
-          // sites consulted through `injectionOpts` (`workspaceTopLevelContext`,
-          // slack history, etc.) depend on it and `applyCompactionResult`
-          // only updates `ctx.messages` on a compaction tier. Assigning here
+          // sites consulted through `injectionOpts` (slack history, etc.) and
+          // the injectors' own message-presence scans depend on it, and
+          // `applyCompactionResult` only updates `ctx.messages` on a
+          // compaction tier. Assigning here
           // keeps non-compaction tiers (tool-result truncation, media
           // stubbing, injection downgrade) observable to downstream
           // injection assembly on the same turn.
           ctx.messages = reducedMessages;
-          // When THIS iteration compacted, it stripped existing NOW.md /
-          // PKB blocks — so we re-inject current content. A later iteration
-          // that only truncates or downgrades must NOT re-force PKB/NOW,
+          // When THIS iteration compacted, it stripped the existing
+          // memory-static block — so we re-inject current content. A later
+          // iteration that only truncates or downgrades must NOT re-force it,
           // or each round would grow the token count.
           // Gate: only the iteration that actually compacted re-injects.
+          // (The `<knowledge_base>`, NOW.md, and v2 static `<info>` blocks
+          // self-gate inside their injectors on whether they are already
+          // present in `reducedMessages`.)
           const injection = await applyRuntimeInjections(reducedMessages, {
             ...injectionOpts,
-            ...(stepCompacted && { pkbContext: currentPkbContent }),
-            ...(stepCompacted && { memoryV2Static: currentMemoryV2Static }),
-            ...(stepCompacted && { nowScratchpad: currentNowContent }),
-            workspaceTopLevelContext: shouldInjectWorkspace
-              ? ctx.workspaceTopLevelContext
-              : null,
             // Once ANY iteration has compacted `ctx.messages`, the captured
             // `slackChronologicalMessages` snapshot (built from the full
             // persisted transcript) would overwrite the compacted history
@@ -1989,41 +1515,17 @@ export async function runAgentLoopImpl(
           }),
       };
-      const overflowResult = await runPipeline<
-        OverflowReduceArgs,
-        OverflowReduceResult
-      >(
-        "overflowReduce",
-        getMiddlewaresFor("overflowReduce"),
-        // Terminal — only reached when every registered middleware calls
-        // `next` and delegates past the innermost layer. The default plugin
-        // is a terminal itself (it doesn't call `next`), so in practice
-        // this fallback fires only when the default has been explicitly
-        // deregistered (tests) and no user plugin replaces it. Strict-fail
-        // semantics: throw so the missing terminal surfaces as a visible
-        // error instead of silently returning the history untouched.
-        async () => {
-          throw new PluginExecutionError(
-            "overflowReduce pipeline has no terminal handler — every reducer middleware called next() without providing a replacement",
-            "overflowReduce",
-          );
-        },
-        overflowArgs,
-        buildPluginTurnContext(ctx, reqId),
-        DEFAULT_TIMEOUTS.overflowReduce,
-      );
+      const overflowResult = await runOverflowReductionLoop(overflowArgs);
       ctx.messages = overflowResult.messages;
       runMessages = overflowResult.runMessages;
       currentInjectionMode = overflowResult.injectionMode;
       reducerState = overflowResult.reducerState;
       if (overflowResult.reducerCompacted) {
-        reducerCompacted = true;
+        state.reducerCompacted = true;
       }
     }
-    let preRepairMessages = runMessages;
     // Replace historical web_search_tool_result blocks with text summaries.
     // The opaque `encrypted_content` tokens Anthropic attaches to each result
     // expire / are route-scoped; replaying a stale token is rejected with
@@ -2046,13 +1548,12 @@ export async function runAgentLoopImpl(
     // context with a fresh array; `runHook` forwards whichever the chain
     // settles on. Order is plugin registration order.
     //
-    // Fires BEFORE `preRunHistoryLength` is captured so the boundary
-    // between pre-existing and hook-emitted messages — consumed by the
-    // ordering-error retry gate, the post-run reconcile loop, and the
-    // new-message extraction for persistence — reflects exactly what
-    // `agentLoop.run` receives.
+    // Fires BEFORE the agent loop runs so the hook-emitted messages are part
+    // of the loop's input; the loop then reports its own appended output via
+    // `AgentLoopRunResult.newMessages`, which is what persistence consumes.
     const userPromptCtx: UserPromptSubmitContext = {
       conversationId: ctx.conversationId,
+      prompt: options?.titleText ?? content,
       originalMessages: ctx.messages,
       latestMessages: runMessages,
       logger: rlog,
@@ -2063,8 +1564,6 @@ export async function runAgentLoopImpl(
     );
     runMessages = finalUserPromptCtx.latestMessages;
-    let preRunHistoryLength = runMessages.length;
     const shouldGenerateTitle = isReplaceableTitle(
       getConversation(ctx.conversationId)?.title ?? null,
     );
@@ -2078,6 +1577,7 @@ export async function runAgentLoopImpl(
       rlog,
       turnChannelContext: capturedTurnChannelContext,
       turnInterfaceContext: capturedTurnInterfaceContext,
+      applyCompaction: applySuccessfulCompaction,
     };
     const eventHandler = (event: AgentEvent): Promise<void> =>
       dispatchAgentEvent(state, deps, event);
@@ -2097,82 +1597,39 @@ export async function runAgentLoopImpl(
     rlog.info({ callSite: turnCallSite }, "Starting agent loop run");
     // Thread the orchestrator's canonical per-turn context into the agent
-    // loop so its internal pipeline invocations (llmCall, emptyResponse,
-    // toolError, toolResultTruncate, toolExecute) see the real
-    // conversation identity / trust / contextWindowManager instead of the
-    // synthesized `"agent-loop"` placeholder. The loop clones this value
+    // loop so its internal pipeline invocations (e.g. compaction) see the
+    // real conversation identity / trust / contextWindowManager instead of
+    // the synthesized `"agent-loop"` placeholder. The loop clones this value
     // and overwrites `turnIndex` with its own tool-use iteration counter.
     const loopTurnCtx = buildPluginTurnContext(ctx, reqId);
-    // Hooks for the loop-owned mid-loop compaction. The agent loop owns the
+    // Hook for the loop-owned mid-loop compaction. The agent loop owns the
     // trigger (its budget gate), the `compaction` pipeline call, the result
     // interpretation (circuit-breaker bookkeeping + the exhaustion decision),
-    // and the inline continue; these callbacks bridge the durable / injection
-    // state the loop is intentionally blind to. Durable persistence and
-    // re-injection stay orchestrator-supplied for now.
+    // and the inline continue; this callback bridges the injection state the
+    // loop is intentionally blind to. Durable persistence is signalled via
+    // events; re-injection stays orchestrator-supplied for now.
     const midLoopCompaction: MidLoopCompaction = {
-      prepare: (history) => {
-        // Strip injected context so the compactor summarizes the raw
-        // persistent messages, and commit the stripped set to durable state.
-        const rawHistory = stripInjectionsForCompaction(history);
-        ctx.messages = rawHistory;
-        markHistoryStrippedBestEffort(ctx.conversationId, Date.now(), rlog);
-        return {
-          rawHistory,
-          options: {
-            lastCompactedAt: ctx.contextCompactedAt ?? undefined,
-            force: true,
-            targetInputTokensOverride:
-              resolveCurrentContextBudget().preflightBudget,
-            conversationOriginChannel:
-              getConversationOriginChannel(ctx.conversationId) ?? undefined,
-            overrideProfile: resolveCurrentOverrideProfile() ?? null,
-            actorTrustClass: ctx.trustContext?.trustClass,
-          },
-        };
-      },
-      applyResult: async (result, rawHistory) => {
-        await applySuccessfulCompaction(result, rawHistory);
-        reducerCompacted = true;
-        shouldInjectWorkspace = true;
-      },
-      reinject: async () => {
+      postCompactionHook: async ({ history, turnContext }) => {
         // stripInjectionsForCompaction() unconditionally removed the existing
-        // NOW.md block, so re-inject the current content regardless of whether
-        // compaction actually ran.
-        const injection = await applyRuntimeInjections(ctx.messages, {
+        // memory-static block, so re-inject the current content regardless of
+        // whether compaction actually ran. The `<knowledge_base>`, NOW.md, and
+        // v2 static `<info>` blocks self-gate inside their injectors on block
+        // presence.
+        const injection = await postCompactReinject({
           ...injectionOpts,
-          pkbContext: currentPkbContent,
-          memoryV2Static: currentMemoryV2Static,
-          nowScratchpad: currentNowContent,
-          workspaceTopLevelContext: shouldInjectWorkspace
-            ? ctx.workspaceTopLevelContext
-            : null,
           // Suppress the chronological-transcript snapshot once the reducer
           // has collapsed `ctx.messages`; the captured snapshot reflects the
           // full persisted transcript and would overwrite compaction.
-          slackChronologicalMessages: reducerCompacted
+          slackChronologicalMessages: state.reducerCompacted
             ? null
             : injectionOpts.slackChronologicalMessages,
           mode: currentInjectionMode,
-          turnContext: buildPluginTurnContext(ctx, reqId),
+          turnContext,
+          history,
+          logger: rlog,
         });
-        runMessages = injection.messages;
-        if (isTrustedActor && currentInjectionMode !== "minimal") {
-          ctx.graphMemory.retrackCachedNodes();
-        }
-        const midLoopCompactStrip =
-          stripHistoricalWebSearchResults(runMessages);
-        if (midLoopCompactStrip.stats.blocksStripped > 0) {
-          rlog.info(
-            { phase: "mid-loop-compact", ...midLoopCompactStrip.stats },
-            "Converted historical web_search_tool_result blocks to text summaries",
-          );
-          runMessages = midLoopCompactStrip.messages;
-        }
-        preRepairMessages = runMessages;
-        preRunHistoryLength = runMessages.length;
-        return runMessages;
+        return injection.messages;
       },
     };
@@ -2188,10 +1645,8 @@ export async function runAgentLoopImpl(
       msgs: Message[],
       compaction?: MidLoopCompaction,
     ): Promise<Message[]> => {
-      const { history, exitReason } = await ctx.agentLoop.run(
-        msgs,
-        eventHandler,
-        {
+      const { history, exitReason, appendedNewMessages, newMessages } =
+        await ctx.agentLoop.run(msgs, eventHandler, {
           signal: abortController.signal,
           requestId: reqId,
           onCheckpoint,
@@ -2205,8 +1660,9 @@ export async function runAgentLoopImpl(
           // `<memory>` block, so anchor the provider's long-TTL cache breakpoint
           // on the most recent stable message instead.
           mutableLatestUserMessage: memoryV3Live,
-        },
-      );
+        });
+      lastRunAppendedNewMessages = appendedNewMessages;
+      lastRunNewMessages = newMessages;
       if (exitReason === "handoff") {
         yieldedForHandoff = true;
         pendingCheckpointYield = "handoff";
@@ -2244,10 +1700,7 @@ export async function runAgentLoopImpl(
     }
     // One-shot ordering error retry
-    if (
-      state.orderingErrorDetected &&
-      updatedHistory.length === preRunHistoryLength
-    ) {
+    if (state.orderingErrorDetected && !lastRunAppendedNewMessages) {
       rlog.warn(
         { phase: "retry" },
         "Provider ordering error detected, attempting one-shot deep-repair retry",
@@ -2261,12 +1714,10 @@ export async function runAgentLoopImpl(
       // `user-prompt-submit` hook (the default history-repair plugin runs
       // `repairHistory` there); widening that surface to deep-repair is
       // intentionally deferred until there's a concrete plugin-level use case.
-      const retryRepair = deepRepairHistory(runMessages);
+      const retryRepair = deepRepairHistory(updatedHistory);
       runMessages = retryRepair.messages;
       const retryStrip = stripHistoricalWebSearchResults(runMessages);
       runMessages = retryStrip.messages;
-      preRepairMessages = runMessages;
-      preRunHistoryLength = runMessages.length;
       state.orderingErrorDetected = false;
       state.deferredOrderingError = null;
@@ -2319,15 +1770,29 @@ export async function runAgentLoopImpl(
             }
             // Can't resize — replace with a text annotation so the model
             // can explain the situation rather than silently dropping context
-            return [
-              {
-                type: "text" as const,
-                text: "(An image was attached but could not be sent — its dimensions exceed the provider limit and automatic resize was not available. Please resize the image and try again.)",
-              },
-            ];
+            return [{ type: "text" as const, text: UNSENDABLE_IMAGE_NOTE }];
           }),
         };
       });
+      // The transform above only mutates ctx.messages for the current retry.
+      // Persist the downgrade for images that can never be sent so the rejected
+      // upload doesn't rehydrate from the DB and resurface on later turns. This
+      // is cleanup for future turns, so a persistence failure must never abort
+      // the retry that is about to run — log it and continue.
+      try {
+        const rewritten = persistUnsendableImageDowngrades(ctx.conversationId);
+        if (rewritten > 0) {
+          rlog.info(
+            { phase: "image-recovery", rewritten },
+            "Persisted unsendable-image downgrades so they cannot resurface",
+          );
+        }
+      } catch (err) {
+        rlog.warn(
+          { phase: "image-recovery", err },
+          "Failed to persist unsendable-image downgrade; continuing with in-memory recovery",
+        );
+      }
       runMessages = ctx.messages;
       updatedHistory = await runAgentLoop(runMessages);
       if (state.imageTooLargeDetected) {
@@ -2356,19 +1821,9 @@ export async function runAgentLoopImpl(
     // limit), incorporate those new messages into ctx.messages so the
     // convergence loop operates on the full (larger) history.
     if (state.contextTooLargeDetected) {
-      // Detect whether ctx.messages currently lacks NOW.md so we know if
-      // it needs to be re-injected.  Mid-loop compaction (line ~1067) may
-      // have already stripped injections before escalating here, so we
-      // check actual message state rather than tracking mutation sites.
-      let convergenceStripped =
-        findLastInjectedNowContent(ctx.messages) === null;
-      if (updatedHistory.length > preRunHistoryLength) {
+      if (lastRunAppendedNewMessages) {
         ctx.messages = stripInjectionsForCompaction(updatedHistory);
-        markHistoryStrippedBestEffort(ctx.conversationId, Date.now(), rlog);
-        convergenceStripped = true;
-        preRepairMessages = updatedHistory;
-        preRunHistoryLength = updatedHistory.length;
+        markHistoryStrippedBestEffort(ctx.conversationId);
       }
       if (!reducerState) {
         reducerState = createInitialReducerState();
@@ -2450,14 +1905,12 @@ export async function runAgentLoopImpl(
             );
             if (emergencyResult.summaryFailed !== undefined) {
               await ctx.agentLoop.compactionCircuit.recordOutcome(
-                ctx,
                 emergencyResult.summaryFailed,
                 onEvent,
               );
             }
             if (emergencyResult.compacted) {
               await applySuccessfulCompaction(emergencyResult, ctx.messages);
-              shouldInjectWorkspace = true;
             }
             // Clear the overflow flag and re-run the agent loop with
             // the compacted context.
@@ -2508,7 +1961,7 @@ export async function runAgentLoopImpl(
             ctx.contextWindowManager.maybeCompact(msgs, signal!, {
               ...(opts ?? {}),
               overrideProfile: resolveCurrentOverrideProfile() ?? null,
-              actorTrustClass: ctx.trustContext?.trustClass,
+              actorTrustClass: resolveTurnActorTrustClass(ctx),
             }),
           abortController.signal,
         );
@@ -2526,7 +1979,6 @@ export async function runAgentLoopImpl(
           step.compactionResult.summaryFailed !== undefined
         ) {
           await ctx.agentLoop.compactionCircuit.recordOutcome(
-            ctx,
             step.compactionResult.summaryFailed,
             onEvent,
           );
@@ -2537,22 +1989,17 @@ export async function runAgentLoopImpl(
             step.compactionResult,
             convergenceCompactionBasis,
           );
-          shouldInjectWorkspace = true;
-          reducerCompacted = true;
+          state.reducerCompacted = true;
         }
-        // Only re-inject NOW.md when ctx.messages was actually stripped;
-        // otherwise the existing NOW.md block is still present and
-        // re-injecting would duplicate it.
+        // Only re-inject the memory-static block when ctx.messages was
+        // actually stripped; otherwise the existing block is still present and
+        // re-injecting would duplicate it. (The `<knowledge_base>` and NOW.md
+        // blocks self-gate inside their injectors on whether they are already
+        // present in `ctx.messages`.)
         const injection = await applyRuntimeInjections(ctx.messages, {
           ...injectionOpts,
-          pkbContext: currentPkbContent,
-          memoryV2Static: convergenceStripped ? currentMemoryV2Static : null,
-          nowScratchpad: convergenceStripped ? currentNowContent : null,
-          workspaceTopLevelContext: shouldInjectWorkspace
-            ? ctx.workspaceTopLevelContext
-            : null,
-          slackChronologicalMessages: reducerCompacted
+          slackChronologicalMessages: state.reducerCompacted
             ? null
             : injectionOpts.slackChronologicalMessages,
           mode: currentInjectionMode,
@@ -2570,8 +2017,6 @@ export async function runAgentLoopImpl(
           );
           runMessages = convergenceStrip.messages;
         }
-        preRepairMessages = runMessages;
-        preRunHistoryLength = runMessages.length;
         state.contextTooLargeDetected = false;
         yieldedForBudget = false;
@@ -2594,12 +2039,9 @@ export async function runAgentLoopImpl(
           // Fold rerun progress into ctx.messages so the next reducer
           // tier operates on up-to-date history instead of stale
           // pre-rerun messages.
-          if (updatedHistory.length > preRunHistoryLength) {
+          if (lastRunAppendedNewMessages) {
             ctx.messages = stripInjectionsForCompaction(updatedHistory);
-            markHistoryStrippedBestEffort(ctx.conversationId, Date.now(), rlog);
-            convergenceStripped = true;
-            preRepairMessages = updatedHistory;
-            preRunHistoryLength = updatedHistory.length;
+            markHistoryStrippedBestEffort(ctx.conversationId);
           }
         }
       }
@@ -2619,86 +2061,35 @@ export async function runAgentLoopImpl(
           ctx.emitActivityState("thinking", "context_compacting", {
             requestId: reqId,
           });
-          let emergencyCompact: Awaited<
-            ReturnType<typeof ctx.contextWindowManager.maybeCompact>
-          > | null = null;
-          try {
-            emergencyCompact = (await runPipeline<
-              CompactionArgs,
-              CompactionResult
-            >(
-              "compaction",
-              getMiddlewaresFor("compaction"),
-              (args) =>
-                defaultCompactionTerminal(
-                  args,
-                  buildPluginTurnContext(ctx, reqId),
-                ),
-              {
-                messages: ctx.messages,
-                signal: abortController.signal,
-                options: {
-                  lastCompactedAt: ctx.contextCompactedAt ?? undefined,
-                  force: true,
-                  minKeepRecentUserTurns: 0,
-                  targetInputTokensOverride: correctedTarget,
-                  overrideProfile: resolveCurrentOverrideProfile() ?? null,
-                },
-              },
-              buildPluginTurnContext(ctx, reqId),
-              DEFAULT_TIMEOUTS.compaction,
-            )) as Awaited<
-              ReturnType<typeof ctx.contextWindowManager.maybeCompact>
-            >;
-          } catch (err) {
-            if (err instanceof PluginTimeoutError) {
-              // Emergency compaction timed out. Record the circuit-breaker
-              // failure and fall through to the graceful-error path below
-              // (the unsuccessful-compaction fallback) rather than hard-
-              // failing the turn.
-              rlog.warn(
-                { err, phase: "emergency-compaction" },
-                "Emergency compaction pipeline timed out — continuing with overflow fallback",
-              );
-              await ctx.agentLoop.compactionCircuit.recordOutcome(
-                ctx,
-                true,
-                onEvent,
-              );
-              emergencyCompact = null;
-            } else {
-              throw err;
-            }
-          }
+          const emergencyCompact = await defaultCompact({
+            manager: ctx.contextWindowManager,
+            messages: ctx.messages,
+            signal: abortController.signal,
+            force: true,
+            minKeepRecentUserTurns: 0,
+            overrideProfile: resolveCurrentOverrideProfile() ?? null,
+          });
           // Only track when the summary LLM actually ran; `force: true`
-          // bypasses the cooldown but not the early-return paths.
-          if (
-            emergencyCompact &&
-            emergencyCompact.summaryFailed !== undefined
-          ) {
+          // bypasses the auto-threshold gate but not the early-return paths.
+          if (emergencyCompact.summaryFailed !== undefined) {
             await ctx.agentLoop.compactionCircuit.recordOutcome(
-              ctx,
               emergencyCompact.summaryFailed,
               onEvent,
             );
           }
-          if (emergencyCompact?.compacted) {
+          if (emergencyCompact.compacted) {
             await applySuccessfulCompaction(emergencyCompact, ctx.messages);
-            reducerCompacted = true;
-            shouldInjectWorkspace = true;
+            state.reducerCompacted = true;
           }
-          // Only re-inject NOW.md when ctx.messages was actually stripped;
-          // otherwise the existing block is still present.
+          // Only re-inject the memory-static block when ctx.messages was
+          // actually stripped; otherwise the existing block is still present.
+          // (The `<knowledge_base>`, NOW.md, and v2 static `<info>` blocks
+          // self-gate inside their injectors on whether they are already
+          // present in `ctx.messages`.)
           const injection = await applyRuntimeInjections(ctx.messages, {
             ...injectionOpts,
-            pkbContext: currentPkbContent,
-            memoryV2Static: convergenceStripped ? currentMemoryV2Static : null,
-            nowScratchpad: convergenceStripped ? currentNowContent : null,
-            workspaceTopLevelContext: shouldInjectWorkspace
-              ? ctx.workspaceTopLevelContext
-              : null,
-            slackChronologicalMessages: reducerCompacted
+            slackChronologicalMessages: state.reducerCompacted
               ? null
               : injectionOpts.slackChronologicalMessages,
             mode: currentInjectionMode,
@@ -2716,8 +2107,6 @@ export async function runAgentLoopImpl(
             );
             runMessages = fallbackStrip.messages;
           }
-          preRepairMessages = runMessages;
-          preRunHistoryLength = runMessages.length;
           state.contextTooLargeDetected = false;
           updatedHistory = await runAgentLoop(runMessages);
@@ -2771,44 +2160,11 @@ export async function runAgentLoopImpl(
       onEvent(buildConversationErrorMessage(ctx.conversationId, classified));
     }
-    // Reconcile synthesized cancellation tool_results
-    for (let i = preRunHistoryLength; i < updatedHistory.length; i++) {
-      const msg = updatedHistory[i];
-      if (msg.role === "user") {
-        for (const block of msg.content) {
-          if (
-            block.type === "tool_result" &&
-            !state.pendingToolResults.has(block.tool_use_id) &&
-            !state.persistedToolUseIds.has(block.tool_use_id)
-          ) {
-            state.pendingToolResults.set(block.tool_use_id, {
-              content: block.content,
-              isError: block.is_error ?? false,
-            });
-          }
-        }
-      }
-    }
-    // Flush remaining tool results
+    // Flush remaining tool results. On a normal turn these drain at the next
+    // `message_complete`; an aborted or yielded loop exits with them still
+    // buffered, so finalize the (possibly already on-arrival-reserved) grouped
+    // row here rather than writing a duplicate.
     if (state.pendingToolResults.size > 0) {
-      const toolResultBlocks = Array.from(
-        state.pendingToolResults.entries(),
-      ).map(([toolUseId, result]) => ({
-        type: "tool_result",
-        tool_use_id: toolUseId,
-        content: redactSecrets(result.content),
-        is_error: result.isError,
-        ...(result.contentBlocks
-          ? {
-              contentBlocks: result.contentBlocks.map((block) =>
-                block.type === "text"
-                  ? { ...block, text: redactSecrets(block.text) }
-                  : block,
-              ),
-            }
-          : {}),
-      }));
       const toolResultMetadata = {
         ...provenanceFromTrustContext(ctx.trustContext),
         userMessageChannel: capturedTurnChannelContext.userMessageChannel,
@@ -2818,21 +2174,12 @@ export async function runAgentLoopImpl(
         assistantMessageInterface:
           capturedTurnInterfaceContext.assistantMessageInterface,
       };
-      await runPipeline<PersistArgs, PersistResult>(
-        "persistence",
-        getMiddlewaresFor("persistence"),
-        defaultPersistenceTerminal,
-        {
-          op: "add",
-          conversationId: ctx.conversationId,
-          role: "user",
-          content: JSON.stringify(toolResultBlocks),
-          metadata: toolResultMetadata,
-        },
-        buildPluginTurnContext(ctx, reqId),
-        DEFAULT_TIMEOUTS.persistence,
+      await finalizePendingToolResultRow(
+        state,
+        ctx.conversationId,
+        toolResultMetadata,
+        rlog,
       );
-      state.pendingToolResults.clear();
     }
     // Persist the budget_yield_unrecovered notice now that any pending
@@ -2856,24 +2203,13 @@ export async function runAgentLoopImpl(
       };
       let yieldNoticePersistedId: string | null = null;
       try {
-        const yieldPersistResult = (await runPipeline<
-          PersistArgs,
-          PersistResult
-        >(
-          "persistence",
-          getMiddlewaresFor("persistence"),
-          defaultPersistenceTerminal,
-          {
-            op: "add",
-            conversationId: ctx.conversationId,
-            role: "assistant",
-            content: JSON.stringify(yieldNoticeMessage.content),
-            metadata: yieldNoticeMetadata,
-          },
-          buildPluginTurnContext(ctx, reqId),
-          DEFAULT_TIMEOUTS.persistence,
-        )) as PersistAddResult;
-        yieldNoticePersistedId = yieldPersistResult.message.id;
+        const yieldRow = await addMessage(
+          ctx.conversationId,
+          "assistant",
+          JSON.stringify(yieldNoticeMessage.content),
+          { metadata: yieldNoticeMetadata },
+        );
+        yieldNoticePersistedId = yieldRow.id;
       } catch (err) {
         // Non-fatal — a DB hiccup must not escalate a budget-yield exit into
         // a turn-level throw. The live SSE event was already emitted, so the
@@ -2929,7 +2265,7 @@ export async function runAgentLoopImpl(
     }
     // Reconstruct history
-    const newMessages = updatedHistory.slice(preRunHistoryLength).map((msg) => {
+    const newMessages = lastRunNewMessages.map((msg) => {
       if (msg.role !== "assistant") return msg;
       const { cleanedContent } = cleanAssistantContent(msg.content);
       const cleanedBlocks = cleanedContent as ContentBlock[];
@@ -2960,10 +2296,6 @@ export async function runAgentLoopImpl(
         state.assistantRowAwaitingFinalization &&
         state.lastAssistantMessageId
       ) {
-        // Direct `deleteMessageById` (not via the `persistence` pipeline):
-        // see the same rationale on the matching cleanup in
-        // `handleLlmCallStarted` — an unfinalized reservation has no
-        // observable history for plugins.
         try {
           deleteMessageById(state.lastAssistantMessageId);
         } catch (err) {
@@ -2985,20 +2317,12 @@ export async function runAgentLoopImpl(
       const errorAssistantMessage = createAssistantMessage(
         state.providerErrorUserMessage,
       );
-      const errorPersistResult = (await runPipeline<PersistArgs, PersistResult>(
-        "persistence",
-        getMiddlewaresFor("persistence"),
-        defaultPersistenceTerminal,
-        {
-          op: "add",
-          conversationId: ctx.conversationId,
-          role: "assistant",
-          content: JSON.stringify(errorAssistantMessage.content),
-          metadata: errChannelMeta,
-        },
-        buildPluginTurnContext(ctx, reqId),
-        DEFAULT_TIMEOUTS.persistence,
-      )) as PersistAddResult;
+      const errorRow = await addMessage(
+        ctx.conversationId,
+        "assistant",
+        JSON.stringify(errorAssistantMessage.content),
+        { metadata: errChannelMeta },
+      );
       persistedErrorAssistantMessage = true;
       // Repoint `lastAssistantMessageId` at the synthetic error row so the
       // post-loop sync, attachment resolution, and `message_complete`/
@@ -3007,7 +2331,7 @@ export async function runAgentLoopImpl(
       // above. Mark finalization complete so the next LLM call in this run
       // (or a downstream handler) doesn't try to clean up an id that
       // already corresponds to a finalized row.
-      state.lastAssistantMessageId = errorPersistResult.message.id;
+      state.lastAssistantMessageId = errorRow.id;
       state.assistantRowAwaitingFinalization = false;
       newMessages.push(errorAssistantMessage);
       // Pipe the just-assigned message id into any orphaned LLM request log
@@ -3021,10 +2345,7 @@ export async function runAgentLoopImpl(
       // other conversations cannot collide. Non-fatal — a DB hiccup must
       // not escalate a provider rejection into a turn-level throw.
       try {
-        backfillMessageIdOnLogs(
-          ctx.conversationId,
-          errorPersistResult.message.id,
-        );
+        backfillMessageIdOnLogs(ctx.conversationId, errorRow.id);
       } catch (err) {
         rlog.warn(
           { err },
@@ -3037,7 +2358,16 @@ export async function runAgentLoopImpl(
       // would create a duplicate plain-text bubble below the alert card.
     }
-    let restoredHistory = [...preRepairMessages, ...newMessages];
+    // Base persisted into `ctx.messages` is the loop's own returned history
+    // (minus the tail it appended this run), with the cleaned `newMessages`
+    // re-appended on top. Sourcing the base from the loop keeps it in lockstep
+    // with any in-loop compaction without the orchestrator maintaining a
+    // parallel snapshot across re-entry sites.
+    const loopBase = updatedHistory.slice(
+      0,
+      updatedHistory.length - lastRunNewMessages.length,
+    );
+    let restoredHistory = [...loopBase, ...newMessages];
     // Post-turn tool result truncation: save large results to disk and
     // replace in-context content with a prefix/suffix stub + file pointer.
@@ -3229,30 +2559,6 @@ export async function runAgentLoopImpl(
         publishLoopMessagesChanged();
       }
     }
-    // Second title pass: after 3 completed turns, re-generate the title
-    // using the last 3 messages for better context. Only fires when the
-    // current title was auto-generated (isAutoTitle = 1) and the user
-    // has not opted out via `conversations.skipAutoRetitling`.
-    if (ctx.turnCount === 2 && !getConfig().conversations.skipAutoRetitling) {
-      // turnCount is 0-indexed, incremented in finally; 2 = about to become 3rd turn
-      queueRegenerateConversationTitle({
-        conversationId: ctx.conversationId,
-        provider: ctx.provider,
-        onTitleUpdated: (title) => {
-          onEvent({
-            type: "conversation_title_updated",
-            conversationId: ctx.conversationId,
-            title,
-          });
-          onEvent({
-            type: "sync_changed",
-            tags: [conversationMetadataSyncTag(ctx.conversationId)],
-          });
-        },
-        signal: abortController.signal,
-      });
-    }
   } catch (err) {
     const errorCtx = {
       phase: "agent_loop" as const,
@@ -3312,8 +2618,6 @@ export async function runAgentLoopImpl(
     }
   } finally {
     if (turnStarted) {
-      cleanupBootstrapAfterTurnThreshold(ctx.conversationId);
       ctx.turnCount++;
       const config = getConfig();
       const maxWait = config.workspaceGit?.turnCommitMaxWaitMs ?? 4000;
@@ -3351,7 +2655,7 @@ export async function runAgentLoopImpl(
     ctx.profiler.emitSummary(ctx.traceEmitter, reqId);
     ctx.abortController = null;
-    ctx.processing = false;
+    ctx.setProcessing(false);
     ctx.onConfirmationOutcome = undefined;
     ctx.surfaceActionRequestIds.delete(ctx.currentRequestId ?? "");
     ctx.approvedViaPromptThisTurn = false;
@@ -3498,7 +2802,7 @@ export async function applyCompactionResult(
     result.summaryText,
     ctx.contextCompactedMessageCount,
   );
-  markHistoryStrippedBestEffort(ctx.conversationId, compactedAt, log);
+  markHistoryStrippedBestEffort(ctx.conversationId);
   if (options.slackContextCompactionWatermarkTs) {
     updateConversationSlackContextWatermark(
       ctx.conversationId,