npm - @loreai/gateway - Versions diffs - 0.13.3 → 0.14.0 - Mend

@loreai/gateway 0.13.3 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/dist/index.js +49694 -3155
package/package.json +14 -6
package/src/batch-queue.ts +21 -1
package/src/cache-analytics.ts +344 -0
package/src/cli/agents.ts +107 -0
package/src/cli/bin.ts +11 -0
package/src/cli/help.ts +55 -0
package/src/cli/lib/binary.ts +353 -0
package/src/cli/lib/bspatch.ts +306 -0
package/src/cli/lib/delta-upgrade.ts +790 -0
package/src/cli/lib/errors.ts +48 -0
package/src/cli/lib/ghcr.ts +389 -0
package/src/cli/lib/patch-cache.ts +342 -0
package/src/cli/lib/upgrade.ts +454 -0
package/src/cli/lib/version-check.ts +385 -0
package/src/cli/main.ts +152 -0
package/src/cli/run.ts +181 -0
package/src/cli/start.ts +82 -0
package/src/cli/upgrade.ts +311 -0
package/src/cli/version.ts +22 -0
package/src/idle.ts +0 -6
package/src/index.ts +27 -27
package/src/llm-adapter.ts +100 -28
package/src/pipeline.ts +254 -177
package/src/recall.ts +223 -91
package/src/temporal-adapter.ts +3 -0
package/src/translate/anthropic.ts +50 -6
package/src/translate/types.ts +54 -9
package/dist/index.js.map +0 -7

package/src/pipeline.ts CHANGED Viewed

@@ -16,7 +16,6 @@ import {
   load,
   config as loreConfig,
   ensureProject,
-  isFirstRun,
   temporal,
   ltm,
   distillation,
@@ -92,6 +91,7 @@ import {
 import type { UpstreamInterceptor } from "./recorder";
 import { startIdleScheduler, buildIdleWorkHandler } from "./idle";
 import { getWorkerModel, resetWorkerModelState } from "./worker-model";
+import { analyzeCacheTurn } from "./cache-analytics";
 import {
   RECALL_GATEWAY_TOOL,
   RECALL_TOOL_NAME,
@@ -100,10 +100,12 @@ import {
   hasRecallToolUse,
   hasOtherToolUse,
   clientHasRecallTool,
-  isPendingRecallValid,
-  injectPendingRecall,
   buildRecallFollowUp,
-  stripRecallFromResponse,
+  buildRecallMarker,
+  recallStoreKey,
+  expandRecallMarkers,
+  cleanupRecallStore,
+  replaceRecallWithMarker,
 } from "./recall";
 // ---------------------------------------------------------------------------
@@ -143,6 +145,7 @@ export async function resetPipelineState(): Promise<void> {
   cachedProjectPath = null;
   sessions.clear();
   ltmSessionCache.clear();
+  ltmPinnedText.clear();
   // Shut down batch queue gracefully before clearing the client
   if (llmClient && "shutdown" in llmClient) {
     await (llmClient as LLMClient & { shutdown: () => Promise<void> }).shutdown();
@@ -175,6 +178,46 @@ const ltmSessionCache = new Map<
   { formatted: string; tokenCount: number }
 >();
+/**
+ * Pinned LTM text, keyed by session ID — the formatted LTM text (and its
+ * token count) most recently chosen for injection for that session. It is
+ * handed upstream as a separate LTM system block rather than being appended
+ * to the host system prompt.
+ *
+ * When ltmSessionCache is invalidated and recomputed, the new text is
+ * compared against the pin with textDiffRatio(). The pin is kept while the
+ * ratio is below 0.05 and replaced once it reaches 0.05 or more (or when no
+ * pin exists yet), to avoid cache busts from minor BM25 re-ranking changes.
+ *
+ * Cleared together with the other per-session maps in resetPipelineState().
+ */
+const ltmPinnedText = new Map<
+  string,
+  { formatted: string; tokenCount: number }
+>();
+/**
+ * Estimate how different two strings are, as a ratio in the range 0..1.
+ *
+ * This is a heuristic, not a full diff. It counts the length of the shared
+ * leading run plus the length of the shared trailing run and returns
+ * `1 - matched / max(a.length, b.length)`. The trailing scan is limited to
+ * the part of the shorter string not already covered by the leading run, so
+ * the two runs never overlap. Strings are compared index by index (UTF-16
+ * code units). This is sufficient to tell "substantially the same" from
+ * "meaningfully different".
+ *
+ * @param a - First string.
+ * @param b - Second string.
+ * @returns 0 when the strings are identical (including both empty), 1 when
+ *   exactly one of them is empty, otherwise the unmatched fraction of the
+ *   longer string.
+ */
+function textDiffRatio(a: string, b: string): number {
+  if (a === b) return 0;
+  if (!a || !b) return 1;
+  // Length of the common prefix (stops at the first mismatch)
+  const minLen = Math.min(a.length, b.length);
+  const maxLen = Math.max(a.length, b.length);
+  let common = 0;
+  for (let i = 0; i < minLen; i++) {
+    if (a[i] === b[i]) common++;
+    else break;
+  }
+  // Length of the common suffix, scanned from the end of each string and capped at minLen - common so it cannot overlap the prefix
+  let suffix = 0;
+  for (let i = 0; i < minLen - common; i++) {
+    if (a[a.length - 1 - i] === b[b.length - 1 - i]) suffix++;
+    else break;
+  }
+  const matched = common + suffix;
+  return 1 - matched / maxLen;
+}
 /** Cached LLM client for background workers. */
 let llmClient: LLMClient | null = null;
@@ -242,8 +285,6 @@ async function initIfNeeded(projectPath: string, config?: GatewayConfig): Promis
       config.upstreamAnthropic,
       () => resolveAuth(),
       sessionModelID,
-      // onLtmInvalidated: clear the LTM session cache
-      () => ltmSessionCache.clear(),
     );
     stopIdleScheduler = startIdleScheduler(config, sessions, idleHandler);
   }
@@ -298,17 +339,23 @@ function getOrCreateSession(
       lastRequestTime: Date.now(),
       messageCount: 0,
       turnsSinceCuration: 0,
+      recallStore: new Map(),
+      cacheAnalytics: {
+        lastRequestBody: null,
+        lastRequestBodyLength: 0,
+        lastCacheRead: 0,
+        lastCacheCreation: 0,
+        turnCount: 0,
+        bustCount: 0,
+      },
     };
     sessions.set(sessionID, state);
   }
   state.lastRequestTime = Date.now();
-  // Lazy cleanup: discard expired pending recall on access
-  if (state.pendingRecall && !isPendingRecallValid(state.pendingRecall)) {
-    log.warn(
-      `lazy cleanup: discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
-    );
-    state.pendingRecall = undefined;
+  // Ensure recallStore exists (upgrade from older session state)
+  if (!state.recallStore) {
+    state.recallStore = new Map();
   }
   return state;
@@ -369,6 +416,13 @@ async function identifySession(
 // Upstream forwarding
 // ---------------------------------------------------------------------------
+/**
+ * Result from forwardToUpstream — the upstream `response` (from fetch or
+ * from the interceptor) together with the serialized request body, which is
+ * kept for cache analytics.
+ */
+type UpstreamResult = {
+  response: Response;
+  /**
+   * The serialized JSON body sent to the upstream provider: the single
+   * JSON.stringify() result that is also used as the fetch request body.
+   */
+  serializedBody: string;
+};
 /**
  * Forward a request to the upstream provider (Anthropic or OpenAI).
  *
@@ -376,14 +430,15 @@ async function identifySession(
  * interceptor is called instead of `fetch` directly.  This enables recording
  * and replay without modifying individual call sites.
  *
- * Returns the raw fetch Response (may be streaming or non-streaming).
+ * Returns the raw fetch Response alongside the serialized request body
+ * (for cache analytics prefix comparison).
  */
 async function forwardToUpstream(
   req: GatewayRequest,
   config: GatewayConfig,
   interceptor?: UpstreamInterceptor,
   cache?: AnthropicCacheOptions,
-): Promise<Response> {
+): Promise<UpstreamResult> {
   let url: string;
   let headers: Record<string, string>;
   let body: unknown;
@@ -405,10 +460,11 @@ async function forwardToUpstream(
     body = result.body;
   }
+  const serializedBody = JSON.stringify(body);
   const effectiveInterceptor = interceptor ?? activeInterceptor;
   if (effectiveInterceptor) {
-    return effectiveInterceptor(
+    const response = await effectiveInterceptor(
       body,
       req.model,
       req.stream,
@@ -416,16 +472,18 @@ async function forwardToUpstream(
         fetch(url, {
           method: "POST",
           headers,
-          body: JSON.stringify(body),
+          body: serializedBody,
         }),
     );
+    return { response, serializedBody };
   }
-  return fetch(url, {
+  const response = await fetch(url, {
     method: "POST",
     headers,
-    body: JSON.stringify(body),
+    body: serializedBody,
   });
+  return { response, serializedBody };
 }
 // ---------------------------------------------------------------------------
@@ -483,44 +541,46 @@ function buildStreamingResponse(
               recallContext.sessionState.sessionID,
             );
+            const scope = input.scope ?? "all";
+            // Store recall result for marker round-trip expansion
+            const storeKey = recallStoreKey(input.query, scope);
+            const position = resp.content.indexOf(recallBlock);
+            recallContext.sessionState.recallStore.set(storeKey, {
+              toolUseId: recallBlock.id,
+              input,
+              position,
+              result,
+            });
+            // Emit marker text block in place of the suppressed recall block
+            const markerText = buildRecallMarker(input.query, scope);
+            const markerIdx = recallAccum.clientBlockCount();
+            const syntheticMarker = [
+              formatSSEEvent("content_block_start", JSON.stringify({
+                type: "content_block_start",
+                index: markerIdx,
+                content_block: { type: "text", text: "" },
+              })),
+              formatSSEEvent("content_block_delta", JSON.stringify({
+                type: "content_block_delta",
+                index: markerIdx,
+                delta: { type: "text_delta", text: markerText },
+              })),
+              formatSSEEvent("content_block_stop", JSON.stringify({
+                type: "content_block_stop",
+                index: markerIdx,
+              })),
+            ].join("");
+            controller.enqueue(encoder.encode(syntheticMarker));
             if (recallAccum.hasOtherTools()) {
-              // Case 2: mixed tools — store pending, forward held-back events
-              const position = resp.content.indexOf(recallBlock);
-              recallContext.sessionState.pendingRecall = {
-                toolUseId: recallBlock.id,
-                input,
-                position,
-                result,
-                timestamp: Date.now(),
-              };
+              // Forward held-back events, close stream
               log.info(
-                `recall (stream, mixed): stored pending result for session ` +
+                `recall (stream, mixed): stored result for session ` +
                   `${recallContext.sessionState.sessionID.slice(0, 16)}`,
               );
-              // Emit a synthetic "[Searching memory...]" text block after all
-              // other tool blocks. The accumulator already re-indexed other
-              // tools to fill the gap, so this goes at clientBlockCount.
-              const searchingIdx = recallAccum.clientBlockCount();
-              const syntheticCase2 = [
-                formatSSEEvent("content_block_start", JSON.stringify({
-                  type: "content_block_start",
-                  index: searchingIdx,
-                  content_block: { type: "text", text: "" },
-                })),
-                formatSSEEvent("content_block_delta", JSON.stringify({
-                  type: "content_block_delta",
-                  index: searchingIdx,
-                  delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
-                })),
-                formatSSEEvent("content_block_stop", JSON.stringify({
-                  type: "content_block_stop",
-                  index: searchingIdx,
-                })),
-              ].join("");
-              controller.enqueue(encoder.encode(syntheticCase2));
-              // Forward the held-back message_delta + message_stop
               const heldBack = recallAccum.heldBackEvents();
               if (heldBack) {
                 controller.enqueue(encoder.encode(heldBack));
@@ -528,51 +588,50 @@ function buildStreamingResponse(
               controller.close();
-              // Post-stream: use stripped response for temporal storage
-              const cleanResp = stripRecallFromResponse(resp);
-              onComplete(cleanResp);
+              // Post-stream: store response with marker text (not raw tool_use)
+              const markerResp = replaceRecallWithMarker(resp);
+              onComplete(markerResp);
               return;
             }
-            // Case 1: recall-only — send follow-up, pipe continuation
+            // Recall-only — send follow-up, pipe continuation
             log.info(
               `recall (stream, only): executing follow-up for session ` +
                 `${recallContext.sessionState.sessionID.slice(0, 16)}`,
             );
-            // Emit a synthetic "[Searching memory...]" text block at the
-            // suppressed recall index so the client sees a natural indicator
-            // during the pause while the recall executes.
-            const searchingIndex = recallAccum.clientBlockCount();
-            const syntheticBlock = [
-              formatSSEEvent("content_block_start", JSON.stringify({
-                type: "content_block_start",
-                index: searchingIndex,
-                content_block: { type: "text", text: "" },
-              })),
-              formatSSEEvent("content_block_delta", JSON.stringify({
-                type: "content_block_delta",
-                index: searchingIndex,
-                delta: { type: "text_delta", text: "\n\n[Searching memory...]" },
-              })),
-              formatSSEEvent("content_block_stop", JSON.stringify({
-                type: "content_block_stop",
-                index: searchingIndex,
-              })),
-            ].join("");
-            controller.enqueue(encoder.encode(syntheticBlock));
             const followUp = buildRecallFollowUp(
               recallContext.modifiedReq,
               resp,
               result,
               recallBlock,
             );
-            const followUpResponse = await forwardToUpstream(
-              followUp,
-              recallContext.config,
-              undefined,
-              recallContext.cacheOptions,
+             let followUpResponse: Response;
+            try {
+              ({ response: followUpResponse } = await forwardToUpstream(
+                followUp,
+                recallContext.config,
+                undefined,
+                recallContext.cacheOptions,
+              ));
+            } catch (fetchErr) {
+              log.error(
+                `recall follow-up fetch error for session ${recallContext.sessionState.sessionID.slice(0, 16)}:`,
+                fetchErr,
+              );
+              const heldBack = recallAccum.heldBackEvents();
+              if (heldBack) {
+                controller.enqueue(encoder.encode(heldBack));
+              }
+              controller.close();
+              const markerResp = replaceRecallWithMarker(resp);
+              onComplete(markerResp);
+              return;
+            }
+            log.info(
+              `recall follow-up response: status=${followUpResponse.status} ` +
+                `hasBody=${!!followUpResponse.body} session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
             );
             if (!followUpResponse.ok) {
@@ -586,22 +645,21 @@ function buildStreamingResponse(
                 controller.enqueue(encoder.encode(heldBack));
               }
               controller.close();
-              const cleanResp = stripRecallFromResponse(resp);
-              onComplete(cleanResp);
+              const markerResp = replaceRecallWithMarker(resp);
+              onComplete(markerResp);
               return;
             }
             // Pipe the continuation stream into the same HTTP response.
             // Suppress message_start (client already has one) and re-index
             // content blocks to continue from where the client left off.
-            // +1 accounts for the synthetic "[Searching memory...]" block.
-            // Use clientBlockCount (not recallBlockIndex) — this is the number
-            // of blocks the client has already seen, so continuation blocks
-            // start at clientBlockCount + 1 (for the synthetic block).
+            // +1 accounts for the synthetic marker block.
             const blockOffset = recallAccum.clientBlockCount() + 1;
             const contReader = followUpResponse.body!.getReader();
+            let contEventCount = 0;
             for await (const { event: contEvent, data: contData } of parseSSEStream(contReader)) {
+              contEventCount++;
               if (contEvent === "message_start") {
                 // Suppress — client already received one
                 continue;
@@ -634,19 +692,18 @@ function buildStreamingResponse(
               controller.enqueue(encoder.encode(forwarded));
             }
+            log.info(
+              `recall follow-up stream complete: ${contEventCount} events piped, ` +
+                `session=${recallContext.sessionState.sessionID.slice(0, 16)}`,
+            );
             controller.close();
-            // Post-stream: accumulate the continuation for temporal storage.
-            // We use resp (original) + continuation for a complete picture,
-            // but for simplicity just store the continuation response since
-            // it's what the model actually produced for the client.
-            // The continuation accumulator was not wired — use the original
-            // response's pre-recall content + continuation's content.
-            // For now, call onComplete with the original response so at least
-            // the pre-recall content is stored. The continuation's text is
-            // visible to the client but not separately stored — acceptable
-            // since temporal storage captures the full conversation on next turn.
-            onComplete(resp);
+            // Post-stream: store response with marker text for temporal storage.
+            // The marker replaces the raw tool_use, so future turns can
+            // round-trip the marker ↔ tool_use/tool_result correctly.
+            const markerResp = replaceRecallWithMarker(resp);
+            onComplete(markerResp);
             return;
           }
         }
@@ -795,6 +852,8 @@ function postResponse(
   resp: GatewayResponse,
   sessionState: SessionState,
   config: GatewayConfig,
+  /** Serialized JSON body sent upstream — for cache prefix comparison. */
+  requestBody?: string,
 ): void {
   const { sessionID, projectPath } = sessionState;
@@ -810,6 +869,11 @@ function postResponse(
       getLastTransformedCount(sessionID),
     );
+    // --- Cache analytics ---
+    if (requestBody) {
+      analyzeCacheTurn(sessionState.cacheAnalytics, requestBody, resp.usage, sessionID);
+    }
     // --- Temporal storage ---
     // Store all messages (user + assistant) from this turn.
     // Convert gateway messages to Lore format.
@@ -1010,7 +1074,7 @@ async function handlePassthrough(
   req: GatewayRequest,
   config: GatewayConfig,
 ): Promise<Response> {
-  const upstreamResponse = await forwardToUpstream(req, config);
+  const { response: upstreamResponse } = await forwardToUpstream(req, config);
   // For streaming, pipe through unchanged
   if (req.stream && upstreamResponse.body) {
@@ -1079,25 +1143,18 @@ async function handleConversationTurn(
   // Track session model for worker model discovery
   lastSeenSessionModel = req.model;
-  // --- Inject pending recall from previous turn (Case 2: mixed tools) ---
-  if (sessionState.pendingRecall) {
-    if (isPendingRecallValid(sessionState.pendingRecall)) {
-      const injected = injectPendingRecall(req, sessionState.pendingRecall);
-      if (injected) {
-        log.info(
-          `injected pending recall result into request for session ${sessionID.slice(0, 16)}`,
-        );
-      } else {
-        log.warn(
-          `failed to inject pending recall — conversation structure mismatch`,
-        );
-      }
-    } else {
-      log.warn(
-        `discarding expired pending recall for session ${sessionID.slice(0, 16)}`,
+  // --- Expand recall markers from previous turns ---
+  // Scan all assistant messages for marker text blocks and restore them
+  // to tool_use + tool_result pairs before forwarding upstream.
+  if (sessionState.recallStore.size > 0) {
+    const expanded = expandRecallMarkers(req, sessionState.recallStore);
+    if (expanded) {
+      log.info(
+        `expanded recall markers for session ${sessionID.slice(0, 16)}`,
       );
     }
-    sessionState.pendingRecall = undefined;
+    // Clean up orphaned store entries (markers evicted by gradient)
+    cleanupRecallStore(req, sessionState.recallStore);
   }
   log.info(
@@ -1130,8 +1187,8 @@ async function handleConversationTurn(
     );
   }
-  // --- 6. LTM injection into system prompt ---
-  let modifiedSystem = req.system;
+  // --- 6. LTM injection (kept separate from host system prompt for caching) ---
+  let ltmText: string | undefined;
   if (cfg.knowledge.enabled) {
     try {
       let cached = ltmSessionCache.get(sessionID);
@@ -1159,8 +1216,21 @@ async function handleConversationTurn(
       }
       if (cached) {
-        setLtmTokens(cached.tokenCount, sessionID);
-        modifiedSystem = `${req.system}\n\n${cached.formatted}`;
+        // Content-diff pinning: only update the injected LTM text if the
+        // new content differs by >5% from what's currently pinned. This
+        // prevents cache busts from minor BM25 re-ranking after background
+        // curation/consolidation invalidates the LTM cache.
+        const pinned = ltmPinnedText.get(sessionID);
+        if (pinned && textDiffRatio(pinned.formatted, cached.formatted) < 0.05) {
+          // Near-identical — keep the pinned text to preserve cache prefix
+          ltmText = pinned.formatted;
+          setLtmTokens(pinned.tokenCount, sessionID);
+        } else {
+          // Substantially different or first injection — pin the new text
+          ltmPinnedText.set(sessionID, cached);
+          ltmText = cached.formatted;
+          setLtmTokens(cached.tokenCount, sessionID);
+        }
       } else {
         setLtmTokens(0, sessionID);
       }
@@ -1175,25 +1245,6 @@ async function handleConversationTurn(
     consumeCameOutOfIdle(sessionID);
   }
-  // First-run greeting
-  if (isFirstRun()) {
-    modifiedSystem +=
-      "\n\n[Lore plugin] This is the first time Lore has been activated. " +
-      "Briefly let the user know that Lore is now active and their " +
-      "coding agent will get progressively smarter on this codebase " +
-      "over time as knowledge accumulates across sessions.";
-  }
-  // Lore knowledge file commit reminder
-  if (cfg.knowledge.enabled) {
-    const filesToTrack = [".lore.md"];
-    if (cfg.agentsFile.enabled) filesToTrack.push(cfg.agentsFile.path);
-    modifiedSystem +=
-      `\n\nWhen making git commits, always check if ${filesToTrack.join(" and ")} ` +
-      `have unstaged changes and include them in the commit. These files contain ` +
-      `shared project knowledge managed by lore and must be version-controlled.`;
-  }
   // --- 7. Gradient transform on messages ---
   const loreMessages = gatewayMessagesToLore(req.messages, sessionID);
   resolveToolResults(loreMessages);
@@ -1225,34 +1276,54 @@ async function handleConversationTurn(
   const modifiedReq: GatewayRequest = {
     ...req,
-    system: modifiedSystem,
+    // Host system prompt is passed through unmodified — LTM is injected
+    // as a separate system block via cache options for prefix stability.
     messages: transformedMessages,
   };
-  // --- 8b. Inject recall tool ---
+  // --- 8b. Inject recall tool (with git reminder appended to description) ---
   // Only inject if the client doesn't already have a recall tool (e.g. from
   // a host plugin like OpenCode) and the request has other tools (so it's a
   // coding agent, not a bare chat).
   if (modifiedReq.tools.length > 0 && !clientHasRecallTool(modifiedReq.tools)) {
-    modifiedReq.tools = [...modifiedReq.tools, RECALL_GATEWAY_TOOL];
+    // Build the recall tool with git reminder baked into its description.
+    // This keeps the reminder in the stable tools prefix (1h cache) rather
+    // than the volatile system prompt.
+    const recallTool = cfg.knowledge.enabled
+      ? {
+          ...RECALL_GATEWAY_TOOL,
+          description:
+            RECALL_GATEWAY_TOOL.description +
+            "\n\nWhen making git commits, always check if .lore.md " +
+            "has unstaged changes and include it in the commit. " +
+            "This file contains shared project knowledge managed " +
+            "by lore and must be version-controlled.",
+        }
+      : RECALL_GATEWAY_TOOL;
+    modifiedReq.tools = [...modifiedReq.tools, recallTool];
   }
   // --- 9. Forward to upstream ---
-  // Enable prompt caching for conversation turns:
-  //  - System prompt: explicit breakpoint with 5m TTL (frequent turns)
-  //  - Conversation: breakpoint on last block so Anthropic caches the prefix
+  // Enable prompt caching for conversation turns with layered breakpoints:
+  //  - System prompt: 1h TTL (host prompt is very stable within a session)
+  //  - LTM: separate system block (no breakpoint, benefits from prefix)
+  //  - Tools: 1h TTL on last tool (recall + git reminder are static)
+  //  - Conversation: 5m TTL on last message block
   // Title/summary passthrough (handlePassthrough) never reaches here — it
   // forwards the raw request without buildAnthropicRequest, so no caching.
   const cacheOptions: AnthropicCacheOptions = {
-    systemTTL: "5m",
+    systemTTL: "1h",
+    ltmSystem: ltmText,
+    cacheTools: true,
     cacheConversation: true,
   };
-  const upstreamResponse = await forwardToUpstream(
-    modifiedReq,
-    config,
-    undefined,
-    cacheOptions,
-  );
+  const { response: upstreamResponse, serializedBody: requestBody } =
+    await forwardToUpstream(
+      modifiedReq,
+      config,
+      undefined,
+      cacheOptions,
+    );
   if (!upstreamResponse.ok) {
     const errorBody = await upstreamResponse.text();
@@ -1273,7 +1344,7 @@ async function handleConversationTurn(
     );
     return buildStreamingResponse(
       upstreamResponse,
-      (resp) => postResponse(req, resp, sessionState, config),
+      (resp) => postResponse(req, resp, sessionState, config, requestBody),
       hasRecallTool
         ? { modifiedReq, config, sessionState, cacheOptions }
         : undefined,
@@ -1292,46 +1363,49 @@ async function handleConversationTurn(
       sessionState.sessionID,
     );
+    // Store recall result for marker round-trip expansion
+    const storeKey = recallStoreKey(input.query, input.scope ?? "all");
+    const position = resp.content.indexOf(recallBlock);
+    sessionState.recallStore.set(storeKey, {
+      toolUseId: recallBlock.id,
+      input,
+      position,
+      result,
+    });
+    // Replace recall tool_use with marker text in the response
+    const markerResp = replaceRecallWithMarker(resp);
     if (hasOtherToolUse(resp)) {
-      // Case 2: recall + other tools — store pending, strip recall from response
-      const position = resp.content.indexOf(recallBlock);
-      sessionState.pendingRecall = {
-        toolUseId: recallBlock.id,
-        input,
-        position,
-        result,
-        timestamp: Date.now(),
-      };
+      // Mixed tools — return response with marker replacing recall tool_use
       log.info(
-        `recall (non-stream, mixed): stored pending result for session ${sessionState.sessionID.slice(0, 16)}`,
+        `recall (non-stream, mixed): stored result for session ${sessionState.sessionID.slice(0, 16)}`,
       );
-      const cleanResp = stripRecallFromResponse(resp);
-      postResponse(req, cleanResp, sessionState, config);
-      return nonStreamHttpResponse(cleanResp);
+      postResponse(req, markerResp, sessionState, config, requestBody);
+      return nonStreamHttpResponse(markerResp);
     }
-    // Case 1: recall-only — send follow-up request
+    // Recall-only — send follow-up request for seamless UX
     log.info(
       `recall (non-stream, only): executing follow-up for session ${sessionState.sessionID.slice(0, 16)}`,
     );
     const followUp = buildRecallFollowUp(modifiedReq, resp, result, recallBlock);
-    // Strip recall from the follow-up tools (already done by buildRecallFollowUp)
-    const followUpResponse = await forwardToUpstream(
+    let followUpResponse: Response;
+    ({ response: followUpResponse } = await forwardToUpstream(
       followUp,
       config,
       undefined,
       cacheOptions,
-    );
+    ));
     if (!followUpResponse.ok) {
       const errorBody = await followUpResponse.text();
       log.error(
         `recall follow-up upstream error: ${followUpResponse.status} ${errorBody.slice(0, 500)}`,
       );
-      // Fall back to the original response without recall
-      const cleanResp = stripRecallFromResponse(resp);
-      postResponse(req, cleanResp, sessionState, config);
-      return nonStreamHttpResponse(cleanResp);
+      // Fall back to response with marker (no continuation)
+      postResponse(req, markerResp, sessionState, config, requestBody);
+      return nonStreamHttpResponse(markerResp);
     }
     const continuationResp = await accumulateNonStreamResponse(followUpResponse);
@@ -1350,11 +1424,11 @@ async function handleConversationTurn(
         resp.usage.cacheCreationInputTokens;
     }
-    postResponse(req, continuationResp, sessionState, config);
+    postResponse(req, continuationResp, sessionState, config, requestBody);
     return nonStreamHttpResponse(continuationResp);
   }
-  postResponse(req, resp, sessionState, config);
+  postResponse(req, resp, sessionState, config, requestBody);
   return nonStreamHttpResponse(resp);
 }
@@ -1417,6 +1491,9 @@ export function loreMessagesToGateway(
           content.push({
             type: "thinking",
             thinking: (part as { text: string }).text ?? "",
+            ...((part as { signature?: string }).signature != null
+              ? { signature: (part as { signature?: string }).signature }
+              : undefined),
           });
           break;
         case "tool": {