npm - @librechat/agents - Versions diffs - 3.2.36 → 3.2.38 - Mend

@librechat/agents 3.2.36 → 3.2.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

package/dist/cjs/agents/AgentContext.cjs +1 -1
package/dist/cjs/agents/AgentContext.cjs.map +1 -1
package/dist/cjs/graphs/Graph.cjs +7 -8
package/dist/cjs/graphs/Graph.cjs.map +1 -1
package/dist/cjs/langfuse.cjs +16 -5
package/dist/cjs/langfuse.cjs.map +1 -1
package/dist/cjs/langfuseToolOutputTracing.cjs +7 -0
package/dist/cjs/langfuseToolOutputTracing.cjs.map +1 -1
package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +92 -3
package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +24 -4
package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
package/dist/cjs/main.cjs +2 -0
package/dist/cjs/messages/cache.cjs +183 -0
package/dist/cjs/messages/cache.cjs.map +1 -1
package/dist/cjs/summarization/node.cjs +1 -1
package/dist/cjs/summarization/node.cjs.map +1 -1
package/dist/cjs/tools/toolOutputReferences.cjs +28 -14
package/dist/cjs/tools/toolOutputReferences.cjs.map +1 -1
package/dist/esm/agents/AgentContext.mjs +2 -2
package/dist/esm/agents/AgentContext.mjs.map +1 -1
package/dist/esm/graphs/Graph.mjs +8 -9
package/dist/esm/graphs/Graph.mjs.map +1 -1
package/dist/esm/langfuse.mjs +16 -5
package/dist/esm/langfuse.mjs.map +1 -1
package/dist/esm/langfuseToolOutputTracing.mjs +7 -0
package/dist/esm/langfuseToolOutputTracing.mjs.map +1 -1
package/dist/esm/llm/anthropic/utils/message_inputs.mjs +92 -3
package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
package/dist/esm/llm/bedrock/utils/message_inputs.mjs +24 -4
package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
package/dist/esm/main.mjs +2 -2
package/dist/esm/messages/cache.mjs +182 -1
package/dist/esm/messages/cache.mjs.map +1 -1
package/dist/esm/summarization/node.mjs +2 -2
package/dist/esm/summarization/node.mjs.map +1 -1
package/dist/esm/tools/toolOutputReferences.mjs +28 -14
package/dist/esm/tools/toolOutputReferences.mjs.map +1 -1
package/dist/types/messages/cache.d.ts +40 -0
package/dist/types/types/graph.d.ts +2 -0
package/package.json +8 -5
package/src/agents/AgentContext.ts +2 -2
package/src/agents/__tests__/AgentContext.test.ts +3 -9
package/src/graphs/Graph.ts +65 -36
package/src/langfuse.ts +38 -4
package/src/langfuseToolOutputTracing.ts +18 -0
package/src/llm/anthropic/utils/message_inputs.ts +131 -3
package/src/llm/anthropic/utils/stripPrefillCache.test.ts +111 -0
package/src/llm/bedrock/utils/message_inputs.test.ts +129 -0
package/src/llm/bedrock/utils/message_inputs.ts +46 -4
package/src/llm/bedrock/utils/toolResultCachePoint.test.ts +103 -0
package/src/messages/cache.tail.test.ts +340 -0
package/src/messages/cache.ts +266 -0
package/src/messages/tailCacheConversion.test.ts +161 -0
package/src/scripts/bench-prompt-cache.ts +479 -0
package/src/specs/langfuse-config.test.ts +69 -2
package/src/specs/langfuse-metadata.test.ts +44 -0
package/src/specs/langfuse-tool-output-tracing.test.ts +6 -0
package/src/summarization/node.ts +2 -2
package/src/tools/__tests__/annotateMessagesForLLM.test.ts +50 -0
package/src/tools/toolOutputReferences.ts +34 -20
package/src/types/graph.ts +2 -0

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@librechat/agents",
-  "version": "3.2.36",
+  "version": "3.2.38",
   "main": "./dist/cjs/main.cjs",
   "module": "./dist/esm/main.mjs",
   "types": "./dist/types/index.d.ts",
@@ -139,6 +139,7 @@
     "tool": "node --trace-warnings -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/tools.ts --provider 'bedrock' --name 'Jo' --location 'New York, NY'",
     "search": "node -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/search.ts --provider 'bedrock' --name 'Jo' --location 'New York, NY'",
     "tool_search": "node -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/tool_search.ts",
+    "bench:cache": "node --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/bench-prompt-cache.ts",
     "subagent": "node -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/multi-agent-subagent.ts",
     "subagent:events": "node -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/subagent-event-driven-debug.ts",
     "subagent:tools": "node -r dotenv/config --loader ./tsconfig-paths-bootstrap.mjs --experimental-specifier-resolution=node ./src/scripts/subagent-tools-debug.ts",
@@ -211,7 +212,9 @@
     "uuid": "$uuid",
     "fast-xml-parser": "5.7.2",
     "ajv": "6.14.0",
-    "minimatch": "3.1.4"
+    "minimatch": "3.1.4",
+    "@opentelemetry/core": "^2.8.0",
+    "js-yaml": "^4.2.0"
   },
   "dependencies": {
     "@anthropic-ai/sdk": "^0.92.0",
@@ -229,9 +232,9 @@
     "@langchain/openai": "1.4.5",
     "@langchain/textsplitters": "^1.0.1",
     "@langchain/xai": "^1.3.17",
-    "@langfuse/langchain": "^5.3.0",
-    "@langfuse/otel": "^5.3.0",
-    "@langfuse/tracing": "^5.3.0",
+    "@langfuse/langchain": "^5.4.1",
+    "@langfuse/otel": "^5.4.1",
+    "@langfuse/tracing": "^5.4.1",
     "@opentelemetry/context-async-hooks": "2.7.1",
     "@opentelemetry/sdk-node": "^0.218.0",
     "@scarf/scarf": "^1.4.0",

package/src/agents/AgentContext.ts CHANGED

@@ -16,7 +16,7 @@ import {
   Providers,
 } from '@/common';
 import {
-  addCacheControl,
+  addTailCacheControl,
   addCacheControlToStablePrefixMessages,
   cloneMessage,
 } from '@/messages/cache';
@@ -689,7 +689,7 @@ export class AgentContext {
         dynamicTail.length === 0 &&
         body.length >= 2
       ) {
-        body = addCacheControl(body);
+        body = addTailCacheControl(body);
       }
       return [...prefix, ...body];
     }).withConfig({ runName: 'prompt' });

package/src/agents/__tests__/AgentContext.test.ts CHANGED

@@ -274,16 +274,11 @@ describe('AgentContext', () => {
         new HumanMessage('First'),
         new HumanMessage('Second'),
       ]);
-      const firstContent = result[1].content as TestSystemContentBlock[];
       const secondContent = result[2].content as TestSystemContentBlock[];
       expect(result).toHaveLength(3);
       expect(result[0].content).toBe('Dynamic only');
-      expect(firstContent[0]).toMatchObject({
-        type: 'text',
-        text: 'First',
-        cache_control: { type: 'ephemeral' },
-      });
+      expect(result[1].content).toBe('First');
       expect(secondContent[0]).toMatchObject({
         type: 'text',
         text: 'Second',
@@ -686,7 +681,7 @@ describe('AgentContext', () => {
       expect(result[8].content).toBe('Now answer without tools');
     });
-    it('adds OpenRouter body cache points when there is no dynamic tail', async () => {
+    it('adds a single OpenRouter body cache point on the tail when there is no dynamic tail', async () => {
       const ctx = createBasicContext({
         agentConfig: {
           provider: Providers.OPENROUTER,
@@ -702,9 +697,8 @@ describe('AgentContext', () => {
         new HumanMessage('First'),
         new HumanMessage('Second'),
       ]);
-      const firstContent = result[1].content as TestSystemContentBlock[];
       const secondContent = result[2].content as TestSystemContentBlock[];
-      expect(firstContent[0]).toHaveProperty('cache_control');
+      expect(result[1].content).toBe('First');
       expect(secondContent[0]).toHaveProperty('cache_control');
     });

package/src/graphs/Graph.ts CHANGED

@@ -19,14 +19,14 @@ import {
   convertMessagesToContent,
   sanitizeOrphanToolBlocks,
   extractToolDiscoveries,
-  addBedrockCacheControl,
+  addBedrockTailCacheControl,
   formatArtifactPayload,
   enforceOriginalContentCap,
   formatContentStrings,
   isLegacyConvertible,
   createPruneMessages,
   syncBudgetDerivedFields,
-  addCacheControl,
+  addTailCacheControl,
   getMessageId,
   makeIsDeferred,
   partitionAndMarkAnthropicToolCache,
@@ -1733,35 +1733,6 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         }
       }
-      if (agentContext.provider === Providers.ANTHROPIC) {
-        const anthropicOptions = agentContext.clientOptions as
-          | t.AnthropicClientOptions
-          | undefined;
-        if (
-          anthropicOptions?.promptCache === true &&
-          !agentContext.systemRunnable
-        ) {
-          finalMessages = addCacheControl<BaseMessage>(finalMessages);
-        }
-      } else if (agentContext.provider === Providers.BEDROCK) {
-        const bedrockOptions = agentContext.clientOptions as
-          | t.BedrockAnthropicClientOptions
-          | undefined;
-        if (bedrockOptions?.promptCache === true) {
-          finalMessages = addBedrockCacheControl<BaseMessage>(finalMessages);
-        }
-      } else if (agentContext.provider === Providers.OPENROUTER) {
-        const openRouterOptions = agentContext.clientOptions as
-          | t.ProviderOptionsMap[Providers.OPENROUTER]
-          | undefined;
-        if (
-          openRouterOptions?.promptCache === true &&
-          !agentContext.systemRunnable
-        ) {
-          finalMessages = addCacheControl<BaseMessage>(finalMessages);
-        }
-      }
       if (
         isThinkingEnabled(agentContext.provider, agentContext.clientOptions)
       ) {
@@ -1783,13 +1754,53 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         );
       }
-      // Intentionally broad: runs when the pruner wasn't used OR any post-pruning
-      // transform (addCacheControl, ensureThinkingBlock, etc.) reassigned finalMessages.
-      // sanitizeOrphanToolBlocks fast-paths to a Set diff check when no orphans exist,
-      // so the cost is negligible and this acts as a safety net for Anthropic/Bedrock.
+      // Determine the prompt-cache strategy up front. Two distinct facts:
+      //
+      //   `providerPromptCacheEnabled` — prompt caching is on for this provider
+      //   at all. This drives orphan cleanup, because EVERY cached send must be
+      //   sanitized — including the system-runnable path, where AgentContext (not
+      //   this node) adds the body marker.
+      //
+      //   `willAddTailCache` — THIS node will add the marker itself. Anthropic /
+      //   OpenRouter defer to the system runnable when one owns the system-prompt
+      //   breakpoint, so they exclude that case; Bedrock always marks here.
+      const anthropicPromptCacheEnabled =
+        agentContext.provider === Providers.ANTHROPIC &&
+        (agentContext.clientOptions as t.AnthropicClientOptions | undefined)
+          ?.promptCache === true;
+      const openRouterPromptCacheEnabled =
+        agentContext.provider === Providers.OPENROUTER &&
+        (
+          agentContext.clientOptions as
+            | t.ProviderOptionsMap[Providers.OPENROUTER]
+            | undefined
+        )?.promptCache === true;
+      const bedrockPromptCacheEnabled =
+        agentContext.provider === Providers.BEDROCK &&
+        (
+          agentContext.clientOptions as
+            | t.BedrockAnthropicClientOptions
+            | undefined
+        )?.promptCache === true;
+      const providerPromptCacheEnabled =
+        anthropicPromptCacheEnabled ||
+        openRouterPromptCacheEnabled ||
+        bedrockPromptCacheEnabled;
+      // Intentionally broad: runs when the pruner wasn't used, when any
+      // post-pruning transform (ensureThinkingBlock, etc.) reassigned
+      // finalMessages, OR when this is a prompt-cached send. The last clause
+      // matters because the marker is now applied AFTER this gate (and, for the
+      // system-runnable path, in AgentContext entirely): without it, a cached
+      // send whose pruner returned the context unchanged would skip cleanup and
+      // could ship orphaned AI/tool pairs from persisted history.
+      // sanitizeOrphanToolBlocks fast-paths to a Set diff check when no orphans
+      // exist, so the cost is negligible.
       const needsOrphanSanitize =
         anthropicLike &&
-        (!agentContext.pruneMessages || finalMessages !== messagesToUse);
+        (!agentContext.pruneMessages ||
+          finalMessages !== messagesToUse ||
+          providerPromptCacheEnabled);
       if (needsOrphanSanitize) {
         const beforeSanitize = finalMessages.length;
         finalMessages = sanitizeOrphanToolBlocks(finalMessages);
@@ -1809,6 +1820,24 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         }
       }
+      // Place the single tail prompt-cache breakpoint LAST, after thinking
+      // normalization and orphan sanitization. ensureThinkingBlockInMessages can
+      // fold a trailing non-thinking AI→Tool chain into a `[Previous agent
+      // context]` HumanMessage whose builder copies text but not cache_control /
+      // cachePoint, and sanitizeOrphanToolBlocks can drop the anchored block — so
+      // marking earlier would let the only breakpoint vanish before the model
+      // call (zero message caching). Anchoring on the final message list keeps
+      // the marker on a block that actually ships. The system-runnable path
+      // adds its body marker in AgentContext, so this node skips it there.
+      if (
+        (anthropicPromptCacheEnabled || openRouterPromptCacheEnabled) &&
+        !agentContext.systemRunnable
+      ) {
+        finalMessages = addTailCacheControl<BaseMessage>(finalMessages);
+      } else if (bedrockPromptCacheEnabled) {
+        finalMessages = addBedrockTailCacheControl<BaseMessage>(finalMessages);
+      }
       if (
         agentContext.lastStreamCall != null &&
         agentContext.streamBuffer != null

package/src/langfuse.ts CHANGED

@@ -11,6 +11,7 @@ const TRACE_METADATA_MAX_LENGTH = 200;
 const LANGFUSE_FORCE_FLUSH_ON_DISPOSE = 'LANGFUSE_FORCE_FLUSH_ON_DISPOSE';
 export type LangfuseTraceMetadata = Record<string, string>;
+type LangfuseMetadata = NonNullable<t.LangfuseConfig['metadata']>;
 type LangfuseHandlerParams = {
   userId?: string;
@@ -44,6 +45,13 @@ function hasLangfuseTracingConfig(langfuse?: t.LangfuseConfig): boolean {
   );
 }
+function hasLangfuseTraceAttributes(langfuse?: t.LangfuseConfig): boolean {
+  return (
+    Object.keys(createTraceMetadata(langfuse?.metadata ?? {})).length > 0 ||
+    (mergeLangfuseTags(undefined, langfuse?.tags)?.length ?? 0) > 0
+  );
+}
 export function hasLangfuseConfigCredentials(
   langfuse?: t.LangfuseConfig
 ): langfuse is t.LangfuseConfig & {
@@ -67,6 +75,7 @@ export function isExplicitLangfuseConfig(langfuse?: t.LangfuseConfig): boolean {
     isPresent(langfuse?.publicKey) ||
     isPresent(langfuse?.secretKey) ||
     isPresent(langfuse?.baseUrl) ||
+    hasLangfuseTraceAttributes(langfuse) ||
     hasLangfuseTracingConfig(langfuse)
   );
 }
@@ -110,6 +119,27 @@ export function createLangfuseTraceMetadata({
   });
 }
+function mergeLangfuseTraceMetadata(
+  traceMetadata?: LangfuseTraceMetadata,
+  metadata?: LangfuseMetadata
+): LangfuseTraceMetadata | undefined {
+  const merged = createTraceMetadata({
+    ...(metadata ?? {}),
+    ...(traceMetadata ?? {}),
+  });
+  return Object.keys(merged).length > 0 ? merged : undefined;
+}
+function mergeLangfuseTags(
+  tags?: string[],
+  configTags?: string[]
+): string[] | undefined {
+  const merged = [...(tags ?? []), ...(configTags ?? [])].filter(
+    (tag) => tag.trim() !== ''
+  );
+  return merged.length > 0 ? [...new Set(merged)] : undefined;
+}
 export function getLangfuseTraceName(
   traceMetadata?: LangfuseTraceMetadata,
   fallback: string = 'LibreChat Agent'
@@ -161,12 +191,16 @@ export function createLangfuseHandler({
   return new CallbackHandler({
     userId,
     sessionId,
-    traceMetadata,
-    tags,
+    traceMetadata: mergeLangfuseTraceMetadata(
+      traceMetadata,
+      langfuse?.metadata
+    ),
+    tags: mergeLangfuseTags(tags, langfuse?.tags),
   });
 }
 function createPropagateAttributeParams({
+  langfuse,
   userId,
   sessionId,
   traceMetadata,
@@ -177,8 +211,8 @@ function createPropagateAttributeParams({
     userId,
     sessionId,
     traceName,
-    tags,
-    metadata: traceMetadata,
+    tags: mergeLangfuseTags(tags, langfuse?.tags),
+    metadata: mergeLangfuseTraceMetadata(traceMetadata, langfuse?.metadata),
   };
 }

package/src/langfuseToolOutputTracing.ts CHANGED

@@ -692,10 +692,28 @@ export function resolveLangfuseConfig(
         ...agentLangfuse.toolOutputTracing,
       }
       : undefined;
+  const metadata =
+    runLangfuse.metadata != null || agentLangfuse.metadata != null
+      ? {
+        ...runLangfuse.metadata,
+        ...agentLangfuse.metadata,
+      }
+      : undefined;
+  const tags =
+    runLangfuse.tags != null || agentLangfuse.tags != null
+      ? [
+        ...new Set([
+          ...(runLangfuse.tags ?? []),
+          ...(agentLangfuse.tags ?? []),
+        ]),
+      ]
+      : undefined;
   return {
     ...runLangfuse,
     ...agentLangfuse,
+    ...(metadata != null ? { metadata } : {}),
+    ...(tags != null ? { tags } : {}),
     ...(toolNodeTracing != null ? { toolNodeTracing } : {}),
     ...(toolOutputTracing != null ? { toolOutputTracing } : {}),
   };

package/src/llm/anthropic/utils/message_inputs.ts CHANGED

@@ -140,6 +140,35 @@ export function normalizeAnthropicToolCallId(
   return `${sanitized.slice(0, prefixMaxLength)}_${hash}`;
 }
+/**
+ * Lift any `cache_control` off the inner blocks of a tool result onto the
+ * `tool_result` block itself. Anthropic documents the top-level
+ * `messages.content` block as the cacheable position and does not document
+ * caching of sub-content blocks; the API currently honors a nested marker, but
+ * anchoring on the documented position keeps the single tail breakpoint robust
+ * (and mirrors the Bedrock cachePoint hoist). The first marker found wins; it is
+ * stripped from every inner block so exactly one survives, on the outer block.
+ */
+function hoistToolResultCacheControl(
+  content: string | MessageContentComplex[]
+): { content: string | MessageContentComplex[]; cacheControl: unknown } {
+  if (!Array.isArray(content)) {
+    return { content, cacheControl: undefined };
+  }
+  let cacheControl: unknown;
+  const stripped = content.map((block) => {
+    if ('cache_control' in block) {
+      cacheControl ??= (block as Record<string, unknown>).cache_control;
+      const clone = { ...(block as Record<string, unknown>) };
+      delete clone.cache_control;
+      return clone as MessageContentComplex;
+    }
+    return block;
+  });
+  // `stripped` is element-equal to `content` when no marker was present.
+  return { content: stripped, cacheControl };
+}
 function _ensureMessageContents(
   messages: BaseMessage[]
 ): (SystemMessage | HumanMessage | AIMessage)[] {
@@ -183,13 +212,20 @@ function _ensureMessageContents(
         const toolMessageContent = (
           message as { content?: BaseMessage['content'] | null }
         ).content;
+        // Hoist a tail cache_control off the inner content onto the
+        // tool_result block itself (the documented cacheable position).
+        const { content: hoistedContent, cacheControl } =
+          toolMessageContent != null
+            ? hoistToolResultCacheControl(_formatContent(message))
+            : { content: undefined, cacheControl: undefined };
         updatedMsgs.push(
           new HumanMessage({
             content: [
               {
                 type: 'tool_result',
-                ...(toolMessageContent != null
-                  ? { content: _formatContent(message) }
+                ...(hoistedContent != null ? { content: hoistedContent } : {}),
+                ...(cacheControl != null
+                  ? { cache_control: cacheControl as { type: 'ephemeral' } }
                   : {}),
                 tool_use_id: normalizeAnthropicToolCallId(
                   (message as ToolMessage).tool_call_id
@@ -917,6 +953,86 @@ export function modelDisallowsAssistantPrefill(model?: string): boolean {
   return Number(match[1]) >= 6;
 }
+function messagesHaveCacheControl(
+  messages: AnthropicMessageCreateParams['messages']
+): boolean {
+  return messages.some(
+    (message) =>
+      Array.isArray(message.content) &&
+      message.content.some((block) => 'cache_control' in block)
+  );
+}
+/** Anthropic rejects cache_control on these reasoning blocks. */
+const NON_CACHEABLE_PAYLOAD_BLOCK_TYPES = new Set([
+  'thinking',
+  'redacted_thinking',
+]);
+/**
+ * Place one ephemeral `cache_control` on the last cacheable block of the final
+ * message of an already-converted Anthropic payload. Used to re-anchor the tail
+ * breakpoint after a trailing assistant prefill is stripped. Operates on the
+ * post-conversion payload, where blocks the converter drops (foreign reasoning,
+ * input_json_delta) are already gone — only native thinking blocks must be
+ * skipped. Returns a new array only when it actually places a marker.
+ */
+function reanchorTailCacheControl(
+  messages: AnthropicMessageCreateParams['messages']
+): AnthropicMessageCreateParams['messages'] {
+  if (messages.length === 0) {
+    return messages;
+  }
+  const lastIndex = messages.length - 1;
+  const tail = messages[lastIndex];
+  const content = tail.content;
+  if (typeof content === 'string') {
+    if (content.trim() === '') {
+      return messages;
+    }
+    const next = [...messages];
+    next[lastIndex] = {
+      ...tail,
+      content: [
+        { type: 'text', text: content, cache_control: { type: 'ephemeral' } },
+      ],
+    } as (typeof messages)[number];
+    return next;
+  }
+  if (!Array.isArray(content)) {
+    return messages;
+  }
+  let anchor = -1;
+  for (let i = 0; i < content.length; i++) {
+    const type = (content[i] as { type?: string }).type;
+    if (type == null || NON_CACHEABLE_PAYLOAD_BLOCK_TYPES.has(type)) {
+      continue;
+    }
+    if (
+      type === 'text' &&
+      ((content[i] as { text?: string }).text ?? '').trim() === ''
+    ) {
+      continue;
+    }
+    anchor = i;
+  }
+  if (anchor < 0) {
+    return messages;
+  }
+  const next = [...messages];
+  next[lastIndex] = {
+    ...tail,
+    content: content.map((block, i) =>
+      i === anchor ? { ...block, cache_control: { type: 'ephemeral' } } : block
+    ),
+  } as (typeof messages)[number];
+  return next;
+}
 export function stripUnsupportedAssistantPrefill<
   T extends Pick<AnthropicMessageCreateParams, 'messages'> & { model?: string },
 >(request: T): T {
@@ -940,9 +1056,21 @@ export function stripUnsupportedAssistantPrefill<
     nextMessages.pop();
   }
+  /**
+   * If a single tail prompt-cache breakpoint rode the stripped assistant
+   * prefill, the survivors may now carry no `cache_control` at all, dropping
+   * message caching for this request. Re-anchor the breakpoint on the new tail
+   * (only when one was actually lost, so caching-off requests stay untouched).
+   */
+  const reanchored =
+    messagesHaveCacheControl(messages) &&
+    !messagesHaveCacheControl(nextMessages)
+      ? reanchorTailCacheControl(nextMessages)
+      : nextMessages;
   return {
     ...request,
-    messages: nextMessages,
+    messages: reanchored,
   };
 }

package/src/llm/anthropic/utils/stripPrefillCache.test.ts ADDED

@@ -0,0 +1,111 @@
+import type { AnthropicMessageCreateParams } from '../types';
+import { stripUnsupportedAssistantPrefill } from './message_inputs';
+/**
+ * When a model disallows assistant prefill (Claude 4.6+), the trailing
+ * assistant message is stripped right before the API call. If the single tail
+ * prompt-cache breakpoint rode that assistant prefill, the survivors would lose
+ * their only message-level `cache_control` — so the strip must re-anchor the
+ * breakpoint onto the new tail.
+ */
+type Msgs = AnthropicMessageCreateParams['messages'];
+function cacheControlBlocks(messages: Msgs): number {
+  let n = 0;
+  for (const m of messages) {
+    if (!Array.isArray(m.content)) continue;
+    for (const b of m.content) {
+      if ('cache_control' in b) n++;
+    }
+  }
+  return n;
+}
+describe('stripUnsupportedAssistantPrefill — cache re-anchoring', () => {
+  test('re-anchors the breakpoint onto the new tail when the prefill carried it', () => {
+    const request = {
+      model: 'claude-opus-4-6',
+      max_tokens: 100,
+      messages: [
+        {
+          role: 'user' as const,
+          content: [{ type: 'text' as const, text: 'q' }],
+        },
+        {
+          role: 'assistant' as const,
+          content: [
+            {
+              type: 'text' as const,
+              text: 'prefill',
+              cache_control: { type: 'ephemeral' as const },
+            },
+          ],
+        },
+      ],
+    };
+    const out = stripUnsupportedAssistantPrefill(request);
+    // Prefill removed, and exactly one breakpoint survives — on the new tail.
+    expect(out.messages).toHaveLength(1);
+    expect(out.messages[0].role).toBe('user');
+    expect(cacheControlBlocks(out.messages)).toBe(1);
+    const tail = out.messages[0].content as Array<{ cache_control?: unknown }>;
+    expect(tail[tail.length - 1].cache_control).toEqual({ type: 'ephemeral' });
+  });
+  test('does not add a breakpoint when caching was off (no marker present)', () => {
+    const request = {
+      model: 'claude-opus-4-6',
+      max_tokens: 100,
+      messages: [
+        { role: 'user' as const, content: 'q' },
+        { role: 'assistant' as const, content: 'prefill' },
+      ],
+    };
+    const out = stripUnsupportedAssistantPrefill(request);
+    expect(out.messages).toHaveLength(1);
+    expect(cacheControlBlocks(out.messages)).toBe(0);
+  });
+  test('leaves a surviving breakpoint untouched (no double-anchor)', () => {
+    const request = {
+      model: 'claude-opus-4-6',
+      max_tokens: 100,
+      messages: [
+        {
+          role: 'user' as const,
+          content: [
+            {
+              type: 'text' as const,
+              text: 'q',
+              cache_control: { type: 'ephemeral' as const },
+            },
+          ],
+        },
+        { role: 'assistant' as const, content: 'prefill' },
+      ],
+    };
+    const out = stripUnsupportedAssistantPrefill(request);
+    expect(out.messages).toHaveLength(1);
+    expect(cacheControlBlocks(out.messages)).toBe(1);
+  });
+  test('older models keep the assistant prefill (no strip, no re-anchor)', () => {
+    const request = {
+      model: 'claude-sonnet-4-5-20250929',
+      max_tokens: 100,
+      messages: [
+        { role: 'user' as const, content: 'q' },
+        { role: 'assistant' as const, content: '{' },
+      ],
+    };
+    expect(stripUnsupportedAssistantPrefill(request)).toBe(request);
+  });
+});