npm - @librechat/agents - Versions diffs - 3.2.35 → 3.2.37 - Mend

@librechat/agents 3.2.35 → 3.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

package/dist/cjs/agents/AgentContext.cjs +75 -2
package/dist/cjs/agents/AgentContext.cjs.map +1 -1
package/dist/cjs/agents/projection.cjs +25 -0
package/dist/cjs/agents/projection.cjs.map +1 -0
package/dist/cjs/graphs/Graph.cjs +10 -26
package/dist/cjs/graphs/Graph.cjs.map +1 -1
package/dist/cjs/langfuse.cjs +16 -5
package/dist/cjs/langfuse.cjs.map +1 -1
package/dist/cjs/langfuseToolOutputTracing.cjs +7 -0
package/dist/cjs/langfuseToolOutputTracing.cjs.map +1 -1
package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +118 -7
package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +44 -4
package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
package/dist/cjs/main.cjs +7 -0
package/dist/cjs/messages/budget.cjs +23 -0
package/dist/cjs/messages/budget.cjs.map +1 -0
package/dist/cjs/messages/cache.cjs +184 -0
package/dist/cjs/messages/cache.cjs.map +1 -1
package/dist/cjs/messages/index.cjs +1 -0
package/dist/cjs/summarization/node.cjs +1 -1
package/dist/cjs/summarization/node.cjs.map +1 -1
package/dist/cjs/tools/search/format.cjs +91 -2
package/dist/cjs/tools/search/format.cjs.map +1 -1
package/dist/cjs/tools/search/tool.cjs +4 -3
package/dist/cjs/tools/search/tool.cjs.map +1 -1
package/dist/cjs/tools/toolOutputReferences.cjs +28 -14
package/dist/cjs/tools/toolOutputReferences.cjs.map +1 -1
package/dist/esm/agents/AgentContext.mjs +76 -3
package/dist/esm/agents/AgentContext.mjs.map +1 -1
package/dist/esm/agents/projection.mjs +25 -0
package/dist/esm/agents/projection.mjs.map +1 -0
package/dist/esm/graphs/Graph.mjs +9 -25
package/dist/esm/graphs/Graph.mjs.map +1 -1
package/dist/esm/langfuse.mjs +16 -5
package/dist/esm/langfuse.mjs.map +1 -1
package/dist/esm/langfuseToolOutputTracing.mjs +7 -0
package/dist/esm/langfuseToolOutputTracing.mjs.map +1 -1
package/dist/esm/llm/anthropic/utils/message_inputs.mjs +118 -7
package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
package/dist/esm/llm/bedrock/utils/message_inputs.mjs +44 -4
package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
package/dist/esm/main.mjs +4 -2
package/dist/esm/messages/budget.mjs +23 -0
package/dist/esm/messages/budget.mjs.map +1 -0
package/dist/esm/messages/cache.mjs +182 -1
package/dist/esm/messages/cache.mjs.map +1 -1
package/dist/esm/messages/index.mjs +1 -0
package/dist/esm/summarization/node.mjs +2 -2
package/dist/esm/summarization/node.mjs.map +1 -1
package/dist/esm/tools/search/format.mjs +91 -2
package/dist/esm/tools/search/format.mjs.map +1 -1
package/dist/esm/tools/search/tool.mjs +4 -3
package/dist/esm/tools/search/tool.mjs.map +1 -1
package/dist/esm/tools/toolOutputReferences.mjs +28 -14
package/dist/esm/tools/toolOutputReferences.mjs.map +1 -1
package/dist/types/agents/AgentContext.d.ts +30 -1
package/dist/types/agents/projection.d.ts +26 -0
package/dist/types/index.d.ts +1 -0
package/dist/types/messages/budget.d.ts +11 -0
package/dist/types/messages/cache.d.ts +47 -0
package/dist/types/messages/index.d.ts +1 -0
package/dist/types/tools/search/format.d.ts +4 -1
package/dist/types/tools/search/types.d.ts +7 -0
package/dist/types/types/graph.d.ts +2 -0
package/package.json +2 -1
package/src/agents/AgentContext.ts +105 -4
package/src/agents/__tests__/AgentContext.test.ts +232 -9
package/src/agents/__tests__/projection.test.ts +73 -0
package/src/agents/projection.ts +46 -0
package/src/graphs/Graph.ts +66 -65
package/src/index.ts +3 -0
package/src/langfuse.ts +38 -4
package/src/langfuseToolOutputTracing.ts +18 -0
package/src/llm/anthropic/utils/cross-provider-reasoning.test.ts +317 -0
package/src/llm/anthropic/utils/message_inputs.ts +209 -19
package/src/llm/anthropic/utils/stripPrefillCache.test.ts +111 -0
package/src/llm/bedrock/utils/cross-provider-reasoning.test.ts +131 -0
package/src/llm/bedrock/utils/message_inputs.test.ts +129 -0
package/src/llm/bedrock/utils/message_inputs.ts +81 -4
package/src/llm/bedrock/utils/toolResultCachePoint.test.ts +103 -0
package/src/messages/budget.ts +32 -0
package/src/messages/cache.tail.test.ts +340 -0
package/src/messages/cache.ts +267 -1
package/src/messages/index.ts +1 -0
package/src/messages/tailCacheConversion.test.ts +161 -0
package/src/scripts/bench-prompt-cache.ts +479 -0
package/src/specs/langfuse-config.test.ts +69 -2
package/src/specs/langfuse-metadata.test.ts +44 -0
package/src/specs/langfuse-tool-output-tracing.test.ts +6 -0
package/src/summarization/node.ts +2 -2
package/src/tools/__tests__/annotateMessagesForLLM.test.ts +50 -0
package/src/tools/search/format.test.ts +242 -0
package/src/tools/search/format.ts +122 -5
package/src/tools/search/tool.ts +5 -1
package/src/tools/search/types.ts +7 -0
package/src/tools/toolOutputReferences.ts +34 -20
package/src/types/graph.ts +2 -0

package/src/messages/tailCacheConversion.test.ts ADDED Viewed

@@ -0,0 +1,161 @@
+import {
+  HumanMessage,
+  AIMessage,
+  ToolMessage,
+  type BaseMessage,
+  type MessageContentComplex,
+} from '@langchain/core/messages';
+import { _convertMessagesToAnthropicPayload } from '@/llm/anthropic/utils/message_inputs';
+import { ensureThinkingBlockInMessages } from './format';
+import { toLangChainContent } from './langchain';
+import { addTailCacheControl } from './cache';
+import { Providers } from '@/common';
+/**
+ * Regression coverage for the single tail prompt-cache breakpoint surviving all
+ * the way into the final Anthropic payload — i.e. the marker must land on a
+ * block that actually ships, not one that downstream conversion / folding
+ * removes. Two ways the breakpoint was silently lost:
+ *
+ *  - Foreign reasoning tail: addTailCacheControl anchored on a
+ *    `reasoning_content`/`reasoning`/`think` block, which the Anthropic
+ *    converter drops on assistant turns (cross-provider handoff).
+ *  - Thinking-fold ordering: marking before ensureThinkingBlockInMessages let
+ *    the fold rewrite the anchored AI→Tool tail into a `[Previous agent
+ *    context]` HumanMessage that copies text but not cache_control.
+ */
+/** Minimal view of a converted payload message; only `content` is inspected by the helpers below. */
+type PayloadMessage = { content: unknown };
+/** True when `block` is a non-null object that has a `cache_control` key. */
+function hasCacheControl(block: unknown): boolean {
+  if (block === null || typeof block !== 'object') {
+    return false;
+  }
+  return 'cache_control' in block;
+}
+/**
+ * Reports whether at least one content block has a `cache_control` key,
+ * looking both at the blocks directly in each message's content array and one
+ * level down inside a block's own `content` array (as in a tool_result).
+ */
+function breakpointSurvives(messages: PayloadMessage[]): boolean {
+  const carriesMarker = (candidate: unknown): boolean => {
+    if (hasCacheControl(candidate)) {
+      return true;
+    }
+    const nested = (candidate as { content?: unknown }).content;
+    return Array.isArray(nested) && nested.some(hasCacheControl);
+  };
+  for (const message of messages) {
+    // Non-array content (e.g. a plain string) has no blocks to inspect.
+    if (Array.isArray(message.content)) {
+      for (const candidate of message.content as unknown[]) {
+        if (carriesMarker(candidate)) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+// End-to-end checks: mark the tail with addTailCacheControl, run the Anthropic
+// converter, and inspect the resulting payload for a cache_control marker.
+describe('tail breakpoint survives Anthropic conversion', () => {
+  // Tail is an AI message whose LAST block is a reasoning_content block,
+  // preceded by a text block; a marker must still be present after conversion.
+  test('foreign reasoning tail keeps a usable breakpoint (anchored on text)', () => {
+    const messages: BaseMessage[] = [
+      new HumanMessage('hello'),
+      new AIMessage({
+        content: toLangChainContent([
+          { type: 'text', text: 'Here is my answer.' },
+          { type: 'reasoning_content', reasoningText: { text: 'r' } },
+        ] as MessageContentComplex[]),
+      }),
+    ];
+    const payload = _convertMessagesToAnthropicPayload(
+      addTailCacheControl(messages)
+    );
+    expect(breakpointSurvives(payload.messages as PayloadMessage[])).toBe(true);
+  });
+  // Tail is a ToolMessage with plain string content.
+  test('string tool-result tail keeps a usable breakpoint on the tool_result block', () => {
+    const messages: BaseMessage[] = [
+      new HumanMessage('run it'),
+      new AIMessage({
+        content: 'calling',
+        tool_calls: [{ id: 't1', name: 'search', args: {} }],
+      }),
+      new ToolMessage({ tool_call_id: 't1', content: 'result body' }),
+    ];
+    const payload = _convertMessagesToAnthropicPayload(
+      addTailCacheControl(messages)
+    );
+    expect(breakpointSurvives(payload.messages as PayloadMessage[])).toBe(true);
+    // The marker must sit on the top-level tool_result block (the documented
+    // cacheable position), NOT nested inside tool_result.content.
+    // Find the first block of type 'tool_result' across all payload messages.
+    const toolResult = (payload.messages as PayloadMessage[])
+      .flatMap((m) => (Array.isArray(m.content) ? m.content : []))
+      .find(
+        (b): b is Record<string, unknown> =>
+          b != null &&
+          typeof b === 'object' &&
+          'type' in b &&
+          (b as { type?: string }).type === 'tool_result'
+      ) as { cache_control?: unknown; content?: unknown } | undefined;
+    expect(toolResult?.cache_control).toEqual({ type: 'ephemeral' });
+    const inner = toolResult?.content;
+    // Only checked when the converter emitted array content for the tool_result.
+    if (Array.isArray(inner)) {
+      expect(
+        inner.some(
+          (b) => b != null && typeof b === 'object' && 'cache_control' in b
+        )
+      ).toBe(false);
+    }
+  });
+  test('marking AFTER the thinking fold preserves the breakpoint (Graph order)', () => {
+    // A historical non-thinking AI→Tool chain at the tail (no trailing human).
+    const messages: BaseMessage[] = [
+      new HumanMessage('do the thing'),
+      new AIMessage({
+        content: '',
+        tool_calls: [{ id: 't1', name: 'search', args: { q: 'x' } }],
+      }),
+      new ToolMessage({ tool_call_id: 't1', content: 'tool output text' }),
+    ];
+    // Graph applies the fold first, THEN the tail marker.
+    const folded = ensureThinkingBlockInMessages(
+      messages,
+      Providers.ANTHROPIC,
+      undefined,
+      messages.length
+    );
+    const payload = _convertMessagesToAnthropicPayload(
+      addTailCacheControl(folded)
+    );
+    expect(breakpointSurvives(payload.messages as PayloadMessage[])).toBe(true);
+  });
+  // Negative control: same transcript as the previous test, opposite order.
+  // It asserts the marker is ABSENT, so it fails if the fold ever starts
+  // carrying cache_control across.
+  test('marking BEFORE the fold loses the breakpoint (guards the ordering)', () => {
+    const messages: BaseMessage[] = [
+      new HumanMessage('do the thing'),
+      new AIMessage({
+        content: '',
+        tool_calls: [{ id: 't1', name: 'search', args: { q: 'x' } }],
+      }),
+      new ToolMessage({ tool_call_id: 't1', content: 'tool output text' }),
+    ];
+    // The buggy order: mark first, then fold drops the marker.
+    const marked = addTailCacheControl(messages);
+    const folded = ensureThinkingBlockInMessages(
+      marked,
+      Providers.ANTHROPIC,
+      undefined,
+      messages.length
+    );
+    const payload = _convertMessagesToAnthropicPayload(folded);
+    expect(breakpointSurvives(payload.messages as PayloadMessage[])).toBe(
+      false
+    );
+  });
+});

package/src/scripts/bench-prompt-cache.ts ADDED Viewed

@@ -0,0 +1,479 @@
+/**
+ * Live, reproducible benchmark: single tail prompt-cache breakpoint (new
+ * default) vs. the legacy "last two user messages" strategy.
+ *
+ * It replays realistic harness conversations against a real provider and, for
+ * each model call, records the cache token breakdown the API reports. The two
+ * strategies are run over the SAME conversations (only the cache MARKING
+ * differs) under distinct cache namespaces, then compared.
+ *
+ * What it demonstrates
+ * --------------------
+ *  - Agent tool loop (one user turn, many tool rounds): the legacy strategy
+ *    pins its only message breakpoint on the lone user message, so every
+ *    appended assistant/tool turn is re-sent UNCACHED on the next call — cache
+ *    write/fresh ≫ read. The tail strategy rides the true tail, so the growing
+ *    transcript is written once and read back. This is the dominant agent shape
+ *    and where the legacy approach breaks down hardest.
+ *  - Multi-turn chat (frequent user messages): legacy's two rolling markers do
+ *    fine here; the tail strategy ties (never worse).
+ *  - Realistic agent (user turns interleaved with tool rounds): tail wins.
+ *
+ * Metrics (per strategy, summed over all calls in a scenario)
+ *  - cache_read   : tokens served from cache (HIGHER is better).
+ *  - cache_write  : tokens written to cache (cache_creation).
+ *  - fresh        : uncached input processed at full price
+ *                   (= total_tokens - output_tokens - cache_read - cache_write,
+ *                   because `input_tokens` means different things per
+ *                   provider); this is what balloons when caching fails to
+ *                   cover the transcript.
+ *  - effective    : a cost proxy in input-token-equivalents using Anthropic's
+ *                   published multipliers — read x0.1, write x1.25, fresh x1.0.
+ *                   LOWER is better.
+ *
+ * Usage
+ *   # Anthropic (default). Needs ANTHROPIC_API_KEY in .env (or BENCH_ENV_FILE).
+ *   npm run bench:cache
+ *   # Bedrock. Needs BEDROCK_AWS_* creds.
+ *   npm run bench:cache -- --provider bedrock
+ *   # Options: --provider anthropic|bedrock  --rounds <N>  --model <id>
+ *
+ * Not a unit test (no `.test.` suffix) so CI never runs it; it makes real,
+ * paid API calls.
+ */
+import { config } from 'dotenv';
+config({ path: process.env.BENCH_ENV_FILE || '.env' });
+import {
+  HumanMessage,
+  AIMessage,
+  ToolMessage,
+  type BaseMessage,
+} from '@langchain/core/messages';
+import { CustomAnthropic } from '@/llm/anthropic';
+import { CustomChatBedrockConverse } from '@/llm/bedrock';
+import {
+  addCacheControl,
+  addTailCacheControl,
+  addBedrockCacheControl,
+  addBedrockTailCacheControl,
+} from '@/messages/cache';
+/** Providers the benchmark can target; selected with `--provider`. */
+type ProviderName = 'anthropic' | 'bedrock';
+/** Parsed command-line options. */
+interface Args {
+  /** Which provider to call (`--provider`). */
+  provider: ProviderName;
+  /** Round count handed to every scenario builder (`--rounds`). */
+  rounds: number;
+  /** Optional model id override (`--model`); each provider has its own default. */
+  model?: string;
+}
+/**
+ * Parses the benchmark's CLI flags from `process.argv`.
+ *
+ * Recognized flags: `--provider anthropic|bedrock`, `--rounds <N>`,
+ * `--model <id>`. Unrecognized arguments are ignored. Defaults are
+ * `provider: 'anthropic'` and `rounds: 6`; `model` is left unset.
+ *
+ * @returns The parsed options.
+ * @throws Error when a recognized flag has no value, when `--provider` is not
+ *   a supported provider, or when `--rounds` is not a positive integer. An
+ *   unchecked provider would silently fall through to the Anthropic path, and
+ *   a NaN round count would make every scenario produce zero calls.
+ */
+function parseArgs(): Args {
+  const argv = process.argv.slice(2);
+  const providers: readonly string[] = ['anthropic', 'bedrock'];
+  const out: Args = { provider: 'anthropic', rounds: 6 };
+  for (let i = 0; i < argv.length; i++) {
+    const flag = argv[i];
+    if (flag !== '--provider' && flag !== '--rounds' && flag !== '--model') {
+      continue;
+    }
+    // Every recognized flag consumes the following argument as its value.
+    const value = argv[++i];
+    if (value === undefined) {
+      throw new Error(`Missing value for ${flag}`);
+    }
+    if (flag === '--provider') {
+      if (!providers.includes(value)) {
+        throw new Error(
+          `Unknown --provider "${value}" (expected ${providers.join('|')})`
+        );
+      }
+      out.provider = value as ProviderName;
+    } else if (flag === '--rounds') {
+      const rounds = Number(value);
+      if (!Number.isInteger(rounds) || rounds < 1) {
+        throw new Error(
+          `Invalid --rounds "${value}" (expected a positive integer)`
+        );
+      }
+      out.rounds = rounds;
+    } else {
+      out.model = value;
+    }
+  }
+  return out;
+}
+/**
+ * Produces deterministic filler text sized from a token budget: the word count
+ * is `round(tokens * 0.75)` (at least one), and word `i` is `tag` followed by
+ * `i % 97`. Words are joined with single spaces.
+ */
+function filler(tokens: number, tag: string): string {
+  const wordCount = Math.max(1, Math.round(tokens * 0.75));
+  return Array.from(
+    { length: wordCount },
+    (_unused, index) => `${tag}${index % 97}`
+  ).join(' ');
+}
+// ---------------------------------------------------------------------------
+// Scenarios. Each returns the message list for every model call (call `i`
+// sends step `i`; the transcript grows append-only between calls), built under
+// a per-run nonce so the two strategy runs never share a cache namespace.
+// ---------------------------------------------------------------------------
+// Token budget passed to filler() for the large opening user message of a scenario.
+const STABLE_TOKENS = 2000; // big stable context (instructions / first request)
+// Token budget passed to filler() for each synthetic tool result.
+const TOOL_RESULT_TOKENS = 600; // realistic agent tool output (file/search)
+/** Builds a `process_records` tool-call descriptor for the given call id and batch number. */
+function processToolCall(id: string, batch: number) {
+  const args = { batch };
+  return { id, name: 'process_records', args };
+}
+/**
+ * Agent tool loop: a single user turn followed by `rounds` assistant→tool
+ * rounds. Returns one transcript snapshot per model call; each snapshot is
+ * copied before that round's assistant/tool pair is appended.
+ */
+function toolLoopScenario(nonce: string, rounds: number): BaseMessage[][] {
+  const opening = new HumanMessage(
+    `Session ${nonce}. Reference data follows.\n${filler(STABLE_TOKENS, `ref${nonce}`)}\n\n` +
+      'Process every batch using the process_records tool until done.'
+  );
+  const transcript: BaseMessage[] = [opening];
+  const snapshots: BaseMessage[][] = [];
+  for (let round = 1; round <= rounds; round += 1) {
+    snapshots.push(transcript.slice());
+    // The assistant's tool call and the tool's reply share one call id.
+    const callId = `tl_${nonce}_${round}`;
+    const assistantTurn = new AIMessage({
+      content: `Processing batch ${round}.`,
+      tool_calls: [processToolCall(callId, round)],
+    });
+    const toolTurn = new ToolMessage({
+      tool_call_id: callId,
+      content: `Batch ${round} of session ${nonce} complete. ${filler(TOOL_RESULT_TOKENS, `out${round}`)}`,
+    });
+    transcript.push(assistantTurn, toolTurn);
+  }
+  return snapshots;
+}
+/**
+ * Multi-turn chat with no tools: after the opening user message, every round
+ * appends one assistant answer and the next user question. Returns one
+ * transcript snapshot per model call, copied before the round's pair is added.
+ */
+function chatScenario(nonce: string, rounds: number): BaseMessage[][] {
+  const opening = new HumanMessage(
+    `Session ${nonce}.\n${filler(STABLE_TOKENS, `doc${nonce}`)}\n\nQuestion 1: summarize.`
+  );
+  const transcript: BaseMessage[] = [opening];
+  const snapshots: BaseMessage[][] = [];
+  for (let turn = 1; turn <= rounds; turn += 1) {
+    snapshots.push(transcript.slice());
+    const answer = new AIMessage(`Answer ${turn}. ${filler(120, `ans${turn}`)}`);
+    const nextQuestion = new HumanMessage(
+      `Question ${turn + 1}: ${filler(60, `q${turn + 1}`)}`
+    );
+    transcript.push(answer, nextQuestion);
+  }
+  return snapshots;
+}
+/**
+ * Mixed agent conversation: within each of `rounds` user turns the agent runs
+ * two tool rounds, then summarizes, and the user issues the next task. A
+ * transcript snapshot is recorded before each tool round and before each
+ * summary, so every turn contributes three model calls.
+ */
+function agentMixedScenario(nonce: string, rounds: number): BaseMessage[][] {
+  const opening = new HumanMessage(
+    `Session ${nonce}. Project context:\n${filler(STABLE_TOKENS, `ctx${nonce}`)}\n\nTask 1: investigate.`
+  );
+  const transcript: BaseMessage[] = [opening];
+  const snapshots: BaseMessage[][] = [];
+  // Running count of tool calls across all turns; feeds ids, args and filler tags.
+  let toolCallCount = 0;
+  for (let turn = 1; turn <= rounds; turn += 1) {
+    for (const stepInTurn of [1, 2]) {
+      snapshots.push(transcript.slice());
+      toolCallCount += 1;
+      const callId = `am_${nonce}_${toolCallCount}`;
+      const assistantTurn = new AIMessage({
+        content: `Turn ${turn} step ${stepInTurn}.`,
+        tool_calls: [
+          { id: callId, name: 'process_records', args: { step: toolCallCount } },
+        ],
+      });
+      const toolTurn = new ToolMessage({
+        tool_call_id: callId,
+        content: `Result ${toolCallCount} (${nonce}). ${filler(TOOL_RESULT_TOKENS, `r${toolCallCount}`)}`,
+      });
+      transcript.push(assistantTurn, toolTurn);
+    }
+    // Third call of the turn: the model summarizes, then the user moves on.
+    snapshots.push(transcript.slice());
+    const summary = new AIMessage(
+      `Turn ${turn} summary. ${filler(80, `s${turn}`)}`
+    );
+    const nextTask = new HumanMessage(
+      `Task ${turn + 1}: ${filler(60, `t${turn + 1}`)}`
+    );
+    transcript.push(summary, nextTask);
+  }
+  return snapshots;
+}
+const SUMMARY_TOKENS = 1500; // compacted-history summary injected post-compaction
+/**
+ * Post-compaction (summarization) scenario in two phases.
+ *
+ * Phase 1 runs two tool rounds on the original context. Then a compaction
+ * event swaps the head for a summary message and the agent continues for
+ * `rounds` more tool rounds on that new head. The first post-compaction call
+ * is a deliberate cache miss for BOTH strategies, because the cached prefix
+ * genuinely changed. The interesting part is what follows: the summary is the
+ * new stable head, the tail strategy re-establishes append-only caching over
+ * the continuing tool loop, and legacy pins on the lone summary user message
+ * and re-sends the new tool work uncached. (Tool results here stand in for the
+ * already-truncated, persisted strings ToolNode stores — truncation happens
+ * once at exec time with a model-fixed cap, so it does not mutate the prefix
+ * across turns.)
+ */
+function postCompactionScenario(
+  nonce: string,
+  rounds: number
+): BaseMessage[][] {
+  const snapshots: BaseMessage[][] = [];
+  /** Appends one assistant tool call plus its tool result to `conv`. */
+  const appendToolRound = (
+    conv: BaseMessage[],
+    label: string,
+    idPrefix: string,
+    fillerTag: string,
+    batch: number
+  ): void => {
+    const callId = `${idPrefix}_${nonce}_${batch}`;
+    conv.push(
+      new AIMessage({
+        content: `${label} ${batch}.`,
+        tool_calls: [processToolCall(callId, batch)],
+      })
+    );
+    conv.push(
+      new ToolMessage({
+        tool_call_id: callId,
+        content: `${label} result ${batch}. ${filler(TOOL_RESULT_TOKENS, `${fillerTag}${batch}`)}`,
+      })
+    );
+  };
+  // Phase 1: growth on the original context before compaction.
+  const beforeCompaction: BaseMessage[] = [
+    new HumanMessage(
+      `Session ${nonce}. ${filler(STABLE_TOKENS, `pre${nonce}`)}\n\nAnalyze the dataset.`
+    ),
+  ];
+  for (let batch = 1; batch <= 2; batch += 1) {
+    snapshots.push(beforeCompaction.slice());
+    appendToolRound(beforeCompaction, 'Pre', 'pc', 'pr', batch);
+  }
+  // Phase 2: the head is replaced by a durable summary and work continues.
+  const afterCompaction: BaseMessage[] = [
+    new HumanMessage(
+      `Session ${nonce} (resumed after compaction).\n<summary>\n${filler(SUMMARY_TOKENS, `sum${nonce}`)}\n</summary>\n\nContinue the analysis.`
+    ),
+  ];
+  for (let batch = 1; batch <= rounds; batch += 1) {
+    snapshots.push(afterCompaction.slice());
+    appendToolRound(afterCompaction, 'Post', 'po', 'po', batch);
+  }
+  return snapshots;
+}
+/**
+ * Scenarios run by the benchmark, in order. `name` is the label printed for
+ * the scenario; `build` returns the message list for every model call given a
+ * cache-namespace nonce and the `--rounds` value.
+ */
+const SCENARIOS: Array<{
+  name: string;
+  build: (nonce: string, rounds: number) => BaseMessage[][];
+}> = [
+  {
+    name: 'Agent tool loop (1 user turn, N tool rounds)',
+    build: toolLoopScenario,
+  },
+  { name: 'Multi-turn chat (frequent user messages)', build: chatScenario },
+  {
+    name: 'Realistic agent (user turns + tool rounds)',
+    build: agentMixedScenario,
+  },
+  {
+    name: 'Post-compaction (summary head + continued tool loop)',
+    build: postCompactionScenario,
+  },
+];
+// ---------------------------------------------------------------------------
+// Provider plumbing.
+// ---------------------------------------------------------------------------
+/**
+ * Function-tool definition bound to both provider clients. Its name matches
+ * the `process_records` tool calls in the scripted transcripts, and it declares
+ * both `batch` and `step` because the scenarios pass one or the other.
+ */
+const PROCESS_TOOL = {
+  type: 'function' as const,
+  function: {
+    name: 'process_records',
+    description: 'Process a batch of records.',
+    parameters: {
+      type: 'object',
+      properties: { batch: { type: 'number' }, step: { type: 'number' } },
+    },
+  },
+};
+/** The two cache-marking functions being compared; each maps a transcript to its marked form. */
+interface StrategyPair {
+  /** Legacy marking strategy. */
+  legacy: (m: BaseMessage[]) => BaseMessage[];
+  /** Single tail-breakpoint marking strategy. */
+  tail: (m: BaseMessage[]) => BaseMessage[];
+}
+/**
+ * Builds the client and marking strategies for the selected provider.
+ *
+ * @param args Parsed CLI options; `provider` picks the branch and `model`
+ *   overrides the per-provider default model id.
+ * @returns `invoke` (sends a transcript and resolves to the reported usage
+ *   metadata), `strategies` (the provider's legacy and tail marking
+ *   functions), and `label` (`<provider>:<model>` for the report header).
+ */
+function makeProvider(args: Args): {
+  invoke: (messages: BaseMessage[]) => Promise<Usage | undefined>;
+  strategies: StrategyPair;
+  label: string;
+} {
+  if (args.provider === 'bedrock') {
+    const model = args.model ?? 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
+    const llm = new CustomChatBedrockConverse({
+      model,
+      region:
+        process.env.BEDROCK_AWS_REGION ??
+        process.env.AWS_DEFAULT_REGION ??
+        'us-east-1',
+      // Non-null assertions: nothing in this function checks that these are set.
+      credentials: {
+        accessKeyId: process.env.BEDROCK_AWS_ACCESS_KEY_ID!,
+        secretAccessKey: process.env.BEDROCK_AWS_SECRET_ACCESS_KEY!,
+      },
+      streaming: true,
+      streamUsage: true,
+      // Small output cap — presumably to keep paid calls cheap; the benchmark
+      // only reads usage metadata, never the generated text.
+      maxTokens: 32,
+      promptCache: true,
+    }).bindTools([PROCESS_TOOL]);
+    return {
+      label: `bedrock:${model}`,
+      invoke: async (messages) =>
+        (await llm.invoke(messages)).usage_metadata as Usage,
+      strategies: {
+        legacy: (m) => addBedrockCacheControl<BaseMessage>(m),
+        tail: (m) => addBedrockTailCacheControl<BaseMessage>(m),
+      },
+    };
+  }
+  // Any provider other than 'bedrock' takes the Anthropic path.
+  const model = args.model ?? 'claude-sonnet-4-5';
+  // NOTE(review): the `as never` cast bypasses the constructor's option typing —
+  // presumably one of these fields is not in the declared type; confirm.
+  const llm = new CustomAnthropic({
+    model,
+    apiKey: process.env.ANTHROPIC_API_KEY,
+    maxTokens: 32,
+    promptCache: true,
+    streaming: true,
+    streamUsage: true,
+  } as never).bindTools([PROCESS_TOOL]);
+  return {
+    label: `anthropic:${model}`,
+    invoke: async (messages) =>
+      (await llm.invoke(messages)).usage_metadata as Usage,
+    strategies: {
+      legacy: (m) => addCacheControl<BaseMessage>(m),
+      tail: (m) => addTailCacheControl<BaseMessage>(m),
+    },
+  };
+}
+/** The subset of a response's `usage_metadata` that the benchmark reads; every field may be absent. */
+type Usage = {
+  input_tokens?: number;
+  output_tokens?: number;
+  total_tokens?: number;
+  /** Cache breakdown: tokens written to (`cache_creation`) and served from (`cache_read`) the cache. */
+  input_token_details?: { cache_creation?: number; cache_read?: number };
+};
+/** Per-strategy sums accumulated over every call in a scenario. */
+interface Totals {
+  /** Tokens served from cache. */
+  read: number;
+  /** Tokens written to cache. */
+  write: number;
+  /** Prompt tokens that were neither cache reads nor cache writes. */
+  fresh: number;
+  /** Cost proxy: fresh + 1.25 × write + 0.1 × read. */
+  effective: number;
+}
+/** Creates a new accumulator with every counter at zero. */
+function emptyTotals(): Totals {
+  const zeroed: Totals = { read: 0, write: 0, fresh: 0, effective: 0 };
+  return zeroed;
+}
+/**
+ * Adds one call's usage to the running totals `t` (mutated in place).
+ * Missing usage, or any missing field, counts as zero.
+ */
+function addUsage(t: Totals, u: Usage | undefined): void {
+  const details = u?.input_token_details;
+  const cacheRead = details?.cache_read ?? 0;
+  const cacheWrite = details?.cache_creation ?? 0;
+  const totalTokens = u?.total_tokens ?? 0;
+  const outputTokens = u?.output_tokens ?? 0;
+  // Fresh input is derived from `total_tokens - output_tokens` rather than
+  // `input_tokens`: Anthropic folds cache tokens INTO input_tokens while
+  // Bedrock reports input_tokens as fresh-only, but total minus output is the
+  // full prompt size on both. Clamped at zero.
+  const uncached = Math.max(
+    0,
+    totalTokens - outputTokens - cacheRead - cacheWrite
+  );
+  t.read += cacheRead;
+  t.write += cacheWrite;
+  t.fresh += uncached;
+  // Anthropic/Bedrock price multipliers: read 0.1x, write 1.25x, fresh 1x.
+  t.effective += uncached + cacheWrite * 1.25 + cacheRead * 0.1;
+}
+/**
+ * Replays every step of a scenario under one marking strategy and sums the
+ * reported usage.
+ *
+ * Calls are awaited one at a time, in order: each step extends the previous
+ * transcript, so a later call can only hit the cache once the earlier one has
+ * completed.
+ *
+ * @param steps Transcript for each model call.
+ * @param apply Cache-marking function applied to a transcript before sending.
+ * @param invoke Sends the marked transcript and resolves to its usage.
+ * @returns Totals accumulated over all steps.
+ */
+async function runStrategy(
+  steps: BaseMessage[][],
+  apply: (m: BaseMessage[]) => BaseMessage[],
+  invoke: (m: BaseMessage[]) => Promise<Usage | undefined>
+): Promise<Totals> {
+  const accumulated = emptyTotals();
+  for (const transcript of steps) {
+    const marked = apply(transcript);
+    addUsage(accumulated, await invoke(marked));
+  }
+  return accumulated;
+}
+/**
+ * Formats the relative change from `legacy` to `tail` as a signed whole-number
+ * percentage, e.g. `+12%` or `-40%`.
+ *
+ * @param legacy Baseline value.
+ * @param tail Value being compared against the baseline.
+ * @returns `'0%'` when both are zero, `'n/a'` when only the baseline is zero
+ *   (no meaningful ratio), otherwise the rounded signed percentage.
+ */
+function pct(legacy: number, tail: number): string {
+  if (legacy === 0) return tail === 0 ? '0%' : 'n/a';
+  const delta = ((tail - legacy) / legacy) * 100;
+  const rounded = delta.toFixed(0);
+  // toFixed keeps the sign of a small negative that rounds to zero ("-0").
+  // Report that the same way as an exact tie instead of printing "-0%".
+  const digits = rounded === '-0' ? '0' : rounded;
+  const sign = delta >= 0 || digits === '0' ? '+' : '';
+  return `${sign}${digits}%`;
+}
+/**
+ * Returns `tag`, the current time in base 36, and a random base-36 number
+ * below one million, joined by hyphens.
+ */
+function uniqueNonce(tag: string): string {
+  const timePart = Date.now().toString(36);
+  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
+  return [tag, timePart, randomPart].join('-');
+}
+/**
+ * Benchmark entry point.
+ *
+ * Parses the CLI options, checks that the selected provider's credentials are
+ * present (printing a hint and exiting with status 1 if not), then for every
+ * scenario runs the legacy strategy followed by the tail strategy — each on a
+ * transcript built with its own nonce — and prints per-strategy totals, the
+ * relative deltas and a verdict. A final line reports in how many scenarios
+ * the tail strategy was better or equal.
+ *
+ * The verdict checks the 3% tie band FIRST. A result inside the band is a tie
+ * in either direction and counts towards "better-or-equal", so a tail run that
+ * is marginally more expensive is not reported as a legacy win.
+ */
+async function main(): Promise<void> {
+  const args = parseArgs();
+  if (args.provider === 'anthropic' && !process.env.ANTHROPIC_API_KEY) {
+    console.error('Set ANTHROPIC_API_KEY (in .env or via BENCH_ENV_FILE).');
+    process.exit(1);
+  }
+  // The Bedrock client is built from BOTH variables, so require both up front
+  // instead of failing later inside the first API call.
+  if (
+    args.provider === 'bedrock' &&
+    (!process.env.BEDROCK_AWS_ACCESS_KEY_ID ||
+      !process.env.BEDROCK_AWS_SECRET_ACCESS_KEY)
+  ) {
+    console.error(
+      'Set BEDROCK_AWS_ACCESS_KEY_ID / BEDROCK_AWS_SECRET_ACCESS_KEY.'
+    );
+    process.exit(1);
+  }
+  const { invoke, strategies, label } = makeProvider(args);
+  console.log(`\nProvider: ${label}   rounds=${args.rounds}`);
+  console.log(
+    'Metrics summed over all calls in a scenario. read↑ better; fresh↓ and effective↓ better.\n'
+  );
+  /** Formats one strategy's totals as an aligned report row. */
+  const row = (name: string, t: Totals): string =>
+    `  ${name.padEnd(8)} read=${String(t.read).padStart(7)}  write=${String(
+      t.write
+    ).padStart(7)}  fresh=${String(t.fresh).padStart(7)}  effective=${String(
+      Math.round(t.effective)
+    ).padStart(7)}`;
+  let tailWins = 0;
+  let scenarioCount = 0;
+  for (const scenario of SCENARIOS) {
+    // Distinct nonce per strategy run so legacy and tail never share a cache.
+    const legacySteps = scenario.build(uniqueNonce('legacy'), args.rounds);
+    const legacy = await runStrategy(legacySteps, strategies.legacy, invoke);
+    const tailSteps = scenario.build(uniqueNonce('tail'), args.rounds);
+    const tail = await runStrategy(tailSteps, strategies.tail, invoke);
+    console.log(`SCENARIO: ${scenario.name}  (${legacySteps.length} calls)`);
+    console.log(row('legacy', legacy));
+    console.log(row('tail', tail));
+    console.log(
+      `  Δ tail vs legacy:  read ${pct(legacy.read, tail.read)}   ` +
+        `fresh ${pct(legacy.fresh, tail.fresh)}   ` +
+        `effective ${pct(legacy.effective, tail.effective)} (lower=cheaper)`
+    );
+    const better = tail.effective <= legacy.effective;
+    // Within 3% of the legacy cost counts as a tie, whichever side is ahead.
+    const tie =
+      Math.abs(tail.effective - legacy.effective) / (legacy.effective || 1) <
+      0.03;
+    const verdict = tie
+      ? '≈ TIE'
+      : better
+        ? '✅ TAIL WINS'
+        : '❌ legacy better';
+    console.log(`  → ${verdict}\n`);
+    scenarioCount++;
+    if (better || tie) tailWins++;
+  }
+  console.log(
+    `RESULT: tail strategy is better-or-equal in ${tailWins}/${scenarioCount} scenarios.`
+  );
+}
+// Run the benchmark; any rejection is logged and turned into exit status 1.
+main().catch((err) => {
+  console.error('Benchmark failed:', err);
+  process.exit(1);
+});