@librechat/agents 3.2.35 → 3.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. package/dist/cjs/agents/AgentContext.cjs +75 -2
  2. package/dist/cjs/agents/AgentContext.cjs.map +1 -1
  3. package/dist/cjs/agents/projection.cjs +25 -0
  4. package/dist/cjs/agents/projection.cjs.map +1 -0
  5. package/dist/cjs/graphs/Graph.cjs +10 -26
  6. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  7. package/dist/cjs/langfuse.cjs +16 -5
  8. package/dist/cjs/langfuse.cjs.map +1 -1
  9. package/dist/cjs/langfuseToolOutputTracing.cjs +7 -0
  10. package/dist/cjs/langfuseToolOutputTracing.cjs.map +1 -1
  11. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +118 -7
  12. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
  13. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +44 -4
  14. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
  15. package/dist/cjs/main.cjs +7 -0
  16. package/dist/cjs/messages/budget.cjs +23 -0
  17. package/dist/cjs/messages/budget.cjs.map +1 -0
  18. package/dist/cjs/messages/cache.cjs +184 -0
  19. package/dist/cjs/messages/cache.cjs.map +1 -1
  20. package/dist/cjs/messages/index.cjs +1 -0
  21. package/dist/cjs/summarization/node.cjs +1 -1
  22. package/dist/cjs/summarization/node.cjs.map +1 -1
  23. package/dist/cjs/tools/search/format.cjs +91 -2
  24. package/dist/cjs/tools/search/format.cjs.map +1 -1
  25. package/dist/cjs/tools/search/tool.cjs +4 -3
  26. package/dist/cjs/tools/search/tool.cjs.map +1 -1
  27. package/dist/cjs/tools/toolOutputReferences.cjs +28 -14
  28. package/dist/cjs/tools/toolOutputReferences.cjs.map +1 -1
  29. package/dist/esm/agents/AgentContext.mjs +76 -3
  30. package/dist/esm/agents/AgentContext.mjs.map +1 -1
  31. package/dist/esm/agents/projection.mjs +25 -0
  32. package/dist/esm/agents/projection.mjs.map +1 -0
  33. package/dist/esm/graphs/Graph.mjs +9 -25
  34. package/dist/esm/graphs/Graph.mjs.map +1 -1
  35. package/dist/esm/langfuse.mjs +16 -5
  36. package/dist/esm/langfuse.mjs.map +1 -1
  37. package/dist/esm/langfuseToolOutputTracing.mjs +7 -0
  38. package/dist/esm/langfuseToolOutputTracing.mjs.map +1 -1
  39. package/dist/esm/llm/anthropic/utils/message_inputs.mjs +118 -7
  40. package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
  41. package/dist/esm/llm/bedrock/utils/message_inputs.mjs +44 -4
  42. package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
  43. package/dist/esm/main.mjs +4 -2
  44. package/dist/esm/messages/budget.mjs +23 -0
  45. package/dist/esm/messages/budget.mjs.map +1 -0
  46. package/dist/esm/messages/cache.mjs +182 -1
  47. package/dist/esm/messages/cache.mjs.map +1 -1
  48. package/dist/esm/messages/index.mjs +1 -0
  49. package/dist/esm/summarization/node.mjs +2 -2
  50. package/dist/esm/summarization/node.mjs.map +1 -1
  51. package/dist/esm/tools/search/format.mjs +91 -2
  52. package/dist/esm/tools/search/format.mjs.map +1 -1
  53. package/dist/esm/tools/search/tool.mjs +4 -3
  54. package/dist/esm/tools/search/tool.mjs.map +1 -1
  55. package/dist/esm/tools/toolOutputReferences.mjs +28 -14
  56. package/dist/esm/tools/toolOutputReferences.mjs.map +1 -1
  57. package/dist/types/agents/AgentContext.d.ts +30 -1
  58. package/dist/types/agents/projection.d.ts +26 -0
  59. package/dist/types/index.d.ts +1 -0
  60. package/dist/types/messages/budget.d.ts +11 -0
  61. package/dist/types/messages/cache.d.ts +47 -0
  62. package/dist/types/messages/index.d.ts +1 -0
  63. package/dist/types/tools/search/format.d.ts +4 -1
  64. package/dist/types/tools/search/types.d.ts +7 -0
  65. package/dist/types/types/graph.d.ts +2 -0
  66. package/package.json +2 -1
  67. package/src/agents/AgentContext.ts +105 -4
  68. package/src/agents/__tests__/AgentContext.test.ts +232 -9
  69. package/src/agents/__tests__/projection.test.ts +73 -0
  70. package/src/agents/projection.ts +46 -0
  71. package/src/graphs/Graph.ts +66 -65
  72. package/src/index.ts +3 -0
  73. package/src/langfuse.ts +38 -4
  74. package/src/langfuseToolOutputTracing.ts +18 -0
  75. package/src/llm/anthropic/utils/cross-provider-reasoning.test.ts +317 -0
  76. package/src/llm/anthropic/utils/message_inputs.ts +209 -19
  77. package/src/llm/anthropic/utils/stripPrefillCache.test.ts +111 -0
  78. package/src/llm/bedrock/utils/cross-provider-reasoning.test.ts +131 -0
  79. package/src/llm/bedrock/utils/message_inputs.test.ts +129 -0
  80. package/src/llm/bedrock/utils/message_inputs.ts +81 -4
  81. package/src/llm/bedrock/utils/toolResultCachePoint.test.ts +103 -0
  82. package/src/messages/budget.ts +32 -0
  83. package/src/messages/cache.tail.test.ts +340 -0
  84. package/src/messages/cache.ts +267 -1
  85. package/src/messages/index.ts +1 -0
  86. package/src/messages/tailCacheConversion.test.ts +161 -0
  87. package/src/scripts/bench-prompt-cache.ts +479 -0
  88. package/src/specs/langfuse-config.test.ts +69 -2
  89. package/src/specs/langfuse-metadata.test.ts +44 -0
  90. package/src/specs/langfuse-tool-output-tracing.test.ts +6 -0
  91. package/src/summarization/node.ts +2 -2
  92. package/src/tools/__tests__/annotateMessagesForLLM.test.ts +50 -0
  93. package/src/tools/search/format.test.ts +242 -0
  94. package/src/tools/search/format.ts +122 -5
  95. package/src/tools/search/tool.ts +5 -1
  96. package/src/tools/search/types.ts +7 -0
  97. package/src/tools/toolOutputReferences.ts +34 -20
  98. package/src/types/graph.ts +2 -0
@@ -0,0 +1,161 @@
1
+ import {
2
+ HumanMessage,
3
+ AIMessage,
4
+ ToolMessage,
5
+ type BaseMessage,
6
+ type MessageContentComplex,
7
+ } from '@langchain/core/messages';
8
+ import { _convertMessagesToAnthropicPayload } from '@/llm/anthropic/utils/message_inputs';
9
+ import { ensureThinkingBlockInMessages } from './format';
10
+ import { toLangChainContent } from './langchain';
11
+ import { addTailCacheControl } from './cache';
12
+ import { Providers } from '@/common';
13
+
14
+ /**
15
+ * Regression coverage for the single tail prompt-cache breakpoint surviving all
16
+ * the way into the final Anthropic payload — i.e. the marker must land on a
17
+ * block that actually ships, not one that downstream conversion / folding
18
+ * removes. Two ways the breakpoint was silently lost:
19
+ *
20
+ * - Foreign reasoning tail: addTailCacheControl anchored on a
21
+ * `reasoning_content`/`reasoning`/`think` block, which the Anthropic
22
+ * converter drops on assistant turns (cross-provider handoff).
23
+ * - Thinking-fold ordering: marking before ensureThinkingBlockInMessages let
24
+ * the fold rewrite the anchored AI→Tool tail into a `[Previous agent
25
+ * context]` HumanMessage that copies text but not cache_control.
26
+ */
27
+
28
/** Minimal view of a converted payload message: only `content` is inspected. */
type PayloadMessage = { content: unknown };

/** True when `block` is a non-null object that has a `cache_control` key. */
function hasCacheControl(block: unknown): boolean {
  return (
    typeof block === 'object' && block !== null && 'cache_control' in block
  );
}

/**
 * Does any block (top-level or nested in tool_result) carry cache_control?
 *
 * Messages whose `content` is not an array are skipped. Entries that are not
 * objects (`null`, `undefined`, strings, numbers) are skipped as well rather
 * than dereferenced, so a malformed payload yields `false` instead of a
 * TypeError.
 *
 * @param messages Converted payload messages to scan.
 * @returns `true` as soon as one marked block is found, otherwise `false`.
 */
function breakpointSurvives(messages: PayloadMessage[]): boolean {
  for (const m of messages) {
    if (!Array.isArray(m.content)) {
      continue;
    }
    for (const block of m.content as unknown[]) {
      if (hasCacheControl(block)) {
        return true;
      }
      // Only objects can have a nested `content` array worth inspecting.
      if (typeof block !== 'object' || block === null) {
        continue;
      }
      const inner = (block as { content?: unknown }).content;
      if (Array.isArray(inner) && inner.some((b) => hasCacheControl(b))) {
        return true;
      }
    }
  }
  return false;
}
54
+
55
describe('tail breakpoint survives Anthropic conversion', () => {
  /** Convert to the Anthropic payload and expose its messages for inspection. */
  const convert = (input: BaseMessage[]): PayloadMessage[] =>
    _convertMessagesToAnthropicPayload(input).messages as PayloadMessage[];

  /**
   * Human → AI (empty text, one `search` tool call) → Tool: the non-thinking
   * AI→Tool chain with no trailing human turn used by both ordering tests.
   */
  const buildToolTail = (): BaseMessage[] => [
    new HumanMessage('do the thing'),
    new AIMessage({
      content: '',
      tool_calls: [{ id: 't1', name: 'search', args: { q: 'x' } }],
    }),
    new ToolMessage({ tool_call_id: 't1', content: 'tool output text' }),
  ];

  test('foreign reasoning tail keeps a usable breakpoint (anchored on text)', () => {
    // Text block followed by a foreign `reasoning_content` block.
    const assistantContent = toLangChainContent([
      { type: 'text', text: 'Here is my answer.' },
      { type: 'reasoning_content', reasoningText: { text: 'r' } },
    ] as MessageContentComplex[]);
    const conversation: BaseMessage[] = [
      new HumanMessage('hello'),
      new AIMessage({ content: assistantContent }),
    ];

    const converted = convert(addTailCacheControl(conversation));

    expect(breakpointSurvives(converted)).toBe(true);
  });

  test('string tool-result tail keeps a usable breakpoint on the tool_result block', () => {
    const conversation: BaseMessage[] = [
      new HumanMessage('run it'),
      new AIMessage({
        content: 'calling',
        tool_calls: [{ id: 't1', name: 'search', args: {} }],
      }),
      new ToolMessage({ tool_call_id: 't1', content: 'result body' }),
    ];

    const converted = convert(addTailCacheControl(conversation));

    expect(breakpointSurvives(converted)).toBe(true);

    // The marker must sit on the top-level tool_result block (the documented
    // cacheable position), NOT nested inside tool_result.content.
    const isToolResult = (b: unknown): b is Record<string, unknown> =>
      b != null &&
      typeof b === 'object' &&
      'type' in b &&
      (b as { type?: string }).type === 'tool_result';
    const topLevelBlocks: unknown[] = converted.flatMap((m) =>
      Array.isArray(m.content) ? m.content : []
    );
    const toolResult = topLevelBlocks.find(isToolResult) as
      | { cache_control?: unknown; content?: unknown }
      | undefined;

    expect(toolResult?.cache_control).toEqual({ type: 'ephemeral' });

    const nested = toolResult?.content;
    if (Array.isArray(nested)) {
      const nestedMarked = nested.some(
        (b) => b != null && typeof b === 'object' && 'cache_control' in b
      );
      expect(nestedMarked).toBe(false);
    }
  });

  test('marking AFTER the thinking fold preserves the breakpoint (Graph order)', () => {
    const conversation = buildToolTail();

    // Graph applies the fold first, THEN the tail marker.
    const folded = ensureThinkingBlockInMessages(
      conversation,
      Providers.ANTHROPIC,
      undefined,
      conversation.length
    );
    const converted = convert(addTailCacheControl(folded));

    expect(breakpointSurvives(converted)).toBe(true);
  });

  test('marking BEFORE the fold loses the breakpoint (guards the ordering)', () => {
    const conversation = buildToolTail();

    // The buggy order: mark first, then fold drops the marker.
    const marked = addTailCacheControl(conversation);
    const folded = ensureThinkingBlockInMessages(
      marked,
      Providers.ANTHROPIC,
      undefined,
      conversation.length
    );
    const converted = convert(folded);

    expect(breakpointSurvives(converted)).toBe(false);
  });
});
@@ -0,0 +1,479 @@
1
+ /**
2
+ * Live, reproducible benchmark: single tail prompt-cache breakpoint (new
3
+ * default) vs. the legacy "last two user messages" strategy.
4
+ *
5
+ * It replays realistic harness conversations against a real provider and, for
6
+ * each model call, records the cache token breakdown the API reports. The two
7
+ * strategies are run over the SAME conversations (only the cache MARKING
8
+ * differs) under distinct cache namespaces, then compared.
9
+ *
10
+ * What it demonstrates
11
+ * --------------------
12
+ * - Agent tool loop (one user turn, many tool rounds): the legacy strategy
13
+ * pins its only message breakpoint on the lone user message, so every
14
+ * appended assistant/tool turn is re-sent UNCACHED on the next call — cache
15
+ * write/fresh ≫ read. The tail strategy rides the true tail, so the growing
16
+ * transcript is written once and read back. This is the dominant agent shape
17
+ * and where the legacy approach breaks down hardest.
18
+ * - Multi-turn chat (frequent user messages): legacy's two rolling markers do
19
+ * fine here; the tail strategy ties (never worse).
20
+ * - Realistic agent (user turns interleaved with tool rounds): tail wins.
21
+ *
22
+ * Metrics (per strategy, summed over all calls in a scenario)
23
+ * - cache_read : tokens served from cache (HIGHER is better).
24
+ * - cache_write : tokens written to cache (cache_creation).
25
+ * - fresh : uncached input processed at full price
26
+ * (= input_tokens - cache_read - cache_write); this is what
27
+ * balloons when caching fails to cover the transcript.
28
+ * - effective : a cost proxy in input-token-equivalents using Anthropic's
29
+ * published multipliers — read x0.1, write x1.25, fresh x1.0.
30
+ * LOWER is better.
31
+ *
32
+ * Usage
33
+ * # Anthropic (default). Needs ANTHROPIC_API_KEY in .env (or BENCH_ENV_FILE).
34
+ * npm run bench:cache
35
+ * # Bedrock. Needs BEDROCK_AWS_* creds.
36
+ * npm run bench:cache -- --provider bedrock
37
+ * # Options: --provider anthropic|bedrock --rounds <N> --model <id>
38
+ *
39
+ * Not a unit test (no `.test.` suffix) so CI never runs it; it makes real,
40
+ * paid API calls.
41
+ */
42
+ import { config } from 'dotenv';
43
+ config({ path: process.env.BENCH_ENV_FILE || '.env' });
44
+
45
+ import {
46
+ HumanMessage,
47
+ AIMessage,
48
+ ToolMessage,
49
+ type BaseMessage,
50
+ } from '@langchain/core/messages';
51
+ import { CustomAnthropic } from '@/llm/anthropic';
52
+ import { CustomChatBedrockConverse } from '@/llm/bedrock';
53
+ import {
54
+ addCacheControl,
55
+ addTailCacheControl,
56
+ addBedrockCacheControl,
57
+ addBedrockTailCacheControl,
58
+ } from '@/messages/cache';
59
+
60
/** Providers the benchmark can target. */
type ProviderName = 'anthropic' | 'bedrock';

/** Parsed command-line options. */
interface Args {
  provider: ProviderName;
  rounds: number;
  model?: string;
}

/**
 * Parse `--provider`, `--rounds` and `--model` from `process.argv`.
 *
 * Defaults: provider `anthropic`, rounds `6`, no model override. Unrecognized
 * arguments are ignored. Invalid values are rejected up front instead of
 * flowing into the run: an unchecked `--rounds` of NaN makes every scenario
 * loop run zero times, and an unknown provider would silently fall through to
 * the Anthropic branch.
 *
 * @returns The validated options.
 * @throws Error when a flag has no value, `--provider` is not
 *   `anthropic`/`bedrock`, or `--rounds` is not a positive integer.
 */
function parseArgs(): Args {
  const argv = process.argv.slice(2);
  const out: Args = { provider: 'anthropic', rounds: 6 };

  // Fetch the value following a flag, rejecting a missing value or another flag.
  const valueAt = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined || value.startsWith('--')) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === '--provider') {
      const value = valueAt(flag, ++i);
      if (value !== 'anthropic' && value !== 'bedrock') {
        throw new Error(
          `--provider must be "anthropic" or "bedrock" (got "${value}")`
        );
      }
      out.provider = value;
    } else if (flag === '--rounds') {
      const value = valueAt(flag, ++i);
      const rounds = Number(value);
      if (!Number.isInteger(rounds) || rounds < 1) {
        throw new Error(`--rounds must be a positive integer (got "${value}")`);
      }
      out.rounds = rounds;
    } else if (flag === '--model') {
      out.model = valueAt(flag, ++i);
    }
  }
  return out;
}
79
+
80
/**
 * Deterministic filler of roughly `tokens` tokens (~0.75 words/token).
 *
 * Produces space-separated words of the form `${tag}${i % 97}`, so the same
 * arguments always give the same text. At least one word is always emitted.
 *
 * @param tokens Approximate token budget; non-finite values (NaN, ±Infinity)
 *   fall back to a single word.
 * @param tag Prefix for every generated word.
 * @returns The filler text.
 */
function filler(tokens: number, tag: string): string {
  const estimate = Math.round(tokens * 0.75);
  // Math.max(1, NaN) is NaN (empty output) and Infinity would never finish,
  // so non-finite estimates collapse to the one-word minimum.
  const words = Number.isFinite(estimate) ? Math.max(1, estimate) : 1;
  return Array.from({ length: words }, (_, i) => `${tag}${i % 97}`).join(' ');
}
89
+
90
+ // ---------------------------------------------------------------------------
91
+ // Scenarios. Each returns the message list for every model call (call `i`
92
+ // sends step `i`; the transcript grows append-only between calls), built under
93
+ // a per-run nonce so the two strategy runs never share a cache namespace.
94
+ // ---------------------------------------------------------------------------
95
+
96
+ const STABLE_TOKENS = 2000; // big stable context (instructions / first request)
97
+ const TOOL_RESULT_TOKENS = 600; // realistic agent tool output (file/search)
98
+
99
/**
 * Build a `process_records` tool call.
 *
 * @param id Tool call id; the matching tool result must use the same id.
 * @param batch Batch number sent as the call's only argument.
 */
function processToolCall(id: string, batch: number) {
  const call = { id, name: 'process_records', args: { batch } };
  return call;
}
102
+
103
/**
 * Agent tool loop: ONE user turn, then `rounds` assistant→tool rounds.
 *
 * @param nonce Per-run marker embedded in the prompt text and tool call ids.
 * @param rounds Number of model calls (snapshots) to produce.
 * @returns One transcript snapshot per model call; each snapshot is taken
 *   before that round's assistant/tool pair is appended.
 */
function toolLoopScenario(nonce: string, rounds: number): BaseMessage[][] {
  const opening = new HumanMessage(
    `Session ${nonce}. Reference data follows.\n${filler(STABLE_TOKENS, `ref${nonce}`)}\n\n` +
      'Process every batch using the process_records tool until done.'
  );
  const transcript: BaseMessage[] = [opening];
  const snapshots: BaseMessage[][] = [];

  let round = 1;
  while (round <= rounds) {
    // Copy so later appends do not alter what this call sends.
    snapshots.push(transcript.slice());
    const assistantTurn = new AIMessage({
      content: `Processing batch ${round}.`,
      tool_calls: [processToolCall(`tl_${nonce}_${round}`, round)],
    });
    const toolTurn = new ToolMessage({
      tool_call_id: `tl_${nonce}_${round}`,
      content: `Batch ${round} of session ${nonce} complete. ${filler(TOOL_RESULT_TOKENS, `out${round}`)}`,
    });
    transcript.push(assistantTurn, toolTurn);
    round += 1;
  }
  return snapshots;
}
129
+
130
/**
 * Multi-turn chat: frequent user messages, no tools (legacy's good case).
 *
 * @param nonce Per-run marker embedded in the opening prompt.
 * @param rounds Number of model calls (snapshots) to produce.
 * @returns One transcript snapshot per model call; every snapshot ends on a
 *   user message, after which an answer and the next question are appended.
 */
function chatScenario(nonce: string, rounds: number): BaseMessage[][] {
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}.\n${filler(STABLE_TOKENS, `doc${nonce}`)}\n\nQuestion 1: summarize.`
    ),
  ];
  const snapshots: BaseMessage[][] = [];

  for (let turn = 1; turn <= rounds; turn += 1) {
    snapshots.push(Array.from(transcript));
    const answer = new AIMessage(`Answer ${turn}. ${filler(120, `ans${turn}`)}`);
    const followUp = new HumanMessage(
      `Question ${turn + 1}: ${filler(60, `q${turn + 1}`)}`
    );
    transcript.push(answer, followUp);
  }
  return snapshots;
}
147
+
148
/**
 * Realistic agent: each user turn triggers two tool rounds, then a new user.
 *
 * @param nonce Per-run marker embedded in the prompt text and tool call ids.
 * @param rounds Number of user turns; each turn yields three snapshots (two
 *   tool rounds plus the summarizing call).
 * @returns One transcript snapshot per model call.
 */
function agentMixedScenario(nonce: string, rounds: number): BaseMessage[][] {
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. Project context:\n${filler(STABLE_TOKENS, `ctx${nonce}`)}\n\nTask 1: investigate.`
    ),
  ];
  const snapshots: BaseMessage[][] = [];
  // Running tool-call counter across all turns; keeps call ids unique.
  let toolCalls = 0;

  for (let turn = 1; turn <= rounds; turn += 1) {
    // Two tool rounds within this user turn.
    for (let stepNo = 1; stepNo <= 2; stepNo += 1) {
      snapshots.push([...transcript]);
      toolCalls += 1;
      const callId = `am_${nonce}_${toolCalls}`;
      const assistantTurn = new AIMessage({
        content: `Turn ${turn} step ${stepNo}.`,
        tool_calls: [
          { id: callId, name: 'process_records', args: { step: toolCalls } },
        ],
      });
      const toolTurn = new ToolMessage({
        tool_call_id: callId,
        content: `Result ${toolCalls} (${nonce}). ${filler(TOOL_RESULT_TOKENS, `r${toolCalls}`)}`,
      });
      transcript.push(assistantTurn, toolTurn);
    }

    // The model summarizes, then the user asks for the next task.
    snapshots.push([...transcript]);
    transcript.push(
      new AIMessage(`Turn ${turn} summary. ${filler(80, `s${turn}`)}`),
      new HumanMessage(`Task ${turn + 1}: ${filler(60, `t${turn + 1}`)}`)
    );
  }
  return snapshots;
}
185
+
186
+ const SUMMARY_TOKENS = 1500; // compacted-history summary injected post-compaction
187
+
188
/**
 * Post-compaction (summarization) scenario.
 *
 * Phase 1 runs two tool rounds on the original context. A compaction event
 * then replaces the head with a summary message, and the agent continues for
 * `rounds` further tool rounds on that new head. The first post-compaction
 * call is a deliberate cache miss for BOTH strategies, because the prefix
 * genuinely changed. The phase that matters is what follows: the summary is
 * the new stable head, and the continuing tool loop grows append-only behind
 * it. (Tool results here stand in for the already-truncated strings that get
 * persisted, so they do not change the prefix between calls.)
 *
 * @param nonce Per-run marker embedded in the prompt text and tool call ids.
 * @param rounds Number of post-compaction tool rounds.
 * @returns One transcript snapshot per model call: two pre-compaction
 *   snapshots followed by `rounds` post-compaction snapshots.
 */
function postCompactionScenario(
  nonce: string,
  rounds: number
): BaseMessage[][] {
  const snapshots: BaseMessage[][] = [];

  /** Text builders that distinguish one phase's tool rounds from the other's. */
  interface PhaseText {
    callId: (i: number) => string;
    announce: (i: number) => string;
    result: (i: number) => string;
  }

  // Snapshot the transcript, then append one assistant tool call and its
  // tool result, `count` times.
  const appendToolRounds = (
    transcript: BaseMessage[],
    count: number,
    text: PhaseText
  ): void => {
    for (let i = 1; i <= count; i++) {
      snapshots.push([...transcript]);
      transcript.push(
        new AIMessage({
          content: text.announce(i),
          tool_calls: [processToolCall(text.callId(i), i)],
        })
      );
      transcript.push(
        new ToolMessage({
          tool_call_id: text.callId(i),
          content: text.result(i),
        })
      );
    }
  };

  // Phase 1: pre-compaction growth on the original context.
  const pre: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. ${filler(STABLE_TOKENS, `pre${nonce}`)}\n\nAnalyze the dataset.`
    ),
  ];
  appendToolRounds(pre, 2, {
    callId: (i) => `pc_${nonce}_${i}`,
    announce: (i) => `Pre ${i}.`,
    result: (i) => `Pre result ${i}. ${filler(TOOL_RESULT_TOKENS, `pr${i}`)}`,
  });

  // Compaction: head replaced by a durable summary; continue from there.
  const post: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce} (resumed after compaction).\n<summary>\n${filler(SUMMARY_TOKENS, `sum${nonce}`)}\n</summary>\n\nContinue the analysis.`
    ),
  ];
  appendToolRounds(post, rounds, {
    callId: (i) => `po_${nonce}_${i}`,
    announce: (i) => `Post ${i}.`,
    result: (i) => `Post result ${i}. ${filler(TOOL_RESULT_TOKENS, `po${i}`)}`,
  });

  return snapshots;
}
263
+
264
/** A display name plus the builder that yields one message list per model call. */
interface ScenarioSpec {
  name: string;
  build: (nonce: string, rounds: number) => BaseMessage[][];
}

/** Scenarios run by the benchmark, in print order. */
const SCENARIOS: ScenarioSpec[] = [
  { name: 'Agent tool loop (1 user turn, N tool rounds)', build: toolLoopScenario },
  { name: 'Multi-turn chat (frequent user messages)', build: chatScenario },
  { name: 'Realistic agent (user turns + tool rounds)', build: agentMixedScenario },
  {
    name: 'Post-compaction (summary head + continued tool loop)',
    build: postCompactionScenario,
  },
];
282
+
283
+ // ---------------------------------------------------------------------------
284
+ // Provider plumbing.
285
+ // ---------------------------------------------------------------------------
286
+
287
/**
 * Schema for the `process_records` tool handed to `bindTools`; declares the
 * `batch` and `step` number arguments the scenarios send.
 */
const PROCESS_TOOL = {
  type: 'function' as const,
  function: {
    name: 'process_records',
    description: 'Process a batch of records.',
    parameters: {
      type: 'object',
      properties: { batch: { type: 'number' }, step: { type: 'number' } },
    },
  },
};

/** The two cache-marking functions compared for a provider. */
interface StrategyPair {
  legacy: (m: BaseMessage[]) => BaseMessage[];
  tail: (m: BaseMessage[]) => BaseMessage[];
}

/**
 * Build the model client, cache-marking strategies and display label for the
 * selected provider.
 *
 * Both clients are created with streaming, stream usage and prompt caching
 * enabled and `maxTokens: 32`, and have `PROCESS_TOOL` bound. Any provider
 * other than `bedrock` takes the Anthropic path.
 *
 * @param args Parsed CLI options; `args.model` overrides the default model id.
 * @returns `invoke` (sends messages, resolves to the reply's
 *   `usage_metadata`), `strategies`, and a `provider:model` label.
 */
function makeProvider(args: Args): {
  invoke: (messages: BaseMessage[]) => Promise<Usage | undefined>;
  strategies: StrategyPair;
  label: string;
} {
  if (args.provider !== 'bedrock') {
    const modelId = args.model ?? 'claude-sonnet-4-5';
    const anthropic = new CustomAnthropic({
      model: modelId,
      apiKey: process.env.ANTHROPIC_API_KEY,
      maxTokens: 32,
      promptCache: true,
      streaming: true,
      streamUsage: true,
    } as never).bindTools([PROCESS_TOOL]);

    return {
      label: `anthropic:${modelId}`,
      invoke: async (messages) => {
        const reply = await anthropic.invoke(messages);
        return reply.usage_metadata as Usage;
      },
      strategies: {
        legacy: (m) => addCacheControl<BaseMessage>(m),
        tail: (m) => addTailCacheControl<BaseMessage>(m),
      },
    };
  }

  const modelId = args.model ?? 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
  // Region: Bedrock-specific variable first, then the AWS default, then us-east-1.
  const region =
    process.env.BEDROCK_AWS_REGION ??
    process.env.AWS_DEFAULT_REGION ??
    'us-east-1';
  const bedrock = new CustomChatBedrockConverse({
    model: modelId,
    region,
    credentials: {
      accessKeyId: process.env.BEDROCK_AWS_ACCESS_KEY_ID!,
      secretAccessKey: process.env.BEDROCK_AWS_SECRET_ACCESS_KEY!,
    },
    streaming: true,
    streamUsage: true,
    maxTokens: 32,
    promptCache: true,
  }).bindTools([PROCESS_TOOL]);

  return {
    label: `bedrock:${modelId}`,
    invoke: async (messages) => {
      const reply = await bedrock.invoke(messages);
      return reply.usage_metadata as Usage;
    },
    strategies: {
      legacy: (m) => addBedrockCacheControl<BaseMessage>(m),
      tail: (m) => addBedrockTailCacheControl<BaseMessage>(m),
    },
  };
}
356
+
357
/** The usage fields this benchmark reads from a model reply. */
type Usage = {
  input_tokens?: number;
  output_tokens?: number;
  total_tokens?: number;
  input_token_details?: { cache_creation?: number; cache_read?: number };
};

/** Running token sums for one strategy across a scenario. */
interface Totals {
  read: number;
  write: number;
  fresh: number;
  effective: number;
}

/** A fresh accumulator with every counter at zero. */
function emptyTotals(): Totals {
  const zeroed: Totals = { read: 0, write: 0, fresh: 0, effective: 0 };
  return zeroed;
}

/**
 * Fold one call's usage into the running totals (mutates `t`).
 *
 * `fresh` is derived as `total_tokens - output_tokens - cache_read -
 * cache_creation`, clamped at zero, so it does not depend on how a provider
 * defines `input_tokens`. Missing fields, or a missing usage object, count as
 * zero.
 *
 * @param t Accumulator updated in place.
 * @param u Usage for one model call, if any was reported.
 */
function addUsage(t: Totals, u: Usage | undefined): void {
  const cacheRead = u?.input_token_details?.cache_read ?? 0;
  const cacheWrite = u?.input_token_details?.cache_creation ?? 0;
  const totalTokens = u?.total_tokens ?? 0;
  const outputTokens = u?.output_tokens ?? 0;

  // Whole prompt size, minus the cached buckets, is the full-price remainder.
  const promptTokens = totalTokens - outputTokens;
  const fresh = Math.max(0, promptTokens - cacheRead - cacheWrite);

  t.read += cacheRead;
  t.write += cacheWrite;
  t.fresh += fresh;
  // Cost proxy weights: fresh 1x, cache write 1.25x, cache read 0.1x.
  t.effective += fresh + cacheWrite * 1.25 + cacheRead * 0.1;
}
392
+
393
/**
 * Replay a scenario under one cache-marking strategy and sum the usage.
 *
 * Calls are awaited one at a time, in order.
 *
 * @param steps Message list for each model call.
 * @param apply Cache-marking function applied to each list before sending.
 * @param invoke Sends the messages and resolves to the reported usage.
 * @returns Totals summed over every call.
 */
async function runStrategy(
  steps: BaseMessage[][],
  apply: (m: BaseMessage[]) => BaseMessage[],
  invoke: (m: BaseMessage[]) => Promise<Usage | undefined>
): Promise<Totals> {
  const accumulated = emptyTotals();
  for (const transcript of steps) {
    const marked = apply(transcript);
    const reported = await invoke(marked);
    addUsage(accumulated, reported);
  }
  return accumulated;
}
405
+
406
/**
 * Format the relative change of `tail` against `legacy` as a signed
 * whole-number percentage.
 *
 * @param legacy Baseline value.
 * @param tail Value being compared to the baseline.
 * @returns `'+50%'` / `'-50%'` style text; `'0%'` when the change rounds to
 *   zero or both values are 0; `'n/a'` when the baseline is 0 but `tail` is
 *   not, or when the ratio is not a finite number.
 */
function pct(legacy: number, tail: number): string {
  if (legacy === 0) return tail === 0 ? '0%' : 'n/a';
  const delta = ((tail - legacy) / legacy) * 100;
  if (!Number.isFinite(delta)) return 'n/a';
  const rounded = delta.toFixed(0);
  // toFixed(0) renders small negatives as "-0"; report any zero-rounded change
  // as an unsigned "0%", matching the 0-vs-0 case above.
  if (rounded === '0' || rounded === '-0') return '0%';
  return `${delta > 0 ? '+' : ''}${rounded}%`;
}
411
+
412
/**
 * Build a per-run marker: `tag`, the current time and a random integer below
 * one million, the latter two in base 36, joined by dashes.
 */
function uniqueNonce(tag: string): string {
  const stamp = Date.now().toString(36);
  const salt = Math.floor(Math.random() * 1e6).toString(36);
  return [tag, stamp, salt].join('-');
}
415
+
416
/**
 * Benchmark entry point.
 *
 * Checks that the selected provider's credentials are present, then for each
 * scenario runs the legacy strategy followed by the tail strategy (each on a
 * conversation built with its own nonce), prints both totals and their
 * deltas, and finally prints how many scenarios the tail strategy won or
 * tied.
 *
 * A scenario is a tie when the two `effective` costs differ by less than 3%
 * of the legacy cost, whichever side is cheaper. Ties count toward the final
 * "better-or-equal" tally.
 *
 * Exits the process with status 1 when required credentials are missing.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  if (args.provider === 'anthropic' && !process.env.ANTHROPIC_API_KEY) {
    console.error('Set ANTHROPIC_API_KEY (in .env or via BENCH_ENV_FILE).');
    process.exit(1);
  }
  // Both halves of the key pair are needed; a missing secret would otherwise
  // only surface later as a failure inside the provider client.
  if (
    args.provider === 'bedrock' &&
    (!process.env.BEDROCK_AWS_ACCESS_KEY_ID ||
      !process.env.BEDROCK_AWS_SECRET_ACCESS_KEY)
  ) {
    console.error(
      'Set BEDROCK_AWS_ACCESS_KEY_ID / BEDROCK_AWS_SECRET_ACCESS_KEY.'
    );
    process.exit(1);
  }

  const { invoke, strategies, label } = makeProvider(args);
  console.log(`\nProvider: ${label} rounds=${args.rounds}`);
  console.log(
    'Metrics summed over all calls in a scenario. read↑ better; fresh↓ and effective↓ better.\n'
  );

  let tailWins = 0;
  let scenarioCount = 0;

  for (const scenario of SCENARIOS) {
    // Distinct nonce per strategy run so legacy and tail never share a cache.
    const legacySteps = scenario.build(uniqueNonce('legacy'), args.rounds);
    const legacy = await runStrategy(legacySteps, strategies.legacy, invoke);
    const tailSteps = scenario.build(uniqueNonce('tail'), args.rounds);
    const tail = await runStrategy(tailSteps, strategies.tail, invoke);

    console.log(`SCENARIO: ${scenario.name} (${legacySteps.length} calls)`);
    const row = (name: string, t: Totals): string =>
      ` ${name.padEnd(8)} read=${String(t.read).padStart(7)} write=${String(
        t.write
      ).padStart(7)} fresh=${String(t.fresh).padStart(7)} effective=${String(
        Math.round(t.effective)
      ).padStart(7)}`;
    console.log(row('legacy', legacy));
    console.log(row('tail', tail));
    console.log(
      ` Δ tail vs legacy: read ${pct(legacy.read, tail.read)} ` +
        `fresh ${pct(legacy.fresh, tail.fresh)} ` +
        `effective ${pct(legacy.effective, tail.effective)} (lower=cheaper)`
    );

    // Decide the tie first so the tolerance applies in both directions: a
    // tail run that is marginally more expensive is noise, not a loss.
    const tie =
      Math.abs(tail.effective - legacy.effective) / (legacy.effective || 1) <
      0.03;
    const tailCheaper = tail.effective < legacy.effective;
    let verdict: string;
    if (tie) {
      verdict = '≈ TIE';
    } else if (tailCheaper) {
      verdict = '✅ TAIL WINS';
    } else {
      verdict = '❌ legacy better';
    }
    console.log(` → ${verdict}\n`);

    scenarioCount++;
    if (tie || tailCheaper) tailWins++;
  }

  console.log(
    `RESULT: tail strategy is better-or-equal in ${tailWins}/${scenarioCount} scenarios.`
  );
}
475
+
476
// Run the benchmark; any rejection is reported and turned into exit status 1.
const reportFailure = (err: unknown): void => {
  console.error('Benchmark failed:', err);
  process.exit(1);
};
main().catch(reportFailure);