@librechat/agents 3.2.36 → 3.2.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/agents/AgentContext.cjs +1 -1
- package/dist/cjs/agents/AgentContext.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +7 -8
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/langfuse.cjs +16 -5
- package/dist/cjs/langfuse.cjs.map +1 -1
- package/dist/cjs/langfuseToolOutputTracing.cjs +7 -0
- package/dist/cjs/langfuseToolOutputTracing.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +92 -3
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +24 -4
- package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/main.cjs +2 -0
- package/dist/cjs/messages/cache.cjs +183 -0
- package/dist/cjs/messages/cache.cjs.map +1 -1
- package/dist/cjs/summarization/node.cjs +1 -1
- package/dist/cjs/summarization/node.cjs.map +1 -1
- package/dist/cjs/tools/toolOutputReferences.cjs +28 -14
- package/dist/cjs/tools/toolOutputReferences.cjs.map +1 -1
- package/dist/esm/agents/AgentContext.mjs +2 -2
- package/dist/esm/agents/AgentContext.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +8 -9
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/langfuse.mjs +16 -5
- package/dist/esm/langfuse.mjs.map +1 -1
- package/dist/esm/langfuseToolOutputTracing.mjs +7 -0
- package/dist/esm/langfuseToolOutputTracing.mjs.map +1 -1
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs +92 -3
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/llm/bedrock/utils/message_inputs.mjs +24 -4
- package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/main.mjs +2 -2
- package/dist/esm/messages/cache.mjs +182 -1
- package/dist/esm/messages/cache.mjs.map +1 -1
- package/dist/esm/summarization/node.mjs +2 -2
- package/dist/esm/summarization/node.mjs.map +1 -1
- package/dist/esm/tools/toolOutputReferences.mjs +28 -14
- package/dist/esm/tools/toolOutputReferences.mjs.map +1 -1
- package/dist/types/messages/cache.d.ts +40 -0
- package/dist/types/types/graph.d.ts +2 -0
- package/package.json +8 -5
- package/src/agents/AgentContext.ts +2 -2
- package/src/agents/__tests__/AgentContext.test.ts +3 -9
- package/src/graphs/Graph.ts +65 -36
- package/src/langfuse.ts +38 -4
- package/src/langfuseToolOutputTracing.ts +18 -0
- package/src/llm/anthropic/utils/message_inputs.ts +131 -3
- package/src/llm/anthropic/utils/stripPrefillCache.test.ts +111 -0
- package/src/llm/bedrock/utils/message_inputs.test.ts +129 -0
- package/src/llm/bedrock/utils/message_inputs.ts +46 -4
- package/src/llm/bedrock/utils/toolResultCachePoint.test.ts +103 -0
- package/src/messages/cache.tail.test.ts +340 -0
- package/src/messages/cache.ts +266 -0
- package/src/messages/tailCacheConversion.test.ts +161 -0
- package/src/scripts/bench-prompt-cache.ts +479 -0
- package/src/specs/langfuse-config.test.ts +69 -2
- package/src/specs/langfuse-metadata.test.ts +44 -0
- package/src/specs/langfuse-tool-output-tracing.test.ts +6 -0
- package/src/summarization/node.ts +2 -2
- package/src/tools/__tests__/annotateMessagesForLLM.test.ts +50 -0
- package/src/tools/toolOutputReferences.ts +34 -20
- package/src/types/graph.ts +2 -0
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Live, reproducible benchmark: single tail prompt-cache breakpoint (new
|
|
3
|
+
* default) vs. the legacy "last two user messages" strategy.
|
|
4
|
+
*
|
|
5
|
+
* It replays realistic harness conversations against a real provider and, for
|
|
6
|
+
* each model call, records the cache token breakdown the API reports. The two
|
|
7
|
+
* strategies are run over the SAME conversations (only the cache MARKING
|
|
8
|
+
* differs) under distinct cache namespaces, then compared.
|
|
9
|
+
*
|
|
10
|
+
* What it demonstrates
|
|
11
|
+
* --------------------
|
|
12
|
+
* - Agent tool loop (one user turn, many tool rounds): the legacy strategy
|
|
13
|
+
* pins its only message breakpoint on the lone user message, so every
|
|
14
|
+
* appended assistant/tool turn is re-sent UNCACHED on the next call — cache
|
|
15
|
+
* write/fresh ≫ read. The tail strategy rides the true tail, so the growing
|
|
16
|
+
* transcript is written once and read back. This is the dominant agent shape
|
|
17
|
+
* and where the legacy approach breaks down hardest.
|
|
18
|
+
* - Multi-turn chat (frequent user messages): legacy's two rolling markers do
|
|
19
|
+
* fine here; the tail strategy ties (never worse).
|
|
20
|
+
* - Realistic agent (user turns interleaved with tool rounds): tail wins.
|
|
21
|
+
*
|
|
22
|
+
* Metrics (per strategy, summed over all calls in a scenario)
|
|
23
|
+
* - cache_read : tokens served from cache (HIGHER is better).
|
|
24
|
+
* - cache_write : tokens written to cache (cache_creation).
|
|
25
|
+
* - fresh : uncached input processed at full price
|
|
26
|
+
 *                 (= total_tokens - output_tokens - cache_read - cache_write); this is what
|
|
27
|
+
* balloons when caching fails to cover the transcript.
|
|
28
|
+
* - effective : a cost proxy in input-token-equivalents using Anthropic's
|
|
29
|
+
* published multipliers — read x0.1, write x1.25, fresh x1.0.
|
|
30
|
+
* LOWER is better.
|
|
31
|
+
*
|
|
32
|
+
* Usage
|
|
33
|
+
* # Anthropic (default). Needs ANTHROPIC_API_KEY in .env (or BENCH_ENV_FILE).
|
|
34
|
+
* npm run bench:cache
|
|
35
|
+
* # Bedrock. Needs BEDROCK_AWS_* creds.
|
|
36
|
+
* npm run bench:cache -- --provider bedrock
|
|
37
|
+
* # Options: --provider anthropic|bedrock --rounds <N> --model <id>
|
|
38
|
+
*
|
|
39
|
+
* Not a unit test (no `.test.` suffix) so CI never runs it; it makes real,
|
|
40
|
+
* paid API calls.
|
|
41
|
+
*/
|
|
42
|
+
import { config } from 'dotenv';
|
|
43
|
+
config({ path: process.env.BENCH_ENV_FILE || '.env' });
|
|
44
|
+
|
|
45
|
+
import {
|
|
46
|
+
HumanMessage,
|
|
47
|
+
AIMessage,
|
|
48
|
+
ToolMessage,
|
|
49
|
+
type BaseMessage,
|
|
50
|
+
} from '@langchain/core/messages';
|
|
51
|
+
import { CustomAnthropic } from '@/llm/anthropic';
|
|
52
|
+
import { CustomChatBedrockConverse } from '@/llm/bedrock';
|
|
53
|
+
import {
|
|
54
|
+
addCacheControl,
|
|
55
|
+
addTailCacheControl,
|
|
56
|
+
addBedrockCacheControl,
|
|
57
|
+
addBedrockTailCacheControl,
|
|
58
|
+
} from '@/messages/cache';
|
|
59
|
+
|
|
60
|
+
type ProviderName = 'anthropic' | 'bedrock';
|
|
61
|
+
|
|
62
|
+
/** Command-line options accepted by the benchmark script. */
interface Args {
  /** Which provider client to benchmark against. */
  provider: ProviderName;

  /** Round count handed to every scenario builder. */
  rounds: number;

  /** Optional model id; when absent each provider branch supplies its own default. */
  model?: string;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Parses the benchmark's command-line flags from `process.argv`.
 *
 * Recognized flags: `--provider anthropic|bedrock`, `--rounds <N>` and
 * `--model <id>`. Any other argument is ignored. Defaults are the Anthropic
 * provider with 6 rounds; `model` stays unset unless supplied.
 *
 * @returns The parsed options.
 * @throws {Error} When a recognized flag has no value, `--provider` names an
 *   unsupported provider, or `--rounds` is not a positive integer. Rejecting
 *   bad input up front prevents an unknown provider from silently running the
 *   Anthropic path and a NaN round count from producing empty scenarios.
 */
function parseArgs(): Args {
  const argv = process.argv.slice(2);
  const out: Args = { provider: 'anthropic', rounds: 6 };
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag !== '--provider' && flag !== '--rounds' && flag !== '--model') {
      continue;
    }
    // Every recognized flag consumes the following argument as its value.
    const value = argv[++i];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    if (flag === '--provider') {
      if (value !== 'anthropic' && value !== 'bedrock') {
        throw new Error(
          `Unsupported --provider "${value}" (expected anthropic or bedrock)`
        );
      }
      out.provider = value;
    } else if (flag === '--rounds') {
      const rounds = Number(value);
      if (!Number.isInteger(rounds) || rounds < 1) {
        throw new Error(
          `Invalid --rounds "${value}" (expected a positive integer)`
        );
      }
      out.rounds = rounds;
    } else {
      out.model = value;
    }
  }
  return out;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Produces deterministic padding text sized at roughly `tokens` tokens,
 * using the ~0.75 words-per-token estimate.
 *
 * Each word is `tag` followed by a counter that cycles through 0..96, so the
 * same `(tokens, tag)` pair always yields the same string. At least one word
 * is always emitted.
 *
 * @param tokens - Approximate token budget for the padding.
 * @param tag - Prefix placed on every generated word.
 * @returns The generated words joined by single spaces.
 */
function filler(tokens: number, tag: string): string {
  const wordCount = Math.max(1, Math.round(tokens * 0.75));
  return Array.from(
    { length: wordCount },
    (_unused, index) => `${tag}${index % 97}`
  ).join(' ');
}
|
|
89
|
+
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// Scenarios. Each returns the message list for every model call (call `i`
|
|
92
|
+
// sends step `i`; the transcript grows append-only between calls), built under
|
|
93
|
+
// a per-run nonce so the two strategy runs never share a cache namespace.
|
|
94
|
+
// ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
const STABLE_TOKENS = 2000; // big stable context (instructions / first request)
|
|
97
|
+
const TOOL_RESULT_TOKENS = 600; // realistic agent tool output (file/search)
|
|
98
|
+
|
|
99
|
+
/**
 * Builds a `process_records` tool-call descriptor.
 *
 * @param id - Tool-call id to attach to the descriptor.
 * @param batch - Batch number passed as the tool's only argument.
 * @returns An object with `id`, `name` and `args` fields.
 */
function processToolCall(id: string, batch: number) {
  const args = { batch };
  return { id, name: 'process_records', args };
}
|
|
102
|
+
|
|
103
|
+
/**
 * Agent tool loop scenario: a single user turn followed by `rounds`
 * assistant→tool rounds.
 *
 * A snapshot of the transcript is recorded before each round is appended, so
 * the result holds `rounds` message lists; snapshot k contains the opening
 * user message plus the k-1 rounds completed so far.
 *
 * @param nonce - Per-run marker embedded in the message text and tool-call ids.
 * @param rounds - Number of assistant→tool rounds (and of returned snapshots).
 * @returns One message list per model call, in call order.
 */
function toolLoopScenario(nonce: string, rounds: number): BaseMessage[][] {
  const snapshots: BaseMessage[][] = [];
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. Reference data follows.\n${filler(STABLE_TOKENS, `ref${nonce}`)}\n\n` +
        'Process every batch using the process_records tool until done.'
    ),
  ];

  for (let round = 1; round <= rounds; round += 1) {
    // Capture what this call sends before the round's messages are appended.
    snapshots.push(transcript.slice());

    const callId = `tl_${nonce}_${round}`;
    transcript.push(
      new AIMessage({
        content: `Processing batch ${round}.`,
        tool_calls: [processToolCall(callId, round)],
      }),
      new ToolMessage({
        tool_call_id: callId,
        content: `Batch ${round} of session ${nonce} complete. ${filler(TOOL_RESULT_TOKENS, `out${round}`)}`,
      })
    );
  }

  return snapshots;
}
|
|
129
|
+
|
|
130
|
+
/**
 * Multi-turn chat scenario: frequent user messages and no tool calls (the
 * case the legacy strategy handles well).
 *
 * Each round records a snapshot, then appends an assistant answer and the
 * next user question, so the result holds `rounds` message lists.
 *
 * @param nonce - Per-run marker embedded in the opening message.
 * @param rounds - Number of answer/question exchanges (and of returned snapshots).
 * @returns One message list per model call, in call order.
 */
function chatScenario(nonce: string, rounds: number): BaseMessage[][] {
  const snapshots: BaseMessage[][] = [];
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}.\n${filler(STABLE_TOKENS, `doc${nonce}`)}\n\nQuestion 1: summarize.`
    ),
  ];

  for (let turn = 1; turn <= rounds; turn += 1) {
    snapshots.push(transcript.slice());

    const nextQuestion = turn + 1;
    transcript.push(
      new AIMessage(`Answer ${turn}. ${filler(120, `ans${turn}`)}`),
      new HumanMessage(
        `Question ${nextQuestion}: ${filler(60, `q${nextQuestion}`)}`
      )
    );
  }

  return snapshots;
}
|
|
147
|
+
|
|
148
|
+
/**
 * Realistic agent scenario: every user turn triggers two tool rounds, then an
 * assistant summary and the next user task.
 *
 * A snapshot is recorded before each of the two tool rounds and once more
 * before the summary, giving three message lists per turn (`rounds * 3` in
 * total). Tool-call ids and result tags use a counter that keeps increasing
 * across turns.
 *
 * @param nonce - Per-run marker embedded in the message text and tool-call ids.
 * @param rounds - Number of user turns to simulate.
 * @returns One message list per model call, in call order.
 */
function agentMixedScenario(nonce: string, rounds: number): BaseMessage[][] {
  const TOOL_ROUNDS_PER_TURN = 2;
  const snapshots: BaseMessage[][] = [];
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. Project context:\n${filler(STABLE_TOKENS, `ctx${nonce}`)}\n\nTask 1: investigate.`
    ),
  ];
  let toolCallCount = 0;

  for (let turn = 1; turn <= rounds; turn += 1) {
    // Tool rounds that belong to the current user turn.
    for (let stepNo = 1; stepNo <= TOOL_ROUNDS_PER_TURN; stepNo += 1) {
      snapshots.push(transcript.slice());
      toolCallCount += 1;

      const callId = `am_${nonce}_${toolCallCount}`;
      transcript.push(
        new AIMessage({
          content: `Turn ${turn} step ${stepNo}.`,
          tool_calls: [
            {
              id: callId,
              name: 'process_records',
              args: { step: toolCallCount },
            },
          ],
        }),
        new ToolMessage({
          tool_call_id: callId,
          content: `Result ${toolCallCount} (${nonce}). ${filler(TOOL_RESULT_TOKENS, `r${toolCallCount}`)}`,
        })
      );
    }

    // The model wraps up the turn and the user hands over the next task.
    snapshots.push(transcript.slice());
    const nextTask = turn + 1;
    transcript.push(
      new AIMessage(`Turn ${turn} summary. ${filler(80, `s${turn}`)}`),
      new HumanMessage(`Task ${nextTask}: ${filler(60, `t${nextTask}`)}`)
    );
  }

  return snapshots;
}
|
|
185
|
+
|
|
186
|
+
const SUMMARY_TOKENS = 1500; // compacted-history summary injected post-compaction
|
|
187
|
+
|
|
188
|
+
/**
 * Post-compaction (summarization) scenario.
 *
 * Phase 1 runs two tool rounds on the original context. A compaction event
 * then replaces the head with a summary message, and phase 2 continues with
 * `rounds` more tool rounds on top of that summary. The first post-compaction
 * call is a deliberate cache miss for BOTH strategies, because the cached
 * prefix genuinely changed. The phase that matters is what follows: the
 * summary becomes the new stable head, and the tail strategy can resume
 * append-only caching over the continuing tool loop, whereas legacy pins on
 * the lone summary user message and re-sends the new tool work uncached.
 * (Tool results here stand in for the truncated, persisted strings ToolNode
 * stores; truncation happens once at execution time with a model-fixed cap,
 * so it does not mutate the prefix across turns.)
 *
 * @param nonce - Per-run marker embedded in the message text and tool-call ids.
 * @param rounds - Number of post-compaction tool rounds.
 * @returns Two pre-compaction message lists followed by `rounds`
 *   post-compaction message lists, in call order.
 */
function postCompactionScenario(
  nonce: string,
  rounds: number
): BaseMessage[][] {
  const PRE_COMPACTION_ROUNDS = 2;
  const snapshots: BaseMessage[][] = [];

  // Phase 1: the transcript grows on the original context.
  const beforeCompaction: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. ${filler(STABLE_TOKENS, `pre${nonce}`)}\n\nAnalyze the dataset.`
    ),
  ];
  for (let round = 1; round <= PRE_COMPACTION_ROUNDS; round += 1) {
    snapshots.push(beforeCompaction.slice());

    const callId = `pc_${nonce}_${round}`;
    beforeCompaction.push(
      new AIMessage({
        content: `Pre ${round}.`,
        tool_calls: [
          { id: callId, name: 'process_records', args: { batch: round } },
        ],
      }),
      new ToolMessage({
        tool_call_id: callId,
        content: `Pre result ${round}. ${filler(TOOL_RESULT_TOKENS, `pr${round}`)}`,
      })
    );
  }

  // Compaction: a durable summary replaces the head and work resumes from it.
  const afterCompaction: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce} (resumed after compaction).\n<summary>\n${filler(SUMMARY_TOKENS, `sum${nonce}`)}\n</summary>\n\nContinue the analysis.`
    ),
  ];
  for (let round = 1; round <= rounds; round += 1) {
    snapshots.push(afterCompaction.slice());

    const callId = `po_${nonce}_${round}`;
    afterCompaction.push(
      new AIMessage({
        content: `Post ${round}.`,
        tool_calls: [
          { id: callId, name: 'process_records', args: { batch: round } },
        ],
      }),
      new ToolMessage({
        tool_call_id: callId,
        content: `Post result ${round}. ${filler(TOOL_RESULT_TOKENS, `po${round}`)}`,
      })
    );
  }

  return snapshots;
}
|
|
263
|
+
|
|
264
|
+
/**
 * Benchmark scenarios in the order they are run and reported.
 * `name` is the label printed for the scenario; `build` produces the message
 * list for every model call from a per-run nonce and the round count.
 */
const SCENARIOS: Array<{
  name: string;
  build: (nonce: string, rounds: number) => BaseMessage[][];
}> = [
  {
    name: 'Agent tool loop (1 user turn, N tool rounds)',
    build: toolLoopScenario,
  },
  {
    name: 'Multi-turn chat (frequent user messages)',
    build: chatScenario,
  },
  {
    name: 'Realistic agent (user turns + tool rounds)',
    build: agentMixedScenario,
  },
  {
    name: 'Post-compaction (summary head + continued tool loop)',
    build: postCompactionScenario,
  },
];
|
|
282
|
+
|
|
283
|
+
// ---------------------------------------------------------------------------
|
|
284
|
+
// Provider plumbing.
|
|
285
|
+
// ---------------------------------------------------------------------------
|
|
286
|
+
|
|
287
|
+
/**
 * Function-tool definition for `process_records`, bound to both provider
 * clients. It declares the two numeric arguments the scenarios pass in their
 * tool calls: `batch` and `step`.
 */
const PROCESS_TOOL = {
  type: 'function' as const,
  function: {
    name: 'process_records',
    description: 'Process a batch of records.',
    parameters: {
      type: 'object',
      properties: {
        batch: { type: 'number' },
        step: { type: 'number' },
      },
    },
  },
};
|
|
298
|
+
|
|
299
|
+
/**
 * The two cache-marking strategies under comparison. Each takes the message
 * list for one model call and returns the list to send.
 */
interface StrategyPair {
  /** Legacy marking strategy. */
  legacy: (m: BaseMessage[]) => BaseMessage[];

  /** Tail marking strategy. */
  tail: (m: BaseMessage[]) => BaseMessage[];
}
|
|
303
|
+
|
|
304
|
+
/**
 * Builds the provider-specific pieces the benchmark needs: a tool-bound model
 * client wrapped as `invoke`, the legacy/tail cache-marking functions, and a
 * display label.
 *
 * Only `'bedrock'` selects the Bedrock client; every other provider value
 * takes the Anthropic path. Both clients are created with streaming, stream
 * usage reporting and prompt caching enabled, and `maxTokens` of 32.
 *
 * @param args - Parsed CLI options; `model` overrides the provider's default
 *   model id.
 * @returns `invoke` (resolves to the response's `usage_metadata`),
 *   `strategies`, and `label` in the form `<provider>:<model>`.
 */
function makeProvider(args: Args): {
  invoke: (messages: BaseMessage[]) => Promise<Usage | undefined>;
  strategies: StrategyPair;
  label: string;
} {
  if (args.provider !== 'bedrock') {
    const model = args.model ?? 'claude-sonnet-4-5';
    const llm = new CustomAnthropic({
      model,
      apiKey: process.env.ANTHROPIC_API_KEY,
      maxTokens: 32,
      promptCache: true,
      streaming: true,
      streamUsage: true,
    } as never).bindTools([PROCESS_TOOL]);

    return {
      label: `anthropic:${model}`,
      invoke: async (messages) => {
        const response = await llm.invoke(messages);
        return response.usage_metadata as Usage;
      },
      strategies: {
        legacy: (m) => addCacheControl<BaseMessage>(m),
        tail: (m) => addTailCacheControl<BaseMessage>(m),
      },
    };
  }

  const model = args.model ?? 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
  // Region precedence: the benchmark-specific variable, then the AWS default, then a fixed fallback.
  const region =
    process.env.BEDROCK_AWS_REGION ??
    process.env.AWS_DEFAULT_REGION ??
    'us-east-1';
  const llm = new CustomChatBedrockConverse({
    model,
    region,
    credentials: {
      accessKeyId: process.env.BEDROCK_AWS_ACCESS_KEY_ID!,
      secretAccessKey: process.env.BEDROCK_AWS_SECRET_ACCESS_KEY!,
    },
    streaming: true,
    streamUsage: true,
    maxTokens: 32,
    promptCache: true,
  }).bindTools([PROCESS_TOOL]);

  return {
    label: `bedrock:${model}`,
    invoke: async (messages) => {
      const response = await llm.invoke(messages);
      return response.usage_metadata as Usage;
    },
    strategies: {
      legacy: (m) => addBedrockCacheControl<BaseMessage>(m),
      tail: (m) => addBedrockTailCacheControl<BaseMessage>(m),
    },
  };
}
|
|
356
|
+
|
|
357
|
+
/**
 * The subset of a response's `usage_metadata` that the benchmark reads.
 * Every field is optional and is treated as 0 when absent.
 */
type Usage = {
  /** Input token count; not used for the fresh-token calculation. */
  input_tokens?: number;

  /** Tokens generated by the model. */
  output_tokens?: number;

  /** Total token count; minus `output_tokens` it gives the prompt size. */
  total_tokens?: number;

  /** Cache breakdown of the prompt: tokens written to and read from cache. */
  input_token_details?: { cache_creation?: number; cache_read?: number };
};
|
|
363
|
+
|
|
364
|
+
/** Running sums for one strategy across all calls in a scenario. */
interface Totals {
  /** Tokens served from cache. */
  read: number;

  /** Tokens written to cache. */
  write: number;

  /** Prompt tokens that were neither read from nor written to cache. */
  fresh: number;

  /** Weighted cost proxy: fresh x1, write x1.25, read x0.1. */
  effective: number;
}
|
|
370
|
+
|
|
371
|
+
/**
 * Creates a fresh accumulator with every counter at zero.
 *
 * @returns A new `Totals` object; each call returns a distinct instance.
 */
function emptyTotals(): Totals {
  const zeroed: Totals = { read: 0, write: 0, fresh: 0, effective: 0 };
  return zeroed;
}
|
|
374
|
+
|
|
375
|
+
/**
 * Folds one call's usage report into the running totals, mutating `t`.
 *
 * @param t - Accumulator to update in place.
 * @param u - Usage reported for the call; when undefined (or missing fields)
 *   the missing values count as 0, so the totals are left unchanged.
 */
function addUsage(t: Totals, u: Usage | undefined): void {
  const details = u?.input_token_details;
  const cachedRead = details?.cache_read ?? 0;
  const cachedWrite = details?.cache_creation ?? 0;

  // Fresh input is computed the same way for both providers: full prompt
  // size minus the cached buckets. `input_tokens` is avoided on purpose,
  // because Anthropic folds cache tokens INTO it while Bedrock reports it as
  // fresh-only. `total_tokens - output_tokens` is the full prompt size on
  // both, so removing read+write leaves the full-price input.
  const totalTokens = u?.total_tokens ?? 0;
  const outputTokens = u?.output_tokens ?? 0;
  const promptSize = totalTokens - outputTokens;
  const fresh = Math.max(0, promptSize - cachedRead - cachedWrite);

  t.read += cachedRead;
  t.write += cachedWrite;
  t.fresh += fresh;
  // Price multipliers (Anthropic/Bedrock): read 0.1x, write 1.25x, fresh 1x.
  t.effective += fresh + cachedWrite * 1.25 + cachedRead * 0.1;
}
|
|
392
|
+
|
|
393
|
+
/**
 * Runs one strategy over a scenario and sums the usage of every call.
 *
 * Calls are issued strictly one after another, in step order: each step is
 * passed through `apply`, sent with `invoke`, and its usage is accumulated
 * before the next step starts.
 *
 * @param steps - Message list for each model call, in call order.
 * @param apply - Cache-marking function applied to a step before it is sent.
 * @param invoke - Sends the messages and resolves to the reported usage.
 * @returns The totals accumulated over all steps.
 */
async function runStrategy(
  steps: BaseMessage[][],
  apply: (m: BaseMessage[]) => BaseMessage[],
  invoke: (m: BaseMessage[]) => Promise<Usage | undefined>
): Promise<Totals> {
  const accumulated = emptyTotals();
  for (let index = 0; index < steps.length; index += 1) {
    const marked = apply(steps[index]);
    const usage = await invoke(marked);
    addUsage(accumulated, usage);
  }
  return accumulated;
}
|
|
405
|
+
|
|
406
|
+
/**
 * Formats the relative change of `tail` versus `legacy` as a signed,
 * whole-number percentage.
 *
 * @param legacy - Baseline value.
 * @param tail - Value being compared against the baseline.
 * @returns `'0%'` when both values are 0 or the change rounds to zero,
 *   `'n/a'` when only the baseline is 0 (no relative change is defined),
 *   otherwise a string such as `'+50%'` or `'-25%'`.
 */
function pct(legacy: number, tail: number): string {
  if (legacy === 0) return tail === 0 ? '0%' : 'n/a';
  const delta = ((tail - legacy) / legacy) * 100;
  const rounded = delta.toFixed(0);
  // toFixed keeps the sign of a tiny negative delta ("-0"); report every
  // change that rounds to zero as a plain "0%", matching the 0/0 case.
  if (rounded === '0' || rounded === '-0') return '0%';
  return `${delta > 0 ? '+' : ''}${rounded}%`;
}
|
|
411
|
+
|
|
412
|
+
/**
 * Generates a per-run marker of the form `<tag>-<time>-<random>`, where the
 * time part is the current epoch milliseconds and the random part is an
 * integer below one million, both rendered in base 36.
 *
 * @param tag - Prefix identifying which strategy run the nonce belongs to.
 * @returns The assembled nonce string.
 */
function uniqueNonce(tag: string): string {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
  return `${tag}-${timePart}-${randomPart}`;
}
|
|
415
|
+
|
|
416
|
+
/**
 * Benchmark entry point.
 *
 * Parses the CLI options, checks that the selected provider's credentials
 * are present (exiting with status 1 otherwise), then runs the legacy and
 * tail strategies over every scenario, each under its own nonce. For each
 * scenario it prints both totals rows, the relative deltas and a verdict,
 * and at the end a summary count of scenarios where tail was better or equal.
 *
 * Verdict rule: results whose effective cost differs by less than 3% (of the
 * legacy cost) are a tie regardless of which side is marginally lower; a tie
 * counts as "better-or-equal" in the final summary.
 */
async function main(): Promise<void> {
  const args = parseArgs();
  if (args.provider === 'anthropic' && !process.env.ANTHROPIC_API_KEY) {
    console.error('Set ANTHROPIC_API_KEY (in .env or via BENCH_ENV_FILE).');
    process.exit(1);
  }
  // Both Bedrock credentials are handed to the client, so require both here.
  if (
    args.provider === 'bedrock' &&
    (!process.env.BEDROCK_AWS_ACCESS_KEY_ID ||
      !process.env.BEDROCK_AWS_SECRET_ACCESS_KEY)
  ) {
    console.error(
      'Set BEDROCK_AWS_ACCESS_KEY_ID / BEDROCK_AWS_SECRET_ACCESS_KEY.'
    );
    process.exit(1);
  }

  const { invoke, strategies, label } = makeProvider(args);
  console.log(`\nProvider: ${label} rounds=${args.rounds}`);
  console.log(
    'Metrics summed over all calls in a scenario. read↑ better; fresh↓ and effective↓ better.\n'
  );

  // Formats one strategy's totals as a fixed-width row.
  const row = (name: string, t: Totals): string =>
    ` ${name.padEnd(8)} read=${String(t.read).padStart(7)} write=${String(
      t.write
    ).padStart(7)} fresh=${String(t.fresh).padStart(7)} effective=${String(
      Math.round(t.effective)
    ).padStart(7)}`;

  let tailWins = 0;
  let scenarioCount = 0;

  for (const scenario of SCENARIOS) {
    // Distinct nonce per strategy run so legacy and tail never share a cache.
    const legacySteps = scenario.build(uniqueNonce('legacy'), args.rounds);
    const legacy = await runStrategy(legacySteps, strategies.legacy, invoke);
    const tailSteps = scenario.build(uniqueNonce('tail'), args.rounds);
    const tail = await runStrategy(tailSteps, strategies.tail, invoke);

    console.log(`SCENARIO: ${scenario.name} (${legacySteps.length} calls)`);
    console.log(row('legacy', legacy));
    console.log(row('tail', tail));
    console.log(
      ` Δ tail vs legacy: read ${pct(legacy.read, tail.read)} ` +
        `fresh ${pct(legacy.fresh, tail.fresh)} ` +
        `effective ${pct(legacy.effective, tail.effective)} (lower=cheaper)`
    );

    // The tolerance check comes first: a result inside the 3% band is a tie
    // even when tail is marginally more expensive than legacy.
    const tie =
      Math.abs(tail.effective - legacy.effective) / (legacy.effective || 1) <
      0.03;
    const tailCheaper = tail.effective < legacy.effective;
    const verdict = tie
      ? '≈ TIE'
      : tailCheaper
        ? '✅ TAIL WINS'
        : '❌ legacy better';
    console.log(` → ${verdict}\n`);

    scenarioCount++;
    if (tie || tailCheaper) tailWins++;
  }

  console.log(
    `RESULT: tail strategy is better-or-equal in ${tailWins}/${scenarioCount} scenarios.`
  );
}
|
|
475
|
+
|
|
476
|
+
// Script entry: report any failure from the run and exit with a non-zero status.
main().catch((failure) => {
  console.error('Benchmark failed:', failure);
  process.exit(1);
});
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import { CallbackHandler } from '@langfuse/langchain';
|
|
2
2
|
import {
|
|
3
|
-
createLangfuseHandler,
|
|
4
|
-
disposeLangfuseHandler,
|
|
5
3
|
hasLangfuseConfigCredentials,
|
|
6
4
|
shouldCreateLangfuseHandler,
|
|
5
|
+
isExplicitLangfuseConfig,
|
|
6
|
+
disposeLangfuseHandler,
|
|
7
|
+
createLangfuseHandler,
|
|
7
8
|
} from '@/langfuse';
|
|
8
9
|
|
|
9
10
|
const mockForceFlush = jest.fn();
|
|
@@ -68,6 +69,39 @@ describe('createLangfuseHandler', () => {
|
|
|
68
69
|
});
|
|
69
70
|
});
|
|
70
71
|
|
|
72
|
+
it('adds configured trace metadata and tags to the callback handler', () => {
|
|
73
|
+
process.env.LANGFUSE_PUBLIC_KEY = 'pk-env';
|
|
74
|
+
process.env.LANGFUSE_SECRET_KEY = 'sk-env';
|
|
75
|
+
|
|
76
|
+
const handler = createLangfuseHandler({
|
|
77
|
+
langfuse: {
|
|
78
|
+
metadata: {
|
|
79
|
+
tenantId: 'tenant-1',
|
|
80
|
+
empty: '',
|
|
81
|
+
skipped: null,
|
|
82
|
+
},
|
|
83
|
+
tags: ['tenant:tenant-1', 'agent'],
|
|
84
|
+
},
|
|
85
|
+
traceMetadata: {
|
|
86
|
+
messageId: 'message-1',
|
|
87
|
+
agentId: 'agent-1',
|
|
88
|
+
},
|
|
89
|
+
tags: ['librechat', 'agent'],
|
|
90
|
+
});
|
|
91
|
+
|
|
92
|
+
expect(handler).toBeDefined();
|
|
93
|
+
expect(MockedCallbackHandler).toHaveBeenCalledWith({
|
|
94
|
+
userId: undefined,
|
|
95
|
+
sessionId: undefined,
|
|
96
|
+
traceMetadata: {
|
|
97
|
+
tenantId: 'tenant-1',
|
|
98
|
+
messageId: 'message-1',
|
|
99
|
+
agentId: 'agent-1',
|
|
100
|
+
},
|
|
101
|
+
tags: ['librechat', 'agent', 'tenant:tenant-1'],
|
|
102
|
+
});
|
|
103
|
+
});
|
|
104
|
+
|
|
71
105
|
it('creates a handler for explicit credentials supplied in config', () => {
|
|
72
106
|
const handler = createLangfuseHandler({
|
|
73
107
|
langfuse: {
|
|
@@ -158,6 +192,39 @@ describe('createLangfuseHandler', () => {
|
|
|
158
192
|
).toBe(true);
|
|
159
193
|
});
|
|
160
194
|
|
|
195
|
+
it('does not treat sanitized-away trace attributes as explicit config', () => {
|
|
196
|
+
expect(
|
|
197
|
+
isExplicitLangfuseConfig({
|
|
198
|
+
metadata: {
|
|
199
|
+
empty: '',
|
|
200
|
+
whitespace: ' ',
|
|
201
|
+
missing: null,
|
|
202
|
+
tooLong: 'x'.repeat(201),
|
|
203
|
+
},
|
|
204
|
+
tags: ['', ' '],
|
|
205
|
+
})
|
|
206
|
+
).toBe(false);
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
it('treats valid trace metadata or tags as explicit config', () => {
|
|
210
|
+
expect(
|
|
211
|
+
isExplicitLangfuseConfig({
|
|
212
|
+
metadata: {
|
|
213
|
+
tenantId: 'tenant-1',
|
|
214
|
+
},
|
|
215
|
+
tags: ['', ' '],
|
|
216
|
+
})
|
|
217
|
+
).toBe(true);
|
|
218
|
+
expect(
|
|
219
|
+
isExplicitLangfuseConfig({
|
|
220
|
+
metadata: {
|
|
221
|
+
empty: '',
|
|
222
|
+
},
|
|
223
|
+
tags: ['tenant:tenant-1'],
|
|
224
|
+
})
|
|
225
|
+
).toBe(true);
|
|
226
|
+
});
|
|
227
|
+
|
|
161
228
|
it('does not flush the shared Langfuse provider during per-chat cleanup', async () => {
|
|
162
229
|
await expect(disposeLangfuseHandler({})).resolves.toBeUndefined();
|
|
163
230
|
expect(mockForceFlush).not.toHaveBeenCalled();
|
|
@@ -108,6 +108,50 @@ describe('Langfuse trace metadata includes agentName', () => {
|
|
|
108
108
|
});
|
|
109
109
|
});
|
|
110
110
|
|
|
111
|
+
it('propagates configured Langfuse metadata and tags around processStream observations', async () => {
|
|
112
|
+
const run = await createTestRun(
|
|
113
|
+
'DWAINE',
|
|
114
|
+
{},
|
|
115
|
+
{
|
|
116
|
+
langfuse: {
|
|
117
|
+
metadata: { tenantId: 'tenant-1' },
|
|
118
|
+
tags: ['tenant:tenant-1'],
|
|
119
|
+
},
|
|
120
|
+
}
|
|
121
|
+
);
|
|
122
|
+
await run.processStream(
|
|
123
|
+
{ messages: [] },
|
|
124
|
+
{
|
|
125
|
+
configurable: {
|
|
126
|
+
thread_id: 'thread-123',
|
|
127
|
+
user_id: 'user-456',
|
|
128
|
+
},
|
|
129
|
+
version: 'v2',
|
|
130
|
+
}
|
|
131
|
+
);
|
|
132
|
+
|
|
133
|
+
expect(MockedCallbackHandler).toHaveBeenCalledTimes(1);
|
|
134
|
+
const ctorArgs = MockedCallbackHandler.mock.calls[0][0];
|
|
135
|
+
expect(ctorArgs).toMatchObject({
|
|
136
|
+
traceMetadata: {
|
|
137
|
+
tenantId: 'tenant-1',
|
|
138
|
+
messageId: 'test-run-id',
|
|
139
|
+
agentId: 'agent_abc123',
|
|
140
|
+
agentName: 'DWAINE',
|
|
141
|
+
},
|
|
142
|
+
tags: ['librechat', 'agent', 'tenant:tenant-1'],
|
|
143
|
+
});
|
|
144
|
+
expect(MockedPropagateAttributes.mock.calls[0][0]).toMatchObject({
|
|
145
|
+
tags: ['librechat', 'agent', 'tenant:tenant-1'],
|
|
146
|
+
metadata: {
|
|
147
|
+
tenantId: 'tenant-1',
|
|
148
|
+
messageId: 'test-run-id',
|
|
149
|
+
agentId: 'agent_abc123',
|
|
150
|
+
agentName: 'DWAINE',
|
|
151
|
+
},
|
|
152
|
+
});
|
|
153
|
+
});
|
|
154
|
+
|
|
111
155
|
it('falls back to agentId when agent has no explicit name', async () => {
|
|
112
156
|
const run = await createTestRun();
|
|
113
157
|
await run.processStream(
|
|
@@ -586,6 +586,8 @@ describe('Langfuse tool output tracing redaction', () => {
|
|
|
586
586
|
publicKey: 'pk-run',
|
|
587
587
|
secretKey: 'sk-run',
|
|
588
588
|
baseUrl: 'https://langfuse.test',
|
|
589
|
+
metadata: { tenantId: 'tenant-run' },
|
|
590
|
+
tags: ['tenant:tenant-run', 'shared'],
|
|
589
591
|
toolNodeTracing: { enabled: true },
|
|
590
592
|
toolOutputTracing: {
|
|
591
593
|
enabled: true,
|
|
@@ -593,6 +595,8 @@ describe('Langfuse tool output tracing redaction', () => {
|
|
|
593
595
|
},
|
|
594
596
|
},
|
|
595
597
|
{
|
|
598
|
+
metadata: { agentId: 'agent-1' },
|
|
599
|
+
tags: ['shared', 'agent:agent-1'],
|
|
596
600
|
toolOutputTracing: {
|
|
597
601
|
enabled: false,
|
|
598
602
|
redactedToolNames: ['execute_sql'],
|
|
@@ -605,6 +609,8 @@ describe('Langfuse tool output tracing redaction', () => {
|
|
|
605
609
|
publicKey: 'pk-run',
|
|
606
610
|
secretKey: 'sk-run',
|
|
607
611
|
baseUrl: 'https://langfuse.test',
|
|
612
|
+
metadata: { tenantId: 'tenant-run', agentId: 'agent-1' },
|
|
613
|
+
tags: ['tenant:tenant-run', 'shared', 'agent:agent-1'],
|
|
608
614
|
toolNodeTracing: { enabled: true },
|
|
609
615
|
toolOutputTracing: {
|
|
610
616
|
enabled: false,
|
|
@@ -22,7 +22,7 @@ import { attemptInvoke, tryFallbackProviders } from '@/llm/invoke';
|
|
|
22
22
|
import { createRemoveAllMessage } from '@/messages/reducer';
|
|
23
23
|
import { splitAtRecencyBoundary } from '@/messages/recency';
|
|
24
24
|
import { getMaxOutputTokensKey } from '@/llm/request';
|
|
25
|
-
import {
|
|
25
|
+
import { addTailCacheControl } from '@/messages/cache';
|
|
26
26
|
import { initializeModel } from '@/llm/init';
|
|
27
27
|
import { getChunkContent } from '@/stream';
|
|
28
28
|
import { executeHooks } from '@/hooks';
|
|
@@ -1227,7 +1227,7 @@ async function summarizeWithCacheHit({
|
|
|
1227
1227
|
|
|
1228
1228
|
const fullMessages = [...messages, new HumanMessage(instruction)];
|
|
1229
1229
|
const invokeMessages =
|
|
1230
|
-
usePromptCache === true ?
|
|
1230
|
+
usePromptCache === true ? addTailCacheControl(fullMessages) : fullMessages;
|
|
1231
1231
|
|
|
1232
1232
|
const result = await attemptInvoke(
|
|
1233
1233
|
{
|