@librechat/agents 3.2.36 → 3.2.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/dist/cjs/agents/AgentContext.cjs +1 -1
  2. package/dist/cjs/agents/AgentContext.cjs.map +1 -1
  3. package/dist/cjs/graphs/Graph.cjs +7 -8
  4. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  5. package/dist/cjs/langfuse.cjs +16 -5
  6. package/dist/cjs/langfuse.cjs.map +1 -1
  7. package/dist/cjs/langfuseToolOutputTracing.cjs +7 -0
  8. package/dist/cjs/langfuseToolOutputTracing.cjs.map +1 -1
  9. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +92 -3
  10. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
  11. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +24 -4
  12. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
  13. package/dist/cjs/main.cjs +2 -0
  14. package/dist/cjs/messages/cache.cjs +183 -0
  15. package/dist/cjs/messages/cache.cjs.map +1 -1
  16. package/dist/cjs/summarization/node.cjs +1 -1
  17. package/dist/cjs/summarization/node.cjs.map +1 -1
  18. package/dist/cjs/tools/toolOutputReferences.cjs +28 -14
  19. package/dist/cjs/tools/toolOutputReferences.cjs.map +1 -1
  20. package/dist/esm/agents/AgentContext.mjs +2 -2
  21. package/dist/esm/agents/AgentContext.mjs.map +1 -1
  22. package/dist/esm/graphs/Graph.mjs +8 -9
  23. package/dist/esm/graphs/Graph.mjs.map +1 -1
  24. package/dist/esm/langfuse.mjs +16 -5
  25. package/dist/esm/langfuse.mjs.map +1 -1
  26. package/dist/esm/langfuseToolOutputTracing.mjs +7 -0
  27. package/dist/esm/langfuseToolOutputTracing.mjs.map +1 -1
  28. package/dist/esm/llm/anthropic/utils/message_inputs.mjs +92 -3
  29. package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
  30. package/dist/esm/llm/bedrock/utils/message_inputs.mjs +24 -4
  31. package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
  32. package/dist/esm/main.mjs +2 -2
  33. package/dist/esm/messages/cache.mjs +182 -1
  34. package/dist/esm/messages/cache.mjs.map +1 -1
  35. package/dist/esm/summarization/node.mjs +2 -2
  36. package/dist/esm/summarization/node.mjs.map +1 -1
  37. package/dist/esm/tools/toolOutputReferences.mjs +28 -14
  38. package/dist/esm/tools/toolOutputReferences.mjs.map +1 -1
  39. package/dist/types/messages/cache.d.ts +40 -0
  40. package/dist/types/types/graph.d.ts +2 -0
  41. package/package.json +2 -1
  42. package/src/agents/AgentContext.ts +2 -2
  43. package/src/agents/__tests__/AgentContext.test.ts +3 -9
  44. package/src/graphs/Graph.ts +65 -36
  45. package/src/langfuse.ts +38 -4
  46. package/src/langfuseToolOutputTracing.ts +18 -0
  47. package/src/llm/anthropic/utils/message_inputs.ts +131 -3
  48. package/src/llm/anthropic/utils/stripPrefillCache.test.ts +111 -0
  49. package/src/llm/bedrock/utils/message_inputs.test.ts +129 -0
  50. package/src/llm/bedrock/utils/message_inputs.ts +46 -4
  51. package/src/llm/bedrock/utils/toolResultCachePoint.test.ts +103 -0
  52. package/src/messages/cache.tail.test.ts +340 -0
  53. package/src/messages/cache.ts +266 -0
  54. package/src/messages/tailCacheConversion.test.ts +161 -0
  55. package/src/scripts/bench-prompt-cache.ts +479 -0
  56. package/src/specs/langfuse-config.test.ts +69 -2
  57. package/src/specs/langfuse-metadata.test.ts +44 -0
  58. package/src/specs/langfuse-tool-output-tracing.test.ts +6 -0
  59. package/src/summarization/node.ts +2 -2
  60. package/src/tools/__tests__/annotateMessagesForLLM.test.ts +50 -0
  61. package/src/tools/toolOutputReferences.ts +34 -20
  62. package/src/types/graph.ts +2 -0
@@ -0,0 +1,479 @@
1
+ /**
2
+ * Live, reproducible benchmark: single tail prompt-cache breakpoint (new
3
+ * default) vs. the legacy "last two user messages" strategy.
4
+ *
5
+ * It replays realistic harness conversations against a real provider and, for
6
+ * each model call, records the cache token breakdown the API reports. The two
7
+ * strategies are run over the SAME conversations (only the cache MARKING
8
+ * differs) under distinct cache namespaces, then compared.
9
+ *
10
+ * What it demonstrates
11
+ * --------------------
12
+ * - Agent tool loop (one user turn, many tool rounds): the legacy strategy
13
+ * pins its only message breakpoint on the lone user message, so every
14
+ * appended assistant/tool turn is re-sent UNCACHED on the next call — cache
15
+ * write/fresh ≫ read. The tail strategy rides the true tail, so the growing
16
+ * transcript is written once and read back. This is the dominant agent shape
17
+ * and where the legacy approach breaks down hardest.
18
+ * - Multi-turn chat (frequent user messages): legacy's two rolling markers do
19
+ * fine here; the tail strategy ties (never worse).
20
+ * - Realistic agent (user turns interleaved with tool rounds): tail wins.
21
+ *
22
+ * Metrics (per strategy, summed over all calls in a scenario)
23
+ * - cache_read : tokens served from cache (HIGHER is better).
24
+ * - cache_write : tokens written to cache (cache_creation).
25
+ * - fresh : uncached input processed at full price
26
+ * (= total_tokens - output_tokens - cache_read - cache_write); this is what
27
+ * balloons when caching fails to cover the transcript.
28
+ * - effective : a cost proxy in input-token-equivalents using Anthropic's
29
+ * published multipliers — read x0.1, write x1.25, fresh x1.0.
30
+ * LOWER is better.
31
+ *
32
+ * Usage
33
+ * # Anthropic (default). Needs ANTHROPIC_API_KEY in .env (or BENCH_ENV_FILE).
34
+ * npm run bench:cache
35
+ * # Bedrock. Needs BEDROCK_AWS_* creds.
36
+ * npm run bench:cache -- --provider bedrock
37
+ * # Options: --provider anthropic|bedrock --rounds <N> --model <id>
38
+ *
39
+ * Not a unit test (no `.test.` suffix) so CI never runs it; it makes real,
40
+ * paid API calls.
41
+ */
42
+ import { config } from 'dotenv';
43
+ config({ path: process.env.BENCH_ENV_FILE || '.env' });
44
+
45
+ import {
46
+ HumanMessage,
47
+ AIMessage,
48
+ ToolMessage,
49
+ type BaseMessage,
50
+ } from '@langchain/core/messages';
51
+ import { CustomAnthropic } from '@/llm/anthropic';
52
+ import { CustomChatBedrockConverse } from '@/llm/bedrock';
53
+ import {
54
+ addCacheControl,
55
+ addTailCacheControl,
56
+ addBedrockCacheControl,
57
+ addBedrockTailCacheControl,
58
+ } from '@/messages/cache';
59
+
60
+ type ProviderName = 'anthropic' | 'bedrock'; // providers selectable with the --provider flag
61
+
62
+ interface Args { // Command-line options accepted by the benchmark.
63
+ provider: ProviderName; // set by --provider; parseArgs defaults it to 'anthropic'
64
+ rounds: number; // set by --rounds; parseArgs defaults it to 6
65
+ model?: string; // set by --model; makeProvider falls back to a per-provider default when omitted
66
+ }
67
+
68
/**
 * Parse the benchmark's command-line flags from `process.argv`.
 *
 * Recognized flags are `--provider anthropic|bedrock`, `--rounds <N>` and
 * `--model <id>`; any other argument is ignored. Defaults are the `anthropic`
 * provider and 6 rounds.
 *
 * @returns The parsed options.
 * @throws Error when a flag has no value, when `--provider` is not a known
 *   provider, or when `--rounds` is not a positive integer. Failing fast keeps
 *   a typo from silently running (and paying for) the wrong benchmark, or from
 *   running zero calls and reporting a meaningless comparison.
 */
function parseArgs(): Args {
  const argv = process.argv.slice(2);
  const out: Args = { provider: 'anthropic', rounds: 6 };

  // Reads the value that follows a flag, rejecting a flag left dangling at
  // the end of the argument list.
  const valueAt = (flag: string, index: number): string => {
    const value = argv[index];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === '--provider') {
      const provider = valueAt(flag, ++i);
      if (provider !== 'anthropic' && provider !== 'bedrock') {
        throw new Error(
          `Invalid --provider "${provider}": expected "anthropic" or "bedrock"`
        );
      }
      out.provider = provider;
    } else if (flag === '--rounds') {
      const raw = valueAt(flag, ++i);
      const rounds = Number(raw);
      if (!Number.isInteger(rounds) || rounds < 1) {
        throw new Error(
          `Invalid --rounds "${raw}": expected a positive integer`
        );
      }
      out.rounds = rounds;
    } else if (flag === '--model') {
      out.model = valueAt(flag, ++i);
    }
  }
  return out;
}
79
+
80
/**
 * Produce a deterministic, space-separated filler string sized to roughly
 * `tokens` tokens, using the script's ~0.75 words-per-token estimate.
 *
 * Word `i` is `tag` followed by `i % 97`, so the same arguments always yield
 * the same text. At least one word is always emitted.
 */
function filler(tokens: number, tag: string): string {
  const wordCount = Math.max(1, Math.round(tokens * 0.75));
  return Array.from(
    { length: wordCount },
    (_unused, index) => `${tag}${index % 97}`
  ).join(' ');
}
89
+
90
+ // ---------------------------------------------------------------------------
91
+ // Scenarios. Each returns the message list for every model call (call `i`
92
+ // sends step `i`; the transcript grows append-only between calls), built under
93
+ // a per-run nonce so the two strategy runs never share a cache namespace.
94
+ // ---------------------------------------------------------------------------
95
+
96
+ const STABLE_TOKENS = 2000; // big stable context (instructions / first request): approx. filler tokens in each scenario's opening user message
97
+ const TOOL_RESULT_TOKENS = 600; // realistic agent tool output (file/search): approx. filler tokens appended to every simulated tool result
98
+
99
/**
 * Build the tool-call record for one `process_records` invocation.
 *
 * @param id - Tool-call id; the matching tool message echoes it back.
 * @param batch - Batch number passed as the tool's `batch` argument.
 */
function processToolCall(id: string, batch: number) {
  const args = { batch };
  return { id, name: 'process_records', args };
}
102
+
103
/**
 * Agent tool loop scenario: a single user turn followed by `rounds`
 * assistant→tool rounds.
 *
 * Returns one transcript snapshot per model call. Each snapshot is taken
 * before that round's assistant and tool messages are appended, so the first
 * call sees only the user message and every later call sees one more
 * completed round.
 */
function toolLoopScenario(nonce: string, rounds: number): BaseMessage[][] {
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. Reference data follows.\n${filler(STABLE_TOKENS, `ref${nonce}`)}\n\n` +
        'Process every batch using the process_records tool until done.'
    ),
  ];
  const snapshots: BaseMessage[][] = [];

  for (let batch = 1; batch <= rounds; batch += 1) {
    // Copy so later appends do not alter the snapshot already recorded.
    snapshots.push(transcript.slice());

    const callId = `tl_${nonce}_${batch}`;
    const assistantTurn = new AIMessage({
      content: `Processing batch ${batch}.`,
      tool_calls: [processToolCall(callId, batch)],
    });
    transcript.push(assistantTurn);

    const toolTurn = new ToolMessage({
      tool_call_id: callId,
      content: `Batch ${batch} of session ${nonce} complete. ${filler(TOOL_RESULT_TOKENS, `out${batch}`)}`,
    });
    transcript.push(toolTurn);
  }
  return snapshots;
}
129
+
130
/**
 * Multi-turn chat scenario: alternating assistant answers and new user
 * questions, with no tool calls (the case the legacy strategy handles well).
 *
 * Returns one transcript snapshot per model call, each taken before that
 * round's answer and follow-up question are appended.
 */
function chatScenario(nonce: string, rounds: number): BaseMessage[][] {
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}.\n${filler(STABLE_TOKENS, `doc${nonce}`)}\n\nQuestion 1: summarize.`
    ),
  ];
  const snapshots: BaseMessage[][] = [];

  for (let round = 1; round <= rounds; round += 1) {
    snapshots.push(transcript.slice());

    const nextQuestion = round + 1;
    const answer = new AIMessage(`Answer ${round}. ${filler(120, `ans${round}`)}`);
    transcript.push(answer);
    const question = new HumanMessage(
      `Question ${nextQuestion}: ${filler(60, `q${nextQuestion}`)}`
    );
    transcript.push(question);
  }
  return snapshots;
}
147
+
148
/**
 * Realistic agent scenario: every user turn triggers two tool rounds, then
 * the assistant summarizes and the user issues the next task.
 *
 * Returns one transcript snapshot per model call: two per turn for the tool
 * rounds plus one for the summary, i.e. three snapshots per round.
 */
function agentMixedScenario(nonce: string, rounds: number): BaseMessage[][] {
  const TOOL_ROUNDS_PER_TURN = 2;
  const transcript: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. Project context:\n${filler(STABLE_TOKENS, `ctx${nonce}`)}\n\nTask 1: investigate.`
    ),
  ];
  const snapshots: BaseMessage[][] = [];
  // Counts tool calls across all turns so ids and filler tags stay unique.
  let toolCallNo = 0;

  for (let turn = 1; turn <= rounds; turn += 1) {
    for (let stepNo = 1; stepNo <= TOOL_ROUNDS_PER_TURN; stepNo += 1) {
      snapshots.push(transcript.slice());
      toolCallNo += 1;
      const callId = `am_${nonce}_${toolCallNo}`;

      const assistantTurn = new AIMessage({
        content: `Turn ${turn} step ${stepNo}.`,
        tool_calls: [
          { id: callId, name: 'process_records', args: { step: toolCallNo } },
        ],
      });
      transcript.push(assistantTurn);

      const toolTurn = new ToolMessage({
        tool_call_id: callId,
        content: `Result ${toolCallNo} (${nonce}). ${filler(TOOL_RESULT_TOKENS, `r${toolCallNo}`)}`,
      });
      transcript.push(toolTurn);
    }

    // The summary call sees both tool rounds; afterwards the user moves on.
    snapshots.push(transcript.slice());
    const nextTask = turn + 1;
    transcript.push(
      new AIMessage(`Turn ${turn} summary. ${filler(80, `s${turn}`)}`)
    );
    transcript.push(
      new HumanMessage(`Task ${nextTask}: ${filler(60, `t${nextTask}`)}`)
    );
  }
  return snapshots;
}
185
+
186
+ const SUMMARY_TOKENS = 1500; // compacted-history summary injected post-compaction: approx. filler tokens inside the <summary> block of the resumed user message
187
+
188
/**
 * Post-compaction (summarization) scenario.
 *
 * Phase 1 runs two tool rounds on the original context. A compaction event
 * then replaces the whole head with a single summary user message, and phase
 * 2 continues with `rounds` further tool rounds on top of that summary.
 *
 * The first post-compaction call is an intended cache miss for both
 * strategies, because the prefix really did change. The comparison of
 * interest is the phase after it, where the summary is the new stable head
 * and the tool loop keeps appending to it. (Author's note: tool results are
 * modelled as already-truncated, persisted strings, so truncation does not
 * mutate the prefix across turns.)
 *
 * Returns one transcript snapshot per model call: two for phase 1, then
 * `rounds` for phase 2. Each is taken before its round is appended.
 */
function postCompactionScenario(
  nonce: string,
  rounds: number
): BaseMessage[][] {
  const PRE_COMPACTION_ROUNDS = 2;
  const snapshots: BaseMessage[][] = [];

  // Phase 1: growth on the original, uncompacted context.
  const beforeCompaction: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce}. ${filler(STABLE_TOKENS, `pre${nonce}`)}\n\nAnalyze the dataset.`
    ),
  ];
  for (let round = 1; round <= PRE_COMPACTION_ROUNDS; round += 1) {
    snapshots.push(beforeCompaction.slice());
    const callId = `pc_${nonce}_${round}`;
    const assistantTurn = new AIMessage({
      content: `Pre ${round}.`,
      tool_calls: [
        { id: callId, name: 'process_records', args: { batch: round } },
      ],
    });
    beforeCompaction.push(assistantTurn);
    const toolTurn = new ToolMessage({
      tool_call_id: callId,
      content: `Pre result ${round}. ${filler(TOOL_RESULT_TOKENS, `pr${round}`)}`,
    });
    beforeCompaction.push(toolTurn);
  }

  // Phase 2: the head is swapped for a summary and the loop continues on it.
  const afterCompaction: BaseMessage[] = [
    new HumanMessage(
      `Session ${nonce} (resumed after compaction).\n<summary>\n${filler(SUMMARY_TOKENS, `sum${nonce}`)}\n</summary>\n\nContinue the analysis.`
    ),
  ];
  for (let round = 1; round <= rounds; round += 1) {
    snapshots.push(afterCompaction.slice());
    const callId = `po_${nonce}_${round}`;
    const assistantTurn = new AIMessage({
      content: `Post ${round}.`,
      tool_calls: [
        { id: callId, name: 'process_records', args: { batch: round } },
      ],
    });
    afterCompaction.push(assistantTurn);
    const toolTurn = new ToolMessage({
      tool_call_id: callId,
      content: `Post result ${round}. ${filler(TOOL_RESULT_TOKENS, `po${round}`)}`,
    });
    afterCompaction.push(toolTurn);
  }
  return snapshots;
}
263
+
264
+ const SCENARIOS: Array<{ // Scenarios that main() runs in this order, once per caching strategy.
265
+ name: string; // label printed in the per-scenario report
266
+ build: (nonce: string, rounds: number) => BaseMessage[][]; // returns one message list per model call
267
+ }> = [
268
+ {
269
+ name: 'Agent tool loop (1 user turn, N tool rounds)',
270
+ build: toolLoopScenario,
271
+ },
272
+ { name: 'Multi-turn chat (frequent user messages)', build: chatScenario },
273
+ {
274
+ name: 'Realistic agent (user turns + tool rounds)',
275
+ build: agentMixedScenario,
276
+ },
277
+ {
278
+ name: 'Post-compaction (summary head + continued tool loop)',
279
+ build: postCompactionScenario,
280
+ },
281
+ ];
282
+
283
+ // ---------------------------------------------------------------------------
284
+ // Provider plumbing.
285
+ // ---------------------------------------------------------------------------
286
+
287
+ const PROCESS_TOOL = { // Tool definition that makeProvider binds to the model for either provider.
288
+ type: 'function' as const,
289
+ function: {
290
+ name: 'process_records', // same name the scenarios use in their tool_calls
291
+ description: 'Process a batch of records.',
292
+ parameters: {
293
+ type: 'object',
294
+ properties: { batch: { type: 'number' }, step: { type: 'number' } }, // the scenarios pass either `batch` or `step`
295
+ },
296
+ },
297
+ };
298
+
299
+ interface StrategyPair { // The two cache-marking functions compared for one provider.
300
+ legacy: (m: BaseMessage[]) => BaseMessage[]; // legacy marking (addCacheControl / addBedrockCacheControl)
301
+ tail: (m: BaseMessage[]) => BaseMessage[]; // tail marking (addTailCacheControl / addBedrockTailCacheControl)
302
+ }
303
+
304
/**
 * Build the model client and the pair of cache-marking strategies for the
 * provider selected on the command line.
 *
 * Both clients are created with prompt caching, streaming and stream usage
 * enabled, a 32-token output cap, and `PROCESS_TOOL` bound. Any provider
 * other than `bedrock` takes the Anthropic path.
 *
 * @param args - Parsed CLI options; `args.model` overrides the default model.
 * @returns `invoke`, which sends a message list and resolves to the
 *   response's `usage_metadata`; `strategies`, the legacy and tail marking
 *   functions for this provider; and `label`, `<provider>:<model>` for the
 *   report.
 */
function makeProvider(args: Args): {
  invoke: (messages: BaseMessage[]) => Promise<Usage | undefined>;
  strategies: StrategyPair;
  label: string;
} {
  if (args.provider !== 'bedrock') {
    const anthropicModel = args.model ?? 'claude-sonnet-4-5';
    const anthropicClient = new CustomAnthropic({
      model: anthropicModel,
      apiKey: process.env.ANTHROPIC_API_KEY,
      maxTokens: 32,
      promptCache: true,
      streaming: true,
      streamUsage: true,
    } as never).bindTools([PROCESS_TOOL]);

    return {
      label: `anthropic:${anthropicModel}`,
      strategies: {
        legacy: (m) => addCacheControl<BaseMessage>(m),
        tail: (m) => addTailCacheControl<BaseMessage>(m),
      },
      invoke: async (messages) => {
        const reply = await anthropicClient.invoke(messages);
        return reply.usage_metadata as Usage;
      },
    };
  }

  const bedrockModel =
    args.model ?? 'us.anthropic.claude-sonnet-4-5-20250929-v1:0';
  const region =
    process.env.BEDROCK_AWS_REGION ??
    process.env.AWS_DEFAULT_REGION ??
    'us-east-1';
  const bedrockClient = new CustomChatBedrockConverse({
    model: bedrockModel,
    region,
    credentials: {
      accessKeyId: process.env.BEDROCK_AWS_ACCESS_KEY_ID!,
      secretAccessKey: process.env.BEDROCK_AWS_SECRET_ACCESS_KEY!,
    },
    streaming: true,
    streamUsage: true,
    maxTokens: 32,
    promptCache: true,
  }).bindTools([PROCESS_TOOL]);

  return {
    label: `bedrock:${bedrockModel}`,
    strategies: {
      legacy: (m) => addBedrockCacheControl<BaseMessage>(m),
      tail: (m) => addBedrockTailCacheControl<BaseMessage>(m),
    },
    invoke: async (messages) => {
      const reply = await bedrockClient.invoke(messages);
      return reply.usage_metadata as Usage;
    },
  };
}
356
+
357
+ type Usage = { // Token usage fields the benchmark takes from a response's usage_metadata.
358
+ input_tokens?: number; // not read by addUsage, which derives prompt size from total_tokens - output_tokens
359
+ output_tokens?: number;
360
+ total_tokens?: number;
361
+ input_token_details?: { cache_creation?: number; cache_read?: number }; // cache write / cache read token counts
362
+ };
363
+
364
+ interface Totals { // Running sums over all calls of one strategy in one scenario.
365
+ read: number; // tokens served from cache
366
+ write: number; // tokens written to cache
367
+ fresh: number; // uncached prompt tokens
368
+ effective: number; // cost proxy: fresh + write * 1.25 + read * 0.1
369
+ }
370
+
371
/** Create a fresh accumulator with every counter at zero. */
function emptyTotals(): Totals {
  const zeroed: Totals = { read: 0, write: 0, fresh: 0, effective: 0 };
  return zeroed;
}
374
+
375
/**
 * Fold one call's usage into the running totals, mutating `t` in place.
 *
 * Missing usage, or any missing field, counts as zero.
 *
 * @param t - Accumulator to update.
 * @param u - Usage metadata of one model call, if the provider reported any.
 */
function addUsage(t: Totals, u: Usage | undefined): void {
  const details = u?.input_token_details;
  const cachedRead = details?.cache_read ?? 0;
  const cachedWrite = details?.cache_creation ?? 0;

  // `input_tokens` is ambiguous across providers: Anthropic folds cache
  // tokens INTO it, while Bedrock reports it as fresh-only with cache tokens
  // separate. `total_tokens - output_tokens` is the full prompt size on both,
  // so removing the cached buckets leaves the full-price input. Clamped at
  // zero so inconsistent counts never produce a negative figure.
  const totalTokens = u?.total_tokens ?? 0;
  const outputTokens = u?.output_tokens ?? 0;
  const uncached = Math.max(
    0,
    totalTokens - outputTokens - cachedRead - cachedWrite
  );

  t.read += cachedRead;
  t.write += cachedWrite;
  t.fresh += uncached;
  // Anthropic/Bedrock price multipliers: read 0.1x, write 1.25x, fresh 1x.
  t.effective += uncached + cachedWrite * 1.25 + cachedRead * 0.1;
}
392
+
393
/**
 * Replay a scenario under one cache-marking strategy.
 *
 * Calls run strictly one after another, in step order.
 *
 * @param steps - Message list for each model call, in call order.
 * @param apply - Cache-marking function applied to each list before sending.
 * @param invoke - Sends the marked messages and resolves to the call's usage.
 * @returns Usage totals summed over every call.
 */
async function runStrategy(
  steps: BaseMessage[][],
  apply: (m: BaseMessage[]) => BaseMessage[],
  invoke: (m: BaseMessage[]) => Promise<Usage | undefined>
): Promise<Totals> {
  const running = emptyTotals();
  for (let callIndex = 0; callIndex < steps.length; callIndex += 1) {
    const marked = apply(steps[callIndex]);
    const reported = await invoke(marked);
    addUsage(running, reported);
  }
  return running;
}
405
+
406
/**
 * Format the relative change from `legacy` to `tail` as a signed whole
 * percentage, e.g. `+50%` or `-12%`.
 *
 * @param legacy - Baseline value.
 * @param tail - Value being compared against the baseline.
 * @returns `'0%'` when both values are zero, `'n/a'` when only the baseline
 *   is zero (the ratio is undefined), otherwise the signed rounded percentage.
 */
function pct(legacy: number, tail: number): string {
  if (legacy === 0) return tail === 0 ? '0%' : 'n/a';
  const delta = ((tail - legacy) / legacy) * 100;
  const rounded = delta.toFixed(0);
  // toFixed keeps the sign of small negative values that round to zero and
  // returns "-0"; report those as "+0%", the same as an exact zero change.
  const roundsToNegativeZero = rounded === '-0';
  const digits = roundsToNegativeZero ? '0' : rounded;
  const sign = delta >= 0 || roundsToNegativeZero ? '+' : '';
  return `${sign}${digits}%`;
}
411
+
412
/**
 * Build a per-run nonce of the form `<tag>-<time>-<random>`, with the current
 * timestamp and a random integer below one million both written in base 36.
 * Scenario text embeds it so separate runs never share a cached prefix.
 */
function uniqueNonce(tag: string): string {
  const timePart = Date.now().toString(36);
  const randomPart = Math.floor(Math.random() * 1e6).toString(36);
  return [tag, timePart, randomPart].join('-');
}
415
+
416
/**
 * Run every scenario under both caching strategies and print the comparison.
 *
 * Exits with status 1 before any API call when the selected provider's
 * credentials are missing. For each scenario the legacy and tail runs are
 * built with distinct nonces, executed one after the other, and reported as
 * totals, relative deltas and a verdict.
 *
 * Verdict rule: effective costs within 3% of each other (relative to legacy)
 * are a tie in EITHER direction, since differences that small are treated as
 * noise. Outside that band the cheaper strategy wins. Ties and tail wins both
 * count toward the final "better-or-equal" tally.
 */
async function main(): Promise<void> {
  // Relative gap in effective cost below which the two strategies tie.
  const TIE_BAND = 0.03;

  const args = parseArgs();
  if (args.provider === 'anthropic' && !process.env.ANTHROPIC_API_KEY) {
    console.error('Set ANTHROPIC_API_KEY (in .env or via BENCH_ENV_FILE).');
    process.exit(1);
  }
  // makeProvider uses both Bedrock values unconditionally, so require both
  // up front rather than failing later inside the provider client.
  if (
    args.provider === 'bedrock' &&
    (!process.env.BEDROCK_AWS_ACCESS_KEY_ID ||
      !process.env.BEDROCK_AWS_SECRET_ACCESS_KEY)
  ) {
    console.error(
      'Set BEDROCK_AWS_ACCESS_KEY_ID / BEDROCK_AWS_SECRET_ACCESS_KEY.'
    );
    process.exit(1);
  }

  const { invoke, strategies, label } = makeProvider(args);
  console.log(`\nProvider: ${label} rounds=${args.rounds}`);
  console.log(
    'Metrics summed over all calls in a scenario. read↑ better; fresh↓ and effective↓ better.\n'
  );

  let tailWins = 0;
  let scenarioCount = 0;

  for (const scenario of SCENARIOS) {
    // Distinct nonce per strategy run so legacy and tail never share a cache.
    const legacySteps = scenario.build(uniqueNonce('legacy'), args.rounds);
    const legacy = await runStrategy(legacySteps, strategies.legacy, invoke);
    const tailSteps = scenario.build(uniqueNonce('tail'), args.rounds);
    const tail = await runStrategy(tailSteps, strategies.tail, invoke);

    console.log(`SCENARIO: ${scenario.name} (${legacySteps.length} calls)`);
    const row = (name: string, t: Totals): string =>
      ` ${name.padEnd(8)} read=${String(t.read).padStart(7)} write=${String(
        t.write
      ).padStart(7)} fresh=${String(t.fresh).padStart(7)} effective=${String(
        Math.round(t.effective)
      ).padStart(7)}`;
    console.log(row('legacy', legacy));
    console.log(row('tail', tail));
    console.log(
      ` Δ tail vs legacy: read ${pct(legacy.read, tail.read)} ` +
        `fresh ${pct(legacy.fresh, tail.fresh)} ` +
        `effective ${pct(legacy.effective, tail.effective)} (lower=cheaper)`
    );

    // Decide the tie first so the band applies symmetrically: a tail run that
    // is marginally more expensive is noise, not a legacy win.
    const relativeGap =
      Math.abs(tail.effective - legacy.effective) / (legacy.effective || 1);
    const tie = relativeGap < TIE_BAND;
    const tailCheaper = tail.effective < legacy.effective;
    const verdict = tie
      ? '≈ TIE'
      : tailCheaper
        ? '✅ TAIL WINS'
        : '❌ legacy better';
    console.log(` → ${verdict}\n`);

    scenarioCount++;
    if (tie || tailCheaper) tailWins++;
  }

  console.log(
    `RESULT: tail strategy is better-or-equal in ${tailWins}/${scenarioCount} scenarios.`
  );
}
475
+
476
+ main().catch((err) => { // Script entry point: a rejection from main() is logged and turned into exit status 1.
477
+ console.error('Benchmark failed:', err);
478
+ process.exit(1);
479
+ });
@@ -1,9 +1,10 @@
1
1
  import { CallbackHandler } from '@langfuse/langchain';
2
2
  import {
3
- createLangfuseHandler,
4
- disposeLangfuseHandler,
5
3
  hasLangfuseConfigCredentials,
6
4
  shouldCreateLangfuseHandler,
5
+ isExplicitLangfuseConfig,
6
+ disposeLangfuseHandler,
7
+ createLangfuseHandler,
7
8
  } from '@/langfuse';
8
9
 
9
10
  const mockForceFlush = jest.fn();
@@ -68,6 +69,39 @@ describe('createLangfuseHandler', () => {
68
69
  });
69
70
  });
70
71
 
72
+ it('adds configured trace metadata and tags to the callback handler', () => {
73
+ process.env.LANGFUSE_PUBLIC_KEY = 'pk-env';
74
+ process.env.LANGFUSE_SECRET_KEY = 'sk-env';
75
+
76
+ const handler = createLangfuseHandler({
77
+ langfuse: {
78
+ metadata: {
79
+ tenantId: 'tenant-1',
80
+ empty: '',
81
+ skipped: null,
82
+ },
83
+ tags: ['tenant:tenant-1', 'agent'],
84
+ },
85
+ traceMetadata: {
86
+ messageId: 'message-1',
87
+ agentId: 'agent-1',
88
+ },
89
+ tags: ['librechat', 'agent'],
90
+ });
91
+
92
+ expect(handler).toBeDefined();
93
+ expect(MockedCallbackHandler).toHaveBeenCalledWith({
94
+ userId: undefined,
95
+ sessionId: undefined,
96
+ traceMetadata: {
97
+ tenantId: 'tenant-1',
98
+ messageId: 'message-1',
99
+ agentId: 'agent-1',
100
+ },
101
+ tags: ['librechat', 'agent', 'tenant:tenant-1'],
102
+ });
103
+ });
104
+
71
105
  it('creates a handler for explicit credentials supplied in config', () => {
72
106
  const handler = createLangfuseHandler({
73
107
  langfuse: {
@@ -158,6 +192,39 @@ describe('createLangfuseHandler', () => {
158
192
  ).toBe(true);
159
193
  });
160
194
 
195
+ it('does not treat sanitized-away trace attributes as explicit config', () => {
196
+ expect(
197
+ isExplicitLangfuseConfig({
198
+ metadata: {
199
+ empty: '',
200
+ whitespace: ' ',
201
+ missing: null,
202
+ tooLong: 'x'.repeat(201),
203
+ },
204
+ tags: ['', ' '],
205
+ })
206
+ ).toBe(false);
207
+ });
208
+
209
+ it('treats valid trace metadata or tags as explicit config', () => {
210
+ expect(
211
+ isExplicitLangfuseConfig({
212
+ metadata: {
213
+ tenantId: 'tenant-1',
214
+ },
215
+ tags: ['', ' '],
216
+ })
217
+ ).toBe(true);
218
+ expect(
219
+ isExplicitLangfuseConfig({
220
+ metadata: {
221
+ empty: '',
222
+ },
223
+ tags: ['tenant:tenant-1'],
224
+ })
225
+ ).toBe(true);
226
+ });
227
+
161
228
  it('does not flush the shared Langfuse provider during per-chat cleanup', async () => {
162
229
  await expect(disposeLangfuseHandler({})).resolves.toBeUndefined();
163
230
  expect(mockForceFlush).not.toHaveBeenCalled();
@@ -108,6 +108,50 @@ describe('Langfuse trace metadata includes agentName', () => {
108
108
  });
109
109
  });
110
110
 
111
+ it('propagates configured Langfuse metadata and tags around processStream observations', async () => {
112
+ const run = await createTestRun(
113
+ 'DWAINE',
114
+ {},
115
+ {
116
+ langfuse: {
117
+ metadata: { tenantId: 'tenant-1' },
118
+ tags: ['tenant:tenant-1'],
119
+ },
120
+ }
121
+ );
122
+ await run.processStream(
123
+ { messages: [] },
124
+ {
125
+ configurable: {
126
+ thread_id: 'thread-123',
127
+ user_id: 'user-456',
128
+ },
129
+ version: 'v2',
130
+ }
131
+ );
132
+
133
+ expect(MockedCallbackHandler).toHaveBeenCalledTimes(1);
134
+ const ctorArgs = MockedCallbackHandler.mock.calls[0][0];
135
+ expect(ctorArgs).toMatchObject({
136
+ traceMetadata: {
137
+ tenantId: 'tenant-1',
138
+ messageId: 'test-run-id',
139
+ agentId: 'agent_abc123',
140
+ agentName: 'DWAINE',
141
+ },
142
+ tags: ['librechat', 'agent', 'tenant:tenant-1'],
143
+ });
144
+ expect(MockedPropagateAttributes.mock.calls[0][0]).toMatchObject({
145
+ tags: ['librechat', 'agent', 'tenant:tenant-1'],
146
+ metadata: {
147
+ tenantId: 'tenant-1',
148
+ messageId: 'test-run-id',
149
+ agentId: 'agent_abc123',
150
+ agentName: 'DWAINE',
151
+ },
152
+ });
153
+ });
154
+
111
155
  it('falls back to agentId when agent has no explicit name', async () => {
112
156
  const run = await createTestRun();
113
157
  await run.processStream(
@@ -586,6 +586,8 @@ describe('Langfuse tool output tracing redaction', () => {
586
586
  publicKey: 'pk-run',
587
587
  secretKey: 'sk-run',
588
588
  baseUrl: 'https://langfuse.test',
589
+ metadata: { tenantId: 'tenant-run' },
590
+ tags: ['tenant:tenant-run', 'shared'],
589
591
  toolNodeTracing: { enabled: true },
590
592
  toolOutputTracing: {
591
593
  enabled: true,
@@ -593,6 +595,8 @@ describe('Langfuse tool output tracing redaction', () => {
593
595
  },
594
596
  },
595
597
  {
598
+ metadata: { agentId: 'agent-1' },
599
+ tags: ['shared', 'agent:agent-1'],
596
600
  toolOutputTracing: {
597
601
  enabled: false,
598
602
  redactedToolNames: ['execute_sql'],
@@ -605,6 +609,8 @@ describe('Langfuse tool output tracing redaction', () => {
605
609
  publicKey: 'pk-run',
606
610
  secretKey: 'sk-run',
607
611
  baseUrl: 'https://langfuse.test',
612
+ metadata: { tenantId: 'tenant-run', agentId: 'agent-1' },
613
+ tags: ['tenant:tenant-run', 'shared', 'agent:agent-1'],
608
614
  toolNodeTracing: { enabled: true },
609
615
  toolOutputTracing: {
610
616
  enabled: false,
@@ -22,7 +22,7 @@ import { attemptInvoke, tryFallbackProviders } from '@/llm/invoke';
22
22
  import { createRemoveAllMessage } from '@/messages/reducer';
23
23
  import { splitAtRecencyBoundary } from '@/messages/recency';
24
24
  import { getMaxOutputTokensKey } from '@/llm/request';
25
- import { addCacheControl } from '@/messages/cache';
25
+ import { addTailCacheControl } from '@/messages/cache';
26
26
  import { initializeModel } from '@/llm/init';
27
27
  import { getChunkContent } from '@/stream';
28
28
  import { executeHooks } from '@/hooks';
@@ -1227,7 +1227,7 @@ async function summarizeWithCacheHit({
1227
1227
 
1228
1228
  const fullMessages = [...messages, new HumanMessage(instruction)];
1229
1229
  const invokeMessages =
1230
- usePromptCache === true ? addCacheControl(fullMessages) : fullMessages;
1230
+ usePromptCache === true ? addTailCacheControl(fullMessages) : fullMessages;
1231
1231
 
1232
1232
  const result = await attemptInvoke(
1233
1233
  {