@illuma-ai/agents 1.0.96 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/dist/cjs/agents/AgentContext.cjs +6 -2
  2. package/dist/cjs/agents/AgentContext.cjs.map +1 -1
  3. package/dist/cjs/common/constants.cjs +78 -0
  4. package/dist/cjs/common/constants.cjs.map +1 -1
  5. package/dist/cjs/graphs/Graph.cjs +191 -165
  6. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  7. package/dist/cjs/main.cjs +22 -0
  8. package/dist/cjs/main.cjs.map +1 -1
  9. package/dist/cjs/messages/dedup.cjs +95 -0
  10. package/dist/cjs/messages/dedup.cjs.map +1 -0
  11. package/dist/cjs/tools/CodeExecutor.cjs +22 -3
  12. package/dist/cjs/tools/CodeExecutor.cjs.map +1 -1
  13. package/dist/cjs/types/graph.cjs.map +1 -1
  14. package/dist/cjs/utils/contextPressure.cjs +154 -0
  15. package/dist/cjs/utils/contextPressure.cjs.map +1 -0
  16. package/dist/cjs/utils/pruneCalibration.cjs +78 -0
  17. package/dist/cjs/utils/pruneCalibration.cjs.map +1 -0
  18. package/dist/cjs/utils/run.cjs.map +1 -1
  19. package/dist/cjs/utils/tokens.cjs.map +1 -1
  20. package/dist/cjs/utils/toolDiscoveryCache.cjs +127 -0
  21. package/dist/cjs/utils/toolDiscoveryCache.cjs.map +1 -0
  22. package/dist/esm/agents/AgentContext.mjs +6 -2
  23. package/dist/esm/agents/AgentContext.mjs.map +1 -1
  24. package/dist/esm/common/constants.mjs +71 -1
  25. package/dist/esm/common/constants.mjs.map +1 -1
  26. package/dist/esm/graphs/Graph.mjs +192 -166
  27. package/dist/esm/graphs/Graph.mjs.map +1 -1
  28. package/dist/esm/main.mjs +5 -1
  29. package/dist/esm/main.mjs.map +1 -1
  30. package/dist/esm/messages/dedup.mjs +93 -0
  31. package/dist/esm/messages/dedup.mjs.map +1 -0
  32. package/dist/esm/tools/CodeExecutor.mjs +22 -3
  33. package/dist/esm/tools/CodeExecutor.mjs.map +1 -1
  34. package/dist/esm/types/graph.mjs.map +1 -1
  35. package/dist/esm/utils/contextPressure.mjs +148 -0
  36. package/dist/esm/utils/contextPressure.mjs.map +1 -0
  37. package/dist/esm/utils/pruneCalibration.mjs +74 -0
  38. package/dist/esm/utils/pruneCalibration.mjs.map +1 -0
  39. package/dist/esm/utils/run.mjs.map +1 -1
  40. package/dist/esm/utils/tokens.mjs.map +1 -1
  41. package/dist/esm/utils/toolDiscoveryCache.mjs +125 -0
  42. package/dist/esm/utils/toolDiscoveryCache.mjs.map +1 -0
  43. package/dist/types/agents/AgentContext.d.ts +4 -1
  44. package/dist/types/common/constants.d.ts +49 -0
  45. package/dist/types/graphs/Graph.d.ts +25 -0
  46. package/dist/types/messages/dedup.d.ts +25 -0
  47. package/dist/types/messages/index.d.ts +1 -0
  48. package/dist/types/types/graph.d.ts +63 -0
  49. package/dist/types/utils/contextPressure.d.ts +72 -0
  50. package/dist/types/utils/index.d.ts +3 -0
  51. package/dist/types/utils/pruneCalibration.d.ts +43 -0
  52. package/dist/types/utils/toolDiscoveryCache.d.ts +77 -0
  53. package/package.json +1 -1
  54. package/src/agents/AgentContext.ts +7 -0
  55. package/src/common/constants.ts +82 -0
  56. package/src/graphs/Graph.ts +254 -208
  57. package/src/graphs/contextManagement.e2e.test.ts +28 -20
  58. package/src/graphs/gapFeatures.test.ts +520 -0
  59. package/src/graphs/nonBlockingSummarization.test.ts +307 -0
  60. package/src/messages/__tests__/dedup.test.ts +166 -0
  61. package/src/messages/dedup.ts +104 -0
  62. package/src/messages/index.ts +1 -0
  63. package/src/specs/agent-handoffs-bedrock.integration.test.ts +7 -7
  64. package/src/specs/agent-handoffs.test.ts +36 -36
  65. package/src/specs/thinking-handoff.test.ts +10 -10
  66. package/src/tools/CodeExecutor.ts +22 -3
  67. package/src/types/graph.ts +73 -0
  68. package/src/utils/__tests__/pruneCalibration.test.ts +148 -0
  69. package/src/utils/__tests__/toolDiscoveryCache.test.ts +214 -0
  70. package/src/utils/contextPressure.test.ts +262 -0
  71. package/src/utils/contextPressure.ts +188 -0
  72. package/src/utils/index.ts +3 -0
  73. package/src/utils/pruneCalibration.ts +92 -0
  74. package/src/utils/run.ts +108 -108
  75. package/src/utils/tokens.ts +118 -118
  76. package/src/utils/toolDiscoveryCache.ts +150 -0
@@ -34,9 +34,9 @@ import type * as t from '@/types';
34
34
  import {
35
35
  formatAnthropicArtifactContent,
36
36
  ensureThinkingBlockInMessages,
37
+ deduplicateSystemMessages,
37
38
  convertMessagesToContent,
38
39
  addBedrockCacheControl,
39
- extractToolDiscoveries,
40
40
  modifyDeltaProperties,
41
41
  formatArtifactPayload,
42
42
  formatContentStrings,
@@ -53,14 +53,20 @@ import {
53
53
  MessageTypes,
54
54
  Constants,
55
55
  TOOL_TURN_THINKING_BUDGET,
56
+ SUMMARIZATION_CONTEXT_THRESHOLD,
56
57
  } from '@/common';
57
58
  import {
59
+ ToolDiscoveryCache,
58
60
  resetIfNotEmpty,
59
61
  isOpenAILike,
60
62
  isGoogleLike,
61
63
  joinKeys,
62
64
  sleep,
65
+ createPruneCalibration,
66
+ updatePruneCalibration,
67
+ applyCalibration,
63
68
  } from '@/utils';
69
+ import type { PruneCalibrationState } from '@/types/graph';
64
70
  import {
65
71
  buildContextAnalytics,
66
72
  type ContextAnalytics,
@@ -69,6 +75,13 @@ import { getChatModelClass, manualToolStreamProviders } from '@/llm/providers';
69
75
  import { ToolNode as CustomToolNode, toolsCondition } from '@/tools/ToolNode';
70
76
  import { ChatOpenAI, AzureChatOpenAI } from '@/llm/openai';
71
77
  import { safeDispatchCustomEvent } from '@/utils/events';
78
+ import {
79
+ detectDocuments,
80
+ shouldInjectMultiDocHint,
81
+ buildMultiDocHintContent,
82
+ buildPostPruneNote,
83
+ hasTaskTool,
84
+ } from '@/utils/contextPressure';
72
85
  import { createSchemaOnlyTools } from '@/tools/schema';
73
86
  import { prepareSchemaForProvider } from '@/schemas/validate';
74
87
  import { AgentContext } from '@/agents/AgentContext';
@@ -198,6 +211,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
198
211
  runId: string | undefined;
199
212
  startIndex: number = 0;
200
213
  signal?: AbortSignal;
214
+ /** Cached summary from the first prune in this run.
215
+ * Reused for subsequent prunes to avoid blocking LLM calls on every tool iteration. */
216
+ private _cachedRunSummary: string | undefined;
217
+ /** EMA-based pruning calibration state — smooths token budget adjustments across iterations */
218
+ private _pruneCalibration: PruneCalibrationState;
219
+ /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
220
+ private _toolDiscoveryCache: ToolDiscoveryCache;
201
221
  /** Map of agent contexts by agent ID */
202
222
  agentContexts: Map<string, AgentContext> = new Map();
203
223
  /** Default agent ID to use */
@@ -232,6 +252,22 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
232
252
  }
233
253
 
234
254
  this.defaultAgentId = agents[0].agentId;
255
+
256
+ // Seed cached summary from persisted storage so the first prune in a
257
+ // resumed conversation can also skip the synchronous LLM summarization call
258
+ const primaryContext = this.agentContexts.get(this.defaultAgentId);
259
+ if (primaryContext?.persistedSummary) {
260
+ this._cachedRunSummary = primaryContext.persistedSummary;
261
+ }
262
+
263
+ // Initialize EMA pruning calibration
264
+ this._pruneCalibration = createPruneCalibration();
265
+
266
+ // Initialize tool discovery cache, seeded with any pre-existing discoveries
267
+ this._toolDiscoveryCache = new ToolDiscoveryCache();
268
+ if (primaryContext?.discoveredToolNames.size) {
269
+ this._toolDiscoveryCache.seed([...primaryContext.discoveredToolNames]);
270
+ }
235
271
  }
236
272
 
237
273
  /* Init */
@@ -265,6 +301,9 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
265
301
  new Map()
266
302
  );
267
303
  this.invokedToolIds = resetIfNotEmpty(this.invokedToolIds, undefined);
304
+ // Reset EMA calibration and tool discovery cache for fresh run
305
+ this._pruneCalibration = createPruneCalibration();
306
+ this._toolDiscoveryCache.reset();
268
307
  for (const context of this.agentContexts.values()) {
269
308
  context.reset();
270
309
  }
@@ -371,6 +410,70 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
371
410
  return clientOptions;
372
411
  }
373
412
 
413
+ /**
414
+ * Determines whether summarization should trigger based on SummarizationConfig.
415
+ *
416
+ * Supports three trigger strategies:
417
+ * - contextPercentage (default): Trigger when context utilization >= threshold%
418
+ * - messageCount: Trigger when pruned message count >= threshold
419
+ * - tokenThreshold: Trigger when total estimated tokens >= threshold
420
+ *
421
+ * When no config is provided, always triggers (preserves backward compatibility).
422
+ *
423
+ * @param prunedMessageCount - Number of messages that were pruned
424
+ * @param maxContextTokens - Maximum context token budget
425
+ * @param indexTokenCountMap - Token count map by message index
426
+ * @param instructionTokens - Token count for instructions/system message
427
+ * @param config - Optional SummarizationConfig
428
+ * @returns Whether summarization should be triggered
429
+ */
430
+ private shouldTriggerSummarization(
431
+ prunedMessageCount: number,
432
+ maxContextTokens: number,
433
+ indexTokenCountMap: Record<string, number | undefined>,
434
+ instructionTokens: number,
435
+ config?: t.SummarizationConfig
436
+ ): boolean {
437
+ // No pruned messages means nothing to summarize
438
+ if (prunedMessageCount === 0) {
439
+ return false;
440
+ }
441
+
442
+ // No config = backward compatible (always summarize when messages are pruned)
443
+ if (!config || !config.triggerType) {
444
+ return true;
445
+ }
446
+
447
+ const threshold = config.triggerThreshold;
448
+
449
+ switch (config.triggerType) {
450
+ case 'contextPercentage': {
451
+ if (maxContextTokens <= 0) return true;
452
+ const effectiveThreshold = threshold ?? SUMMARIZATION_CONTEXT_THRESHOLD;
453
+ let totalTokens = instructionTokens;
454
+ for (const key in indexTokenCountMap) {
455
+ totalTokens += indexTokenCountMap[key] ?? 0;
456
+ }
457
+ const utilization = (totalTokens / maxContextTokens) * 100;
458
+ return utilization >= effectiveThreshold;
459
+ }
460
+ case 'messageCount': {
461
+ const effectiveThreshold = threshold ?? 5;
462
+ return prunedMessageCount >= effectiveThreshold;
463
+ }
464
+ case 'tokenThreshold': {
465
+ if (threshold == null) return true;
466
+ let totalTokens = instructionTokens;
467
+ for (const key in indexTokenCountMap) {
468
+ totalTokens += indexTokenCountMap[key] ?? 0;
469
+ }
470
+ return totalTokens >= threshold;
471
+ }
472
+ default:
473
+ return true;
474
+ }
475
+ }
476
+
374
477
  /**
375
478
  * Returns the normalized finish/stop reason from the last LLM invocation.
376
479
  * Used by callers to detect when the response was truncated due to max_tokens.
@@ -528,9 +631,6 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
528
631
 
529
632
  getRunMessages(): BaseMessage[] | undefined {
530
633
  const result = this.messages.slice(this.startIndex);
531
- console.debug(
532
- `[Graph] getRunMessages() | totalMessages=${this.messages.length} | startIndex=${this.startIndex} | runMessages=${result.length}`
533
- );
534
634
  return result;
535
635
  }
536
636
 
@@ -1320,10 +1420,15 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1320
1420
  messages = [dynamicContextMessage, ackMessage, ...messages];
1321
1421
  }
1322
1422
 
1323
- // Extract tool discoveries from current turn only (similar to formatArtifactPayload pattern)
1324
- const discoveredNames = extractToolDiscoveries(messages);
1325
- if (discoveredNames.length > 0) {
1326
- agentContext.markToolsAsDiscovered(discoveredNames);
1423
+ // Tool discovery caching: only scan new messages since last iteration
1424
+ // instead of re-parsing the full history via extractToolDiscoveries()
1425
+ const cachedDiscoveries =
1426
+ this._toolDiscoveryCache.getNewDiscoveries(messages);
1427
+ if (cachedDiscoveries.length > 0) {
1428
+ agentContext.markToolsAsDiscovered(cachedDiscoveries);
1429
+ console.debug(
1430
+ `[Graph:ToolDiscovery] Cached ${cachedDiscoveries.length} new tools (total: ${this._toolDiscoveryCache.size})`
1431
+ );
1327
1432
  }
1328
1433
 
1329
1434
  const toolsForBinding = agentContext.getToolsForBinding();
@@ -1367,45 +1472,12 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1367
1472
 
1368
1473
  // ====================================================================
1369
1474
  // PRE-PRUNING DELEGATION CHECK
1370
- // Before pruning strips messages (losing context), check if we should
1371
- // delegate instead. If context would be pruned AND the agent has the
1372
- // task tool, inject a delegation hint and SKIP pruning — preserving
1373
- // the content for the LLM to understand what to delegate.
1374
1475
  // ====================================================================
1375
- let delegationInjectedPrePrune = false;
1376
- const hasTaskToolPrePrune = agentContext.tools?.some((tool) => {
1377
- const toolName =
1378
- typeof tool === 'object' && 'name' in tool
1379
- ? (tool as { name: string }).name
1380
- : '';
1381
- return toolName === 'task';
1382
- });
1383
-
1384
- if (
1385
- hasTaskToolPrePrune === true &&
1386
- agentContext.tokenCounter &&
1387
- agentContext.maxContextTokens != null
1388
- ) {
1389
- // Estimate total tokens in messages BEFORE pruning
1390
- let prePruneTokens = 0;
1391
- for (const msg of messages) {
1392
- prePruneTokens += agentContext.tokenCounter(msg);
1393
- }
1394
- // Add instruction tokens (system prompt)
1395
- prePruneTokens += agentContext.instructionTokens;
1396
-
1397
- const prePruneUtilization =
1398
- (prePruneTokens / agentContext.maxContextTokens) * 100;
1399
-
1400
- if (prePruneUtilization > 70) {
1401
- console.warn(
1402
- `[Graph] PRE-PRUNE delegation check: ${prePruneUtilization.toFixed(1)}% utilization ` +
1403
- `(${prePruneTokens}/${agentContext.maxContextTokens} tokens). ` +
1404
- 'Injecting delegation hint INSTEAD of pruning.'
1405
- );
1406
- delegationInjectedPrePrune = true;
1407
- }
1408
- }
1476
+ // Context management is now fully mechanical:
1477
+ // - Pruning always runs when needed (no delegation-based skip)
1478
+ // - Auto-continuation in client.js handles max_tokens finish reason
1479
+ // - LLM never sees raw token numbers (prevents voluntary bail-out)
1480
+ // ====================================================================
1409
1481
 
1410
1482
  if (
1411
1483
  !agentContext.pruneMessages &&
@@ -1426,48 +1498,126 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1426
1498
  ?.thinking as t.AnthropicClientOptions['thinking']
1427
1499
  )?.type === 'enabled');
1428
1500
 
1501
+ // Apply EMA calibration to max token budget — smooths pruning across iterations
1502
+ const calibratedMaxTokens = applyCalibration(
1503
+ agentContext.maxContextTokens,
1504
+ this._pruneCalibration
1505
+ );
1506
+
1429
1507
  agentContext.pruneMessages = createPruneMessages({
1430
1508
  startIndex: this.startIndex,
1431
1509
  provider: agentContext.provider,
1432
1510
  tokenCounter: agentContext.tokenCounter,
1433
- maxTokens: agentContext.maxContextTokens,
1511
+ maxTokens: calibratedMaxTokens,
1434
1512
  thinkingEnabled: isAnthropicWithThinking,
1435
1513
  indexTokenCountMap: agentContext.indexTokenCountMap,
1436
1514
  });
1437
1515
  }
1438
1516
 
1439
- if (agentContext.pruneMessages && !delegationInjectedPrePrune) {
1440
- console.debug(
1441
- `[Graph:ContextMgmt] Pruning messages | inputCount=${messages.length} | maxTokens=${agentContext.maxContextTokens}`
1442
- );
1517
+ // Update EMA calibration with actual token usage from API response
1518
+ if (
1519
+ agentContext.currentUsage?.input_tokens &&
1520
+ agentContext.maxContextTokens
1521
+ ) {
1522
+ const estimatedTokens = Object.values(
1523
+ agentContext.indexTokenCountMap
1524
+ ).reduce((sum, v) => (sum ?? 0) + (v ?? 0), 0) as number;
1525
+ if (estimatedTokens > 0) {
1526
+ this._pruneCalibration = updatePruneCalibration(
1527
+ this._pruneCalibration,
1528
+ agentContext.currentUsage.input_tokens,
1529
+ estimatedTokens
1530
+ );
1531
+ }
1532
+ }
1533
+
1534
+ if (agentContext.pruneMessages) {
1443
1535
  const { context, indexTokenCountMap, messagesToRefine } =
1444
1536
  agentContext.pruneMessages({
1445
1537
  messages,
1446
1538
  usageMetadata: agentContext.currentUsage,
1447
- // startOnMessageType: 'human',
1448
1539
  });
1449
1540
  agentContext.indexTokenCountMap = indexTokenCountMap;
1450
1541
  messagesToUse = context;
1451
- console.debug(
1452
- `[Graph:ContextMgmt] Pruned | kept=${context.length} | discarded=${messagesToRefine.length} | originalCount=${messages.length}`
1542
+
1543
+ // ── Non-blocking summarization ──────────────────────────────────
1544
+ // NEVER block the LLM call waiting for summarization. Instead:
1545
+ // 1. If _cachedRunSummary exists → use it, fire async update
1546
+ // 2. If persistedSummary exists → use it as fallback, fire async update
1547
+ // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
1548
+ // The summary catches up asynchronously and is available for subsequent
1549
+ // iterations (tool calls) and the next conversation turn.
1550
+ //
1551
+ // SummarizationConfig integration:
1552
+ // - triggerType/triggerThreshold control WHEN summarization fires
1553
+ // - reserveRatio is enforced via calibrated maxTokens (above)
1554
+ // - initialSummary provides cross-run seeding as fallback before persistedSummary
1555
+ let hasSummary = false;
1556
+ const sumConfig = agentContext.summarizationConfig;
1557
+ const shouldSummarize = this.shouldTriggerSummarization(
1558
+ messagesToRefine.length,
1559
+ agentContext.maxContextTokens ?? 0,
1560
+ agentContext.indexTokenCountMap,
1561
+ agentContext.instructionTokens,
1562
+ sumConfig
1453
1563
  );
1454
1564
 
1455
- // Summarize discarded messages if callback provided
1456
- if (messagesToRefine.length > 0 && agentContext.summarizeCallback) {
1457
- console.debug(
1458
- `[Graph:ContextMgmt] Summarizing ${messagesToRefine.length} discarded messages`
1459
- );
1565
+ if (
1566
+ messagesToRefine.length > 0 &&
1567
+ agentContext.summarizeCallback &&
1568
+ shouldSummarize
1569
+ ) {
1460
1570
  try {
1461
- const summary =
1462
- await agentContext.summarizeCallback(messagesToRefine);
1571
+ let summary: string | undefined;
1572
+ let summarySource: string;
1573
+
1574
+ if (this._cachedRunSummary != null) {
1575
+ summary = this._cachedRunSummary;
1576
+ summarySource = 'cached';
1577
+ } else if (
1578
+ agentContext.persistedSummary != null &&
1579
+ agentContext.persistedSummary !== ''
1580
+ ) {
1581
+ summary = agentContext.persistedSummary;
1582
+ this._cachedRunSummary = summary;
1583
+ summarySource = 'persisted';
1584
+ } else if (
1585
+ sumConfig?.initialSummary != null &&
1586
+ sumConfig.initialSummary !== ''
1587
+ ) {
1588
+ // Cross-run seed: use initialSummary when no persisted summary exists
1589
+ summary = sumConfig.initialSummary;
1590
+ this._cachedRunSummary = summary;
1591
+ summarySource = 'initial-seed';
1592
+ } else {
1593
+ summarySource = 'none';
1594
+ }
1595
+
1596
+ // Single consolidated log for the entire prune+summarize decision
1463
1597
  console.debug(
1464
- `[Graph:ContextMgmt] Summary received | len=${summary?.length ?? 0} | hasContent=${summary != null && summary !== ''}`
1598
+ `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
1465
1599
  );
1600
+
1601
+ // Fire background summarization — updates cache for next iteration/turn
1602
+ agentContext
1603
+ .summarizeCallback(messagesToRefine)
1604
+ .then((updated) => {
1605
+ if (updated != null && updated !== '') {
1606
+ this._cachedRunSummary = updated;
1607
+ }
1608
+ })
1609
+ .catch((err) => {
1610
+ console.error(
1611
+ '[Graph] Background summary failed (non-fatal):',
1612
+ err
1613
+ );
1614
+ });
1615
+
1466
1616
  if (summary != null && summary !== '') {
1617
+ hasSummary = true;
1467
1618
  const summaryMsg = new SystemMessage(
1468
1619
  `[Conversation Summary]\n${summary}`
1469
1620
  );
1470
- // Insert after system message (if present), before conversation messages
1471
1621
  const systemIdx =
1472
1622
  messagesToUse[0]?.getType() === 'system' ? 1 : 0;
1473
1623
  messagesToUse = [
@@ -1475,18 +1625,40 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1475
1625
  summaryMsg,
1476
1626
  ...messagesToUse.slice(systemIdx),
1477
1627
  ];
1478
- console.debug(
1479
- `[Graph:ContextMgmt] Summary injected at index ${systemIdx} | finalMsgCount=${messagesToUse.length}`
1480
- );
1481
1628
  }
1482
1629
  } catch (err) {
1483
- console.error('[Graph] Summarization callback failed:', err);
1630
+ console.error('[Graph] Summarization failed:', err);
1631
+ }
1632
+ } else if (messagesToRefine.length > 0) {
1633
+ // Log pruning even when no summarize callback (discard mode)
1634
+ console.debug(
1635
+ `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`
1636
+ );
1637
+ }
1638
+
1639
+ // Deduplicate system messages that accumulate from repeated tool iterations
1640
+ const { messages: dedupedMessages, removedCount } =
1641
+ deduplicateSystemMessages(messagesToUse);
1642
+ if (removedCount > 0) {
1643
+ messagesToUse = dedupedMessages;
1644
+ console.debug(
1645
+ `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
1646
+ );
1647
+ }
1648
+
1649
+ // Post-prune context note for task-tool-enabled agents
1650
+ if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
1651
+ const postPruneNote = buildPostPruneNote(
1652
+ messagesToRefine.length,
1653
+ hasSummary
1654
+ );
1655
+ if (postPruneNote) {
1656
+ messagesToUse = [
1657
+ ...messagesToUse,
1658
+ new SystemMessage(postPruneNote),
1659
+ ];
1484
1660
  }
1485
1661
  }
1486
- } else if (delegationInjectedPrePrune) {
1487
- console.info(
1488
- '[Graph] Skipping pruning — delegation will handle context pressure'
1489
- );
1490
1662
  }
1491
1663
 
1492
1664
  let finalMessages = messagesToUse;
@@ -1645,106 +1817,24 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1645
1817
  );
1646
1818
 
1647
1819
  // ====================================================================
1648
- // CONTEXT PRESSURE AWARENESS Intelligent Sub-Agent Delegation
1820
+ // MULTI-DOCUMENT DELEGATION (task-driven, not budget-driven)
1649
1821
  //
1650
- // Two triggers for delegation hints:
1651
- // 1. DOCUMENT COUNT: When 3+ documents are detected in the conversation,
1652
- // inject a delegation hint on the FIRST iteration (before the LLM
1653
- // has called any tools). This ensures the agent delegates upfront
1654
- // rather than trying to process all documents itself.
1655
- // 2. TOKEN UTILIZATION: At EVERY iteration, if context is filling up
1656
- // (70%/85%), inject escalating hints to delegate remaining work.
1657
- //
1658
- // This runs mid-chain — so even if tool responses push context up
1659
- // after the first LLM call, subsequent iterations get the hint.
1822
+ // Token-based pressure hints have been removed — the LLM never sees
1823
+ // raw token numbers. Context overflow is handled mechanically by
1824
+ // pruning (Graph) + auto-continuation (client.js max_tokens detection).
1825
+ // See: docs/context-overflow-architecture.md
1660
1826
  // ====================================================================
1661
- const hasTaskToolInContext = agentContext.tools?.some((tool) => {
1662
- const toolName =
1663
- typeof tool === 'object' && 'name' in tool
1664
- ? (tool as { name: string }).name
1665
- : '';
1666
- return toolName === 'task';
1667
- });
1827
+ if (hasTaskTool(agentContext.tools)) {
1828
+ const { count: documentCount, names: documentNames } =
1829
+ detectDocuments(finalMessages);
1668
1830
 
1669
- if (
1670
- hasTaskToolInContext === true &&
1671
- contextAnalytics.utilizationPercent != null &&
1672
- contextAnalytics.maxContextTokens != null
1673
- ) {
1674
- const utilization = contextAnalytics.utilizationPercent;
1675
- const totalTokens = contextAnalytics.totalTokens;
1676
- const maxTokens = contextAnalytics.maxContextTokens;
1677
- const remainingTokens = maxTokens - totalTokens;
1678
-
1679
- // Count attached documents by scanning for document patterns in HumanMessages:
1680
- // 1. # "filename" headers in "Attached document(s):" blocks (text content)
1681
- // 2. **filename1, filename2** in "The user has attached:" blocks (embedded files)
1682
- // 3. Filenames in file_search tool results
1683
- let documentCount = 0;
1684
- const documentNames: string[] = [];
1685
- for (const msg of finalMessages) {
1686
- const content =
1687
- typeof msg.content === 'string'
1688
- ? msg.content
1689
- : Array.isArray(msg.content)
1690
- ? msg.content
1691
- .map((p: unknown) => {
1692
- const part = p as Record<string, unknown>;
1693
- return String(part.text ?? part.content ?? '');
1694
- })
1695
- .join(' ')
1696
- : '';
1697
- // Pattern 1: # "filename" headers in attached document blocks
1698
- const docMatches = content.match(/# "([^"]+)"/g);
1699
- if (docMatches) {
1700
- for (const match of docMatches) {
1701
- const name = match.replace(/# "/, '').replace(/"$/, '');
1702
- if (!documentNames.includes(name)) {
1703
- documentNames.push(name);
1704
- documentCount++;
1705
- }
1706
- }
1707
- }
1708
- // Pattern 2: "The user has attached: **file1, file2**" (embedded files)
1709
- const attachedMatch = content.match(
1710
- /user has attached:\s*\*\*([^*]+)\*\*/i
1711
- );
1712
- if (attachedMatch) {
1713
- const names = attachedMatch[1]
1714
- .split(',')
1715
- .map((n: string) => n.trim())
1716
- .filter(Boolean);
1717
- for (const name of names) {
1718
- if (!documentNames.includes(name)) {
1719
- documentNames.push(name);
1720
- documentCount++;
1721
- }
1722
- }
1723
- }
1724
- }
1725
-
1726
- // BASELINE LOG: Always fires so we can verify this code path runs
1727
- console.debug(
1728
- `[Graph] Context utilization: ${utilization.toFixed(1)}% ` +
1729
- `(${totalTokens}/${maxTokens} tokens, ${remainingTokens} remaining) | ` +
1730
- `hasTaskTool: true | messages: ${finalMessages.length} | docs: ${documentCount}`
1731
- );
1732
-
1733
- // TRIGGER 1: Multi-document delegation (3+ documents detected)
1734
- // Only inject on first iteration (no AI messages yet = agent hasn't responded)
1831
+ // Multi-document delegation: first iteration only (before AI has responded)
1735
1832
  const hasAiResponse = finalMessages.some(
1736
1833
  (m) => m._getType() === 'ai' || m._getType() === 'tool'
1737
1834
  );
1738
- if (documentCount >= 3 && !hasAiResponse) {
1835
+ if (shouldInjectMultiDocHint(documentCount, hasAiResponse)) {
1739
1836
  const pressureMsg = new HumanMessage({
1740
- content:
1741
- `[MULTI-DOCUMENT PROCESSING — ${documentCount} documents detected]\n` +
1742
- `Documents: ${documentNames.join(', ')}\n\n` +
1743
- `You have ${documentCount} documents attached. For thorough analysis, use the "task" tool ` +
1744
- 'to delegate each document (or group of related documents) to a sub-agent.\n' +
1745
- 'Each sub-agent has its own fresh context window and can use file_search to retrieve the full document content.\n' +
1746
- 'After all sub-agents complete, synthesize their results into a comprehensive response.\n\n' +
1747
- 'This approach ensures each document gets full attention without context limitations.',
1837
+ content: buildMultiDocHintContent(documentCount, documentNames),
1748
1838
  });
1749
1839
  finalMessages = [...finalMessages, pressureMsg];
1750
1840
  console.info(
@@ -1752,43 +1842,6 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1752
1842
  `${documentNames.join(', ')}`
1753
1843
  );
1754
1844
  }
1755
-
1756
- // TRIGGER 2: Token utilization thresholds (mid-chain safety net)
1757
- // Also fires when we skipped pruning due to delegationInjectedPrePrune
1758
- if (
1759
- utilization > 85 ||
1760
- (delegationInjectedPrePrune && utilization > 50)
1761
- ) {
1762
- // CRITICAL: Context is high — MANDATE delegation
1763
- const pressureMsg = new HumanMessage({
1764
- content:
1765
- `[CONTEXT BUDGET CRITICAL — ${utilization.toFixed(0)}% used]\n` +
1766
- `You have used ${totalTokens} of ${maxTokens} tokens (${remainingTokens} remaining).\n` +
1767
- 'Your context is very large. You MUST use the "task" tool to delegate work to sub-agents.\n' +
1768
- 'Each sub-agent runs in its own fresh context window and can use file_search to access documents.\n' +
1769
- 'Do NOT attempt to process documents directly — delegate each document to a sub-agent, then synthesize results.',
1770
- });
1771
- finalMessages = [...finalMessages, pressureMsg];
1772
- console.warn(
1773
- `[Graph] Context pressure CRITICAL (${utilization.toFixed(0)}%): ` +
1774
- `Injected mandatory delegation hint. ${remainingTokens} tokens remaining. ` +
1775
- `prePruneSkipped: ${delegationInjectedPrePrune}`
1776
- );
1777
- } else if (utilization > 70) {
1778
- // WARNING: Context filling up — suggest delegation
1779
- const pressureMsg = new HumanMessage({
1780
- content:
1781
- `[CONTEXT BUDGET WARNING — ${utilization.toFixed(0)}% used]\n` +
1782
- `You have used ${totalTokens} of ${maxTokens} tokens (${remainingTokens} remaining).\n` +
1783
- 'Your context is filling up. Consider using the "task" tool to delegate complex operations to sub-agents.\n' +
1784
- "Sub-agents run in fresh context windows and won't consume your remaining budget.",
1785
- });
1786
- finalMessages = [...finalMessages, pressureMsg];
1787
- console.info(
1788
- `[Graph] Context pressure WARNING (${utilization.toFixed(0)}%): ` +
1789
- `Injected delegation suggestion. ${remainingTokens} tokens remaining.`
1790
- );
1791
- }
1792
1845
  }
1793
1846
 
1794
1847
  // Structured output mode: when the agent has NO tools, produce structured JSON immediately.
@@ -2302,13 +2355,6 @@ If I seem to be missing something we discussed earlier, just give me a quick rem
2302
2355
  reducer: (a, b) => {
2303
2356
  if (!a.length) {
2304
2357
  this.startIndex = a.length + b.length;
2305
- console.debug(
2306
- `[Graph:Reducer] Initial messages | startIndex=${this.startIndex} | inputMsgCount=${b.length}`
2307
- );
2308
- } else {
2309
- console.debug(
2310
- `[Graph:Reducer] Appending messages | existing=${a.length} | new=${b.length} | startIndex=${this.startIndex}`
2311
- );
2312
2358
  }
2313
2359
  const result = messagesStateReducer(a, b);
2314
2360
  this.messages = result;
@@ -596,30 +596,38 @@ describe('Pre-invocation utilization gate', () => {
596
596
  expect(emergency.length).toBeLessThan(2000); // Emergency summaries are compact
597
597
  });
598
598
 
599
- it('injects delegation hint at >70% utilization for agents with task tool', () => {
600
- const utilization = 75;
601
- const hasTaskTool = true;
602
-
603
- if (utilization > 70 && hasTaskTool) {
604
- const delegationHint = new HumanMessage({
605
- content:
606
- '[System] Context window is at 75% capacity. Consider delegating complex sub-tasks ' +
607
- 'to the task tool to maintain context availability.',
608
- });
609
- expect(delegationHint.content).toContain('75%');
610
- expect(delegationHint.content).toContain('task tool');
599
+ it('does NOT inject token budget hints at any utilization level', () => {
600
+ // Token budget hints were removed to prevent LLM voluntary bail-out.
601
+ // Context overflow is handled mechanically by pruning + auto-continuation.
602
+ // See: docs/context-overflow-architecture.md
603
+ const utilizationLevels = [50, 70, 85, 95, 101];
604
+ for (const utilization of utilizationLevels) {
605
+ const messages = buildConversation(10, 200);
606
+ // No message should contain raw token numbers or budget percentages
607
+ for (const msg of messages) {
608
+ const content =
609
+ typeof msg.content === 'string'
610
+ ? msg.content
611
+ : JSON.stringify(msg.content);
612
+ expect(content).not.toMatch(/CONTEXT BUDGET/);
613
+ expect(content).not.toMatch(/\d+ of \d+ tokens/);
614
+ }
611
615
  }
612
616
  });
613
617
 
614
- it('does not inject delegation hint below 70%', () => {
615
- const utilization = 65;
616
- let delegationInjected = false;
617
-
618
- if (utilization > 70) {
619
- delegationInjected = true;
618
+ it('post-prune note does not contain token numbers', () => {
619
+ // After pruning, a context note is injected but it must not
620
+ // expose any token counts or budget percentages to the LLM
621
+ const { buildPostPruneNote } = require('@/utils/contextPressure');
622
+ const noteWithSummary = buildPostPruneNote(10, true);
623
+ const noteWithout = buildPostPruneNote(10, false);
624
+ for (const note of [noteWithSummary, noteWithout]) {
625
+ expect(note).not.toBeNull();
626
+ expect(note).not.toMatch(/\d+%/);
627
+ expect(note).not.toMatch(/\d+ of \d+ tokens/);
628
+ expect(note).not.toMatch(/BUDGET/i);
629
+ expect(note).toContain('task');
620
630
  }
621
-
622
- expect(delegationInjected).toBe(false);
623
631
  });
624
632
  });
625
633