@illuma-ai/agents 1.0.96 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/dist/cjs/agents/AgentContext.cjs +6 -2
  2. package/dist/cjs/agents/AgentContext.cjs.map +1 -1
  3. package/dist/cjs/common/constants.cjs +78 -0
  4. package/dist/cjs/common/constants.cjs.map +1 -1
  5. package/dist/cjs/graphs/Graph.cjs +191 -165
  6. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  7. package/dist/cjs/main.cjs +22 -0
  8. package/dist/cjs/main.cjs.map +1 -1
  9. package/dist/cjs/messages/dedup.cjs +95 -0
  10. package/dist/cjs/messages/dedup.cjs.map +1 -0
  11. package/dist/cjs/tools/CodeExecutor.cjs +22 -3
  12. package/dist/cjs/tools/CodeExecutor.cjs.map +1 -1
  13. package/dist/cjs/types/graph.cjs.map +1 -1
  14. package/dist/cjs/utils/contextPressure.cjs +154 -0
  15. package/dist/cjs/utils/contextPressure.cjs.map +1 -0
  16. package/dist/cjs/utils/pruneCalibration.cjs +78 -0
  17. package/dist/cjs/utils/pruneCalibration.cjs.map +1 -0
  18. package/dist/cjs/utils/run.cjs.map +1 -1
  19. package/dist/cjs/utils/tokens.cjs.map +1 -1
  20. package/dist/cjs/utils/toolDiscoveryCache.cjs +127 -0
  21. package/dist/cjs/utils/toolDiscoveryCache.cjs.map +1 -0
  22. package/dist/esm/agents/AgentContext.mjs +6 -2
  23. package/dist/esm/agents/AgentContext.mjs.map +1 -1
  24. package/dist/esm/common/constants.mjs +71 -1
  25. package/dist/esm/common/constants.mjs.map +1 -1
  26. package/dist/esm/graphs/Graph.mjs +192 -166
  27. package/dist/esm/graphs/Graph.mjs.map +1 -1
  28. package/dist/esm/main.mjs +5 -1
  29. package/dist/esm/main.mjs.map +1 -1
  30. package/dist/esm/messages/dedup.mjs +93 -0
  31. package/dist/esm/messages/dedup.mjs.map +1 -0
  32. package/dist/esm/tools/CodeExecutor.mjs +22 -3
  33. package/dist/esm/tools/CodeExecutor.mjs.map +1 -1
  34. package/dist/esm/types/graph.mjs.map +1 -1
  35. package/dist/esm/utils/contextPressure.mjs +148 -0
  36. package/dist/esm/utils/contextPressure.mjs.map +1 -0
  37. package/dist/esm/utils/pruneCalibration.mjs +74 -0
  38. package/dist/esm/utils/pruneCalibration.mjs.map +1 -0
  39. package/dist/esm/utils/run.mjs.map +1 -1
  40. package/dist/esm/utils/tokens.mjs.map +1 -1
  41. package/dist/esm/utils/toolDiscoveryCache.mjs +125 -0
  42. package/dist/esm/utils/toolDiscoveryCache.mjs.map +1 -0
  43. package/dist/types/agents/AgentContext.d.ts +4 -1
  44. package/dist/types/common/constants.d.ts +49 -0
  45. package/dist/types/graphs/Graph.d.ts +25 -0
  46. package/dist/types/messages/dedup.d.ts +25 -0
  47. package/dist/types/messages/index.d.ts +1 -0
  48. package/dist/types/types/graph.d.ts +63 -0
  49. package/dist/types/utils/contextPressure.d.ts +72 -0
  50. package/dist/types/utils/index.d.ts +3 -0
  51. package/dist/types/utils/pruneCalibration.d.ts +43 -0
  52. package/dist/types/utils/toolDiscoveryCache.d.ts +77 -0
  53. package/package.json +1 -1
  54. package/src/agents/AgentContext.ts +7 -0
  55. package/src/common/constants.ts +82 -0
  56. package/src/graphs/Graph.ts +254 -208
  57. package/src/graphs/contextManagement.e2e.test.ts +28 -20
  58. package/src/graphs/gapFeatures.test.ts +520 -0
  59. package/src/graphs/nonBlockingSummarization.test.ts +307 -0
  60. package/src/messages/__tests__/dedup.test.ts +166 -0
  61. package/src/messages/dedup.ts +104 -0
  62. package/src/messages/index.ts +1 -0
  63. package/src/specs/agent-handoffs-bedrock.integration.test.ts +7 -7
  64. package/src/specs/agent-handoffs.test.ts +36 -36
  65. package/src/specs/thinking-handoff.test.ts +10 -10
  66. package/src/tools/CodeExecutor.ts +22 -3
  67. package/src/types/graph.ts +73 -0
  68. package/src/utils/__tests__/pruneCalibration.test.ts +148 -0
  69. package/src/utils/__tests__/toolDiscoveryCache.test.ts +214 -0
  70. package/src/utils/contextPressure.test.ts +262 -0
  71. package/src/utils/contextPressure.ts +188 -0
  72. package/src/utils/index.ts +3 -0
  73. package/src/utils/pruneCalibration.ts +92 -0
  74. package/src/utils/run.ts +108 -108
  75. package/src/utils/tokens.ts +118 -118
  76. package/src/utils/toolDiscoveryCache.ts +150 -0
@@ -17,6 +17,76 @@ const MIN_THINKING_BUDGET = 1024;
17
17
  * compounding across multi-tool conversations (e.g., 10 tool calls).
18
18
  */
19
19
  const TOOL_TURN_THINKING_BUDGET = 1024;
20
+ // ============================================================================
21
+ // CONTEXT OVERFLOW MANAGEMENT
22
+ //
23
+ // Context overflow is handled mechanically — no token budget numbers are
24
+ // exposed to the LLM. The system uses: pruning (Graph), summarization
25
+ // (summarizeCallback), and auto-continuation (client.js max_tokens detection).
26
+ //
27
+ // See: docs/context-overflow-architecture.md
28
+ // ============================================================================
29
+ /**
30
+ * Minimum number of attached documents before the multi-document delegation
31
+ * hint is injected. Below this threshold, the agent processes documents
32
+ * directly within its own context.
33
+ */
34
+ const MULTI_DOCUMENT_THRESHOLD = 3;
35
+ /**
36
+ * Context utilization safety buffer multiplier (0-1).
37
+ * Applied as: effectiveMax = (maxContextTokens - maxOutputTokens) * CONTEXT_SAFETY_BUFFER
38
+ *
39
+ * Reserves headroom so the LLM doesn't hit hard token limits mid-generation.
40
+ * 0.9 = 10% reserved for safety.
41
+ */
42
+ const CONTEXT_SAFETY_BUFFER = 0.9;
43
+ // ============================================================================
44
+ // SUMMARIZATION CONFIGURATION DEFAULTS
45
+ //
46
+ // These constants provide sensible defaults for the SummarizationConfig.
47
+ // They can be overridden per-agent via AgentInputs.summarizationConfig.
48
+ // ============================================================================
49
+ /**
50
+ * Default context utilization percentage (0-100) at which summarization triggers.
51
+ * When the context window is ≥80% full, pruning + summarization activates.
52
+ */
53
+ const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
54
+ /**
55
+ * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
56
+ * 0.3 means 30% of the context budget is reserved for the most recent messages,
57
+ * ensuring the model always has immediate conversation history even after aggressive pruning.
58
+ */
59
+ const SUMMARIZATION_RESERVE_RATIO = 0.3;
60
+ /**
61
+ * Default EMA (Exponential Moving Average) alpha for pruning calibration.
62
+ * Controls how quickly the calibration adapts to new token counts.
63
+ * Higher α = faster adaptation (more responsive to recent changes).
64
+ * Lower α = smoother adaptation (more stable across iterations).
65
+ * 0.3 provides a balance between responsiveness and stability.
66
+ */
67
+ const PRUNING_EMA_ALPHA = 0.3;
68
+ /**
69
+ * Default initial calibration ratio for EMA pruning.
70
+ * 1.0 means no adjustment on the first iteration (trust the raw token counts).
71
+ * Subsequent iterations will adjust based on actual vs. estimated token usage.
72
+ */
73
+ const PRUNING_INITIAL_CALIBRATION = 1.0;
74
+ // ============================================================================
75
+ // TOOL DISCOVERY CACHING
76
+ // ============================================================================
77
+ /**
78
+ * Maximum number of tool discovery entries to cache per conversation.
79
+ * Prevents unbounded memory growth in very long conversations.
80
+ */
81
+ const TOOL_DISCOVERY_CACHE_MAX_SIZE = 200;
82
+ // ============================================================================
83
+ // MESSAGE DEDUPLICATION
84
+ // ============================================================================
85
+ /**
86
+ * Maximum length of system message content to hash for deduplication.
87
+ * Messages longer than this are always considered unique (hashing would be expensive).
88
+ */
89
+ const DEDUP_MAX_CONTENT_LENGTH = 10000;
20
90
 
21
- export { MIN_THINKING_BUDGET, TOOL_TURN_THINKING_BUDGET };
91
+ export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET };
22
92
  //# sourceMappingURL=constants.mjs.map
@@ -1 +1 @@
1
- {"version":3,"file":"constants.mjs","sources":["../../../src/common/constants.ts"],"sourcesContent":["// src/common/constants.ts\n\n/**\n * Minimum thinking budget allowed by the Anthropic API.\n * Extended thinking requires at least 1024 budget_tokens.\n * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking\n */\nexport const MIN_THINKING_BUDGET = 1024;\n\n/**\n * Reduced thinking budget for subsequent ReAct iterations (tool-result turns).\n *\n * In a ReAct agent loop, the first LLM call processes the user's query and\n * may need deep reasoning. Subsequent iterations (after tool results return)\n * typically only need to decide \"call next tool\" or \"generate final response\"\n * — 1024 tokens is sufficient for this routing logic.\n *\n * This reduces wall-clock time per iteration from ~20-30s to ~5-10s,\n * compounding across multi-tool conversations (e.g., 10 tool calls).\n */\nexport const TOOL_TURN_THINKING_BUDGET = 1024;\n"],"names":[],"mappings":"AAAA;AAEA;;;;AAIG;AACI,MAAM,mBAAmB,GAAG;AAEnC;;;;;;;;;;AAUG;AACI,MAAM,yBAAyB,GAAG;;;;"}
1
+ {"version":3,"file":"constants.mjs","sources":["../../../src/common/constants.ts"],"sourcesContent":["// src/common/constants.ts\n\n/**\n * Minimum thinking budget allowed by the Anthropic API.\n * Extended thinking requires at least 1024 budget_tokens.\n * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking\n */\nexport const MIN_THINKING_BUDGET = 1024;\n\n/**\n * Reduced thinking budget for subsequent ReAct iterations (tool-result turns).\n *\n * In a ReAct agent loop, the first LLM call processes the user's query and\n * may need deep reasoning. Subsequent iterations (after tool results return)\n * typically only need to decide \"call next tool\" or \"generate final response\"\n * — 1024 tokens is sufficient for this routing logic.\n *\n * This reduces wall-clock time per iteration from ~20-30s to ~5-10s,\n * compounding across multi-tool conversations (e.g., 10 tool calls).\n */\nexport const TOOL_TURN_THINKING_BUDGET = 1024;\n\n// ============================================================================\n// CONTEXT OVERFLOW MANAGEMENT\n//\n// Context overflow is handled mechanically — no token budget numbers are\n// exposed to the LLM. The system uses: pruning (Graph), summarization\n// (summarizeCallback), and auto-continuation (client.js max_tokens detection).\n//\n// See: docs/context-overflow-architecture.md\n// ============================================================================\n\n/**\n * Minimum number of attached documents before the multi-document delegation\n * hint is injected. 
Below this threshold, the agent processes documents\n * directly within its own context.\n */\nexport const MULTI_DOCUMENT_THRESHOLD = 3;\n\n/**\n * Context utilization safety buffer multiplier (0-1).\n * Applied as: effectiveMax = (maxContextTokens - maxOutputTokens) * CONTEXT_SAFETY_BUFFER\n *\n * Reserves headroom so the LLM doesn't hit hard token limits mid-generation.\n * 0.9 = 10% reserved for safety.\n */\nexport const CONTEXT_SAFETY_BUFFER = 0.9;\n\n// ============================================================================\n// SUMMARIZATION CONFIGURATION DEFAULTS\n//\n// These constants provide sensible defaults for the SummarizationConfig.\n// They can be overridden per-agent via AgentInputs.summarizationConfig.\n// ============================================================================\n\n/**\n * Default context utilization percentage (0-100) at which summarization triggers.\n * When the context window is ≥80% full, pruning + summarization activates.\n */\nexport const SUMMARIZATION_CONTEXT_THRESHOLD = 80;\n\n/**\n * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.\n * 0.3 means 30% of the context budget is reserved for the most recent messages,\n * ensuring the model always has immediate conversation history even after aggressive pruning.\n */\nexport const SUMMARIZATION_RESERVE_RATIO = 0.3;\n\n/**\n * Default EMA (Exponential Moving Average) alpha for pruning calibration.\n * Controls how quickly the calibration adapts to new token counts.\n * Higher α = faster adaptation (more responsive to recent changes).\n * Lower α = smoother adaptation (more stable across iterations).\n * 0.3 provides a balance between responsiveness and stability.\n */\nexport const PRUNING_EMA_ALPHA = 0.3;\n\n/**\n * Default initial calibration ratio for EMA pruning.\n * 1.0 means no adjustment on the first iteration (trust the raw token counts).\n * Subsequent iterations will adjust based on actual vs. 
estimated token usage.\n */\nexport const PRUNING_INITIAL_CALIBRATION = 1.0;\n\n// ============================================================================\n// TOOL DISCOVERY CACHING\n// ============================================================================\n\n/**\n * Maximum number of tool discovery entries to cache per conversation.\n * Prevents unbounded memory growth in very long conversations.\n */\nexport const TOOL_DISCOVERY_CACHE_MAX_SIZE = 200;\n\n// ============================================================================\n// MESSAGE DEDUPLICATION\n// ============================================================================\n\n/**\n * Maximum length of system message content to hash for deduplication.\n * Messages longer than this are always considered unique (hashing would be expensive).\n */\nexport const DEDUP_MAX_CONTENT_LENGTH = 10000;\n"],"names":[],"mappings":"AAAA;AAEA;;;;AAIG;AACI,MAAM,mBAAmB,GAAG;AAEnC;;;;;;;;;;AAUG;AACI,MAAM,yBAAyB,GAAG;AAEzC;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AAEA;;;;AAIG;AACI,MAAM,wBAAwB,GAAG;AAExC;;;;;;AAMG;AACI,MAAM,qBAAqB,GAAG;AAErC;AACA;AACA;AACA;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,+BAA+B,GAAG;AAE/C;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;;;;;;AAMG;AACI,MAAM,iBAAiB,GAAG;AAEjC;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,6BAA6B,GAAG;AAE7C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,wBAAwB,GAAG;;;;"}
@@ -10,9 +10,9 @@ import { createPruneMessages } from '../messages/prune.mjs';
10
10
  import { ensureThinkingBlockInMessages } from '../messages/format.mjs';
11
11
  import { addCacheControl, addBedrockCacheControl } from '../messages/cache.mjs';
12
12
  import { formatContentStrings } from '../messages/content.mjs';
13
- import { extractToolDiscoveries } from '../messages/tools.mjs';
14
13
  import { GraphNodeKeys, Providers, ContentTypes, GraphEvents, MessageTypes, StepTypes, Constants } from '../common/enum.mjs';
15
- import { TOOL_TURN_THINKING_BUDGET } from '../common/constants.mjs';
14
+ import { TOOL_TURN_THINKING_BUDGET, SUMMARIZATION_CONTEXT_THRESHOLD } from '../common/constants.mjs';
15
+ import { deduplicateSystemMessages } from '../messages/dedup.mjs';
16
16
  import { resetIfNotEmpty, joinKeys } from '../utils/graph.mjs';
17
17
  import { isOpenAILike, isGoogleLike } from '../utils/llm.mjs';
18
18
  import { ChatModelStreamHandler } from '../stream.mjs';
@@ -22,6 +22,9 @@ import 'ai-tokenizer';
22
22
  import '../utils/toonFormat.mjs';
23
23
  import { buildContextAnalytics } from '../utils/contextAnalytics.mjs';
24
24
  import 'zod-to-json-schema';
25
+ import { hasTaskTool, buildPostPruneNote, detectDocuments, shouldInjectMultiDocHint, buildMultiDocHintContent } from '../utils/contextPressure.mjs';
26
+ import { ToolDiscoveryCache } from '../utils/toolDiscoveryCache.mjs';
27
+ import { createPruneCalibration, applyCalibration, updatePruneCalibration } from '../utils/pruneCalibration.mjs';
25
28
  import { getChatModelClass, manualToolStreamProviders } from '../llm/providers.mjs';
26
29
  import { ToolNode, toolsCondition } from '../tools/ToolNode.mjs';
27
30
  import { ChatOpenAI, AzureChatOpenAI } from '../llm/openai/index.mjs';
@@ -90,6 +93,13 @@ class StandardGraph extends Graph {
90
93
  runId;
91
94
  startIndex = 0;
92
95
  signal;
96
+ /** Cached summary from the first prune in this run.
97
+ * Reused for subsequent prunes to avoid blocking LLM calls on every tool iteration. */
98
+ _cachedRunSummary;
99
+ /** EMA-based pruning calibration state — smooths token budget adjustments across iterations */
100
+ _pruneCalibration;
101
+ /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
102
+ _toolDiscoveryCache;
93
103
  /** Map of agent contexts by agent ID */
94
104
  agentContexts = new Map();
95
105
  /** Default agent ID to use */
@@ -110,6 +120,19 @@ class StandardGraph extends Graph {
110
120
  this.agentContexts.set(agentConfig.agentId, agentContext);
111
121
  }
112
122
  this.defaultAgentId = agents[0].agentId;
123
+ // Seed cached summary from persisted storage so the first prune in a
124
+ // resumed conversation can also skip the synchronous LLM summarization call
125
+ const primaryContext = this.agentContexts.get(this.defaultAgentId);
126
+ if (primaryContext?.persistedSummary) {
127
+ this._cachedRunSummary = primaryContext.persistedSummary;
128
+ }
129
+ // Initialize EMA pruning calibration
130
+ this._pruneCalibration = createPruneCalibration();
131
+ // Initialize tool discovery cache, seeded with any pre-existing discoveries
132
+ this._toolDiscoveryCache = new ToolDiscoveryCache();
133
+ if (primaryContext?.discoveredToolNames.size) {
134
+ this._toolDiscoveryCache.seed([...primaryContext.discoveredToolNames]);
135
+ }
113
136
  }
114
137
  /* Init */
115
138
  resetValues(keepContent) {
@@ -132,6 +155,9 @@ class StandardGraph extends Graph {
132
155
  this.messageStepHasToolCalls = resetIfNotEmpty(this.messageStepHasToolCalls, new Map());
133
156
  this.prelimMessageIdsByStepKey = resetIfNotEmpty(this.prelimMessageIdsByStepKey, new Map());
134
157
  this.invokedToolIds = resetIfNotEmpty(this.invokedToolIds, undefined);
158
+ // Reset EMA calibration and tool discovery cache for fresh run
159
+ this._pruneCalibration = createPruneCalibration();
160
+ this._toolDiscoveryCache.reset();
135
161
  for (const context of this.agentContexts.values()) {
136
162
  context.reset();
137
163
  }
@@ -220,6 +246,62 @@ class StandardGraph extends Graph {
220
246
  }
221
247
  return clientOptions;
222
248
  }
249
+ /**
250
+ * Determines whether summarization should trigger based on SummarizationConfig.
251
+ *
252
+ * Supports three trigger strategies:
253
+ * - contextPercentage (default): Trigger when context utilization >= threshold%
254
+ * - messageCount: Trigger when pruned message count >= threshold
255
+ * - tokenThreshold: Trigger when total estimated tokens >= threshold
256
+ *
257
+ * When no config is provided, always triggers (preserves backward compatibility).
258
+ *
259
+ * @param prunedMessageCount - Number of messages that were pruned
260
+ * @param maxContextTokens - Maximum context token budget
261
+ * @param indexTokenCountMap - Token count map by message index
262
+ * @param instructionTokens - Token count for instructions/system message
263
+ * @param config - Optional SummarizationConfig
264
+ * @returns Whether summarization should be triggered
265
+ */
266
+ shouldTriggerSummarization(prunedMessageCount, maxContextTokens, indexTokenCountMap, instructionTokens, config) {
267
+ // No pruned messages means nothing to summarize
268
+ if (prunedMessageCount === 0) {
269
+ return false;
270
+ }
271
+ // No config = backward compatible (always summarize when messages are pruned)
272
+ if (!config || !config.triggerType) {
273
+ return true;
274
+ }
275
+ const threshold = config.triggerThreshold;
276
+ switch (config.triggerType) {
277
+ case 'contextPercentage': {
278
+ if (maxContextTokens <= 0)
279
+ return true;
280
+ const effectiveThreshold = threshold ?? SUMMARIZATION_CONTEXT_THRESHOLD;
281
+ let totalTokens = instructionTokens;
282
+ for (const key in indexTokenCountMap) {
283
+ totalTokens += indexTokenCountMap[key] ?? 0;
284
+ }
285
+ const utilization = (totalTokens / maxContextTokens) * 100;
286
+ return utilization >= effectiveThreshold;
287
+ }
288
+ case 'messageCount': {
289
+ const effectiveThreshold = threshold ?? 5;
290
+ return prunedMessageCount >= effectiveThreshold;
291
+ }
292
+ case 'tokenThreshold': {
293
+ if (threshold == null)
294
+ return true;
295
+ let totalTokens = instructionTokens;
296
+ for (const key in indexTokenCountMap) {
297
+ totalTokens += indexTokenCountMap[key] ?? 0;
298
+ }
299
+ return totalTokens >= threshold;
300
+ }
301
+ default:
302
+ return true;
303
+ }
304
+ }
223
305
  /**
224
306
  * Returns the normalized finish/stop reason from the last LLM invocation.
225
307
  * Used by callers to detect when the response was truncated due to max_tokens.
@@ -358,7 +440,6 @@ class StandardGraph extends Graph {
358
440
  /* Misc.*/
359
441
  getRunMessages() {
360
442
  const result = this.messages.slice(this.startIndex);
361
- console.debug(`[Graph] getRunMessages() | totalMessages=${this.messages.length} | startIndex=${this.startIndex} | runMessages=${result.length}`);
362
443
  return result;
363
444
  }
364
445
  getContentParts() {
@@ -914,10 +995,12 @@ class StandardGraph extends Graph {
914
995
  });
915
996
  messages = [dynamicContextMessage, ackMessage, ...messages];
916
997
  }
917
- // Extract tool discoveries from current turn only (similar to formatArtifactPayload pattern)
918
- const discoveredNames = extractToolDiscoveries(messages);
919
- if (discoveredNames.length > 0) {
920
- agentContext.markToolsAsDiscovered(discoveredNames);
998
+ // Tool discovery caching: only scan new messages since last iteration
999
+ // instead of re-parsing the full history via extractToolDiscoveries()
1000
+ const cachedDiscoveries = this._toolDiscoveryCache.getNewDiscoveries(messages);
1001
+ if (cachedDiscoveries.length > 0) {
1002
+ agentContext.markToolsAsDiscovered(cachedDiscoveries);
1003
+ console.debug(`[Graph:ToolDiscovery] Cached ${cachedDiscoveries.length} new tools (total: ${this._toolDiscoveryCache.size})`);
921
1004
  }
922
1005
  const toolsForBinding = agentContext.getToolsForBinding();
923
1006
  // PERF: Detect subsequent ReAct iterations (tool results present in messages)
@@ -948,36 +1031,12 @@ class StandardGraph extends Graph {
948
1031
  let messagesToUse = messages;
949
1032
  // ====================================================================
950
1033
  // PRE-PRUNING DELEGATION CHECK
951
- // Before pruning strips messages (losing context), check if we should
952
- // delegate instead. If context would be pruned AND the agent has the
953
- // task tool, inject a delegation hint and SKIP pruning — preserving
954
- // the content for the LLM to understand what to delegate.
955
1034
  // ====================================================================
956
- let delegationInjectedPrePrune = false;
957
- const hasTaskToolPrePrune = agentContext.tools?.some((tool) => {
958
- const toolName = typeof tool === 'object' && 'name' in tool
959
- ? tool.name
960
- : '';
961
- return toolName === 'task';
962
- });
963
- if (hasTaskToolPrePrune === true &&
964
- agentContext.tokenCounter &&
965
- agentContext.maxContextTokens != null) {
966
- // Estimate total tokens in messages BEFORE pruning
967
- let prePruneTokens = 0;
968
- for (const msg of messages) {
969
- prePruneTokens += agentContext.tokenCounter(msg);
970
- }
971
- // Add instruction tokens (system prompt)
972
- prePruneTokens += agentContext.instructionTokens;
973
- const prePruneUtilization = (prePruneTokens / agentContext.maxContextTokens) * 100;
974
- if (prePruneUtilization > 70) {
975
- console.warn(`[Graph] PRE-PRUNE delegation check: ${prePruneUtilization.toFixed(1)}% utilization ` +
976
- `(${prePruneTokens}/${agentContext.maxContextTokens} tokens). ` +
977
- 'Injecting delegation hint INSTEAD of pruning.');
978
- delegationInjectedPrePrune = true;
979
- }
980
- }
1035
+ // Context management is now fully mechanical:
1036
+ // - Pruning always runs when needed (no delegation-based skip)
1037
+ // - Auto-continuation in client.js handles max_tokens finish reason
1038
+ // - LLM never sees raw token numbers (prevents voluntary bail-out)
1039
+ // ====================================================================
981
1040
  if (!agentContext.pruneMessages &&
982
1041
  agentContext.tokenCounter &&
983
1042
  agentContext.maxContextTokens != null &&
@@ -991,50 +1050,121 @@ class StandardGraph extends Graph {
991
1050
  (agentContext.provider === Providers.OPENAI &&
992
1051
  agentContext.clientOptions.modelKwargs
993
1052
  ?.thinking?.type === 'enabled');
1053
+ // Apply EMA calibration to max token budget — smooths pruning across iterations
1054
+ const calibratedMaxTokens = applyCalibration(agentContext.maxContextTokens, this._pruneCalibration);
994
1055
  agentContext.pruneMessages = createPruneMessages({
995
1056
  startIndex: this.startIndex,
996
1057
  provider: agentContext.provider,
997
1058
  tokenCounter: agentContext.tokenCounter,
998
- maxTokens: agentContext.maxContextTokens,
1059
+ maxTokens: calibratedMaxTokens,
999
1060
  thinkingEnabled: isAnthropicWithThinking,
1000
1061
  indexTokenCountMap: agentContext.indexTokenCountMap,
1001
1062
  });
1002
1063
  }
1003
- if (agentContext.pruneMessages && !delegationInjectedPrePrune) {
1004
- console.debug(`[Graph:ContextMgmt] Pruning messages | inputCount=${messages.length} | maxTokens=${agentContext.maxContextTokens}`);
1064
+ // Update EMA calibration with actual token usage from API response
1065
+ if (agentContext.currentUsage?.input_tokens &&
1066
+ agentContext.maxContextTokens) {
1067
+ const estimatedTokens = Object.values(agentContext.indexTokenCountMap).reduce((sum, v) => (sum ?? 0) + (v ?? 0), 0);
1068
+ if (estimatedTokens > 0) {
1069
+ this._pruneCalibration = updatePruneCalibration(this._pruneCalibration, agentContext.currentUsage.input_tokens, estimatedTokens);
1070
+ }
1071
+ }
1072
+ if (agentContext.pruneMessages) {
1005
1073
  const { context, indexTokenCountMap, messagesToRefine } = agentContext.pruneMessages({
1006
1074
  messages,
1007
1075
  usageMetadata: agentContext.currentUsage,
1008
- // startOnMessageType: 'human',
1009
1076
  });
1010
1077
  agentContext.indexTokenCountMap = indexTokenCountMap;
1011
1078
  messagesToUse = context;
1012
- console.debug(`[Graph:ContextMgmt] Pruned | kept=${context.length} | discarded=${messagesToRefine.length} | originalCount=${messages.length}`);
1013
- // Summarize discarded messages if callback provided
1014
- if (messagesToRefine.length > 0 && agentContext.summarizeCallback) {
1015
- console.debug(`[Graph:ContextMgmt] Summarizing ${messagesToRefine.length} discarded messages`);
1079
+ // ── Non-blocking summarization ──────────────────────────────────
1080
+ // NEVER block the LLM call waiting for summarization. Instead:
1081
+ // 1. If _cachedRunSummary exists use it, fire async update
1082
+ // 2. If persistedSummary exists → use it as fallback, fire async update
1083
+ // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
1084
+ // The summary catches up asynchronously and is available for subsequent
1085
+ // iterations (tool calls) and the next conversation turn.
1086
+ //
1087
+ // SummarizationConfig integration:
1088
+ // - triggerType/triggerThreshold control WHEN summarization fires
1089
+ // - reserveRatio is enforced via calibrated maxTokens (above)
1090
+ // - initialSummary provides cross-run seeding as fallback before persistedSummary
1091
+ let hasSummary = false;
1092
+ const sumConfig = agentContext.summarizationConfig;
1093
+ const shouldSummarize = this.shouldTriggerSummarization(messagesToRefine.length, agentContext.maxContextTokens ?? 0, agentContext.indexTokenCountMap, agentContext.instructionTokens, sumConfig);
1094
+ if (messagesToRefine.length > 0 &&
1095
+ agentContext.summarizeCallback &&
1096
+ shouldSummarize) {
1016
1097
  try {
1017
- const summary = await agentContext.summarizeCallback(messagesToRefine);
1018
- console.debug(`[Graph:ContextMgmt] Summary received | len=${summary?.length ?? 0} | hasContent=${summary != null && summary !== ''}`);
1098
+ let summary;
1099
+ let summarySource;
1100
+ if (this._cachedRunSummary != null) {
1101
+ summary = this._cachedRunSummary;
1102
+ summarySource = 'cached';
1103
+ }
1104
+ else if (agentContext.persistedSummary != null &&
1105
+ agentContext.persistedSummary !== '') {
1106
+ summary = agentContext.persistedSummary;
1107
+ this._cachedRunSummary = summary;
1108
+ summarySource = 'persisted';
1109
+ }
1110
+ else if (sumConfig?.initialSummary != null &&
1111
+ sumConfig.initialSummary !== '') {
1112
+ // Cross-run seed: use initialSummary when no persisted summary exists
1113
+ summary = sumConfig.initialSummary;
1114
+ this._cachedRunSummary = summary;
1115
+ summarySource = 'initial-seed';
1116
+ }
1117
+ else {
1118
+ summarySource = 'none';
1119
+ }
1120
+ // Single consolidated log for the entire prune+summarize decision
1121
+ console.debug(`[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`);
1122
+ // Fire background summarization — updates cache for next iteration/turn
1123
+ agentContext
1124
+ .summarizeCallback(messagesToRefine)
1125
+ .then((updated) => {
1126
+ if (updated != null && updated !== '') {
1127
+ this._cachedRunSummary = updated;
1128
+ }
1129
+ })
1130
+ .catch((err) => {
1131
+ console.error('[Graph] Background summary failed (non-fatal):', err);
1132
+ });
1019
1133
  if (summary != null && summary !== '') {
1134
+ hasSummary = true;
1020
1135
  const summaryMsg = new SystemMessage(`[Conversation Summary]\n${summary}`);
1021
- // Insert after system message (if present), before conversation messages
1022
1136
  const systemIdx = messagesToUse[0]?.getType() === 'system' ? 1 : 0;
1023
1137
  messagesToUse = [
1024
1138
  ...messagesToUse.slice(0, systemIdx),
1025
1139
  summaryMsg,
1026
1140
  ...messagesToUse.slice(systemIdx),
1027
1141
  ];
1028
- console.debug(`[Graph:ContextMgmt] Summary injected at index ${systemIdx} | finalMsgCount=${messagesToUse.length}`);
1029
1142
  }
1030
1143
  }
1031
1144
  catch (err) {
1032
- console.error('[Graph] Summarization callback failed:', err);
1145
+ console.error('[Graph] Summarization failed:', err);
1146
+ }
1147
+ }
1148
+ else if (messagesToRefine.length > 0) {
1149
+ // Log pruning even when no summarize callback (discard mode)
1150
+ console.debug(`[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`);
1151
+ }
1152
+ // Deduplicate system messages that accumulate from repeated tool iterations
1153
+ const { messages: dedupedMessages, removedCount } = deduplicateSystemMessages(messagesToUse);
1154
+ if (removedCount > 0) {
1155
+ messagesToUse = dedupedMessages;
1156
+ console.debug(`[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`);
1157
+ }
1158
+ // Post-prune context note for task-tool-enabled agents
1159
+ if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
1160
+ const postPruneNote = buildPostPruneNote(messagesToRefine.length, hasSummary);
1161
+ if (postPruneNote) {
1162
+ messagesToUse = [
1163
+ ...messagesToUse,
1164
+ new SystemMessage(postPruneNote),
1165
+ ];
1033
1166
  }
1034
1167
  }
1035
- }
1036
- else if (delegationInjectedPrePrune) {
1037
- console.info('[Graph] Skipping pruning — delegation will handle context pressure');
1038
1168
  }
1039
1169
  let finalMessages = messagesToUse;
1040
1170
  if (agentContext.useLegacyContent) {
@@ -1147,125 +1277,25 @@ class StandardGraph extends Graph {
1147
1277
  analytics: contextAnalytics,
1148
1278
  }, config);
1149
1279
  // ====================================================================
1150
- // CONTEXT PRESSURE AWARENESS Intelligent Sub-Agent Delegation
1151
- //
1152
- // Two triggers for delegation hints:
1153
- // 1. DOCUMENT COUNT: When 3+ documents are detected in the conversation,
1154
- // inject a delegation hint on the FIRST iteration (before the LLM
1155
- // has called any tools). This ensures the agent delegates upfront
1156
- // rather than trying to process all documents itself.
1157
- // 2. TOKEN UTILIZATION: At EVERY iteration, if context is filling up
1158
- // (70%/85%), inject escalating hints to delegate remaining work.
1280
+ // MULTI-DOCUMENT DELEGATION (task-driven, not budget-driven)
1159
1281
  //
1160
- // This runs mid-chain so even if tool responses push context up
1161
- // after the first LLM call, subsequent iterations get the hint.
1282
+ // Token-based pressure hints have been removed — the LLM never sees
1283
+ // raw token numbers. Context overflow is handled mechanically by
1284
+ // pruning (Graph) + auto-continuation (client.js max_tokens detection).
1285
+ // See: docs/context-overflow-architecture.md
1162
1286
  // ====================================================================
1163
- const hasTaskToolInContext = agentContext.tools?.some((tool) => {
1164
- const toolName = typeof tool === 'object' && 'name' in tool
1165
- ? tool.name
1166
- : '';
1167
- return toolName === 'task';
1168
- });
1169
- if (hasTaskToolInContext === true &&
1170
- contextAnalytics.utilizationPercent != null &&
1171
- contextAnalytics.maxContextTokens != null) {
1172
- const utilization = contextAnalytics.utilizationPercent;
1173
- const totalTokens = contextAnalytics.totalTokens;
1174
- const maxTokens = contextAnalytics.maxContextTokens;
1175
- const remainingTokens = maxTokens - totalTokens;
1176
- // Count attached documents by scanning for document patterns in HumanMessages:
1177
- // 1. # "filename" headers in "Attached document(s):" blocks (text content)
1178
- // 2. **filename1, filename2** in "The user has attached:" blocks (embedded files)
1179
- // 3. Filenames in file_search tool results
1180
- let documentCount = 0;
1181
- const documentNames = [];
1182
- for (const msg of finalMessages) {
1183
- const content = typeof msg.content === 'string'
1184
- ? msg.content
1185
- : Array.isArray(msg.content)
1186
- ? msg.content
1187
- .map((p) => {
1188
- const part = p;
1189
- return String(part.text ?? part.content ?? '');
1190
- })
1191
- .join(' ')
1192
- : '';
1193
- // Pattern 1: # "filename" headers in attached document blocks
1194
- const docMatches = content.match(/# "([^"]+)"/g);
1195
- if (docMatches) {
1196
- for (const match of docMatches) {
1197
- const name = match.replace(/# "/, '').replace(/"$/, '');
1198
- if (!documentNames.includes(name)) {
1199
- documentNames.push(name);
1200
- documentCount++;
1201
- }
1202
- }
1203
- }
1204
- // Pattern 2: "The user has attached: **file1, file2**" (embedded files)
1205
- const attachedMatch = content.match(/user has attached:\s*\*\*([^*]+)\*\*/i);
1206
- if (attachedMatch) {
1207
- const names = attachedMatch[1]
1208
- .split(',')
1209
- .map((n) => n.trim())
1210
- .filter(Boolean);
1211
- for (const name of names) {
1212
- if (!documentNames.includes(name)) {
1213
- documentNames.push(name);
1214
- documentCount++;
1215
- }
1216
- }
1217
- }
1218
- }
1219
- // BASELINE LOG: Always fires so we can verify this code path runs
1220
- console.debug(`[Graph] Context utilization: ${utilization.toFixed(1)}% ` +
1221
- `(${totalTokens}/${maxTokens} tokens, ${remainingTokens} remaining) | ` +
1222
- `hasTaskTool: true | messages: ${finalMessages.length} | docs: ${documentCount}`);
1223
- // TRIGGER 1: Multi-document delegation (3+ documents detected)
1224
- // Only inject on first iteration (no AI messages yet = agent hasn't responded)
1287
+ if (hasTaskTool(agentContext.tools)) {
1288
+ const { count: documentCount, names: documentNames } = detectDocuments(finalMessages);
1289
+ // Multi-document delegation: first iteration only (before AI has responded)
1225
1290
  const hasAiResponse = finalMessages.some((m) => m._getType() === 'ai' || m._getType() === 'tool');
1226
- if (documentCount >= 3 && !hasAiResponse) {
1291
+ if (shouldInjectMultiDocHint(documentCount, hasAiResponse)) {
1227
1292
  const pressureMsg = new HumanMessage({
1228
- content: `[MULTI-DOCUMENT PROCESSING — ${documentCount} documents detected]\n` +
1229
- `Documents: ${documentNames.join(', ')}\n\n` +
1230
- `You have ${documentCount} documents attached. For thorough analysis, use the "task" tool ` +
1231
- 'to delegate each document (or group of related documents) to a sub-agent.\n' +
1232
- 'Each sub-agent has its own fresh context window and can use file_search to retrieve the full document content.\n' +
1233
- 'After all sub-agents complete, synthesize their results into a comprehensive response.\n\n' +
1234
- 'This approach ensures each document gets full attention without context limitations.',
1293
+ content: buildMultiDocHintContent(documentCount, documentNames),
1235
1294
  });
1236
1295
  finalMessages = [...finalMessages, pressureMsg];
1237
1296
  console.info(`[Graph] Multi-document delegation hint injected for ${documentCount} documents: ` +
1238
1297
  `${documentNames.join(', ')}`);
1239
1298
  }
1240
- // TRIGGER 2: Token utilization thresholds (mid-chain safety net)
1241
- // Also fires when we skipped pruning due to delegationInjectedPrePrune
1242
- if (utilization > 85 ||
1243
- (delegationInjectedPrePrune && utilization > 50)) {
1244
- // CRITICAL: Context is high — MANDATE delegation
1245
- const pressureMsg = new HumanMessage({
1246
- content: `[CONTEXT BUDGET CRITICAL — ${utilization.toFixed(0)}% used]\n` +
1247
- `You have used ${totalTokens} of ${maxTokens} tokens (${remainingTokens} remaining).\n` +
1248
- 'Your context is very large. You MUST use the "task" tool to delegate work to sub-agents.\n' +
1249
- 'Each sub-agent runs in its own fresh context window and can use file_search to access documents.\n' +
1250
- 'Do NOT attempt to process documents directly — delegate each document to a sub-agent, then synthesize results.',
1251
- });
1252
- finalMessages = [...finalMessages, pressureMsg];
1253
- console.warn(`[Graph] Context pressure CRITICAL (${utilization.toFixed(0)}%): ` +
1254
- `Injected mandatory delegation hint. ${remainingTokens} tokens remaining. ` +
1255
- `prePruneSkipped: ${delegationInjectedPrePrune}`);
1256
- }
1257
- else if (utilization > 70) {
1258
- // WARNING: Context filling up — suggest delegation
1259
- const pressureMsg = new HumanMessage({
1260
- content: `[CONTEXT BUDGET WARNING — ${utilization.toFixed(0)}% used]\n` +
1261
- `You have used ${totalTokens} of ${maxTokens} tokens (${remainingTokens} remaining).\n` +
1262
- 'Your context is filling up. Consider using the "task" tool to delegate complex operations to sub-agents.\n' +
1263
- "Sub-agents run in fresh context windows and won't consume your remaining budget.",
1264
- });
1265
- finalMessages = [...finalMessages, pressureMsg];
1266
- console.info(`[Graph] Context pressure WARNING (${utilization.toFixed(0)}%): ` +
1267
- `Injected delegation suggestion. ${remainingTokens} tokens remaining.`);
1268
- }
1269
1299
  }
1270
1300
  // Structured output mode: when the agent has NO tools, produce structured JSON immediately.
1271
1301
  // When the agent HAS tools, we defer structured output until after tool use completes
@@ -1659,10 +1689,6 @@ If I seem to be missing something we discussed earlier, just give me a quick rem
1659
1689
  reducer: (a, b) => {
1660
1690
  if (!a.length) {
1661
1691
  this.startIndex = a.length + b.length;
1662
- console.debug(`[Graph:Reducer] Initial messages | startIndex=${this.startIndex} | inputMsgCount=${b.length}`);
1663
- }
1664
- else {
1665
- console.debug(`[Graph:Reducer] Appending messages | existing=${a.length} | new=${b.length} | startIndex=${this.startIndex}`);
1666
1692
  }
1667
1693
  const result = messagesStateReducer(a, b);
1668
1694
  this.messages = result;