@illuma-ai/agents 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/constants.cjs +12 -0
- package/dist/cjs/common/constants.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +81 -12
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/main.cjs +1 -0
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/esm/common/constants.mjs +12 -1
- package/dist/esm/common/constants.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +83 -14
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/main.mjs +1 -1
- package/dist/types/common/constants.d.ts +11 -0
- package/dist/types/graphs/Graph.d.ts +9 -0
- package/package.json +1 -1
- package/src/common/constants.ts +12 -0
- package/src/graphs/Graph.ts +108 -15
- package/src/graphs/gapFeatures.test.ts +113 -0
package/dist/cjs/common/constants.cjs
@@ -53,6 +53,17 @@ const CONTEXT_SAFETY_BUFFER = 0.9;
  * When the context window is ≥80% full, pruning + summarization activates.
  */
 const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
+/**
+ * Proactive summarization threshold (0-1 fraction of context window).
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
+ * This gives the summary time to complete so it's ready when context actually fills up.
+ *
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
+ * 80% → proactive background summary
+ * 90% → pruning kicks in (with summary already cached)
+ * 100% → graceful: use existing summary + recent messages, never block
+ */
+const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
 /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
@@ -94,6 +105,7 @@ exports.CONTEXT_SAFETY_BUFFER = CONTEXT_SAFETY_BUFFER;
 exports.DEDUP_MAX_CONTENT_LENGTH = DEDUP_MAX_CONTENT_LENGTH;
 exports.MIN_THINKING_BUDGET = MIN_THINKING_BUDGET;
 exports.MULTI_DOCUMENT_THRESHOLD = MULTI_DOCUMENT_THRESHOLD;
+exports.PROACTIVE_SUMMARY_THRESHOLD = PROACTIVE_SUMMARY_THRESHOLD;
 exports.PRUNING_EMA_ALPHA = PRUNING_EMA_ALPHA;
 exports.PRUNING_INITIAL_CALIBRATION = PRUNING_INITIAL_CALIBRATION;
 exports.SUMMARIZATION_CONTEXT_THRESHOLD = SUMMARIZATION_CONTEXT_THRESHOLD;
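The 3-tier strategy described in the new constant's doc comment can be read as a small utilization classifier. The sketch below is illustrative only: `classifyContextPressure` is a hypothetical helper, not part of the package, and it assumes utilization is expressed as a 0-100 percentage of the effective context window. Only the 80% proactive threshold is an exported constant; the 90% and 100% tiers come from the doc comment.

```ts
// Hypothetical helper restating the 3-tier strategy from the comment above.
const PROACTIVE_SUMMARY_THRESHOLD = 0.8;

type ContextTier =
  | 'ok'                // below 80%: nothing to do
  | 'proactive-summary' // ≥80%: fire background summary before pruning is needed
  | 'prune'             // ≥90%: prune, summary should already be cached
  | 'graceful';         // ≥100%: use existing summary + recent messages, never block

function classifyContextPressure(usedTokens: number, maxContextTokens: number): ContextTier {
  const utilization = (usedTokens / maxContextTokens) * 100;
  if (utilization >= 100) return 'graceful';
  if (utilization >= 90) return 'prune';
  if (utilization >= PROACTIVE_SUMMARY_THRESHOLD * 100) return 'proactive-summary';
  return 'ok';
}

// e.g. a 200k-token window currently holding 170k tokens:
console.log(classifyContextPressure(170_000, 200_000)); // → 'proactive-summary'
```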
package/dist/cjs/graphs/Graph.cjs
@@ -102,6 +102,15 @@ class StandardGraph extends Graph {
     _pruneCalibration;
     /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
     _toolDiscoveryCache;
+    /**
+     * SCALE: Tracks whether a summary call is already in-flight for this Graph instance.
+     * Prevents multiple concurrent summary LLM calls when rapid tool iterations each
+     * trigger pruning. At 2000 users with 3+ tool calls per turn, this caps what would
+     * be 6000+ summary calls/turn at 2000 (at most one in-flight call per Graph).
+     */
+    _summaryInFlight = false;
+    /** Messages accumulated across tool iterations while a summary call is in-flight */
+    _pendingMessagesToRefine = [];
     /** Map of agent contexts by agent ID */
     agentContexts = new Map();
     /** Default agent ID to use */
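The capacity claim in that comment is straightforward arithmetic; restated with the figures the comment cites (illustrative numbers from the comment, not measurements):

```ts
// Numbers taken from the doc comment above; purely illustrative.
const activeConversations = 2000; // one StandardGraph instance per conversation
const toolCallsPerTurn = 3;       // iterations that can each trigger pruning + summarization

const withoutGuard = activeConversations * toolCallsPerTurn; // up to 6000 summary LLM calls per turn
const withGuard = activeConversations;                       // at most one in-flight call per Graph → ≤ 2000

console.log({ withoutGuard, withGuard }); // { withoutGuard: 6000, withGuard: 2000 }
```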
@@ -157,9 +166,11 @@ class StandardGraph extends Graph {
         this.messageStepHasToolCalls = graph.resetIfNotEmpty(this.messageStepHasToolCalls, new Map());
         this.prelimMessageIdsByStepKey = graph.resetIfNotEmpty(this.prelimMessageIdsByStepKey, new Map());
         this.invokedToolIds = graph.resetIfNotEmpty(this.invokedToolIds, undefined);
-        // Reset EMA calibration
+        // Reset EMA calibration, tool discovery cache, and summary debounce for fresh run
         this._pruneCalibration = pruneCalibration.createPruneCalibration();
         this._toolDiscoveryCache.reset();
+        this._summaryInFlight = false;
+        this._pendingMessagesToRefine = [];
         for (const context of this.agentContexts.values()) {
             context.reset();
         }
@@ -1071,6 +1082,47 @@ class StandardGraph extends Graph {
                 this._pruneCalibration = pruneCalibration.updatePruneCalibration(this._pruneCalibration, agentContext.currentUsage.input_tokens, estimatedTokens);
             }
         }
+        // ── Proactive summarization at context pressure ───────────────────
+        // Inspired by VS Code Copilot Chat's 3-tier strategy:
+        // 80% → fire proactive background summary (BEFORE pruning needed)
+        // 90% → pruning kicks in (summary already cached from 80% trigger)
+        // 100% → graceful: use existing summary + recent messages, NEVER block
+        //
+        // This ensures the summary is READY by the time pruning actually occurs,
+        // so the user never waits and never sees a context cliff.
+        if (agentContext.maxContextTokens != null &&
+            agentContext.maxContextTokens > 0 &&
+            agentContext.summarizeCallback &&
+            !this._summaryInFlight &&
+            !this._cachedRunSummary) {
+            const utilization = prune.getContextUtilization(agentContext.indexTokenCountMap, agentContext.instructionTokens, agentContext.maxContextTokens);
+            const threshold = (agentContext.summarizationConfig?.triggerThreshold ?? constants.PROACTIVE_SUMMARY_THRESHOLD * 100);
+            if (utilization >= threshold) {
+                // Identify older messages to summarize proactively.
+                // Keep the last N messages (recent turns) intact — only summarize older history.
+                // This is incremental: the callback checks for existing summary and updates it.
+                const recentTurnCount = Math.max(4, Math.floor(messages$1.length * 0.3));
+                const oldMessages = messages$1.slice(messages$1[0]?.getType() === 'system' ? 1 : 0, Math.max(1, messages$1.length - recentTurnCount));
+                if (oldMessages.length > 0) {
+                    this._summaryInFlight = true;
+                    console.debug(`[Graph:ProactiveSummary] Context at ${utilization.toFixed(1)}% (threshold ${threshold}%) — summarizing ${oldMessages.length} older msgs in background`);
+                    agentContext
+                        .summarizeCallback(oldMessages)
+                        .then((updated) => {
+                        if (updated != null && updated !== '') {
+                            this._cachedRunSummary = updated;
+                            console.debug(`[Graph:ProactiveSummary] Background summary ready (len=${updated.length})`);
+                        }
+                    })
+                        .catch((err) => {
+                        console.error('[Graph:ProactiveSummary] Background summary failed (non-fatal):', err);
+                    })
+                        .finally(() => {
+                        this._summaryInFlight = false;
+                    });
+                }
+            }
+        }
         if (agentContext.pruneMessages) {
             const { context, indexTokenCountMap, messagesToRefine } = agentContext.pruneMessages({
                 messages: messages$1,
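The old/recent split in that block boils down to one calculation: keep at least the last 4 messages, or the last 30% of the conversation (whichever is larger), and never summarize a leading system message. A standalone sketch of that split, using plain string roles in place of the package's message objects; `splitForProactiveSummary` is a hypothetical helper, not part of the package:

```ts
// Standalone sketch of the old/recent split performed by the proactive-summary block.
// The `role === 'system'` check stands in for messages$1[0]?.getType() === 'system'.
interface SimpleMessage {
  role: 'system' | 'user' | 'assistant' | 'tool';
  content: string;
}

function splitForProactiveSummary(messages: SimpleMessage[]): {
  oldMessages: SimpleMessage[];
  recentMessages: SimpleMessage[];
} {
  // Keep at least the last 4 messages, or the last 30% of the conversation, whichever is larger.
  const recentTurnCount = Math.max(4, Math.floor(messages.length * 0.3));
  const start = messages[0]?.role === 'system' ? 1 : 0;       // never summarize the system prompt
  const end = Math.max(1, messages.length - recentTurnCount);  // only older history is summarized
  return {
    oldMessages: messages.slice(start, end),
    recentMessages: messages.slice(end),
  };
}

// With only 5 messages, recentTurnCount = 4 and (for a leading system message) oldMessages
// comes back empty, matching the `oldMessages.length > 0` guard above: no summary fires.
```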
@@ -1121,17 +1173,34 @@ class StandardGraph extends Graph {
             }
             // Single consolidated log for the entire prune+summarize decision
             console.debug(`[Graph:ContextMgmt] Pruned ${messages$1.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`);
-            //
-
-
-
-
-            }
-            }
-
-
-
+            // SCALE: Debounce background summarization — if a summary call is already
+            // in-flight (from a prior tool iteration), accumulate messages instead of
+            // firing another concurrent LLM call. At 2000 users with 3+ tool calls
+            // per turn, this prevents 3x summary call volume.
+            if (this._summaryInFlight) {
+                this._pendingMessagesToRefine.push(...messagesToRefine);
+                console.debug(`[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`);
+            }
+            else {
+                this._summaryInFlight = true;
+                const allMessages = this._pendingMessagesToRefine.length > 0
+                    ? [...this._pendingMessagesToRefine, ...messagesToRefine]
+                    : messagesToRefine;
+                this._pendingMessagesToRefine = [];
+                agentContext
+                    .summarizeCallback(allMessages)
+                    .then((updated) => {
+                    if (updated != null && updated !== '') {
+                        this._cachedRunSummary = updated;
+                    }
+                })
+                    .catch((err) => {
+                    console.error('[Graph] Background summary failed (non-fatal):', err);
+                })
+                    .finally(() => {
+                    this._summaryInFlight = false;
+                });
+            }
             if (summary != null && summary !== '') {
                 hasSummary = true;
                 const summaryMsg = new messages.SystemMessage(`[Conversation Summary]\n${summary}`);
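Together with the `_summaryInFlight` / `_pendingMessagesToRefine` fields added at the top of the class, this hunk implements a single-flight pattern with accumulation: at most one summary call runs at a time, and anything pruned while it runs is queued for the next call. A self-contained sketch of the same pattern (hypothetical `SingleFlightSummarizer`, generic over the message type; the real logic is inlined in `StandardGraph` and writes to `_cachedRunSummary`):

```ts
// Hypothetical, self-contained restatement of the single-flight + accumulate pattern above.
class SingleFlightSummarizer<M> {
  private inFlight = false;
  private pending: M[] = [];
  private cachedSummary: string | undefined;

  constructor(private summarize: (messages: M[]) => Promise<string | undefined>) {}

  /** Fire-and-forget: queue messages if a call is already running, otherwise start one. */
  request(messages: M[]): void {
    if (this.inFlight) {
      this.pending.push(...messages); // accumulate for the next call
      return;
    }
    this.inFlight = true;
    const batch = this.pending.length > 0 ? [...this.pending, ...messages] : messages;
    this.pending = [];
    this.summarize(batch)
      .then((updated) => {
        if (updated != null && updated !== '') {
          this.cachedSummary = updated; // ready for the next pruning pass
        }
      })
      .catch((err) => {
        console.error('Background summary failed (non-fatal):', err);
      })
      .finally(() => {
        this.inFlight = false;
      });
  }

  /** Latest completed summary, if any. */
  get summary(): string | undefined {
    return this.cachedSummary;
  }
}
```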