@illuma-ai/agents 1.1.2 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/common/constants.cjs +10 -0
- package/dist/cjs/common/constants.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +163 -79
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/main.cjs +1 -0
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/esm/common/constants.mjs +10 -1
- package/dist/esm/common/constants.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +164 -80
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/main.mjs +1 -1
- package/dist/types/common/constants.d.ts +9 -0
- package/package.json +1 -1
- package/src/common/constants.ts +10 -0
- package/src/graphs/Graph.ts +194 -102
- package/src/graphs/gapFeatures.test.ts +321 -2

package/dist/cjs/common/constants.cjs
@@ -64,6 +64,15 @@ const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
  *   100% → graceful: use existing summary + recent messages, never block
  */
 const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
+/**
+ * Number of recent conversation rounds (human+AI pairs) to keep in the
+ * windowed view when a summary is available. Everything older is covered
+ * by the summary. 2 rounds = last 2 user questions + 2 AI responses.
+ *
+ * This prevents wasting tokens on raw messages the summary already covers
+ * and keeps context tight for the LLM.
+ */
+const COMPACTION_RECENT_ROUNDS = 2;
 /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
@@ -101,6 +110,7 @@ const TOOL_DISCOVERY_CACHE_MAX_SIZE = 200;
  */
 const DEDUP_MAX_CONTENT_LENGTH = 10000;
 
+exports.COMPACTION_RECENT_ROUNDS = COMPACTION_RECENT_ROUNDS;
 exports.CONTEXT_SAFETY_BUFFER = CONTEXT_SAFETY_BUFFER;
 exports.DEDUP_MAX_CONTENT_LENGTH = DEDUP_MAX_CONTENT_LENGTH;
 exports.MIN_THINKING_BUDGET = MIN_THINKING_BUDGET;
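
The new `COMPACTION_RECENT_ROUNDS` constant drives the round-based window that the `Graph.cjs` changes below build when a summary is available. A minimal sketch of the round-counting idea follows; the `Msg` type and `keepRecentRounds` name are illustrative, not part of the package's API:

```ts
// Illustrative sketch of a "keep the last N rounds" window, where one round is
// a human message plus its AI reply. Not the package's actual implementation.
type Msg = { role: 'system' | 'human' | 'ai' | 'tool'; text: string };

function keepRecentRounds(history: Msg[], rounds: number): Msg[] {
  let start = history.length;
  let seen = 0;
  // Walk newest→oldest; each human message closes one round.
  for (let i = history.length - 1; i >= 0; i--) {
    start = i;
    if (history[i].role === 'human' && ++seen >= rounds) break;
  }
  return history.slice(start);
}

// With rounds = 2 the window is the last 2 questions plus their answers:
const history: Msg[] = [
  { role: 'human', text: 'Q1' }, { role: 'ai', text: 'A1' },
  { role: 'human', text: 'Q2' }, { role: 'ai', text: 'A2' },
  { role: 'human', text: 'Q3' }, { role: 'ai', text: 'A3' },
];
console.log(keepRecentRounds(history, 2).map((m) => m.text)); // [ 'Q2', 'A2', 'Q3', 'A3' ]
```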

package/dist/cjs/common/constants.cjs.map
@@ -1 +1 @@
-{"version":3,"file":"constants.cjs","sources":["../../../src/common/constants.ts"],"sourcesContent":["// src/common/constants.ts\n\n/**\n * Minimum thinking budget allowed by the Anthropic API.\n * Extended thinking requires at least 1024 budget_tokens.\n * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking\n */\nexport const MIN_THINKING_BUDGET = 1024;\n\n/**\n * Reduced thinking budget for subsequent ReAct iterations (tool-result turns).\n *\n * In a ReAct agent loop, the first LLM call processes the user's query and\n * may need deep reasoning. Subsequent iterations (after tool results return)\n * typically only need to decide \"call next tool\" or \"generate final response\"\n * — 1024 tokens is sufficient for this routing logic.\n *\n * This reduces wall-clock time per iteration from ~20-30s to ~5-10s,\n * compounding across multi-tool conversations (e.g., 10 tool calls).\n */\nexport const TOOL_TURN_THINKING_BUDGET = 1024;\n\n// ============================================================================\n// CONTEXT OVERFLOW MANAGEMENT\n//\n// Context overflow is handled mechanically — no token budget numbers are\n// exposed to the LLM. The system uses: pruning (Graph), summarization\n// (summarizeCallback), and auto-continuation (client.js max_tokens detection).\n//\n// See: docs/context-overflow-architecture.md\n// ============================================================================\n\n/**\n * Minimum number of attached documents before the multi-document delegation\n * hint is injected. Below this threshold, the agent processes documents\n * directly within its own context.\n */\nexport const MULTI_DOCUMENT_THRESHOLD = 3;\n\n/**\n * Context utilization safety buffer multiplier (0-1).\n * Applied as: effectiveMax = (maxContextTokens - maxOutputTokens) * CONTEXT_SAFETY_BUFFER\n *\n * Reserves headroom so the LLM doesn't hit hard token limits mid-generation.\n * 0.9 = 10% reserved for safety.\n */\nexport const CONTEXT_SAFETY_BUFFER = 0.9;\n\n// ============================================================================\n// SUMMARIZATION CONFIGURATION DEFAULTS\n//\n// These constants provide sensible defaults for the SummarizationConfig.\n// They can be overridden per-agent via AgentInputs.summarizationConfig.\n// ============================================================================\n\n/**\n * Default context utilization percentage (0-100) at which summarization triggers.\n * When the context window is ≥80% full, pruning + summarization activates.\n */\nexport const SUMMARIZATION_CONTEXT_THRESHOLD = 80;\n\n/**\n * Proactive summarization threshold (0-1 fraction of context window).\n * At this utilization level, background summarization fires BEFORE pruning is needed.\n * This gives the summary time to complete so it's ready when context actually fills up.\n *\n * Inspired by VS Code Copilot Chat's 3-tier strategy:\n * 80% → proactive background summary\n * 90% → pruning kicks in (with summary already cached)\n * 100% → graceful: use existing summary + recent messages, never block\n */\nexport const PROACTIVE_SUMMARY_THRESHOLD = 0.8;\n\n/**\n * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.\n * 0.3 means 30% of the context budget is reserved for the most recent messages,\n * ensuring the model always has immediate conversation history even after aggressive pruning.\n */\nexport const SUMMARIZATION_RESERVE_RATIO = 0.3;\n\n/**\n * Default EMA (Exponential Moving Average) alpha for pruning calibration.\n * Controls how quickly the calibration adapts to new token counts.\n * Higher α = faster adaptation (more responsive to recent changes).\n * Lower α = smoother adaptation (more stable across iterations).\n * 0.3 provides a balance between responsiveness and stability.\n */\nexport const PRUNING_EMA_ALPHA = 0.3;\n\n/**\n * Default initial calibration ratio for EMA pruning.\n * 1.0 means no adjustment on the first iteration (trust the raw token counts).\n * Subsequent iterations will adjust based on actual vs. estimated token usage.\n */\nexport const PRUNING_INITIAL_CALIBRATION = 1.0;\n\n// ============================================================================\n// TOOL DISCOVERY CACHING\n// ============================================================================\n\n/**\n * Maximum number of tool discovery entries to cache per conversation.\n * Prevents unbounded memory growth in very long conversations.\n */\nexport const TOOL_DISCOVERY_CACHE_MAX_SIZE = 200;\n\n// ============================================================================\n// MESSAGE DEDUPLICATION\n// ============================================================================\n\n/**\n * Maximum length of system message content to hash for deduplication.\n * Messages longer than this are always considered unique (hashing would be expensive).\n */\nexport const DEDUP_MAX_CONTENT_LENGTH = 10000;\n"],"names":[],"mappings":";;AAAA;AAEA;;;;AAIG;AACI,MAAM,mBAAmB,GAAG;AAEnC;;;;;;;;;;AAUG;AACI,MAAM,yBAAyB,GAAG;AAEzC;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AAEA;;;;AAIG;AACI,MAAM,wBAAwB,GAAG;AAExC;;;;;;AAMG;AACI,MAAM,qBAAqB,GAAG;AAErC;AACA;AACA;AACA;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,+BAA+B,GAAG;AAE/C;;;;;;;;;AASG;AACI,MAAM,2BAA2B,GAAG;AAE3C;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;;;;;;AAMG;AACI,MAAM,iBAAiB,GAAG;AAEjC;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,6BAA6B,GAAG;AAE7C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,wBAAwB,GAAG
+{"version":3,"file":"constants.cjs","sources":["../../../src/common/constants.ts"],"sourcesContent":["// src/common/constants.ts\n\n/**\n * Minimum thinking budget allowed by the Anthropic API.\n * Extended thinking requires at least 1024 budget_tokens.\n * @see https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking\n */\nexport const MIN_THINKING_BUDGET = 1024;\n\n/**\n * Reduced thinking budget for subsequent ReAct iterations (tool-result turns).\n *\n * In a ReAct agent loop, the first LLM call processes the user's query and\n * may need deep reasoning. Subsequent iterations (after tool results return)\n * typically only need to decide \"call next tool\" or \"generate final response\"\n * — 1024 tokens is sufficient for this routing logic.\n *\n * This reduces wall-clock time per iteration from ~20-30s to ~5-10s,\n * compounding across multi-tool conversations (e.g., 10 tool calls).\n */\nexport const TOOL_TURN_THINKING_BUDGET = 1024;\n\n// ============================================================================\n// CONTEXT OVERFLOW MANAGEMENT\n//\n// Context overflow is handled mechanically — no token budget numbers are\n// exposed to the LLM. The system uses: pruning (Graph), summarization\n// (summarizeCallback), and auto-continuation (client.js max_tokens detection).\n//\n// See: docs/context-overflow-architecture.md\n// ============================================================================\n\n/**\n * Minimum number of attached documents before the multi-document delegation\n * hint is injected. Below this threshold, the agent processes documents\n * directly within its own context.\n */\nexport const MULTI_DOCUMENT_THRESHOLD = 3;\n\n/**\n * Context utilization safety buffer multiplier (0-1).\n * Applied as: effectiveMax = (maxContextTokens - maxOutputTokens) * CONTEXT_SAFETY_BUFFER\n *\n * Reserves headroom so the LLM doesn't hit hard token limits mid-generation.\n * 0.9 = 10% reserved for safety.\n */\nexport const CONTEXT_SAFETY_BUFFER = 0.9;\n\n// ============================================================================\n// SUMMARIZATION CONFIGURATION DEFAULTS\n//\n// These constants provide sensible defaults for the SummarizationConfig.\n// They can be overridden per-agent via AgentInputs.summarizationConfig.\n// ============================================================================\n\n/**\n * Default context utilization percentage (0-100) at which summarization triggers.\n * When the context window is ≥80% full, pruning + summarization activates.\n */\nexport const SUMMARIZATION_CONTEXT_THRESHOLD = 80;\n\n/**\n * Proactive summarization threshold (0-1 fraction of context window).\n * At this utilization level, background summarization fires BEFORE pruning is needed.\n * This gives the summary time to complete so it's ready when context actually fills up.\n *\n * Inspired by VS Code Copilot Chat's 3-tier strategy:\n * 80% → proactive background summary\n * 90% → pruning kicks in (with summary already cached)\n * 100% → graceful: use existing summary + recent messages, never block\n */\nexport const PROACTIVE_SUMMARY_THRESHOLD = 0.8;\n\n/**\n * Number of recent conversation rounds (human+AI pairs) to keep in the\n * windowed view when a summary is available. Everything older is covered\n * by the summary. 2 rounds = last 2 user questions + 2 AI responses.\n *\n * This prevents wasting tokens on raw messages the summary already covers\n * and keeps context tight for the LLM.\n */\nexport const COMPACTION_RECENT_ROUNDS = 2;\n\n/**\n * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.\n * 0.3 means 30% of the context budget is reserved for the most recent messages,\n * ensuring the model always has immediate conversation history even after aggressive pruning.\n */\nexport const SUMMARIZATION_RESERVE_RATIO = 0.3;\n\n/**\n * Default EMA (Exponential Moving Average) alpha for pruning calibration.\n * Controls how quickly the calibration adapts to new token counts.\n * Higher α = faster adaptation (more responsive to recent changes).\n * Lower α = smoother adaptation (more stable across iterations).\n * 0.3 provides a balance between responsiveness and stability.\n */\nexport const PRUNING_EMA_ALPHA = 0.3;\n\n/**\n * Default initial calibration ratio for EMA pruning.\n * 1.0 means no adjustment on the first iteration (trust the raw token counts).\n * Subsequent iterations will adjust based on actual vs. estimated token usage.\n */\nexport const PRUNING_INITIAL_CALIBRATION = 1.0;\n\n// ============================================================================\n// TOOL DISCOVERY CACHING\n// ============================================================================\n\n/**\n * Maximum number of tool discovery entries to cache per conversation.\n * Prevents unbounded memory growth in very long conversations.\n */\nexport const TOOL_DISCOVERY_CACHE_MAX_SIZE = 200;\n\n// ============================================================================\n// MESSAGE DEDUPLICATION\n// ============================================================================\n\n/**\n * Maximum length of system message content to hash for deduplication.\n * Messages longer than this are always considered unique (hashing would be expensive).\n */\nexport const DEDUP_MAX_CONTENT_LENGTH = 10000;\n"],"names":[],"mappings":";;AAAA;AAEA;;;;AAIG;AACI,MAAM,mBAAmB,GAAG;AAEnC;;;;;;;;;;AAUG;AACI,MAAM,yBAAyB,GAAG;AAEzC;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AACA;AAEA;;;;AAIG;AACI,MAAM,wBAAwB,GAAG;AAExC;;;;;;AAMG;AACI,MAAM,qBAAqB,GAAG;AAErC;AACA;AACA;AACA;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,+BAA+B,GAAG;AAE/C;;;;;;;;;AASG;AACI,MAAM,2BAA2B,GAAG;AAE3C;;;;;;;AAOG;AACI,MAAM,wBAAwB,GAAG;AAExC;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;;;;;;AAMG;AACI,MAAM,iBAAiB,GAAG;AAEjC;;;;AAIG;AACI,MAAM,2BAA2B,GAAG;AAE3C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,6BAA6B,GAAG;AAE7C;AACA;AACA;AAEA;;;AAGG;AACI,MAAM,wBAAwB,GAAG;;;;;;;;;;;;;;;"}
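
The source embedded above also documents `PRUNING_EMA_ALPHA` (0.3) and `PRUNING_INITIAL_CALIBRATION` (1.0), which the `Graph.cjs` diff below consumes via `pruneCalibration.applyCalibration(maxTokens, this._pruneCalibration)`. The actual `pruneCalibration` module is not part of this diff, so the following is a hedged sketch of what an EMA calibration of actual vs. estimated token usage could look like; `Calibration`, `updateCalibration`, and `applyBudget` are assumed names:

```ts
// Assumed shape: a running EMA ratio of actual/estimated token counts.
interface Calibration { ratio: number; alpha: number }

// Fold the observed ratio into the EMA after each iteration:
// higher alpha reacts faster, lower alpha is smoother (per the docs above).
function updateCalibration(c: Calibration, actualTokens: number, estimatedTokens: number): Calibration {
  const observed = estimatedTokens > 0 ? actualTokens / estimatedTokens : 1.0;
  return { ...c, ratio: c.alpha * observed + (1 - c.alpha) * c.ratio };
}

// One plausible use: tighten the nominal budget when estimates run low.
// (Whether the real module divides or multiplies is an assumption.)
function applyBudget(maxTokens: number, c: Calibration): number {
  return Math.floor(maxTokens / c.ratio);
}

let cal: Calibration = { ratio: 1.0, alpha: 0.3 }; // PRUNING_INITIAL_CALIBRATION, PRUNING_EMA_ALPHA
cal = updateCalibration(cal, 11000, 10000);        // estimates ran ~10% low
console.log(applyBudget(100000, cal));             // 97087, budget tightened accordingly
```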

package/dist/cjs/graphs/Graph.cjs
@@ -1124,68 +1124,167 @@ class StandardGraph extends Graph {
             }
         }
         if (agentContext.pruneMessages) {
-
-
-
-
-
-            messagesToUse = context;
-            // ── Non-blocking summarization ──────────────────────────────────
-            // NEVER block the LLM call waiting for summarization. Instead:
-            // 1. If _cachedRunSummary exists → use it, fire async update
-            // 2. If persistedSummary exists → use it as fallback, fire async update
-            // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
-            // The summary catches up asynchronously and is available for subsequent
-            // iterations (tool calls) and the next conversation turn.
+            // ── Context Compaction (Copilot-style: never delete messages) ─────
+            //
+            // DESIGN: Original messages are NEVER removed from the array.
+            // Instead, we build a "windowed view" for the LLM:
+            //   [system prompt] + [summary of older turns] + [recent turns that fit]
             //
-            //
-            // -
-            // -
-            // -
-
+            // This ensures:
+            // - No context is ever lost (summary covers older turns)
+            // - We can always re-summarize from originals if summary is stale
+            // - Conversation chaining works naturally across turns
+            //
+            // Flow:
+            // 1. Resolve best available summary (cached > persisted > seed)
+            // 2. Calculate token budget available for recent messages
+            // 3. Walk newest→oldest, build view of messages that fit
+            // 4. Assemble: [system] + [summary] + [recent window]
+            // 5. Fire background summary update for messages outside the window
             const sumConfig = agentContext.summarizationConfig;
-            const
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            const tokenCounter = agentContext.tokenCounter;
+            const maxTokens = agentContext.maxContextTokens ?? 0;
+            // Step 1: Resolve best available summary
+            let summary;
+            let summarySource;
+            if (this._cachedRunSummary != null) {
+                summary = this._cachedRunSummary;
+                summarySource = 'cached';
+            }
+            else if (agentContext.persistedSummary != null &&
+                agentContext.persistedSummary !== '') {
+                summary = agentContext.persistedSummary;
+                this._cachedRunSummary = summary;
+                summarySource = 'persisted';
+            }
+            else if (sumConfig?.initialSummary != null &&
+                sumConfig.initialSummary !== '') {
+                summary = sumConfig.initialSummary;
+                this._cachedRunSummary = summary;
+                summarySource = 'initial-seed';
+            }
+            else {
+                summarySource = 'none';
+            }
+            // Step 2: Calculate token budget
+            // Apply EMA calibration for accuracy across iterations
+            const calibratedMax = pruneCalibration.applyCalibration(maxTokens, this._pruneCalibration);
+            const systemMsg = messages$1[0]?.getType() === 'system' ? messages$1[0] : null;
+            const systemTokens = systemMsg != null
+                ? (agentContext.indexTokenCountMap[0] ?? 0)
+                : 0;
+            const summaryMsg = summary != null && summary !== ''
+                ? new messages.SystemMessage(`[Conversation Summary]\n${summary}`)
+                : null;
+            const summaryTokens = summaryMsg != null && tokenCounter != null
+                ? tokenCounter(summaryMsg)
+                : 0;
+            // Budget for recent messages = total - system - summary - 3 (assistant priming)
+            const recentBudget = calibratedMax - systemTokens - summaryTokens - 3;
+            // Step 3: Determine window of recent messages to include.
+            //
+            // Two modes:
+            //   A) No summary available → fill the budget (all messages that fit)
+            //   B) Summary available → keep last 2 conversation rounds (H+A pairs)
+            //      + any trailing tool messages. The summary covers everything else.
+            //      This avoids wasting tokens on raw messages the summary already covers.
+            //
+            // A "round" = one human message + one AI response (+ any tool messages between).
+            const contentStart = systemMsg != null ? 1 : 0;
+            let usedTokens = 0;
+            let windowStart = messages$1.length; // index where the recent window begins
+            if (summary == null || summary === '') {
+                // Mode A: No summary — include as many recent messages as fit in budget
+                for (let i = messages$1.length - 1; i >= contentStart; i--) {
+                    const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+                    if (usedTokens + msgTokens > recentBudget) {
+                        break;
                     }
-
-
-
-
-
-
+                    usedTokens += msgTokens;
+                    windowStart = i;
+                }
+            }
+            else {
+                // Mode B: Summary exists — keep last 2 rounds (4 core messages: H+A+H+A)
+                // Walk backward counting human messages as round boundaries.
+                const MAX_RECENT_ROUNDS = constants.COMPACTION_RECENT_ROUNDS;
+                let roundsSeen = 0;
+                for (let i = messages$1.length - 1; i >= contentStart; i--) {
+                    const msgType = messages$1[i]?.getType();
+                    const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+                    // Budget guard — even in round-limited mode, don't exceed budget
+                    if (usedTokens + msgTokens > recentBudget) {
+                        break;
                     }
-
-
+                    usedTokens += msgTokens;
+                    windowStart = i;
+                    // Count a human message as a round boundary
+                    if (msgType === 'human') {
+                        roundsSeen++;
+                        if (roundsSeen >= MAX_RECENT_ROUNDS) {
+                            break;
+                        }
                     }
-
-
-
-
-
-
+                }
+            }
+            // Ensure we don't split tool-call / tool-result pairs.
+            // If windowStart lands on a ToolMessage, walk back to include its AI message.
+            while (windowStart > contentStart &&
+                messages$1[windowStart]?.getType() === 'tool') {
+                windowStart--;
+                usedTokens += agentContext.indexTokenCountMap[windowStart] ?? 0;
+            }
+            const recentMessages = messages$1.slice(windowStart);
+            const compactedMessages = messages$1.slice(contentStart, windowStart);
+            const hasSummary = summaryMsg != null;
+            // Step 4: Assemble the windowed view
+            //   [system] + [summary (covers compacted messages)] + [recent window]
+            const viewParts = [];
+            if (systemMsg != null) {
+                viewParts.push(systemMsg);
+            }
+            if (summaryMsg != null) {
+                viewParts.push(summaryMsg);
+            }
+            viewParts.push(...recentMessages);
+            messagesToUse = viewParts;
+            // Rebuild indexTokenCountMap for the windowed view so downstream
+            // analytics and summarization triggers see accurate token counts.
+            const viewTokenMap = {};
+            let viewIdx = 0;
+            if (systemMsg != null) {
+                viewTokenMap[viewIdx] = systemTokens;
+                viewIdx++;
+            }
+            if (summaryMsg != null) {
+                viewTokenMap[viewIdx] = summaryTokens;
+                viewIdx++;
+            }
+            for (let i = windowStart; i < messages$1.length; i++) {
+                viewTokenMap[viewIdx] = agentContext.indexTokenCountMap[i];
+                viewIdx++;
+            }
+            agentContext.indexTokenCountMap = viewTokenMap;
+            console.debug(`[Graph:Compaction] View: ${messages$1.length}→${viewParts.length} msgs ` +
+                `(${compactedMessages.length} behind summary, ${recentMessages.length} in window) | ` +
+                `summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | ` +
+                `budget=${recentBudget}/${calibratedMax} used=${usedTokens}`);
+            // Step 5: Fire background summary update (non-blocking)
+            // Summarize messages outside the window so next iteration has a fresh summary.
+            // Only trigger if there are compacted messages worth summarizing.
+            if (compactedMessages.length > 0 &&
+                agentContext.summarizeCallback) {
+                const shouldSummarize = this.shouldTriggerSummarization(compactedMessages.length, maxTokens, agentContext.indexTokenCountMap, agentContext.instructionTokens, sumConfig);
+                if (shouldSummarize) {
                     if (this._summaryInFlight) {
-                        this._pendingMessagesToRefine.push(...
-                        console.debug(`[Graph:
+                        this._pendingMessagesToRefine.push(...compactedMessages);
+                        console.debug(`[Graph:Compaction] Summary in-flight, queued ${compactedMessages.length} msgs (pending=${this._pendingMessagesToRefine.length})`);
                     }
                     else {
                         this._summaryInFlight = true;
                        const allMessages = this._pendingMessagesToRefine.length > 0
-                            ? [...this._pendingMessagesToRefine, ...
-                            :
+                            ? [...this._pendingMessagesToRefine, ...compactedMessages]
+                            : compactedMessages;
                         this._pendingMessagesToRefine = [];
                         agentContext
                             .summarizeCallback(allMessages)
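
One subtle step above is the tool-pair guard: the window must not open on a tool result whose originating AI tool-call fell outside it. A self-contained sketch of that walk-back, using an illustrative `Role` type rather than the package's message classes:

```ts
// Sketch of the tool-pair guard from Step 3 above. The real code also adds
// the pulled-in messages' token counts back into the window's usage.
type Role = 'system' | 'human' | 'ai' | 'tool';

function alignToToolPair(roles: Role[], windowStart: number, contentStart: number): number {
  // Walk back while the window would open on a tool message, pulling the
  // preceding AI tool-call (and any sibling tool results) into the window.
  while (windowStart > contentStart && roles[windowStart] === 'tool') {
    windowStart--;
  }
  return windowStart;
}

const roles: Role[] = ['system', 'human', 'ai', 'tool', 'tool', 'ai'];
// A naive budget cut at index 3 would orphan two tool results...
console.log(alignToToolPair(roles, 3, 1)); // 2: window now starts at the AI tool-call
```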
@@ -1195,40 +1294,17 @@ class StandardGraph extends Graph {
                             }
                         })
                             .catch((err) => {
-                            console.error('[Graph] Background summary failed (non-fatal):', err);
+                            console.error('[Graph:Compaction] Background summary update failed (non-fatal):', err);
                         })
                             .finally(() => {
                             this._summaryInFlight = false;
                         });
                     }
-                    if (summary != null && summary !== '') {
-                        hasSummary = true;
-                        const summaryMsg = new messages.SystemMessage(`[Conversation Summary]\n${summary}`);
-                        const systemIdx = messagesToUse[0]?.getType() === 'system' ? 1 : 0;
-                        messagesToUse = [
-                            ...messagesToUse.slice(0, systemIdx),
-                            summaryMsg,
-                            ...messagesToUse.slice(systemIdx),
-                        ];
-                    }
                 }
-                catch (err) {
-                    console.error('[Graph] Summarization failed:', err);
-                }
-            }
-            else if (messagesToRefine.length > 0) {
-                // Log pruning even when no summarize callback (discard mode)
-                console.debug(`[Graph:ContextMgmt] Pruned ${messages$1.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`);
             }
-            //
-
-
-                messagesToUse = dedupedMessages;
-                console.debug(`[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`);
-            }
-            // Post-prune context note for task-tool-enabled agents
-            if (messagesToRefine.length > 0 && contextPressure.hasTaskTool(agentContext.tools)) {
-                const postPruneNote = contextPressure.buildPostPruneNote(messagesToRefine.length, hasSummary);
+            // Post-compaction context note for task-tool-enabled agents
+            if (compactedMessages.length > 0 && contextPressure.hasTaskTool(agentContext.tools)) {
+                const postPruneNote = contextPressure.buildPostPruneNote(compactedMessages.length, hasSummary);
                 if (postPruneNote) {
                     messagesToUse = [
                         ...messagesToUse,
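
The in-flight flag and pending queue above form a coalescing pattern: the hot path never awaits the summarizer, and messages compacted while a summary is still running are folded into the next batch. A standalone sketch of the same pattern; the `BackgroundSummarizer` name is assumed, not the package's API:

```ts
// Sketch of the non-blocking summary update: enqueue() returns immediately,
// errors are logged as non-fatal, and overlapping batches are coalesced.
class BackgroundSummarizer<M> {
  private inFlight = false;
  private pending: M[] = [];

  constructor(private summarize: (msgs: M[]) => Promise<void>) {}

  enqueue(msgs: M[]): void {
    if (this.inFlight) {
      // A summary is already running: queue these for the next pass.
      this.pending.push(...msgs);
      return;
    }
    this.inFlight = true;
    const batch = this.pending.length > 0 ? [...this.pending, ...msgs] : msgs;
    this.pending = [];
    this.summarize(batch)
      .catch((err) => console.error('background summary failed (non-fatal):', err))
      .finally(() => { this.inFlight = false; });
  }
}

// Usage: fire-and-forget from the hot path.
const bg = new BackgroundSummarizer<string>(async (batch) => { /* call the summarizer LLM */ });
bg.enqueue(['old turn 1', 'old turn 2']); // returns immediately
```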
@@ -1237,6 +1313,14 @@ class StandardGraph extends Graph {
                 }
             }
         }
+        // Deduplicate system messages — ALWAYS runs, not just during compaction.
+        // Duplicate system messages accumulate from repeated tool iterations,
+        // summary injections, and context notes across turns.
+        const { messages: dedupedMessages, removedCount } = dedup.deduplicateSystemMessages(messagesToUse);
+        if (removedCount > 0) {
+            messagesToUse = dedupedMessages;
+            console.debug(`[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`);
+        }
         let finalMessages = messagesToUse;
         if (agentContext.useLegacyContent) {
             finalMessages = content.formatContentStrings(finalMessages);
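
The dedup step relies on the package's `dedup.deduplicateSystemMessages` helper, whose implementation is not part of this diff. A plausible sketch consistent with the `DEDUP_MAX_CONTENT_LENGTH` contract documented in `constants.ts` (messages longer than the cap are always treated as unique); a `Set` of raw strings stands in for content hashing here:

```ts
// Hypothetical sketch, not the package's actual dedup module.
type SysMsg = { role: string; content: string };
const DEDUP_MAX_CONTENT_LENGTH = 10000;

function deduplicateSystemMessages(msgs: SysMsg[]): { messages: SysMsg[]; removedCount: number } {
  const seen = new Set<string>();
  const kept: SysMsg[] = [];
  for (const m of msgs) {
    // Only system messages participate; oversized content is always kept,
    // since hashing very long strings would be expensive.
    if (m.role === 'system' && m.content.length <= DEDUP_MAX_CONTENT_LENGTH) {
      if (seen.has(m.content)) continue; // drop exact duplicate
      seen.add(m.content);
    }
    kept.push(m);
  }
  return { messages: kept, removedCount: msgs.length - kept.length };
}
```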