@illuma-ai/agents 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -53,6 +53,17 @@ const CONTEXT_SAFETY_BUFFER = 0.9;
  * When the context window is ≥80% full, pruning + summarization activates.
  */
  const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
+ /**
+ * Proactive summarization threshold (0-1 fraction of context window).
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
+ * This gives the summary time to complete so it's ready when context actually fills up.
+ *
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
+ * 80% → proactive background summary
+ * 90% → pruning kicks in (with summary already cached)
+ * 100% → graceful: use existing summary + recent messages, never block
+ */
+ const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
  /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
@@ -94,6 +105,7 @@ exports.CONTEXT_SAFETY_BUFFER = CONTEXT_SAFETY_BUFFER;
  exports.DEDUP_MAX_CONTENT_LENGTH = DEDUP_MAX_CONTENT_LENGTH;
  exports.MIN_THINKING_BUDGET = MIN_THINKING_BUDGET;
  exports.MULTI_DOCUMENT_THRESHOLD = MULTI_DOCUMENT_THRESHOLD;
+ exports.PROACTIVE_SUMMARY_THRESHOLD = PROACTIVE_SUMMARY_THRESHOLD;
  exports.PRUNING_EMA_ALPHA = PRUNING_EMA_ALPHA;
  exports.PRUNING_INITIAL_CALIBRATION = PRUNING_INITIAL_CALIBRATION;
  exports.SUMMARIZATION_CONTEXT_THRESHOLD = SUMMARIZATION_CONTEXT_THRESHOLD;
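Note the unit mismatch between the two thresholds: the existing SUMMARIZATION_CONTEXT_THRESHOLD is a 0-100 percentage, while the new PROACTIVE_SUMMARY_THRESHOLD is a 0-1 fraction, so the graph code scales it by 100 before comparing. A minimal sketch of the tier arithmetic; the import path, the tierFor helper, and the hard-coded 90 are illustrative assumptions (only the constants themselves are exported by this diff):

```ts
// Sketch only, assuming the constant is re-exported from the package root;
// it is exported from constants.cjs above, but the root path is an assumption.
import { PROACTIVE_SUMMARY_THRESHOLD } from '@illuma-ai/agents';

// Hypothetical helper mapping context utilization (a 0-100 percent figure,
// like the one the graph code logs) onto the three tiers.
type Tier = 'idle' | 'proactive-summary' | 'prune-with-cached-summary';

function tierFor(utilizationPct: number): Tier {
  // 90 is assumed from the doc comment; no constant is exported for tier 2.
  if (utilizationPct >= 90) return 'prune-with-cached-summary';
  // PROACTIVE_SUMMARY_THRESHOLD is a 0-1 fraction, so scale by 100 before
  // comparing, exactly as the graph code does with `* 100`.
  if (utilizationPct >= PROACTIVE_SUMMARY_THRESHOLD * 100) return 'proactive-summary';
  return 'idle';
}

// e.g. tierFor(75) === 'idle', tierFor(82) === 'proactive-summary', tierFor(93) === 'prune-with-cached-summary'
```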
@@ -1082,69 +1082,158 @@ class StandardGraph extends Graph {
  this._pruneCalibration = pruneCalibration.updatePruneCalibration(this._pruneCalibration, agentContext.currentUsage.input_tokens, estimatedTokens);
  }
  }
+ // ── Proactive summarization at context pressure ───────────────────
+ // Inspired by VS Code Copilot Chat's 3-tier strategy:
+ // 80% → fire proactive background summary (BEFORE pruning needed)
+ // 90% → pruning kicks in (summary already cached from 80% trigger)
+ // 100% → graceful: use existing summary + recent messages, NEVER block
+ //
+ // This ensures the summary is READY by the time pruning actually occurs,
+ // so the user never waits and never sees a context cliff.
+ if (agentContext.maxContextTokens != null &&
+ agentContext.maxContextTokens > 0 &&
+ agentContext.summarizeCallback &&
+ !this._summaryInFlight &&
+ !this._cachedRunSummary) {
+ const utilization = prune.getContextUtilization(agentContext.indexTokenCountMap, agentContext.instructionTokens, agentContext.maxContextTokens);
+ const threshold = (agentContext.summarizationConfig?.triggerThreshold ?? constants.PROACTIVE_SUMMARY_THRESHOLD * 100);
+ if (utilization >= threshold) {
+ // Identify older messages to summarize proactively.
+ // Keep the last N messages (recent turns) intact — only summarize older history.
+ // This is incremental: the callback checks for existing summary and updates it.
+ const recentTurnCount = Math.max(4, Math.floor(messages$1.length * 0.3));
+ const oldMessages = messages$1.slice(messages$1[0]?.getType() === 'system' ? 1 : 0, Math.max(1, messages$1.length - recentTurnCount));
+ if (oldMessages.length > 0) {
+ this._summaryInFlight = true;
+ console.debug(`[Graph:ProactiveSummary] Context at ${utilization.toFixed(1)}% (threshold ${threshold}%) — summarizing ${oldMessages.length} older msgs in background`);
+ agentContext
+ .summarizeCallback(oldMessages)
+ .then((updated) => {
+ if (updated != null && updated !== '') {
+ this._cachedRunSummary = updated;
+ console.debug(`[Graph:ProactiveSummary] Background summary ready (len=${updated.length})`);
+ }
+ })
+ .catch((err) => {
+ console.error('[Graph:ProactiveSummary] Background summary failed (non-fatal):', err);
+ })
+ .finally(() => {
+ this._summaryInFlight = false;
+ });
+ }
+ }
+ }
  if (agentContext.pruneMessages) {
- const { context, indexTokenCountMap, messagesToRefine } = agentContext.pruneMessages({
- messages: messages$1,
- usageMetadata: agentContext.currentUsage,
- });
- agentContext.indexTokenCountMap = indexTokenCountMap;
- messagesToUse = context;
- // ── Non-blocking summarization ──────────────────────────────────
- // NEVER block the LLM call waiting for summarization. Instead:
- // 1. If _cachedRunSummary exists → use it, fire async update
- // 2. If persistedSummary exists → use it as fallback, fire async update
- // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
- // The summary catches up asynchronously and is available for subsequent
- // iterations (tool calls) and the next conversation turn.
+ // ── Context Compaction (Copilot-style: never delete messages) ─────
+ //
+ // DESIGN: Original messages are NEVER removed from the array.
+ // Instead, we build a "windowed view" for the LLM:
+ // [system prompt] + [summary of older turns] + [recent turns that fit]
  //
- // SummarizationConfig integration:
- // - triggerType/triggerThreshold control WHEN summarization fires
- // - reserveRatio is enforced via calibrated maxTokens (above)
- // - initialSummary provides cross-run seeding as fallback before persistedSummary
- let hasSummary = false;
+ // This ensures:
+ // - No context is ever lost (summary covers older turns)
+ // - We can always re-summarize from originals if summary is stale
+ // - Conversation chaining works naturally across turns
+ //
+ // Flow:
+ // 1. Resolve best available summary (cached > persisted > seed)
+ // 2. Calculate token budget available for recent messages
+ // 3. Walk newest→oldest, build view of messages that fit
+ // 4. Assemble: [system] + [summary] + [recent window]
+ // 5. Fire background summary update for messages outside the window
  const sumConfig = agentContext.summarizationConfig;
- const shouldSummarize = this.shouldTriggerSummarization(messagesToRefine.length, agentContext.maxContextTokens ?? 0, agentContext.indexTokenCountMap, agentContext.instructionTokens, sumConfig);
- if (messagesToRefine.length > 0 &&
- agentContext.summarizeCallback &&
- shouldSummarize) {
- try {
- let summary;
- let summarySource;
- if (this._cachedRunSummary != null) {
- summary = this._cachedRunSummary;
- summarySource = 'cached';
- }
- else if (agentContext.persistedSummary != null &&
- agentContext.persistedSummary !== '') {
- summary = agentContext.persistedSummary;
- this._cachedRunSummary = summary;
- summarySource = 'persisted';
- }
- else if (sumConfig?.initialSummary != null &&
- sumConfig.initialSummary !== '') {
- // Cross-run seed: use initialSummary when no persisted summary exists
- summary = sumConfig.initialSummary;
- this._cachedRunSummary = summary;
- summarySource = 'initial-seed';
- }
- else {
- summarySource = 'none';
- }
- // Single consolidated log for the entire prune+summarize decision
- console.debug(`[Graph:ContextMgmt] Pruned ${messages$1.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`);
- // SCALE: Debounce background summarization — if a summary call is already
- // in-flight (from a prior tool iteration), accumulate messages instead of
- // firing another concurrent LLM call. At 2000 users with 3+ tool calls
- // per turn, this prevents 3x summary call volume.
+ const tokenCounter = agentContext.tokenCounter;
+ const maxTokens = agentContext.maxContextTokens ?? 0;
+ // Step 1: Resolve best available summary
+ let summary;
+ let summarySource;
+ if (this._cachedRunSummary != null) {
+ summary = this._cachedRunSummary;
+ summarySource = 'cached';
+ }
+ else if (agentContext.persistedSummary != null &&
+ agentContext.persistedSummary !== '') {
+ summary = agentContext.persistedSummary;
+ this._cachedRunSummary = summary;
+ summarySource = 'persisted';
+ }
+ else if (sumConfig?.initialSummary != null &&
+ sumConfig.initialSummary !== '') {
+ summary = sumConfig.initialSummary;
+ this._cachedRunSummary = summary;
+ summarySource = 'initial-seed';
+ }
+ else {
+ summarySource = 'none';
+ }
+ // Step 2: Calculate token budget
+ // Apply EMA calibration for accuracy across iterations
+ const calibratedMax = pruneCalibration.applyCalibration(maxTokens, this._pruneCalibration);
+ const systemMsg = messages$1[0]?.getType() === 'system' ? messages$1[0] : null;
+ const systemTokens = systemMsg != null
+ ? (agentContext.indexTokenCountMap[0] ?? 0)
+ : 0;
+ const summaryMsg = summary != null && summary !== ''
+ ? new messages.SystemMessage(`[Conversation Summary]\n${summary}`)
+ : null;
+ const summaryTokens = summaryMsg != null && tokenCounter != null
+ ? tokenCounter(summaryMsg)
+ : 0;
+ // Budget for recent messages = total - system - summary - 3 (assistant priming)
+ const recentBudget = calibratedMax - systemTokens - summaryTokens - 3;
+ // Step 3: Walk newest→oldest, collect messages that fit in the budget
+ const contentStart = systemMsg != null ? 1 : 0;
+ let usedTokens = 0;
+ let windowStart = messages$1.length; // index where the recent window begins
+ for (let i = messages$1.length - 1; i >= contentStart; i--) {
+ const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+ if (usedTokens + msgTokens > recentBudget) {
+ break;
+ }
+ usedTokens += msgTokens;
+ windowStart = i;
+ }
+ // Ensure we don't split tool-call / tool-result pairs.
+ // If windowStart lands on a ToolMessage, walk back to include its AI message.
+ while (windowStart > contentStart &&
+ messages$1[windowStart]?.getType() === 'tool') {
+ windowStart--;
+ usedTokens += agentContext.indexTokenCountMap[windowStart] ?? 0;
+ }
+ const recentMessages = messages$1.slice(windowStart);
+ const compactedMessages = messages$1.slice(contentStart, windowStart);
+ const hasSummary = summaryMsg != null;
+ // Step 4: Assemble the windowed view
+ // [system] + [summary (covers compacted messages)] + [recent window]
+ const viewParts = [];
+ if (systemMsg != null) {
+ viewParts.push(systemMsg);
+ }
+ if (summaryMsg != null) {
+ viewParts.push(summaryMsg);
+ }
+ viewParts.push(...recentMessages);
+ messagesToUse = viewParts;
+ console.debug(`[Graph:Compaction] View: ${messages$1.length}→${viewParts.length} msgs ` +
+ `(${compactedMessages.length} behind summary, ${recentMessages.length} in window) | ` +
+ `summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | ` +
+ `budget=${recentBudget}/${calibratedMax} used=${usedTokens}`);
+ // Step 5: Fire background summary update (non-blocking)
+ // Summarize messages outside the window so next iteration has a fresh summary.
+ // Only trigger if there are compacted messages worth summarizing.
+ if (compactedMessages.length > 0 &&
+ agentContext.summarizeCallback) {
+ const shouldSummarize = this.shouldTriggerSummarization(compactedMessages.length, maxTokens, agentContext.indexTokenCountMap, agentContext.instructionTokens, sumConfig);
+ if (shouldSummarize) {
  if (this._summaryInFlight) {
- this._pendingMessagesToRefine.push(...messagesToRefine);
- console.debug(`[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`);
+ this._pendingMessagesToRefine.push(...compactedMessages);
+ console.debug(`[Graph:Compaction] Summary in-flight, queued ${compactedMessages.length} msgs (pending=${this._pendingMessagesToRefine.length})`);
  }
  else {
  this._summaryInFlight = true;
  const allMessages = this._pendingMessagesToRefine.length > 0
- ? [...this._pendingMessagesToRefine, ...messagesToRefine]
- : messagesToRefine;
+ ? [...this._pendingMessagesToRefine, ...compactedMessages]
+ : compactedMessages;
  this._pendingMessagesToRefine = [];
  agentContext
  .summarizeCallback(allMessages)
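Steps 2 through 4 above are the heart of the change: the window grows from the newest message backwards until the calibrated budget is spent, then is nudged so a tool result never enters the view without the assistant message that requested it. A standalone restatement of that walk, using simplified stand-in types rather than the package's LangChain-style messages (tokenCounts[i] plays the role of indexTokenCountMap):

```ts
// Sketch of the newest→oldest window walk (Steps 2-4), simplified types only.
type Msg = { type: 'system' | 'human' | 'ai' | 'tool' };

function findWindowStart(msgs: Msg[], tokenCounts: number[], recentBudget: number): number {
  const contentStart = msgs[0]?.type === 'system' ? 1 : 0;
  let used = 0;
  let windowStart = msgs.length; // empty window until something fits
  for (let i = msgs.length - 1; i >= contentStart; i--) {
    const t = tokenCounts[i] ?? 0;
    if (used + t > recentBudget) break; // everything older falls behind the summary
    used += t;
    windowStart = i;
  }
  // Never open the window on a tool result: pull in preceding messages until
  // the assistant message that issued the tool call is included too.
  while (windowStart > contentStart && msgs[windowStart]?.type === 'tool') {
    windowStart--;
  }
  return windowStart;
}

// msgs.slice(windowStart) is the recent window sent verbatim;
// msgs.slice(contentStart, windowStart) is what the summary must cover.
```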
@@ -1154,40 +1243,17 @@ class StandardGraph extends Graph {
  }
  })
  .catch((err) => {
- console.error('[Graph] Background summary failed (non-fatal):', err);
+ console.error('[Graph:Compaction] Background summary update failed (non-fatal):', err);
  })
  .finally(() => {
  this._summaryInFlight = false;
  });
  }
- if (summary != null && summary !== '') {
- hasSummary = true;
- const summaryMsg = new messages.SystemMessage(`[Conversation Summary]\n${summary}`);
- const systemIdx = messagesToUse[0]?.getType() === 'system' ? 1 : 0;
- messagesToUse = [
- ...messagesToUse.slice(0, systemIdx),
- summaryMsg,
- ...messagesToUse.slice(systemIdx),
- ];
- }
- }
- catch (err) {
- console.error('[Graph] Summarization failed:', err);
  }
  }
- else if (messagesToRefine.length > 0) {
- // Log pruning even when no summarize callback (discard mode)
- console.debug(`[Graph:ContextMgmt] Pruned ${messages$1.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`);
- }
- // Deduplicate system messages that accumulate from repeated tool iterations
- const { messages: dedupedMessages, removedCount } = dedup.deduplicateSystemMessages(messagesToUse);
- if (removedCount > 0) {
- messagesToUse = dedupedMessages;
- console.debug(`[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`);
- }
- // Post-prune context note for task-tool-enabled agents
- if (messagesToRefine.length > 0 && contextPressure.hasTaskTool(agentContext.tools)) {
- const postPruneNote = contextPressure.buildPostPruneNote(messagesToRefine.length, hasSummary);
+ // Post-compaction context note for task-tool-enabled agents
+ if (compactedMessages.length > 0 && contextPressure.hasTaskTool(agentContext.tools)) {
+ const postPruneNote = contextPressure.buildPostPruneNote(compactedMessages.length, hasSummary);
  if (postPruneNote) {
  messagesToUse = [
  ...messagesToUse,
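The _summaryInFlight guard and _pendingMessagesToRefine queue that survive this hunk implement a debounce: while one summary call is running, newly compacted messages accumulate instead of spawning concurrent LLM calls. A condensed sketch of that pattern in isolation (class and method names are illustrative, not the package's internals):

```ts
// Sketch of the debounced background-summarization pattern, illustrative names only.
class SummaryDebouncer<M> {
  private inFlight = false;
  private pending: M[] = [];
  private cached: string | undefined;

  constructor(private summarize: (msgs: M[]) => Promise<string>) {}

  schedule(compacted: M[]): void {
    if (this.inFlight) {
      this.pending.push(...compacted); // queue; the next call picks these up
      return;
    }
    this.inFlight = true;
    const batch = this.pending.length > 0 ? [...this.pending, ...compacted] : compacted;
    this.pending = [];
    this.summarize(batch)
      .then((updated) => {
        if (updated) this.cached = updated; // ready for the next iteration
      })
      .catch((err) => console.error('background summary failed (non-fatal):', err))
      .finally(() => {
        this.inFlight = false;
      });
  }

  get summary(): string | undefined {
    return this.cached;
  }
}
```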
@@ -1196,6 +1262,14 @@ class StandardGraph extends Graph {
  }
  }
  }
+ // Deduplicate system messages — ALWAYS runs, not just during compaction.
+ // Duplicate system messages accumulate from repeated tool iterations,
+ // summary injections, and context notes across turns.
+ const { messages: dedupedMessages, removedCount } = dedup.deduplicateSystemMessages(messagesToUse);
+ if (removedCount > 0) {
+ messagesToUse = dedupedMessages;
+ console.debug(`[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`);
+ }
  let finalMessages = messagesToUse;
  if (agentContext.useLegacyContent) {
  finalMessages = content.formatContentStrings(finalMessages);
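Finally, the summary-source resolution at the top of the compaction path (Step 1) is a simple fallback chain: cached run summary, then persisted summary, then the configured initial seed. A condensed restatement with simplified types; the real code also writes the persisted or seeded value back into _cachedRunSummary so later iterations hit the first branch:

```ts
// Sketch of the Step 1 fallback chain (cached > persisted > initial-seed > none).
// The shape of `ctx` is a simplification of the diff's agentContext.
type SummarySource = 'cached' | 'persisted' | 'initial-seed' | 'none';

function resolveSummary(
  cachedRunSummary: string | undefined,
  ctx: { persistedSummary?: string; summarizationConfig?: { initialSummary?: string } },
): { summary?: string; source: SummarySource } {
  if (cachedRunSummary != null) return { summary: cachedRunSummary, source: 'cached' };
  if (ctx.persistedSummary) return { summary: ctx.persistedSummary, source: 'persisted' };
  const seed = ctx.summarizationConfig?.initialSummary;
  if (seed) return { summary: seed, source: 'initial-seed' };
  return { source: 'none' };
}
```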