npm - @illuma-ai/agents - Versions diffs - 1.1.1 → 1.1.3 - Mend

@illuma-ai/agents 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/cjs/common/constants.cjs +12 -0
package/dist/cjs/common/constants.cjs.map +1 -1
package/dist/cjs/graphs/Graph.cjs +156 -82
package/dist/cjs/graphs/Graph.cjs.map +1 -1
package/dist/cjs/main.cjs +1 -0
package/dist/cjs/main.cjs.map +1 -1
package/dist/esm/common/constants.mjs +12 -1
package/dist/esm/common/constants.mjs.map +1 -1
package/dist/esm/graphs/Graph.mjs +158 -84
package/dist/esm/graphs/Graph.mjs.map +1 -1
package/dist/esm/main.mjs +1 -1
package/dist/types/common/constants.d.ts +11 -0
package/package.json +1 -1
package/src/common/constants.ts +12 -0
package/src/graphs/Graph.ts +203 -102
package/src/graphs/gapFeatures.test.ts +345 -0

package/dist/esm/main.mjs CHANGED Viewed

@@ -26,7 +26,7 @@ export { createSearchTool } from './tools/search/tool.mjs';
 export { DATE_RANGE, DEFAULT_COUNTRY_DESCRIPTION, DEFAULT_QUERY_DESCRIPTION, WebSearchToolDefinition, WebSearchToolDescription, WebSearchToolName, WebSearchToolSchema, countrySchema, dateSchema, imagesSchema, newsSchema, querySchema, videosSchema } from './tools/search/schema.mjs';
 export { createValidationErrorMessage, isValidJsonSchema, normalizeJsonSchema, prepareSchemaForProvider, validateStructuredOutput, zodToJsonSchema } from './schemas/validate.mjs';
 export { Callback, CommonEvents, Constants, ContentTypes, EdgeType, EnvVar, FinishReasons, GraphEvents, GraphNodeActions, GraphNodeKeys, MessageTypes, Providers, StepTypes, TitleMethod, ToolCallTypes } from './common/enum.mjs';
-export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
+export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PROACTIVE_SUMMARY_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
 export { joinKeys, resetIfNotEmpty } from './utils/graph.mjs';
 export { isGoogleLike, isOpenAILike } from './utils/llm.mjs';
 export { isPresent, unescapeObject } from './utils/misc.mjs';

package/dist/types/common/constants.d.ts CHANGED Viewed

@@ -35,6 +35,17 @@ export declare const CONTEXT_SAFETY_BUFFER = 0.9;
  * When the context window is ≥80% full, pruning + summarization activates.
  */
 export declare const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
+/**
+ * Proactive summarization threshold (0-1 fraction of context window).
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
+ * This gives the summary time to complete so it's ready when context actually fills up.
+ *
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
+ *   80% → proactive background summary
+ *   90% → pruning kicks in (with summary already cached)
+ *  100% → graceful: use existing summary + recent messages, never block
+ */
+export declare const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
 /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@illuma-ai/agents",
-  "version": "1.1.1",
+  "version": "1.1.3",
   "main": "./dist/cjs/main.cjs",
   "module": "./dist/esm/main.mjs",
   "types": "./dist/types/index.d.ts",

package/src/common/constants.ts CHANGED Viewed

@@ -59,6 +59,18 @@ export const CONTEXT_SAFETY_BUFFER = 0.9;
  */
 export const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
+/**
+ * Proactive summarization threshold (0-1 fraction of context window).
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
+ * This gives the summary time to complete so it's ready when context actually fills up.
+ *
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
+ *   80% → proactive background summary
+ *   90% → pruning kicks in (with summary already cached)
+ *  100% → graceful: use existing summary + recent messages, never block
+ */
+export const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
 /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,

package/src/graphs/Graph.ts CHANGED Viewed

@@ -35,6 +35,7 @@ import {
   formatAnthropicArtifactContent,
   ensureThinkingBlockInMessages,
   deduplicateSystemMessages,
+  getContextUtilization,
   convertMessagesToContent,
   addBedrockCacheControl,
   modifyDeltaProperties,
@@ -54,6 +55,7 @@ import {
   Constants,
   TOOL_TURN_THINKING_BUDGET,
   SUMMARIZATION_CONTEXT_THRESHOLD,
+  PROACTIVE_SUMMARY_THRESHOLD,
 } from '@/common';
 import {
   ToolDiscoveryCache,
@@ -1542,87 +1544,205 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         }
       }
-      if (agentContext.pruneMessages) {
-        const { context, indexTokenCountMap, messagesToRefine } =
-          agentContext.pruneMessages({
-            messages,
-            usageMetadata: agentContext.currentUsage,
-          });
-        agentContext.indexTokenCountMap = indexTokenCountMap;
-        messagesToUse = context;
-        // ── Non-blocking summarization ──────────────────────────────────
-        // NEVER block the LLM call waiting for summarization. Instead:
-        //   1. If _cachedRunSummary exists → use it, fire async update
-        //   2. If persistedSummary exists → use it as fallback, fire async update
-        //   3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
-        // The summary catches up asynchronously and is available for subsequent
-        // iterations (tool calls) and the next conversation turn.
-        //
-        // SummarizationConfig integration:
-        //   - triggerType/triggerThreshold control WHEN summarization fires
-        //   - reserveRatio is enforced via calibrated maxTokens (above)
-        //   - initialSummary provides cross-run seeding as fallback before persistedSummary
-        let hasSummary = false;
-        const sumConfig = agentContext.summarizationConfig;
-        const shouldSummarize = this.shouldTriggerSummarization(
-          messagesToRefine.length,
-          agentContext.maxContextTokens ?? 0,
+      // ── Proactive summarization at context pressure ───────────────────
+      // Inspired by VS Code Copilot Chat's 3-tier strategy:
+      //   80% → fire proactive background summary (BEFORE pruning needed)
+      //   90% → pruning kicks in (summary already cached from 80% trigger)
+      //  100% → graceful: use existing summary + recent messages, NEVER block
+      //
+      // This ensures the summary is READY by the time pruning actually occurs,
+      // so the user never waits and never sees a context cliff.
+      if (
+        agentContext.maxContextTokens != null &&
+        agentContext.maxContextTokens > 0 &&
+        agentContext.summarizeCallback &&
+        !this._summaryInFlight &&
+        !this._cachedRunSummary
+      ) {
+        const utilization = getContextUtilization(
           agentContext.indexTokenCountMap,
           agentContext.instructionTokens,
-          sumConfig
+          agentContext.maxContextTokens
         );
+        const threshold = (agentContext.summarizationConfig?.triggerThreshold ?? PROACTIVE_SUMMARY_THRESHOLD * 100);
+        if (utilization >= threshold) {
+          // Identify older messages to summarize proactively.
+          // Keep the last N messages (recent turns) intact — only summarize older history.
+          // This is incremental: the callback checks for existing summary and updates it.
+          const recentTurnCount = Math.max(4, Math.floor(messages.length * 0.3));
+          const oldMessages = messages.slice(
+            messages[0]?.getType() === 'system' ? 1 : 0,
+            Math.max(1, messages.length - recentTurnCount)
+          );
-        if (
-          messagesToRefine.length > 0 &&
-          agentContext.summarizeCallback &&
-          shouldSummarize
+          if (oldMessages.length > 0) {
+            this._summaryInFlight = true;
+            console.debug(
+              `[Graph:ProactiveSummary] Context at ${utilization.toFixed(1)}% (threshold ${threshold}%) — summarizing ${oldMessages.length} older msgs in background`
+            );
+            agentContext
+              .summarizeCallback(oldMessages)
+              .then((updated) => {
+                if (updated != null && updated !== '') {
+                  this._cachedRunSummary = updated;
+                  console.debug(
+                    `[Graph:ProactiveSummary] Background summary ready (len=${updated.length})`
+                  );
+                }
+              })
+              .catch((err) => {
+                console.error(
+                  '[Graph:ProactiveSummary] Background summary failed (non-fatal):',
+                  err
+                );
+              })
+              .finally(() => {
+                this._summaryInFlight = false;
+              });
+          }
+        }
+      }
+      if (agentContext.pruneMessages) {
+        // ── Context Compaction (Copilot-style: never delete messages) ─────
+        //
+        // DESIGN: Original messages are NEVER removed from the array.
+        // Instead, we build a "windowed view" for the LLM:
+        //   [system prompt] + [summary of older turns] + [recent turns that fit]
+        //
+        // This ensures:
+        //   - No context is ever lost (summary covers older turns)
+        //   - We can always re-summarize from originals if summary is stale
+        //   - Conversation chaining works naturally across turns
+        //
+        // Flow:
+        //   1. Resolve best available summary (cached > persisted > seed)
+        //   2. Calculate token budget available for recent messages
+        //   3. Walk newest→oldest, build view of messages that fit
+        //   4. Assemble: [system] + [summary] + [recent window]
+        //   5. Fire background summary update for messages outside the window
+        const sumConfig = agentContext.summarizationConfig;
+        const tokenCounter = agentContext.tokenCounter;
+        const maxTokens = agentContext.maxContextTokens ?? 0;
+        // Step 1: Resolve best available summary
+        let summary: string | undefined;
+        let summarySource: string;
+        if (this._cachedRunSummary != null) {
+          summary = this._cachedRunSummary;
+          summarySource = 'cached';
+        } else if (
+          agentContext.persistedSummary != null &&
+          agentContext.persistedSummary !== ''
         ) {
-          try {
-            let summary: string | undefined;
-            let summarySource: string;
+          summary = agentContext.persistedSummary;
+          this._cachedRunSummary = summary;
+          summarySource = 'persisted';
+        } else if (
+          sumConfig?.initialSummary != null &&
+          sumConfig.initialSummary !== ''
+        ) {
+          summary = sumConfig.initialSummary;
+          this._cachedRunSummary = summary;
+          summarySource = 'initial-seed';
+        } else {
+          summarySource = 'none';
+        }
-            if (this._cachedRunSummary != null) {
-              summary = this._cachedRunSummary;
-              summarySource = 'cached';
-            } else if (
-              agentContext.persistedSummary != null &&
-              agentContext.persistedSummary !== ''
-            ) {
-              summary = agentContext.persistedSummary;
-              this._cachedRunSummary = summary;
-              summarySource = 'persisted';
-            } else if (
-              sumConfig?.initialSummary != null &&
-              sumConfig.initialSummary !== ''
-            ) {
-              // Cross-run seed: use initialSummary when no persisted summary exists
-              summary = sumConfig.initialSummary;
-              this._cachedRunSummary = summary;
-              summarySource = 'initial-seed';
-            } else {
-              summarySource = 'none';
-            }
+        // Step 2: Calculate token budget
+        // Apply EMA calibration for accuracy across iterations
+        const calibratedMax = applyCalibration(maxTokens, this._pruneCalibration);
+        const systemMsg = messages[0]?.getType() === 'system' ? messages[0] : null;
+        const systemTokens = systemMsg != null
+          ? (agentContext.indexTokenCountMap[0] ?? 0)
+          : 0;
+        const summaryMsg = summary != null && summary !== ''
+          ? new SystemMessage(`[Conversation Summary]\n${summary}`)
+          : null;
+        const summaryTokens = summaryMsg != null && tokenCounter != null
+          ? tokenCounter(summaryMsg)
+          : 0;
+        // Budget for recent messages = total - system - summary - 3 (assistant priming)
+        const recentBudget = calibratedMax - systemTokens - summaryTokens - 3;
+        // Step 3: Walk newest→oldest, collect messages that fit in the budget
+        const contentStart = systemMsg != null ? 1 : 0;
+        let usedTokens = 0;
+        let windowStart = messages.length; // index where the recent window begins
+        for (let i = messages.length - 1; i >= contentStart; i--) {
+          const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+          if (usedTokens + msgTokens > recentBudget) {
+            break;
+          }
+          usedTokens += msgTokens;
+          windowStart = i;
+        }
-            // Single consolidated log for the entire prune+summarize decision
-            console.debug(
-              `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
-            );
+        // Ensure we don't split tool-call / tool-result pairs.
+        // If windowStart lands on a ToolMessage, walk back to include its AI message.
+        while (
+          windowStart > contentStart &&
+          messages[windowStart]?.getType() === 'tool'
+        ) {
+          windowStart--;
+          usedTokens += agentContext.indexTokenCountMap[windowStart] ?? 0;
+        }
+        const recentMessages = messages.slice(windowStart);
+        const compactedMessages = messages.slice(contentStart, windowStart);
+        const hasSummary = summaryMsg != null;
+        // Step 4: Assemble the windowed view
+        // [system] + [summary (covers compacted messages)] + [recent window]
+        const viewParts: BaseMessage[] = [];
+        if (systemMsg != null) {
+          viewParts.push(systemMsg);
+        }
+        if (summaryMsg != null) {
+          viewParts.push(summaryMsg);
+        }
+        viewParts.push(...recentMessages);
+        messagesToUse = viewParts;
-            // SCALE: Debounce background summarization — if a summary call is already
-            // in-flight (from a prior tool iteration), accumulate messages instead of
-            // firing another concurrent LLM call. At 2000 users with 3+ tool calls
-            // per turn, this prevents 3x summary call volume.
+        console.debug(
+          `[Graph:Compaction] View: ${messages.length}→${viewParts.length} msgs ` +
+          `(${compactedMessages.length} behind summary, ${recentMessages.length} in window) | ` +
+          `summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | ` +
+          `budget=${recentBudget}/${calibratedMax} used=${usedTokens}`
+        );
+        // Step 5: Fire background summary update (non-blocking)
+        // Summarize messages outside the window so next iteration has a fresh summary.
+        // Only trigger if there are compacted messages worth summarizing.
+        if (
+          compactedMessages.length > 0 &&
+          agentContext.summarizeCallback
+        ) {
+          const shouldSummarize = this.shouldTriggerSummarization(
+            compactedMessages.length,
+            maxTokens,
+            agentContext.indexTokenCountMap,
+            agentContext.instructionTokens,
+            sumConfig
+          );
+          if (shouldSummarize) {
             if (this._summaryInFlight) {
-              this._pendingMessagesToRefine.push(...messagesToRefine);
+              this._pendingMessagesToRefine.push(...compactedMessages);
               console.debug(
-                `[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`
+                `[Graph:Compaction] Summary in-flight, queued ${compactedMessages.length} msgs (pending=${this._pendingMessagesToRefine.length})`
               );
             } else {
               this._summaryInFlight = true;
               const allMessages = this._pendingMessagesToRefine.length > 0
-                ? [...this._pendingMessagesToRefine, ...messagesToRefine]
-                : messagesToRefine;
+                ? [...this._pendingMessagesToRefine, ...compactedMessages]
+                : compactedMessages;
               this._pendingMessagesToRefine = [];
               agentContext
@@ -1634,7 +1754,7 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
                 })
                 .catch((err) => {
                   console.error(
-                    '[Graph] Background summary failed (non-fatal):',
+                    '[Graph:Compaction] Background summary update failed (non-fatal):',
                     err
                   );
                 })
@@ -1642,44 +1762,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
                   this._summaryInFlight = false;
                 });
             }
-            if (summary != null && summary !== '') {
-              hasSummary = true;
-              const summaryMsg = new SystemMessage(
-                `[Conversation Summary]\n${summary}`
-              );
-              const systemIdx =
-                messagesToUse[0]?.getType() === 'system' ? 1 : 0;
-              messagesToUse = [
-                ...messagesToUse.slice(0, systemIdx),
-                summaryMsg,
-                ...messagesToUse.slice(systemIdx),
-              ];
-            }
-          } catch (err) {
-            console.error('[Graph] Summarization failed:', err);
           }
-        } else if (messagesToRefine.length > 0) {
-          // Log pruning even when no summarize callback (discard mode)
-          console.debug(
-            `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`
-          );
-        }
-        // Deduplicate system messages that accumulate from repeated tool iterations
-        const { messages: dedupedMessages, removedCount } =
-          deduplicateSystemMessages(messagesToUse);
-        if (removedCount > 0) {
-          messagesToUse = dedupedMessages;
-          console.debug(
-            `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
-          );
         }
-        // Post-prune context note for task-tool-enabled agents
-        if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
+        // Post-compaction context note for task-tool-enabled agents
+        if (compactedMessages.length > 0 && hasTaskTool(agentContext.tools)) {
           const postPruneNote = buildPostPruneNote(
-            messagesToRefine.length,
+            compactedMessages.length,
             hasSummary
           );
           if (postPruneNote) {
@@ -1691,6 +1780,18 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         }
       }
+      // Deduplicate system messages — ALWAYS runs, not just during compaction.
+      // Duplicate system messages accumulate from repeated tool iterations,
+      // summary injections, and context notes across turns.
+      const { messages: dedupedMessages, removedCount } =
+        deduplicateSystemMessages(messagesToUse);
+      if (removedCount > 0) {
+        messagesToUse = dedupedMessages;
+        console.debug(
+          `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
+        );
+      }
       let finalMessages = messagesToUse;
       if (agentContext.useLegacyContent) {
         finalMessages = formatContentStrings(finalMessages);