npm - @illuma-ai/agents - Versions diffs - 1.0.98 → 1.1.1 - Mend

@illuma-ai/agents 1.0.98 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

package/dist/cjs/agents/AgentContext.cjs +6 -2
package/dist/cjs/agents/AgentContext.cjs.map +1 -1
package/dist/cjs/common/constants.cjs +53 -0
package/dist/cjs/common/constants.cjs.map +1 -1
package/dist/cjs/graphs/Graph.cjs +195 -31
package/dist/cjs/graphs/Graph.cjs.map +1 -1
package/dist/cjs/main.cjs +14 -0
package/dist/cjs/main.cjs.map +1 -1
package/dist/cjs/messages/dedup.cjs +95 -0
package/dist/cjs/messages/dedup.cjs.map +1 -0
package/dist/cjs/tools/CodeExecutor.cjs +22 -3
package/dist/cjs/tools/CodeExecutor.cjs.map +1 -1
package/dist/cjs/types/graph.cjs.map +1 -1
package/dist/cjs/utils/pruneCalibration.cjs +78 -0
package/dist/cjs/utils/pruneCalibration.cjs.map +1 -0
package/dist/cjs/utils/run.cjs.map +1 -1
package/dist/cjs/utils/tokens.cjs.map +1 -1
package/dist/cjs/utils/toolDiscoveryCache.cjs +127 -0
package/dist/cjs/utils/toolDiscoveryCache.cjs.map +1 -0
package/dist/esm/agents/AgentContext.mjs +6 -2
package/dist/esm/agents/AgentContext.mjs.map +1 -1
package/dist/esm/common/constants.mjs +48 -1
package/dist/esm/common/constants.mjs.map +1 -1
package/dist/esm/graphs/Graph.mjs +196 -32
package/dist/esm/graphs/Graph.mjs.map +1 -1
package/dist/esm/main.mjs +4 -1
package/dist/esm/main.mjs.map +1 -1
package/dist/esm/messages/dedup.mjs +93 -0
package/dist/esm/messages/dedup.mjs.map +1 -0
package/dist/esm/tools/CodeExecutor.mjs +22 -3
package/dist/esm/tools/CodeExecutor.mjs.map +1 -1
package/dist/esm/types/graph.mjs.map +1 -1
package/dist/esm/utils/pruneCalibration.mjs +74 -0
package/dist/esm/utils/pruneCalibration.mjs.map +1 -0
package/dist/esm/utils/run.mjs.map +1 -1
package/dist/esm/utils/tokens.mjs.map +1 -1
package/dist/esm/utils/toolDiscoveryCache.mjs +125 -0
package/dist/esm/utils/toolDiscoveryCache.mjs.map +1 -0
package/dist/types/agents/AgentContext.d.ts +4 -1
package/dist/types/common/constants.d.ts +35 -0
package/dist/types/graphs/Graph.d.ts +34 -0
package/dist/types/messages/dedup.d.ts +25 -0
package/dist/types/messages/index.d.ts +1 -0
package/dist/types/types/graph.d.ts +63 -0
package/dist/types/utils/index.d.ts +2 -0
package/dist/types/utils/pruneCalibration.d.ts +43 -0
package/dist/types/utils/toolDiscoveryCache.d.ts +77 -0
package/package.json +1 -1
package/src/agents/AgentContext.ts +7 -0
package/src/common/constants.ts +56 -0
package/src/graphs/Graph.ts +250 -50
package/src/graphs/gapFeatures.test.ts +520 -0
package/src/graphs/nonBlockingSummarization.test.ts +307 -0
package/src/messages/__tests__/dedup.test.ts +166 -0
package/src/messages/dedup.ts +104 -0
package/src/messages/index.ts +1 -0
package/src/tools/CodeExecutor.ts +22 -3
package/src/types/graph.ts +73 -0
package/src/utils/__tests__/pruneCalibration.test.ts +148 -0
package/src/utils/__tests__/toolDiscoveryCache.test.ts +214 -0
package/src/utils/contextPressure.test.ts +24 -9
package/src/utils/index.ts +2 -0
package/src/utils/pruneCalibration.ts +92 -0
package/src/utils/run.ts +108 -108
package/src/utils/tokens.ts +118 -118
package/src/utils/toolDiscoveryCache.ts +150 -0

package/src/graphs/Graph.ts CHANGED

@@ -34,9 +34,9 @@ import type * as t from '@/types';
 import {
   formatAnthropicArtifactContent,
   ensureThinkingBlockInMessages,
+  deduplicateSystemMessages,
   convertMessagesToContent,
   addBedrockCacheControl,
-  extractToolDiscoveries,
   modifyDeltaProperties,
   formatArtifactPayload,
   formatContentStrings,
@@ -53,14 +53,20 @@ import {
   MessageTypes,
   Constants,
   TOOL_TURN_THINKING_BUDGET,
+  SUMMARIZATION_CONTEXT_THRESHOLD,
 } from '@/common';
 import {
+  ToolDiscoveryCache,
   resetIfNotEmpty,
   isOpenAILike,
   isGoogleLike,
   joinKeys,
   sleep,
+  createPruneCalibration,
+  updatePruneCalibration,
+  applyCalibration,
 } from '@/utils';
+import type { PruneCalibrationState } from '@/types/graph';
 import {
   buildContextAnalytics,
   type ContextAnalytics,
@@ -205,6 +211,22 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
   runId: string | undefined;
   startIndex: number = 0;
   signal?: AbortSignal;
+  /** Cached summary from the first prune in this run.
+   * Reused for subsequent prunes to avoid blocking LLM calls on every tool iteration. */
+  private _cachedRunSummary: string | undefined;
+  /** EMA-based pruning calibration state — smooths token budget adjustments across iterations */
+  private _pruneCalibration: PruneCalibrationState;
+  /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
+  private _toolDiscoveryCache: ToolDiscoveryCache;
+  /**
+   * SCALE: Tracks whether a summary call is already in-flight for this Graph instance.
+   * Prevents multiple concurrent summary LLM calls when rapid tool iterations each
+   * trigger pruning. At 2000 users with 3+ tool calls per turn, this reduces
+   * 6000+ summary calls/turn to roughly 2000.
+   */
+  private _summaryInFlight: boolean = false;
+  /** Messages accumulated across tool iterations while a summary call is in-flight */
+  private _pendingMessagesToRefine: BaseMessage[] = [];
   /** Map of agent contexts by agent ID */
   agentContexts: Map<string, AgentContext> = new Map();
   /** Default agent ID to use */
@@ -239,6 +261,22 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
     }
     this.defaultAgentId = agents[0].agentId;
+    // Seed cached summary from persisted storage so the first prune in a
+    // resumed conversation can also skip the synchronous LLM summarization call
+    const primaryContext = this.agentContexts.get(this.defaultAgentId);
+    if (primaryContext?.persistedSummary) {
+      this._cachedRunSummary = primaryContext.persistedSummary;
+    }
+    // Initialize EMA pruning calibration
+    this._pruneCalibration = createPruneCalibration();
+    // Initialize tool discovery cache, seeded with any pre-existing discoveries
+    this._toolDiscoveryCache = new ToolDiscoveryCache();
+    if (primaryContext?.discoveredToolNames.size) {
+      this._toolDiscoveryCache.seed([...primaryContext.discoveredToolNames]);
+    }
   }
   /* Init */
@@ -272,6 +310,11 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
       new Map()
     );
     this.invokedToolIds = resetIfNotEmpty(this.invokedToolIds, undefined);
+    // Reset EMA calibration, tool discovery cache, and summary debounce for fresh run
+    this._pruneCalibration = createPruneCalibration();
+    this._toolDiscoveryCache.reset();
+    this._summaryInFlight = false;
+    this._pendingMessagesToRefine = [];
     for (const context of this.agentContexts.values()) {
       context.reset();
     }
@@ -378,6 +421,70 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
     return clientOptions;
   }
+  /**
+   * Decides whether the messages discarded by pruning should be summarized, based on SummarizationConfig.
+   *
+   * Supports three trigger strategies (selected by config.triggerType):
+   * - contextPercentage (documented default; note an unset triggerType always triggers here): utilization >= threshold%, threshold falling back to SUMMARIZATION_CONTEXT_THRESHOLD
+   * - messageCount: Trigger when pruned message count >= threshold (threshold falls back to 5)
+   * - tokenThreshold: Trigger when instructionTokens + all indexTokenCountMap entries >= threshold (always triggers if no threshold is set)
+   *
+   * Never triggers when nothing was pruned. With no config or no triggerType, triggers whenever messages were pruned (preserves backward compatibility).
+   *
+   * @param prunedMessageCount - Number of messages that were pruned; 0 short-circuits to false
+   * @param maxContextTokens - Maximum context token budget; a value <= 0 makes contextPercentage always trigger
+   * @param indexTokenCountMap - Token count map by message index; undefined entries count as 0
+   * @param instructionTokens - Token count for instructions/system message, used as the starting total
+   * @param config - Optional SummarizationConfig (triggerType and triggerThreshold are read)
+   * @returns Whether summarization should be triggered
+   */
+  private shouldTriggerSummarization(
+    prunedMessageCount: number,
+    maxContextTokens: number,
+    indexTokenCountMap: Record<string, number | undefined>,
+    instructionTokens: number,
+    config?: t.SummarizationConfig
+  ): boolean {
+    // No pruned messages means nothing to summarize
+    if (prunedMessageCount === 0) {
+      return false;
+    }
+    // No config = backward compatible (always summarize when messages are pruned)
+    if (!config || !config.triggerType) {
+      return true;
+    }
+    const threshold = config.triggerThreshold;
+    switch (config.triggerType) {
+      case 'contextPercentage': {
+        if (maxContextTokens <= 0) return true; // no positive budget to measure utilization against
+        const effectiveThreshold = threshold ?? SUMMARIZATION_CONTEXT_THRESHOLD;
+        let totalTokens = instructionTokens;
+        for (const key in indexTokenCountMap) {
+          totalTokens += indexTokenCountMap[key] ?? 0;
+        }
+        const utilization = (totalTokens / maxContextTokens) * 100;
+        return utilization >= effectiveThreshold;
+      }
+      case 'messageCount': {
+        const effectiveThreshold = threshold ?? 5;
+        return prunedMessageCount >= effectiveThreshold;
+      }
+      case 'tokenThreshold': {
+        if (threshold == null) return true; // no threshold configured: behave like the no-config case
+        let totalTokens = instructionTokens;
+        for (const key in indexTokenCountMap) {
+          totalTokens += indexTokenCountMap[key] ?? 0;
+        }
+        return totalTokens >= threshold;
+      }
+      default:
+        return true;
+    }
+  }
   /**
    * Returns the normalized finish/stop reason from the last LLM invocation.
    * Used by callers to detect when the response was truncated due to max_tokens.
@@ -535,9 +642,6 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
   getRunMessages(): BaseMessage[] | undefined {
     const result = this.messages.slice(this.startIndex);
-    console.debug(
-      `[Graph] getRunMessages() | totalMessages=${this.messages.length} | startIndex=${this.startIndex} | runMessages=${result.length}`
-    );
     return result;
   }
@@ -1327,10 +1431,15 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         messages = [dynamicContextMessage, ackMessage, ...messages];
       }
-      // Extract tool discoveries from current turn only (similar to formatArtifactPayload pattern)
-      const discoveredNames = extractToolDiscoveries(messages);
-      if (discoveredNames.length > 0) {
-        agentContext.markToolsAsDiscovered(discoveredNames);
+      // Tool discovery caching: only scan new messages since last iteration
+      // instead of re-parsing the full history via extractToolDiscoveries()
+      const cachedDiscoveries =
+        this._toolDiscoveryCache.getNewDiscoveries(messages);
+      if (cachedDiscoveries.length > 0) {
+        agentContext.markToolsAsDiscovered(cachedDiscoveries);
+        console.debug(
+          `[Graph:ToolDiscovery] Cached ${cachedDiscoveries.length} new tools (total: ${this._toolDiscoveryCache.size})`
+        );
       }
       const toolsForBinding = agentContext.getToolsForBinding();
@@ -1400,50 +1509,145 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
                 ?.thinking as t.AnthropicClientOptions['thinking']
             )?.type === 'enabled');
+        // Apply EMA calibration to max token budget — smooths pruning across iterations
+        const calibratedMaxTokens = applyCalibration(
+          agentContext.maxContextTokens,
+          this._pruneCalibration
+        );
         agentContext.pruneMessages = createPruneMessages({
           startIndex: this.startIndex,
           provider: agentContext.provider,
           tokenCounter: agentContext.tokenCounter,
-          maxTokens: agentContext.maxContextTokens,
+          maxTokens: calibratedMaxTokens,
           thinkingEnabled: isAnthropicWithThinking,
           indexTokenCountMap: agentContext.indexTokenCountMap,
         });
       }
+      // Update EMA calibration with actual token usage from API response
+      if (
+        agentContext.currentUsage?.input_tokens &&
+        agentContext.maxContextTokens
+      ) {
+        const estimatedTokens = Object.values(
+          agentContext.indexTokenCountMap
+        ).reduce((sum, v) => (sum ?? 0) + (v ?? 0), 0) as number;
+        if (estimatedTokens > 0) {
+          this._pruneCalibration = updatePruneCalibration(
+            this._pruneCalibration,
+            agentContext.currentUsage.input_tokens,
+            estimatedTokens
+          );
+        }
+      }
       if (agentContext.pruneMessages) {
-        console.debug(
-          `[Graph:ContextMgmt] Pruning messages | inputCount=${messages.length} | maxTokens=${agentContext.maxContextTokens}`
-        );
         const { context, indexTokenCountMap, messagesToRefine } =
           agentContext.pruneMessages({
             messages,
             usageMetadata: agentContext.currentUsage,
-            // startOnMessageType: 'human',
           });
         agentContext.indexTokenCountMap = indexTokenCountMap;
         messagesToUse = context;
-        console.debug(
-          `[Graph:ContextMgmt] Pruned | kept=${context.length} | discarded=${messagesToRefine.length} | originalCount=${messages.length}`
-        );
-        // Summarize discarded messages if callback provided
+        // ── Non-blocking summarization ──────────────────────────────────
+        // NEVER block the LLM call waiting for summarization. Instead:
+        //   1. If _cachedRunSummary exists → use it, fire async update
+        //   2. If persistedSummary exists → use it as fallback, fire async update
+        //   3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
+        // The summary catches up asynchronously and is available for subsequent
+        // iterations (tool calls) and the next conversation turn.
+        //
+        // SummarizationConfig integration:
+        //   - triggerType/triggerThreshold control WHEN summarization fires
+        //   - reserveRatio is enforced via calibrated maxTokens (above)
+        //   - initialSummary provides cross-run seeding as a fallback when no persistedSummary exists
         let hasSummary = false;
-        if (messagesToRefine.length > 0 && agentContext.summarizeCallback) {
-          console.debug(
-            `[Graph:ContextMgmt] Summarizing ${messagesToRefine.length} discarded messages`
-          );
+        const sumConfig = agentContext.summarizationConfig;
+        const shouldSummarize = this.shouldTriggerSummarization(
+          messagesToRefine.length,
+          agentContext.maxContextTokens ?? 0,
+          agentContext.indexTokenCountMap,
+          agentContext.instructionTokens,
+          sumConfig
+        );
+        if (
+          messagesToRefine.length > 0 &&
+          agentContext.summarizeCallback &&
+          shouldSummarize
+        ) {
           try {
-            const summary =
-              await agentContext.summarizeCallback(messagesToRefine);
+            let summary: string | undefined;
+            let summarySource: string;
+            if (this._cachedRunSummary != null) {
+              summary = this._cachedRunSummary;
+              summarySource = 'cached';
+            } else if (
+              agentContext.persistedSummary != null &&
+              agentContext.persistedSummary !== ''
+            ) {
+              summary = agentContext.persistedSummary;
+              this._cachedRunSummary = summary;
+              summarySource = 'persisted';
+            } else if (
+              sumConfig?.initialSummary != null &&
+              sumConfig.initialSummary !== ''
+            ) {
+              // Cross-run seed: use initialSummary when no persisted summary exists
+              summary = sumConfig.initialSummary;
+              this._cachedRunSummary = summary;
+              summarySource = 'initial-seed';
+            } else {
+              summarySource = 'none';
+            }
+            // Single consolidated log for the entire prune+summarize decision
             console.debug(
-              `[Graph:ContextMgmt] Summary received | len=${summary?.length ?? 0} | hasContent=${summary != null && summary !== ''}`
+              `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
             );
+            // SCALE: Debounce background summarization — if a summary call is already
+            // in-flight (from a prior tool iteration), accumulate messages instead of
+            // firing another concurrent LLM call. At 2000 users with 3+ tool calls
+            // per turn, this prevents 3x summary call volume.
+            if (this._summaryInFlight) {
+              this._pendingMessagesToRefine.push(...messagesToRefine);
+              console.debug(
+                `[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`
+              );
+            } else {
+              this._summaryInFlight = true;
+              const allMessages = this._pendingMessagesToRefine.length > 0
+                ? [...this._pendingMessagesToRefine, ...messagesToRefine]
+                : messagesToRefine;
+              this._pendingMessagesToRefine = [];
+              agentContext
+                .summarizeCallback(allMessages)
+                .then((updated) => {
+                  if (updated != null && updated !== '') {
+                    this._cachedRunSummary = updated;
+                  }
+                })
+                .catch((err) => {
+                  console.error(
+                    '[Graph] Background summary failed (non-fatal):',
+                    err
+                  );
+                })
+                .finally(() => {
+                  this._summaryInFlight = false;
+                });
+            }
             if (summary != null && summary !== '') {
               hasSummary = true;
               const summaryMsg = new SystemMessage(
                 `[Conversation Summary]\n${summary}`
               );
-              // Insert after system message (if present), before conversation messages
               const systemIdx =
                 messagesToUse[0]?.getType() === 'system' ? 1 : 0;
               messagesToUse = [
@@ -1451,27 +1655,38 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
                 summaryMsg,
                 ...messagesToUse.slice(systemIdx),
               ];
-              console.debug(
-                `[Graph:ContextMgmt] Summary injected at index ${systemIdx} | finalMsgCount=${messagesToUse.length}`
-              );
             }
           } catch (err) {
-            console.error('[Graph] Summarization callback failed:', err);
+            console.error('[Graph] Summarization failed:', err);
           }
+        } else if (messagesToRefine.length > 0) {
+          // Log pruning when summarization is skipped (no summarize callback, or trigger conditions not met)
+          console.debug(
+            `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`
+          );
         }
-        // Post-prune context note: inform the LLM that context was compressed
-        // without exposing token numbers (prevents voluntary bail-out)
+        // Deduplicate system messages that accumulate from repeated tool iterations
+        const { messages: dedupedMessages, removedCount } =
+          deduplicateSystemMessages(messagesToUse);
+        if (removedCount > 0) {
+          messagesToUse = dedupedMessages;
+          console.debug(
+            `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
+          );
+        }
+        // Post-prune context note for task-tool-enabled agents
         if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
           const postPruneNote = buildPostPruneNote(
             messagesToRefine.length,
             hasSummary
           );
           if (postPruneNote) {
-            messagesToUse = [...messagesToUse, new SystemMessage(postPruneNote)];
-            console.debug(
-              `[Graph:ContextMgmt] Post-prune note injected | hasSummary=${hasSummary} | discarded=${messagesToRefine.length}`
-            );
+            messagesToUse = [
+              ...messagesToUse,
+              new SystemMessage(postPruneNote),
+            ];
           }
         }
       }
@@ -1643,14 +1858,6 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         const { count: documentCount, names: documentNames } =
           detectDocuments(finalMessages);
-        // Observability log (no token numbers exposed to LLM)
-        if (contextAnalytics.utilizationPercent != null) {
-          console.debug(
-            `[Graph] Context utilization: ${contextAnalytics.utilizationPercent.toFixed(1)}% | ` +
-              `messages: ${finalMessages.length} | docs: ${documentCount}`
-          );
-        }
         // Multi-document delegation: first iteration only (before AI has responded)
         const hasAiResponse = finalMessages.some(
           (m) => m._getType() === 'ai' || m._getType() === 'tool'
@@ -2178,13 +2385,6 @@ If I seem to be missing something we discussed earlier, just give me a quick rem
         reducer: (a, b) => {
           if (!a.length) {
             this.startIndex = a.length + b.length;
-            console.debug(
-              `[Graph:Reducer] Initial messages | startIndex=${this.startIndex} | inputMsgCount=${b.length}`
-            );
-          } else {
-            console.debug(
-              `[Graph:Reducer] Appending messages | existing=${a.length} | new=${b.length} | startIndex=${this.startIndex}`
-            );
           }
           const result = messagesStateReducer(a, b);
           this.messages = result;