@illuma-ai/agents 1.1.1 → 1.1.3

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/esm/main.mjs CHANGED
@@ -26,7 +26,7 @@ export { createSearchTool } from './tools/search/tool.mjs';
26
26
  export { DATE_RANGE, DEFAULT_COUNTRY_DESCRIPTION, DEFAULT_QUERY_DESCRIPTION, WebSearchToolDefinition, WebSearchToolDescription, WebSearchToolName, WebSearchToolSchema, countrySchema, dateSchema, imagesSchema, newsSchema, querySchema, videosSchema } from './tools/search/schema.mjs';
27
27
  export { createValidationErrorMessage, isValidJsonSchema, normalizeJsonSchema, prepareSchemaForProvider, validateStructuredOutput, zodToJsonSchema } from './schemas/validate.mjs';
28
28
  export { Callback, CommonEvents, Constants, ContentTypes, EdgeType, EnvVar, FinishReasons, GraphEvents, GraphNodeActions, GraphNodeKeys, MessageTypes, Providers, StepTypes, TitleMethod, ToolCallTypes } from './common/enum.mjs';
29
- export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
29
+ export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PROACTIVE_SUMMARY_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
30
30
  export { joinKeys, resetIfNotEmpty } from './utils/graph.mjs';
31
31
  export { isGoogleLike, isOpenAILike } from './utils/llm.mjs';
32
32
  export { isPresent, unescapeObject } from './utils/misc.mjs';
@@ -35,6 +35,17 @@ export declare const CONTEXT_SAFETY_BUFFER = 0.9;
35
35
  * When the context window is ≥80% full, pruning + summarization activates.
36
36
  */
37
37
  export declare const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
38
+ /**
39
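+ * Proactive summarization threshold (0-1 fraction of context window; note that SUMMARIZATION_CONTEXT_THRESHOLD above is a 0-100 percentage, so scale by 100 before comparing the two).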
40
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
41
+ * This gives the summary time to complete so it's ready when context actually fills up.
42
+ *
43
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
44
+ * 80% → proactive background summary
45
+ * 90% → pruning kicks in (with summary already cached)
46
+ * 100% → graceful: use existing summary + recent messages, never block
47
+ */
48
+ export declare const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
38
49
  /**
39
50
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
40
51
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@illuma-ai/agents",
3
- "version": "1.1.1",
3
+ "version": "1.1.3",
4
4
  "main": "./dist/cjs/main.cjs",
5
5
  "module": "./dist/esm/main.mjs",
6
6
  "types": "./dist/types/index.d.ts",
@@ -59,6 +59,18 @@ export const CONTEXT_SAFETY_BUFFER = 0.9;
59
59
  */
60
60
  export const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
61
61
 
62
+ /**
63
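+ * Proactive summarization threshold (0-1 fraction of context window; note that SUMMARIZATION_CONTEXT_THRESHOLD above is a 0-100 percentage, so scale by 100 before comparing the two).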
64
+ * At this utilization level, background summarization fires BEFORE pruning is needed.
65
+ * This gives the summary time to complete so it's ready when context actually fills up.
66
+ *
67
+ * Inspired by VS Code Copilot Chat's 3-tier strategy:
68
+ * 80% → proactive background summary
69
+ * 90% → pruning kicks in (with summary already cached)
70
+ * 100% → graceful: use existing summary + recent messages, never block
71
+ */
72
+ export const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
73
+
62
74
  /**
63
75
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
64
76
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
@@ -35,6 +35,7 @@ import {
35
35
  formatAnthropicArtifactContent,
36
36
  ensureThinkingBlockInMessages,
37
37
  deduplicateSystemMessages,
38
+ getContextUtilization,
38
39
  convertMessagesToContent,
39
40
  addBedrockCacheControl,
40
41
  modifyDeltaProperties,
@@ -54,6 +55,7 @@ import {
54
55
  Constants,
55
56
  TOOL_TURN_THINKING_BUDGET,
56
57
  SUMMARIZATION_CONTEXT_THRESHOLD,
58
+ PROACTIVE_SUMMARY_THRESHOLD,
57
59
  } from '@/common';
58
60
  import {
59
61
  ToolDiscoveryCache,
@@ -1542,87 +1544,205 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1542
1544
  }
1543
1545
  }
1544
1546
 
1545
- if (agentContext.pruneMessages) {
1546
- const { context, indexTokenCountMap, messagesToRefine } =
1547
- agentContext.pruneMessages({
1548
- messages,
1549
- usageMetadata: agentContext.currentUsage,
1550
- });
1551
- agentContext.indexTokenCountMap = indexTokenCountMap;
1552
- messagesToUse = context;
1553
-
1554
- // ── Non-blocking summarization ──────────────────────────────────
1555
- // NEVER block the LLM call waiting for summarization. Instead:
1556
- // 1. If _cachedRunSummary exists → use it, fire async update
1557
- // 2. If persistedSummary exists → use it as fallback, fire async update
1558
- // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
1559
- // The summary catches up asynchronously and is available for subsequent
1560
- // iterations (tool calls) and the next conversation turn.
1561
- //
1562
- // SummarizationConfig integration:
1563
- // - triggerType/triggerThreshold control WHEN summarization fires
1564
- // - reserveRatio is enforced via calibrated maxTokens (above)
1565
- // - initialSummary provides cross-run seeding as fallback before persistedSummary
1566
- let hasSummary = false;
1567
- const sumConfig = agentContext.summarizationConfig;
1568
- const shouldSummarize = this.shouldTriggerSummarization(
1569
- messagesToRefine.length,
1570
- agentContext.maxContextTokens ?? 0,
1547
+ // ── Proactive summarization at context pressure ───────────────────
1548
+ // Inspired by VS Code Copilot Chat's 3-tier strategy:
1549
+ // 80% → fire proactive background summary (BEFORE pruning needed); this is the default, overridden by summarizationConfig.triggerThreshold when set
1550
+ // 90% → pruning kicks in (summary already cached from 80% trigger)
1551
+ // 100% → graceful: use existing summary + recent messages, NEVER block
1552
+ //
1553
+ // This ensures the summary is READY by the time pruning actually occurs,
1554
+ // so the user never waits and never sees a context cliff.
1555
+ if (
1556
+ agentContext.maxContextTokens != null &&
1557
+ agentContext.maxContextTokens > 0 &&
1558
+ agentContext.summarizeCallback &&
1559
+ !this._summaryInFlight &&
1560
+ !this._cachedRunSummary
1561
+ ) {
1562
+ const utilization = getContextUtilization(
1571
1563
  agentContext.indexTokenCountMap,
1572
1564
  agentContext.instructionTokens,
1573
- sumConfig
1565
+ agentContext.maxContextTokens
1574
1566
  );
1567
+ const threshold = (agentContext.summarizationConfig?.triggerThreshold ?? PROACTIVE_SUMMARY_THRESHOLD * 100);
1568
+
1569
+ if (utilization >= threshold) {
1570
+ // Identify older messages to summarize proactively.
1571
+ // Keep the last N messages (recent turns) intact, where N is the larger of 4 or 30% of the message count — only summarize older history.
1572
+ // This is incremental: the callback checks for existing summary and updates it.
1573
+ const recentTurnCount = Math.max(4, Math.floor(messages.length * 0.3));
1574
+ const oldMessages = messages.slice(
1575
+ messages[0]?.getType() === 'system' ? 1 : 0,
1576
+ Math.max(1, messages.length - recentTurnCount)
1577
+ );
1575
1578
 
1576
- if (
1577
- messagesToRefine.length > 0 &&
1578
- agentContext.summarizeCallback &&
1579
- shouldSummarize
1579
+ if (oldMessages.length > 0) {
1580
+ this._summaryInFlight = true;
1581
+ console.debug(
1582
+ `[Graph:ProactiveSummary] Context at ${utilization.toFixed(1)}% (threshold ${threshold}%) — summarizing ${oldMessages.length} older msgs in background`
1583
+ );
1584
+
1585
+ agentContext
1586
+ .summarizeCallback(oldMessages)
1587
+ .then((updated) => {
1588
+ if (updated != null && updated !== '') {
1589
+ this._cachedRunSummary = updated;
1590
+ console.debug(
1591
+ `[Graph:ProactiveSummary] Background summary ready (len=${updated.length})`
1592
+ );
1593
+ }
1594
+ })
1595
+ .catch((err) => {
1596
+ console.error(
1597
+ '[Graph:ProactiveSummary] Background summary failed (non-fatal):',
1598
+ err
1599
+ );
1600
+ })
1601
+ .finally(() => {
1602
+ this._summaryInFlight = false;
1603
+ });
1604
+ }
1605
+ }
1606
+ }
1607
+
1608
+ if (agentContext.pruneMessages) {
1609
+ // ── Context Compaction (Copilot-style: never delete messages) ─────
1610
+ //
1611
+ // DESIGN: Original messages are NEVER removed from the array.
1612
+ // Instead, we build a "windowed view" for the LLM:
1613
+ // [system prompt] + [summary of older turns] + [recent turns that fit]
1614
+ //
1615
+ // This ensures:
1616
+ // - No context is ever lost (summary covers older turns)
1617
+ // - We can always re-summarize from originals if summary is stale
1618
+ // - Conversation chaining works naturally across turns
1619
+ //
1620
+ // Flow:
1621
+ // 1. Resolve best available summary (cached > persisted > seed)
1622
+ // 2. Calculate token budget available for recent messages
1623
+ // 3. Walk newest→oldest, build view of messages that fit
1624
+ // 4. Assemble: [system] + [summary] + [recent window]
1625
+ // 5. Fire background summary update for messages outside the window
1626
+
1627
+ const sumConfig = agentContext.summarizationConfig;
1628
+ const tokenCounter = agentContext.tokenCounter;
1629
+ const maxTokens = agentContext.maxContextTokens ?? 0;
1630
+
1631
+ // Step 1: Resolve best available summary
1632
+ let summary: string | undefined;
1633
+ let summarySource: string;
1634
+
1635
+ if (this._cachedRunSummary != null) {
1636
+ summary = this._cachedRunSummary;
1637
+ summarySource = 'cached';
1638
+ } else if (
1639
+ agentContext.persistedSummary != null &&
1640
+ agentContext.persistedSummary !== ''
1580
1641
  ) {
1581
- try {
1582
- let summary: string | undefined;
1583
- let summarySource: string;
1642
+ summary = agentContext.persistedSummary;
1643
+ this._cachedRunSummary = summary;
1644
+ summarySource = 'persisted';
1645
+ } else if (
1646
+ sumConfig?.initialSummary != null &&
1647
+ sumConfig.initialSummary !== ''
1648
+ ) {
1649
+ summary = sumConfig.initialSummary;
1650
+ this._cachedRunSummary = summary;
1651
+ summarySource = 'initial-seed';
1652
+ } else {
1653
+ summarySource = 'none';
1654
+ }
1584
1655
 
1585
- if (this._cachedRunSummary != null) {
1586
- summary = this._cachedRunSummary;
1587
- summarySource = 'cached';
1588
- } else if (
1589
- agentContext.persistedSummary != null &&
1590
- agentContext.persistedSummary !== ''
1591
- ) {
1592
- summary = agentContext.persistedSummary;
1593
- this._cachedRunSummary = summary;
1594
- summarySource = 'persisted';
1595
- } else if (
1596
- sumConfig?.initialSummary != null &&
1597
- sumConfig.initialSummary !== ''
1598
- ) {
1599
- // Cross-run seed: use initialSummary when no persisted summary exists
1600
- summary = sumConfig.initialSummary;
1601
- this._cachedRunSummary = summary;
1602
- summarySource = 'initial-seed';
1603
- } else {
1604
- summarySource = 'none';
1605
- }
1656
+ // Step 2: Calculate token budget
1657
+ // Apply EMA calibration for accuracy across iterations
1658
+ const calibratedMax = applyCalibration(maxTokens, this._pruneCalibration);
1659
+ const systemMsg = messages[0]?.getType() === 'system' ? messages[0] : null;
1660
+ const systemTokens = systemMsg != null
1661
+ ? (agentContext.indexTokenCountMap[0] ?? 0)
1662
+ : 0;
1663
+ const summaryMsg = summary != null && summary !== ''
1664
+ ? new SystemMessage(`[Conversation Summary]\n${summary}`)
1665
+ : null;
1666
+ const summaryTokens = summaryMsg != null && tokenCounter != null
1667
+ ? tokenCounter(summaryMsg)
1668
+ : 0;
1669
+
1670
+ // Budget for recent messages = total - system - summary - 3 (assistant priming)
1671
+ const recentBudget = calibratedMax - systemTokens - summaryTokens - 3;
1672
+
1673
+ // Step 3: Walk newest→oldest, collect messages that fit in the budget
1674
+ const contentStart = systemMsg != null ? 1 : 0;
1675
+ let usedTokens = 0;
1676
+ let windowStart = messages.length; // index where the recent window begins
1677
+
1678
+ for (let i = messages.length - 1; i >= contentStart; i--) {
1679
+ const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
1680
+ if (usedTokens + msgTokens > recentBudget) {
1681
+ break;
1682
+ }
1683
+ usedTokens += msgTokens;
1684
+ windowStart = i;
1685
+ }
1606
1686
 
1607
- // Single consolidated log for the entire prune+summarize decision
1608
- console.debug(
1609
- `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
1610
- );
1687
+ // Ensure we don't split tool-call / tool-result pairs.
1688
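+ // If windowStart lands on a ToolMessage, walk back to include its AI message. The tokens of each message pulled in are added to usedTokens without re-checking recentBudget, so the window may exceed the budget.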
1689
+ while (
1690
+ windowStart > contentStart &&
1691
+ messages[windowStart]?.getType() === 'tool'
1692
+ ) {
1693
+ windowStart--;
1694
+ usedTokens += agentContext.indexTokenCountMap[windowStart] ?? 0;
1695
+ }
1696
+
1697
+ const recentMessages = messages.slice(windowStart);
1698
+ const compactedMessages = messages.slice(contentStart, windowStart);
1699
+ const hasSummary = summaryMsg != null;
1700
+
1701
+ // Step 4: Assemble the windowed view
1702
+ // [system] + [summary (covers compacted messages)] + [recent window]
1703
+ const viewParts: BaseMessage[] = [];
1704
+ if (systemMsg != null) {
1705
+ viewParts.push(systemMsg);
1706
+ }
1707
+ if (summaryMsg != null) {
1708
+ viewParts.push(summaryMsg);
1709
+ }
1710
+ viewParts.push(...recentMessages);
1711
+ messagesToUse = viewParts;
1611
1712
 
1612
- // SCALE: Debounce background summarization — if a summary call is already
1613
- // in-flight (from a prior tool iteration), accumulate messages instead of
1614
- // firing another concurrent LLM call. At 2000 users with 3+ tool calls
1615
- // per turn, this prevents 3x summary call volume.
1713
+ console.debug(
1714
+ `[Graph:Compaction] View: ${messages.length}→${viewParts.length} msgs ` +
1715
+ `(${compactedMessages.length} behind summary, ${recentMessages.length} in window) | ` +
1716
+ `summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | ` +
1717
+ `budget=${recentBudget}/${calibratedMax} used=${usedTokens}`
1718
+ );
1719
+
1720
+ // Step 5: Fire background summary update (non-blocking)
1721
+ // Summarize messages outside the window so next iteration has a fresh summary.
1722
+ // Only trigger if there are compacted messages worth summarizing.
1723
+ if (
1724
+ compactedMessages.length > 0 &&
1725
+ agentContext.summarizeCallback
1726
+ ) {
1727
+ const shouldSummarize = this.shouldTriggerSummarization(
1728
+ compactedMessages.length,
1729
+ maxTokens,
1730
+ agentContext.indexTokenCountMap,
1731
+ agentContext.instructionTokens,
1732
+ sumConfig
1733
+ );
1734
+
1735
+ if (shouldSummarize) {
1616
1736
  if (this._summaryInFlight) {
1617
- this._pendingMessagesToRefine.push(...messagesToRefine);
1737
+ this._pendingMessagesToRefine.push(...compactedMessages);
1618
1738
  console.debug(
1619
- `[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`
1739
+ `[Graph:Compaction] Summary in-flight, queued ${compactedMessages.length} msgs (pending=${this._pendingMessagesToRefine.length})`
1620
1740
  );
1621
1741
  } else {
1622
1742
  this._summaryInFlight = true;
1623
1743
  const allMessages = this._pendingMessagesToRefine.length > 0
1624
- ? [...this._pendingMessagesToRefine, ...messagesToRefine]
1625
- : messagesToRefine;
1744
+ ? [...this._pendingMessagesToRefine, ...compactedMessages]
1745
+ : compactedMessages;
1626
1746
  this._pendingMessagesToRefine = [];
1627
1747
 
1628
1748
  agentContext
@@ -1634,7 +1754,7 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1634
1754
  })
1635
1755
  .catch((err) => {
1636
1756
  console.error(
1637
- '[Graph] Background summary failed (non-fatal):',
1757
+ '[Graph:Compaction] Background summary update failed (non-fatal):',
1638
1758
  err
1639
1759
  );
1640
1760
  })
@@ -1642,44 +1762,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1642
1762
  this._summaryInFlight = false;
1643
1763
  });
1644
1764
  }
1645
-
1646
- if (summary != null && summary !== '') {
1647
- hasSummary = true;
1648
- const summaryMsg = new SystemMessage(
1649
- `[Conversation Summary]\n${summary}`
1650
- );
1651
- const systemIdx =
1652
- messagesToUse[0]?.getType() === 'system' ? 1 : 0;
1653
- messagesToUse = [
1654
- ...messagesToUse.slice(0, systemIdx),
1655
- summaryMsg,
1656
- ...messagesToUse.slice(systemIdx),
1657
- ];
1658
- }
1659
- } catch (err) {
1660
- console.error('[Graph] Summarization failed:', err);
1661
1765
  }
1662
- } else if (messagesToRefine.length > 0) {
1663
- // Log pruning even when no summarize callback (discard mode)
1664
- console.debug(
1665
- `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`
1666
- );
1667
- }
1668
-
1669
- // Deduplicate system messages that accumulate from repeated tool iterations
1670
- const { messages: dedupedMessages, removedCount } =
1671
- deduplicateSystemMessages(messagesToUse);
1672
- if (removedCount > 0) {
1673
- messagesToUse = dedupedMessages;
1674
- console.debug(
1675
- `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
1676
- );
1677
1766
  }
1678
1767
 
1679
- // Post-prune context note for task-tool-enabled agents
1680
- if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
1768
+ // Post-compaction context note for task-tool-enabled agents
1769
+ if (compactedMessages.length > 0 && hasTaskTool(agentContext.tools)) {
1681
1770
  const postPruneNote = buildPostPruneNote(
1682
- messagesToRefine.length,
1771
+ compactedMessages.length,
1683
1772
  hasSummary
1684
1773
  );
1685
1774
  if (postPruneNote) {
@@ -1691,6 +1780,18 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
1691
1780
  }
1692
1781
  }
1693
1782
 
1783
+ // Deduplicate system messages — ALWAYS runs, not just during compaction.
1784
+ // Duplicate system messages accumulate from repeated tool iterations,
1785
+ // summary injections, and context notes across turns.
1786
+ const { messages: dedupedMessages, removedCount } =
1787
+ deduplicateSystemMessages(messagesToUse);
1788
+ if (removedCount > 0) {
1789
+ messagesToUse = dedupedMessages;
1790
+ console.debug(
1791
+ `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
1792
+ );
1793
+ }
1794
+
1694
1795
  let finalMessages = messagesToUse;
1695
1796
  if (agentContext.useLegacyContent) {
1696
1797
  finalMessages = formatContentStrings(finalMessages);