@illuma-ai/agents 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/esm/main.mjs CHANGED
@@ -26,7 +26,7 @@ export { createSearchTool } from './tools/search/tool.mjs';
  export { DATE_RANGE, DEFAULT_COUNTRY_DESCRIPTION, DEFAULT_QUERY_DESCRIPTION, WebSearchToolDefinition, WebSearchToolDescription, WebSearchToolName, WebSearchToolSchema, countrySchema, dateSchema, imagesSchema, newsSchema, querySchema, videosSchema } from './tools/search/schema.mjs';
  export { createValidationErrorMessage, isValidJsonSchema, normalizeJsonSchema, prepareSchemaForProvider, validateStructuredOutput, zodToJsonSchema } from './schemas/validate.mjs';
  export { Callback, CommonEvents, Constants, ContentTypes, EdgeType, EnvVar, FinishReasons, GraphEvents, GraphNodeActions, GraphNodeKeys, MessageTypes, Providers, StepTypes, TitleMethod, ToolCallTypes } from './common/enum.mjs';
- export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PROACTIVE_SUMMARY_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
+ export { COMPACTION_RECENT_ROUNDS, CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PROACTIVE_SUMMARY_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
  export { joinKeys, resetIfNotEmpty } from './utils/graph.mjs';
  export { isGoogleLike, isOpenAILike } from './utils/llm.mjs';
  export { isPresent, unescapeObject } from './utils/misc.mjs';
@@ -46,6 +46,15 @@ export declare const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
  * 100% → graceful: use existing summary + recent messages, never block
  */
  export declare const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
+ /**
+ * Number of recent conversation rounds (human+AI pairs) to keep in the
+ * windowed view when a summary is available. Everything older is covered
+ * by the summary. 2 rounds = last 2 user questions + 2 AI responses.
+ *
+ * This prevents wasting tokens on raw messages the summary already covers
+ * and keeps context tight for the LLM.
+ */
+ export declare const COMPACTION_RECENT_ROUNDS = 2;
  /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
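
For reference, a minimal TypeScript sketch of how a "round" is counted under the new constant, assuming a simplified message shape; the Msg type and recentWindowStart helper below are illustrative only and are not part of the package's API.

// Hypothetical, simplified message shape (illustration only).
type Msg = { role: 'system' | 'human' | 'ai' | 'tool'; text: string };

const COMPACTION_RECENT_ROUNDS = 2;

// Walk newest→oldest and return the index where the "recent window" starts,
// keeping the last N rounds (a round begins at a human message).
function recentWindowStart(messages: Msg[], rounds = COMPACTION_RECENT_ROUNDS): number {
  let seen = 0;
  for (let i = messages.length - 1; i >= 0; i--) {
    if (messages[i].role === 'human' && ++seen >= rounds) {
      return i;
    }
  }
  return 0; // fewer than N rounds in history: keep everything
}

// With three rounds of history, only the last two stay in the window;
// the first round is represented by the summary instead.
const history: Msg[] = [
  { role: 'human', text: 'Q1' }, { role: 'ai', text: 'A1' },
  { role: 'human', text: 'Q2' }, { role: 'ai', text: 'A2' },
  { role: 'human', text: 'Q3' }, { role: 'ai', text: 'A3' },
];
console.log(recentWindowStart(history)); // → 2 (Q2 onward stays in the window)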
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@illuma-ai/agents",
- "version": "1.1.2",
+ "version": "1.1.4",
  "main": "./dist/cjs/main.cjs",
  "module": "./dist/esm/main.mjs",
  "types": "./dist/types/index.d.ts",
@@ -71,6 +71,16 @@ export const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
  */
  export const PROACTIVE_SUMMARY_THRESHOLD = 0.8;

+ /**
+ * Number of recent conversation rounds (human+AI pairs) to keep in the
+ * windowed view when a summary is available. Everything older is covered
+ * by the summary. 2 rounds = last 2 user questions + 2 AI responses.
+ *
+ * This prevents wasting tokens on raw messages the summary already covers
+ * and keeps context tight for the LLM.
+ */
+ export const COMPACTION_RECENT_ROUNDS = 2;
+
  /**
  * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
  * 0.3 means 30% of the context budget is reserved for the most recent messages,
@@ -56,6 +56,7 @@ import {
  TOOL_TURN_THINKING_BUDGET,
  SUMMARIZATION_CONTEXT_THRESHOLD,
  PROACTIVE_SUMMARY_THRESHOLD,
+ COMPACTION_RECENT_ROUNDS,
  } from '@/common';
  import {
  ToolDiscoveryCache,
@@ -1606,86 +1607,196 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
  }

  if (agentContext.pruneMessages) {
- const { context, indexTokenCountMap, messagesToRefine } =
- agentContext.pruneMessages({
- messages,
- usageMetadata: agentContext.currentUsage,
- });
- agentContext.indexTokenCountMap = indexTokenCountMap;
- messagesToUse = context;
-
- // ── Non-blocking summarization ──────────────────────────────────
- // NEVER block the LLM call waiting for summarization. Instead:
- // 1. If _cachedRunSummary exists → use it, fire async update
- // 2. If persistedSummary exists → use it as fallback, fire async update
- // 3. If NOTHING exists (first-ever prune) → skip summary, fire async generation
- // The summary catches up asynchronously and is available for subsequent
- // iterations (tool calls) and the next conversation turn.
+ // ── Context Compaction (Copilot-style: never delete messages) ─────
  //
- // SummarizationConfig integration:
- // - triggerType/triggerThreshold control WHEN summarization fires
- // - reserveRatio is enforced via calibrated maxTokens (above)
- // - initialSummary provides cross-run seeding as fallback before persistedSummary
- let hasSummary = false;
- const sumConfig = agentContext.summarizationConfig;
- const shouldSummarize = this.shouldTriggerSummarization(
- messagesToRefine.length,
- agentContext.maxContextTokens ?? 0,
- agentContext.indexTokenCountMap,
- agentContext.instructionTokens,
- sumConfig
- );
+ // DESIGN: Original messages are NEVER removed from the array.
+ // Instead, we build a "windowed view" for the LLM:
+ // [system prompt] + [summary of older turns] + [recent turns that fit]
+ //
+ // This ensures:
+ // - No context is ever lost (summary covers older turns)
+ // - We can always re-summarize from originals if summary is stale
+ // - Conversation chaining works naturally across turns
+ //
+ // Flow:
+ // 1. Resolve best available summary (cached > persisted > seed)
+ // 2. Calculate token budget available for recent messages
+ // 3. Walk newest→oldest, build view of messages that fit
+ // 4. Assemble: [system] + [summary] + [recent window]
+ // 5. Fire background summary update for messages outside the window

- if (
- messagesToRefine.length > 0 &&
- agentContext.summarizeCallback &&
- shouldSummarize
+ const sumConfig = agentContext.summarizationConfig;
+ const tokenCounter = agentContext.tokenCounter;
+ const maxTokens = agentContext.maxContextTokens ?? 0;
+
+ // Step 1: Resolve best available summary
+ let summary: string | undefined;
+ let summarySource: string;
+
+ if (this._cachedRunSummary != null) {
+ summary = this._cachedRunSummary;
+ summarySource = 'cached';
+ } else if (
+ agentContext.persistedSummary != null &&
+ agentContext.persistedSummary !== ''
  ) {
- try {
- let summary: string | undefined;
- let summarySource: string;
+ summary = agentContext.persistedSummary;
+ this._cachedRunSummary = summary;
+ summarySource = 'persisted';
+ } else if (
+ sumConfig?.initialSummary != null &&
+ sumConfig.initialSummary !== ''
+ ) {
+ summary = sumConfig.initialSummary;
+ this._cachedRunSummary = summary;
+ summarySource = 'initial-seed';
+ } else {
+ summarySource = 'none';
+ }

- if (this._cachedRunSummary != null) {
- summary = this._cachedRunSummary;
- summarySource = 'cached';
- } else if (
- agentContext.persistedSummary != null &&
- agentContext.persistedSummary !== ''
- ) {
- summary = agentContext.persistedSummary;
- this._cachedRunSummary = summary;
- summarySource = 'persisted';
- } else if (
- sumConfig?.initialSummary != null &&
- sumConfig.initialSummary !== ''
- ) {
- // Cross-run seed: use initialSummary when no persisted summary exists
- summary = sumConfig.initialSummary;
- this._cachedRunSummary = summary;
- summarySource = 'initial-seed';
- } else {
- summarySource = 'none';
+ // Step 2: Calculate token budget
+ // Apply EMA calibration for accuracy across iterations
+ const calibratedMax = applyCalibration(maxTokens, this._pruneCalibration);
+ const systemMsg = messages[0]?.getType() === 'system' ? messages[0] : null;
+ const systemTokens = systemMsg != null
+ ? (agentContext.indexTokenCountMap[0] ?? 0)
+ : 0;
+ const summaryMsg = summary != null && summary !== ''
+ ? new SystemMessage(`[Conversation Summary]\n${summary}`)
+ : null;
+ const summaryTokens = summaryMsg != null && tokenCounter != null
+ ? tokenCounter(summaryMsg)
+ : 0;
+
+ // Budget for recent messages = total - system - summary - 3 (assistant priming)
+ const recentBudget = calibratedMax - systemTokens - summaryTokens - 3;
+
+ // Step 3: Determine window of recent messages to include.
+ //
+ // Two modes:
+ // A) No summary available → fill the budget (all messages that fit)
+ // B) Summary available → keep last 2 conversation rounds (H+A pairs)
+ // + any trailing tool messages. The summary covers everything else.
+ // This avoids wasting tokens on raw messages the summary already covers.
+ //
+ // A "round" = one human message + one AI response (+ any tool messages between).
+ const contentStart = systemMsg != null ? 1 : 0;
+ let usedTokens = 0;
+ let windowStart = messages.length; // index where the recent window begins
+
+ if (summary == null || summary === '') {
+ // Mode A: No summary — include as many recent messages as fit in budget
+ for (let i = messages.length - 1; i >= contentStart; i--) {
+ const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+ if (usedTokens + msgTokens > recentBudget) {
+ break;
  }
+ usedTokens += msgTokens;
+ windowStart = i;
+ }
+ } else {
+ // Mode B: Summary exists — keep last 2 rounds (4 core messages: H+A+H+A)
+ // Walk backward counting human messages as round boundaries.
+ const MAX_RECENT_ROUNDS = COMPACTION_RECENT_ROUNDS;
+ let roundsSeen = 0;
+ for (let i = messages.length - 1; i >= contentStart; i--) {
+ const msgType = messages[i]?.getType();
+ const msgTokens = agentContext.indexTokenCountMap[i] ?? 0;
+
+ // Budget guard — even in round-limited mode, don't exceed budget
+ if (usedTokens + msgTokens > recentBudget) {
+ break;
+ }
+ usedTokens += msgTokens;
+ windowStart = i;
+
+ // Count a human message as a round boundary
+ if (msgType === 'human') {
+ roundsSeen++;
+ if (roundsSeen >= MAX_RECENT_ROUNDS) {
+ break;
+ }
+ }
+ }
+ }

- // Single consolidated log for the entire prune+summarize decision
- console.debug(
- `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
- );
+ // Ensure we don't split tool-call / tool-result pairs.
+ // If windowStart lands on a ToolMessage, walk back to include its AI message.
+ while (
+ windowStart > contentStart &&
+ messages[windowStart]?.getType() === 'tool'
+ ) {
+ windowStart--;
+ usedTokens += agentContext.indexTokenCountMap[windowStart] ?? 0;
+ }
+
+ const recentMessages = messages.slice(windowStart);
+ const compactedMessages = messages.slice(contentStart, windowStart);
+ const hasSummary = summaryMsg != null;
+
+ // Step 4: Assemble the windowed view
+ // [system] + [summary (covers compacted messages)] + [recent window]
+ const viewParts: BaseMessage[] = [];
+ if (systemMsg != null) {
+ viewParts.push(systemMsg);
+ }
+ if (summaryMsg != null) {
+ viewParts.push(summaryMsg);
+ }
+ viewParts.push(...recentMessages);
+ messagesToUse = viewParts;
+
+ // Rebuild indexTokenCountMap for the windowed view so downstream
+ // analytics and summarization triggers see accurate token counts.
+ const viewTokenMap: Record<string, number | undefined> = {};
+ let viewIdx = 0;
+ if (systemMsg != null) {
+ viewTokenMap[viewIdx] = systemTokens;
+ viewIdx++;
+ }
+ if (summaryMsg != null) {
+ viewTokenMap[viewIdx] = summaryTokens;
+ viewIdx++;
+ }
+ for (let i = windowStart; i < messages.length; i++) {
+ viewTokenMap[viewIdx] = agentContext.indexTokenCountMap[i];
+ viewIdx++;
+ }
+ agentContext.indexTokenCountMap = viewTokenMap;
+
+ console.debug(
+ `[Graph:Compaction] View: ${messages.length}→${viewParts.length} msgs ` +
+ `(${compactedMessages.length} behind summary, ${recentMessages.length} in window) | ` +
+ `summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | ` +
+ `budget=${recentBudget}/${calibratedMax} used=${usedTokens}`
+ );

- // SCALE: Debounce background summarization — if a summary call is already
- // in-flight (from a prior tool iteration), accumulate messages instead of
- // firing another concurrent LLM call. At 2000 users with 3+ tool calls
- // per turn, this prevents 3x summary call volume.
+ // Step 5: Fire background summary update (non-blocking)
+ // Summarize messages outside the window so next iteration has a fresh summary.
+ // Only trigger if there are compacted messages worth summarizing.
+ if (
+ compactedMessages.length > 0 &&
+ agentContext.summarizeCallback
+ ) {
+ const shouldSummarize = this.shouldTriggerSummarization(
+ compactedMessages.length,
+ maxTokens,
+ agentContext.indexTokenCountMap,
+ agentContext.instructionTokens,
+ sumConfig
+ );
+
+ if (shouldSummarize) {
  if (this._summaryInFlight) {
- this._pendingMessagesToRefine.push(...messagesToRefine);
+ this._pendingMessagesToRefine.push(...compactedMessages);
  console.debug(
- `[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`
+ `[Graph:Compaction] Summary in-flight, queued ${compactedMessages.length} msgs (pending=${this._pendingMessagesToRefine.length})`
  );
  } else {
  this._summaryInFlight = true;
  const allMessages = this._pendingMessagesToRefine.length > 0
- ? [...this._pendingMessagesToRefine, ...messagesToRefine]
- : messagesToRefine;
+ ? [...this._pendingMessagesToRefine, ...compactedMessages]
+ : compactedMessages;
  this._pendingMessagesToRefine = [];

  agentContext
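
For readers following the hunk above, here is a condensed, standalone sketch of the same windowed-view flow in plain TypeScript. The ViewMsg and CompactionInput shapes are assumptions made for illustration, not the package's real types; the shipped implementation additionally applies EMA calibration, rebuilds indexTokenCountMap for the view, and fires the non-blocking background summary update.

// Standalone sketch of the compaction flow above (simplified assumptions).
type ViewMsg = { type: 'system' | 'human' | 'ai' | 'tool'; content: string };

interface CompactionInput {
  messages: ViewMsg[];      // full history; never mutated
  tokens: number[];         // per-message token counts, same indices as messages
  maxTokens: number;        // calibrated context budget
  summary?: string;         // best available summary, if any
  summaryTokens?: number;   // token cost of the injected summary message
  recentRounds?: number;    // defaults to COMPACTION_RECENT_ROUNDS (2)
}

function buildWindowedView(input: CompactionInput): ViewMsg[] {
  const { messages, tokens, maxTokens, summary, summaryTokens = 0, recentRounds = 2 } = input;
  const systemMsg = messages[0]?.type === 'system' ? messages[0] : null;
  const contentStart = systemMsg ? 1 : 0;
  const systemTokens = systemMsg ? tokens[0] : 0;
  // Budget left for raw recent messages (3-token priming allowance, as in the hunk).
  const budget = maxTokens - systemTokens - (summary ? summaryTokens : 0) - 3;

  let used = 0;
  let windowStart = messages.length;
  let rounds = 0;
  for (let i = messages.length - 1; i >= contentStart; i--) {
    if (used + tokens[i] > budget) break;  // budget guard applies in both modes
    used += tokens[i];
    windowStart = i;
    // Mode B only: with a summary, stop after the last `recentRounds` human messages.
    if (summary && messages[i].type === 'human' && ++rounds >= recentRounds) break;
  }
  // Never split a tool result from the AI message that requested it.
  while (windowStart > contentStart && messages[windowStart].type === 'tool') {
    windowStart--;
  }

  const view: ViewMsg[] = [];
  if (systemMsg) view.push(systemMsg);
  if (summary) view.push({ type: 'system', content: `[Conversation Summary]\n${summary}` });
  view.push(...messages.slice(windowStart));
  return view;
}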
@@ -1697,7 +1808,7 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
  })
  .catch((err) => {
  console.error(
- '[Graph] Background summary failed (non-fatal):',
+ '[Graph:Compaction] Background summary update failed (non-fatal):',
  err
  );
  })
@@ -1705,44 +1816,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
  this._summaryInFlight = false;
  });
  }
-
- if (summary != null && summary !== '') {
- hasSummary = true;
- const summaryMsg = new SystemMessage(
- `[Conversation Summary]\n${summary}`
- );
- const systemIdx =
- messagesToUse[0]?.getType() === 'system' ? 1 : 0;
- messagesToUse = [
- ...messagesToUse.slice(0, systemIdx),
- summaryMsg,
- ...messagesToUse.slice(systemIdx),
- ];
- }
- } catch (err) {
- console.error('[Graph] Summarization failed:', err);
  }
- } else if (messagesToRefine.length > 0) {
- // Log pruning even when no summarize callback (discard mode)
- console.debug(
- `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded, no summary callback) | calibration=${this._pruneCalibration.ratio.toFixed(3)}`
- );
  }

- // Deduplicate system messages that accumulate from repeated tool iterations
- const { messages: dedupedMessages, removedCount } =
- deduplicateSystemMessages(messagesToUse);
- if (removedCount > 0) {
- messagesToUse = dedupedMessages;
- console.debug(
- `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
- );
- }
-
- // Post-prune context note for task-tool-enabled agents
- if (messagesToRefine.length > 0 && hasTaskTool(agentContext.tools)) {
+ // Post-compaction context note for task-tool-enabled agents
+ if (compactedMessages.length > 0 && hasTaskTool(agentContext.tools)) {
  const postPruneNote = buildPostPruneNote(
- messagesToRefine.length,
+ compactedMessages.length,
  hasSummary
  );
  if (postPruneNote) {
@@ -1754,6 +1834,18 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
  }
  }

+ // Deduplicate system messages — ALWAYS runs, not just during compaction.
+ // Duplicate system messages accumulate from repeated tool iterations,
+ // summary injections, and context notes across turns.
+ const { messages: dedupedMessages, removedCount } =
+ deduplicateSystemMessages(messagesToUse);
+ if (removedCount > 0) {
+ messagesToUse = dedupedMessages;
+ console.debug(
+ `[Graph:Dedup] Removed ${removedCount} duplicate system message(s)`
+ );
+ }
+
  let finalMessages = messagesToUse;
  if (agentContext.useLegacyContent) {
  finalMessages = formatContentStrings(finalMessages);
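
The deduplication pass above now runs on every turn rather than only when compaction fires. The package's deduplicateSystemMessages implementation is not shown in this diff; the sketch below is a plausible minimal form, assuming duplicates are detected by exact content match, which may differ from the real helper.

// Minimal sketch of a system-message dedup pass (assumed exact-content match).
function dedupSystemMessages<T extends { type: string; content: string }>(
  messages: T[]
): { messages: T[]; removedCount: number } {
  const seen = new Set<string>();
  const kept: T[] = [];
  let removedCount = 0;
  for (const msg of messages) {
    if (msg.type === 'system') {
      if (seen.has(msg.content)) {
        removedCount++; // drop the later duplicate, keep the first occurrence
        continue;
      }
      seen.add(msg.content);
    }
    kept.push(msg);
  }
  return { messages: kept, removedCount };
}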