@illuma-ai/agents 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/esm/main.mjs CHANGED
@@ -26,7 +26,7 @@ export { createSearchTool } from './tools/search/tool.mjs';
  export { DATE_RANGE, DEFAULT_COUNTRY_DESCRIPTION, DEFAULT_QUERY_DESCRIPTION, WebSearchToolDefinition, WebSearchToolDescription, WebSearchToolName, WebSearchToolSchema, countrySchema, dateSchema, imagesSchema, newsSchema, querySchema, videosSchema } from './tools/search/schema.mjs';
  export { createValidationErrorMessage, isValidJsonSchema, normalizeJsonSchema, prepareSchemaForProvider, validateStructuredOutput, zodToJsonSchema } from './schemas/validate.mjs';
  export { Callback, CommonEvents, Constants, ContentTypes, EdgeType, EnvVar, FinishReasons, GraphEvents, GraphNodeActions, GraphNodeKeys, MessageTypes, Providers, StepTypes, TitleMethod, ToolCallTypes } from './common/enum.mjs';
- export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
+ export { CONTEXT_SAFETY_BUFFER, DEDUP_MAX_CONTENT_LENGTH, MIN_THINKING_BUDGET, MULTI_DOCUMENT_THRESHOLD, PROACTIVE_SUMMARY_THRESHOLD, PRUNING_EMA_ALPHA, PRUNING_INITIAL_CALIBRATION, SUMMARIZATION_CONTEXT_THRESHOLD, SUMMARIZATION_RESERVE_RATIO, TOOL_DISCOVERY_CACHE_MAX_SIZE, TOOL_TURN_THINKING_BUDGET } from './common/constants.mjs';
  export { joinKeys, resetIfNotEmpty } from './utils/graph.mjs';
  export { isGoogleLike, isOpenAILike } from './utils/llm.mjs';
  export { isPresent, unescapeObject } from './utils/misc.mjs';
@@ -35,6 +35,17 @@ export declare const CONTEXT_SAFETY_BUFFER = 0.9;
   * When the context window is ≥80% full, pruning + summarization activates.
   */
  export declare const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
+ /**
+  * Proactive summarization threshold (0-1 fraction of context window).
+  * At this utilization level, background summarization fires BEFORE pruning is needed.
+  * This gives the summary time to complete so it's ready when context actually fills up.
+  *
+  * Inspired by VS Code Copilot Chat's 3-tier strategy:
+  *   80% → proactive background summary
+  *   90% → pruning kicks in (with summary already cached)
+  *  100% → graceful: use existing summary + recent messages, never block
+  */
+ export declare const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
  /**
   * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
   * 0.3 means 30% of the context budget is reserved for the most recent messages,
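
The three tiers in the doc comment above map directly onto constants this package exports. A minimal sketch of that mapping, assuming only the two threshold values visible in this diff (the classifyUtilization helper and the tier names are illustrative, not part of the package):

    import {
      CONTEXT_SAFETY_BUFFER,        // 0.9, per the hunk header above
      PROACTIVE_SUMMARY_THRESHOLD,  // 0.8, added in 1.1.2
    } from '@illuma-ai/agents';

    type ContextTier = 'normal' | 'proactive-summary' | 'prune' | 'graceful';

    // Hypothetical helper: classifies a 0-1 context-utilization fraction
    // into the 3-tier strategy the doc comment describes.
    function classifyUtilization(utilization: number): ContextTier {
      if (utilization >= 1.0) return 'graceful'; // 100%: reuse cached summary, never block
      if (utilization >= CONTEXT_SAFETY_BUFFER) return 'prune'; // 90%: pruning kicks in
      if (utilization >= PROACTIVE_SUMMARY_THRESHOLD) return 'proactive-summary'; // 80%: background summary
      return 'normal';
    }
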
@@ -80,6 +80,15 @@ export declare class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode>
  private _pruneCalibration;
  /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
  private _toolDiscoveryCache;
+ /**
+  * SCALE: Tracks whether a summary call is already in-flight for this Graph instance.
+  * Prevents multiple concurrent summary LLM calls when rapid tool iterations each
+  * trigger pruning. At 2000 users with 3+ tool calls per turn, this reduces
+  * 6000+ summary calls per turn to 2000.
+  */
+ private _summaryInFlight;
+ /** Messages accumulated across tool iterations while a summary call is in-flight */
+ private _pendingMessagesToRefine;
  /** Map of agent contexts by agent ID */
  agentContexts: Map<string, AgentContext>;
  /** Default agent ID to use */
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@illuma-ai/agents",
-   "version": "1.1.0",
+   "version": "1.1.2",
    "main": "./dist/cjs/main.cjs",
    "module": "./dist/esm/main.mjs",
    "types": "./dist/types/index.d.ts",
@@ -59,6 +59,18 @@ export const CONTEXT_SAFETY_BUFFER = 0.9;
   */
  export const SUMMARIZATION_CONTEXT_THRESHOLD = 80;
 
+ /**
+  * Proactive summarization threshold (0-1 fraction of context window).
+  * At this utilization level, background summarization fires BEFORE pruning is needed.
+  * This gives the summary time to complete so it's ready when context actually fills up.
+  *
+  * Inspired by VS Code Copilot Chat's 3-tier strategy:
+  *   80% → proactive background summary
+  *   90% → pruning kicks in (with summary already cached)
+  *  100% → graceful: use existing summary + recent messages, never block
+  */
+ export const PROACTIVE_SUMMARY_THRESHOLD = 0.8;
+
  /**
   * Default reserve ratio (0-1) — fraction of context window to preserve as recent messages.
   * 0.3 means 30% of the context budget is reserved for the most recent messages,
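
Note the mixed units among these constants: SUMMARIZATION_CONTEXT_THRESHOLD is a percentage (80), while PROACTIVE_SUMMARY_THRESHOLD is a 0-1 fraction (0.8). The Graph.ts hunk further down bridges the two at the comparison site, which also implies that a configured summarizationConfig.triggerThreshold is expressed in percent:

    // Quoted from the Graph.ts hunk below: getContextUtilization returns a
    // percentage, so the fractional constant is scaled by 100 before comparing.
    const threshold =
      agentContext.summarizationConfig?.triggerThreshold ??
      PROACTIVE_SUMMARY_THRESHOLD * 100; // 0.8 becomes 80
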
@@ -35,6 +35,7 @@ import {
    formatAnthropicArtifactContent,
    ensureThinkingBlockInMessages,
    deduplicateSystemMessages,
+   getContextUtilization,
    convertMessagesToContent,
    addBedrockCacheControl,
    modifyDeltaProperties,
@@ -54,6 +55,7 @@ import {
    Constants,
    TOOL_TURN_THINKING_BUDGET,
    SUMMARIZATION_CONTEXT_THRESHOLD,
+   PROACTIVE_SUMMARY_THRESHOLD,
  } from '@/common';
  import {
    ToolDiscoveryCache,
@@ -218,6 +220,15 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
  private _pruneCalibration: PruneCalibrationState;
  /** Run-scoped tool discovery cache — avoids re-parsing conversation history on every iteration */
  private _toolDiscoveryCache: ToolDiscoveryCache;
+ /**
+  * SCALE: Tracks whether a summary call is already in-flight for this Graph instance.
+  * Prevents multiple concurrent summary LLM calls when rapid tool iterations each
+  * trigger pruning. At 2000 users with 3+ tool calls per turn, this reduces
+  * 6000+ summary calls per turn to 2000.
+  */
+ private _summaryInFlight: boolean = false;
+ /** Messages accumulated across tool iterations while a summary call is in-flight */
+ private _pendingMessagesToRefine: BaseMessage[] = [];
  /** Map of agent contexts by agent ID */
  agentContexts: Map<string, AgentContext> = new Map();
  /** Default agent ID to use */
@@ -301,9 +312,11 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
      new Map()
    );
    this.invokedToolIds = resetIfNotEmpty(this.invokedToolIds, undefined);
-   // Reset EMA calibration and tool discovery cache for fresh run
+   // Reset EMA calibration, tool discovery cache, and summary debounce for fresh run
    this._pruneCalibration = createPruneCalibration();
    this._toolDiscoveryCache.reset();
+   this._summaryInFlight = false;
+   this._pendingMessagesToRefine = [];
    for (const context of this.agentContexts.values()) {
      context.reset();
    }
@@ -1531,6 +1544,67 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
      }
    }
 
+   // ── Proactive summarization at context pressure ───────────────────
+   // Inspired by VS Code Copilot Chat's 3-tier strategy:
+   //   80% → fire proactive background summary (BEFORE pruning needed)
+   //   90% → pruning kicks in (summary already cached from 80% trigger)
+   //  100% → graceful: use existing summary + recent messages, NEVER block
+   //
+   // This ensures the summary is READY by the time pruning actually occurs,
+   // so the user never waits and never sees a context cliff.
+   if (
+     agentContext.maxContextTokens != null &&
+     agentContext.maxContextTokens > 0 &&
+     agentContext.summarizeCallback &&
+     !this._summaryInFlight &&
+     !this._cachedRunSummary
+   ) {
+     const utilization = getContextUtilization(
+       agentContext.indexTokenCountMap,
+       agentContext.instructionTokens,
+       agentContext.maxContextTokens
+     );
+     const threshold = (agentContext.summarizationConfig?.triggerThreshold ?? PROACTIVE_SUMMARY_THRESHOLD * 100);
+
+     if (utilization >= threshold) {
+       // Identify older messages to summarize proactively.
+       // Keep the last N messages (recent turns) intact — only summarize older history.
+       // This is incremental: the callback checks for existing summary and updates it.
+       const recentTurnCount = Math.max(4, Math.floor(messages.length * 0.3));
+       const oldMessages = messages.slice(
+         messages[0]?.getType() === 'system' ? 1 : 0,
+         Math.max(1, messages.length - recentTurnCount)
+       );
+
+       if (oldMessages.length > 0) {
+         this._summaryInFlight = true;
+         console.debug(
+           `[Graph:ProactiveSummary] Context at ${utilization.toFixed(1)}% (threshold ${threshold}%) — summarizing ${oldMessages.length} older msgs in background`
+         );
+
+         agentContext
+           .summarizeCallback(oldMessages)
+           .then((updated) => {
+             if (updated != null && updated !== '') {
+               this._cachedRunSummary = updated;
+               console.debug(
+                 `[Graph:ProactiveSummary] Background summary ready (len=${updated.length})`
+               );
+             }
+           })
+           .catch((err) => {
+             console.error(
+               '[Graph:ProactiveSummary] Background summary failed (non-fatal):',
+               err
+             );
+           })
+           .finally(() => {
+             this._summaryInFlight = false;
+           });
+       }
+     }
+   }
+
    if (agentContext.pruneMessages) {
      const { context, indexTokenCountMap, messagesToRefine } =
        agentContext.pruneMessages({
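
getContextUtilization is newly imported from '@/messages/prune', but its body is not part of this diff. Judging from the call site above and the values the tests below compare it against (80, 100), a plausible reconstruction follows; this is an assumption for illustration, not the package's actual implementation:

    // Hypothetical reconstruction: sums per-message token counts plus
    // instruction tokens, then expresses the total as a percentage of
    // the context window. The real code lives in '@/messages/prune'.
    function getContextUtilization(
      indexTokenCountMap: Record<string, number | undefined>,
      instructionTokens: number,
      maxContextTokens: number
    ): number {
      let total = instructionTokens;
      for (const count of Object.values(indexTokenCountMap)) {
        total += count ?? 0;
      }
      return (total / maxContextTokens) * 100;
    }
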
@@ -1598,20 +1672,39 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
      `[Graph:ContextMgmt] Pruned ${messages.length}→${context.length} msgs (${messagesToRefine.length} discarded) | summary=${summarySource}${summary ? ` (len=${summary.length})` : ''} | calibration=${this._pruneCalibration.ratio.toFixed(3)}(${this._pruneCalibration.iterations})`
    );
 
-   // Fire background summarization — updates cache for next iteration/turn
-   agentContext
-     .summarizeCallback(messagesToRefine)
-     .then((updated) => {
-       if (updated != null && updated !== '') {
-         this._cachedRunSummary = updated;
-       }
-     })
-     .catch((err) => {
-       console.error(
-         '[Graph] Background summary failed (non-fatal):',
-         err
-       );
-     });
+   // SCALE: Debounce background summarization — if a summary call is already
+   // in-flight (from a prior tool iteration), accumulate messages instead of
+   // firing another concurrent LLM call. At 2000 users with 3+ tool calls
+   // per turn, this prevents 3x summary call volume.
+   if (this._summaryInFlight) {
+     this._pendingMessagesToRefine.push(...messagesToRefine);
+     console.debug(
+       `[Graph:ContextMgmt] Summary in-flight, queued ${messagesToRefine.length} msgs (pending=${this._pendingMessagesToRefine.length})`
+     );
+   } else {
+     this._summaryInFlight = true;
+     const allMessages = this._pendingMessagesToRefine.length > 0
+       ? [...this._pendingMessagesToRefine, ...messagesToRefine]
+       : messagesToRefine;
+     this._pendingMessagesToRefine = [];
+
+     agentContext
+       .summarizeCallback(allMessages)
+       .then((updated) => {
+         if (updated != null && updated !== '') {
+           this._cachedRunSummary = updated;
+         }
+       })
+       .catch((err) => {
+         console.error(
+           '[Graph] Background summary failed (non-fatal):',
+           err
+         );
+       })
+       .finally(() => {
+         this._summaryInFlight = false;
+       });
+   }
 
    if (summary != null && summary !== '') {
      hasSummary = true;
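
Stripped of Graph internals, the debounce-and-accumulate logic above reduces to a small reusable shape. A standalone sketch with simplified names (summarize stands in for summarizeCallback; none of these identifiers exist in the package):

    // Minimal sketch of the in-flight debounce used above. Messages that
    // arrive while a summary call is running are queued, not dropped, and
    // are folded into the batch the next time a trigger fires.
    class SummaryDebouncer {
      private inFlight = false;
      private pending: string[] = [];

      constructor(private summarize: (msgs: string[]) => Promise<string>) {}

      request(msgs: string[]): void {
        if (this.inFlight) {
          this.pending.push(...msgs); // queue for the next trigger
          return;
        }
        this.inFlight = true;
        const batch = [...this.pending, ...msgs];
        this.pending = [];
        this.summarize(batch)
          .catch(() => {
            // non-fatal, mirroring the Graph's error handling
          })
          .finally(() => {
            this.inFlight = false;
          });
      }
    }

One consequence of this design, in the sketch and in the Graph code alike: messages queued during a flight are only summarized when a later trigger fires, so if no further pruning happens that turn they carry over until the per-run reset shown earlier clears them.
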
@@ -518,3 +518,116 @@ describe('All Features Combined — Full Pipeline', () => {
      expect(callback).toHaveBeenCalled();
    });
  });
+
+ // ===========================================================================
+ // Proactive Summarization — Context Pressure
+ // ===========================================================================
+
+ import { getContextUtilization } from '@/messages/prune';
+ import { PROACTIVE_SUMMARY_THRESHOLD } from '@/common/constants';
+
+ describe('Proactive Summarization — Context Pressure', () => {
+   it('triggers proactive summary at 80% utilization BEFORE pruning', () => {
+     // Simulate context at 82% utilization
+     const maxContextTokens = 200_000;
+     const indexTokenCountMap: Record<string, number | undefined> = {};
+     // Build messages that fill ~82% of context
+     const msgsNeeded = 40;
+     const tokensPerMsg = Math.floor((maxContextTokens * 0.82) / msgsNeeded);
+     for (let i = 0; i < msgsNeeded; i++) {
+       indexTokenCountMap[String(i)] = tokensPerMsg;
+     }
+
+     const utilization = getContextUtilization(indexTokenCountMap, 0, maxContextTokens);
+     const threshold = PROACTIVE_SUMMARY_THRESHOLD * 100; // 80
+
+     expect(utilization).toBeGreaterThanOrEqual(threshold);
+     // At 82%, the proactive summary should fire,
+     // but pruning should NOT have happened yet (context < 90% safety factor)
+     const effectiveBudget = Math.floor(maxContextTokens * 0.9); // CONTEXT_SAFETY_BUFFER
+     const totalTokens = Object.values(indexTokenCountMap).reduce((s, v) => (s ?? 0) + (v ?? 0), 0) as number;
+     expect(totalTokens).toBeLessThan(effectiveBudget);
+   });
+
+   it('does NOT trigger proactive summary below 80%', () => {
+     const maxContextTokens = 200_000;
+     const indexTokenCountMap: Record<string, number | undefined> = {};
+     // Fill to 50% utilization
+     const msgsNeeded = 20;
+     const tokensPerMsg = Math.floor((maxContextTokens * 0.5) / msgsNeeded);
+     for (let i = 0; i < msgsNeeded; i++) {
+       indexTokenCountMap[String(i)] = tokensPerMsg;
+     }
+
+     const utilization = getContextUtilization(indexTokenCountMap, 0, maxContextTokens);
+     expect(utilization).toBeLessThan(PROACTIVE_SUMMARY_THRESHOLD * 100);
+   });
+
+   it('selects only older messages for proactive summarization (keeps recent turns)', () => {
+     const messages: BaseMessage[] = [
+       new SystemMessage('System prompt'),
+       ...Array.from({ length: 20 }, (_, i) =>
+         i % 2 === 0
+           ? new HumanMessage(`User message ${i}`)
+           : new AIMessage(`AI response ${i}`)
+       ),
+     ];
+
+     // Simulate the selection logic from Graph.ts proactive summarization
+     const recentTurnCount = Math.max(4, Math.floor(messages.length * 0.3));
+     const oldMessages = messages.slice(
+       1, // skip system message
+       Math.max(1, messages.length - recentTurnCount)
+     );
+
+     // Recent 30% (~6 messages) preserved, older messages selected for summary
+     expect(oldMessages.length).toBeLessThan(messages.length);
+     expect(oldMessages.length).toBeGreaterThan(0);
+     // System message not included
+     expect(oldMessages[0].getType()).not.toBe('system');
+     // Last messages of conversation not included (recent turns preserved)
+     const lastOldIndex = messages.indexOf(oldMessages[oldMessages.length - 1]);
+     expect(lastOldIndex).toBeLessThan(messages.length - recentTurnCount);
+   });
+
+   it('never blocks — proactive summary is always fire-and-forget', async () => {
+     let resolveCallback: ((v: string) => void) | undefined;
+     const slowCallback = jest.fn(
+       () =>
+         new Promise<string>((resolve) => {
+           resolveCallback = resolve;
+         })
+     );
+
+     // Simulate proactive summary fire-and-forget
+     const summaryPromise = slowCallback().then((updated) => {
+       return updated;
+     });
+
+     // Main flow continues immediately — callback hasn't resolved yet
+     expect(slowCallback).toHaveBeenCalledTimes(1);
+
+     // Later, the callback resolves (simulating Nova Micro responding)
+     resolveCallback!('Proactive summary result');
+     const result = await summaryPromise;
+     expect(result).toBe('Proactive summary result');
+   });
+
+   it('at 100%+ utilization, uses existing summary without throwing', () => {
+     const maxContextTokens = 200_000;
+     const cachedSummary = 'Previously generated summary of the conversation';
+
+     // Context is at 105% (over budget)
+     const indexTokenCountMap: Record<string, number | undefined> = {
+       '0': 210_000, // system + everything
+     };
+
+     const utilization = getContextUtilization(indexTokenCountMap, 0, maxContextTokens);
+     expect(utilization).toBeGreaterThan(100);
+
+     // Even at 100%+, we use the existing cached summary — no error thrown
+     expect(cachedSummary).toBeTruthy();
+     // Pruning will remove the oldest messages to fit, and inject the cached summary.
+     // The key: no blocking, no throwing, just graceful degradation
+   });
+ });