npm - @librechat/agents - Versions diffs - 3.2.34 → 3.2.35 - Mend

@librechat/agents 3.2.34 → 3.2.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)

package/dist/cjs/agents/AgentContext.cjs +47 -10
package/dist/cjs/agents/AgentContext.cjs.map +1 -1
package/dist/cjs/common/enum.cjs +13 -0
package/dist/cjs/common/enum.cjs.map +1 -1
package/dist/cjs/graphs/Graph.cjs +121 -3
package/dist/cjs/graphs/Graph.cjs.map +1 -1
package/dist/cjs/llm/invoke.cjs +49 -8
package/dist/cjs/llm/invoke.cjs.map +1 -1
package/dist/cjs/main.cjs +2 -0
package/dist/cjs/messages/content.cjs +12 -14
package/dist/cjs/messages/content.cjs.map +1 -1
package/dist/cjs/messages/prune.cjs +31 -13
package/dist/cjs/messages/prune.cjs.map +1 -1
package/dist/cjs/run.cjs +7 -2
package/dist/cjs/run.cjs.map +1 -1
package/dist/cjs/summarization/node.cjs +12 -1
package/dist/cjs/summarization/node.cjs.map +1 -1
package/dist/cjs/tools/subagent/SubagentExecutor.cjs +138 -2
package/dist/cjs/tools/subagent/SubagentExecutor.cjs.map +1 -1
package/dist/cjs/utils/tokens.cjs +30 -0
package/dist/cjs/utils/tokens.cjs.map +1 -1
package/dist/esm/agents/AgentContext.mjs +47 -10
package/dist/esm/agents/AgentContext.mjs.map +1 -1
package/dist/esm/common/enum.mjs +13 -0
package/dist/esm/common/enum.mjs.map +1 -1
package/dist/esm/graphs/Graph.mjs +122 -4
package/dist/esm/graphs/Graph.mjs.map +1 -1
package/dist/esm/llm/invoke.mjs +49 -8
package/dist/esm/llm/invoke.mjs.map +1 -1
package/dist/esm/main.mjs +3 -3
package/dist/esm/messages/content.mjs +12 -15
package/dist/esm/messages/content.mjs.map +1 -1
package/dist/esm/messages/prune.mjs +31 -13
package/dist/esm/messages/prune.mjs.map +1 -1
package/dist/esm/run.mjs +7 -2
package/dist/esm/run.mjs.map +1 -1
package/dist/esm/summarization/node.mjs +12 -1
package/dist/esm/summarization/node.mjs.map +1 -1
package/dist/esm/tools/subagent/SubagentExecutor.mjs +138 -2
package/dist/esm/tools/subagent/SubagentExecutor.mjs.map +1 -1
package/dist/esm/utils/tokens.mjs +30 -1
package/dist/esm/utils/tokens.mjs.map +1 -1
package/dist/types/agents/AgentContext.d.ts +7 -3
package/dist/types/common/enum.d.ts +13 -0
package/dist/types/graphs/Graph.d.ts +8 -1
package/dist/types/llm/invoke.d.ts +1 -1
package/dist/types/messages/content.d.ts +5 -0
package/dist/types/messages/prune.d.ts +4 -0
package/dist/types/run.d.ts +1 -0
package/dist/types/tools/subagent/SubagentExecutor.d.ts +11 -1
package/dist/types/types/graph.d.ts +89 -3
package/dist/types/types/run.d.ts +13 -0
package/dist/types/utils/tokens.d.ts +7 -0
package/package.json +1 -1
package/src/agents/AgentContext.ts +69 -6
package/src/agents/__tests__/AgentContext.test.ts +6 -2
package/src/common/enum.ts +13 -0
package/src/graphs/Graph.ts +196 -0
package/src/llm/invoke.test.ts +79 -1
package/src/llm/invoke.ts +58 -4
package/src/messages/content.ts +24 -32
package/src/messages/prune.ts +39 -2
package/src/run.ts +5 -0
package/src/scripts/subagent-usage-sink.ts +176 -0
package/src/specs/context-accuracy.live.test.ts +409 -0
package/src/specs/context-usage-event.test.ts +117 -0
package/src/specs/context-usage.live.test.ts +297 -0
package/src/specs/prune.test.ts +51 -1
package/src/specs/subagent.test.ts +124 -1
package/src/summarization/__tests__/node.test.ts +60 -1
package/src/summarization/node.ts +20 -1
package/src/tools/__tests__/SubagentExecutor.test.ts +443 -1
package/src/tools/subagent/SubagentExecutor.ts +221 -3
package/src/types/graph.ts +94 -1
package/src/types/run.ts +13 -0
package/src/utils/__tests__/apportion.test.ts +32 -0
package/src/utils/tokens.ts +33 -0

package/dist/types/types/graph.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { BaseMessage, AIMessageChunk, SystemMessage } from '@langchain/core/messages';
+import type { BaseMessage, AIMessageChunk, SystemMessage, UsageMetadata } from '@langchain/core/messages';
 import type { BindToolsInput } from '@langchain/core/language_models/chat_models';
 import type { START, StateGraph, StateGraphArgs } from '@langchain/langgraph';
 import type { RunnableConfig, Runnable } from '@langchain/core/runnables';
@@ -7,10 +7,10 @@ import type { GoogleAIToolType } from '@langchain/google-common';
 import type { SummarizationNodeInput, SummarizeCompleteEvent, SummarizationConfig, SummarizeStartEvent, SummarizeDeltaEvent } from '@/types/summarize';
 import type { ToolMap, ToolEndEvent, GenericTool, LCTool, ToolExecuteBatchRequest } from '@/types/tools';
 import type { RunStep, RunStepDeltaEvent, MessageDeltaEvent, ReasoningDeltaEvent } from '@/types/stream';
+import type { TokenCounter, TokenBudgetBreakdown } from '@/types/run';
 import type { Providers, Callback, GraphNodeKeys } from '@/common';
 import type { StandardGraph, MultiAgentGraph } from '@/graphs';
 import type { ClientOptions } from '@/types/llm';
-import type { TokenCounter } from '@/types/run';
 /** Interface for bound model with stream and invoke methods */
 export interface ChatModel {
     stream?: (messages: BaseMessage[], config?: RunnableConfig) => Promise<AsyncIterable<AIMessageChunk>>;
@@ -44,8 +44,31 @@ export interface AgentLogEvent {
     runId?: string;
     agentId?: string;
 }
+/**
+ * Per-model-call context window usage snapshot, dispatched after pruning and
+ * before the model invocation. Dispatched once per `callModel` invocation:
+ * fallback retries reuse the snapshot since the prompt is identical — budget
+ * numbers reflect the primary provider's tokenizer, and the calibration
+ * ratio self-corrects from whichever provider reports usage.
+ */
+export interface ContextUsageEvent {
+    runId?: string;
+    agentId?: string;
+    /** Structural token budget snapshot from AgentContext.getTokenBudgetBreakdown */
+    breakdown: TokenBudgetBreakdown;
+    /** Usable budget this call: maxContextTokens minus output reserve */
+    contextBudget?: number;
+    /** Calibrated instruction overhead actually applied this call */
+    effectiveInstructionTokens?: number;
+    /** Calibrated message tokens before pruning (excluding instructions) */
+    prePruneContextTokens?: number;
+    /** Tokens still free after instructions + pruned messages */
+    remainingContextTokens?: number;
+    /** EMA ratio of provider-reported vs locally estimated token counts */
+    calibrationRatio?: number;
+}
 export interface EventHandler {
-    handle(event: string, data: StreamEventData | ModelEndData | RunStep | RunStepDeltaEvent | MessageDeltaEvent | ReasoningDeltaEvent | SummarizeStartEvent | SummarizeDeltaEvent | SummarizeCompleteEvent | SubagentUpdateEvent | AgentLogEvent | ToolExecuteBatchRequest | {
+    handle(event: string, data: StreamEventData | ModelEndData | RunStep | RunStepDeltaEvent | MessageDeltaEvent | ReasoningDeltaEvent | SummarizeStartEvent | SummarizeDeltaEvent | SummarizeCompleteEvent | SubagentUpdateEvent | AgentLogEvent | ContextUsageEvent | ToolExecuteBatchRequest | {
         result: ToolEndEvent;
     }, metadata?: Record<string, unknown>, graph?: StandardGraph | MultiAgentGraph): void | Promise<void>;
 }
@@ -199,6 +222,17 @@ export type StandardGraphInput = {
     tokenCounter?: TokenCounter;
     indexTokenCountMap?: Record<string, number>;
     calibrationRatio?: number;
+    /**
+     * Receives a {@link SubagentUsageEvent} for every model call made inside
+     * a subagent child run spawned from this graph (including nested
+     * subagents and child-side summarization calls). Child graphs run via
+     * `invoke()` outside the host's `streamEvents` loop, so their
+     * `on_chat_model_end` events never reach the run's handler registry —
+     * this sink is the only way hosts can observe child token usage for
+     * billing/accounting. Parent-graph model calls are NOT reported here;
+     * they already flow through the registry's `CHAT_MODEL_END` handler.
+     */
+    subagentUsageSink?: SubagentUsageSink;
 };
 export type GraphEdge = {
     /** Agent ID, use a list for multiple sources */
@@ -289,6 +323,58 @@ export interface SubagentUpdateEvent {
     /** ISO timestamp for ordering / display. */
     timestamp: string;
 }
+/**
+ * Token usage for a single model call made inside a subagent child run.
+ * Emitted through {@link SubagentUsageSink} as each call completes, so
+ * hosts can bill child-run model usage that never reaches the parent
+ * run's `CHAT_MODEL_END` handler (child graphs execute via `invoke()`
+ * outside the host's `streamEvents` loop).
+ */
+export interface SubagentUsageEvent {
+    /** Usage metadata reported by the child's model call. */
+    usage: UsageMetadata;
+    /**
+     * Model that produced this usage. Per-call `ls_model_name` from the
+     * model's callback metadata when available (covers child-side
+     * summarization or any call that differs from the configured model),
+     * then the fallback-invocation's configured model (`INVOKED_MODEL`
+     * metadata), then the subagent config's `clientOptions` model.
+     */
+    model?: string;
+    /**
+     * Provider that actually served this call — the SDK `Providers` enum
+     * value stamped per-invocation by `attemptInvoke` (`INVOKED_PROVIDER`
+     * metadata), so fallback-served calls are attributed to the fallback
+     * provider, not the configured primary. Falls back to the subagent
+     * config's provider. Never LangSmith's `ls_provider` string — derived
+     * providers inherit that from their base class, and hosts key
+     * pricing/cache semantics off the enum.
+     */
+    provider?: string;
+    /** Subagent `type` identifier from the SubagentConfig. */
+    subagentType: string;
+    /** Child run ID (unique per subagent execution). */
+    subagentRunId: string;
+    /** Child agent ID assigned to this subagent execution. */
+    subagentAgentId: string;
+    /**
+     * ROOT run ID of the host run that owns billing. For nested subagents
+     * each forwarding layer rewrites this upward, so events from any depth
+     * surface with the outermost run's ID — never an intermediate
+     * `*_sub_*` child id (use {@link subagentRunId} to identify the
+     * emitting child).
+     */
+    runId: string;
+}
+/**
+ * Host-provided callback receiving {@link SubagentUsageEvent}s. Invoked as
+ * each child model call completes. May return a promise — the executor
+ * awaits each dispatch (so all usage is recorded before the child's result
+ * resolves to the parent) and swallows both synchronous throws and
+ * rejections; implementations should still be cheap, as they sit on the
+ * child's model-call path.
+ */
+export type SubagentUsageSink = (event: SubagentUsageEvent) => void | Promise<void>;
 export type LangfuseToolOutputTracingConfig = {
     /**
      * Whether tool outputs should be exported to Langfuse. Defaults to

package/dist/types/types/run.d.ts CHANGED Viewed

@@ -111,6 +111,15 @@ export type RunConfig = {
      */
     langfuse?: g.LangfuseConfig;
     customHandlers?: Record<string, g.EventHandler>;
+    /**
+     * Receives token usage for every model call made inside subagent child
+     * runs (including nested subagents). Child graphs execute via `invoke()`
+     * outside this run's `streamEvents` loop, so their model-end events never
+     * reach `customHandlers` — without this sink, child usage is invisible to
+     * the host. Parent-graph calls are not reported here; they flow through
+     * the registered `CHAT_MODEL_END` handler as usual.
+     */
+    subagentUsageSink?: g.SubagentUsageSink;
     /**
      * Pre-constructed hook registry for this run. Hooks fire at lifecycle
      * points in `processStream` (RunStart, UserPromptSubmit, Stop,
@@ -225,6 +234,10 @@ export type TokenBudgetBreakdown = {
     messageTokens: number;
     /** Tokens available for messages after instructions. */
     availableForMessages: number;
+    /** Per-tool schema token counts (post-multiplier), keyed by tool name. */
+    toolTokenCounts?: Record<string, number>;
+    /** Names of counted tools that are deferred (`defer_loading`) and discovered. */
+    deferredToolNames?: string[];
 };
 export type EventStreamOptions = {
     callbacks?: g.ClientCallbacks;

package/dist/types/utils/tokens.d.ts CHANGED Viewed

@@ -15,6 +15,13 @@ export declare function estimateAnthropicImageTokens(width: number, height: numb
 export declare function estimateOpenAIImageTokens(width: number, height: number, detail?: string): number;
 export declare function encodingForModel(model: string): EncodingName;
 export declare function getTokenCountForMessage(message: BaseMessage, getTokenCount: (text: string) => number, encoding?: EncodingName): number;
+/**
+ * Largest-remainder apportionment: scales each count by `multiplier` and
+ * distributes the rounding remainder so the results sum exactly to
+ * `targetTotal`. Keeps per-item breakdowns reconciled with an aggregate
+ * computed as a single rounded product of the summed raw counts.
+ */
+export declare function apportionTokenCounts(rawCounts: Record<string, number>, multiplier: number, targetTotal: number): Record<string, number>;
 /**
  * Creates a token counter function using the specified encoding.
  * Lazily loads the encoding data on first use via dynamic import.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@librechat/agents",
-  "version": "3.2.34",
+  "version": "3.2.35",
   "main": "./dist/cjs/main.cjs",
   "module": "./dist/esm/main.mjs",
   "types": "./dist/types/index.d.ts",

package/src/agents/AgentContext.ts CHANGED Viewed

@@ -21,6 +21,7 @@ import {
   addCacheControlToStablePrefixMessages,
 } from '@/messages/cache';
 import { createSchemaOnlyTools } from '@/tools/schema';
+import { apportionTokenCounts } from '@/utils/tokens';
 import { DEFAULT_RESERVE_RATIO } from '@/messages';
 import { toJsonSchema } from '@/utils/schema';
@@ -191,6 +192,11 @@ export class AgentContext {
   dynamicInstructionTokens: number = 0;
   /** Token count for tool schemas only. */
   toolSchemaTokens: number = 0;
+  /** Per-tool schema token counts (post-multiplier), keyed by tool name.
+   *  `undefined` when not calculated (e.g. cached aggregate schema tokens). */
+  toolTokenCounts?: Record<string, number>;
+  /** Names of counted tools that are deferred (`defer_loading`) and discovered. */
+  deferredToolNames: string[] = [];
   /** Running calibration ratio from the pruner — persisted across runs via contextMeta. */
   calibrationRatio: number = 1;
   /** Provider-observed instruction overhead from the pruner's best-variance turn. */
@@ -894,6 +900,8 @@ export class AgentContext {
     this.systemMessageTokens = 0;
     this.dynamicInstructionTokens = 0;
     this.toolSchemaTokens = 0;
+    this.toolTokenCounts = undefined;
+    this.deferredToolNames = [];
     this.cachedSystemRunnable = undefined;
     this.systemRunnableStale = true;
     this.lastToken = undefined;
@@ -1006,6 +1014,10 @@ export class AgentContext {
   ): Promise<void> {
     let toolTokens = 0;
     const countedToolNames = new Set<string>();
+    /** Prototype-free: external tool names like `toString` must not hit
+     *  inherited properties during accumulation */
+    const rawToolTokenCounts: Record<string, number> = Object.create(null);
+    const deferredCountedNames = new Set<string>();
     /**
      * Iterate both `tools` (user-provided instance tools) and `graphTools`
@@ -1040,11 +1052,14 @@ export class AgentContext {
             toolName,
             (genericTool.description as string | undefined) ?? ''
           );
-          toolTokens += tokenCounter(
+          const schemaTokens = tokenCounter(
             new SystemMessage(JSON.stringify(jsonSchema))
           );
+          toolTokens += schemaTokens;
           if (toolName) {
             countedToolNames.add(toolName);
+            rawToolTokenCounts[toolName] =
+              (rawToolTokenCounts[toolName] ?? 0) + schemaTokens;
           }
         }
       }
@@ -1062,7 +1077,16 @@ export class AgentContext {
           parameters: def.parameters ?? {},
         },
       };
-      toolTokens += tokenCounter(new SystemMessage(JSON.stringify(schema)));
+      const schemaTokens = tokenCounter(
+        new SystemMessage(JSON.stringify(schema))
+      );
+      toolTokens += schemaTokens;
+      countedToolNames.add(def.name);
+      rawToolTokenCounts[def.name] =
+        (rawToolTokenCounts[def.name] ?? 0) + schemaTokens;
+      if (def.defer_loading === true) {
+        deferredCountedNames.add(def.name);
+      }
     }
     const isAnthropic =
@@ -1077,6 +1101,25 @@ export class AgentContext {
       ? ANTHROPIC_TOOL_TOKEN_MULTIPLIER
       : DEFAULT_TOOL_TOKEN_MULTIPLIER;
     this.toolSchemaTokens = Math.ceil(toolTokens * toolTokenMultiplier);
+    /** Largest-remainder apportionment keeps the per-tool counts summing
+     *  exactly to the aggregate despite per-entry rounding */
+    const toolTokenCounts = apportionTokenCounts(
+      rawToolTokenCounts,
+      toolTokenMultiplier,
+      this.toolSchemaTokens
+    );
+    const deferredToolNames: string[] = [];
+    for (const name of Object.keys(rawToolTokenCounts)) {
+      if (
+        deferredCountedNames.has(name) ||
+        this.toolRegistry?.get(name)?.defer_loading === true
+      ) {
+        deferredToolNames.push(name);
+      }
+    }
+    this.toolTokenCounts = toolTokenCounts;
+    this.deferredToolNames = deferredToolNames;
   }
   /**
@@ -1212,9 +1255,8 @@ export class AgentContext {
    * Returns a structured breakdown of how the context token budget is consumed.
    * Useful for diagnostics when context overflow or pruning issues occur.
    *
-   * Note: `toolCount` reflects discoveries immediately, but `toolSchemaTokens`
-   * is a snapshot taken during `calculateInstructionTokens` and is not
-   * recomputed when `markToolsAsDiscovered` is called mid-run.
+   * Note: `markToolsAsDiscovered` re-triggers `calculateInstructionTokens`,
+   * so `toolSchemaTokens`/`toolTokenCounts` refresh before the next call.
    */
   getTokenBudgetBreakdown(messages?: BaseMessage[]): t.TokenBudgetBreakdown {
     const maxContextTokens = this.maxContextTokens ?? 0;
@@ -1238,7 +1280,14 @@ export class AgentContext {
       }
     }
-    const reserveTokens = Math.round(maxContextTokens * DEFAULT_RESERVE_RATIO);
+    /** Mirror the pruner's reserve math so availableForMessages agrees
+     *  with the contextBudget computed during pruning */
+    const reserveRatio =
+      this.summarizationConfig?.reserveRatio ?? DEFAULT_RESERVE_RATIO;
+    const reserveTokens =
+      reserveRatio > 0 && reserveRatio < 1
+        ? Math.round(maxContextTokens * reserveRatio)
+        : 0;
     const availableForMessages = Math.max(
       0,
       maxContextTokens - reserveTokens - this.instructionTokens
@@ -1255,6 +1304,12 @@ export class AgentContext {
       messageCount,
       messageTokens,
       availableForMessages,
+      toolTokenCounts:
+        this.toolTokenCounts != null ? { ...this.toolTokenCounts } : undefined,
+      deferredToolNames:
+        this.deferredToolNames.length > 0
+          ? [...this.deferredToolNames]
+          : undefined,
     };
   }
@@ -1324,6 +1379,14 @@ export class AgentContext {
     }
     if (hasNewDiscoveries) {
       this.systemRunnableStale = true;
+      /** Refresh schema token accounting so the next call's budget and
+       *  per-tool breakdown include the newly discovered tools; awaited
+       *  via tokenCalculationPromise before the next model call */
+      if (this.tokenCounter) {
+        this.tokenCalculationPromise = this.calculateInstructionTokens(
+          this.tokenCounter
+        );
+      }
     }
     return hasNewDiscoveries;
   }

package/src/agents/__tests__/AgentContext.test.ts CHANGED Viewed

@@ -1414,7 +1414,7 @@ describe('AgentContext', () => {
       expect(ctx.getTokenBudgetBreakdown().toolCount).toBe(2);
     });
-    it('toolSchemaTokens snapshot does not auto-update after markToolsAsDiscovered', async () => {
+    it('refreshes toolSchemaTokens and per-tool counts after markToolsAsDiscovered', async () => {
       const toolDefinitions: t.LCTool[] = [
         {
           name: 'deferred',
@@ -1431,9 +1431,13 @@ describe('AgentContext', () => {
       await ctx.tokenCalculationPromise;
       expect(ctx.toolSchemaTokens).toBe(0);
+      expect(ctx.toolTokenCounts).toEqual({});
       ctx.markToolsAsDiscovered(['deferred']);
-      expect(ctx.toolSchemaTokens).toBe(0);
+      await ctx.tokenCalculationPromise;
+      expect(ctx.toolSchemaTokens).toBeGreaterThan(0);
+      expect(ctx.toolTokenCounts?.deferred).toBeGreaterThan(0);
+      expect(ctx.deferredToolNames).toContain('deferred');
     });
   });

package/src/common/enum.ts CHANGED Viewed

@@ -31,6 +31,8 @@ export enum GraphEvents {
   ON_SUBAGENT_UPDATE = 'on_subagent_update',
   /** [Custom] Diagnostic logging event for context management observability */
   ON_AGENT_LOG = 'on_agent_log',
+  /** [Custom] Per-model-call context window usage snapshot (post-prune token budget) */
+  ON_CONTEXT_USAGE = 'on_context_usage',
   /* Official Events */
@@ -185,6 +187,17 @@ export enum Constants {
   /** Anthropic server tool ID prefix (web_search, code_execution, etc.) */
   ANTHROPIC_SERVER_TOOL_PREFIX = 'srvtoolu_',
   SKILL_TOOL = 'skill',
+  /**
+   * Callback-metadata keys stamped by `attemptInvoke` /
+   * `tryFallbackProviders` carrying the provider (SDK `Providers` enum
+   * value) and configured model that actually served a model invocation.
+   * Unlike `ls_provider` — which derived providers inherit from their base
+   * class (e.g. DeepSeek/OpenRouter report `'openai'`) — these reflect the
+   * SDK's own routing, including fallback-provider calls. Consumed by the
+   * subagent usage-capture handler to tag billing events.
+   */
+  INVOKED_PROVIDER = '__invoked_provider',
+  INVOKED_MODEL = '__invoked_model',
   READ_FILE = 'read_file',
   BASH_TOOL = 'bash_tool',
   BASH_PROGRAMMATIC_TOOL_CALLING = 'run_tools_with_bash',

package/src/graphs/Graph.ts CHANGED Viewed

@@ -23,6 +23,7 @@ import {
   formatArtifactPayload,
   enforceOriginalContentCap,
   formatContentStrings,
+  isLegacyConvertible,
   createPruneMessages,
   addCacheControl,
   getMessageId,
@@ -45,6 +46,7 @@ import {
   isAnthropicLike,
   isOpenAILike,
   isGoogleLike,
+  apportionTokenCounts,
   joinKeys,
   sleep,
 } from '@/utils';
@@ -89,6 +91,55 @@ const { AGENT, TOOLS, SUMMARIZE } = GraphNodeKeys;
 /** Minimum relative variance before calibrated toolSchemaTokens overrides current value. */
 const CALIBRATION_VARIANCE_THRESHOLD = 0.15;
+/**
+ * Start index of the span post-prune formatters can mutate in place: the
+ * trailing tool batch plus its owning AI message (artifact formatting touches
+ * every tool result after the last AI tool call; Bedrock rewrites the AI
+ * message before a trailing tool result). Capped so the usage-snapshot
+ * recount stays constant-cost.
+ *
+ * Walks backwards from the last message while the message type is `'tool'`,
+ * stopping at the first non-tool message or once the span reaches the cap.
+ *
+ * @param messages - Message list to scan from the end; not modified.
+ * @returns Index of the first message in the trailing span. The result is
+ * clamped to `[0, messages.length - 2]`, and is `0` when the list holds
+ * fewer than two messages.
+ */
+function trailingMutationStart(messages: BaseMessage[]): number {
+  const MAX_SPAN = 16; // the backward walk covers at most this many trailing messages
+  let index = messages.length - 1;
+  while (
+    index >= 0 &&
+    messages[index]?.getType() === 'tool' &&
+    messages.length - index < MAX_SPAN
+  ) {
+    index--;
+  }
+  return Math.max(0, Math.min(index, messages.length - 2)); // span keeps at least the last two messages; never negative
+}
+/**
+ * Re-derives the breakdown fields coupled to the calibrated budget math so
+ * the snapshot stays internally consistent: the aggregate
+ * `instructionTokens`/`availableForMessages` reflect the pruner's effective
+ * (calibrated) overhead — component fields remain local estimates — and
+ * `messageTokens` mirrors `contextBudget - instructions - remaining`.
+ *
+ * Mutates `usage.breakdown` in place. The update is progressive: each field
+ * is rewritten only when every input it depends on is non-null, so a
+ * missing `effectiveInstructionTokens` leaves the breakdown untouched, a
+ * missing `contextBudget` updates `instructionTokens` only, and a missing
+ * `remainingContextTokens` leaves `messageTokens` as it was. Derived values
+ * are clamped at zero.
+ *
+ * @param usage - Context-usage snapshot whose `breakdown` is synchronized.
+ */
+function syncBudgetDerivedFields(usage: t.ContextUsageEvent): void {
+  const { breakdown, contextBudget, effectiveInstructionTokens } = usage;
+  if (effectiveInstructionTokens == null) {
+    return;
+  }
+  breakdown.instructionTokens = effectiveInstructionTokens;
+  if (contextBudget == null) {
+    return;
+  }
+  breakdown.availableForMessages = Math.max(
+    0,
+    contextBudget - effectiveInstructionTokens
+  );
+  if (usage.remainingContextTokens == null) {
+    return;
+  }
+  breakdown.messageTokens = Math.max(
+    0,
+    contextBudget - effectiveInstructionTokens - usage.remainingContextTokens
+  );
+}
 type ReasoningKey = 'reasoning_content' | 'reasoning';
 type ReasoningSummary = { summary?: Array<{ text?: string }> };
 type ReasoningDetail = { type?: string; text?: string };
@@ -825,6 +876,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
   agentContexts: Map<string, AgentContext> = new Map();
   /** Default agent ID to use */
   defaultAgentId: string;
+  /**
+   * Host sink for model usage emitted inside subagent child runs. Threaded
+   * into each `SubagentExecutor` this graph creates (and from there into
+   * child graphs, so nested subagents report too). See
+   * {@link t.StandardGraphInput.subagentUsageSink}.
+   */
+  subagentUsageSink?: t.SubagentUsageSink;
   constructor({
     runId,
@@ -834,11 +892,13 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
     tokenCounter,
     indexTokenCountMap,
     calibrationRatio,
+    subagentUsageSink,
   }: t.StandardGraphInput) {
     super();
     this.runId = runId;
     this.signal = signal;
     this.langfuse = langfuse;
+    this.subagentUsageSink = subagentUsageSink;
     if (agents.length === 0) {
       throw new Error('At least one agent configuration is required');
@@ -1423,6 +1483,7 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
       this.config = config;
       let messagesToUse = messages;
+      let contextUsage: t.ContextUsageEvent | null = null;
       if (
         !agentContext.pruneMessages &&
         agentContext.tokenCounter &&
@@ -1462,6 +1523,8 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
           originalToolContent,
           calibrationRatio,
           resolvedInstructionOverhead,
+          contextBudget,
+          effectiveInstructionTokens,
         } = agentContext.pruneMessages({
           messages,
           usageMetadata: agentContext.currentUsage,
@@ -1489,10 +1552,42 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
               : 1;
           if (variance > CALIBRATION_VARIANCE_THRESHOLD) {
             agentContext.toolSchemaTokens = calibratedToolTokens;
+            /** Largest-remainder apportionment keeps the per-tool breakdown
+             *  summing exactly to the calibrated aggregate */
+            if (agentContext.toolTokenCounts != null && currentToolTokens > 0) {
+              agentContext.toolTokenCounts = apportionTokenCounts(
+                agentContext.toolTokenCounts,
+                calibratedToolTokens / currentToolTokens,
+                calibratedToolTokens
+              );
+            }
           }
         }
         messagesToUse = context;
+        /** Dispatched right before the model invoke — a summarization
+         *  detour returns from this node without an LLM call, and the
+         *  post-summary retry produces its own snapshot.
+         *
+         *  The breakdown describes the post-prune prompt: counts from the
+         *  kept context, message tokens derived from the same calibrated
+         *  budget math as `remainingContextTokens` (the index map is keyed
+         *  by pre-prune state indices, so summing it over `context` would
+         *  miscount); `prePruneContextTokens` carries the pre-prune metric. */
+        const usageBreakdown = agentContext.getTokenBudgetBreakdown(messages);
+        usageBreakdown.messageCount = context.length;
+        contextUsage = {
+          runId: this.runId,
+          agentId,
+          breakdown: usageBreakdown,
+          contextBudget,
+          effectiveInstructionTokens,
+          prePruneContextTokens,
+          remainingContextTokens,
+          calibrationRatio: agentContext.calibrationRatio,
+        };
+        syncBudgetDerivedFields(contextUsage);
         const hasPrunedMessages =
           agentContext.summarizationEnabled === true &&
           Array.isArray(messagesToRefine) &&
@@ -1598,6 +1693,33 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
       }
       let finalMessages = messagesToUse;
+      /** Tail snapshot for the dispatch-time usage delta: in-place
+       *  formatters (artifact appends, Bedrock content rewrites, legacy
+       *  string conversion) mutate without changing length or identity —
+       *  capture before they run. Legacy string conversion can also touch
+       *  messages before the tail, so those convertible indices are
+       *  tracked separately (none exist in the common case). */
+      const tailStart = trailingMutationStart(messagesToUse);
+      let preFormatTailTokens: number | null = null;
+      let legacyIndices: number[] | null = null;
+      let preFormatLegacyTokens = 0;
+      if (contextUsage != null && agentContext.tokenCounter != null) {
+        preFormatTailTokens = 0;
+        for (const message of messagesToUse.slice(tailStart)) {
+          preFormatTailTokens += agentContext.tokenCounter(message);
+        }
+        if (agentContext.useLegacyContent) {
+          legacyIndices = [];
+          for (let i = 0; i < tailStart; i++) {
+            if (isLegacyConvertible(messagesToUse[i])) {
+              legacyIndices.push(i);
+              preFormatLegacyTokens += agentContext.tokenCounter(
+                messagesToUse[i]
+              );
+            }
+          }
+        }
+      }
       if (agentContext.useLegacyContent) {
         finalMessages = formatContentStrings(finalMessages);
       }
@@ -1788,6 +1910,79 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
         );
       }
+      /** Past the empty-prompt guard — a model call is now guaranteed */
+      if (contextUsage != null) {
+        const usageRatio =
+          contextUsage.calibrationRatio != null &&
+          contextUsage.calibrationRatio > 0
+            ? contextUsage.calibrationRatio
+            : 1;
+        if (
+          agentContext.tokenCounter != null &&
+          finalMessages.length !== messagesToUse.length
+        ) {
+          /** Post-prune formatting restructured the payload (e.g. thinking
+           *  placeholder collapse, orphan drops) — recount so the gauge
+           *  reflects what is actually sent */
+          let rawTokens = 0;
+          for (const message of finalMessages) {
+            rawTokens += agentContext.tokenCounter(message);
+          }
+          contextUsage.breakdown.messageCount = finalMessages.length;
+          if (
+            contextUsage.contextBudget != null &&
+            contextUsage.effectiveInstructionTokens != null
+          ) {
+            contextUsage.remainingContextTokens = Math.max(
+              0,
+              contextUsage.contextBudget -
+                contextUsage.effectiveInstructionTokens -
+                Math.round(rawTokens * usageRatio)
+            );
+          }
+        } else if (
+          preFormatTailTokens != null &&
+          agentContext.tokenCounter != null &&
+          contextUsage.remainingContextTokens != null
+        ) {
+          /** Same-length formatting can still mutate in place — the trailing
+           *  tool batch (artifacts, Bedrock rewrites) and any legacy-converted
+           *  messages before it — adjust remaining by the calibrated delta */
+          let postFormatTailTokens = 0;
+          for (const message of finalMessages.slice(tailStart)) {
+            postFormatTailTokens += agentContext.tokenCounter(message);
+          }
+          let formatDelta = postFormatTailTokens - preFormatTailTokens;
+          if (legacyIndices != null && legacyIndices.length > 0) {
+            let postFormatLegacyTokens = 0;
+            for (const index of legacyIndices) {
+              postFormatLegacyTokens += agentContext.tokenCounter(
+                finalMessages[index]
+              );
+            }
+            formatDelta += postFormatLegacyTokens - preFormatLegacyTokens;
+          }
+          if (formatDelta !== 0) {
+            contextUsage.remainingContextTokens = Math.max(
+              0,
+              Math.min(
+                contextUsage.contextBudget ?? Number.MAX_SAFE_INTEGER,
+                contextUsage.remainingContextTokens -
+                  Math.round(formatDelta * usageRatio)
+              )
+            );
+          }
+        }
+        syncBudgetDerivedFields(contextUsage);
+        /** Awaited so async host handlers receive the pre-invoke snapshot
+         *  before any model deltas are emitted */
+        await safeDispatchCustomEvent(
+          GraphEvents.ON_CONTEXT_USAGE,
+          contextUsage,
+          config
+        );
+      }
       const invokeStart = Date.now();
       const invokeMeta = { runId: this.runId, agentId };
       emitAgentLog(
@@ -2063,6 +2258,7 @@ export class StandardGraph extends Graph<t.BaseGraphState, t.GraphNode> {
           parentAgentId: agentContext.agentId,
           langfuse: this.langfuse,
           tokenCounter: agentContext.tokenCounter,
+          usageSink: this.subagentUsageSink,
           maxDepth: effectiveSubagentDepth,
           createChildGraph: (input): StandardGraph => {
             const childGraph = new StandardGraph(input);