@librechat/agents 3.1.57 → 3.1.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (214) hide show
  1. package/dist/cjs/agents/AgentContext.cjs +326 -62
  2. package/dist/cjs/agents/AgentContext.cjs.map +1 -1
  3. package/dist/cjs/common/enum.cjs +13 -0
  4. package/dist/cjs/common/enum.cjs.map +1 -1
  5. package/dist/cjs/events.cjs +7 -27
  6. package/dist/cjs/events.cjs.map +1 -1
  7. package/dist/cjs/graphs/Graph.cjs +303 -222
  8. package/dist/cjs/graphs/Graph.cjs.map +1 -1
  9. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +4 -4
  10. package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
  11. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +6 -2
  12. package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
  13. package/dist/cjs/llm/init.cjs +60 -0
  14. package/dist/cjs/llm/init.cjs.map +1 -0
  15. package/dist/cjs/llm/invoke.cjs +90 -0
  16. package/dist/cjs/llm/invoke.cjs.map +1 -0
  17. package/dist/cjs/llm/openai/index.cjs +2 -0
  18. package/dist/cjs/llm/openai/index.cjs.map +1 -1
  19. package/dist/cjs/llm/request.cjs +41 -0
  20. package/dist/cjs/llm/request.cjs.map +1 -0
  21. package/dist/cjs/main.cjs +40 -0
  22. package/dist/cjs/main.cjs.map +1 -1
  23. package/dist/cjs/messages/cache.cjs +76 -89
  24. package/dist/cjs/messages/cache.cjs.map +1 -1
  25. package/dist/cjs/messages/contextPruning.cjs +156 -0
  26. package/dist/cjs/messages/contextPruning.cjs.map +1 -0
  27. package/dist/cjs/messages/contextPruningSettings.cjs +53 -0
  28. package/dist/cjs/messages/contextPruningSettings.cjs.map +1 -0
  29. package/dist/cjs/messages/core.cjs +23 -37
  30. package/dist/cjs/messages/core.cjs.map +1 -1
  31. package/dist/cjs/messages/format.cjs +156 -11
  32. package/dist/cjs/messages/format.cjs.map +1 -1
  33. package/dist/cjs/messages/prune.cjs +1161 -49
  34. package/dist/cjs/messages/prune.cjs.map +1 -1
  35. package/dist/cjs/messages/reducer.cjs +87 -0
  36. package/dist/cjs/messages/reducer.cjs.map +1 -0
  37. package/dist/cjs/run.cjs +81 -42
  38. package/dist/cjs/run.cjs.map +1 -1
  39. package/dist/cjs/stream.cjs +54 -7
  40. package/dist/cjs/stream.cjs.map +1 -1
  41. package/dist/cjs/summarization/index.cjs +75 -0
  42. package/dist/cjs/summarization/index.cjs.map +1 -0
  43. package/dist/cjs/summarization/node.cjs +663 -0
  44. package/dist/cjs/summarization/node.cjs.map +1 -0
  45. package/dist/cjs/tools/ToolNode.cjs +16 -8
  46. package/dist/cjs/tools/ToolNode.cjs.map +1 -1
  47. package/dist/cjs/tools/handlers.cjs +2 -0
  48. package/dist/cjs/tools/handlers.cjs.map +1 -1
  49. package/dist/cjs/utils/errors.cjs +115 -0
  50. package/dist/cjs/utils/errors.cjs.map +1 -0
  51. package/dist/cjs/utils/events.cjs +17 -0
  52. package/dist/cjs/utils/events.cjs.map +1 -1
  53. package/dist/cjs/utils/handlers.cjs +16 -0
  54. package/dist/cjs/utils/handlers.cjs.map +1 -1
  55. package/dist/cjs/utils/llm.cjs +10 -0
  56. package/dist/cjs/utils/llm.cjs.map +1 -1
  57. package/dist/cjs/utils/tokens.cjs +247 -14
  58. package/dist/cjs/utils/tokens.cjs.map +1 -1
  59. package/dist/cjs/utils/truncation.cjs +107 -0
  60. package/dist/cjs/utils/truncation.cjs.map +1 -0
  61. package/dist/esm/agents/AgentContext.mjs +325 -61
  62. package/dist/esm/agents/AgentContext.mjs.map +1 -1
  63. package/dist/esm/common/enum.mjs +13 -0
  64. package/dist/esm/common/enum.mjs.map +1 -1
  65. package/dist/esm/events.mjs +8 -28
  66. package/dist/esm/events.mjs.map +1 -1
  67. package/dist/esm/graphs/Graph.mjs +307 -226
  68. package/dist/esm/graphs/Graph.mjs.map +1 -1
  69. package/dist/esm/llm/anthropic/utils/message_inputs.mjs +4 -4
  70. package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
  71. package/dist/esm/llm/bedrock/utils/message_inputs.mjs +6 -2
  72. package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
  73. package/dist/esm/llm/init.mjs +58 -0
  74. package/dist/esm/llm/init.mjs.map +1 -0
  75. package/dist/esm/llm/invoke.mjs +87 -0
  76. package/dist/esm/llm/invoke.mjs.map +1 -0
  77. package/dist/esm/llm/openai/index.mjs +2 -0
  78. package/dist/esm/llm/openai/index.mjs.map +1 -1
  79. package/dist/esm/llm/request.mjs +38 -0
  80. package/dist/esm/llm/request.mjs.map +1 -0
  81. package/dist/esm/main.mjs +13 -3
  82. package/dist/esm/main.mjs.map +1 -1
  83. package/dist/esm/messages/cache.mjs +76 -89
  84. package/dist/esm/messages/cache.mjs.map +1 -1
  85. package/dist/esm/messages/contextPruning.mjs +154 -0
  86. package/dist/esm/messages/contextPruning.mjs.map +1 -0
  87. package/dist/esm/messages/contextPruningSettings.mjs +50 -0
  88. package/dist/esm/messages/contextPruningSettings.mjs.map +1 -0
  89. package/dist/esm/messages/core.mjs +23 -37
  90. package/dist/esm/messages/core.mjs.map +1 -1
  91. package/dist/esm/messages/format.mjs +156 -11
  92. package/dist/esm/messages/format.mjs.map +1 -1
  93. package/dist/esm/messages/prune.mjs +1158 -52
  94. package/dist/esm/messages/prune.mjs.map +1 -1
  95. package/dist/esm/messages/reducer.mjs +83 -0
  96. package/dist/esm/messages/reducer.mjs.map +1 -0
  97. package/dist/esm/run.mjs +82 -43
  98. package/dist/esm/run.mjs.map +1 -1
  99. package/dist/esm/stream.mjs +54 -7
  100. package/dist/esm/stream.mjs.map +1 -1
  101. package/dist/esm/summarization/index.mjs +73 -0
  102. package/dist/esm/summarization/index.mjs.map +1 -0
  103. package/dist/esm/summarization/node.mjs +659 -0
  104. package/dist/esm/summarization/node.mjs.map +1 -0
  105. package/dist/esm/tools/ToolNode.mjs +16 -8
  106. package/dist/esm/tools/ToolNode.mjs.map +1 -1
  107. package/dist/esm/tools/handlers.mjs +2 -0
  108. package/dist/esm/tools/handlers.mjs.map +1 -1
  109. package/dist/esm/utils/errors.mjs +111 -0
  110. package/dist/esm/utils/errors.mjs.map +1 -0
  111. package/dist/esm/utils/events.mjs +17 -1
  112. package/dist/esm/utils/events.mjs.map +1 -1
  113. package/dist/esm/utils/handlers.mjs +16 -0
  114. package/dist/esm/utils/handlers.mjs.map +1 -1
  115. package/dist/esm/utils/llm.mjs +10 -1
  116. package/dist/esm/utils/llm.mjs.map +1 -1
  117. package/dist/esm/utils/tokens.mjs +245 -15
  118. package/dist/esm/utils/tokens.mjs.map +1 -1
  119. package/dist/esm/utils/truncation.mjs +102 -0
  120. package/dist/esm/utils/truncation.mjs.map +1 -0
  121. package/dist/types/agents/AgentContext.d.ts +124 -6
  122. package/dist/types/common/enum.d.ts +14 -1
  123. package/dist/types/graphs/Graph.d.ts +22 -27
  124. package/dist/types/index.d.ts +5 -0
  125. package/dist/types/llm/init.d.ts +18 -0
  126. package/dist/types/llm/invoke.d.ts +48 -0
  127. package/dist/types/llm/request.d.ts +14 -0
  128. package/dist/types/messages/contextPruning.d.ts +42 -0
  129. package/dist/types/messages/contextPruningSettings.d.ts +44 -0
  130. package/dist/types/messages/core.d.ts +1 -1
  131. package/dist/types/messages/format.d.ts +17 -1
  132. package/dist/types/messages/index.d.ts +3 -0
  133. package/dist/types/messages/prune.d.ts +162 -1
  134. package/dist/types/messages/reducer.d.ts +18 -0
  135. package/dist/types/run.d.ts +12 -1
  136. package/dist/types/summarization/index.d.ts +20 -0
  137. package/dist/types/summarization/node.d.ts +29 -0
  138. package/dist/types/tools/ToolNode.d.ts +3 -1
  139. package/dist/types/types/graph.d.ts +44 -6
  140. package/dist/types/types/index.d.ts +1 -0
  141. package/dist/types/types/run.d.ts +30 -0
  142. package/dist/types/types/stream.d.ts +31 -4
  143. package/dist/types/types/summarize.d.ts +47 -0
  144. package/dist/types/types/tools.d.ts +7 -0
  145. package/dist/types/utils/errors.d.ts +28 -0
  146. package/dist/types/utils/events.d.ts +13 -0
  147. package/dist/types/utils/index.d.ts +2 -0
  148. package/dist/types/utils/llm.d.ts +4 -0
  149. package/dist/types/utils/tokens.d.ts +14 -1
  150. package/dist/types/utils/truncation.d.ts +49 -0
  151. package/package.json +1 -1
  152. package/src/agents/AgentContext.ts +388 -58
  153. package/src/agents/__tests__/AgentContext.test.ts +265 -5
  154. package/src/common/enum.ts +13 -0
  155. package/src/events.ts +9 -39
  156. package/src/graphs/Graph.ts +468 -331
  157. package/src/index.ts +7 -0
  158. package/src/llm/anthropic/llm.spec.ts +3 -3
  159. package/src/llm/anthropic/utils/message_inputs.ts +6 -4
  160. package/src/llm/bedrock/llm.spec.ts +1 -1
  161. package/src/llm/bedrock/utils/message_inputs.ts +6 -2
  162. package/src/llm/init.ts +63 -0
  163. package/src/llm/invoke.ts +144 -0
  164. package/src/llm/request.ts +55 -0
  165. package/src/messages/__tests__/observationMasking.test.ts +221 -0
  166. package/src/messages/cache.ts +77 -102
  167. package/src/messages/contextPruning.ts +191 -0
  168. package/src/messages/contextPruningSettings.ts +90 -0
  169. package/src/messages/core.ts +32 -53
  170. package/src/messages/ensureThinkingBlock.test.ts +39 -39
  171. package/src/messages/format.ts +227 -15
  172. package/src/messages/formatAgentMessages.test.ts +511 -1
  173. package/src/messages/index.ts +3 -0
  174. package/src/messages/prune.ts +1548 -62
  175. package/src/messages/reducer.ts +22 -0
  176. package/src/run.ts +104 -51
  177. package/src/scripts/bedrock-merge-test.ts +1 -1
  178. package/src/scripts/test-thinking-handoff-bedrock.ts +1 -1
  179. package/src/scripts/test-thinking-handoff.ts +1 -1
  180. package/src/scripts/thinking-bedrock.ts +1 -1
  181. package/src/scripts/thinking.ts +1 -1
  182. package/src/specs/anthropic.simple.test.ts +1 -1
  183. package/src/specs/multi-agent-summarization.test.ts +396 -0
  184. package/src/specs/prune.test.ts +1196 -23
  185. package/src/specs/summarization-unit.test.ts +868 -0
  186. package/src/specs/summarization.test.ts +3810 -0
  187. package/src/specs/summarize-prune.test.ts +376 -0
  188. package/src/specs/thinking-handoff.test.ts +10 -10
  189. package/src/specs/thinking-prune.test.ts +7 -4
  190. package/src/specs/token-accounting-e2e.test.ts +1034 -0
  191. package/src/specs/token-accounting-pipeline.test.ts +882 -0
  192. package/src/specs/token-distribution-edge-case.test.ts +25 -26
  193. package/src/splitStream.test.ts +42 -33
  194. package/src/stream.ts +64 -11
  195. package/src/summarization/__tests__/aggregator.test.ts +153 -0
  196. package/src/summarization/__tests__/node.test.ts +708 -0
  197. package/src/summarization/__tests__/trigger.test.ts +50 -0
  198. package/src/summarization/index.ts +102 -0
  199. package/src/summarization/node.ts +982 -0
  200. package/src/tools/ToolNode.ts +25 -3
  201. package/src/types/graph.ts +62 -7
  202. package/src/types/index.ts +1 -0
  203. package/src/types/run.ts +32 -0
  204. package/src/types/stream.ts +45 -5
  205. package/src/types/summarize.ts +58 -0
  206. package/src/types/tools.ts +7 -0
  207. package/src/utils/errors.ts +117 -0
  208. package/src/utils/events.ts +31 -0
  209. package/src/utils/handlers.ts +18 -0
  210. package/src/utils/index.ts +2 -0
  211. package/src/utils/llm.ts +12 -0
  212. package/src/utils/tokens.ts +336 -18
  213. package/src/utils/truncation.ts +124 -0
  214. package/src/scripts/image.ts +0 -180
@@ -1,6 +1,7 @@
1
1
  import {
2
2
  AIMessage,
3
3
  BaseMessage,
4
+ ToolMessage,
4
5
  UsageMetadata,
5
6
  } from '@langchain/core/messages';
6
7
  import type {
@@ -9,7 +10,53 @@ import type {
9
10
  ReasoningContentText,
10
11
  } from '@/types/stream';
11
12
  import type { TokenCounter } from '@/types/run';
12
- import { ContentTypes, Providers } from '@/common';
13
+ import type { ContextPruningConfig } from '@/types/graph';
14
+ import {
15
+ calculateMaxToolResultChars,
16
+ truncateToolResultContent,
17
+ truncateToolInput,
18
+ } from '@/utils/truncation';
19
+ import { resolveContextPruningSettings } from './contextPruningSettings';
20
+ import { ContentTypes, Providers, Constants } from '@/common';
21
+ import { applyContextPruning } from './contextPruning';
22
+
/**
 * Adds up the token counts stored under the numeric keys `0 .. count - 1`.
 * Keys whose entry is missing or undefined contribute nothing to the sum.
 *
 * @param tokenMap - Token counts keyed by message index.
 * @param count - Number of leading indices to include.
 * @returns The summed token count.
 */
function sumTokenCounts(
  tokenMap: Record<string, number | undefined>,
  count: number
): number {
  let sum = 0;
  let index = 0;
  while (index < count) {
    const entry = tokenMap[index];
    if (entry != null) {
      sum += entry;
    }
    index += 1;
  }
  return sum;
}
33
+
34
+ /** Default fraction of the token budget reserved as headroom (0.05 = 5 %). */
35
+ export const DEFAULT_RESERVE_RATIO = 0.05;
36
+
37
+ /** Context pressure threshold for observation masking and context fading; presumably a 0–1 fraction of the budget — confirm at the usage site. */
38
+ const PRESSURE_THRESHOLD_MASKING = 0.8;
39
+
40
+ /** `[pressureThreshold, budgetFactor]` pairs for progressive context fading, listed from the highest threshold to the lowest. */
41
+ const PRESSURE_BANDS: [number, number][] = [
42
+ [0.99, 0.05],
43
+ [0.9, 0.2],
44
+ [0.85, 0.5],
45
+ [0.8, 1.0],
46
+ ];
47
+
48
+ /** Maximum character length for masked (consumed) tool results. */
49
+ const MASKED_RESULT_MAX_CHARS = 300;
50
+
51
+ /** Hard cap, in characters, for the originalToolContent store (roughly 2 MB when estimated from char length). */
52
+ const ORIGINAL_CONTENT_MAX_CHARS = 2_000_000;
53
+
54
+ /** Lower bound for the cumulative calibration ratio — assumes the provider will not count far
55
+ * fewer tokens than our raw estimate. Keeps the ratio away from zero (divide-by-zero edge cases). */
56
+ const CALIBRATION_RATIO_MIN = 0.5;
57
+
58
+ /** Upper bound for the cumulative calibration ratio — sanity cap for the running ratio. */
59
+ const CALIBRATION_RATIO_MAX = 5;
13
60
 
14
61
  export type PruneMessagesFactoryParams = {
15
62
  provider?: Providers;
@@ -18,13 +65,455 @@ export type PruneMessagesFactoryParams = {
18
65
  tokenCounter: TokenCounter;
19
66
  indexTokenCountMap: Record<string, number | undefined>;
20
67
  thinkingEnabled?: boolean;
68
+ /** Context pruning configuration for position-based tool result degradation. */
69
+ contextPruningConfig?: ContextPruningConfig;
70
+ /**
71
+ * When true, context pressure fading (pre-flight tool result truncation)
72
+ * is skipped. Summarization replaces pruning as the primary context
73
+ * management strategy — the summarizer needs full un-truncated tool results
74
+ * to produce an accurate summary. Hard pruning still runs as a fallback
75
+ * when summarization is skipped or capped.
76
+ */
77
+ summarizationEnabled?: boolean;
78
+ /**
79
+ * Returns the current instruction-token overhead (system message + tool schemas + summary).
80
+ * Called on each prune invocation so the budget reflects dynamic changes
81
+ * (e.g. summary added between turns). When messages don't include a leading
82
+ * SystemMessage, these tokens are subtracted from the available budget so
83
+ * the pruner correctly reserves space for the system prompt that will be
84
+ * prepended later by `buildSystemRunnable`.
85
+ */
86
+ getInstructionTokens?: () => number;
87
+ /**
88
+ * Fraction of the effective token budget to reserve as headroom (0–1).
89
+ * When set, pruning triggers at `effectiveMax * (1 - reserveRatio)` instead of
90
+ * filling the context window to 100%. Defaults to 5 % (0.05) when omitted.
91
+ */
92
+ reserveRatio?: number;
93
+ /**
94
+ * Initial calibration ratio from a previous run's persisted contextMeta.
95
+ * Seeds the running EMA so new messages are scaled immediately instead
96
+ * of waiting for the first provider response. Ignored when <= 0.
97
+ */
98
+ calibrationRatio?: number;
99
+ /** Optional diagnostic log callback wired by the graph for observability. */
100
+ log?: (
101
+ level: 'debug' | 'info' | 'warn' | 'error',
102
+ message: string,
103
+ data?: Record<string, unknown>
104
+ ) => void;
21
105
  };
22
106
  export type PruneMessagesParams = {
23
107
  messages: BaseMessage[];
24
108
  usageMetadata?: Partial<UsageMetadata>;
25
109
  startType?: ReturnType<BaseMessage['getType']>;
110
+ /**
111
+ * Usage from the most recent LLM call only (not accumulated).
112
+ * When provided, calibration uses this instead of usageMetadata
113
+ * to avoid inflated ratios from N×cacheRead accumulation.
114
+ */
115
+ lastCallUsage?: {
116
+ totalTokens: number;
117
+ inputTokens?: number;
118
+ };
119
+ /**
120
+ * Whether the token data is fresh (from a just-completed LLM call).
121
+ * When false, provider calibration is skipped to avoid applying
122
+ * stale ratios.
123
+ */
124
+ totalTokensFresh?: boolean;
26
125
  };
27
126
 
/**
 * Collects the IDs of every tool call announced by an AI message.
 *
 * IDs are gathered from two places: the `tool_calls` array and, when the
 * message content is an array, any content part typed `tool_use` or
 * `tool_call`. Only non-empty string IDs are kept.
 *
 * @param message - Message to inspect.
 * @returns The set of tool-call IDs; empty when `getType()` is not `'ai'`.
 */
function getToolCallIds(message: BaseMessage): Set<string> {
  const ids = new Set<string>();
  if (message.getType() !== 'ai') {
    return ids;
  }

  const aiMessage = message as AIMessage;
  for (const toolCall of aiMessage.tool_calls ?? []) {
    if (typeof toolCall.id === 'string' && toolCall.id.length > 0) {
      ids.add(toolCall.id);
    }
  }

  if (!Array.isArray(aiMessage.content)) {
    return ids;
  }

  for (const part of aiMessage.content) {
    // `typeof null` is 'object', so null has to be rejected explicitly;
    // otherwise reading `.type` below would throw on a null content part.
    if (part == null || typeof part !== 'object') {
      continue;
    }
    const record = part as { type?: unknown; id?: unknown };
    const isToolBlock =
      record.type === 'tool_use' || record.type === 'tool_call';
    if (isToolBlock && typeof record.id === 'string' && record.id.length > 0) {
      ids.add(record.id);
    }
  }

  return ids;
}
158
+
/**
 * Reads the tool-call ID that a tool message responds to.
 *
 * The snake_case `tool_call_id` field is checked first and the camelCase
 * `toolCallId` field second; a field only counts when it holds a non-empty
 * string.
 *
 * @param message - Message to inspect.
 * @returns The tool-call ID, or `null` when `getType()` is not `'tool'` or
 *   neither field holds a usable ID.
 */
function getToolResultId(message: BaseMessage): string | null {
  if (message.getType() !== 'tool') {
    return null;
  }

  const toolMessage = message as ToolMessage & {
    tool_call_id?: unknown;
    toolCallId?: unknown;
  };
  const candidateKeys = ['tool_call_id', 'toolCallId'] as const;

  for (const key of candidateKeys) {
    const value: unknown = toolMessage[key];
    if (typeof value === 'string' && value.length > 0) {
      return value;
    }
  }

  return null;
}
181
+
/**
 * Returns the token count for a message, preferring a stored count over
 * recounting.
 *
 * The message's index is looked up in `messageIndexMap`; when that index has
 * a non-null entry in `indexTokenCountMap` the stored value is returned.
 * Otherwise the message is counted with `tokenCounter`.
 */
function resolveTokenCountForMessage({
  message,
  messageIndexMap,
  tokenCounter,
  indexTokenCountMap,
}: {
  message: BaseMessage;
  messageIndexMap: Map<BaseMessage, number>;
  tokenCounter: TokenCounter;
  indexTokenCountMap: Record<string, number | undefined>;
}): number {
  const position = messageIndexMap.get(message);
  if (position != null && position > -1) {
    const stored = indexTokenCountMap[position];
    if (stored != null) {
      return stored;
    }
  }
  return tokenCounter(message);
}
199
+
/**
 * Removes tool-call/tool-result mismatches from a pruned context.
 *
 * - A tool message is dropped when it has no resolvable tool-call ID or when
 *   no AI message in `context` announces that ID.
 * - An AI message (an `AIMessage` instance) that announces a tool call with no
 *   result in `context` has its unanswered calls stripped; when nothing is left
 *   after stripping, the whole message is dropped.
 *
 * Token counts come from `indexTokenCountMap` (indexed by position in
 * `allMessages`) when available and from `tokenCounter` otherwise.
 *
 * @returns The repaired context plus bookkeeping about what was removed.
 */
export function repairOrphanedToolMessages({
  context,
  allMessages,
  tokenCounter,
  indexTokenCountMap,
}: {
  context: BaseMessage[];
  allMessages: BaseMessage[];
  tokenCounter: TokenCounter;
  indexTokenCountMap: Record<string, number | undefined>;
}): {
  context: BaseMessage[];
  reclaimedTokens: number;
  droppedOrphanCount: number;
  /** Messages removed from context during orphan repair. These should be
   *  appended to `messagesToRefine` so that summarization can still see them
   *  (e.g. a ToolMessage whose parent AI was pruned). */
  droppedMessages: BaseMessage[];
} {
  // Position of every message in the full history, for stored-count lookups.
  const messageIndexMap = new Map<BaseMessage, number>();
  for (const [position, entry] of allMessages.entries()) {
    messageIndexMap.set(entry, position);
  }

  // First pass: which call IDs are announced, and which are answered.
  const validToolCallIds = new Set<string>();
  const presentToolResultIds = new Set<string>();
  for (const entry of context) {
    getToolCallIds(entry).forEach((id) => validToolCallIds.add(id));
    const answeredId = getToolResultId(entry);
    if (answeredId != null) {
      presentToolResultIds.add(answeredId);
    }
  }

  const countTokens = (entry: BaseMessage): number =>
    resolveTokenCountForMessage({
      message: entry,
      messageIndexMap,
      tokenCounter,
      indexTokenCountMap,
    });

  const repairedContext: BaseMessage[] = [];
  const droppedMessages: BaseMessage[] = [];
  let reclaimedTokens = 0;
  let droppedOrphanCount = 0;

  // Second pass: keep, strip, or drop each message.
  for (const entry of context) {
    const kind = entry.getType();

    if (kind === 'tool') {
      const answeredId = getToolResultId(entry);
      const isOrphan = answeredId == null || !validToolCallIds.has(answeredId);
      if (isOrphan) {
        droppedOrphanCount += 1;
        reclaimedTokens += countTokens(entry);
        droppedMessages.push(entry);
      } else {
        repairedContext.push(entry);
      }
      continue;
    }

    if (kind === 'ai' && entry instanceof AIMessage) {
      const announcedIds = getToolCallIds(entry);
      const hasUnansweredCall = [...announcedIds].some(
        (id) => !presentToolResultIds.has(id)
      );

      if (hasUnansweredCall) {
        const originalTokens = countTokens(entry);
        const stripped = stripOrphanToolUseBlocks(entry, presentToolResultIds);
        if (stripped == null) {
          droppedOrphanCount += 1;
          reclaimedTokens += originalTokens;
          droppedMessages.push(entry);
        } else {
          // Only the difference is reclaimed; the stripped message stays.
          const strippedTokens = tokenCounter(stripped);
          reclaimedTokens += originalTokens - strippedTokens;
          repairedContext.push(stripped);
        }
        continue;
      }
    }

    repairedContext.push(entry);
  }

  return {
    context: repairedContext,
    reclaimedTokens,
    droppedOrphanCount,
    droppedMessages,
  };
}
304
+
/**
 * Strips tool_use / tool_call content blocks and `tool_calls` entries from an
 * AI message when their IDs are not in `presentToolResultIds`.
 *
 * @param message - AI message to clean up; it is not mutated.
 * @param presentToolResultIds - IDs of the tool calls that may stay.
 * @returns A new `AIMessage` with the remaining content and tool calls, or
 *   `null` when the content was an array, every block was stripped, and no
 *   tool call remains. String content is always carried over unchanged.
 */
function stripOrphanToolUseBlocks(
  message: AIMessage,
  presentToolResultIds: Set<string>
): AIMessage | null {
  const keptToolCalls = (message.tool_calls ?? []).filter(
    (tc) => typeof tc.id === 'string' && presentToolResultIds.has(tc.id)
  );

  let keptContent: MessageContentComplex[] | string;
  if (Array.isArray(message.content)) {
    const filtered = (message.content as MessageContentComplex[]).filter(
      (block) => {
        // `typeof null` is 'object'; without the explicit null check a null
        // block would reach the `.type` read below and throw.
        if (block == null || typeof block !== 'object') {
          return true;
        }
        const record = block as { type?: unknown; id?: unknown };
        const isToolBlock =
          record.type === 'tool_use' || record.type === 'tool_call';
        if (isToolBlock && typeof record.id === 'string') {
          return presentToolResultIds.has(record.id);
        }
        return true;
      }
    );

    // Drop the message only when nothing at all is left. A message whose
    // content emptied out but which still carries answered tool calls must
    // survive, otherwise the matching tool results would become orphans.
    // This matches the plain-object path in `sanitizeOrphanToolBlocks`.
    if (filtered.length === 0 && keptToolCalls.length === 0) {
      return null;
    }
    keptContent = filtered;
  } else {
    keptContent = message.content;
  }

  return new AIMessage({
    ...message,
    content: keptContent,
    tool_calls: keptToolCalls.length > 0 ? keptToolCalls : undefined,
  });
}
350
+
/**
 * Lightweight structural cleanup: strips orphan tool_use blocks from AI messages
 * and drops orphan ToolMessages whose AI counterpart is missing.
 *
 * Unlike `repairOrphanedToolMessages`, this does NOT track tokens — it is
 * intended as a final safety net in Graph.ts right before model invocation
 * to prevent Anthropic/Bedrock structural validation errors.
 *
 * Uses duck-typing instead of relying on `getType()` alone because messages
 * at this stage may be plain objects (from LangGraph state serialization)
 * rather than proper BaseMessage class instances.
 *
 * Tool-call IDs starting with `Constants.ANTHROPIC_SERVER_TOOL_PREFIX` are
 * never treated as orphans: they are ignored when detecting orphans and are
 * kept when other calls are stripped from the same message.
 *
 * Includes a fast-path: if every tool call has a matching tool result and
 * vice-versa, the original array is returned unchanged.
 *
 * @param messages - Messages about to be sent to the model; not mutated.
 * @returns `messages` itself when nothing is orphaned, otherwise a new array.
 */
export function sanitizeOrphanToolBlocks(
  messages: BaseMessage[]
): BaseMessage[] {
  const allToolCallIds = new Set<string>();
  const allToolResultIds = new Set<string>();
  const serverToolIds = new Set<string>();

  const isToolBlockType = (type: unknown): boolean =>
    type === 'tool_use' || type === 'tool_call';

  const recordCallId = (id: string): void => {
    if (id.startsWith(Constants.ANTHROPIC_SERVER_TOOL_PREFIX)) {
      serverToolIds.add(id);
    } else {
      allToolCallIds.add(id);
    }
  };

  // Pass 1: index every tool-call ID and every tool-result ID.
  for (const msg of messages) {
    const msgAny = msg as unknown as Record<string, unknown>;
    const toolCalls = msgAny.tool_calls as Array<{ id?: string }> | undefined;
    if (Array.isArray(toolCalls)) {
      for (const tc of toolCalls) {
        const id = tc?.id;
        if (typeof id === 'string' && id.length > 0) {
          recordCallId(id);
        }
      }
    }
    if (Array.isArray(msgAny.content)) {
      for (const block of msgAny.content as Array<Record<
        string,
        unknown
      > | null>) {
        // `typeof null` is 'object', so null blocks are skipped explicitly.
        if (block == null || typeof block !== 'object') {
          continue;
        }
        if (isToolBlockType(block.type) && typeof block.id === 'string') {
          recordCallId(block.id);
        }
      }
    }
    const toolCallId = msgAny.tool_call_id as string | undefined;
    if (typeof toolCallId === 'string' && toolCallId.length > 0) {
      allToolResultIds.add(toolCallId);
    }
  }

  // Fast path: calls and results pair up exactly, nothing to repair.
  let hasOrphans = false;
  for (const id of allToolCallIds) {
    if (!allToolResultIds.has(id)) {
      hasOrphans = true;
      break;
    }
  }
  if (!hasOrphans) {
    for (const id of allToolResultIds) {
      if (!allToolCallIds.has(id)) {
        hasOrphans = true;
        break;
      }
    }
  }
  if (!hasOrphans) {
    return messages;
  }

  // IDs allowed to stay on an AI message: answered calls plus server-tool
  // calls. Pass 1 already exempts server-tool IDs from orphan detection, so
  // stripping them here only when some unrelated orphan exists would be
  // inconsistent.
  const keepableCallIds = new Set<string>([
    ...allToolResultIds,
    ...serverToolIds,
  ]);

  const result: BaseMessage[] = [];
  const strippedAiIndices = new Set<number>();

  // Pass 2: drop orphan tool results and strip orphan calls from AI messages.
  for (const msg of messages) {
    const msgAny = msg as unknown as Record<string, unknown>;
    const msgType =
      typeof (msg as { getType?: unknown }).getType === 'function'
        ? msg.getType()
        : ((msgAny.role as string | undefined) ??
          (msgAny._type as string | undefined));

    const toolCallId = msgAny.tool_call_id as string | undefined;
    if (
      (msgType === 'tool' || msg instanceof ToolMessage) &&
      typeof toolCallId === 'string' &&
      !allToolCallIds.has(toolCallId)
    ) {
      continue;
    }

    const toolCalls = msgAny.tool_calls as Array<{ id?: string }> | undefined;
    if (
      (msgType === 'ai' ||
        msgType === 'assistant' ||
        msg instanceof AIMessage) &&
      Array.isArray(toolCalls) &&
      toolCalls.length > 0
    ) {
      const hasOrphanCalls = toolCalls.some((tc) => {
        const id = tc?.id;
        return typeof id === 'string' && !keepableCallIds.has(id);
      });
      if (hasOrphanCalls) {
        if (msg instanceof AIMessage) {
          const stripped = stripOrphanToolUseBlocks(msg, keepableCallIds);
          if (stripped != null) {
            strippedAiIndices.add(result.length);
            result.push(stripped);
          }
          continue;
        }

        // Plain-object AI message: filter calls and content by hand.
        const keptToolCalls = toolCalls.filter((tc) => {
          const id = tc?.id;
          return typeof id === 'string' && keepableCallIds.has(id);
        });
        const keptContent = Array.isArray(msgAny.content)
          ? (msgAny.content as Array<Record<string, unknown> | null>).filter(
              (block) => {
                if (block == null || typeof block !== 'object') return true;
                if (isToolBlockType(block.type) && typeof block.id === 'string') {
                  return keepableCallIds.has(block.id);
                }
                return true;
              }
            )
          : msgAny.content;
        if (
          keptToolCalls.length === 0 &&
          Array.isArray(keptContent) &&
          keptContent.length === 0
        ) {
          continue;
        }
        strippedAiIndices.add(result.length);
        // Copy with the same prototype and own properties so the original
        // message object is left untouched.
        const patched = Object.create(
          Object.getPrototypeOf(msg),
          Object.getOwnPropertyDescriptors(msg)
        );
        patched.tool_calls = keptToolCalls.length > 0 ? keptToolCalls : [];
        patched.content = keptContent;
        result.push(patched as BaseMessage);
        continue;
      }
    }

    result.push(msg);
  }

  // Bedrock/Anthropic require the conversation to end with a user message;
  // a stripped AI message (tool_use removed) represents a dead-end exchange.
  while (result.length > 0 && strippedAiIndices.has(result.length - 1)) {
    result.pop();
  }

  return result;
}
510
+
511
+ /**
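512
+ * NOTE(review): misplaced comment. It describes truncating an oversized tool_use `input`
513
+ * field using head+tail (head ~70%, tail ~30%, so both the beginning and the end stay
514
+ * visible) while preserving it as a valid JSON object, falling back to head-only when the
515
+ * budget is too small for a meaningful tail. That is not what `isIndexInContext` (declared directly below) does; presumably it belongs with `truncateToolInput` from '@/utils/truncation' — confirm and relocate.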
516
+ */
28
517
  function isIndexInContext(
29
518
  arrayA: unknown[],
30
519
  arrayB: unknown[],
@@ -69,9 +558,14 @@ export function calculateTotalTokens(
69
558
  const baseInputTokens = Number(usage.input_tokens) || 0;
70
559
  const cacheCreation = Number(usage.input_token_details?.cache_creation) || 0;
71
560
  const cacheRead = Number(usage.input_token_details?.cache_read) || 0;
72
-
73
- const totalInputTokens = baseInputTokens + cacheCreation + cacheRead;
74
561
  const totalOutputTokens = Number(usage.output_tokens) || 0;
562
+ const cacheSum = cacheCreation + cacheRead;
563
+ // Anthropic: input_tokens excludes cache, cache_read can be much larger than input_tokens.
564
+ // OpenAI: input_tokens includes cache, cache_read is always <= input_tokens.
565
+ const cacheIsAdditive = cacheSum > 0 && cacheSum > baseInputTokens;
566
+ const totalInputTokens = cacheIsAdditive
567
+ ? baseInputTokens + cacheSum
568
+ : baseInputTokens;
75
569
 
76
570
  return {
77
571
  input_tokens: totalInputTokens,
@@ -103,6 +597,7 @@ export function getMessagesWithinTokenLimit({
103
597
  tokenCounter,
104
598
  thinkingStartIndex: _thinkingStartIndex = -1,
105
599
  reasoningType = ContentTypes.THINKING,
600
+ instructionTokens: _instructionTokens = 0,
106
601
  }: {
107
602
  messages: BaseMessage[];
108
603
  maxContextTokens: number;
@@ -112,6 +607,13 @@ export function getMessagesWithinTokenLimit({
112
607
  tokenCounter: TokenCounter;
113
608
  thinkingStartIndex?: number;
114
609
  reasoningType?: ContentTypes.THINKING | ContentTypes.REASONING_CONTENT;
610
+ /**
611
+ * Token overhead for instructions (system message + tool schemas + summary)
612
+ * that are NOT included in `messages`. When messages[0] is already a
613
+ * SystemMessage the budget is deducted from its indexTokenCountMap entry
614
+ * as before; otherwise this value is subtracted from the available budget.
615
+ */
616
+ instructionTokens?: number;
115
617
  }): PruningResult {
116
618
  // Every reply is primed with <|start|>assistant<|message|>, so we
117
619
  // start with 3 tokens for the label after all messages have been counted.
@@ -119,7 +621,7 @@ export function getMessagesWithinTokenLimit({
119
621
  const instructions =
120
622
  _messages[0]?.getType() === 'system' ? _messages[0] : undefined;
121
623
  const instructionsTokenCount =
122
- instructions != null ? (indexTokenCountMap[0] ?? 0) : 0;
624
+ instructions != null ? (indexTokenCountMap[0] ?? 0) : _instructionTokens;
123
625
  const initialContextTokens = maxContextTokens - instructionsTokenCount;
124
626
  let remainingContextTokens = initialContextTokens;
125
627
  let startType = _startType;
@@ -242,6 +744,18 @@ export function getMessagesWithinTokenLimit({
242
744
  messages.shift();
243
745
  }
244
746
 
747
+ // The backward iteration pushed messages in reverse chronological order
748
+ // (newest first). Restore correct chronological order before prepending
749
+ // the remaining (older) messages so that messagesToRefine is always
750
+ // ordered oldest → newest. Without this, callers that rely on
751
+ // messagesToRefine order (e.g. the summarization node extracting the
752
+ // latest turn) would see tool_use/tool_result pairs in the wrong order.
753
+ prunedMemory.reverse();
754
+
755
+ if (messages.length > 0) {
756
+ prunedMemory.unshift(...messages);
757
+ }
758
+
245
759
  remainingContextTokens -= currentTokenCount;
246
760
  const result: PruningResult = {
247
761
  remainingContextTokens,
@@ -259,7 +773,6 @@ export function getMessagesWithinTokenLimit({
259
773
  (thinkingStartIndex > -1 &&
260
774
  isIndexInContext(_messages, context, thinkingStartIndex))
261
775
  ) {
262
- // we reverse at this step to ensure the context is in the correct order for the model, and we need to work backwards
263
776
  result.context = context.reverse() as BaseMessage[];
264
777
  return result;
265
778
  }
@@ -276,9 +789,6 @@ export function getMessagesWithinTokenLimit({
276
789
  );
277
790
  }
278
791
 
279
- // Since we have a thinking sequence, we need to find the last assistant message
280
- // in the latest AI/tool sequence to add the thinking block that falls outside of the current context
281
- // Latest messages are ordered first.
282
792
  let assistantIndex = -1;
283
793
  for (let i = 0; i < context.length; i++) {
284
794
  const currentMessage = context[i];
@@ -292,9 +802,10 @@ export function getMessagesWithinTokenLimit({
292
802
  }
293
803
 
294
804
  if (assistantIndex === -1) {
295
- throw new Error(
296
- 'Context window exceeded: aggressive pruning removed all AI messages (likely due to an oversized tool response). Increase max context tokens or reduce tool output size.'
297
- );
805
+ // No AI messages survived pruning — skip thinking block reattachment.
806
+ // The caller handles empty/insufficient context via overflow recovery.
807
+ result.context = context.reverse() as BaseMessage[];
808
+ return result;
298
809
  }
299
810
 
300
811
  thinkingStartIndex = originalLength - 1 - assistantIndex;
@@ -313,7 +824,6 @@ export function getMessagesWithinTokenLimit({
313
824
  }
314
825
 
315
826
  const thinkingMessage: AIMessage = context[assistantIndex] as AIMessage;
316
- // now we need to an additional round of pruning but making the thinking block fit
317
827
  const newThinkingMessageTokenCount =
318
828
  (indexTokenCountMap[thinkingStartIndex] ?? 0) + thinkingTokenCount;
319
829
  remainingContextTokens = initialContextTokens - newThinkingMessageTokenCount;
@@ -389,6 +899,288 @@ export function checkValidNumber(value: unknown): value is number {
389
899
  return typeof value === 'number' && !isNaN(value) && value > 0;
390
900
  }
391
901
 
902
/**
 * Observation masking: swaps the content of already-consumed ToolMessages for
 * tight head+tail truncations that serve as informative placeholders.
 *
 * A ToolMessage is "consumed" when a later AI message exists that carries
 * non-empty text (i.e. is NOT purely tool calls) — meaning the model has
 * already read and acted on the result. Unconsumed results (the latest tool
 * outputs the model has not responded to yet) are left intact so the model
 * can still use them.
 *
 * AI messages are never masked — they hold the model's own reasoning and
 * conclusions, which is what keeps the model from repeating work once its
 * tool results are masked.
 *
 * Tool messages whose content is not a plain string, or whose content already
 * fits within its character cap, are left untouched and are not counted.
 *
 * Side effects: every masked entry of `messages` is replaced in place by a new
 * ToolMessage, and the matching `indexTokenCountMap` entry is recounted with
 * `tokenCounter`.
 *
 * @returns The number of tool messages that were masked.
 */
export function maskConsumedToolResults(params: {
  messages: BaseMessage[];
  indexTokenCountMap: Record<string, number | undefined>;
  tokenCounter: TokenCounter;
  /** Raw-space token budget available for all consumed tool results combined.
   *  When provided (and positive), it is converted to characters (x4) and
   *  distributed across consumed results weighted by recency: the newest gets
   *  the largest share, and no result is capped below MASKED_RESULT_MAX_CHARS.
   *  When omitted, a flat MASKED_RESULT_MAX_CHARS cap is used per result. */
  availableRawBudget?: number;
  /** When provided, original (pre-masking) content is stored here keyed by
   *  message index — only for entries that actually get truncated, and only
   *  if the index is not already present. */
  originalContentStore?: Map<number, string>;
  /** Called after storing content with the char length of the stored entry. */
  onContentStored?: (charLength: number) => void;
}): number {
  const {
    messages,
    indexTokenCountMap,
    tokenCounter,
    availableRawBudget,
    originalContentStore,
    onContentStored,
  } = params;

  // Pass 1 (newest → oldest): a tool message is consumed once an AI message
  // with substantive text has been seen later in the conversation. Indices
  // are gathered newest-first here and flipped to oldest-first below.
  const consumedIndices: number[] = [];
  let seenNonToolCallAI = false;

  for (let i = messages.length - 1; i >= 0; i--) {
    const candidate = messages[i];
    const kind = candidate.getType();

    if (kind === 'ai') {
      const aiContent = candidate.content;
      const hasText =
        typeof aiContent === 'string'
          ? aiContent.trim().length > 0
          : Array.isArray(aiContent) &&
            aiContent.some((block) => {
              // `typeof null === 'object'`, so rule out null explicitly
              // before reading properties off the block.
              if (block == null || typeof block !== 'object') {
                return false;
              }
              const record = block as Record<string, unknown>;
              const text = record.text;
              return (
                record.type === 'text' &&
                typeof text === 'string' &&
                text.trim().length > 0
              );
            });
      if (hasText) {
        seenNonToolCallAI = true;
      }
    } else if (kind === 'tool' && seenNonToolCallAI) {
      consumedIndices.push(i);
    }
  }

  if (consumedIndices.length === 0) {
    return 0;
  }

  // Oldest first, so that the loop position doubles as a recency rank.
  consumedIndices.reverse();

  const count = consumedIndices.length;
  const totalBudgetChars =
    availableRawBudget != null && availableRawBudget > 0
      ? availableRawBudget * 4
      : 0;
  // Per-item weights run linearly from 0.2 (oldest) to 1.0 (newest); their
  // sum over `count` items is 0.6 * count. A single item takes the whole
  // budget.
  const totalWeight = count > 1 ? 0.6 * count : 1;

  let maskedCount = 0;

  for (let rank = 0; rank < count; rank++) {
    const index = consumedIndices[rank];
    const message = messages[index];
    const content = message.content;
    if (typeof content !== 'string') {
      continue;
    }

    let maxChars = MASKED_RESULT_MAX_CHARS;
    if (totalBudgetChars > 0) {
      const position = count > 1 ? rank / (count - 1) : 1;
      const weight = 0.2 + 0.8 * position;
      const share = (weight / totalWeight) * totalBudgetChars;
      maxChars = Math.max(MASKED_RESULT_MAX_CHARS, Math.floor(share));
    }

    if (content.length <= maxChars) {
      continue;
    }

    // Keep the first-seen original so repeated masking passes never
    // overwrite it with an already-truncated version.
    if (originalContentStore && !originalContentStore.has(index)) {
      originalContentStore.set(index, content);
      if (onContentStored) {
        onContentStored(content.length);
      }
    }

    const source = message as ToolMessage;
    const cloned = new ToolMessage({
      content: truncateToolResultContent(content, maxChars),
      tool_call_id: source.tool_call_id,
      name: source.name,
      id: source.id,
      additional_kwargs: source.additional_kwargs,
      response_metadata: source.response_metadata,
      // Carry over the result status and artifact so a masked error result
      // is still reported as an error and attached artifacts are not lost.
      ...(source.status !== undefined ? { status: source.status } : {}),
      ...(source.artifact !== undefined ? { artifact: source.artifact } : {}),
    });
    messages[index] = cloned;
    indexTokenCountMap[index] = tokenCounter(cloned);
    maskedCount++;
  }

  return maskedCount;
}
1025
+
1026
/**
 * Pre-flight truncation: shortens oversized ToolMessage content before the
 * main backward-iteration pruning runs. Unlike the ingestion guard (which caps
 * at tool-execution time), pre-flight truncation applies per-turn based on the
 * current context window budget (which may have shrunk as the conversation
 * grew).
 *
 * The per-message character cap is the base cap returned by
 * `calculateMaxToolResultChars(maxContextTokens)` scaled by a recency factor
 * that runs linearly from 0.2 for the oldest tool message to 1.0 for the
 * newest, and is never lower than 200 characters. Tool messages with
 * non-string content, or content already within the cap, are left untouched.
 *
 * Side effects: every truncated entry of `messages` is replaced in place by a
 * new ToolMessage, and its `indexTokenCountMap` entry is recounted with
 * `tokenCounter` so subsequent pruning works with accurate counts.
 *
 * @returns The number of tool messages that were truncated.
 */
export function preFlightTruncateToolResults(params: {
  messages: BaseMessage[];
  maxContextTokens: number;
  indexTokenCountMap: Record<string, number | undefined>;
  tokenCounter: TokenCounter;
}): number {
  const { messages, maxContextTokens, indexTokenCountMap, tokenCounter } =
    params;
  const baseMaxChars = calculateMaxToolResultChars(maxContextTokens);

  // Collect tool-message positions up front: the recency factor depends on
  // each message's rank among ALL tool messages, not on its absolute index.
  const toolIndices: number[] = [];
  messages.forEach((entry, index) => {
    if (entry.getType() === 'tool') {
      toolIndices.push(index);
    }
  });

  const toolCount = toolIndices.length;
  let truncatedCount = 0;

  for (let rank = 0; rank < toolCount; rank++) {
    const index = toolIndices[rank];
    const message = messages[index];
    const content = message.content;
    if (typeof content !== 'string') {
      continue;
    }

    // A lone tool message is treated as the newest (factor 1.0).
    const position = toolCount > 1 ? rank / (toolCount - 1) : 1;
    const recencyFactor = 0.2 + 0.8 * position;
    const maxChars = Math.max(200, Math.floor(baseMaxChars * recencyFactor));

    if (content.length <= maxChars) {
      continue;
    }

    const source = message as ToolMessage;
    const cloned = new ToolMessage({
      content: truncateToolResultContent(content, maxChars),
      tool_call_id: source.tool_call_id,
      name: source.name,
      id: source.id,
      additional_kwargs: source.additional_kwargs,
      response_metadata: source.response_metadata,
      // Carry over the result status and artifact so a truncated error
      // result is still reported as an error and artifacts are not lost.
      ...(source.status !== undefined ? { status: source.status } : {}),
      ...(source.artifact !== undefined ? { artifact: source.artifact } : {}),
    });
    messages[index] = cloned;
    indexTokenCountMap[index] = tokenCounter(cloned);
    truncatedCount++;
  }

  return truncatedCount;
}
1087
+
1088
/**
 * Pre-flight truncation: shortens oversized `tool_use` / `tool_call` input
 * fields in AI messages.
 *
 * Tool call inputs (arguments) can be very large — e.g., code evaluation
 * payloads from MCP tools like chrome-devtools. Since these tool calls have
 * already been executed, the model only needs a summary of what was called,
 * not the full arguments. Truncating them before pruning can prevent entire
 * messages from being dropped.
 *
 * The per-input cap is 15% of the context window expressed in estimated
 * characters (~4 chars/token), capped at 200K chars.
 *
 * Only AI messages whose `content` is an array are inspected. When at least
 * one content block is truncated, the message's `tool_calls` args that exceed
 * the same cap are truncated as well, the message is replaced in place by a
 * new AIMessage, and its `indexTokenCountMap` entry is recounted.
 *
 * @returns The number of AI messages that had tool_use inputs truncated.
 */
export function preFlightTruncateToolCallInputs(params: {
  messages: BaseMessage[];
  maxContextTokens: number;
  indexTokenCountMap: Record<string, number | undefined>;
  tokenCounter: TokenCounter;
}): number {
  const { messages, maxContextTokens, indexTokenCountMap, tokenCounter } =
    params;
  const maxInputChars = Math.min(
    Math.floor(maxContextTokens * 0.15) * 4,
    200_000
  );
  let truncatedCount = 0;

  for (let i = 0; i < messages.length; i++) {
    const message = messages[i];
    if (message.getType() !== 'ai' || !Array.isArray(message.content)) {
      continue;
    }

    const originalContent = message.content as MessageContentComplex[];
    // Mutable holder (rather than a bare `let`) so the flag set inside the
    // map callback is visible to the checks after it.
    const tracker = { changed: false };
    const newContent = originalContent.map((block) => {
      // `typeof null === 'object'`, so rule out null explicitly before
      // reading properties off the block.
      if (block == null || typeof block !== 'object') {
        return block;
      }
      const record = block as Record<string, unknown>;
      if (record.type !== 'tool_use' && record.type !== 'tool_call') {
        return block;
      }

      const input = record.input;
      if (input == null) {
        return block;
      }
      const serialized =
        typeof input === 'string' ? input : JSON.stringify(input);
      if (serialized.length <= maxInputChars) {
        return block;
      }

      tracker.changed = true;
      // Replaces original input with { _truncated, _originalChars } —
      // safe because the tool call already executed in a prior turn.
      return {
        ...record,
        input: truncateToolInput(serialized, maxInputChars),
      };
    });

    if (!tracker.changed) {
      continue;
    }

    const aiMsg = message as AIMessage;
    // Keep the structured tool_calls in step with the truncated content
    // blocks so the same oversized payload does not survive there.
    const newToolCalls = (aiMsg.tool_calls ?? []).map((toolCall) => {
      const serializedArgs = JSON.stringify(toolCall.args);
      if (serializedArgs.length <= maxInputChars) {
        return toolCall;
      }
      // Replaces original args with { _truncated, _originalChars } —
      // safe because the tool call already executed in a prior turn.
      return {
        ...toolCall,
        args: truncateToolInput(serializedArgs, maxInputChars),
      };
    });

    messages[i] = new AIMessage({
      ...aiMsg,
      content: newContent,
      tool_calls: newToolCalls.length > 0 ? newToolCalls : undefined,
    });
    indexTokenCountMap[i] = tokenCounter(messages[i]);
    truncatedCount++;
  }

  return truncatedCount;
}
1183
+
392
1184
  type ThinkingBlocks = {
393
1185
  thinking_blocks?: Array<{
394
1186
  type: 'thinking';
@@ -401,15 +1193,63 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
401
1193
  const indexTokenCountMap = { ...factoryParams.indexTokenCountMap };
402
1194
  let lastTurnStartIndex = factoryParams.startIndex;
403
1195
  let lastCutOffIndex = 0;
404
- let totalTokens = Object.values(indexTokenCountMap).reduce(
405
- (a = 0, b = 0) => a + b,
406
- 0
407
- ) as number;
1196
+ let totalTokens = 0;
1197
+ for (const key in indexTokenCountMap) {
1198
+ totalTokens += indexTokenCountMap[key] ?? 0;
1199
+ }
408
1200
  let runThinkingStartIndex = -1;
1201
+ /** Cumulative raw tiktoken tokens we've sent to the provider (messages only,
1202
+ * excludes instruction overhead and new outputs not yet seen by provider). */
1203
+ let cumulativeRawSent = 0;
1204
+ /** Cumulative provider-reported message tokens (providerInput - instructionOverhead). */
1205
+ let cumulativeProviderReported = 0;
1206
+ /** Stable calibration ratio = cumulativeProviderReported / cumulativeRawSent.
1207
+ * Converges monotonically as data accumulates. Falls back to seeded value. */
1208
+ let calibrationRatio =
1209
+ factoryParams.calibrationRatio != null && factoryParams.calibrationRatio > 0
1210
+ ? factoryParams.calibrationRatio
1211
+ : 1;
1212
+ /** Best observed instruction overhead from a near-zero variance turn.
1213
+ * Self-seeds from provider observations within the run. */
1214
+ let bestInstructionOverhead: number | undefined;
1215
+ let bestVarianceAbs = Infinity;
1216
+ /** Local estimate at the time bestInstructionOverhead was observed.
1217
+ * Used to invalidate the cached overhead when instructions change
1218
+ * mid-run (e.g. tool discovery adds tools to the bound set). */
1219
+ let bestInstructionEstimate: number | undefined;
1220
+ /** Original (pre-masking) tool result content keyed by message index.
1221
+ * Allows the summarizer to see full tool outputs even after masking
1222
+ * has truncated them in the live message array. Cleared when the
1223
+ * pruner is recreated after summarization. */
1224
+ const originalToolContent = new Map<number, string>();
1225
+ let originalToolContentSize = 0;
1226
+ const contextPruningSettings = resolveContextPruningSettings(
1227
+ factoryParams.contextPruningConfig
1228
+ );
1229
+
409
1230
  return function pruneMessages(params: PruneMessagesParams): {
410
1231
  context: BaseMessage[];
411
1232
  indexTokenCountMap: Record<string, number | undefined>;
1233
+ messagesToRefine?: BaseMessage[];
1234
+ prePruneContextTokens?: number;
1235
+ remainingContextTokens?: number;
1236
+ contextPressure?: number;
1237
+ originalToolContent?: Map<number, string>;
1238
+ calibrationRatio?: number;
1239
+ resolvedInstructionOverhead?: number;
412
1240
  } {
1241
+ if (params.messages.length === 0) {
1242
+ return {
1243
+ context: [],
1244
+ indexTokenCountMap,
1245
+ messagesToRefine: [],
1246
+ prePruneContextTokens: 0,
1247
+ remainingContextTokens: factoryParams.maxTokens,
1248
+ calibrationRatio,
1249
+ resolvedInstructionOverhead: bestInstructionOverhead,
1250
+ };
1251
+ }
1252
+
413
1253
  if (
414
1254
  factoryParams.provider === Providers.OPENAI &&
415
1255
  factoryParams.thinkingEnabled === true
@@ -467,84 +1307,402 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
467
1307
  checkValidNumber(params.usageMetadata.output_tokens)
468
1308
  ) {
469
1309
  currentUsage = calculateTotalTokens(params.usageMetadata);
470
- totalTokens = currentUsage.total_tokens;
471
1310
  }
472
1311
 
473
1312
  const newOutputs = new Set<number>();
1313
+ let outputTokensAssigned = false;
474
1314
  for (let i = lastTurnStartIndex; i < params.messages.length; i++) {
475
1315
  const message = params.messages[i];
476
- if (
477
- i === lastTurnStartIndex &&
478
- indexTokenCountMap[i] === undefined &&
479
- currentUsage
480
- ) {
1316
+ if (indexTokenCountMap[i] !== undefined) {
1317
+ continue;
1318
+ }
1319
+
1320
+ // Assign output_tokens to the first uncounted AI message — this is the
1321
+ // model's response. Previous code blindly targeted lastTurnStartIndex
1322
+ // which could hit a pre-counted HumanMessage or miss the AI entirely.
1323
+ if (!outputTokensAssigned && currentUsage && message.getType() === 'ai') {
481
1324
  indexTokenCountMap[i] = currentUsage.output_tokens;
482
- } else if (indexTokenCountMap[i] === undefined) {
1325
+ newOutputs.add(i);
1326
+ outputTokensAssigned = true;
1327
+ } else {
1328
+ // Always store raw tiktoken count — the map stays in raw space.
1329
+ // Budget decisions multiply by calibrationRatio on the fly.
483
1330
  indexTokenCountMap[i] = factoryParams.tokenCounter(message);
484
1331
  if (currentUsage) {
485
1332
  newOutputs.add(i);
486
1333
  }
487
- totalTokens += indexTokenCountMap[i] ?? 0;
488
1334
  }
1335
+ totalTokens += indexTokenCountMap[i] ?? 0;
489
1336
  }
490
1337
 
491
- // If `currentUsage` is defined, we need to distribute the current total tokens to our `indexTokenCountMap`,
492
- // We must distribute it in a weighted manner, so that the total token count is equal to `currentUsage.total_tokens`,
493
- // relative the manually counted tokens in `indexTokenCountMap`.
494
- // EDGE CASE: when the resulting context gets pruned, we should not distribute the usage for messages that are not in the context.
495
- if (currentUsage) {
496
- let totalIndexTokens = 0;
497
- if (params.messages[0].getType() === 'system') {
498
- totalIndexTokens += indexTokenCountMap[0] ?? 0;
1338
+ // Cumulative calibration: accumulate raw tiktoken tokens and provider-
1339
+ // reported tokens across turns. The ratio of the two running totals
1340
+ // converges monotonically to the true provider multiplier — no EMA,
1341
+ // no per-turn oscillation, no map mutation.
1342
+ if (currentUsage && params.totalTokensFresh !== false) {
1343
+ const instructionOverhead = factoryParams.getInstructionTokens?.() ?? 0;
1344
+ const providerInputTokens =
1345
+ params.lastCallUsage?.inputTokens ?? currentUsage.input_tokens;
1346
+
1347
+ // Sum raw tiktoken counts for messages the provider saw (excludes
1348
+ // new outputs from this turn — the provider hasn't seen them yet).
1349
+ let rawSentThisTurn = 0;
1350
+ const firstIsSystem =
1351
+ params.messages.length > 0 && params.messages[0].getType() === 'system';
1352
+ if (firstIsSystem) {
1353
+ rawSentThisTurn += indexTokenCountMap[0] ?? 0;
499
1354
  }
500
1355
  for (let i = lastCutOffIndex; i < params.messages.length; i++) {
501
- if (i === 0 && params.messages[0].getType() === 'system') {
1356
+ if ((i === 0 && firstIsSystem) || newOutputs.has(i)) {
502
1357
  continue;
503
1358
  }
504
- if (newOutputs.has(i)) {
505
- continue;
506
- }
507
- totalIndexTokens += indexTokenCountMap[i] ?? 0;
1359
+ rawSentThisTurn += indexTokenCountMap[i] ?? 0;
508
1360
  }
509
1361
 
510
- // Calculate ratio based only on messages that remain in the context
511
- const ratio = currentUsage.total_tokens / totalIndexTokens;
512
- const isRatioSafe = ratio >= 1 / 3 && ratio <= 2.5;
1362
+ const providerMessageTokens = Math.max(
1363
+ 0,
1364
+ providerInputTokens - instructionOverhead
1365
+ );
513
1366
 
514
- // Apply the ratio adjustment only to messages at or after lastCutOffIndex, and only if the ratio is safe
515
- if (isRatioSafe) {
516
- if (
517
- params.messages[0].getType() === 'system' &&
518
- lastCutOffIndex !== 0
519
- ) {
520
- indexTokenCountMap[0] = Math.round(
521
- (indexTokenCountMap[0] ?? 0) * ratio
522
- );
523
- }
1367
+ if (rawSentThisTurn > 0 && providerMessageTokens > 0) {
1368
+ cumulativeRawSent += rawSentThisTurn;
1369
+ cumulativeProviderReported += providerMessageTokens;
1370
+ const newRatio = cumulativeProviderReported / cumulativeRawSent;
1371
+ calibrationRatio = Math.max(
1372
+ CALIBRATION_RATIO_MIN,
1373
+ Math.min(CALIBRATION_RATIO_MAX, newRatio)
1374
+ );
1375
+ }
524
1376
 
525
- for (let i = lastCutOffIndex; i < params.messages.length; i++) {
526
- if (newOutputs.has(i)) {
527
- continue;
528
- }
529
- indexTokenCountMap[i] = Math.round(
530
- (indexTokenCountMap[i] ?? 0) * ratio
531
- );
1377
+ const calibratedOurTotal =
1378
+ instructionOverhead + rawSentThisTurn * calibrationRatio;
1379
+ const overallRatio =
1380
+ calibratedOurTotal > 0 ? providerInputTokens / calibratedOurTotal : 0;
1381
+ const variancePct = Math.round((overallRatio - 1) * 100);
1382
+
1383
+ const absVariance = Math.abs(overallRatio - 1);
1384
+ if (absVariance < bestVarianceAbs && rawSentThisTurn > 0) {
1385
+ bestVarianceAbs = absVariance;
1386
+ bestInstructionOverhead = Math.max(
1387
+ 0,
1388
+ Math.round(providerInputTokens - rawSentThisTurn * calibrationRatio)
1389
+ );
1390
+ bestInstructionEstimate = factoryParams.getInstructionTokens?.() ?? 0;
1391
+ }
1392
+
1393
+ factoryParams.log?.('debug', 'Calibration observed', {
1394
+ providerInputTokens,
1395
+ calibratedEstimate: Math.round(calibratedOurTotal),
1396
+ variance: `${variancePct > 0 ? '+' : ''}${variancePct}%`,
1397
+ calibrationRatio: Math.round(calibrationRatio * 100) / 100,
1398
+ instructionOverhead,
1399
+ cumulativeRawSent,
1400
+ cumulativeProviderReported,
1401
+ });
1402
+ }
1403
+
1404
+ // Computed BEFORE pre-flight truncation so the effective budget can drive
1405
+ // truncation thresholds — without this, thresholds based on maxTokens are
1406
+ // too generous and leave individual messages larger than the actual budget.
1407
+ const estimatedInstructionTokens =
1408
+ factoryParams.getInstructionTokens?.() ?? 0;
1409
+ const estimateStable =
1410
+ bestInstructionEstimate != null &&
1411
+ bestInstructionEstimate > 0 &&
1412
+ Math.abs(estimatedInstructionTokens - bestInstructionEstimate) /
1413
+ bestInstructionEstimate <
1414
+ 0.1;
1415
+ const currentInstructionTokens =
1416
+ bestInstructionOverhead != null &&
1417
+ bestInstructionOverhead <= estimatedInstructionTokens &&
1418
+ estimateStable
1419
+ ? bestInstructionOverhead
1420
+ : estimatedInstructionTokens;
1421
+
1422
+ const reserveRatio = factoryParams.reserveRatio ?? DEFAULT_RESERVE_RATIO;
1423
+ const reserveTokens =
1424
+ reserveRatio > 0 && reserveRatio < 1
1425
+ ? Math.round(factoryParams.maxTokens * reserveRatio)
1426
+ : 0;
1427
+ const pruningBudget = factoryParams.maxTokens - reserveTokens;
1428
+
1429
+ const effectiveMaxTokens = Math.max(
1430
+ 0,
1431
+ pruningBudget - currentInstructionTokens
1432
+ );
1433
+
1434
+ let calibratedTotalTokens = Math.round(totalTokens * calibrationRatio);
1435
+
1436
+ factoryParams.log?.('debug', 'Budget', {
1437
+ maxTokens: factoryParams.maxTokens,
1438
+ pruningBudget,
1439
+ effectiveMax: effectiveMaxTokens,
1440
+ instructionTokens: currentInstructionTokens,
1441
+ messageCount: params.messages.length,
1442
+ calibratedTotalTokens,
1443
+ calibrationRatio: Math.round(calibrationRatio * 100) / 100,
1444
+ });
1445
+
1446
+ // When instructions alone consume the entire budget, no message can
1447
+ // fit regardless of truncation. Short-circuit: yield all messages for
1448
+ // summarization and return an empty context so the Graph can route to
1449
+ // the summarize node immediately instead of falling through to the
1450
+ // emergency path that would reach the same outcome more expensively.
1451
+ if (
1452
+ effectiveMaxTokens === 0 &&
1453
+ factoryParams.summarizationEnabled === true &&
1454
+ params.messages.length > 0
1455
+ ) {
1456
+ factoryParams.log?.(
1457
+ 'warn',
1458
+ 'Instructions consume entire budget — yielding all messages for summarization',
1459
+ {
1460
+ instructionTokens: currentInstructionTokens,
1461
+ pruningBudget,
1462
+ messageCount: params.messages.length,
532
1463
  }
1464
+ );
1465
+
1466
+ lastTurnStartIndex = params.messages.length;
1467
+ return {
1468
+ context: [],
1469
+ indexTokenCountMap,
1470
+ messagesToRefine: [...params.messages],
1471
+ prePruneContextTokens: calibratedTotalTokens,
1472
+ remainingContextTokens: 0,
1473
+ contextPressure:
1474
+ pruningBudget > 0 ? calibratedTotalTokens / pruningBudget : 0,
1475
+ calibrationRatio,
1476
+ resolvedInstructionOverhead: bestInstructionOverhead,
1477
+ };
1478
+ }
1479
+
1480
+ // ---------------------------------------------------------------------------
1481
+ // Progressive context fading — inspired by Claude Code's staged compaction.
1482
+ // Below 80%: no modifications, tool results retain full size.
1483
+ // Above 80%: graduated truncation with increasing aggression per pressure band.
1484
+ // Recency weighting ensures older results fade first, newer results last.
1485
+ //
1486
+ // At the gentlest level, truncation preserves most content (head+tail).
1487
+ // At the most aggressive level, the result is effectively a one-line placeholder.
1488
+ //
1489
+ // 80%: gentle — budget factor 1.0, oldest get light truncation
1490
+ // 85%: moderate — budget factor 0.50, older results shrink significantly
1491
+ // 90%: aggressive — budget factor 0.20, most results heavily truncated
1492
+ // 99%: emergency — budget factor 0.05, effectively placeholders for old results
1493
+ // ---------------------------------------------------------------------------
1494
+ totalTokens = sumTokenCounts(indexTokenCountMap, params.messages.length);
1495
+ calibratedTotalTokens = Math.round(totalTokens * calibrationRatio);
1496
+ const contextPressure =
1497
+ pruningBudget > 0 ? calibratedTotalTokens / pruningBudget : 0;
1498
+ let preFlightResultCount = 0;
1499
+ let preFlightInputCount = 0;
1500
+
1501
+ // -----------------------------------------------------------------------
1502
+ // Observation masking (80%+ pressure, both paths):
1503
+ // Replace consumed ToolMessage content with tight head+tail placeholders.
1504
+ // AI messages stay intact so the model can read its own prior reasoning
1505
+ // and won't repeat work. Unconsumed results (latest tool outputs the
1506
+ // model hasn't acted on yet) stay full.
1507
+ //
1508
+ // When summarization is enabled, snapshot messages first so the
1509
+ // summarizer can see the full originals when compaction fires.
1510
+ // -----------------------------------------------------------------------
1511
+ let observationsMasked = 0;
1512
+
1513
+ if (contextPressure >= PRESSURE_THRESHOLD_MASKING) {
1514
+ const rawMessageBudget =
1515
+ calibrationRatio > 0
1516
+ ? Math.floor(effectiveMaxTokens / calibrationRatio)
1517
+ : effectiveMaxTokens;
1518
+ // When summarization is enabled, use half the reserve ratio as extra
1519
+ // masking headroom — the LLM keeps more context while the summarizer
1520
+ // gets full content from originalToolContent regardless. The remaining
1521
+ // half of the reserve covers estimation errors.
1522
+ const reserveHeadroom =
1523
+ factoryParams.summarizationEnabled === true
1524
+ ? Math.floor(
1525
+ rawMessageBudget *
1526
+ (factoryParams.reserveRatio ?? DEFAULT_RESERVE_RATIO) *
1527
+ 0.5
1528
+ )
1529
+ : 0;
1530
+ observationsMasked = maskConsumedToolResults({
1531
+ messages: params.messages,
1532
+ indexTokenCountMap,
1533
+ tokenCounter: factoryParams.tokenCounter,
1534
+ availableRawBudget: rawMessageBudget + reserveHeadroom,
1535
+ originalContentStore:
1536
+ factoryParams.summarizationEnabled === true
1537
+ ? originalToolContent
1538
+ : undefined,
1539
+ onContentStored:
1540
+ factoryParams.summarizationEnabled === true
1541
+ ? (charLen: number): void => {
1542
+ originalToolContentSize += charLen;
1543
+ while (
1544
+ originalToolContentSize > ORIGINAL_CONTENT_MAX_CHARS &&
1545
+ originalToolContent.size > 0
1546
+ ) {
1547
+ const oldest = originalToolContent.keys().next();
1548
+ if (oldest.done === true) {
1549
+ break;
1550
+ }
1551
+ const removed = originalToolContent.get(oldest.value);
1552
+ if (removed != null) {
1553
+ originalToolContentSize -= removed.length;
1554
+ }
1555
+ originalToolContent.delete(oldest.value);
1556
+ }
1557
+ }
1558
+ : undefined,
1559
+ });
1560
+ if (observationsMasked > 0) {
1561
+ cumulativeRawSent = 0;
1562
+ cumulativeProviderReported = 0;
533
1563
  }
534
1564
  }
535
1565
 
1566
+ if (
1567
+ contextPressure >= PRESSURE_THRESHOLD_MASKING &&
1568
+ factoryParams.summarizationEnabled !== true
1569
+ ) {
1570
+ const budgetFactor =
1571
+ PRESSURE_BANDS.find(
1572
+ ([threshold]) => contextPressure >= threshold
1573
+ )?.[1] ?? 1.0;
1574
+
1575
+ const baseBudget = Math.max(
1576
+ 1024,
1577
+ Math.floor(effectiveMaxTokens * budgetFactor)
1578
+ );
1579
+
1580
+ preFlightResultCount = preFlightTruncateToolResults({
1581
+ messages: params.messages,
1582
+ maxContextTokens: baseBudget,
1583
+ indexTokenCountMap,
1584
+ tokenCounter: factoryParams.tokenCounter,
1585
+ });
1586
+
1587
+ preFlightInputCount = preFlightTruncateToolCallInputs({
1588
+ messages: params.messages,
1589
+ maxContextTokens: baseBudget,
1590
+ indexTokenCountMap,
1591
+ tokenCounter: factoryParams.tokenCounter,
1592
+ });
1593
+ }
1594
+ if (
1595
+ factoryParams.contextPruningConfig?.enabled === true &&
1596
+ factoryParams.summarizationEnabled !== true
1597
+ ) {
1598
+ applyContextPruning({
1599
+ messages: params.messages,
1600
+ indexTokenCountMap,
1601
+ tokenCounter: factoryParams.tokenCounter,
1602
+ resolvedSettings: contextPruningSettings,
1603
+ });
1604
+ }
1605
+
1606
+ // Fit-to-budget: when summarization is enabled and individual messages
1607
+ // exceed the effective budget, truncate them so every message can fit in
1608
+ // a single context slot. Without this, oversized tool results (e.g.
1609
+ // take_snapshot at 9K chars) cause empty context → emergency truncation
1610
+ // → immediate re-summarization after just one tool call.
1611
+ //
1612
+ // This is NOT the lossy position-based fading above — it only targets
1613
+ // messages that individually exceed the budget, using the full effective
1614
+ // budget as the cap (not a pressure-scaled fraction).
1615
+ // Fit-to-budget caps are in raw space (divide by ratio) so that after
1616
+ // calibration the truncated results actually fit within the budget.
1617
+ const rawSpaceEffectiveMax =
1618
+ calibrationRatio > 0
1619
+ ? Math.round(effectiveMaxTokens / calibrationRatio)
1620
+ : effectiveMaxTokens;
1621
+
1622
+ if (
1623
+ factoryParams.summarizationEnabled === true &&
1624
+ rawSpaceEffectiveMax > 0
1625
+ ) {
1626
+ preFlightResultCount = preFlightTruncateToolResults({
1627
+ messages: params.messages,
1628
+ maxContextTokens: rawSpaceEffectiveMax,
1629
+ indexTokenCountMap,
1630
+ tokenCounter: factoryParams.tokenCounter,
1631
+ });
1632
+
1633
+ preFlightInputCount = preFlightTruncateToolCallInputs({
1634
+ messages: params.messages,
1635
+ maxContextTokens: rawSpaceEffectiveMax,
1636
+ indexTokenCountMap,
1637
+ tokenCounter: factoryParams.tokenCounter,
1638
+ });
1639
+ }
1640
+
1641
+ const preTruncationTotalTokens = totalTokens;
1642
+ totalTokens = sumTokenCounts(indexTokenCountMap, params.messages.length);
1643
+ calibratedTotalTokens = Math.round(totalTokens * calibrationRatio);
1644
+
1645
+ const anyAdjustment =
1646
+ observationsMasked > 0 ||
1647
+ preFlightResultCount > 0 ||
1648
+ preFlightInputCount > 0 ||
1649
+ totalTokens !== preTruncationTotalTokens;
1650
+
1651
+ if (anyAdjustment) {
1652
+ factoryParams.log?.('debug', 'Context adjusted', {
1653
+ contextPressure: Math.round(contextPressure * 100),
1654
+ observationsMasked,
1655
+ toolOutputsTruncated: preFlightResultCount,
1656
+ toolInputsTruncated: preFlightInputCount,
1657
+ tokensBefore: preTruncationTotalTokens,
1658
+ tokensAfter: totalTokens,
1659
+ tokensSaved: preTruncationTotalTokens - totalTokens,
1660
+ });
1661
+ }
1662
+
536
1663
  lastTurnStartIndex = params.messages.length;
537
- if (lastCutOffIndex === 0 && totalTokens <= factoryParams.maxTokens) {
538
- return { context: params.messages, indexTokenCountMap };
1664
+ if (
1665
+ lastCutOffIndex === 0 &&
1666
+ calibratedTotalTokens + currentInstructionTokens <= pruningBudget
1667
+ ) {
1668
+ return {
1669
+ context: params.messages,
1670
+ indexTokenCountMap,
1671
+ messagesToRefine: [],
1672
+ prePruneContextTokens: calibratedTotalTokens,
1673
+ remainingContextTokens:
1674
+ pruningBudget - calibratedTotalTokens - currentInstructionTokens,
1675
+ contextPressure,
1676
+ originalToolContent:
1677
+ originalToolContent.size > 0 ? originalToolContent : undefined,
1678
+ calibrationRatio,
1679
+ resolvedInstructionOverhead: bestInstructionOverhead,
1680
+ };
539
1681
  }
540
1682
 
541
- const { context, thinkingStartIndex } = getMessagesWithinTokenLimit({
542
- maxContextTokens: factoryParams.maxTokens,
1683
+ const rawSpaceBudget =
1684
+ calibrationRatio > 0
1685
+ ? Math.round(pruningBudget / calibrationRatio)
1686
+ : pruningBudget;
1687
+
1688
+ const rawSpaceInstructionTokens =
1689
+ calibrationRatio > 0
1690
+ ? Math.round(currentInstructionTokens / calibrationRatio)
1691
+ : currentInstructionTokens;
1692
+
1693
+ const {
1694
+ context: initialContext,
1695
+ thinkingStartIndex,
1696
+ messagesToRefine,
1697
+ remainingContextTokens: initialRemainingContextTokens,
1698
+ } = getMessagesWithinTokenLimit({
1699
+ maxContextTokens: rawSpaceBudget,
543
1700
  messages: params.messages,
544
1701
  indexTokenCountMap,
545
1702
  startType: params.startType,
546
1703
  thinkingEnabled: factoryParams.thinkingEnabled,
547
1704
  tokenCounter: factoryParams.tokenCounter,
1705
+ instructionTokens: rawSpaceInstructionTokens,
548
1706
  reasoningType:
549
1707
  factoryParams.provider === Providers.BEDROCK
550
1708
  ? ContentTypes.REASONING_CONTENT
@@ -554,6 +1712,323 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
554
1712
  ? runThinkingStartIndex
555
1713
  : undefined,
556
1714
  });
1715
+
1716
+ const {
1717
+ context: repairedContext,
1718
+ reclaimedTokens: initialReclaimedTokens,
1719
+ droppedMessages,
1720
+ } = repairOrphanedToolMessages({
1721
+ context: initialContext,
1722
+ allMessages: params.messages,
1723
+ tokenCounter: factoryParams.tokenCounter,
1724
+ indexTokenCountMap,
1725
+ });
1726
+
1727
+ const contextBreakdown = repairedContext.map((msg) => {
1728
+ const type = msg.getType();
1729
+ const name = type === 'tool' ? (msg.name ?? 'unknown') : '';
1730
+ return name !== '' ? `${type}(${name})` : type;
1731
+ });
1732
+ factoryParams.log?.('debug', 'Pruning complete', {
1733
+ contextLength: repairedContext.length,
1734
+ contextTypes: contextBreakdown.join(', '),
1735
+ messagesToRefineCount: messagesToRefine.length,
1736
+ droppedOrphans: droppedMessages.length,
1737
+ remainingTokens: initialRemainingContextTokens,
1738
+ });
1739
+
1740
+ let context = repairedContext;
1741
+ let reclaimedTokens = initialReclaimedTokens;
1742
+
1743
+ // Orphan repair may drop ToolMessages whose parent AI was pruned.
1744
+ // Append them to messagesToRefine so summarization can still see the
1745
+ // tool results (otherwise the summary says "in progress" for a tool
1746
+ // call that already completed, causing the model to repeat it).
1747
+ if (droppedMessages.length > 0) {
1748
+ messagesToRefine.push(...droppedMessages);
1749
+ }
1750
+
1751
+ // ---------------------------------------------------------------
1752
+ // Fallback fading: when summarization skipped fading earlier and
1753
+ // pruning still produced an empty context, apply lossy pressure-band
1754
+ // fading and retry. This is a last resort before emergency truncation
1755
+ // — the summarizer already saw the full messages, so fading the
1756
+ // surviving context for the LLM is acceptable.
1757
+ // ---------------------------------------------------------------
1758
+ if (
1759
+ context.length === 0 &&
1760
+ params.messages.length > 0 &&
1761
+ effectiveMaxTokens > 0 &&
1762
+ factoryParams.summarizationEnabled === true
1763
+ ) {
1764
+ const fadingBudget = Math.max(1024, effectiveMaxTokens);
1765
+
1766
+ factoryParams.log?.(
1767
+ 'debug',
1768
+ 'Fallback fading — empty context with summarization',
1769
+ {
1770
+ messageCount: params.messages.length,
1771
+ effectiveMaxTokens,
1772
+ fadingBudget,
1773
+ }
1774
+ );
1775
+
1776
+ const fadedMessages = [...params.messages];
1777
+ const preFadingTokenCounts: Record<string, number | undefined> = {};
1778
+ for (let i = 0; i < params.messages.length; i++) {
1779
+ preFadingTokenCounts[i] = indexTokenCountMap[i];
1780
+ }
1781
+
1782
+ preFlightTruncateToolResults({
1783
+ messages: fadedMessages,
1784
+ maxContextTokens: fadingBudget,
1785
+ indexTokenCountMap,
1786
+ tokenCounter: factoryParams.tokenCounter,
1787
+ });
1788
+ preFlightTruncateToolCallInputs({
1789
+ messages: fadedMessages,
1790
+ maxContextTokens: fadingBudget,
1791
+ indexTokenCountMap,
1792
+ tokenCounter: factoryParams.tokenCounter,
1793
+ });
1794
+
1795
+ const fadingRetry = getMessagesWithinTokenLimit({
1796
+ maxContextTokens: pruningBudget,
1797
+ messages: fadedMessages,
1798
+ indexTokenCountMap,
1799
+ startType: params.startType,
1800
+ thinkingEnabled: factoryParams.thinkingEnabled,
1801
+ tokenCounter: factoryParams.tokenCounter,
1802
+ instructionTokens: currentInstructionTokens,
1803
+ reasoningType:
1804
+ factoryParams.provider === Providers.BEDROCK
1805
+ ? ContentTypes.REASONING_CONTENT
1806
+ : ContentTypes.THINKING,
1807
+ thinkingStartIndex:
1808
+ factoryParams.thinkingEnabled === true
1809
+ ? runThinkingStartIndex
1810
+ : undefined,
1811
+ });
1812
+
1813
+ const fadingRepaired = repairOrphanedToolMessages({
1814
+ context: fadingRetry.context,
1815
+ allMessages: fadedMessages,
1816
+ tokenCounter: factoryParams.tokenCounter,
1817
+ indexTokenCountMap,
1818
+ });
1819
+
1820
+ if (fadingRepaired.context.length > 0) {
1821
+ context = fadingRepaired.context;
1822
+ reclaimedTokens = fadingRepaired.reclaimedTokens;
1823
+ messagesToRefine.push(...fadingRetry.messagesToRefine);
1824
+ if (fadingRepaired.droppedMessages.length > 0) {
1825
+ messagesToRefine.push(...fadingRepaired.droppedMessages);
1826
+ }
1827
+
1828
+ factoryParams.log?.('debug', 'Fallback fading recovered context', {
1829
+ contextLength: context.length,
1830
+ messagesToRefineCount: messagesToRefine.length,
1831
+ remainingTokens: fadingRetry.remainingContextTokens,
1832
+ });
1833
+
1834
+ for (const [key, value] of Object.entries(preFadingTokenCounts)) {
1835
+ indexTokenCountMap[key] = value;
1836
+ }
1837
+ } else {
1838
+ for (const [key, value] of Object.entries(preFadingTokenCounts)) {
1839
+ indexTokenCountMap[key] = value;
1840
+ }
1841
+ }
1842
+ }
1843
+
1844
+ // ---------------------------------------------------------------
1845
+ // Emergency truncation: if pruning produced an empty context but
1846
+ // messages exist, aggressively truncate all tool_call inputs and
1847
+ // tool results, then retry. Budget is proportional to the
1848
+ // effective token limit (~4 chars/token, spread across messages)
1849
+ // with a floor of 200 chars so content is never completely blank.
1850
+ // Uses head+tail so the model sees both what was called and the
1851
+ // final outcome (e.g., return value at the end of a script eval).
1852
+ // ---------------------------------------------------------------
1853
+ if (
1854
+ context.length === 0 &&
1855
+ params.messages.length > 0 &&
1856
+ effectiveMaxTokens > 0
1857
+ ) {
1858
+ const perMessageTokenBudget = Math.floor(
1859
+ effectiveMaxTokens / Math.max(1, params.messages.length)
1860
+ );
1861
+ const emergencyMaxChars = Math.max(200, perMessageTokenBudget * 4);
1862
+
1863
+ factoryParams.log?.(
1864
+ 'warn',
1865
+ 'Empty context, entering emergency truncation',
1866
+ {
1867
+ messageCount: params.messages.length,
1868
+ effectiveMax: effectiveMaxTokens,
1869
+ emergencyMaxChars,
1870
+ }
1871
+ );
1872
+
1873
+ // Clone the messages array so emergency truncation doesn't permanently
1874
+ // mutate graph state. The originals remain intact for future turns
1875
+ // where more budget may be available. Also snapshot indexTokenCountMap
1876
+ // entries so the closure doesn't retain stale (too-small) counts for
1877
+ // the original un-truncated messages on the next turn.
1878
+ const emergencyMessages = [...params.messages];
1879
+ const preEmergencyTokenCounts: Record<string, number | undefined> = {};
1880
+ for (let i = 0; i < params.messages.length; i++) {
1881
+ preEmergencyTokenCounts[i] = indexTokenCountMap[i];
1882
+ }
1883
+
1884
+ try {
1885
+ let emergencyTruncatedCount = 0;
1886
+ for (let i = 0; i < emergencyMessages.length; i++) {
1887
+ const message = emergencyMessages[i];
1888
+ if (message.getType() === 'tool') {
1889
+ const content = message.content;
1890
+ if (
1891
+ typeof content === 'string' &&
1892
+ content.length > emergencyMaxChars
1893
+ ) {
1894
+ const cloned = new ToolMessage({
1895
+ content: truncateToolResultContent(content, emergencyMaxChars),
1896
+ tool_call_id: (message as ToolMessage).tool_call_id,
1897
+ name: message.name,
1898
+ id: message.id,
1899
+ additional_kwargs: message.additional_kwargs,
1900
+ response_metadata: message.response_metadata,
1901
+ });
1902
+ emergencyMessages[i] = cloned;
1903
+ indexTokenCountMap[i] = factoryParams.tokenCounter(cloned);
1904
+ emergencyTruncatedCount++;
1905
+ }
1906
+ }
1907
+ if (message.getType() === 'ai' && Array.isArray(message.content)) {
1908
+ const aiMsg = message as AIMessage;
1909
+ const contentBlocks = aiMsg.content as MessageContentComplex[];
1910
+ const needsTruncation = contentBlocks.some((block) => {
1911
+ if (typeof block !== 'object') return false;
1912
+ const record = block as Record<string, unknown>;
1913
+ if (
1914
+ (record.type === 'tool_use' || record.type === 'tool_call') &&
1915
+ record.input != null
1916
+ ) {
1917
+ const serialized =
1918
+ typeof record.input === 'string'
1919
+ ? record.input
1920
+ : JSON.stringify(record.input);
1921
+ return serialized.length > emergencyMaxChars;
1922
+ }
1923
+ return false;
1924
+ });
1925
+ if (needsTruncation) {
1926
+ const newContent = contentBlocks.map((block) => {
1927
+ if (typeof block !== 'object') return block;
1928
+ const record = block as Record<string, unknown>;
1929
+ if (
1930
+ (record.type === 'tool_use' || record.type === 'tool_call') &&
1931
+ record.input != null
1932
+ ) {
1933
+ const serialized =
1934
+ typeof record.input === 'string'
1935
+ ? record.input
1936
+ : JSON.stringify(record.input);
1937
+ if (serialized.length > emergencyMaxChars) {
1938
+ // Replaces original input with { _truncated, _originalChars } —
1939
+ // safe because the tool call already executed in a prior turn.
1940
+ return {
1941
+ ...record,
1942
+ input: truncateToolInput(serialized, emergencyMaxChars),
1943
+ };
1944
+ }
1945
+ }
1946
+ return block;
1947
+ });
1948
+ const newToolCalls = (aiMsg.tool_calls ?? []).map((tc) => {
1949
+ const serializedArgs = JSON.stringify(tc.args);
1950
+ if (serializedArgs.length > emergencyMaxChars) {
1951
+ // Replaces original args with { _truncated, _originalChars } —
1952
+ // safe because the tool call already executed in a prior turn.
1953
+ return {
1954
+ ...tc,
1955
+ args: truncateToolInput(serializedArgs, emergencyMaxChars),
1956
+ };
1957
+ }
1958
+ return tc;
1959
+ });
1960
+ emergencyMessages[i] = new AIMessage({
1961
+ ...aiMsg,
1962
+ content: newContent,
1963
+ tool_calls: newToolCalls.length > 0 ? newToolCalls : undefined,
1964
+ });
1965
+ indexTokenCountMap[i] = factoryParams.tokenCounter(
1966
+ emergencyMessages[i]
1967
+ );
1968
+ emergencyTruncatedCount++;
1969
+ }
1970
+ }
1971
+ }
1972
+
1973
+ factoryParams.log?.('info', 'Emergency truncation complete');
1974
+ factoryParams.log?.('debug', 'Emergency truncation details', {
1975
+ truncatedCount: emergencyTruncatedCount,
1976
+ emergencyMaxChars,
1977
+ });
1978
+
1979
+ const retryResult = getMessagesWithinTokenLimit({
1980
+ maxContextTokens: pruningBudget,
1981
+ messages: emergencyMessages,
1982
+ indexTokenCountMap,
1983
+ startType: params.startType,
1984
+ thinkingEnabled: factoryParams.thinkingEnabled,
1985
+ tokenCounter: factoryParams.tokenCounter,
1986
+ instructionTokens: currentInstructionTokens,
1987
+ reasoningType:
1988
+ factoryParams.provider === Providers.BEDROCK
1989
+ ? ContentTypes.REASONING_CONTENT
1990
+ : ContentTypes.THINKING,
1991
+ thinkingStartIndex:
1992
+ factoryParams.thinkingEnabled === true
1993
+ ? runThinkingStartIndex
1994
+ : undefined,
1995
+ });
1996
+
1997
+ const repaired = repairOrphanedToolMessages({
1998
+ context: retryResult.context,
1999
+ allMessages: emergencyMessages,
2000
+ tokenCounter: factoryParams.tokenCounter,
2001
+ indexTokenCountMap,
2002
+ });
2003
+
2004
+ context = repaired.context;
2005
+ reclaimedTokens = repaired.reclaimedTokens;
2006
+ messagesToRefine.push(...retryResult.messagesToRefine);
2007
+ if (repaired.droppedMessages.length > 0) {
2008
+ messagesToRefine.push(...repaired.droppedMessages);
2009
+ }
2010
+
2011
+ factoryParams.log?.('debug', 'Emergency truncation retry result', {
2012
+ contextLength: context.length,
2013
+ messagesToRefineCount: messagesToRefine.length,
2014
+ remainingTokens: retryResult.remainingContextTokens,
2015
+ });
2016
+ } finally {
2017
+ // Restore the closure's indexTokenCountMap to pre-emergency values so the
2018
+ // next turn counts old messages at their original (un-truncated) size.
2019
+ // The emergency-truncated counts were only needed for this turn's
2020
+ // getMessagesWithinTokenLimit retry.
2021
+ for (const [key, value] of Object.entries(preEmergencyTokenCounts)) {
2022
+ indexTokenCountMap[key] = value;
2023
+ }
2024
+ }
2025
+ }
2026
+
2027
+ const remainingContextTokens = Math.max(
2028
+ 0,
2029
+ Math.min(pruningBudget, initialRemainingContextTokens + reclaimedTokens)
2030
+ );
2031
+
557
2032
  runThinkingStartIndex = thinkingStartIndex ?? -1;
558
2033
  /** The index is the first value of `context`, index relative to `params.messages` */
559
2034
  lastCutOffIndex = Math.max(
@@ -562,6 +2037,17 @@ export function createPruneMessages(factoryParams: PruneMessagesFactoryParams) {
562
2037
  0
563
2038
  );
564
2039
 
565
- return { context, indexTokenCountMap };
2040
+ return {
2041
+ context,
2042
+ indexTokenCountMap,
2043
+ messagesToRefine,
2044
+ prePruneContextTokens: calibratedTotalTokens,
2045
+ remainingContextTokens,
2046
+ contextPressure,
2047
+ originalToolContent:
2048
+ originalToolContent.size > 0 ? originalToolContent : undefined,
2049
+ calibrationRatio,
2050
+ resolvedInstructionOverhead: bestInstructionOverhead,
2051
+ };
566
2052
  };
567
2053
  }