@librechat/agents 3.1.57 → 3.1.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cjs/agents/AgentContext.cjs +326 -62
- package/dist/cjs/agents/AgentContext.cjs.map +1 -1
- package/dist/cjs/common/enum.cjs +13 -0
- package/dist/cjs/common/enum.cjs.map +1 -1
- package/dist/cjs/events.cjs +7 -27
- package/dist/cjs/events.cjs.map +1 -1
- package/dist/cjs/graphs/Graph.cjs +303 -222
- package/dist/cjs/graphs/Graph.cjs.map +1 -1
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs +4 -4
- package/dist/cjs/llm/anthropic/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/llm/bedrock/utils/message_inputs.cjs +6 -2
- package/dist/cjs/llm/bedrock/utils/message_inputs.cjs.map +1 -1
- package/dist/cjs/llm/init.cjs +60 -0
- package/dist/cjs/llm/init.cjs.map +1 -0
- package/dist/cjs/llm/invoke.cjs +90 -0
- package/dist/cjs/llm/invoke.cjs.map +1 -0
- package/dist/cjs/llm/openai/index.cjs +2 -0
- package/dist/cjs/llm/openai/index.cjs.map +1 -1
- package/dist/cjs/llm/request.cjs +41 -0
- package/dist/cjs/llm/request.cjs.map +1 -0
- package/dist/cjs/main.cjs +40 -0
- package/dist/cjs/main.cjs.map +1 -1
- package/dist/cjs/messages/cache.cjs +76 -89
- package/dist/cjs/messages/cache.cjs.map +1 -1
- package/dist/cjs/messages/contextPruning.cjs +156 -0
- package/dist/cjs/messages/contextPruning.cjs.map +1 -0
- package/dist/cjs/messages/contextPruningSettings.cjs +53 -0
- package/dist/cjs/messages/contextPruningSettings.cjs.map +1 -0
- package/dist/cjs/messages/core.cjs +23 -37
- package/dist/cjs/messages/core.cjs.map +1 -1
- package/dist/cjs/messages/format.cjs +156 -11
- package/dist/cjs/messages/format.cjs.map +1 -1
- package/dist/cjs/messages/prune.cjs +1161 -49
- package/dist/cjs/messages/prune.cjs.map +1 -1
- package/dist/cjs/messages/reducer.cjs +87 -0
- package/dist/cjs/messages/reducer.cjs.map +1 -0
- package/dist/cjs/run.cjs +81 -42
- package/dist/cjs/run.cjs.map +1 -1
- package/dist/cjs/stream.cjs +54 -7
- package/dist/cjs/stream.cjs.map +1 -1
- package/dist/cjs/summarization/index.cjs +75 -0
- package/dist/cjs/summarization/index.cjs.map +1 -0
- package/dist/cjs/summarization/node.cjs +663 -0
- package/dist/cjs/summarization/node.cjs.map +1 -0
- package/dist/cjs/tools/ToolNode.cjs +16 -8
- package/dist/cjs/tools/ToolNode.cjs.map +1 -1
- package/dist/cjs/tools/handlers.cjs +2 -0
- package/dist/cjs/tools/handlers.cjs.map +1 -1
- package/dist/cjs/utils/errors.cjs +115 -0
- package/dist/cjs/utils/errors.cjs.map +1 -0
- package/dist/cjs/utils/events.cjs +17 -0
- package/dist/cjs/utils/events.cjs.map +1 -1
- package/dist/cjs/utils/handlers.cjs +16 -0
- package/dist/cjs/utils/handlers.cjs.map +1 -1
- package/dist/cjs/utils/llm.cjs +10 -0
- package/dist/cjs/utils/llm.cjs.map +1 -1
- package/dist/cjs/utils/tokens.cjs +247 -14
- package/dist/cjs/utils/tokens.cjs.map +1 -1
- package/dist/cjs/utils/truncation.cjs +107 -0
- package/dist/cjs/utils/truncation.cjs.map +1 -0
- package/dist/esm/agents/AgentContext.mjs +325 -61
- package/dist/esm/agents/AgentContext.mjs.map +1 -1
- package/dist/esm/common/enum.mjs +13 -0
- package/dist/esm/common/enum.mjs.map +1 -1
- package/dist/esm/events.mjs +8 -28
- package/dist/esm/events.mjs.map +1 -1
- package/dist/esm/graphs/Graph.mjs +307 -226
- package/dist/esm/graphs/Graph.mjs.map +1 -1
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs +4 -4
- package/dist/esm/llm/anthropic/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/llm/bedrock/utils/message_inputs.mjs +6 -2
- package/dist/esm/llm/bedrock/utils/message_inputs.mjs.map +1 -1
- package/dist/esm/llm/init.mjs +58 -0
- package/dist/esm/llm/init.mjs.map +1 -0
- package/dist/esm/llm/invoke.mjs +87 -0
- package/dist/esm/llm/invoke.mjs.map +1 -0
- package/dist/esm/llm/openai/index.mjs +2 -0
- package/dist/esm/llm/openai/index.mjs.map +1 -1
- package/dist/esm/llm/request.mjs +38 -0
- package/dist/esm/llm/request.mjs.map +1 -0
- package/dist/esm/main.mjs +13 -3
- package/dist/esm/main.mjs.map +1 -1
- package/dist/esm/messages/cache.mjs +76 -89
- package/dist/esm/messages/cache.mjs.map +1 -1
- package/dist/esm/messages/contextPruning.mjs +154 -0
- package/dist/esm/messages/contextPruning.mjs.map +1 -0
- package/dist/esm/messages/contextPruningSettings.mjs +50 -0
- package/dist/esm/messages/contextPruningSettings.mjs.map +1 -0
- package/dist/esm/messages/core.mjs +23 -37
- package/dist/esm/messages/core.mjs.map +1 -1
- package/dist/esm/messages/format.mjs +156 -11
- package/dist/esm/messages/format.mjs.map +1 -1
- package/dist/esm/messages/prune.mjs +1158 -52
- package/dist/esm/messages/prune.mjs.map +1 -1
- package/dist/esm/messages/reducer.mjs +83 -0
- package/dist/esm/messages/reducer.mjs.map +1 -0
- package/dist/esm/run.mjs +82 -43
- package/dist/esm/run.mjs.map +1 -1
- package/dist/esm/stream.mjs +54 -7
- package/dist/esm/stream.mjs.map +1 -1
- package/dist/esm/summarization/index.mjs +73 -0
- package/dist/esm/summarization/index.mjs.map +1 -0
- package/dist/esm/summarization/node.mjs +659 -0
- package/dist/esm/summarization/node.mjs.map +1 -0
- package/dist/esm/tools/ToolNode.mjs +16 -8
- package/dist/esm/tools/ToolNode.mjs.map +1 -1
- package/dist/esm/tools/handlers.mjs +2 -0
- package/dist/esm/tools/handlers.mjs.map +1 -1
- package/dist/esm/utils/errors.mjs +111 -0
- package/dist/esm/utils/errors.mjs.map +1 -0
- package/dist/esm/utils/events.mjs +17 -1
- package/dist/esm/utils/events.mjs.map +1 -1
- package/dist/esm/utils/handlers.mjs +16 -0
- package/dist/esm/utils/handlers.mjs.map +1 -1
- package/dist/esm/utils/llm.mjs +10 -1
- package/dist/esm/utils/llm.mjs.map +1 -1
- package/dist/esm/utils/tokens.mjs +245 -15
- package/dist/esm/utils/tokens.mjs.map +1 -1
- package/dist/esm/utils/truncation.mjs +102 -0
- package/dist/esm/utils/truncation.mjs.map +1 -0
- package/dist/types/agents/AgentContext.d.ts +124 -6
- package/dist/types/common/enum.d.ts +14 -1
- package/dist/types/graphs/Graph.d.ts +22 -27
- package/dist/types/index.d.ts +5 -0
- package/dist/types/llm/init.d.ts +18 -0
- package/dist/types/llm/invoke.d.ts +48 -0
- package/dist/types/llm/request.d.ts +14 -0
- package/dist/types/messages/contextPruning.d.ts +42 -0
- package/dist/types/messages/contextPruningSettings.d.ts +44 -0
- package/dist/types/messages/core.d.ts +1 -1
- package/dist/types/messages/format.d.ts +17 -1
- package/dist/types/messages/index.d.ts +3 -0
- package/dist/types/messages/prune.d.ts +162 -1
- package/dist/types/messages/reducer.d.ts +18 -0
- package/dist/types/run.d.ts +12 -1
- package/dist/types/summarization/index.d.ts +20 -0
- package/dist/types/summarization/node.d.ts +29 -0
- package/dist/types/tools/ToolNode.d.ts +3 -1
- package/dist/types/types/graph.d.ts +44 -6
- package/dist/types/types/index.d.ts +1 -0
- package/dist/types/types/run.d.ts +30 -0
- package/dist/types/types/stream.d.ts +31 -4
- package/dist/types/types/summarize.d.ts +47 -0
- package/dist/types/types/tools.d.ts +7 -0
- package/dist/types/utils/errors.d.ts +28 -0
- package/dist/types/utils/events.d.ts +13 -0
- package/dist/types/utils/index.d.ts +2 -0
- package/dist/types/utils/llm.d.ts +4 -0
- package/dist/types/utils/tokens.d.ts +14 -1
- package/dist/types/utils/truncation.d.ts +49 -0
- package/package.json +1 -1
- package/src/agents/AgentContext.ts +388 -58
- package/src/agents/__tests__/AgentContext.test.ts +265 -5
- package/src/common/enum.ts +13 -0
- package/src/events.ts +9 -39
- package/src/graphs/Graph.ts +468 -331
- package/src/index.ts +7 -0
- package/src/llm/anthropic/llm.spec.ts +3 -3
- package/src/llm/anthropic/utils/message_inputs.ts +6 -4
- package/src/llm/bedrock/llm.spec.ts +1 -1
- package/src/llm/bedrock/utils/message_inputs.ts +6 -2
- package/src/llm/init.ts +63 -0
- package/src/llm/invoke.ts +144 -0
- package/src/llm/request.ts +55 -0
- package/src/messages/__tests__/observationMasking.test.ts +221 -0
- package/src/messages/cache.ts +77 -102
- package/src/messages/contextPruning.ts +191 -0
- package/src/messages/contextPruningSettings.ts +90 -0
- package/src/messages/core.ts +32 -53
- package/src/messages/ensureThinkingBlock.test.ts +39 -39
- package/src/messages/format.ts +227 -15
- package/src/messages/formatAgentMessages.test.ts +511 -1
- package/src/messages/index.ts +3 -0
- package/src/messages/prune.ts +1548 -62
- package/src/messages/reducer.ts +22 -0
- package/src/run.ts +104 -51
- package/src/scripts/bedrock-merge-test.ts +1 -1
- package/src/scripts/test-thinking-handoff-bedrock.ts +1 -1
- package/src/scripts/test-thinking-handoff.ts +1 -1
- package/src/scripts/thinking-bedrock.ts +1 -1
- package/src/scripts/thinking.ts +1 -1
- package/src/specs/anthropic.simple.test.ts +1 -1
- package/src/specs/multi-agent-summarization.test.ts +396 -0
- package/src/specs/prune.test.ts +1196 -23
- package/src/specs/summarization-unit.test.ts +868 -0
- package/src/specs/summarization.test.ts +3810 -0
- package/src/specs/summarize-prune.test.ts +376 -0
- package/src/specs/thinking-handoff.test.ts +10 -10
- package/src/specs/thinking-prune.test.ts +7 -4
- package/src/specs/token-accounting-e2e.test.ts +1034 -0
- package/src/specs/token-accounting-pipeline.test.ts +882 -0
- package/src/specs/token-distribution-edge-case.test.ts +25 -26
- package/src/splitStream.test.ts +42 -33
- package/src/stream.ts +64 -11
- package/src/summarization/__tests__/aggregator.test.ts +153 -0
- package/src/summarization/__tests__/node.test.ts +708 -0
- package/src/summarization/__tests__/trigger.test.ts +50 -0
- package/src/summarization/index.ts +102 -0
- package/src/summarization/node.ts +982 -0
- package/src/tools/ToolNode.ts +25 -3
- package/src/types/graph.ts +62 -7
- package/src/types/index.ts +1 -0
- package/src/types/run.ts +32 -0
- package/src/types/stream.ts +45 -5
- package/src/types/summarize.ts +58 -0
- package/src/types/tools.ts +7 -0
- package/src/utils/errors.ts +117 -0
- package/src/utils/events.ts +31 -0
- package/src/utils/handlers.ts +18 -0
- package/src/utils/index.ts +2 -0
- package/src/utils/llm.ts +12 -0
- package/src/utils/tokens.ts +336 -18
- package/src/utils/truncation.ts +124 -0
- package/src/scripts/image.ts +0 -180
|
@@ -0,0 +1,3810 @@
|
|
|
1
|
+
/* eslint-disable no-console */
|
|
2
|
+
/* eslint-disable @typescript-eslint/no-explicit-any */
|
|
3
|
+
import { config } from 'dotenv';
|
|
4
|
+
config();
|
|
5
|
+
import { Calculator } from '@/tools/Calculator';
|
|
6
|
+
import {
|
|
7
|
+
HumanMessage,
|
|
8
|
+
AIMessage,
|
|
9
|
+
SystemMessage,
|
|
10
|
+
ToolMessage,
|
|
11
|
+
BaseMessage,
|
|
12
|
+
UsageMetadata,
|
|
13
|
+
} from '@langchain/core/messages';
|
|
14
|
+
import type * as t from '@/types';
|
|
15
|
+
import { ToolEndHandler, ModelEndHandler } from '@/events';
|
|
16
|
+
import { ContentTypes, GraphEvents, Providers } from '@/common';
|
|
17
|
+
import { createContentAggregator } from '@/stream';
|
|
18
|
+
import { createTokenCounter } from '@/utils/tokens';
|
|
19
|
+
import { getLLMConfig } from '@/utils/llmConfig';
|
|
20
|
+
import { Run } from '@/run';
|
|
21
|
+
import { formatAgentMessages } from '@/messages/format';
|
|
22
|
+
import { FakeListChatModel } from '@langchain/core/utils/testing';
|
|
23
|
+
import * as providers from '@/llm/providers';
|
|
24
|
+
|
|
25
|
+
/**
 * Test helper that flattens a summary block into a single plain-text string.
 *
 * Every entry of `summary.content` that has a `text` key contributes that
 * value; entries without one contribute an empty string. A missing summary or
 * missing `content` yields `''`.
 *
 * @param summary - The summary block to read, or `undefined`.
 * @returns The concatenated text of all content entries.
 */
function getSummaryText(summary: t.SummaryContentBlock | undefined): string {
  const blocks = summary?.content ?? [];
  const texts = blocks.map((entry) => {
    if ('text' in entry) {
      return (entry as { text: string }).text;
    }
    return '';
  });
  return texts.join('');
}
|
|
32
|
+
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Shared test infrastructure
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
/**
 * Creates a fresh set of Jest mock functions used to observe graph events in
 * these tests: message deltas, run steps, and summarization start/complete.
 *
 * @returns An object holding one independent `jest.fn()` per observed event.
 */
function createSpies(): {
  onMessageDeltaSpy: jest.Mock;
  onRunStepSpy: jest.Mock;
  onSummarizeStartSpy: jest.Mock;
  onSummarizeCompleteSpy: jest.Mock;
} {
  const onMessageDeltaSpy = jest.fn();
  const onRunStepSpy = jest.fn();
  const onSummarizeStartSpy = jest.fn();
  const onSummarizeCompleteSpy = jest.fn();
  return {
    onMessageDeltaSpy,
    onRunStepSpy,
    onSummarizeStartSpy,
    onSummarizeCompleteSpy,
  };
}
|
|
50
|
+
|
|
51
|
+
/**
 * Builds the custom event-handler table handed to `Run.create` in these tests.
 *
 * - Run-step, run-step-delta, run-step-completed and message-delta events are
 *   forwarded to `aggregateContent`.
 * - Run-step and message-delta events are additionally recorded on the
 *   matching spy together with their metadata and graph arguments.
 * - Summarize start/complete events are recorded on their spies (payload only).
 * - `TOOL_START` is registered with a no-op handler.
 *
 * @param collectedUsage - Array passed to the `ModelEndHandler` constructor
 *   (presumably where usage metadata gets accumulated — confirm in `@/events`).
 * @param aggregateContent - Content aggregator that receives stream events.
 * @param spies - Mock functions created by `createSpies`.
 * @returns A map from graph event name to its handler.
 */
function buildHandlers(
  collectedUsage: UsageMetadata[],
  aggregateContent: t.ContentAggregator,
  spies: ReturnType<typeof createSpies>
): Record<string | GraphEvents, t.EventHandler> {
  const runStepCompletedHandler: t.EventHandler = {
    handle: (
      event: GraphEvents.ON_RUN_STEP_COMPLETED,
      data: t.StreamEventData
    ): void => {
      const payload = data as unknown as { result: t.ToolEndEvent };
      aggregateContent({ event, data: payload });
    },
  };

  const runStepHandler: t.EventHandler = {
    handle: (
      event: GraphEvents.ON_RUN_STEP,
      data: t.StreamEventData,
      metadata,
      graph
    ): void => {
      // Record the raw call first so tests can inspect run steps later.
      spies.onRunStepSpy(event, data, metadata, graph);
      aggregateContent({ event, data: data as t.RunStep });
    },
  };

  const runStepDeltaHandler: t.EventHandler = {
    handle: (
      event: GraphEvents.ON_RUN_STEP_DELTA,
      data: t.StreamEventData
    ): void => {
      aggregateContent({ event, data: data as t.RunStepDeltaEvent });
    },
  };

  const messageDeltaHandler: t.EventHandler = {
    handle: (
      event: GraphEvents.ON_MESSAGE_DELTA,
      data: t.StreamEventData,
      metadata,
      graph
    ): void => {
      spies.onMessageDeltaSpy(event, data, metadata, graph);
      aggregateContent({ event, data: data as t.MessageDeltaEvent });
    },
  };

  // Intentionally does nothing; the event is registered but ignored.
  const toolStartHandler: t.EventHandler = {
    handle: (
      _event: string,
      _data: t.StreamEventData,
      _metadata?: Record<string, unknown>
    ): void => {},
  };

  const summarizeStartHandler: t.EventHandler = {
    handle: (
      _event: GraphEvents.ON_SUMMARIZE_START,
      data: t.StreamEventData
    ): void => {
      spies.onSummarizeStartSpy(data);
    },
  };

  const summarizeCompleteHandler: t.EventHandler = {
    handle: (
      _event: GraphEvents.ON_SUMMARIZE_COMPLETE,
      data: t.StreamEventData
    ): void => {
      spies.onSummarizeCompleteSpy(data);
    },
  };

  return {
    [GraphEvents.TOOL_END]: new ToolEndHandler(),
    [GraphEvents.CHAT_MODEL_END]: new ModelEndHandler(collectedUsage),
    [GraphEvents.ON_RUN_STEP_COMPLETED]: runStepCompletedHandler,
    [GraphEvents.ON_RUN_STEP]: runStepHandler,
    [GraphEvents.ON_RUN_STEP_DELTA]: runStepDeltaHandler,
    [GraphEvents.ON_MESSAGE_DELTA]: messageDeltaHandler,
    [GraphEvents.TOOL_START]: toolStartHandler,
    [GraphEvents.ON_SUMMARIZE_START]: summarizeStartHandler,
    [GraphEvents.ON_SUMMARIZE_COMPLETE]: summarizeCompleteHandler,
  };
}
|
|
125
|
+
|
|
126
|
+
/**
 * Creates a `Run` with summarization enabled for the given agent provider.
 *
 * The LLM config is the provider default from `getLLMConfig`, shallow-merged
 * with `opts.llmConfigOverride` (override keys win). When no token counter is
 * supplied one is created via `createTokenCounter()`; when no tools are
 * supplied a single `Calculator` is used.
 *
 * @param opts - Providers, context budget, instructions, event sinks and
 *   optional overrides for the run.
 * @returns The run produced by `Run.create`.
 */
async function createSummarizationRun(opts: {
  agentProvider: Providers;
  summarizationProvider: Providers;
  summarizationModel?: string;
  maxContextTokens: number;
  instructions: string;
  collectedUsage: UsageMetadata[];
  aggregateContent: t.ContentAggregator;
  spies: ReturnType<typeof createSpies>;
  tokenCounter?: t.TokenCounter;
  tools?: t.GraphTools;
  indexTokenCountMap?: Record<string, number>;
  llmConfigOverride?: Record<string, unknown>;
}): Promise<Run<t.IState>> {
  const baseConfig = getLLMConfig(opts.agentProvider);
  const llmConfig = { ...baseConfig, ...opts.llmConfigOverride };

  // Only build a counter when the caller did not provide one.
  const tokenCounter = opts.tokenCounter ?? (await createTokenCounter());

  // Timestamp suffix keeps run ids distinct across turns within one test.
  const runId = `sum-e2e-${opts.agentProvider}-${Date.now()}`;
  const tools = opts.tools ?? [new Calculator()];
  const customHandlers = buildHandlers(
    opts.collectedUsage,
    opts.aggregateContent,
    opts.spies
  );

  return Run.create<t.IState>({
    runId,
    graphConfig: {
      type: 'standard',
      llmConfig,
      tools,
      instructions: opts.instructions,
      maxContextTokens: opts.maxContextTokens,
      summarizationEnabled: true,
      summarizationConfig: {
        provider: opts.summarizationProvider,
        model: opts.summarizationModel,
      },
    },
    returnContent: true,
    customHandlers,
    tokenCounter,
    indexTokenCountMap: opts.indexTokenCountMap,
  });
}
|
|
170
|
+
|
|
171
|
+
/**
 * Runs one conversation turn against `state.run`.
 *
 * Appends the user message to `state.conversationHistory` (mutating it),
 * streams the run with the full history, then appends whatever
 * `getRunMessages()` returns so the next turn sees this turn's output.
 *
 * @param state - The run to drive and the shared, mutable message history.
 * @param userMessage - Text of the human message for this turn.
 * @param streamConfig - Config object forwarded to `processStream`.
 * @returns The value resolved by `processStream`.
 */
async function runTurn(
  state: { run: Run<t.IState>; conversationHistory: BaseMessage[] },
  userMessage: string,
  streamConfig: Record<string, unknown>
): Promise<t.MessageContentComplex[] | undefined> {
  const { run, conversationHistory } = state;

  conversationHistory.push(new HumanMessage(userMessage));

  const content = await run.processStream(
    { messages: conversationHistory },
    streamConfig as any
  );

  // getRunMessages() may return nothing; treat that as no new messages.
  const produced = run.getRunMessages() ?? [];
  conversationHistory.push(...produced);

  return content;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Asserts that summarization start and complete events were both emitted and
 * that the first payload of each looks well-formed.
 *
 * Checks, on the first recorded call of each spy: the start payload has an
 * agent id, a string provider and a positive `messagesToRefineCount`; the
 * complete payload has an agent id and a summary of type
 * `ContentTypes.SUMMARY` with more than 10 characters of text, a positive
 * token count, a provider and a creation timestamp. Finally verifies the
 * first start call happened before the first complete call.
 *
 * @param spies - Mock functions created by `createSpies`.
 * @returns The first start and complete payloads for further assertions.
 */
function assertSummarizationEvents(spies: ReturnType<typeof createSpies>): {
  startPayload: t.SummarizeStartEvent;
  completePayload: t.SummarizeCompleteEvent;
} {
  const { onSummarizeStartSpy, onSummarizeCompleteSpy } = spies;

  expect(onSummarizeStartSpy).toHaveBeenCalled();
  expect(onSummarizeCompleteSpy).toHaveBeenCalled();

  const startPayload = onSummarizeStartSpy.mock
    .calls[0][0] as t.SummarizeStartEvent;
  expect(startPayload.agentId).toBeDefined();
  expect(typeof startPayload.provider).toBe('string');
  expect(startPayload.messagesToRefineCount).toBeGreaterThan(0);

  const completePayload = onSummarizeCompleteSpy.mock
    .calls[0][0] as t.SummarizeCompleteEvent;
  expect(completePayload.agentId).toBeDefined();

  const summary = completePayload.summary;
  expect(summary).toBeDefined();
  expect(summary!.type).toBe(ContentTypes.SUMMARY);

  const summaryText = getSummaryText(summary);
  expect(typeof summaryText).toBe('string');
  expect(summaryText.length).toBeGreaterThan(10);
  expect(summary!.tokenCount ?? 0).toBeGreaterThan(0);
  expect(summary!.provider).toBeDefined();
  expect(summary!.createdAt).toBeDefined();

  // Jest's global invocation order lets us compare calls across two mocks.
  const [startOrder] = onSummarizeStartSpy.mock.invocationCallOrder;
  const [completeOrder] = onSummarizeCompleteSpy.mock.invocationCallOrder;
  expect(startOrder).toBeLessThan(completeOrder);

  return { startPayload, completePayload };
}
|
|
216
|
+
|
|
217
|
+
/**
 * Asserts that at least one recorded run step carries a summary, and that the
 * first such step has the expected summary text and basic step fields.
 *
 * @param spies - Mock functions created by `createSpies`; only
 *   `onRunStepSpy` is inspected (the step is its second call argument).
 * @param summaryText - Text the first summary run step is expected to hold.
 */
function assertSummaryRunStep(
  spies: ReturnType<typeof createSpies>,
  summaryText: string
): void {
  const stepsWithSummary = spies.onRunStepSpy.mock.calls
    .map((call) => call[1])
    .filter((payload) => (payload as any)?.summary != null);

  expect(stepsWithSummary.length).toBeGreaterThan(0);

  const firstStep = stepsWithSummary[0] as t.RunStep & {
    summary: t.SummaryContentBlock;
  };
  expect(firstStep.summary.type).toBe(ContentTypes.SUMMARY);
  expect(getSummaryText(firstStep.summary)).toBe(summaryText);
  expect(firstStep.id).toBeDefined();
  expect(typeof firstStep.stepIndex).toBe('number');
}
|
|
233
|
+
|
|
234
|
+
/**
 * Computes a per-message token count map keyed by the message's position.
 *
 * @param messages - Messages to measure, in conversation order.
 * @param tokenCounter - Function returning the token count of one message.
 * @returns An object mapping each index (as a string) to that message's
 *   token count.
 */
function buildIndexTokenCountMap(
  messages: BaseMessage[],
  tokenCounter: t.TokenCounter
): Record<string, number> {
  const counts: Record<string, number> = {};
  for (const [position, message] of messages.entries()) {
    counts[String(position)] = tokenCounter(message);
  }
  return counts;
}
|
|
244
|
+
|
|
245
|
+
/**
 * Logs a one-line progress note for a conversation turn.
 *
 * @param label - Short turn label (e.g. `T1`).
 * @param conversationHistory - Current history; only its length is printed.
 * @param extra - Optional detail appended after a comma; omitted when
 *   `undefined`, `null` or empty.
 */
function logTurn(
  label: string,
  conversationHistory: BaseMessage[],
  extra?: string
): void {
  const hasExtra = extra != null && extra !== '';
  const suffix = hasExtra ? `, ${extra}` : '';
  console.log(` ${label} — ${conversationHistory.length} messages${suffix}`);
}
|
|
254
|
+
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
// Anthropic Summarization Tests
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
// Gate for the live Anthropic E2E suite. An empty or whitespace-only key
// (e.g. `ANTHROPIC_API_KEY=` in .env, or an unset CI secret that expands to
// '') counts as "not configured", so the suite is skipped instead of failing
// on authentication.
const hasAnthropic = (process.env.ANTHROPIC_API_KEY ?? '').trim() !== '';
|
|
260
|
+
(hasAnthropic ? describe : describe.skip)('Anthropic Summarization E2E', () => {
|
|
261
|
+
jest.setTimeout(180_000);
|
|
262
|
+
|
|
263
|
+
const agentProvider = Providers.ANTHROPIC;
|
|
264
|
+
const streamConfig = {
|
|
265
|
+
configurable: { thread_id: 'anthropic-sum-e2e' },
|
|
266
|
+
recursionLimit: 80,
|
|
267
|
+
streamMode: 'values',
|
|
268
|
+
version: 'v2' as const,
|
|
269
|
+
};
|
|
270
|
+
|
|
271
|
+
const MATH_TUTOR_INSTRUCTIONS = [
|
|
272
|
+
'You are an expert math tutor. You MUST use the calculator tool for ALL computations —',
|
|
273
|
+
'never compute in your head. Keep explanations concise (2-3 sentences max).',
|
|
274
|
+
'When summarizing prior work, list each calculation and its result.',
|
|
275
|
+
].join(' ');
|
|
276
|
+
|
|
277
|
+
test('heavy multi-turn with tool calls triggers and survives summarization', async () => {
|
|
278
|
+
const spies = createSpies();
|
|
279
|
+
let collectedUsage: UsageMetadata[] = [];
|
|
280
|
+
const conversationHistory: BaseMessage[] = [];
|
|
281
|
+
const tokenCounter = await createTokenCounter();
|
|
282
|
+
|
|
283
|
+
const resetAggregator = (): {
|
|
284
|
+
contentParts: t.MessageContentComplex[];
|
|
285
|
+
aggregateContent: t.ContentAggregator;
|
|
286
|
+
} => {
|
|
287
|
+
collectedUsage = [];
|
|
288
|
+
const { contentParts: cp, aggregateContent: ac } =
|
|
289
|
+
createContentAggregator();
|
|
290
|
+
return {
|
|
291
|
+
contentParts: cp as t.MessageContentComplex[],
|
|
292
|
+
aggregateContent: ac,
|
|
293
|
+
};
|
|
294
|
+
};
|
|
295
|
+
|
|
296
|
+
const createRun = async (
|
|
297
|
+
maxTokens = 4000
|
|
298
|
+
): Promise<{
|
|
299
|
+
run: Run<t.IState>;
|
|
300
|
+
contentParts: t.MessageContentComplex[];
|
|
301
|
+
}> => {
|
|
302
|
+
const { contentParts, aggregateContent } = resetAggregator();
|
|
303
|
+
const indexTokenCountMap = buildIndexTokenCountMap(
|
|
304
|
+
conversationHistory,
|
|
305
|
+
tokenCounter
|
|
306
|
+
);
|
|
307
|
+
const run = await createSummarizationRun({
|
|
308
|
+
agentProvider,
|
|
309
|
+
summarizationProvider: Providers.ANTHROPIC,
|
|
310
|
+
summarizationModel: 'claude-haiku-4-5',
|
|
311
|
+
maxContextTokens: maxTokens,
|
|
312
|
+
instructions: MATH_TUTOR_INSTRUCTIONS,
|
|
313
|
+
collectedUsage,
|
|
314
|
+
aggregateContent,
|
|
315
|
+
spies,
|
|
316
|
+
tokenCounter,
|
|
317
|
+
indexTokenCountMap,
|
|
318
|
+
});
|
|
319
|
+
return { run, contentParts };
|
|
320
|
+
};
|
|
321
|
+
|
|
322
|
+
// Turn 1: greeting + simple calculation
|
|
323
|
+
let { run, contentParts } = await createRun();
|
|
324
|
+
await runTurn(
|
|
325
|
+
{ run, conversationHistory },
|
|
326
|
+
'Hi! Let\'s do some math. What is 12345 * 6789? Use the calculator please.',
|
|
327
|
+
streamConfig
|
|
328
|
+
);
|
|
329
|
+
logTurn('T1', conversationHistory, `parts=${contentParts.length}`);
|
|
330
|
+
|
|
331
|
+
// Turn 2: compound calculation
|
|
332
|
+
({ run, contentParts } = await createRun());
|
|
333
|
+
await runTurn(
|
|
334
|
+
{ run, conversationHistory },
|
|
335
|
+
'Great. Now take that result and divide it by 137. Then multiply the quotient by 42. Show both steps. Use the calculator for each.',
|
|
336
|
+
streamConfig
|
|
337
|
+
);
|
|
338
|
+
logTurn('T2', conversationHistory, `parts=${contentParts.length}`);
|
|
339
|
+
|
|
340
|
+
// Turn 3: verbose question to inflate token count
|
|
341
|
+
({ run, contentParts } = await createRun());
|
|
342
|
+
await runTurn(
|
|
343
|
+
{ run, conversationHistory },
|
|
344
|
+
[
|
|
345
|
+
'I need you to compute the following sequence of operations step by step using the calculator:',
|
|
346
|
+
'1) Start with 9876543',
|
|
347
|
+
'2) Subtract 1234567 from it',
|
|
348
|
+
'3) Take the square root of the result',
|
|
349
|
+
'Please show each intermediate step with the calculator.',
|
|
350
|
+
].join('\n'),
|
|
351
|
+
streamConfig
|
|
352
|
+
);
|
|
353
|
+
logTurn('T3', conversationHistory, `parts=${contentParts.length}`);
|
|
354
|
+
|
|
355
|
+
// Turn 4: even more to guarantee pruning threshold
|
|
356
|
+
({ run, contentParts } = await createRun());
|
|
357
|
+
await runTurn(
|
|
358
|
+
{ run, conversationHistory },
|
|
359
|
+
'Now calculate 2^20 using the calculator. Also, what is 1000000 / 7? Use calculator for both.',
|
|
360
|
+
streamConfig
|
|
361
|
+
);
|
|
362
|
+
logTurn('T4', conversationHistory, `parts=${contentParts.length}`);
|
|
363
|
+
|
|
364
|
+
// Turn 5: tighter context to force summarization if not already
|
|
365
|
+
({ run, contentParts } = await createRun(3500));
|
|
366
|
+
await runTurn(
|
|
367
|
+
{ run, conversationHistory },
|
|
368
|
+
'What is 355 / 113? Use the calculator. This should approximate pi.',
|
|
369
|
+
streamConfig
|
|
370
|
+
);
|
|
371
|
+
logTurn('T5', conversationHistory);
|
|
372
|
+
|
|
373
|
+
// Turn 6: if still no summarization, squeeze harder
|
|
374
|
+
if (spies.onSummarizeStartSpy.mock.calls.length === 0) {
|
|
375
|
+
// Debug: show total token count from the indexTokenCountMap
|
|
376
|
+
const debugMap = buildIndexTokenCountMap(
|
|
377
|
+
conversationHistory,
|
|
378
|
+
tokenCounter
|
|
379
|
+
);
|
|
380
|
+
const totalTokens = Object.values(debugMap).reduce(
|
|
381
|
+
(sum, v) => sum + v,
|
|
382
|
+
0
|
|
383
|
+
);
|
|
384
|
+
console.log(
|
|
385
|
+
` Pre-T6 debug: ${conversationHistory.length} msgs, totalTokens=${totalTokens}, ` +
|
|
386
|
+
`indexTokenCountMap keys=${Object.keys(debugMap).length}`
|
|
387
|
+
);
|
|
388
|
+
|
|
389
|
+
({ run, contentParts } = await createRun(3200));
|
|
390
|
+
await runTurn(
|
|
391
|
+
{ run, conversationHistory },
|
|
392
|
+
'Calculate 999 * 999 with the calculator. Also compute 123456789 % 97.',
|
|
393
|
+
streamConfig
|
|
394
|
+
);
|
|
395
|
+
logTurn('T6', conversationHistory);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
// Turn 7: absolute minimum context if still nothing
|
|
399
|
+
if (spies.onSummarizeStartSpy.mock.calls.length === 0) {
|
|
400
|
+
({ run, contentParts } = await createRun(3100));
|
|
401
|
+
await runTurn({ run, conversationHistory }, 'What is 1+1?', streamConfig);
|
|
402
|
+
logTurn('T7', conversationHistory);
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
console.log(
|
|
406
|
+
` Summarize events — start: ${spies.onSummarizeStartSpy.mock.calls.length}, complete: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
|
|
407
|
+
);
|
|
408
|
+
|
|
409
|
+
// Assert summarization fired correctly
|
|
410
|
+
const { startPayload, completePayload } = assertSummarizationEvents(spies);
|
|
411
|
+
assertSummaryRunStep(spies, getSummaryText(completePayload.summary));
|
|
412
|
+
|
|
413
|
+
console.log(
|
|
414
|
+
` Summary (${getSummaryText(completePayload.summary).length} chars, ${completePayload.summary!.tokenCount} tok): "${getSummaryText(completePayload.summary).substring(0, 250)}…"`
|
|
415
|
+
);
|
|
416
|
+
console.log(
|
|
417
|
+
` Start event — agent=${startPayload.agentId}, provider=${startPayload.provider}, refining=${startPayload.messagesToRefineCount} msgs`
|
|
418
|
+
);
|
|
419
|
+
|
|
420
|
+
// Token accounting: summary tokenCount must be reasonable
|
|
421
|
+
expect(completePayload.summary!.tokenCount).toBeGreaterThan(10);
|
|
422
|
+
expect(completePayload.summary!.tokenCount).toBeLessThan(2000);
|
|
423
|
+
|
|
424
|
+
// Token accounting: collectedUsage should have valid entries from post-summary model calls
|
|
425
|
+
const validUsageEntries = collectedUsage.filter(
|
|
426
|
+
(u: Partial<UsageMetadata>) =>
|
|
427
|
+
u.input_tokens != null && u.input_tokens > 0
|
|
428
|
+
);
|
|
429
|
+
expect(validUsageEntries.length).toBeGreaterThan(0);
|
|
430
|
+
const lastUsage = validUsageEntries[validUsageEntries.length - 1];
|
|
431
|
+
expect(lastUsage.output_tokens).toBeGreaterThan(0);
|
|
432
|
+
console.log(
|
|
433
|
+
` Post-summary usage — input: ${lastUsage.input_tokens}, output: ${lastUsage.output_tokens}`
|
|
434
|
+
);
|
|
435
|
+
|
|
436
|
+
// Assert model still works after summarization
|
|
437
|
+
expect(spies.onMessageDeltaSpy).toHaveBeenCalled();
|
|
438
|
+
|
|
439
|
+
// Summarization may fire multiple times per run (no single-fire guard);
|
|
440
|
+
// the graph's recursionLimit prevents infinite loops.
|
|
441
|
+
const startCallsForSameAgent = spies.onSummarizeStartSpy.mock.calls.filter(
|
|
442
|
+
(c) => (c[0] as t.SummarizeStartEvent).agentId === startPayload.agentId
|
|
443
|
+
);
|
|
444
|
+
expect(startCallsForSameAgent.length).toBeGreaterThanOrEqual(1);
|
|
445
|
+
});
|
|
446
|
+
|
|
447
|
+
test('post-summary continuation over multiple turns preserves context', async () => {
|
|
448
|
+
const spies = createSpies();
|
|
449
|
+
let collectedUsage: UsageMetadata[] = [];
|
|
450
|
+
const conversationHistory: BaseMessage[] = [];
|
|
451
|
+
let latestContentParts: t.MessageContentComplex[] = [];
|
|
452
|
+
const tokenCounter = await createTokenCounter();
|
|
453
|
+
|
|
454
|
+
const createRun = async (maxTokens = 4000): Promise<Run<t.IState>> => {
|
|
455
|
+
collectedUsage = [];
|
|
456
|
+
const { contentParts, aggregateContent } = createContentAggregator();
|
|
457
|
+
latestContentParts = contentParts as t.MessageContentComplex[];
|
|
458
|
+
const indexTokenCountMap = buildIndexTokenCountMap(
|
|
459
|
+
conversationHistory,
|
|
460
|
+
tokenCounter
|
|
461
|
+
);
|
|
462
|
+
return createSummarizationRun({
|
|
463
|
+
agentProvider,
|
|
464
|
+
summarizationProvider: Providers.ANTHROPIC,
|
|
465
|
+
summarizationModel: 'claude-haiku-4-5',
|
|
466
|
+
maxContextTokens: maxTokens,
|
|
467
|
+
instructions: MATH_TUTOR_INSTRUCTIONS,
|
|
468
|
+
collectedUsage,
|
|
469
|
+
aggregateContent,
|
|
470
|
+
spies,
|
|
471
|
+
tokenCounter,
|
|
472
|
+
indexTokenCountMap,
|
|
473
|
+
});
|
|
474
|
+
};
|
|
475
|
+
|
|
476
|
+
// Build up conversation — generous budget so messages accumulate
|
|
477
|
+
let run = await createRun();
|
|
478
|
+
await runTurn(
|
|
479
|
+
{ run, conversationHistory },
|
|
480
|
+
'What is 42 * 58? Calculator please.',
|
|
481
|
+
streamConfig
|
|
482
|
+
);
|
|
483
|
+
|
|
484
|
+
run = await createRun();
|
|
485
|
+
await runTurn(
|
|
486
|
+
{ run, conversationHistory },
|
|
487
|
+
'Now compute 2436 + 1337. Calculator.',
|
|
488
|
+
streamConfig
|
|
489
|
+
);
|
|
490
|
+
|
|
491
|
+
run = await createRun();
|
|
492
|
+
await runTurn(
|
|
493
|
+
{ run, conversationHistory },
|
|
494
|
+
'What is 3773 * 11? Calculator.',
|
|
495
|
+
streamConfig
|
|
496
|
+
);
|
|
497
|
+
|
|
498
|
+
run = await createRun();
|
|
499
|
+
await runTurn(
|
|
500
|
+
{ run, conversationHistory },
|
|
501
|
+
'Calculate 41503 - 12345 and then 29158 / 4. Show both with calculator.',
|
|
502
|
+
streamConfig
|
|
503
|
+
);
|
|
504
|
+
|
|
505
|
+
run = await createRun();
|
|
506
|
+
await runTurn(
|
|
507
|
+
{ run, conversationHistory },
|
|
508
|
+
'What is 100 * 200? Calculator.',
|
|
509
|
+
streamConfig
|
|
510
|
+
);
|
|
511
|
+
|
|
512
|
+
// Progressively squeeze to force summarization
|
|
513
|
+
for (const squeeze of [3500, 3200, 3100, 3000, 2800, 2500, 2000]) {
|
|
514
|
+
if (spies.onSummarizeStartSpy.mock.calls.length > 0) {
|
|
515
|
+
break;
|
|
516
|
+
}
|
|
517
|
+
run = await createRun(squeeze);
|
|
518
|
+
await runTurn(
|
|
519
|
+
{ run, conversationHistory },
|
|
520
|
+
`What is ${squeeze} * 2? Calculator.`,
|
|
521
|
+
streamConfig
|
|
522
|
+
);
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
console.log(
|
|
526
|
+
` Pre-continuation: ${spies.onSummarizeCompleteSpy.mock.calls.length} summaries`
|
|
527
|
+
);
|
|
528
|
+
expect(spies.onSummarizeCompleteSpy).toHaveBeenCalled();
|
|
529
|
+
const completeSummary = (
|
|
530
|
+
spies.onSummarizeCompleteSpy.mock.calls[0][0] as t.SummarizeCompleteEvent
|
|
531
|
+
).summary!;
|
|
532
|
+
const summaryText = getSummaryText(completeSummary);
|
|
533
|
+
|
|
534
|
+
// Token accounting: summary tokenCount bounds
|
|
535
|
+
expect(completeSummary.tokenCount ?? 0).toBeGreaterThan(10);
|
|
536
|
+
expect(completeSummary.tokenCount ?? 0).toBeLessThan(1200);
|
|
537
|
+
|
|
538
|
+
// Continue for 2 more turns AFTER summarization — model should remain coherent
|
|
539
|
+
run = await createRun(4000);
|
|
540
|
+
const postSumTurn1 = await runTurn(
|
|
541
|
+
{ run, conversationHistory },
|
|
542
|
+
'What were all the numbers we computed so far? List them.',
|
|
543
|
+
streamConfig
|
|
544
|
+
);
|
|
545
|
+
expect(postSumTurn1).toBeDefined();
|
|
546
|
+
logTurn('Post-sum T1', conversationHistory);
|
|
547
|
+
|
|
548
|
+
run = await createRun(4000);
|
|
549
|
+
const postSumTurn2 = await runTurn(
|
|
550
|
+
{ run, conversationHistory },
|
|
551
|
+
'Now compute the sum of 2436, 3773, and 41503 using the calculator.',
|
|
552
|
+
streamConfig
|
|
553
|
+
);
|
|
554
|
+
expect(postSumTurn2).toBeDefined();
|
|
555
|
+
logTurn('Post-sum T2', conversationHistory);
|
|
556
|
+
|
|
557
|
+
const hasPostSumCalculator = latestContentParts.some(
|
|
558
|
+
(p) =>
|
|
559
|
+
p.type === ContentTypes.TOOL_CALL &&
|
|
560
|
+
(p as t.ToolCallContent).tool_call?.name === 'calculator'
|
|
561
|
+
);
|
|
562
|
+
expect(hasPostSumCalculator).toBe(true);
|
|
563
|
+
|
|
564
|
+
// Model should still reference prior context from the summary
|
|
565
|
+
expect(spies.onMessageDeltaSpy).toHaveBeenCalled();
|
|
566
|
+
console.log(` Summary text: "${summaryText.substring(0, 200)}…"`);
|
|
567
|
+
console.log(` Final message count: ${conversationHistory.length}`);
|
|
568
|
+
}, 180_000);
|
|
569
|
+
|
|
570
|
+
/**
 * Cross-provider E2E: an Anthropic agent whose context is summarized by an
 * OpenAI model. Builds up a tool-calling conversation at a generous budget,
 * then tightens the budget step by step until summarization starts, and
 * finally checks the summary's provider/model, its token count bounds and the
 * usage collected for the last run.
 */
test('cross-provider summarization: Anthropic agent with OpenAI summarizer', async () => {
  // The summarizer needs a usable OpenAI key. An empty string (for example an
  // unset CI secret exported as '') cannot authenticate, so it is treated the
  // same as a missing variable instead of letting the run fail mid-test.
  const hasOpenAI = Boolean(process.env.OPENAI_API_KEY);
  if (!hasOpenAI) {
    console.log(' Skipping cross-provider test (no OPENAI_API_KEY)');
    return;
  }

  const spies = createSpies();
  let collectedUsage: UsageMetadata[] = [];
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  /**
   * Creates a fresh run for the next turn. `collectedUsage` is replaced with a
   * new array on every call, so it only ever reflects the latest run.
   */
  const createRun = async (maxTokens = 4000): Promise<Run<t.IState>> => {
    collectedUsage = [];
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return createSummarizationRun({
      agentProvider: Providers.ANTHROPIC,
      summarizationProvider: Providers.OPENAI,
      summarizationModel: 'gpt-4.1-mini',
      maxContextTokens: maxTokens,
      instructions: MATH_TUTOR_INSTRUCTIONS,
      collectedUsage,
      aggregateContent,
      spies,
      tokenCounter,
      indexTokenCountMap,
    });
  };

  /** Sends one user prompt on a freshly created run with the given budget. */
  const ask = async (prompt: string, maxTokens = 4000): Promise<void> => {
    const run = await createRun(maxTokens);
    await runTurn({ run, conversationHistory }, prompt, streamConfig);
  };

  // Build up conversation at generous limits so messages accumulate
  const buildUpPrompts = [
    'Compute 54321 * 12345 using calculator.',
    'Now calculate 670592745 / 99991. Calculator.',
    'What is sqrt(670592745)? Calculator.',
    'Compute 2^32 with calculator.',
    'What is 13 * 17 * 19? Calculator.',
  ];
  for (const prompt of buildUpPrompts) {
    await ask(prompt, 4000);
  }

  // Tighten context to force summarization — must remain high enough
  // for post-summary instruction overhead + tool schema tokens + messages
  await ask(
    'What is 99 * 101? Calculator. Then list everything we calculated so far in detail.',
    3500
  );

  // Keep squeezing, one turn per budget, but stop as soon as summarization
  // has started so no unnecessary model calls are made.
  const fallbackSqueezes: Array<[number, string]> = [
    [3400, 'Compute 7! (factorial of 7) with calculator.'],
    [3300, 'What is 256 * 256? Calculator.'],
    [3200, 'Compute 100 + 200 with calculator.'],
    [3100, 'What is 50 * 50? Calculator.'],
  ];
  for (const [budget, prompt] of fallbackSqueezes) {
    if (spies.onSummarizeStartSpy.mock.calls.length > 0) {
      break;
    }
    await ask(prompt, budget);
  }

  console.log(
    ` Cross-provider summaries: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
  );

  assertSummarizationEvents(spies);
  const completePayload = spies.onSummarizeCompleteSpy.mock
    .calls[0][0] as t.SummarizeCompleteEvent;

  // The summary should have been generated by OpenAI even though agent is Anthropic
  expect(completePayload.summary!.provider).toBe(Providers.OPENAI);
  expect(completePayload.summary!.model).toBe('gpt-4.1-mini');

  // Extract the text once; it is reused by the run-step assertion and the log.
  const summaryText = getSummaryText(completePayload.summary);
  assertSummaryRunStep(spies, summaryText);

  // Token accounting: summary tokenCount bounds
  expect(completePayload.summary!.tokenCount ?? 0).toBeGreaterThan(10);
  expect(completePayload.summary!.tokenCount ?? 0).toBeLessThan(1200);

  // Token accounting: usage collected for the most recent run must contain at
  // least one entry that reports input tokens.
  const validUsage = collectedUsage.filter(
    (u: Partial<UsageMetadata>) =>
      u.input_tokens != null && u.input_tokens > 0
  );
  expect(validUsage.length).toBeGreaterThan(0);

  console.log(
    ` Cross-provider summary (${summaryText.length} chars): "${summaryText.substring(0, 200)}…"`
  );
});
|
|
712
|
+
|
|
713
|
+
/**
 * Extended-thinking E2E: runs several calculator turns with thinking enabled
 * on the agent model, squeezes the context budget until summarization starts,
 * then checks the summary events, the summary token bounds and the usage
 * collected for the latest run, and finally confirms a post-summary turn works.
 */
test('extended thinking: multi-turn with reasoning triggers summarization and grounds token accounting', async () => {
  const spies = createSpies();
  // Reassigned to a fresh array by every createRun call, so any filter below
  // only sees what was collected into the most recently created run's array.
  let collectedUsage: UsageMetadata[] = [];
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  type ThinkingRun = {
    run: Run<t.IState>;
    contentParts: t.MessageContentComplex[];
  };

  /** Creates a thinking-enabled run plus the content parts it aggregates into. */
  const createRun = async (maxTokens = 3000): Promise<ThinkingRun> => {
    collectedUsage = [];
    const aggregator = createContentAggregator();
    const run = await createSummarizationRun({
      agentProvider,
      summarizationProvider: Providers.ANTHROPIC,
      summarizationModel: 'claude-haiku-4-5',
      maxContextTokens: maxTokens,
      instructions:
        'You are a math tutor. Use the calculator tool for computations. Keep answers brief.',
      collectedUsage,
      aggregateContent: aggregator.aggregateContent,
      spies,
      tokenCounter,
      indexTokenCountMap: buildIndexTokenCountMap(
        conversationHistory,
        tokenCounter
      ),
      llmConfigOverride: {
        model: 'claude-sonnet-4-5',
        thinking: {
          type: 'enabled',
          budget_tokens: 1024,
        },
      },
    });
    return {
      run,
      contentParts: aggregator.contentParts as t.MessageContentComplex[],
    };
  };

  /** Plays one user turn on a new run; omitting maxTokens uses createRun's default. */
  const playTurn = async (prompt: string, maxTokens?: number) => {
    const { run, contentParts } = await createRun(maxTokens);
    const result = await runTurn(
      { run, conversationHistory },
      prompt,
      streamConfig
    );
    return { result, contentParts };
  };

  /** True when a usage entry reports a positive input token count. */
  const reportsInput = (u: Partial<UsageMetadata>): boolean =>
    (u.input_tokens ?? 0) > 0;

  // Turn 1: simple calculation with thinking
  const firstTurn = await playTurn('What is 7 * 720? Use the calculator.');
  logTurn(
    'T1-think',
    conversationHistory,
    `parts=${firstTurn.contentParts.length}`
  );

  // Turn 1 usage: at least one entry with input tokens, and the last such
  // entry must also report output tokens.
  const t1Usage = collectedUsage.filter(reportsInput);
  expect(t1Usage.length).toBeGreaterThan(0);
  const t1Last = t1Usage[t1Usage.length - 1];
  expect(t1Last.output_tokens).toBeGreaterThan(0);
  const t1CacheRead = t1Last.input_token_details?.cache_read;
  const t1CacheNote = t1CacheRead != null ? `, cache_read: ${t1CacheRead}` : '';
  console.log(
    ` T1 usage — input: ${t1Last.input_tokens}, output: ${t1Last.output_tokens}${t1CacheNote}`
  );

  // Turns 2 and 3: follow-up calculations that build context
  const followUps: Array<[string, string]> = [
    ['T2-think', 'Now multiply that result by 3. Use the calculator.'],
    ['T3-think', 'What is 143 + 857? Use the calculator.'],
  ];
  for (const [label, prompt] of followUps) {
    const { contentParts } = await playTurn(prompt);
    logTurn(label, conversationHistory, `parts=${contentParts.length}`);
  }

  // Turn 4: another turn to build up context
  await playTurn('What is 2 * 512? Use the calculator.');
  logTurn('T4-think', conversationHistory);

  // Turns 5 and 6: progressively tighter budgets, only while summarization
  // has not started yet.
  const squeezes: Array<[string, number, string]> = [
    ['T5-think', 2500, 'What is 999 * 999? Use the calculator.'],
    ['T6-think', 2000, 'What is 42 * 42? Use the calculator.'],
  ];
  for (const [label, budget, prompt] of squeezes) {
    if (spies.onSummarizeStartSpy.mock.calls.length > 0) {
      break;
    }
    await playTurn(prompt, budget);
    logTurn(label, conversationHistory);
  }

  console.log(
    ` Thinking summarize events — start: ${spies.onSummarizeStartSpy.mock.calls.length}, complete: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
  );

  // Assert summarization fired
  const { completePayload } = assertSummarizationEvents(spies);
  const summaryText = getSummaryText(completePayload.summary);
  assertSummaryRunStep(spies, summaryText);

  // Token accounting: summary tokenCount bounds
  const summaryTokens = completePayload.summary!.tokenCount ?? 0;
  expect(summaryTokens).toBeGreaterThan(10);
  expect(summaryTokens).toBeLessThan(2000);

  // Token accounting: the current usage array must hold at least one entry
  // with both input and output tokens.
  const allValidUsage = collectedUsage.filter(
    (u: Partial<UsageMetadata>) =>
      reportsInput(u) && (u.output_tokens ?? 0) > 0
  );
  expect(allValidUsage.length).toBeGreaterThan(0);

  // Validate that usage has reasonable token counts (thinking adds tokens)
  const lastUsage = allValidUsage[allValidUsage.length - 1];
  expect(lastUsage.input_tokens).toBeGreaterThan(0);
  expect(lastUsage.output_tokens).toBeGreaterThan(0);

  console.log(
    ` Thinking usage samples: ${allValidUsage.length} valid entries`
  );
  console.log(
    ` Last usage — input: ${lastUsage.input_tokens}, output: ${lastUsage.output_tokens}`
  );
  const lastDetails = lastUsage.input_token_details;
  if (lastDetails?.cache_read != null) {
    console.log(
      ` Cache read: ${lastDetails.cache_read}, cache creation: ${lastDetails.cache_creation ?? 0}`
    );
  }

  // Post-summary continuation should work with thinking enabled
  const { result: postSumResult } = await playTurn(
    'What is 100 / 4? Calculator please.',
    4000
  );
  expect(postSumResult).toBeDefined();
  logTurn('Post-sum-think', conversationHistory);

  // Post-summary usage must also be valid
  const postSumUsage = collectedUsage.filter(reportsInput);
  expect(postSumUsage.length).toBeGreaterThan(0);

  console.log(
    ` Thinking summary (${summaryText.length} chars): "${summaryText.substring(0, 250)}…"`
  );
  console.log(` Final messages: ${conversationHistory.length}`);
}, 180_000);
|
|
900
|
+
|
|
901
|
+
/**
 * Compares the local token counter against Anthropic's count_tokens endpoint
 * for a short fixed transcript and requires the two totals to agree within 30%.
 */
test('count_tokens API: local tokenCounter vs Anthropic actual token count', async () => {
  const Anthropic = (await import('@anthropic-ai/sdk')).default;
  const client = new Anthropic();
  const tokenCounter = await createTokenCounter();

  // Single source of truth for the transcript. Each entry is sent verbatim to
  // Anthropic and wrapped in the matching LangChain message for the local
  // counter, so the two sides cannot drift apart through a one-sided edit.
  const transcript: Array<{ role: 'user' | 'assistant'; content: string }> = [
    {
      role: 'user',
      content:
        'What is 12345 * 6789? Please compute this using the calculator tool and explain the result.',
    },
    {
      role: 'assistant',
      content:
        'The result of 12345 multiplied by 6789 is 83,810,205. This is computed by multiplying each digit and carrying over.',
    },
    {
      role: 'user',
      content: 'Now divide that by 137 and tell me the quotient.',
    },
    {
      role: 'assistant',
      content: '83,810,205 divided by 137 equals approximately 611,752.59.',
    },
  ];
  const testMessages = transcript.map(({ role, content }) => ({
    role,
    content,
    lcMessage:
      role === 'user' ? new HumanMessage(content) : new AIMessage(content),
  }));

  const systemPrompt =
    'You are an expert math tutor. Use the calculator tool for ALL computations.';

  const anthropicCount = await client.messages.countTokens({
    model: 'claude-haiku-4-5',
    system: systemPrompt,
    messages: testMessages.map((m) => ({ role: m.role, content: m.content })),
  });

  // Local estimate: system prompt plus every message, counted one by one.
  const localTotal = testMessages.reduce(
    (sum, m) => sum + tokenCounter(m.lcMessage),
    tokenCounter(new SystemMessage(systemPrompt))
  );
  const anthropicTokens = anthropicCount.input_tokens;

  console.log(` Anthropic count_tokens API: ${anthropicTokens} tokens`);
  console.log(` Local tiktoken estimate: ${localTotal} tokens`);

  // Check both totals are positive BEFORE dividing: a zero API count would
  // otherwise surface as an Infinity/NaN drift instead of a clear failure.
  expect(anthropicTokens).toBeGreaterThan(0);
  expect(localTotal).toBeGreaterThan(0);

  const drift = Math.abs(anthropicTokens - localTotal);
  const driftPct = (drift / anthropicTokens) * 100;
  console.log(` Drift: ${drift} tokens (${driftPct.toFixed(1)}%)`);

  expect(driftPct).toBeLessThan(30);
});
|
|
969
|
+
});
|
|
970
|
+
|
|
971
|
+
// ---------------------------------------------------------------------------
|
|
972
|
+
// Bedrock Summarization Tests
|
|
973
|
+
// ---------------------------------------------------------------------------
|
|
974
|
+
|
|
975
|
+
/** Environment variables that must all be set for the Bedrock suite to run. */
const requiredBedrockEnv = [
  'BEDROCK_AWS_REGION',
  'BEDROCK_AWS_ACCESS_KEY_ID',
  'BEDROCK_AWS_SECRET_ACCESS_KEY',
];
// Environment values are strings, so a variable exported as '' (for example an
// unset CI secret) passes a `!= null` check even though it is unusable as a
// region or credential. Only non-empty values count as configured.
const hasBedrock = requiredBedrockEnv.every((k) => Boolean(process.env[k]));
|
|
981
|
+
|
|
982
|
+
/**
 * Bedrock E2E suite; registered with `describe.skip` when `hasBedrock` is false.
 */
(hasBedrock ? describe : describe.skip)('Bedrock Summarization E2E', () => {
  jest.setTimeout(180_000);

  const agentProvider = Providers.BEDROCK;
  const streamConfig = {
    configurable: { thread_id: 'bedrock-sum-e2e' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  test('multi-turn tool calls trigger summarization with Bedrock agent', async () => {
    const spies = createSpies();
    // Replaced with a fresh array by every createRun call.
    let collectedUsage: UsageMetadata[] = [];
    const conversationHistory: BaseMessage[] = [];
    const tokenCounter = await createTokenCounter();

    /** Creates a Bedrock-agent run that also summarizes through Bedrock. */
    const createRun = async (maxTokens = 4000): Promise<Run<t.IState>> => {
      collectedUsage = [];
      const aggregator = createContentAggregator();
      return createSummarizationRun({
        agentProvider,
        summarizationProvider: Providers.BEDROCK,
        maxContextTokens: maxTokens,
        instructions:
          'You are a precise math assistant. Use the calculator tool for every computation. Be brief.',
        collectedUsage,
        aggregateContent: aggregator.aggregateContent,
        spies,
        tokenCounter,
        indexTokenCountMap: buildIndexTokenCountMap(
          conversationHistory,
          tokenCounter
        ),
      });
    };

    /** Plays one user turn on a new run and returns whatever runTurn yields. */
    const playTurn = async (prompt: string, maxTokens?: number) => {
      const run = await createRun(maxTokens);
      return runTurn({ run, conversationHistory }, prompt, streamConfig);
    };

    // Scripted turns: three at the default budget, then two tighter ones.
    const script: Array<{ label: string; prompt: string; maxTokens?: number }> =
      [
        {
          label: 'T1',
          prompt: 'Hello. Please compute 987 * 654 using the calculator.',
        },
        { label: 'T2', prompt: 'Now divide 645498 by 123. Use calculator.' },
        {
          label: 'T3',
          prompt:
            'Compute sqrt(5248.764) with the calculator. Then multiply the result by 100.',
        },
        {
          label: 'T4',
          prompt: 'Calculate 2^16 and 3^10 using calculator for each.',
          maxTokens: 3500,
        },
        {
          label: 'T5',
          prompt:
            'What is 59049 + 65536? Calculator. Also tell me what we calculated before.',
          maxTokens: 3200,
        },
      ];
    for (const { label, prompt, maxTokens } of script) {
      await playTurn(prompt, maxTokens);
      logTurn(label, conversationHistory);
    }

    // One extra, tighter turn only if summarization has not started yet.
    const summarizationStarted =
      spies.onSummarizeStartSpy.mock.calls.length > 0;
    if (!summarizationStarted) {
      await playTurn('Calculate 111111 * 111111 with calculator.', 3000);
      logTurn('T6', conversationHistory);
    }

    console.log(
      ` Bedrock summarize events — start: ${spies.onSummarizeStartSpy.mock.calls.length}, complete: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
    );

    const { completePayload } = assertSummarizationEvents(spies);
    const summaryText = getSummaryText(completePayload.summary);
    assertSummaryRunStep(spies, summaryText);
    expect(spies.onMessageDeltaSpy).toHaveBeenCalled();

    // Token accounting: summary tokenCount bounds
    const summaryTokens = completePayload.summary!.tokenCount ?? 0;
    expect(summaryTokens).toBeGreaterThan(10);
    expect(summaryTokens).toBeLessThan(1500);

    // Token accounting: the latest run's usage array needs an entry with input
    // tokens, and the last such entry must report output tokens too.
    const validUsage = collectedUsage.filter(
      (u: Partial<UsageMetadata>) => (u.input_tokens ?? 0) > 0
    );
    expect(validUsage.length).toBeGreaterThan(0);
    const lastUsage = validUsage[validUsage.length - 1];
    expect(lastUsage.output_tokens).toBeGreaterThan(0);
    console.log(
      ` Bedrock post-summary usage — input: ${lastUsage.input_tokens}, output: ${lastUsage.output_tokens}`
    );

    console.log(` Bedrock summary: "${summaryText.substring(0, 250)}…"`);

    // Post-summary turn should work cleanly
    const postSumResult = await playTurn(
      'Give me a brief list of all results we computed.',
      4000
    );
    expect(postSumResult).toBeDefined();
    logTurn('Post-sum', conversationHistory);
  });
});
|
|
1108
|
+
|
|
1109
|
+
// ---------------------------------------------------------------------------
|
|
1110
|
+
// OpenAI Summarization Tests
|
|
1111
|
+
// ---------------------------------------------------------------------------
|
|
1112
|
+
|
|
1113
|
+
// Whether the OpenAI suite can run. Environment values are strings, so a key
// exported as '' (for example an unset CI secret) passes a `!= null` check even
// though it cannot authenticate; only a non-empty key counts as configured.
const hasOpenAI = Boolean(process.env.OPENAI_API_KEY);
|
|
1114
|
+
/**
 * OpenAI E2E suite; registered with `describe.skip` when `hasOpenAI` is false.
 */
(hasOpenAI ? describe : describe.skip)('OpenAI Summarization E2E', () => {
  jest.setTimeout(120_000);

  const agentProvider = Providers.OPENAI;
  const streamConfig = {
    configurable: { thread_id: 'openai-sum-e2e' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  test('multi-turn with calculator triggers summarization and continues', async () => {
    const spies = createSpies();
    // Both of these are replaced by every createRun call, so they always refer
    // to the most recently created run.
    let collectedUsage: UsageMetadata[] = [];
    const conversationHistory: BaseMessage[] = [];
    let latestContentParts: t.MessageContentComplex[] = [];
    const tokenCounter = await createTokenCounter();

    /** Creates an OpenAI-agent run that summarizes with gpt-4.1-mini. */
    const createRun = async (maxTokens = 2000): Promise<Run<t.IState>> => {
      collectedUsage = [];
      const aggregator = createContentAggregator();
      latestContentParts =
        aggregator.contentParts as t.MessageContentComplex[];
      return createSummarizationRun({
        agentProvider,
        summarizationProvider: Providers.OPENAI,
        summarizationModel: 'gpt-4.1-mini',
        maxContextTokens: maxTokens,
        instructions:
          'You are a helpful math tutor. Use the calculator tool for ALL computations. Keep responses concise.',
        collectedUsage,
        aggregateContent: aggregator.aggregateContent,
        spies,
        tokenCounter,
        indexTokenCountMap: buildIndexTokenCountMap(
          conversationHistory,
          tokenCounter
        ),
      });
    };

    /** Plays one user turn on a new run; omitting maxTokens uses the default. */
    const playTurn = async (
      prompt: string,
      maxTokens?: number
    ): Promise<void> => {
      const run = await createRun(maxTokens);
      await runTurn({ run, conversationHistory }, prompt, streamConfig);
    };

    /** True when a usage entry reports a positive input token count. */
    const reportsInput = (u: Partial<UsageMetadata>): boolean =>
      (u.input_tokens ?? 0) > 0;

    // Five turns at the default budget to accumulate history.
    const warmUps: Array<[string, string]> = [
      ['T1', 'What is 1234 * 5678? Use the calculator.'],
      ['T2', 'Now calculate sqrt(7006652). Use the calculator.'],
      ['T3', 'Compute 99 * 101, then 2^15, using calculator for each.'],
      ['T4', 'What is 314159 * 271828? Calculator please.'],
      ['T5', 'Compute 2^20 with calculator.'],
    ];
    for (const [label, prompt] of warmUps) {
      await playTurn(prompt);
      logTurn(label, conversationHistory);
    }

    // Squeeze hard — OpenAI tool-schema overhead is lower than Anthropic,
    // so we need tighter budgets to force pruning + summarization.
    await playTurn(
      'Calculate 999999 / 7 with calculator. Remind me of prior results too.',
      800
    );
    logTurn('T6', conversationHistory);

    // Even tighter fallback turns, played only until summarization starts.
    const fallbacks: Array<[string, number, string]> = [
      ['T7', 600, 'What is 50 + 50? Calculator.'],
      ['T8', 400, 'What is 1+1? Calculator.'],
    ];
    for (const [label, budget, prompt] of fallbacks) {
      if (spies.onSummarizeStartSpy.mock.calls.length > 0) {
        break;
      }
      await playTurn(prompt, budget);
      logTurn(label, conversationHistory);
    }

    console.log(
      ` OpenAI summarize events — start: ${spies.onSummarizeStartSpy.mock.calls.length}, complete: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
    );

    const { completePayload } = assertSummarizationEvents(spies);
    const summaryText = getSummaryText(completePayload.summary);
    assertSummaryRunStep(spies, summaryText);

    // Token accounting: summary tokenCount bounds
    const summaryTokens = completePayload.summary!.tokenCount ?? 0;
    expect(summaryTokens).toBeGreaterThan(10);
    expect(summaryTokens).toBeLessThan(1200);

    // Token accounting: usage collected for the last run before the
    // continuation turn below.
    const validUsagePrePostSum = collectedUsage.filter(reportsInput);
    expect(validUsagePrePostSum.length).toBeGreaterThan(0);

    // Verify tool calls still work after summarization
    await playTurn('One more: 123 + 456 + 789. Calculator.', 2000);
    const hasPostSumCalc = latestContentParts.some((part) => {
      if (part.type !== ContentTypes.TOOL_CALL) {
        return false;
      }
      return (part as t.ToolCallContent).tool_call?.name === 'calculator';
    });
    expect(hasPostSumCalc).toBe(true);

    // Token accounting: post-summary usage must have valid tokens
    const postSumUsage = collectedUsage.filter(reportsInput);
    expect(postSumUsage.length).toBeGreaterThan(0);
    const lastUsage = postSumUsage[postSumUsage.length - 1];
    expect(lastUsage.output_tokens).toBeGreaterThan(0);
    console.log(
      ` OpenAI post-summary usage — input: ${lastUsage.input_tokens}, output: ${lastUsage.output_tokens}`
    );

    expect(spies.onMessageDeltaSpy).toHaveBeenCalled();
    console.log(` OpenAI summary: "${summaryText.substring(0, 200)}…"`);
    console.log(` Final messages: ${conversationHistory.length}`);
  });
});
|
|
1275
|
+
|
|
1276
|
+
// ---------------------------------------------------------------------------
|
|
1277
|
+
// Cross-run lifecycle integration test (no API keys required)
|
|
1278
|
+
// ---------------------------------------------------------------------------
|
|
1279
|
+
|
|
1280
|
+
/**
 * Cross-run summary lifecycle suite.
 *
 * Runs entirely against fake chat models: `providers.getChatModelClass` is
 * spied on so that the OpenAI provider resolves to a `FakeListChatModel`
 * that always answers with `KNOWN_SUMMARY`, while every other provider is
 * delegated to the real implementation.
 */
describe('Cross-run summary lifecycle (no API keys)', () => {
  jest.setTimeout(60_000);

  const KNOWN_SUMMARY =
    'User asked about math: 2+2=4 and 3*5=15. Key context preserved.';
  const INSTRUCTIONS = 'You are a helpful math tutor. Be concise.';
  // NOTE(review): fixed offset the reported summary tokenCount is expected to
  // carry on top of the raw token count of KNOWN_SUMMARY — presumably wrapper
  // text added around the summary; confirm against the summarizer.
  const SUMMARY_TOKEN_OFFSET = 33;
  const streamConfig = {
    configurable: { thread_id: 'cross-run-lifecycle' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  type LifecycleSpies = ReturnType<typeof createSpies>;
  type LifecycleTokenCounter = Awaited<ReturnType<typeof createTokenCounter>>;

  /** Inputs shared by every run created in this suite. */
  interface LifecycleRunOptions {
    /** Prefix of the generated runId; a timestamp is appended. */
    runIdPrefix: string;
    /** Value forwarded as `graphConfig.maxContextTokens`. */
    maxTokens: number;
    /** Messages accumulated so far; used to build the index token-count map. */
    conversationHistory: BaseMessage[];
    tokenCounter: LifecycleTokenCounter;
    /** Spies that receive run-step and summarize start/complete events. */
    spies: LifecycleSpies;
  }

  let getChatModelClassSpy: jest.SpyInstance;
  const originalGetChatModelClass = providers.getChatModelClass;

  /**
   * Creates a summarization-enabled standard-graph run whose token-count map
   * is derived from the current `conversationHistory`.
   *
   * Both tests in this suite need the same configuration; only the context
   * budget and the runId prefix vary.
   */
  const createLifecycleRun = async ({
    runIdPrefix,
    maxTokens,
    conversationHistory,
    tokenCounter,
    spies,
  }: LifecycleRunOptions): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `${runIdPrefix}-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: maxTokens,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: {
        [GraphEvents.ON_RUN_STEP]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onRunStepSpy(_event, data);
            aggregateContent({
              event: GraphEvents.ON_RUN_STEP,
              data: data as t.RunStep,
            });
          },
        },
        [GraphEvents.ON_SUMMARIZE_START]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeStartSpy(data);
          },
        },
        [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeCompleteSpy(data);
          },
        },
      },
      tokenCounter,
      indexTokenCountMap,
    });
  };

  beforeEach(() => {
    getChatModelClassSpy = jest
      .spyOn(providers, 'getChatModelClass')
      .mockImplementation(((provider: Providers) => {
        if (provider === Providers.OPENAI) {
          // Summarizer stub: ignores the real options and always replies
          // with KNOWN_SUMMARY.
          return class extends FakeListChatModel {
            constructor(_options: any) {
              super({ responses: [KNOWN_SUMMARY] });
            }
          } as any;
        }
        return originalGetChatModelClass(provider);
      }) as typeof providers.getChatModelClass);
  });

  afterEach(() => {
    getChatModelClassSpy.mockRestore();
  });

  test('full lifecycle: summarize → formatAgentMessages → new Run with correct indexTokenCountMap', async () => {
    const spies = createSpies();
    const conversationHistory: BaseMessage[] = [];
    const tokenCounter = await createTokenCounter();

    const createRun = (maxTokens: number): Promise<Run<t.IState>> =>
      createLifecycleRun({
        runIdPrefix: 'cross-run',
        maxTokens,
        conversationHistory,
        tokenCounter,
        spies,
      });

    // --- Turn 1: longer exchange to build up token budget ---
    let run = await createRun(4000);
    run.Graph?.overrideTestModel(
      [
        'The answer to 2+2 is 4. This is a basic arithmetic operation involving the addition of two integers. Addition is one of the four fundamental operations in mathematics alongside subtraction, multiplication, and division.',
      ],
      1
    );
    await runTurn(
      { run, conversationHistory },
      'Hello! I have several math questions for you today. Let us start with the basics. What is 2+2? Please provide a detailed explanation of the arithmetic.',
      streamConfig
    );
    logTurn('T1', conversationHistory);
    expect(conversationHistory.length).toBeGreaterThanOrEqual(2);

    // --- Turn 2: build up more conversation ---
    run = await createRun(4000);
    run.Graph?.overrideTestModel(
      [
        'The result of 3 multiplied by 5 is 15. Multiplication can be thought of as repeated addition: 3+3+3+3+3 equals 15. This is another fundamental arithmetic operation that forms the basis of more advanced mathematical concepts.',
      ],
      1
    );
    await runTurn(
      { run, conversationHistory },
      'Great explanation! Now let us move on to multiplication. Can you compute 3 times 5 and explain the concept of multiplication as repeated addition in detail?',
      streamConfig
    );
    logTurn('T2', conversationHistory);
    expect(conversationHistory.length).toBeGreaterThanOrEqual(4);

    // --- Turn 3: tight context to force pruning and summarization ---
    // Budget must be large enough to hold instructions + summary + at least
    // one message after summarization fires (summary adds ~26 tokens to the
    // system message, so 50 is too tight).
    run = await createRun(150);
    run.Graph?.overrideTestModel(
      ['Got it, continuing with the summary context.'],
      1
    );
    await runTurn(
      { run, conversationHistory },
      'Now summarize everything we discussed.',
      streamConfig
    );
    logTurn('T3', conversationHistory);

    console.log(
      ` Lifecycle events — start: ${spies.onSummarizeStartSpy.mock.calls.length}, complete: ${spies.onSummarizeCompleteSpy.mock.calls.length}`
    );

    // --- Assert summarization fired ---
    expect(spies.onSummarizeStartSpy).toHaveBeenCalled();
    expect(spies.onSummarizeCompleteSpy).toHaveBeenCalled();

    const completePayload = spies.onSummarizeCompleteSpy.mock
      .calls[0][0] as t.SummarizeCompleteEvent;
    expect(getSummaryText(completePayload.summary)).toBe(KNOWN_SUMMARY);
    expect(completePayload.summary!.type).toBe(ContentTypes.SUMMARY);
    expect(completePayload.summary!.tokenCount ?? 0).toBeGreaterThan(0);

    const expectedTokenCount =
      tokenCounter(new SystemMessage(KNOWN_SUMMARY)) + SUMMARY_TOKEN_OFFSET;
    expect(completePayload.summary!.tokenCount).toBe(expectedTokenCount);

    const summaryBlock = completePayload.summary!;

    // --- Simulate cross-run persistence: build a TPayload as the host would store it ---
    // The payload and the token map below are built from the same two
    // constants so their contents cannot drift apart.
    // NOTE(review): the persisted user text differs slightly from the Turn 3
    // prompt ("... so far."); kept as in the original test.
    const persistedUserText = 'Now summarize everything we discussed so far.';
    const persistedAssistantText =
      'Got it, continuing with the summary context.';

    const persistedPayload: t.TPayload = [
      {
        role: 'assistant',
        content: [
          {
            type: ContentTypes.SUMMARY,
            text: getSummaryText(summaryBlock),
            tokenCount: summaryBlock.tokenCount ?? 0,
          } as any,
        ],
      },
      {
        role: 'user',
        content: persistedUserText,
      },
      {
        role: 'assistant',
        content: persistedAssistantText,
      },
    ];

    const persistedTokenMap: Record<number, number> = {
      0: summaryBlock.tokenCount ?? 0,
      1: tokenCounter(new HumanMessage(persistedUserText)),
      2: tokenCounter(new AIMessage(persistedAssistantText)),
    };

    // --- formatAgentMessages: convert persisted payload for next Run ---
    const formatted = formatAgentMessages(persistedPayload, persistedTokenMap);

    // Summary is returned as metadata, NOT as a SystemMessage in the messages array.
    // The caller forwards it to the run via initialSummary → AgentContext.setSummary().
    expect(formatted.summary).toBeDefined();
    expect(formatted.summary!.text).toBe(KNOWN_SUMMARY);
    expect(formatted.summary!.tokenCount).toBe(summaryBlock.tokenCount);
    // First message should NOT be a SystemMessage — only user/assistant messages remain.
    expect(formatted.messages[0].constructor.name).not.toBe('SystemMessage');

    const formattedMap = (formatted.indexTokenCountMap || {}) as Record<
      number,
      number
    >;
    const formattedTotal = Object.values(formattedMap).reduce(
      (sum: number, v: number) => sum + v,
      0
    );
    // Summary tokens no longer in the map — only user+assistant message tokens.
    const expectedTotal = persistedTokenMap[1] + persistedTokenMap[2];
    expect(formattedTotal).toBe(expectedTotal);

    console.log(
      ` Formatted: ${formatted.messages.length} msgs, tokenMap total=${formattedTotal}, summary="${formatted.summary!.text.substring(0, 60)}..."`
    );

    // --- Turn 4: new Run with formatted messages and updated indexTokenCountMap ---
    // Run.create receives a string-keyed copy of the formatted map.
    const formattedTokenMapAsStrings: Record<string, number> = {};
    for (const [k, v] of Object.entries(formattedMap)) {
      formattedTokenMapAsStrings[String(k)] = v as number;
    }

    const run4 = await Run.create<t.IState>({
      runId: `cross-run-lifecycle-t4-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: 2000,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
        initialSummary: formatted.summary,
      },
      returnContent: true,
      customHandlers: buildHandlers(
        [],
        createContentAggregator().aggregateContent,
        createSpies()
      ),
      tokenCounter,
      indexTokenCountMap: formattedTokenMapAsStrings,
    });

    run4.Graph?.overrideTestModel(['The square root of 16 is 4.'], 1);

    const t4Messages = [
      ...formatted.messages,
      new HumanMessage('What is sqrt(16)?'),
    ];
    const result = await run4.processStream(
      { messages: t4Messages },
      streamConfig as any
    );

    expect(result).toBeDefined();

    const t4RunMessages = run4.getRunMessages();
    expect(t4RunMessages).toBeDefined();
    expect(t4RunMessages!.length).toBeGreaterThan(0);

    console.log(
      ` Turn 4 produced ${t4RunMessages!.length} messages — lifecycle complete`
    );
  });

  test('tight context edge case: maxContextTokens as low as 1 does not infinite-loop', async () => {
    const spies = createSpies();
    const conversationHistory: BaseMessage[] = [];
    const tokenCounter = await createTokenCounter();

    const createRun = (maxTokens: number): Promise<Run<t.IState>> =>
      createLifecycleRun({
        runIdPrefix: 'tight-ctx',
        maxTokens,
        conversationHistory,
        tokenCounter,
        spies,
      });

    // Build a conversation first at normal context size
    let run = await createRun(4000);
    run.Graph?.overrideTestModel(
      ['Sure, 2+2 is 4. Happy to help with more math questions.'],
      1
    );
    await runTurn({ run, conversationHistory }, 'What is 2+2?', streamConfig);
    expect(conversationHistory.length).toBeGreaterThanOrEqual(2);

    // Now use absurdly tight context values — the guard must prevent infinite loops.
    // Very small values may throw "empty_messages" (context too small for any message)
    // which is fine — the point is we never hit GraphRecursionError.
    for (const tightValue of [1, 10, 25, 50]) {
      spies.onSummarizeStartSpy.mockClear();
      spies.onSummarizeCompleteSpy.mockClear();

      run = await createRun(tightValue);
      run.Graph?.overrideTestModel(['OK, noted.'], 1);

      // Snapshot the history size so a failed turn can be rolled back exactly,
      // regardless of how many messages it appended before throwing.
      const historyLengthBeforeTurn = conversationHistory.length;

      let error: Error | undefined;
      try {
        await runTurn({ run, conversationHistory }, 'Continue.', streamConfig);
      } catch (err) {
        // Catch variables are not guaranteed to be Error instances.
        error = err instanceof Error ? err : new Error(String(err));
      }

      if (error) {
        // Clean errors (empty_messages) are acceptable for tiny context windows.
        // GraphRecursionError means we looped — that's the bug we're guarding against.
        expect(error.message).not.toContain('Recursion limit');
        console.log(
          ` maxContextTokens=${tightValue}: clean error (${error.message.substring(0, 80)})`
        );
        // Drop whatever the failed turn appended so subsequent iterations work
        conversationHistory.splice(historyLengthBeforeTurn);
      } else {
        const startCalls = spies.onSummarizeStartSpy.mock.calls.length;
        const completeCalls = spies.onSummarizeCompleteSpy.mock.calls.length;
        console.log(
          ` maxContextTokens=${tightValue}: ok, start=${startCalls}, complete=${completeCalls}, msgs=${conversationHistory.length}`
        );
        // If summarization fired, it must have completed.
        // Emergency truncation may allow success without summarization, so
        // we don't require startCalls >= 1 — the test's goal is no infinite loop.
        if (startCalls > 0) {
          expect(completeCalls).toBe(startCalls);
        }
      }
    }
  });
});
|
|
1644
|
+
|
|
1645
|
+
// ---------------------------------------------------------------------------
|
|
1646
|
+
// Tight context with oversized tool results (FakeListChatModel — no API keys)
|
|
1647
|
+
// ---------------------------------------------------------------------------
|
|
1648
|
+
|
|
1649
|
+
describe('Tight context with oversized tool results (no API keys)', () => {
|
|
1650
|
+
jest.setTimeout(60_000);

// Canned reply returned by the fake summarizer model installed below.
const SUMMARY_RESPONSE =
  '## Goal\nUser needed help.\n\n## Progress\n### Done\n- Completed analysis.';
// System instructions handed to every run created in this suite.
const INSTRUCTIONS = 'You are a helpful assistant. Be concise.';
const streamConfig = {
  configurable: { thread_id: 'tight-tool-ctx' },
  streamMode: 'values',
  version: 'v2' as const,
};

// Real resolver, captured before any spy is installed so non-OpenAI
// providers can still be delegated to it.
const originalGetChatModelClass = providers.getChatModelClass;
let getChatModelClassSpy: jest.SpyInstance;

// Before each test, route the OpenAI provider to a fake model that always
// answers with SUMMARY_RESPONSE.
beforeEach(() => {
  const resolveModelClass = (provider: Providers) => {
    if (provider !== Providers.OPENAI) {
      return originalGetChatModelClass(provider);
    }
    return class extends FakeListChatModel {
      constructor(_options: any) {
        super({ responses: [SUMMARY_RESPONSE] });
      }
    } as any;
  };

  getChatModelClassSpy = jest
    .spyOn(providers, 'getChatModelClass')
    .mockImplementation(
      resolveModelClass as typeof providers.getChatModelClass
    );
});

// Put the real resolver back after each test.
afterEach(() => {
  getChatModelClassSpy.mockRestore();
});
|
|
1682
|
+
|
|
1683
|
+
/**
 * Feeds a run a history of HumanMessage → AIMessage (thinking + text +
 * tool_use blocks, with a matching tool_call) → 5000-character ToolMessage,
 * under a 500-token context budget and an llmConfig that carries a
 * `thinking` option. The run may fail, but not with the
 * "aggressive pruning removed all AI" or "Recursion limit" errors.
 */
test('oversized tool result + thinking-enabled model does not crash with tight context', async () => {
  const spies = createSpies();
  const tokenCounter = await createTokenCounter();

  // Build a conversation that mimics the real-world bug:
  // HumanMessage → AIMessage with tool_calls + thinking blocks → large ToolMessage
  const conversationHistory: BaseMessage[] = [
    new HumanMessage('Inspect the page JavaScript.'),
    new AIMessage({
      content: [
        {
          type: 'thinking' as const,
          thinking: 'Let me inspect the page using chrome-devtools MCP tool.',
        },
        { type: 'text' as const, text: 'I will inspect the page now.' },
        {
          type: 'tool_use' as const,
          id: 'tool_mcp_1',
          name: 'chrome_devtools_evaluate',
          input: '{"expression": "document.body.innerHTML"}',
        },
      ],
      tool_calls: [
        {
          id: 'tool_mcp_1',
          name: 'chrome_devtools_evaluate',
          args: { expression: 'document.body.innerHTML' },
        },
      ],
    }),
    new ToolMessage({
      content: 'x'.repeat(5000), // Large MCP output simulating JS payload
      tool_call_id: 'tool_mcp_1',
      name: 'chrome_devtools_evaluate',
    }),
  ];

  const indexTokenCountMap = buildIndexTokenCountMap(
    conversationHistory,
    tokenCounter
  );

  // Create a run with extremely tight context and thinking enabled
  const { aggregateContent } = createContentAggregator();
  const llmConfig = {
    ...getLLMConfig(Providers.OPENAI),
    thinking: { type: 'enabled', budget_tokens: 4000 },
  };
  const run = await Run.create<t.IState>({
    runId: `tight-thinking-${Date.now()}`,
    graphConfig: {
      type: 'standard',
      llmConfig: llmConfig as any,
      instructions: INSTRUCTIONS,
      maxContextTokens: 500, // Extremely tight — will prune everything
      summarizationEnabled: true,
      summarizationConfig: {
        provider: Providers.OPENAI,
      },
    },
    returnContent: true,
    customHandlers: {
      [GraphEvents.ON_RUN_STEP]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onRunStepSpy(_event, data);
          aggregateContent({
            event: GraphEvents.ON_RUN_STEP,
            data: data as t.RunStep,
          });
        },
      },
      [GraphEvents.ON_SUMMARIZE_START]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeStartSpy(data);
        },
      },
      [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeCompleteSpy(data);
        },
      },
    },
    tokenCounter,
    indexTokenCountMap,
  });

  run.Graph?.overrideTestModel(['Analysis complete.'], 1);

  let error: Error | undefined;
  try {
    await run.processStream(
      { messages: [...conversationHistory, new HumanMessage('Continue.')] },
      streamConfig as any
    );
  } catch (err) {
    // Catch variables are not guaranteed to be Error instances; normalise so
    // the message assertions below always inspect a real string.
    error = err instanceof Error ? err : new Error(String(err));
  }

  // The key assertion: no crash about "aggressive pruning removed all AI messages"
  if (error) {
    expect(error.message).not.toContain('aggressive pruning removed all AI');
    expect(error.message).not.toContain('Recursion limit');
    // empty_messages is acceptable for this tiny context window
    console.log(
      ` Tight thinking context: clean error (${error.message.substring(0, 100)})`
    );
  } else {
    console.log(' Tight thinking context: completed without error');
  }
});
|
|
1793
|
+
|
|
1794
|
+
/**
 * Builds two turns whose user prompts each carry a ~2000-character filler
 * payload at a 4000-token budget, then runs a third turn at a 500-token
 * budget. A thrown error is tolerated unless it reports "Recursion limit";
 * on success, summarization must have started and completed with a summary
 * longer than 10 characters.
 */
test('summarization survives when tool results dominate the context', async () => {
  const spies = createSpies();
  const tokenCounter = await createTokenCounter();

  // History is filled by the two large turns below (~2000 filler chars each)
  const conversationHistory: BaseMessage[] = [];

  const createRunHelper = async (
    maxTokens: number
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `tool-dominate-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: maxTokens,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: {
        [GraphEvents.ON_RUN_STEP]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onRunStepSpy(_event, data);
            aggregateContent({
              event: GraphEvents.ON_RUN_STEP,
              data: data as t.RunStep,
            });
          },
        },
        [GraphEvents.ON_SUMMARIZE_START]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeStartSpy(data);
          },
        },
        [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeCompleteSpy(data);
          },
        },
      },
      tokenCounter,
      indexTokenCountMap,
    });
  };

  // Turn 1
  let run = await createRunHelper(4000);
  run.Graph?.overrideTestModel(
    [
      'Here is a long explanation about the analysis results that covers many details of the computation.',
    ],
    1
  );
  await runTurn(
    { run, conversationHistory },
    'Analyze the following data: ' + 'y'.repeat(2000),
    streamConfig
  );

  // Turn 2
  run = await createRunHelper(4000);
  run.Graph?.overrideTestModel(
    [
      'More results from the second analysis including additional context and findings.',
    ],
    1
  );
  await runTurn(
    { run, conversationHistory },
    'Now analyze this: ' + 'z'.repeat(2000),
    streamConfig
  );

  // Turn 3 with tight context to force summarization
  run = await createRunHelper(500);
  run.Graph?.overrideTestModel(['Got it.'], 1);

  let error: Error | undefined;
  try {
    await runTurn(
      { run, conversationHistory },
      'Summarize everything.',
      streamConfig
    );
  } catch (err) {
    // Catch variables are not guaranteed to be Error instances; normalise so
    // the message assertion below always inspects a real string.
    error = err instanceof Error ? err : new Error(String(err));
  }

  if (error) {
    // empty_messages is acceptable, but not recursion errors
    expect(error.message).not.toContain('Recursion limit');
    console.log(
      ` Tool-dominated context: clean error (${error.message.substring(0, 100)})`
    );
  } else {
    // Summarization should have fired
    expect(spies.onSummarizeStartSpy).toHaveBeenCalled();
    expect(spies.onSummarizeCompleteSpy).toHaveBeenCalled();

    const completePayload = spies.onSummarizeCompleteSpy.mock
      .calls[0][0] as t.SummarizeCompleteEvent;
    expect(getSummaryText(completePayload.summary).length).toBeGreaterThan(
      10
    );
    console.log(
      ` Tool-dominated context: summary="${getSummaryText(completePayload.summary).substring(0, 100)}…"`
    );
  }
});
|
|
1912
|
+
|
|
1913
|
+
/**
 * Alternates roomy (4000-token) turns with very tight (50-token) turns so
 * that summarization can fire more than once, then checks that the last
 * completed summary still contains the `## Goal` and `## Progress` headings
 * of the fake model's SUMMARY_RESPONSE. Failures of the tight turns are
 * tolerated; at least one summarization must have completed overall.
 */
test('multiple summarization cycles preserve structured checkpoint format', async () => {
  const spies = createSpies();
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  const createRunHelper = async (
    maxTokens: number
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `multi-sum-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: maxTokens,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: {
        [GraphEvents.ON_RUN_STEP]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onRunStepSpy(_event, data);
            aggregateContent({
              event: GraphEvents.ON_RUN_STEP,
              data: data as t.RunStep,
            });
          },
        },
        [GraphEvents.ON_SUMMARIZE_START]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeStartSpy(data);
          },
        },
        [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeCompleteSpy(data);
          },
        },
      },
      tokenCounter,
      indexTokenCountMap,
    });
  };

  /**
   * Runs one turn that is allowed to fail. On failure the history is rolled
   * back to its pre-turn length, so nothing the failed turn appended leaks
   * into later turns and no earlier message is removed by mistake.
   */
  const attemptTurn = async (
    activeRun: Run<t.IState>,
    prompt: string
  ): Promise<void> => {
    const historyLengthBeforeTurn = conversationHistory.length;
    try {
      await runTurn(
        { run: activeRun, conversationHistory },
        prompt,
        streamConfig
      );
    } catch {
      // Best-effort turn: discard only what this turn added.
      conversationHistory.splice(historyLengthBeforeTurn);
    }
  };

  // Build conversation to trigger first summarization
  let run = await createRunHelper(4000);
  run.Graph?.overrideTestModel(
    ['The answer to 2+2 is 4. This is basic addition.'],
    1
  );
  await runTurn(
    { run, conversationHistory },
    'What is 2+2? Give me a detailed explanation.',
    streamConfig
  );

  run = await createRunHelper(4000);
  run.Graph?.overrideTestModel(
    ['3 times 5 is 15. Multiplication is repeated addition.'],
    1
  );
  await runTurn(
    { run, conversationHistory },
    'Now explain 3 times 5 in detail with examples.',
    streamConfig
  );

  // Force first summarization
  run = await createRunHelper(50);
  run.Graph?.overrideTestModel(['Continuing after summary.'], 1);
  await attemptTurn(run, 'Continue.');

  // Recorded for the diagnostic log below only; not asserted on.
  const firstSumCount = spies.onSummarizeCompleteSpy.mock.calls.length;

  // Build more conversation
  run = await createRunHelper(4000);
  run.Graph?.overrideTestModel(
    ['The square root of 16 is 4. This is because 4 squared equals 16.'],
    1
  );
  await runTurn(
    { run, conversationHistory },
    'What is sqrt(16)? Explain thoroughly.',
    streamConfig
  );

  // Force second summarization
  run = await createRunHelper(50);
  run.Graph?.overrideTestModel(['Continuing after second summary.'], 1);
  await attemptTurn(run, 'Continue again.');

  const totalSumCount = spies.onSummarizeCompleteSpy.mock.calls.length;
  console.log(
    ` Summarization cycles: first=${firstSumCount}, total=${totalSumCount}`
  );

  // At least one summarization should have fired
  expect(totalSumCount).toBeGreaterThanOrEqual(1);

  // The summary response from our fake model has structured format
  const lastComplete = spies.onSummarizeCompleteSpy.mock.calls[
    totalSumCount - 1
  ][0] as t.SummarizeCompleteEvent;
  const summaryText = getSummaryText(lastComplete.summary);

  // Our SUMMARY_RESPONSE includes ## Goal and ## Progress
  expect(summaryText).toContain('## Goal');
  expect(summaryText).toContain('## Progress');
  console.log(
    ` Last summary (${summaryText.length} chars): "${summaryText.substring(0, 150)}…"`
  );
});
|
|
2045
|
+
|
|
2046
|
+
test('update prompt is used when prior summary exists', async () => {
  const spies = createSpies();
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  // System-message contents handed to the summarizer model, in call order.
  // _streamResponseChunks is overridden (not _generate) because FakeListChatModel
  // has its own _streamResponseChunks that bypasses _generate during streaming.
  const capturedSystemMessages: string[] = [];
  getChatModelClassSpy.mockRestore();
  getChatModelClassSpy = jest
    .spyOn(providers, 'getChatModelClass')
    .mockImplementation(((provider: Providers) => {
      if (provider !== Providers.OPENAI) {
        return originalGetChatModelClass(provider);
      }
      return class extends FakeListChatModel {
        constructor(_options: any) {
          super({ responses: [SUMMARY_RESPONSE] });
        }
        // eslint-disable-next-line @typescript-eslint/explicit-function-return-type
        async *_streamResponseChunks(
          messages: any[],
          options: any,
          runManager?: any
        ) {
          // Record every system message before delegating to the fake model.
          if (Array.isArray(messages)) {
            for (const msg of messages) {
              const msgType = msg.getType?.() ?? msg._getType?.();
              if (msgType !== 'system') {
                continue;
              }
              capturedSystemMessages.push(
                typeof msg.content === 'string'
                  ? msg.content
                  : JSON.stringify(msg.content)
              );
            }
          }
          yield* super._streamResponseChunks(messages, options, runManager);
        }
      } as any;
    }) as typeof providers.getChatModelClass);

  type PriorSummary = { text: string; tokenCount: number };

  /**
   * Builds a fresh Run over the current conversation history.
   *
   * @param maxTokens - value passed as `maxContextTokens`
   * @param initialSummary - optional summary forwarded to the graph config
   */
  const createRunHelper = async (
    maxTokens: number,
    initialSummary?: PriorSummary
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `update-prompt-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: maxTokens,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
        initialSummary,
      },
      returnContent: true,
      customHandlers: {
        [GraphEvents.ON_RUN_STEP]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onRunStepSpy(_event, data);
            aggregateContent({
              event: GraphEvents.ON_RUN_STEP,
              data: data as t.RunStep,
            });
          },
        },
        [GraphEvents.ON_SUMMARIZE_START]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeStartSpy(data);
          },
        },
        [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeCompleteSpy(data);
          },
        },
      },
      tokenCounter,
      indexTokenCountMap,
    });
  };

  /** Runs one scripted turn on a fresh Run with a 4000-token budget. */
  const playTurn = async (
    reply: string,
    prompt: string,
    summary?: PriorSummary
  ): Promise<void> => {
    const run = await createRunHelper(4000, summary);
    run.Graph?.overrideTestModel([reply], 1);
    await runTurn({ run, conversationHistory }, prompt, streamConfig);
  };

  /**
   * Runs one turn with a 50-token budget to force summarization.
   * Only the turn itself is allowed to fail; on failure the last history
   * entry is popped. Run creation stays outside the try so setup errors
   * still fail the test.
   */
  const forceSummarization = async (
    reply: string,
    prompt: string,
    summary?: PriorSummary
  ): Promise<void> => {
    const run = await createRunHelper(50, summary);
    run.Graph?.overrideTestModel([reply], 1);
    try {
      await runTurn({ run, conversationHistory }, prompt, streamConfig);
    } catch {
      conversationHistory.pop();
    }
  };

  // --- Step 1: Build conversation and trigger FIRST summarization (fresh prompt) ---
  await playTurn(
    'The answer to 2+2 is 4. Addition is one of the four fundamental arithmetic operations.',
    'What is 2+2? Please provide a detailed explanation of the arithmetic.'
  );
  await playTurn(
    '3 times 5 is 15. Multiplication can be thought of as repeated addition.',
    'Now explain 3 times 5 with a detailed worked example of multiplication.'
  );
  await forceSummarization(
    'Continuing after first summary.',
    'Now summarize everything we discussed.'
  );

  const firstSumCount = spies.onSummarizeCompleteSpy.mock.calls.length;
  console.log(` First summarization: ${firstSumCount} complete events`);

  // Extract summary from first round to use as initialSummary
  let priorSummary: PriorSummary | undefined;
  if (firstSumCount > 0) {
    const firstComplete = spies.onSummarizeCompleteSpy.mock.calls[
      firstSumCount - 1
    ][0] as t.SummarizeCompleteEvent;
    priorSummary = {
      text: getSummaryText(firstComplete.summary),
      // Optional chaining instead of `!`: a complete event without a summary
      // yields a zero count rather than a TypeError.
      tokenCount: firstComplete.summary?.tokenCount ?? 0,
    };
  }

  // Clear captured messages — we only care about the SECOND summarization
  const firstRoundCaptures = capturedSystemMessages.length;
  capturedSystemMessages.length = 0;

  // --- Step 2: Build more conversation with initialSummary, trigger SECOND summarization ---
  // Since initialSummary is set, the summarize node should use the update prompt.
  await playTurn(
    'The square root of 16 is 4, because 4 times 4 equals 16.',
    'What is the square root of 16? Give a very detailed explanation.',
    priorSummary
  );
  await playTurn(
    '100 divided by 4 is 25. Division distributes a total into equal groups.',
    'What is 100 divided by 4? Explain division with multiple examples.',
    priorSummary
  );
  // Force second summarization (with prior summary in AgentContext)
  await forceSummarization(
    'Continuing after second summary.',
    'Continue.',
    priorSummary
  );

  const secondSumCount =
    spies.onSummarizeCompleteSpy.mock.calls.length - firstSumCount;
  console.log(
    ` Second summarization: ${secondSumCount} complete events, ` +
      `captured ${capturedSystemMessages.length} system messages (first round had ${firstRoundCaptures})`
  );

  if (capturedSystemMessages.length > 0) {
    // When a prior summary exists, verify the summarizer received context.
    // With multi-pass (chunks 1+), the FRESH prompt + continuation prefix is
    // used instead of the UPDATE prompt. Chunk 0 uses UPDATE only when it's
    // a cross-cycle prior (tested in node.test.ts unit tests).
    // In this integration test, verify that EITHER the UPDATE prompt OR the
    // continuation prefix (context-from-earlier-messages) was used, confirming
    // the prior summary was passed to the summarizer.
    const usedUpdateOrContinuation = capturedSystemMessages.some(
      (msg: string) =>
        msg.includes('Merge the new messages') ||
        msg.includes('Update the existing summary') ||
        msg.includes('context-from-earlier-messages')
    );
    expect(usedUpdateOrContinuation).toBe(true);
    console.log(
      ` System message snippet: "${capturedSystemMessages[0].substring(0, 120)}…"`
    );
  } else if (firstRoundCaptures > 0) {
    // First round used fresh prompt, second didn't fire — still validates first-round behavior
    console.log(
      ' Second summarization did not fire, but first round confirmed fresh prompt was used'
    );
  } else {
    console.log(' No system messages captured');
  }
});
|
|
2267
|
+
|
|
2268
|
+
test('empty pruning context after summarization preserves latest user turn', async () => {
  const spies = createSpies();
  const tokenCounter = await createTokenCounter();

  // Every message is padded so that none of them fits into the post-summary
  // budget on its own. This recreates the real-world failure where pruning
  // leaves an empty context, summarization fires, and the summarize node
  // used to hand back zero surviving messages.
  const largePadding = ' detailed explanation'.repeat(80); // ~1600 chars
  const padded = (text: string): string => `${text}${largePadding}`;
  const conversationHistory: BaseMessage[] = [
    new HumanMessage(padded('First question about math')),
    new AIMessage(padded('The answer is 42')),
    new HumanMessage(padded('Second question about physics')),
    new AIMessage(padded('E equals mc squared')),
    new HumanMessage(padded('Third question about chemistry')),
    new AIMessage(padded('Water is H2O')),
  ];

  const indexTokenCountMap = buildIndexTokenCountMap(
    conversationHistory,
    tokenCounter
  );

  const { aggregateContent } = createContentAggregator();

  // Event handlers that forward graph events to the spies.
  const customHandlers = {
    [GraphEvents.ON_RUN_STEP]: {
      handle: (_event: string, data: t.StreamEventData): void => {
        spies.onRunStepSpy(_event, data);
        aggregateContent({
          event: GraphEvents.ON_RUN_STEP,
          data: data as t.RunStep,
        });
      },
    },
    [GraphEvents.ON_SUMMARIZE_START]: {
      handle: (_event: string, data: t.StreamEventData): void => {
        spies.onSummarizeStartSpy(data);
      },
    },
    [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
      handle: (_event: string, data: t.StreamEventData): void => {
        spies.onSummarizeCompleteSpy(data);
      },
    },
  };

  const run = await Run.create<t.IState>({
    runId: `empty-ctx-${Date.now()}`,
    graphConfig: {
      type: 'standard',
      llmConfig: getLLMConfig(Providers.OPENAI),
      instructions: INSTRUCTIONS,
      maxContextTokens: 200, // Extremely tight — no message fits individually
      summarizationEnabled: true,
      summarizationConfig: { provider: Providers.OPENAI },
    },
    returnContent: true,
    customHandlers,
    tokenCounter,
    indexTokenCountMap,
  });

  // Scripted agent reply for the turn that follows summarization.
  run.Graph?.overrideTestModel(['Here is the answer to your question.'], 1);

  const latestUserMessage = new HumanMessage('What is the capital of France?');
  const inputMessages = [...conversationHistory, latestUserMessage];

  let failure: Error | undefined;
  try {
    await run.processStream({ messages: inputMessages }, streamConfig as any);
  } catch (thrown) {
    failure = thrown as Error;
  }

  // Summarization must have been triggered.
  expect(spies.onSummarizeStartSpy).toHaveBeenCalled();

  // Key assertion: before the fix this scenario always ended in an
  // empty_messages error because contextMessages was empty after
  // summarization. With the fix, the latest turn's HumanMessage is pulled
  // out of messagesToRefine and the model can respond.
  if (!failure) {
    // Expected outcome: the model answered.
    console.log(' Empty context fix: model responded successfully');
  } else {
    // Any error that does occur must NOT be the empty_messages error the
    // fix was designed to prevent.
    expect(failure.message).not.toContain('empty_messages');
    console.log(
      ` Empty context fix: non-empty_messages error (${failure.message.substring(0, 120)})`
    );
  }
});
|
|
2366
|
+
});
|
|
2367
|
+
|
|
2368
|
+
// ---------------------------------------------------------------------------
|
|
2369
|
+
// Token accounting audit (requires API keys)
|
|
2370
|
+
// ---------------------------------------------------------------------------
|
|
2371
|
+
|
|
2372
|
+
/**
 * Reports whether the named environment variable holds a usable value.
 * An exported-but-empty variable counts as unset, so an empty key alone
 * cannot enable the live-API suite.
 */
const isEnvKeySet = (name: string): boolean => {
  const value = process.env[name];
  return value != null && value !== '';
};

/** True when at least one provider API key is present and non-empty. */
const hasAnyApiKey =
  isEnvKeySet('ANTHROPIC_API_KEY') || isEnvKeySet('OPENAI_API_KEY');
|
|
2374
|
+
|
|
2375
|
+
(hasAnyApiKey ? describe : describe.skip)('Token accounting audit', () => {
  /**
   * Timeout passed to every test in this suite as the third argument of
   * `test()`. `jest.setTimeout` sets a file-wide default rather than a
   * describe-scoped one, so another describe in this file could override it.
   */
  const AUDIT_TIMEOUT_MS = 180_000;

  const agentProvider =
    process.env.ANTHROPIC_API_KEY != null &&
    process.env.ANTHROPIC_API_KEY !== ''
      ? Providers.ANTHROPIC
      : Providers.OPENAI;
  const summarizationProvider = agentProvider;
  const summarizationModel =
    agentProvider === Providers.ANTHROPIC ? 'claude-haiku-4-5' : 'gpt-4.1-mini';

  const streamConfig = {
    configurable: { thread_id: 'token-audit-e2e' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  const INSTRUCTIONS =
    'You are a math tutor. Use the calculator tool for ALL computations. Be concise.';

  /** Progressively tighter context budgets tried until summarization starts. */
  const SQUEEZE_BUDGETS = [3500, 3200, 3100, 3000, 2800, 2500, 2000];

  /**
   * Creates the per-test state (spies, history, token counter) and the
   * helpers that drive turns against it. Each test gets its own harness so
   * nothing is shared between tests.
   */
  // eslint-disable-next-line @typescript-eslint/explicit-function-return-type
  const createAuditHarness = async () => {
    const spies = createSpies();
    // Reassigned by createRun, so it always holds the latest run's usage.
    let collectedUsage: UsageMetadata[] = [];
    const conversationHistory: BaseMessage[] = [];
    const tokenCounter = await createTokenCounter();

    /** Builds a fresh run over the current history and resets usage capture. */
    const createRun = async (maxTokens = 4000): Promise<Run<t.IState>> => {
      collectedUsage = [];
      const { aggregateContent } = createContentAggregator();
      const indexTokenCountMap = buildIndexTokenCountMap(
        conversationHistory,
        tokenCounter
      );
      return createSummarizationRun({
        agentProvider,
        summarizationProvider,
        summarizationModel,
        maxContextTokens: maxTokens,
        instructions: INSTRUCTIONS,
        collectedUsage,
        aggregateContent,
        spies,
        tokenCounter,
        indexTokenCountMap,
      });
    };

    /** Sends one user prompt on a fresh run (default budget when omitted). */
    const ask = async (prompt: string, maxTokens?: number): Promise<void> => {
      const run = await createRun(maxTokens);
      await runTurn({ run, conversationHistory }, prompt, streamConfig);
    };

    /** Shrinks the budget turn by turn until a summarize-start event is seen. */
    const squeezeUntilSummarized = async (): Promise<void> => {
      for (const squeeze of SQUEEZE_BUDGETS) {
        if (spies.onSummarizeStartSpy.mock.calls.length > 0) {
          break;
        }
        await ask(`What is ${squeeze} - 1000? Calculator.`, squeeze);
      }
    };

    /** Usage entries of the most recent run that report input_tokens > 0. */
    const validUsage = (): UsageMetadata[] =>
      collectedUsage.filter(
        (u: Partial<UsageMetadata>) =>
          u.input_tokens != null && u.input_tokens > 0
      );

    /** input_tokens of the last valid usage entry, or 0 when there is none. */
    const lastInputTokens = (): number => {
      const usage = validUsage();
      return usage.length > 0
        ? (usage[usage.length - 1]?.input_tokens ?? 0)
        : 0;
    };

    return {
      spies,
      tokenCounter,
      ask,
      squeezeUntilSummarized,
      validUsage,
      lastInputTokens,
    };
  };

  test(
    'token count map is accurate after summarization cycle',
    async () => {
      const { spies, ask, squeezeUntilSummarized, validUsage } =
        await createAuditHarness();

      // Accumulate messages over 6 turns at generous budget
      const warmupPrompts = [
        'What is 42 * 58? Calculator.',
        'Now compute 2436 + 1000. Calculator.',
        'What is 3436 / 4? Calculator.',
        'Compute 999 * 2. Calculator.',
        'What is 2^10? Calculator. Also list everything.',
        'Calculate 355 / 113. Calculator.',
      ];
      for (const prompt of warmupPrompts) {
        await ask(prompt);
      }

      // Squeeze progressively to force summarization
      await squeezeUntilSummarized();

      // Verify summarization fired
      expect(spies.onSummarizeCompleteSpy).toHaveBeenCalled();

      const completePayload = spies.onSummarizeCompleteSpy.mock
        .calls[0][0] as t.SummarizeCompleteEvent;
      // Optional chaining: a missing summary fails the matcher below with a
      // readable message instead of throwing a TypeError here.
      const summaryTokens = completePayload.summary?.tokenCount;
      expect(summaryTokens).toBeGreaterThan(10);
      expect(summaryTokens).toBeLessThan(1500);

      // Token accounting: collectedUsage should have valid entries
      const usage = validUsage();
      expect(usage.length).toBeGreaterThan(0);

      console.log(
        ` Token audit: summary=${summaryTokens} tokens, ` +
          `usageEntries=${usage.length}`
      );
    },
    AUDIT_TIMEOUT_MS
  );

  test(
    'summary tokenCount matches local token counter',
    async () => {
      const { spies, tokenCounter, ask, squeezeUntilSummarized } =
        await createAuditHarness();

      // Accumulate history at generous limits (6 turns)
      const warmupPrompts = [
        'What is 100 * 200? Calculator.',
        'Now compute 20000 + 5000. Calculator.',
        'What is 25000 / 5? Calculator. Remind me of prior results.',
        'Compute 2^16 with calculator.',
        'What is 65536 + 5000? Calculator.',
        'Calculate 70536 / 7. Calculator.',
      ];
      for (const prompt of warmupPrompts) {
        await ask(prompt);
      }

      // Squeeze progressively to force summarization
      await squeezeUntilSummarized();

      expect(spies.onSummarizeCompleteSpy).toHaveBeenCalled();

      const completePayload = spies.onSummarizeCompleteSpy.mock
        .calls[0][0] as t.SummarizeCompleteEvent;
      const summaryText = getSummaryText(completePayload.summary);
      const reportedTokenCount = completePayload.summary?.tokenCount ?? 0;

      // Count tokens locally using the same tokenizer
      const localTokenCount = tokenCounter(new SystemMessage(summaryText));

      console.log(
        ` Token match: reported=${reportedTokenCount}, local=${localTokenCount}`
      );

      // Guard the division below: a zero local count would make the variance
      // NaN or Infinity and hide the real cause of the failure.
      expect(localTokenCount).toBeGreaterThan(0);

      // Token counts may differ slightly due to encoding differences
      // (claude vs o200k_base) and the 1.1× Claude correction factor.
      // Allow up to 25% variance.
      const variance =
        Math.abs(reportedTokenCount - localTokenCount) / localTokenCount;
      expect(variance).toBeLessThan(0.25);
    },
    AUDIT_TIMEOUT_MS
  );

  test(
    'collectedUsage input_tokens decreases after summarization',
    async () => {
      // The timeout is supplied as the third argument of test(): calling
      // jest.setTimeout from inside a running test cannot extend that test.
      const { spies, ask, squeezeUntilSummarized, lastInputTokens } =
        await createAuditHarness();

      // Build up conversation (6 turns at generous budget)
      await ask('What is 12345 * 67? Calculator.');

      // Capture pre-summary input_tokens (taken after the first turn only)
      const preSumInputTokens = lastInputTokens();

      const remainingPrompts = [
        'Now divide that by 13. Calculator. Also multiply by 7.',
        'Compute 999 * 888. Calculator.',
        'What is 2^10? Calculator.',
        'Calculate 1024 + 5000. Calculator. List all prior results.',
        'What is 6024 * 3? Calculator.',
      ];
      for (const prompt of remainingPrompts) {
        await ask(prompt);
      }

      // Squeeze progressively to force summarization
      await squeezeUntilSummarized();

      // Post-summary turn
      await ask('What is 10 + 10? Calculator.', 4000);

      const postSumInputTokens = lastInputTokens();

      console.log(
        ` Input tokens: pre-summary=${preSumInputTokens}, post-summary=${postSumInputTokens}`
      );

      // The pre-summary sample comes from the very first turn, so it is not a
      // full-context baseline. The check is therefore limited to both
      // readings being present once summarization has fired.
      if (spies.onSummarizeCompleteSpy.mock.calls.length > 0) {
        expect(postSumInputTokens).toBeGreaterThan(0);
        expect(preSumInputTokens).toBeGreaterThan(0);
        console.log(
          ` Summarization fired: ${spies.onSummarizeCompleteSpy.mock.calls.length} times`
        );
      }
    },
    AUDIT_TIMEOUT_MS
  );
});
|
|
2736
|
+
|
|
2737
|
+
// ---------------------------------------------------------------------------
|
|
2738
|
+
// Enrichment and prompt selection (FakeListChatModel — no API keys)
|
|
2739
|
+
// ---------------------------------------------------------------------------
|
|
2740
|
+
|
|
2741
|
+
describe('Enrichment and prompt selection (no API keys)', () => {
|
|
2742
|
+
jest.setTimeout(60_000);
|
|
2743
|
+
|
|
2744
|
+
const INSTRUCTIONS = 'You are a helpful assistant.';
|
|
2745
|
+
const streamConfig = {
|
|
2746
|
+
configurable: { thread_id: 'enrichment-tests' },
|
|
2747
|
+
streamMode: 'values',
|
|
2748
|
+
version: 'v2' as const,
|
|
2749
|
+
};
|
|
2750
|
+
|
|
2751
|
+
let getChatModelClassSpy: jest.SpyInstance;
|
|
2752
|
+
const originalGetChatModelClass = providers.getChatModelClass;
|
|
2753
|
+
|
|
2754
|
+
// The fake summarizer includes a basic summary without tool failures section
|
|
2755
|
+
const BASE_SUMMARY =
|
|
2756
|
+
'## Goal\nHelp user.\n\n## Progress\n### Done\n- Assisted user.';
|
|
2757
|
+
|
|
2758
|
+
beforeEach(() => {
|
|
2759
|
+
getChatModelClassSpy = jest
|
|
2760
|
+
.spyOn(providers, 'getChatModelClass')
|
|
2761
|
+
.mockImplementation(((provider: Providers) => {
|
|
2762
|
+
if (provider === Providers.OPENAI) {
|
|
2763
|
+
return class extends FakeListChatModel {
|
|
2764
|
+
constructor(_options: any) {
|
|
2765
|
+
super({ responses: [BASE_SUMMARY] });
|
|
2766
|
+
}
|
|
2767
|
+
} as any;
|
|
2768
|
+
}
|
|
2769
|
+
return originalGetChatModelClass(provider);
|
|
2770
|
+
}) as typeof providers.getChatModelClass);
|
|
2771
|
+
});
|
|
2772
|
+
|
|
2773
|
+
afterEach(() => {
|
|
2774
|
+
getChatModelClassSpy.mockRestore();
|
|
2775
|
+
});
|
|
2776
|
+
|
|
2777
|
+
/**
 * A failed tool call inside the history being summarized must show up in the
 * summary text under a "## Tool Failures" heading, together with the tool name
 * and the error text.
 *
 * The run is limited to 50 context tokens to force summarization. If the
 * budget is so small that summarization never fires, the test only logs that
 * fact and passes without asserting anything (inconclusive, not a failure).
 */
test('tool failure enrichment appended to summary', async () => {
  const spies = createSpies();
  const tokenCounter = await createTokenCounter();

  // Conversation containing one `run_linter` call whose ToolMessage has status 'error'.
  const conversationHistory: BaseMessage[] = [
    new HumanMessage('Run the linter on my code.'),
    new AIMessage({
      content: [
        { type: 'text' as const, text: 'Running the linter now.' },
        {
          type: 'tool_use' as const,
          id: 'tool_lint_1',
          name: 'run_linter',
          input: '{"path": "/src/index.ts"}',
        },
      ],
      tool_calls: [
        {
          id: 'tool_lint_1',
          name: 'run_linter',
          args: { path: '/src/index.ts' },
        },
      ],
    }),
    new ToolMessage({
      content: 'Error: ENOENT: no such file or directory, open /src/index.ts',
      tool_call_id: 'tool_lint_1',
      name: 'run_linter',
      status: 'error',
    }),
    new AIMessage('The linter failed because the file was not found.'),
    new HumanMessage('Try again with the correct path.'),
    new AIMessage(
      'I will try again. The correct path would need to be provided by you since I cannot verify file existence.'
    ),
  ];

  const indexTokenCountMap = buildIndexTokenCountMap(
    conversationHistory,
    tokenCounter
  );

  const { aggregateContent } = createContentAggregator();
  const run = await Run.create<t.IState>({
    runId: `tool-failure-enrich-${Date.now()}`,
    graphConfig: {
      type: 'standard',
      llmConfig: getLLMConfig(Providers.OPENAI),
      instructions: INSTRUCTIONS,
      maxContextTokens: 50, // Very tight to force summarization
      summarizationEnabled: true,
      summarizationConfig: {
        provider: Providers.OPENAI,
      },
    },
    returnContent: true,
    customHandlers: {
      [GraphEvents.ON_RUN_STEP]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onRunStepSpy(_event, data);
          aggregateContent({
            event: GraphEvents.ON_RUN_STEP,
            data: data as t.RunStep,
          });
        },
      },
      [GraphEvents.ON_SUMMARIZE_START]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeStartSpy(data);
        },
      },
      [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeCompleteSpy(data);
        },
      },
    },
    tokenCounter,
    indexTokenCountMap,
  });

  run.Graph?.overrideTestModel(['Understood, awaiting correct path.'], 1);

  try {
    await run.processStream(
      {
        messages: [...conversationHistory, new HumanMessage('What happened?')],
      },
      streamConfig as any
    );
  } catch (err) {
    // empty_messages is acceptable for tiny context, so the run stays
    // best-effort. The error is logged rather than dropped so that an
    // unrelated failure is still visible in the test output.
    const message = err instanceof Error ? err.message : String(err);
    console.log(
      ` Tool failure test: run threw (${message.substring(0, 80)})`
    );
  }

  if (spies.onSummarizeCompleteSpy.mock.calls.length > 0) {
    const completePayload = spies.onSummarizeCompleteSpy.mock
      .calls[0][0] as t.SummarizeCompleteEvent;
    const summaryText = getSummaryText(completePayload.summary);

    // The enrichment step in node.ts should append ## Tool Failures
    expect(summaryText).toContain('## Tool Failures');
    expect(summaryText).toContain('run_linter');
    expect(summaryText).toContain('ENOENT');

    console.log(` Enriched summary: "${summaryText.substring(0, 200)}…"`);
  } else {
    // If summarization didn't fire due to context being too tight,
    // the test is inconclusive but not a failure
    console.log(
      ' Summarization did not fire (context too tight for any message)'
    );
  }
});
|
|
2894
|
+
});
|
|
2895
|
+
|
|
2896
|
+
// ---------------------------------------------------------------------------
|
|
2897
|
+
// Summarization deduplication and correctness (FakeListChatModel — no API keys)
|
|
2898
|
+
// ---------------------------------------------------------------------------
|
|
2899
|
+
|
|
2900
|
+
describe('Summarization deduplication correctness (no API keys)', () => {
|
|
2901
|
+
// Raise the Jest timeout to 60 seconds.
jest.setTimeout(60_000);

// System prompt and stream options shared by the tests in this suite.
const INSTRUCTIONS =
  'You are a math tutor. Use the calculator tool for ALL computations. Be concise.';
const streamConfig = {
  configurable: { thread_id: 'multi-pass-correctness' },
  streamMode: 'values',
  version: 'v2' as const,
};

// Real implementation, kept so the stubs can delegate for other providers.
const originalGetChatModelClass = providers.getChatModelClass;
// Assigned by each test that installs its own stub; undefined until then.
let getChatModelClassSpy: jest.SpyInstance | undefined;

afterEach(() => {
  getChatModelClassSpy?.mockRestore();
});
|
|
2919
|
+
|
|
2920
|
+
/**
 * Builds a four-turn conversation with a roomy context, then replays it with a
 * 50-token context to force summarization, and checks that the final summary
 * contains exactly one "## Goal" and one "## Progress" heading and reports a
 * positive token count.
 */
test('summarization does not produce duplicate section headers', async () => {
  const spies = createSpies();
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  // Track what the summarizer receives for each chunk
  const capturedSystemMessages: string[] = [];
  const capturedHumanMessages: string[] = [];

  // Return different summaries for each chunk — chunk 2 returns a proper
  // comprehensive summary that does NOT duplicate ## Goal.
  // The counter advances once per stub model construction; constructions past
  // the end of the list reuse the last response.
  let chunkCallCount = 0;
  const chunkResponses = [
    '## Goal\nUser needs math computations.\n\n## Progress\n### Done\n- Computed 2+2=4.\n- Computed 3*5=15.',
    '## Goal\nUser needs comprehensive math help including basic and advanced operations.\n\n## Progress\n### Done\n- Computed 2+2=4.\n- Computed 3*5=15.\n- Computed sqrt(16)=4.\n- Computed 100/4=25.\n\n## Next Steps\nContinue with more calculations.',
  ];

  getChatModelClassSpy = jest
    .spyOn(providers, 'getChatModelClass')
    .mockImplementation(((provider: Providers) => {
      if (provider === Providers.OPENAI) {
        return class extends FakeListChatModel {
          constructor(_options: any) {
            const response =
              chunkResponses[chunkCallCount] ??
              chunkResponses[chunkResponses.length - 1];
            chunkCallCount++;
            super({ responses: [response] });
          }
          // Record system/human prompt text, then defer to the fake model.
          // eslint-disable-next-line @typescript-eslint/explicit-function-return-type
          async *_streamResponseChunks(
            messages: any[],
            options: any,
            runManager?: any
          ) {
            for (const msg of messages) {
              const msgType = msg.getType?.() ?? msg._getType?.();
              const content =
                typeof msg.content === 'string'
                  ? msg.content
                  : JSON.stringify(msg.content);
              if (msgType === 'system') capturedSystemMessages.push(content);
              if (msgType === 'human') capturedHumanMessages.push(content);
            }
            yield* super._streamResponseChunks(messages, options, runManager);
          }
        } as any;
      }
      return originalGetChatModelClass(provider);
    }) as typeof providers.getChatModelClass);

  // Creates a fresh run over the current history with the given context limit.
  const createRunHelper = async (
    maxTokens: number
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `multi-pass-dedup-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: maxTokens,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
          parameters: {},
        },
      },
      returnContent: true,
      customHandlers: {
        [GraphEvents.ON_RUN_STEP]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onRunStepSpy(_event, data);
            aggregateContent({
              event: GraphEvents.ON_RUN_STEP,
              data: data as t.RunStep,
            });
          },
        },
        [GraphEvents.ON_SUMMARIZE_START]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeStartSpy(data);
          },
        },
        [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
          handle: (_event: string, data: t.StreamEventData): void => {
            spies.onSummarizeCompleteSpy(data);
          },
        },
      },
      tokenCounter,
      indexTokenCountMap,
    });
  };

  // Build enough conversation history to trigger summarization.
  // Each entry is one user prompt and the canned assistant reply for that turn.
  const warmupTurns: ReadonlyArray<{ prompt: string; reply: string }> = [
    {
      prompt: 'What is 2+2? Explain in detail.',
      reply: 'The answer to 2+2 is 4. Basic addition.',
    },
    {
      prompt: 'Now explain 3 times 5 in great detail with many examples.',
      reply: '3 times 5 is 15. Multiplication is repeated addition.',
    },
    {
      prompt: 'What is sqrt(16)? Give a thorough step-by-step explanation.',
      reply: 'The square root of 16 is 4, because 4*4=16.',
    },
    {
      prompt: 'What is 100/4? Explain division with multiple worked examples.',
      reply:
        '100 divided by 4 is 25. Division distributes a total into equal parts.',
    },
  ];
  for (const { prompt, reply } of warmupTurns) {
    const warmupRun = await createRunHelper(4000);
    warmupRun.Graph?.overrideTestModel([reply], 1);
    await runTurn({ run: warmupRun, conversationHistory }, prompt, streamConfig);
  }

  // Now force summarization with tight context
  const tightRun = await createRunHelper(50);
  tightRun.Graph?.overrideTestModel(['Continuing after summary.'], 1);
  try {
    await runTurn(
      { run: tightRun, conversationHistory },
      'Continue.',
      streamConfig
    );
  } catch {
    conversationHistory.pop(); // remove failed user message
  }

  // Assert summarization fired
  const sumCount = spies.onSummarizeCompleteSpy.mock.calls.length;
  console.log(
    ` Dedup: ${sumCount} summarization(s), ${chunkCallCount} chunk LLM calls, ` +
      `${capturedSystemMessages.length} system messages captured`
  );

  expect(sumCount).toBeGreaterThanOrEqual(1);

  const lastComplete = spies.onSummarizeCompleteSpy.mock.calls[
    sumCount - 1
  ][0] as t.SummarizeCompleteEvent;
  const summary = lastComplete.summary;
  const summaryText = getSummaryText(summary);

  // KEY ASSERTION: ## Goal should appear exactly ONCE (no duplication)
  const goalCount = (summaryText.match(/## Goal/g) || []).length;
  expect(goalCount).toBe(1);

  // ## Progress should also appear exactly once
  const progressCount = (summaryText.match(/## Progress/g) || []).length;
  expect(progressCount).toBe(1);

  // tokenCount must be > 0 (tokenCounter is provided). A missing summary
  // fails this matcher instead of throwing a TypeError.
  expect(summary?.tokenCount).toBeGreaterThan(0);

  console.log(
    ` Summary (${summaryText.length} chars, ${summary?.tokenCount} tokens):\n` +
      ` "${summaryText.substring(0, 300)}…"`
  );
});
|
|
3106
|
+
|
|
3107
|
+
test('repeated summarization cycles do not accumulate duplicate sections', async () => {
  // End-to-end check that every summary emitted across several runs is clean
  // (no duplicate section headers). The cross-cycle prompt selection
  // (UPDATE for chunk 0, FRESH for chunk 1+) is tested in unit tests
  // (node.test.ts); this integration test only looks at the outcome.
  const spies = createSpies();
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  // Canned summarizer output: a single clean set of sections.
  const summaryResponse =
    '## Goal\nMath tutoring.\n\n## Progress\n### Done\n- Completed operations.';

  getChatModelClassSpy = jest
    .spyOn(providers, 'getChatModelClass')
    .mockImplementation(((provider: Providers) => {
      if (provider !== Providers.OPENAI) {
        return originalGetChatModelClass(provider);
      }
      return class extends FakeListChatModel {
        constructor(_options: any) {
          super({ responses: [summaryResponse] });
        }
      } as any;
    }) as typeof providers.getChatModelClass);

  // Creates a run over the current history; `seedSummary` is forwarded as
  // graphConfig.initialSummary.
  const createRunHelper = async (
    contextLimit: number,
    seedSummary?: { text: string; tokenCount: number }
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    const customHandlers = {
      [GraphEvents.ON_RUN_STEP]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onRunStepSpy(_event, data);
          aggregateContent({
            event: GraphEvents.ON_RUN_STEP,
            data: data as t.RunStep,
          });
        },
      },
      [GraphEvents.ON_SUMMARIZE_START]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeStartSpy(data);
        },
      },
      [GraphEvents.ON_SUMMARIZE_COMPLETE]: {
        handle: (_event: string, data: t.StreamEventData): void => {
          spies.onSummarizeCompleteSpy(data);
        },
      },
    };
    return Run.create<t.IState>({
      runId: `repeat-sum-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: contextLimit,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
        initialSummary: seedSummary,
      },
      returnContent: true,
      customHandlers,
      tokenCounter,
      indexTokenCountMap,
    });
  };

  // Snapshot of every ON_SUMMARIZE_COMPLETE payload recorded so far.
  const completedEvents = (): t.SummarizeCompleteEvent[] =>
    spies.onSummarizeCompleteSpy.mock.calls.map(
      (call) => call[0] as t.SummarizeCompleteEvent
    );

  // --- Cycle 1: Build conversation and trigger summarization ---
  const firstTurn = await createRunHelper(4000);
  firstTurn.Graph?.overrideTestModel(['Answer 1 with detailed explanation.'], 1);
  await runTurn(
    { run: firstTurn, conversationHistory },
    'Question 1.',
    streamConfig
  );

  const secondTurn = await createRunHelper(4000);
  secondTurn.Graph?.overrideTestModel(['Answer 2 with more explanation.'], 1);
  await runTurn(
    { run: secondTurn, conversationHistory },
    'Question 2.',
    streamConfig
  );

  const cycle1Tight = await createRunHelper(50);
  cycle1Tight.Graph?.overrideTestModel(['OK.'], 1);
  try {
    await runTurn(
      { run: cycle1Tight, conversationHistory },
      'Summarize.',
      streamConfig
    );
  } catch {
    conversationHistory.pop();
  }

  const cycle1Events = completedEvents();
  const cycle1SumCount = cycle1Events.length;

  // Extract the summary from cycle 1 for use as initialSummary in cycle 2
  const lastCycle1 =
    cycle1SumCount > 0 ? cycle1Events[cycle1SumCount - 1] : undefined;
  const priorSummary: { text: string; tokenCount: number } | undefined =
    lastCycle1
      ? {
        text: getSummaryText(lastCycle1.summary),
        tokenCount: lastCycle1.summary!.tokenCount ?? 0,
      }
      : undefined;

  // --- Cycle 2: More conversation with prior summary, trigger again ---
  const cycle2Turn = await createRunHelper(4000, priorSummary);
  cycle2Turn.Graph?.overrideTestModel(['Cycle 2 answer.'], 1);
  await runTurn(
    { run: cycle2Turn, conversationHistory },
    'Cycle 2 question.',
    streamConfig
  );

  const cycle2Tight = await createRunHelper(50, priorSummary);
  cycle2Tight.Graph?.overrideTestModel(['OK cycle 2.'], 1);
  try {
    await runTurn(
      { run: cycle2Tight, conversationHistory },
      'Summarize again.',
      streamConfig
    );
  } catch {
    conversationHistory.pop();
  }

  const allEvents = completedEvents();
  const totalSumCount = allEvents.length;
  console.log(
    ` Repeated summarization: cycle1=${cycle1SumCount}, total=${totalSumCount}`
  );

  // At least one summarization should have fired
  expect(totalSumCount).toBeGreaterThanOrEqual(1);

  // Every summary should have exactly one ## Goal (no duplicates)
  allEvents.forEach((complete, i) => {
    const text = getSummaryText(complete.summary);
    const goalCount = (text.match(/## Goal/g) || []).length;
    if (goalCount !== 1) {
      // Print the offending summary before the assertion below fails.
      console.log(
        ` Summary ${i} has ${goalCount} '## Goal' sections:\n "${text.substring(0, 300)}…"`
      );
    }
    expect(goalCount).toBe(1);
    expect(complete.summary!.tokenCount).toBeGreaterThan(0);
  });
});
|
|
3259
|
+
|
|
3260
|
+
/**
 * After a tight-context run has produced a summary, a further turn with a
 * 2000-token context must still complete. If no summary was produced the test
 * ends without asserting anything.
 */
test('conversation continues after summarization', async () => {
  const spies = createSpies();
  const conversationHistory: BaseMessage[] = [];
  const tokenCounter = await createTokenCounter();

  // Summarizer returns a concise summary
  const summaryResponse =
    '## Goal\nMath help.\n\n## Progress\n### Done\n- Basic operations completed.';

  getChatModelClassSpy = jest
    .spyOn(providers, 'getChatModelClass')
    .mockImplementation(((provider: Providers) => {
      if (provider !== Providers.OPENAI) {
        return originalGetChatModelClass(provider);
      }
      return class extends FakeListChatModel {
        constructor(_options: any) {
          super({ responses: [summaryResponse] });
        }
      } as any;
    }) as typeof providers.getChatModelClass);

  // Creates a run over the current history with the given context limit.
  const createRunHelper = async (
    contextLimit: number
  ): Promise<Run<t.IState>> => {
    const { aggregateContent } = createContentAggregator();
    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );
    return Run.create<t.IState>({
      runId: `multi-pass-continue-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: contextLimit,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
          parameters: {},
        },
      },
      returnContent: true,
      customHandlers: buildHandlers([], aggregateContent, spies),
      tokenCounter,
      indexTokenCountMap,
    });
  };

  // Build conversation
  const warmupQuestions = [
    'Explain 2+2 in great detail.',
    'Explain 3*5 step by step.',
    'What is sqrt(16)? Full explanation.',
    'What is 100/4? Show your work.',
  ];
  for (const question of warmupQuestions) {
    const warmupRun = await createRunHelper(4000);
    warmupRun.Graph?.overrideTestModel(
      [
        'Here is a detailed explanation of the computation with many steps and examples.',
      ],
      1
    );
    await runTurn(
      { run: warmupRun, conversationHistory },
      question,
      streamConfig
    );
  }

  // Trigger summarization
  const summarizeRun = await createRunHelper(100);
  summarizeRun.Graph?.overrideTestModel(['Summary acknowledged.'], 1);
  try {
    await runTurn(
      { run: summarizeRun, conversationHistory },
      'Continue.',
      streamConfig
    );
  } catch {
    conversationHistory.pop();
  }

  const sumCount = spies.onSummarizeCompleteSpy.mock.calls.length;
  console.log(` Continuation test: ${sumCount} summarization(s)`);

  // Nothing to follow up on when no summary was produced.
  if (sumCount === 0) {
    return;
  }

  // Post-summary turn should work with reasonable context
  const followUpRun = await createRunHelper(2000);
  followUpRun.Graph?.overrideTestModel(['The answer is 42.'], 1);
  const postResult = await runTurn(
    { run: followUpRun, conversationHistory },
    'What is 6*7?',
    streamConfig
  );
  expect(postResult).toBeDefined();
  console.log(
    ` Post-summary turn succeeded, ${conversationHistory.length} messages`
  );
});
|
|
3354
|
+
});
|
|
3355
|
+
|
|
3356
|
+
// ---------------------------------------------------------------------------
|
|
3357
|
+
// Re-summarization within a single run (FakeListChatModel — no API keys)
|
|
3358
|
+
// Tests the shouldSkipSummarization baseline reset fix.
|
|
3359
|
+
// ---------------------------------------------------------------------------
|
|
3360
|
+
|
|
3361
|
+
describe('Re-summarization within a single run (no API keys)', () => {
  jest.setTimeout(60_000);

  const SUMMARY_V1 = '## Summary v1\nUser discussed topic A.';
  const SUMMARY_V2 = '## Summary v2\nUser discussed topic A and B.';
  const INSTRUCTIONS = 'You are a helpful assistant.';
  const streamConfig = {
    configurable: { thread_id: 're-summarize-test' },
    recursionLimit: 80,
    streamMode: 'values',
    version: 'v2' as const,
  };

  const originalGetChatModelClass = providers.getChatModelClass;
  let getChatModelClassSpy: jest.SpyInstance;
  // Number of stub models constructed in the current test.
  let summaryCallCount = 0;

  beforeEach(() => {
    summaryCallCount = 0;
    getChatModelClassSpy = jest
      .spyOn(providers, 'getChatModelClass')
      .mockImplementation(((provider: Providers) => {
        if (provider !== Providers.OPENAI) {
          return originalGetChatModelClass(provider);
        }
        // The first constructed stub answers with SUMMARY_V1, every later one
        // with SUMMARY_V2.
        return class extends FakeListChatModel {
          constructor(_options: any) {
            summaryCallCount += 1;
            const reply = summaryCallCount === 1 ? SUMMARY_V1 : SUMMARY_V2;
            super({ responses: [reply] });
          }
        } as any;
      }) as typeof providers.getChatModelClass);
  });

  afterEach(() => {
    getChatModelClassSpy.mockRestore();
  });

  test('second summarization fires after context refills post-first-summary', async () => {
    const spies = createSpies();
    const tokenCounter = await createTokenCounter();

    // Build a long conversation that will need multiple summarization cycles:
    // ten padded question/answer pairs followed by one short final question.
    const padding = 'x'.repeat(400);
    const conversationHistory: BaseMessage[] = [];
    Array.from({ length: 10 }, (_, i) => i).forEach((i) => {
      conversationHistory.push(
        new HumanMessage(`Question ${i}${padding}`),
        new AIMessage(`Answer ${i}${padding}`)
      );
    });
    conversationHistory.push(new HumanMessage('Final question'));

    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );

    const { aggregateContent } = createContentAggregator();
    const collectedUsage: UsageMetadata[] = [];

    const run = await Run.create<t.IState>({
      runId: `re-sum-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: 600,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: buildHandlers(collectedUsage, aggregateContent, spies),
      tokenCounter,
      indexTokenCountMap,
    });

    // A thrown error is recorded for the diagnostic log, not treated as failure.
    let error: Error | undefined;
    try {
      await run.processStream(
        { messages: conversationHistory },
        streamConfig as any
      );
    } catch (err) {
      error = err as Error;
    }

    const startCalls = spies.onSummarizeStartSpy.mock.calls.length;
    const completeCalls = spies.onSummarizeCompleteSpy.mock.calls.length;
    const errorSummary = error ? error.message.substring(0, 80) : 'none';
    console.log(
      ` Summarization cycles: start=${startCalls}, complete=${completeCalls}, error=${errorSummary}`
    );

    // With enough messages and tight context the intent is for summarization
    // to fire more than once (before the shouldSkipSummarization baseline
    // reset fix it would fire only once).
    // NOTE(review): the assertion below only requires one start event, so a
    // missing second cycle would not fail this test. Confirm whether the
    // bound should be 2.
    expect(startCalls).toBeGreaterThanOrEqual(1);
    console.log(` Summary model calls: ${summaryCallCount}`);
  });
});
|
|
3463
|
+
|
|
3464
|
+
// ---------------------------------------------------------------------------
|
|
3465
|
+
// Emoji/Unicode safety through full pipeline (FakeListChatModel — no API keys)
|
|
3466
|
+
// ---------------------------------------------------------------------------
|
|
3467
|
+
|
|
3468
|
+
/**
 * Runs emoji-heavy messages through a 100-token-context run with
 * summarization enabled and checks that any error thrown is not a JSON or
 * Unicode serialization error.
 */
describe('Emoji and Unicode safety (no API keys)', () => {
  jest.setTimeout(60_000);

  const SUMMARY = '## Summary\nUser sent emoji-heavy messages about coding.';
  const streamConfig = {
    configurable: { thread_id: 'emoji-safety-test' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  const originalGetChatModelClass = providers.getChatModelClass;
  let getChatModelClassSpy: jest.SpyInstance;

  beforeEach(() => {
    // Stub the OPENAI model class with a fake that always answers SUMMARY;
    // every other provider keeps its real class.
    getChatModelClassSpy = jest
      .spyOn(providers, 'getChatModelClass')
      .mockImplementation(((provider: Providers) => {
        if (provider !== Providers.OPENAI) {
          return originalGetChatModelClass(provider);
        }
        return class extends FakeListChatModel {
          constructor(_options: any) {
            super({ responses: [SUMMARY] });
          }
        } as any;
      }) as typeof providers.getChatModelClass);
  });

  afterEach(() => {
    getChatModelClassSpy.mockRestore();
  });

  test('emoji-heavy messages do not produce broken JSON in summarization', async () => {
    const spies = createSpies();
    const tokenCounter = await createTokenCounter();

    // ZWJ sequences and multi-byte emoji that produce surrogate pairs in UTF-16.
    // The zero-width joiner (U+200D) and variation selector (U+FE0F) are
    // written as escapes because they are invisible in source and easily
    // stripped by editors or tooling, which would leave plain adjacent emoji.
    const emojiMessages: BaseMessage[] = [
      new HumanMessage('👨\u200D💻 Let me show you some code 🚀'),
      new AIMessage('Sure! Here is the code 🎉✨ with lots of emoji 🌍🌎🌏'),
      new HumanMessage(
        '👨\u200D👩\u200D👧\u200D👦 Family emoji and flags 🇺🇸🇬🇧🇯🇵 test'
      ),
      new AIMessage(
        'More emoji: 🧑\u200D🔬🧑\u200D🎨🧑\u200D🚒🧑\u200D✈\uFE0F professional emoji'
      ),
      new HumanMessage('Final 💯🔥⚡ question'),
    ];

    const indexTokenCountMap = buildIndexTokenCountMap(
      emojiMessages,
      tokenCounter
    );

    const { aggregateContent } = createContentAggregator();
    const collectedUsage: UsageMetadata[] = [];

    const run = await Run.create<t.IState>({
      runId: `emoji-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: 'Be helpful.',
        maxContextTokens: 100,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: buildHandlers(collectedUsage, aggregateContent, spies),
      tokenCounter,
      indexTokenCountMap,
    });

    // The test passes if this doesn't throw a JSON serialization error
    let error: Error | undefined;
    try {
      await run.processStream({ messages: emojiMessages }, streamConfig as any);
    } catch (err) {
      error = err as Error;
    }

    // empty_messages is acceptable (tight context), but JSON errors are not
    if (error) {
      expect(error.message).not.toContain('not valid JSON');
      expect(error.message).not.toContain('Invalid Unicode');
      console.log(
        ` Emoji test: acceptable error (${error.message.substring(0, 80)})`
      );
    } else {
      console.log(' Emoji test: completed without error');
    }

    console.log(
      ` Summarization: start=${spies.onSummarizeStartSpy.mock.calls.length}, complete=${spies.onSummarizeCompleteSpy.mock.calls.length}`
    );
  });
});
|
|
3563
|
+
|
|
3564
|
+
// ---------------------------------------------------------------------------
|
|
3565
|
+
// Budget-aware error messages (FakeListChatModel — no API keys)
|
|
3566
|
+
// ---------------------------------------------------------------------------
|
|
3567
|
+
|
|
3568
|
+
/**
 * Checks that a run whose context budget is far too small fails with an
 * `empty_messages` error that carries actionable guidance (reduce tools,
 * raise `maxContextTokens`, or shorten the system prompt).
 */
describe('Budget-aware error messages (no API keys)', () => {
  jest.setTimeout(60_000);

  const streamConfig = {
    configurable: { thread_id: 'budget-error-test' },
    streamMode: 'values',
    version: 'v2' as const,
  };

  test('empty_messages error includes tool-specific guidance when tools dominate budget', async () => {
    const spies = createSpies();
    const tokenCounter = await createTokenCounter();

    const conversationHistory: BaseMessage[] = [new HumanMessage('Hello')];

    const indexTokenCountMap = buildIndexTokenCountMap(
      conversationHistory,
      tokenCounter
    );

    const { aggregateContent } = createContentAggregator();
    const collectedUsage: UsageMetadata[] = [];

    // Create a run with maxContextTokens smaller than the tool definitions
    // The Calculator tool alone has a schema that takes up tokens
    const run = await Run.create<t.IState>({
      runId: `budget-err-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        tools: [new Calculator()],
        instructions: 'A'.repeat(500), // Long instructions to push over budget
        maxContextTokens: 50, // Impossibly tight
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: buildHandlers(collectedUsage, aggregateContent, spies),
      tokenCounter,
      indexTokenCountMap,
    });

    // Capture the failure instead of letting it abort the test so its
    // message can be inspected below.
    let error: Error | undefined;
    try {
      await run.processStream(
        { messages: conversationHistory },
        streamConfig as any
      );
    } catch (err) {
      error = err as Error;
    }

    expect(error).toBeDefined();
    // The error should mention the budget problem specifically
    const errorMsg = error!.message;
    expect(errorMsg).toContain('empty_messages');

    // Should contain actionable guidance about instructions or tools
    const hasGuidance =
      errorMsg.includes('Reduce the number of tools') ||
      errorMsg.includes('Increase maxContextTokens') ||
      errorMsg.includes('shorten the system prompt');
    expect(hasGuidance).toBe(true);

    // Log at most 120 characters of the message, starting at "Please" when
    // it is present. The start offset is resolved once and the end bound is
    // derived from it, so a "not found" index of -1 can no longer shorten
    // the logged slice to 119 characters.
    const guidanceStart = Math.max(errorMsg.indexOf('Please'), 0);
    console.log(
      ` Budget error guidance: ${errorMsg.substring(guidanceStart, guidanceStart + 120)}`
    );
  });
});
|
|
3639
|
+
|
|
3640
|
+
// ---------------------------------------------------------------------------
|
|
3641
|
+
// Large tool result + surviving context double-summarization regression
|
|
3642
|
+
// (FakeListChatModel — no API keys)
|
|
3643
|
+
//
|
|
3644
|
+
// Models the real-world scenario from debug logs:
|
|
3645
|
+
// - Multi-turn conversation with MCP tools (screenshots, snapshots)
|
|
3646
|
+
// - Summarization fires once → surviving context includes a 9437-char tool result
|
|
3647
|
+
// - Post-summarization prune: the tool result exceeds the effective budget
|
|
3648
|
+
// - All surviving messages land in messagesToRefine
|
|
3649
|
+
// - Before fix: summarization re-triggers immediately on the same messages
|
|
3650
|
+
// - After fix: shouldSkipSummarization blocks re-trigger (baseline = surviving count)
|
|
3651
|
+
// ---------------------------------------------------------------------------
|
|
3652
|
+
|
|
3653
|
+
/**
 * Regression suite for repeated summarization of the same surviving context.
 *
 * `providers.getChatModelClass` is spied on so that the OpenAI provider
 * resolves to a `FakeListChatModel` subclass with canned summary text; every
 * other provider is delegated to the real implementation. The spy is
 * installed before each test and restored afterwards.
 */
describe('Large tool result surviving context — no double summarization (no API keys)', () => {
  jest.setTimeout(60_000);

  const INSTRUCTIONS = 'You are a browser automation assistant.';
  const FIRST_SUMMARY =
    '## Summary\nUser navigated to apple.com, took screenshots, ran Lighthouse audit.';
  const LATER_SUMMARY =
    '## Summary v2\nUser explored apple.com with devtools, took snapshots.';

  const runOptions = {
    configurable: { thread_id: 'double-sum-regression' },
    recursionLimit: 80,
    streamMode: 'values',
    version: 'v2' as const,
  };

  const realGetChatModelClass = providers.getChatModelClass;
  let modelClassSpy: jest.SpyInstance;
  // Number of fake OpenAI model instances constructed during the current test.
  let fakeModelBuilds = 0;

  beforeEach(() => {
    fakeModelBuilds = 0;

    const resolveModelClass = ((provider: Providers) => {
      if (provider !== Providers.OPENAI) {
        return realGetChatModelClass(provider);
      }
      return class extends FakeListChatModel {
        constructor(_options: any) {
          fakeModelBuilds += 1;
          // The first instance replies with the first summary; later ones
          // reply with the second.
          super({
            responses: [fakeModelBuilds === 1 ? FIRST_SUMMARY : LATER_SUMMARY],
          });
        }
      } as any;
    }) as typeof providers.getChatModelClass;

    modelClassSpy = jest
      .spyOn(providers, 'getChatModelClass')
      .mockImplementation(resolveModelClass);
  });

  afterEach(() => {
    modelClassSpy.mockRestore();
  });

  test('surviving context with oversized tool result does not re-trigger summarization', async () => {
    const hooks = createSpies();
    const counter = await createTokenCounter();

    // Builders for the assistant-tool-call / tool-result pairs used below.
    const assistantCall = (
      content: string,
      id: string,
      name: string,
      args: Record<string, unknown> = {}
    ): AIMessage => new AIMessage({ content, tool_calls: [{ id, name, args }] });
    const toolResult = (
      content: string,
      tool_call_id: string,
      name: string
    ): ToolMessage => new ToolMessage({ content, tool_call_id, name });

    // One deliberately oversized snapshot payload (several thousand
    // characters) sits among otherwise short messages.
    const oversizedSnapshot = 'uid=1_0 RootWebArea "Apple" '.repeat(300);

    const history: BaseMessage[] = [
      new HumanMessage('Navigate to apple.com'),
      assistantCall('Navigating now.', 'tc_1', 'navigate_page', {
        url: 'https://apple.com',
      }),
      toolResult(
        'Successfully navigated to https://www.apple.com.',
        'tc_1',
        'navigate_page'
      ),
      assistantCall('Taking a screenshot.', 'tc_2', 'take_screenshot'),
      toolResult(
        'Took a screenshot of the current page.',
        'tc_2',
        'take_screenshot'
      ),
      new HumanMessage('What can you see on the site?'),
      assistantCall('Let me take a snapshot.', 'tc_3', 'take_snapshot'),
      toolResult(oversizedSnapshot, 'tc_3', 'take_snapshot'),
      new HumanMessage('Show me more details'),
      assistantCall(
        'Here are the details from the page.',
        'tc_4',
        'take_screenshot'
      ),
      toolResult('Took another screenshot.', 'tc_4', 'take_screenshot'),
      new HumanMessage('Analyze the page performance'),
    ];

    const tokenCounts = buildIndexTokenCountMap(history, counter);

    const { aggregateContent } = createContentAggregator();
    const usageLog: UsageMetadata[] = [];

    // An 800-token budget: small relative to the snapshot above, so the
    // history cannot be kept in full.
    const run = await Run.create<t.IState>({
      runId: `double-sum-${Date.now()}`,
      graphConfig: {
        type: 'standard',
        llmConfig: getLLMConfig(Providers.OPENAI),
        instructions: INSTRUCTIONS,
        maxContextTokens: 800,
        summarizationEnabled: true,
        summarizationConfig: {
          provider: Providers.OPENAI,
        },
      },
      returnContent: true,
      customHandlers: buildHandlers(usageLog, aggregateContent, hooks),
      tokenCounter: counter,
      indexTokenCountMap: tokenCounts,
    });

    // A thrown error is recorded rather than rethrown; it is only logged,
    // and the assertions below run either way.
    let caught: Error | undefined;
    try {
      await run.processStream({ messages: history }, runOptions as any);
    } catch (err) {
      caught = err as Error;
    }

    const started = hooks.onSummarizeStartSpy.mock.calls.length;
    const completed = hooks.onSummarizeCompleteSpy.mock.calls.length;
    console.log(
      ` Summarization: start=${started}, complete=${completed}, modelCalls=${fakeModelBuilds}`
    );

    if (caught) {
      // empty_messages is acceptable for tight context; double-summarization is not
      console.log(` Error: ${caught.message.substring(0, 100)}`);
    }

    // Core check: the summarize-start hook and the fake model constructor
    // must each have fired no more than once for this run.
    expect(started).toBeLessThanOrEqual(1);
    expect(fakeModelBuilds).toBeLessThanOrEqual(1);
    console.log(
      ` Double-summarization prevented: ${started <= 1 ? 'YES' : 'NO'}`
    );
  });
});
|