brownian-code 2026.2.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. package/LICENSE +21 -0
  2. package/README.md +97 -0
  3. package/bin/brownian +25 -0
  4. package/env.example +21 -0
  5. package/package.json +87 -0
  6. package/src/agent/agent.test.ts +414 -0
  7. package/src/agent/agent.ts +385 -0
  8. package/src/agent/index.ts +27 -0
  9. package/src/agent/prompts.ts +271 -0
  10. package/src/agent/scratchpad.test.ts +482 -0
  11. package/src/agent/scratchpad.ts +526 -0
  12. package/src/agent/token-counter.test.ts +59 -0
  13. package/src/agent/token-counter.ts +33 -0
  14. package/src/agent/types.ts +137 -0
  15. package/src/cli.tsx +385 -0
  16. package/src/commands/builtin.test.ts +271 -0
  17. package/src/commands/builtin.ts +200 -0
  18. package/src/commands/registry.test.ts +188 -0
  19. package/src/commands/registry.ts +111 -0
  20. package/src/commands/types.ts +64 -0
  21. package/src/components/AgentEventView.tsx +487 -0
  22. package/src/components/AnswerBox.tsx +81 -0
  23. package/src/components/ApiKeyPrompt.tsx +75 -0
  24. package/src/components/CommandMenu.test.tsx +64 -0
  25. package/src/components/CommandMenu.tsx +38 -0
  26. package/src/components/CursorText.tsx +43 -0
  27. package/src/components/DebugPanel.tsx +48 -0
  28. package/src/components/ErrorBox.test.tsx +58 -0
  29. package/src/components/ErrorBox.tsx +26 -0
  30. package/src/components/HelpView.test.tsx +70 -0
  31. package/src/components/HelpView.tsx +61 -0
  32. package/src/components/HistoryItemView.tsx +108 -0
  33. package/src/components/Input.tsx +193 -0
  34. package/src/components/Intro.test.tsx +59 -0
  35. package/src/components/Intro.tsx +35 -0
  36. package/src/components/ModelSelector.tsx +288 -0
  37. package/src/components/StatusBar.test.tsx +78 -0
  38. package/src/components/StatusBar.tsx +56 -0
  39. package/src/components/WorkingIndicator.tsx +133 -0
  40. package/src/components/index.ts +23 -0
  41. package/src/e2e/agent-flow.test.ts +378 -0
  42. package/src/evals/components/EvalApp.tsx +206 -0
  43. package/src/evals/components/EvalCurrentQuestion.tsx +42 -0
  44. package/src/evals/components/EvalProgress.tsx +33 -0
  45. package/src/evals/components/EvalRecentResults.tsx +63 -0
  46. package/src/evals/components/EvalStats.tsx +49 -0
  47. package/src/evals/components/index.ts +5 -0
  48. package/src/evals/dataset/crypto_agent.csv +16 -0
  49. package/src/evals/run.ts +355 -0
  50. package/src/gateway/channels/whatsapp/auth-store.ts +15 -0
  51. package/src/gateway/channels/whatsapp/inbound.ts +86 -0
  52. package/src/gateway/channels/whatsapp/login.ts +28 -0
  53. package/src/gateway/channels/whatsapp/outbound.ts +27 -0
  54. package/src/gateway/channels/whatsapp/session.ts +69 -0
  55. package/src/gateway/config.ts +81 -0
  56. package/src/gateway/index.ts +62 -0
  57. package/src/hooks/useAgentRunner.ts +317 -0
  58. package/src/hooks/useDebugLogs.ts +22 -0
  59. package/src/hooks/useInputHistory.ts +106 -0
  60. package/src/hooks/useModelSelection.ts +249 -0
  61. package/src/hooks/useTextBuffer.test.ts +121 -0
  62. package/src/hooks/useTextBuffer.ts +97 -0
  63. package/src/index.tsx +74 -0
  64. package/src/mcp/cache.ts +205 -0
  65. package/src/mcp/client.test.ts +126 -0
  66. package/src/mcp/client.ts +145 -0
  67. package/src/mcp/index.ts +2 -0
  68. package/src/model/llm.test.ts +158 -0
  69. package/src/model/llm.ts +233 -0
  70. package/src/providers.ts +94 -0
  71. package/src/skills/index.ts +17 -0
  72. package/src/skills/loader.ts +73 -0
  73. package/src/skills/registry.ts +125 -0
  74. package/src/skills/types.ts +31 -0
  75. package/src/test-utils/mocks.ts +110 -0
  76. package/src/theme.ts +21 -0
  77. package/src/tools/browser/browser.ts +357 -0
  78. package/src/tools/browser/index.ts +1 -0
  79. package/src/tools/crypto/hive-tools.ts +171 -0
  80. package/src/tools/crypto/index.ts +1 -0
  81. package/src/tools/descriptions/browser.ts +105 -0
  82. package/src/tools/descriptions/crypto-search.ts +58 -0
  83. package/src/tools/descriptions/index.ts +8 -0
  84. package/src/tools/descriptions/web-fetch.ts +44 -0
  85. package/src/tools/descriptions/web-search.ts +26 -0
  86. package/src/tools/fetch/cache.ts +95 -0
  87. package/src/tools/fetch/external-content.ts +200 -0
  88. package/src/tools/fetch/index.ts +1 -0
  89. package/src/tools/fetch/web-fetch-utils.ts +122 -0
  90. package/src/tools/fetch/web-fetch.ts +371 -0
  91. package/src/tools/index.ts +12 -0
  92. package/src/tools/registry.ts +130 -0
  93. package/src/tools/search/exa.ts +43 -0
  94. package/src/tools/search/index.ts +2 -0
  95. package/src/tools/search/tavily.ts +35 -0
  96. package/src/tools/skill.ts +62 -0
  97. package/src/tools/types.ts +53 -0
  98. package/src/utils/ai-message.ts +26 -0
  99. package/src/utils/config.ts +54 -0
  100. package/src/utils/cost-calculator.test.ts +101 -0
  101. package/src/utils/cost-calculator.ts +74 -0
  102. package/src/utils/env.ts +101 -0
  103. package/src/utils/error-classifier.test.ts +146 -0
  104. package/src/utils/error-classifier.ts +91 -0
  105. package/src/utils/in-memory-chat-history.test.ts +291 -0
  106. package/src/utils/in-memory-chat-history.ts +224 -0
  107. package/src/utils/index.ts +19 -0
  108. package/src/utils/input-key-handlers.test.ts +155 -0
  109. package/src/utils/input-key-handlers.ts +64 -0
  110. package/src/utils/logger.ts +67 -0
  111. package/src/utils/long-term-chat-history.ts +138 -0
  112. package/src/utils/markdown-table.ts +227 -0
  113. package/src/utils/ollama.ts +37 -0
  114. package/src/utils/progress-channel.ts +84 -0
  115. package/src/utils/text-navigation.test.ts +222 -0
  116. package/src/utils/text-navigation.ts +81 -0
  117. package/src/utils/thinking-verbs.ts +29 -0
  118. package/src/utils/tokens.test.ts +163 -0
  119. package/src/utils/tokens.ts +67 -0
  120. package/src/utils/tool-description.ts +88 -0
@@ -0,0 +1,56 @@
1
+ import React from 'react';
2
+ import { Box, Text } from 'ink';
3
+ import { colors } from '../theme.js';
4
+ import { formatCost } from '../utils/cost-calculator.js';
5
+
6
/** Props accepted by {@link StatusBar}. */
interface StatusBarProps {
  /** Model label rendered as the first segment of the bar. */
  modelDisplayName: string;
  /** Running token total; rendered via `toLocaleString()`. */
  cumulativeTokens: number;
  /** Running cost; rendered via `formatCost`. */
  cumulativeCost: number;
  /** Number of turns so far; the bar is hidden while this is 0. */
  turnCount: number;
  /** Context usage, compared against 70 and 90 to pick the token colour. */
  contextPercentage?: number;
}

/**
 * One-line session summary: model · tokens · cost · turns.
 *
 * Renders nothing while `turnCount` is 0. The token segment switches to the
 * warning colour when `contextPercentage` exceeds 70, and to the error colour
 * (plus a `/compact` hint) when it exceeds 90.
 */
export function StatusBar({
  modelDisplayName,
  cumulativeTokens,
  cumulativeCost,
  turnCount,
  contextPercentage,
}: StatusBarProps) {
  // Nothing worth showing before the first turn.
  if (turnCount === 0) return null;

  // A missing percentage behaves like "no pressure": both comparisons are false.
  const usage = contextPercentage ?? 0;
  const isCritical = usage > 90;
  const isElevated = !isCritical && usage > 70;

  const tokenColor: string = isCritical
    ? colors.error
    : isElevated
      ? colors.warning
      : colors.muted;
  const contextWarning = isCritical ? ' (use /compact)' : '';

  const tokenLabel = `${cumulativeTokens.toLocaleString()} tokens`;
  const costLabel = formatCost(cumulativeCost);
  const turnLabel = `${turnCount} ${turnCount === 1 ? 'turn' : 'turns'}`;

  const separator = <Text color={colors.mutedDark}>{' · '}</Text>;

  return (
    <Box>
      <Text color={colors.mutedDark}>{'┄ '}</Text>
      <Text color={colors.muted}>{modelDisplayName}</Text>
      {separator}
      <Text color={tokenColor}>{tokenLabel}{contextWarning}</Text>
      {separator}
      <Text color={colors.muted}>{costLabel}</Text>
      {separator}
      <Text color={colors.muted}>{turnLabel}</Text>
      <Text color={colors.mutedDark}>{' ┄'}</Text>
    </Box>
  );
}
@@ -0,0 +1,133 @@
1
+ import React, { useState, useEffect, useRef, useMemo } from 'react';
2
+ import { Box, Text } from 'ink';
3
+ import Spinner from 'ink-spinner';
4
+ import { colors } from '../theme.js';
5
+ import { getRandomThinkingVerb } from '../utils/thinking-verbs.js';
6
+
7
/**
 * Renders `text` one character per `<Text>` with a highlight ("shine") that
 * sweeps left-to-right.
 *
 * The highlight advances one character every 60 ms. Once it would step past
 * the last character the sweep pauses (no highlight is drawn) for 2 seconds
 * and then restarts from position 0.
 *
 * @param text       String to render.
 * @param color      Colour of non-highlighted characters.
 * @param shineColor Colour of characters currently under the highlight.
 */
function ShineText({ text, color, shineColor }: { text: string; color: string; shineColor: string }) {
  const [shinePos, setShinePos] = useState(0);
  const [isPaused, setIsPaused] = useState(false);
  // Mirrors `shinePos` so the timer callback can read the current position
  // without using a state updater. Updater functions must stay pure, so the
  // pause flag must not be set from inside one.
  const shinePosRef = useRef(0);

  useEffect(() => {
    if (isPaused) {
      // Wait 2 seconds before restarting the shine
      const timeout = setTimeout(() => {
        shinePosRef.current = 0;
        setShinePos(0);
        setIsPaused(false);
      }, 2000);
      return () => clearTimeout(timeout);
    }

    const interval = setInterval(() => {
      const next = shinePosRef.current + 1;
      if (next >= text.length) {
        // Keep the position where it is until the pause completes
        setIsPaused(true);
        return;
      }
      shinePosRef.current = next;
      setShinePos(next);
    }, 60);

    return () => clearInterval(interval);
  }, [isPaused, text.length]);

  // Memoize the rendered parts for performance
  const parts = useMemo(() => {
    const result: React.ReactNode[] = [];
    for (let i = 0; i < text.length; i++) {
      // Highlight characters within 1.25 of shine position (~2.5 char width)
      const isShine = !isPaused && Math.abs(i - shinePos) < 1.25;
      result.push(
        <Text key={i} color={isShine ? shineColor : color}>
          {text[i]}
        </Text>
      );
    }
    return result;
  }, [text, shinePos, isPaused, color, shineColor]);

  return <>{parts}</>;
}
55
+
56
/** What the agent is currently doing, as consumed by {@link WorkingIndicator}. */
export type WorkingState =
  | { status: 'idle' }
  | { status: 'thinking' }
  | { status: 'tool'; toolName: string }
  | { status: 'answering'; startTime: number };

interface WorkingIndicatorProps {
  state: WorkingState;
}

/**
 * Spinner line with an "esc to interrupt" hint, rendered for every state
 * except `idle` (for which it renders nothing).
 *
 * `thinking` and `tool` show a random verb followed by `...`; the verb is
 * re-rolled each time the status enters one of those two states from any
 * other state. `answering` shows the fixed word `Answering`.
 */
export function WorkingIndicator({ state }: WorkingIndicatorProps) {
  // NOTE(review): `elapsed` is kept up to date below but is not read by the
  // render output in this block. Confirm whether it is meant to be displayed.
  const [elapsed, setElapsed] = useState(0);
  const [thinkingVerb, setThinkingVerb] = useState(getRandomThinkingVerb);
  const lastStatusRef = useRef<WorkingState['status']>('idle');

  // Re-roll the verb on the edge "not thinking/tool" -> "thinking/tool".
  useEffect(() => {
    const previous = lastStatusRef.current;
    const busyNow = state.status === 'thinking' || state.status === 'tool';
    const busyBefore = previous === 'thinking' || previous === 'tool';

    if (busyNow && !busyBefore) {
      setThinkingVerb(getRandomThinkingVerb());
    }

    lastStatusRef.current = state.status;
  }, [state.status]);

  // Whole seconds since `startTime`, refreshed once a second while answering.
  useEffect(() => {
    if (state.status !== 'answering') {
      setElapsed(0);
      return;
    }

    const { startTime } = state;
    const refresh = () => {
      setElapsed(Math.floor((Date.now() - startTime) / 1000));
    };

    refresh();
    const timer = setInterval(refresh, 1000);

    return () => clearInterval(timer);
  }, [state]);

  if (state.status === 'idle') {
    return null;
  }

  // Only 'thinking' | 'tool' | 'answering' remain at this point.
  const statusWord: string = state.status === 'answering' ? 'Answering' : `${thinkingVerb}...`;
  const suffixEnd: string = ' to interrupt)';

  return (
    <Box>
      <Text color={colors.primary}>
        <Spinner type="dots" />
      </Text>
      <Text color={colors.primary}> </Text>
      <ShineText text={statusWord} color={colors.primary} shineColor={colors.primaryLight} />
      <Text color={colors.muted}> (</Text>
      <Text color={colors.muted} bold>esc</Text>
      <Text color={colors.muted}>{suffixEnd}</Text>
    </Box>
  );
}
@@ -0,0 +1,23 @@
1
+ export { Intro } from './Intro.js';
2
+ export { Input } from './Input.js';
3
+ export { CursorText } from './CursorText.js';
4
+ export { AnswerBox } from './AnswerBox.js';
5
+ export { ProviderSelector, ModelSelector, PROVIDERS, getModelsForProvider, getDefaultModelForProvider } from './ModelSelector.js';
6
+ export { ApiKeyConfirm, ApiKeyInput } from './ApiKeyPrompt.js';
7
+ export { DebugPanel } from './DebugPanel.js';
8
+
9
+ // V2 components
10
+ export { EventListView } from './AgentEventView.js';
11
+ export type { DisplayEvent } from './AgentEventView.js';
12
+
13
+ export { WorkingIndicator } from './WorkingIndicator.js';
14
+ export type { WorkingState } from './WorkingIndicator.js';
15
+
16
+ export { HistoryItemView } from './HistoryItemView.js';
17
+ export type { HistoryItem } from './HistoryItemView.js';
18
+
19
+ // UI enhancement components
20
+ export { StatusBar } from './StatusBar.js';
21
+ export { CommandMenu } from './CommandMenu.js';
22
+ export { HelpView, ShortcutsView } from './HelpView.js';
23
+ export { ErrorBox } from './ErrorBox.js';
@@ -0,0 +1,378 @@
1
+ /**
2
+ * End-to-end integration tests for the agent flow.
3
+ * Tests the complete event sequence with mocked LLM + tools.
4
+ */
5
+ import { describe, test, expect, mock, beforeAll, afterAll, beforeEach } from 'bun:test';
6
+ import { createMockAIMessage, createMockTool, createTempDir } from '../test-utils/mocks.js';
7
+ import type { AgentEvent, DoneEvent, ToolEndEvent, ToolErrorEvent, ContextClearedEvent } from '../agent/types.js';
8
+
9
+ // ---------------------------------------------------------------------------
10
+ // Mock all external dependencies
11
+ // ---------------------------------------------------------------------------
12
+
13
// Default LLM stub: resolves to a plain 'default' reply with fixed usage numbers.
// Individual tests override it with mockResolvedValueOnce / mockImplementation.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const mockCallLlm = mock(async (): Promise<any> => {
  const usage = { inputTokens: 100, outputTokens: 50, totalTokens: 150 };
  return { response: 'default', usage };
});

// Replace the real LLM module with the stub above.
mock.module('../model/llm.js', () => {
  return {
    callLlm: mockCallLlm,
    DEFAULT_MODEL: 'claude-sonnet-4-5',
  };
});
23
+
24
/** In-memory tools handed to the agent in place of the real tool registry. */
const testTools = {
  // Echoes the query alongside a canned price string.
  search: createMockTool('search', (args) =>
    JSON.stringify({ query: args.query, result: 'BTC $65,000, ETH $3,500' }),
  ),
  // Echoes the endpoint alongside a canned numeric payload.
  api: createMockTool('api', (args) =>
    JSON.stringify({ endpoint: args.endpoint, data: { price: 65000 } }),
  ),
  // Always throws, to exercise the tool-error path.
  broken: createMockTool('broken', () => {
    throw new Error('Connection timeout');
  }),
  // Resolves after a 10 ms timer.
  slow: createMockTool('slow', async () => {
    await new Promise((resolve) => setTimeout(resolve, 10));
    return 'slow result';
  }),
  // Returns a huge result to trigger context clearing
  bigdata: createMockTool('bigdata', () => 'x'.repeat(200_000)),
};
43
+
44
// Registry stub: returns every tool in `testTools` unless a test narrows it
// with mockReturnValue.
const mockGetTools = mock(() => Object.values(testTools));

mock.module('../tools/registry.js', () => {
  return {
    getTools: mockGetTools,
    buildToolDescriptions: () => 'Test tool descriptions',
  };
});

// Prompt builders are reduced to short, deterministic strings.
mock.module('../agent/prompts.js', () => {
  const buildIterationPrompt = (query: string, results: string) =>
    `Query: ${query}\nResults: ${results}`;
  const buildFinalAnswerPrompt = (query: string, context: string) =>
    `Answer: ${query}\nContext: ${context}`;

  return {
    buildSystemPrompt: () => 'Test system prompt.',
    buildIterationPrompt,
    buildFinalAnswerPrompt,
    DEFAULT_SYSTEM_PROMPT: 'Default test prompt.',
  };
});

// Tool descriptions collapse to the bare tool name.
mock.module('../utils/tool-description.js', () => {
  return {
    getToolDescription: (name: string) => name,
  };
});

// No skills are discovered in tests.
mock.module('../skills/index.js', () => {
  return {
    discoverSkills: () => [],
    buildSkillMetadataSection: () => '',
  };
});

// Loaded dynamically, after the mock.module() registrations above — presumably
// so the agent module resolves the stubs rather than the real dependencies.
const { Agent } = await import('../agent/agent.js');
68
+
69
+ // ---------------------------------------------------------------------------
70
+ // Setup
71
+ // ---------------------------------------------------------------------------
72
+
73
// Working directory to restore, and the temp-dir remover, both captured in beforeAll.
let originalCwd: string;
let cleanup: () => void;

// Run the whole suite from inside a fresh temporary directory.
beforeAll(() => {
  originalCwd = process.cwd();
  const { cleanup: removeTempDir, path: tempPath } = createTempDir();
  cleanup = removeTempDir;
  process.chdir(tempPath);
});

// Step back out of the temp dir before removing it.
afterAll(() => {
  process.chdir(originalCwd);
  cleanup();
});
87
+
88
// Give every test a pristine LLM stub and the full tool list.
beforeEach(() => {
  // mockClear() only forgets recorded calls. Implementations installed with
  // mockImplementation() and unconsumed mockResolvedValueOnce() values would
  // survive into the next test, so reset the stub and reinstall the default
  // reply explicitly.
  mockCallLlm.mockReset();
  mockCallLlm.mockImplementation(async () => ({
    response: 'default',
    usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
  }));

  mockGetTools.mockClear();
  mockGetTools.mockReturnValue(Object.values(testTools));
});
93
+
94
+ // ---------------------------------------------------------------------------
95
+ // Helpers
96
+ // ---------------------------------------------------------------------------
97
+
98
/**
 * Runs a fresh agent on `query` and drains its event stream into an array.
 *
 * @param query  Prompt passed to `agent.run`.
 * @param config Extra options spread over the default `{ model: 'claude-sonnet-4-5' }`.
 * @returns Every event yielded by the run, in emission order.
 */
async function collectEvents(query: string, config = {}): Promise<AgentEvent[]> {
  const agent = Agent.create({ model: 'claude-sonnet-4-5', ...config });
  const collected: AgentEvent[] = [];
  for await (const emitted of agent.run(query)) {
    collected.push(emitted);
  }
  return collected;
}
106
+
107
/**
 * Returns the first event whose `type` equals `type`, cast to `T`.
 * The cast is unchecked: the caller picks `T` to match the requested type.
 *
 * @returns The first match, or `undefined` when no event has that type.
 */
function findEvent<T extends AgentEvent>(events: AgentEvent[], type: string): T | undefined {
  for (const candidate of events) {
    if (candidate.type === type) {
      return candidate as T;
    }
  }
  return undefined;
}
110
+
111
/**
 * Returns every event whose `type` equals `type`, in original order, cast to `T[]`.
 * The cast is unchecked: the caller picks `T` to match the requested type.
 *
 * @returns A new array; empty when nothing matches.
 */
function findAllEvents<T extends AgentEvent>(events: AgentEvent[], type: string): T[] {
  const matches: T[] = [];
  events.forEach((candidate) => {
    if (candidate.type === type) {
      matches.push(candidate as T);
    }
  });
  return matches;
}
114
+
115
+ // ---------------------------------------------------------------------------
116
+ // Test: Complete price lookup flow
117
+ // ---------------------------------------------------------------------------
118
+
119
describe('E2E: price lookup flow', () => {
  test('query → tool_call → tool_end → final answer', async () => {
    // Scripted LLM replies, consumed in order:
    //   1. the LLM decides to search
    //   2. the LLM sees the results and decides to answer
    //   3. final answer generation
    const scriptedReplies = [
      {
        response: createMockAIMessage('Searching for price data', [
          { name: 'search', args: { query: 'BTC price' } },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: 'I have the data.',
        usage: { inputTokens: 200, outputTokens: 50, totalTokens: 250 },
      },
      {
        response: 'BTC is currently at $65,000.',
        usage: { inputTokens: 150, outputTokens: 80, totalTokens: 230 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('What is the BTC price?');
    const types = events.map((event) => event.type);

    // Every stage of the flow must be represented in the stream
    const expectedTypes = ['thinking', 'tool_start', 'tool_end', 'answer_start', 'done'] as const;
    for (const expectedType of expectedTypes) {
      expect(types).toContain(expectedType);
    }

    // Ordering: tool_start before tool_end, answer_start before done
    expect(types.indexOf('tool_start')).toBeLessThan(types.indexOf('tool_end'));
    expect(types.indexOf('answer_start')).toBeLessThan(types.indexOf('done'));

    // Payload of the done event
    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.answer).toBe('BTC is currently at $65,000.');
    expect(done.toolCalls.length).toBe(1);
    expect(done.toolCalls[0].tool).toBe('search');
    expect(done.iterations).toBeGreaterThanOrEqual(1);
    expect(done.tokenUsage).toBeDefined();
    expect(done.tokenUsage!.totalTokens).toBe(630); // 150+250+230
  });
});
169
+
170
+ // ---------------------------------------------------------------------------
171
+ // Test: Tool error recovery
172
+ // ---------------------------------------------------------------------------
173
+
174
describe('E2E: tool error recovery', () => {
  test('first tool fails → second tool works → answer produced', async () => {
    // Scripted LLM replies, consumed in order:
    //   1. call the broken tool
    //   2. see the error and try a different tool
    //   3. decide there is enough data
    //   4. final answer
    const scriptedReplies = [
      {
        response: createMockAIMessage('', [
          { name: 'broken', args: {} },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: createMockAIMessage('Let me try a different approach', [
          { name: 'search', args: { query: 'BTC' } },
        ]),
        usage: { inputTokens: 200, outputTokens: 50, totalTokens: 250 },
      },
      {
        response: 'Got the data now.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: 'Despite initial error, BTC is $65k.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('BTC with error recovery');

    const types = events.map((event) => event.type);
    expect(types).toContain('tool_error');
    expect(types).toContain('done');

    const toolError = findEvent<ToolErrorEvent>(events, 'tool_error')!;
    expect(toolError.error).toContain('Connection timeout');

    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.answer).toBeDefined();
    expect(done.toolCalls.length).toBe(2); // broken + search
  });
});
215
+
216
+ // ---------------------------------------------------------------------------
217
+ // Test: Multi-tool flow
218
+ // ---------------------------------------------------------------------------
219
+
220
describe('E2E: multi-tool flow', () => {
  test('LLM requests multiple tools → all execute → comprehensive answer', async () => {
    // Scripted LLM replies, consumed in order:
    //   1. request two tools in a single message
    //   2. decide there is enough data
    //   3. final answer
    const scriptedReplies = [
      {
        response: createMockAIMessage('Gathering data from multiple sources', [
          { name: 'search', args: { query: 'BTC price' } },
          { name: 'api', args: { endpoint: 'market-data' } },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: 'All data gathered.',
        usage: { inputTokens: 200, outputTokens: 100, totalTokens: 300 },
      },
      {
        response: 'BTC: $65k, market looks strong.',
        usage: { inputTokens: 150, outputTokens: 80, totalTokens: 230 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('Full market overview');

    // Both requested tools must have finished
    const finishedTools = findAllEvents<ToolEndEvent>(events, 'tool_end');
    expect(finishedTools.length).toBe(2);

    const finishedNames = finishedTools.map((event) => event.tool);
    expect(finishedNames).toContain('search');
    expect(finishedNames).toContain('api');

    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.toolCalls.length).toBe(2);
  });
});
254
+
255
+ // ---------------------------------------------------------------------------
256
+ // Test: Direct answer (no tools needed)
257
+ // ---------------------------------------------------------------------------
258
+
259
describe('E2E: direct answer', () => {
  test('greeting → direct response, no tool calls', async () => {
    // A single plain-text reply: no tool calls requested.
    const greetingReply = {
      response: 'Hello! I can help with crypto research.',
      usage: { inputTokens: 50, outputTokens: 30, totalTokens: 80 },
    };
    mockCallLlm.mockResolvedValueOnce(greetingReply);

    const events = await collectEvents('Hello');

    const types = events.map((event) => event.type);
    expect(types).not.toContain('tool_start');
    expect(types).toContain('done');

    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.answer).toBe('Hello! I can help with crypto research.');
    expect(done.toolCalls.length).toBe(0);
    expect(done.iterations).toBe(1);
  });
});
278
+
279
+ // ---------------------------------------------------------------------------
280
+ // Test: Max iterations with final answer
281
+ // ---------------------------------------------------------------------------
282
+
283
describe('E2E: max iterations', () => {
  test('always calls tools → stops at limit → still produces answer', async () => {
    // The first three LLM calls each request a search; every later call
    // returns plain text.
    let llmCalls = 0;
    mockCallLlm.mockImplementation(async () => {
      llmCalls += 1;
      const usage = { inputTokens: 50, outputTokens: 25, totalTokens: 75 };
      const response =
        llmCalls <= 3
          ? createMockAIMessage('', [
              { name: 'search', args: { query: `attempt-${llmCalls}` } },
            ])
          : 'Final answer after iterations.';
      return { response, usage };
    });

    const events = await collectEvents('looping query', { maxIterations: 3 });

    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.iterations).toBe(3);
    expect(done.answer).toBeDefined();
    // Should have tool calls from all 3 iterations
    expect(done.toolCalls.length).toBe(3);
  });
});
311
+
312
+ // ---------------------------------------------------------------------------
313
+ // Test: Context overflow → truncation → answer still works
314
+ // ---------------------------------------------------------------------------
315
+
316
describe('E2E: context overflow', () => {
  test('large tool results → context_cleared event → answer produced', async () => {
    // Only expose the oversized tool and the regular search tool.
    mockGetTools.mockReturnValue([testTools.bigdata, testTools.search]);

    // Scripted LLM replies, consumed in order:
    //   1. call the bigdata tool (returns 200K chars)
    //   2. after the context check, decide there is enough data
    //   3. final answer
    const scriptedReplies = [
      {
        response: createMockAIMessage('', [
          { name: 'bigdata', args: {} },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: 'Data analyzed.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      {
        response: 'Based on the large dataset analysis...',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('analyze big data');

    const done = findEvent<DoneEvent>(events, 'done')!;
    expect(done.answer).toBeDefined();
    // The bigdata result (200K chars) gets truncated to 50K by scratchpad
    // Whether context_cleared fires depends on estimated token count vs threshold
    // The key assertion is that the flow completes successfully
    expect(done.toolCalls.length).toBeGreaterThanOrEqual(1);
  });
});
348
+
349
+ // ---------------------------------------------------------------------------
350
+ // Test: Tool limit warnings
351
+ // ---------------------------------------------------------------------------
352
+
353
describe('E2E: tool limit warnings', () => {
  test('repeated tool calls generate tool_limit events', async () => {
    // The first four LLM calls each request another search; every later call
    // returns plain text.
    let llmCalls = 0;
    mockCallLlm.mockImplementation(async () => {
      llmCalls += 1;
      const usage = { inputTokens: 50, outputTokens: 25, totalTokens: 75 };
      const response =
        llmCalls <= 4
          ? createMockAIMessage('', [
              { name: 'search', args: { query: `search-${llmCalls}` } },
            ])
          : 'Done with repeated searches.';
      return { response, usage };
    });

    const events = await collectEvents('repeated searches', { maxIterations: 5 });

    const limitWarnings = findAllEvents(events, 'tool_limit');
    // After 3 calls (default limit), warnings should appear
    expect(limitWarnings.length).toBeGreaterThanOrEqual(1);
  });
});