brownian-code 2026.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +97 -0
- package/bin/brownian +25 -0
- package/env.example +21 -0
- package/package.json +87 -0
- package/src/agent/agent.test.ts +414 -0
- package/src/agent/agent.ts +385 -0
- package/src/agent/index.ts +27 -0
- package/src/agent/prompts.ts +271 -0
- package/src/agent/scratchpad.test.ts +482 -0
- package/src/agent/scratchpad.ts +526 -0
- package/src/agent/token-counter.test.ts +59 -0
- package/src/agent/token-counter.ts +33 -0
- package/src/agent/types.ts +137 -0
- package/src/cli.tsx +385 -0
- package/src/commands/builtin.test.ts +271 -0
- package/src/commands/builtin.ts +200 -0
- package/src/commands/registry.test.ts +188 -0
- package/src/commands/registry.ts +111 -0
- package/src/commands/types.ts +64 -0
- package/src/components/AgentEventView.tsx +487 -0
- package/src/components/AnswerBox.tsx +81 -0
- package/src/components/ApiKeyPrompt.tsx +75 -0
- package/src/components/CommandMenu.test.tsx +64 -0
- package/src/components/CommandMenu.tsx +38 -0
- package/src/components/CursorText.tsx +43 -0
- package/src/components/DebugPanel.tsx +48 -0
- package/src/components/ErrorBox.test.tsx +58 -0
- package/src/components/ErrorBox.tsx +26 -0
- package/src/components/HelpView.test.tsx +70 -0
- package/src/components/HelpView.tsx +61 -0
- package/src/components/HistoryItemView.tsx +108 -0
- package/src/components/Input.tsx +193 -0
- package/src/components/Intro.test.tsx +59 -0
- package/src/components/Intro.tsx +35 -0
- package/src/components/ModelSelector.tsx +288 -0
- package/src/components/StatusBar.test.tsx +78 -0
- package/src/components/StatusBar.tsx +56 -0
- package/src/components/WorkingIndicator.tsx +133 -0
- package/src/components/index.ts +23 -0
- package/src/e2e/agent-flow.test.ts +378 -0
- package/src/evals/components/EvalApp.tsx +206 -0
- package/src/evals/components/EvalCurrentQuestion.tsx +42 -0
- package/src/evals/components/EvalProgress.tsx +33 -0
- package/src/evals/components/EvalRecentResults.tsx +63 -0
- package/src/evals/components/EvalStats.tsx +49 -0
- package/src/evals/components/index.ts +5 -0
- package/src/evals/dataset/crypto_agent.csv +16 -0
- package/src/evals/run.ts +355 -0
- package/src/gateway/channels/whatsapp/auth-store.ts +15 -0
- package/src/gateway/channels/whatsapp/inbound.ts +86 -0
- package/src/gateway/channels/whatsapp/login.ts +28 -0
- package/src/gateway/channels/whatsapp/outbound.ts +27 -0
- package/src/gateway/channels/whatsapp/session.ts +69 -0
- package/src/gateway/config.ts +81 -0
- package/src/gateway/index.ts +62 -0
- package/src/hooks/useAgentRunner.ts +317 -0
- package/src/hooks/useDebugLogs.ts +22 -0
- package/src/hooks/useInputHistory.ts +106 -0
- package/src/hooks/useModelSelection.ts +249 -0
- package/src/hooks/useTextBuffer.test.ts +121 -0
- package/src/hooks/useTextBuffer.ts +97 -0
- package/src/index.tsx +74 -0
- package/src/mcp/cache.ts +205 -0
- package/src/mcp/client.test.ts +126 -0
- package/src/mcp/client.ts +145 -0
- package/src/mcp/index.ts +2 -0
- package/src/model/llm.test.ts +158 -0
- package/src/model/llm.ts +233 -0
- package/src/providers.ts +94 -0
- package/src/skills/index.ts +17 -0
- package/src/skills/loader.ts +73 -0
- package/src/skills/registry.ts +125 -0
- package/src/skills/types.ts +31 -0
- package/src/test-utils/mocks.ts +110 -0
- package/src/theme.ts +21 -0
- package/src/tools/browser/browser.ts +357 -0
- package/src/tools/browser/index.ts +1 -0
- package/src/tools/crypto/hive-tools.ts +171 -0
- package/src/tools/crypto/index.ts +1 -0
- package/src/tools/descriptions/browser.ts +105 -0
- package/src/tools/descriptions/crypto-search.ts +58 -0
- package/src/tools/descriptions/index.ts +8 -0
- package/src/tools/descriptions/web-fetch.ts +44 -0
- package/src/tools/descriptions/web-search.ts +26 -0
- package/src/tools/fetch/cache.ts +95 -0
- package/src/tools/fetch/external-content.ts +200 -0
- package/src/tools/fetch/index.ts +1 -0
- package/src/tools/fetch/web-fetch-utils.ts +122 -0
- package/src/tools/fetch/web-fetch.ts +371 -0
- package/src/tools/index.ts +12 -0
- package/src/tools/registry.ts +130 -0
- package/src/tools/search/exa.ts +43 -0
- package/src/tools/search/index.ts +2 -0
- package/src/tools/search/tavily.ts +35 -0
- package/src/tools/skill.ts +62 -0
- package/src/tools/types.ts +53 -0
- package/src/utils/ai-message.ts +26 -0
- package/src/utils/config.ts +54 -0
- package/src/utils/cost-calculator.test.ts +101 -0
- package/src/utils/cost-calculator.ts +74 -0
- package/src/utils/env.ts +101 -0
- package/src/utils/error-classifier.test.ts +146 -0
- package/src/utils/error-classifier.ts +91 -0
- package/src/utils/in-memory-chat-history.test.ts +291 -0
- package/src/utils/in-memory-chat-history.ts +224 -0
- package/src/utils/index.ts +19 -0
- package/src/utils/input-key-handlers.test.ts +155 -0
- package/src/utils/input-key-handlers.ts +64 -0
- package/src/utils/logger.ts +67 -0
- package/src/utils/long-term-chat-history.ts +138 -0
- package/src/utils/markdown-table.ts +227 -0
- package/src/utils/ollama.ts +37 -0
- package/src/utils/progress-channel.ts +84 -0
- package/src/utils/text-navigation.test.ts +222 -0
- package/src/utils/text-navigation.ts +81 -0
- package/src/utils/thinking-verbs.ts +29 -0
- package/src/utils/tokens.test.ts +163 -0
- package/src/utils/tokens.ts +67 -0
- package/src/utils/tool-description.ts +88 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import React from 'react';
|
|
2
|
+
import { Box, Text } from 'ink';
|
|
3
|
+
import { colors } from '../theme.js';
|
|
4
|
+
import { formatCost } from '../utils/cost-calculator.js';
|
|
5
|
+
|
|
6
|
+
/** Props accepted by `StatusBar`. */
interface StatusBarProps {
  /** Model label rendered as the first segment of the bar. */
  modelDisplayName: string;
  /** Running token total; rendered with `toLocaleString()` followed by " tokens". */
  cumulativeTokens: number;
  /** Running cost; rendered through `formatCost`. */
  cumulativeCost: number;
  /** Number of turns so far; the bar renders nothing while this is 0. */
  turnCount: number;
  /**
   * Context usage figure. Above 70 the token segment switches to the warning
   * colour; above 90 it switches to the error colour and a `/compact` hint
   * is appended.
   */
  contextPercentage?: number;
}

/**
 * One-line session summary: model · tokens · cost · turns.
 *
 * Returns `null` until at least one turn has happened so an empty session
 * shows no bar at all.
 */
export function StatusBar({
  modelDisplayName,
  cumulativeTokens,
  cumulativeCost,
  turnCount,
  contextPercentage,
}: StatusBarProps) {
  // Nothing worth showing before the first turn.
  if (turnCount === 0) return null;

  // A missing percentage behaves like "no pressure": neither threshold is crossed.
  const contextUsage = contextPercentage ?? 0;
  const isCritical = contextUsage > 90;
  const tokenColor: string = isCritical
    ? colors.error
    : contextUsage > 70
      ? colors.warning
      : colors.muted;
  const contextWarning = isCritical ? ' (use /compact)' : '';

  const tokenLabel = `${cumulativeTokens.toLocaleString()} tokens`;
  const costLabel = formatCost(cumulativeCost);
  const turnLabel = `${turnCount} ${turnCount === 1 ? 'turn' : 'turns'}`;

  return (
    <Box>
      <Text color={colors.mutedDark}>{'┄ '}</Text>
      <Text color={colors.muted}>{modelDisplayName}</Text>
      <Text color={colors.mutedDark}>{' · '}</Text>
      <Text color={tokenColor}>{tokenLabel}{contextWarning}</Text>
      <Text color={colors.mutedDark}>{' · '}</Text>
      <Text color={colors.muted}>{costLabel}</Text>
      <Text color={colors.mutedDark}>{' · '}</Text>
      <Text color={colors.muted}>{turnLabel}</Text>
      <Text color={colors.mutedDark}>{' ┄'}</Text>
    </Box>
  );
}
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import React, { useState, useEffect, useRef, useMemo } from 'react';
|
|
2
|
+
import { Box, Text } from 'ink';
|
|
3
|
+
import Spinner from 'ink-spinner';
|
|
4
|
+
import { colors } from '../theme.js';
|
|
5
|
+
import { getRandomThinkingVerb } from '../utils/thinking-verbs.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Renders text with a shine effect that sweeps left-to-right.
 *
 * Each character is its own `<Text>`. The highlight advances one index every
 * 60 ms; once it has been shown on the last character the effect pauses for
 * 2 seconds (no highlight is drawn while paused) and then restarts from
 * index 0.
 *
 * @param text       String to render, indexed per UTF-16 code unit.
 * @param color      Colour for characters outside the highlight.
 * @param shineColor Colour for characters inside the highlight.
 */
function ShineText({ text, color, shineColor }: { text: string; color: string; shineColor: string }) {
  const [shinePos, setShinePos] = useState(0);
  const [isPaused, setIsPaused] = useState(false);

  useEffect(() => {
    if (isPaused) {
      // Wait 2 seconds before restarting the shine
      const restartTimer = setTimeout(() => {
        setShinePos(0);
        setIsPaused(false);
      }, 2000);
      return () => clearTimeout(restartTimer);
    }

    // One step per effect run. The advance-or-pause decision is made here, in
    // the timer callback, rather than inside a setState updater: updaters must
    // stay pure and may be invoked more than once by React.
    const stepTimer = setTimeout(() => {
      if (shinePos + 1 >= text.length) {
        // Highlight has been shown on the last character; keep the position
        // where it is until the pause completes.
        setIsPaused(true);
      } else {
        setShinePos(shinePos + 1);
      }
    }, 60);

    return () => clearTimeout(stepTimer);
  }, [isPaused, shinePos, text.length]);

  // Memoize the rendered parts so unrelated re-renders reuse the same nodes
  const parts = useMemo(() => {
    const result: React.ReactNode[] = [];
    for (let i = 0; i < text.length; i++) {
      // Highlight characters within 1.25 of the shine position. Indices are
      // integers, so this is the current character plus one neighbour on
      // each side.
      const isShine = !isPaused && Math.abs(i - shinePos) < 1.25;
      result.push(
        <Text key={i} color={isShine ? shineColor : color}>
          {text[i]}
        </Text>
      );
    }
    return result;
  }, [text, shinePos, isPaused, color, shineColor]);

  return <>{parts}</>;
}
|
|
55
|
+
|
|
56
|
+
/**
 * What the agent is currently doing, discriminated by `status`.
 *
 * - `idle`      – nothing in flight.
 * - `thinking`  – no extra payload.
 * - `tool`      – carries the name of the tool in `toolName`.
 * - `answering` – carries `startTime`, a numeric timestamp that is compared
 *                 against `Date.now()` by the indicator.
 */
export type WorkingState =
  | { status: 'idle' }
  | { status: 'thinking' }
  | { status: 'tool'; toolName: string }
  | { status: 'answering'; startTime: number };
|
|
61
|
+
|
|
62
|
+
interface WorkingIndicatorProps {
  /** Current agent activity; `idle` renders nothing. */
  state: WorkingState;
}

/**
 * Persistent status indicator shown above the input box while agent is working.
 *
 * Renders a spinner, a shining status word and an "(esc to interrupt)" hint.
 * While the status is `thinking` or `tool` the word is a random verb followed
 * by "..."; while `answering` it is "Answering". Returns `null` when idle.
 */
export function WorkingIndicator({ state }: WorkingIndicatorProps) {
  // NOTE(review): `elapsed` is kept up to date while answering but is not
  // rendered anywhere below.
  const [elapsed, setElapsed] = useState(0);
  const [thinkingVerb, setThinkingVerb] = useState(getRandomThinkingVerb);
  const prevStatusRef = useRef<WorkingState['status']>('idle');

  // Pick a new random verb when transitioning into thinking/tool state.
  // Moving between `thinking` and `tool` keeps the current verb.
  useEffect(() => {
    const isThinking = state.status === 'thinking' || state.status === 'tool';
    const wasThinking = prevStatusRef.current === 'thinking' || prevStatusRef.current === 'tool';

    if (isThinking && !wasThinking) {
      setThinkingVerb(getRandomThinkingVerb());
    }

    prevStatusRef.current = state.status;
  }, [state.status]);

  // Track elapsed time only when answering. The effect is keyed on the start
  // timestamp (a primitive) rather than on the `state` object, so a parent
  // that passes a fresh-but-equal state object on re-render does not tear
  // down and recreate the interval.
  const answeringSince = state.status === 'answering' ? state.startTime : undefined;
  useEffect(() => {
    if (answeringSince === undefined) {
      setElapsed(0);
      return;
    }

    const updateElapsed = () => {
      setElapsed(Math.floor((Date.now() - answeringSince) / 1000));
    };

    updateElapsed(); // sync immediately, then once per second
    const interval = setInterval(updateElapsed, 1000);

    return () => clearInterval(interval);
  }, [answeringSince]);

  if (state.status === 'idle') {
    return null;
  }

  // `thinking` and `tool` share the random verb; `answering` has a fixed label.
  const statusWord = state.status === 'answering' ? 'Answering' : `${thinkingVerb}...`;

  return (
    <Box>
      <Text color={colors.primary}>
        <Spinner type="dots" />
      </Text>
      <Text color={colors.primary}> </Text>
      <ShineText text={statusWord} color={colors.primary} shineColor={colors.primaryLight} />
      <Text color={colors.muted}> (</Text>
      <Text color={colors.muted} bold>esc</Text>
      <Text color={colors.muted}>{' to interrupt)'}</Text>
    </Box>
  );
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export { Intro } from './Intro.js';
|
|
2
|
+
export { Input } from './Input.js';
|
|
3
|
+
export { CursorText } from './CursorText.js';
|
|
4
|
+
export { AnswerBox } from './AnswerBox.js';
|
|
5
|
+
export { ProviderSelector, ModelSelector, PROVIDERS, getModelsForProvider, getDefaultModelForProvider } from './ModelSelector.js';
|
|
6
|
+
export { ApiKeyConfirm, ApiKeyInput } from './ApiKeyPrompt.js';
|
|
7
|
+
export { DebugPanel } from './DebugPanel.js';
|
|
8
|
+
|
|
9
|
+
// V2 components
|
|
10
|
+
export { EventListView } from './AgentEventView.js';
|
|
11
|
+
export type { DisplayEvent } from './AgentEventView.js';
|
|
12
|
+
|
|
13
|
+
export { WorkingIndicator } from './WorkingIndicator.js';
|
|
14
|
+
export type { WorkingState } from './WorkingIndicator.js';
|
|
15
|
+
|
|
16
|
+
export { HistoryItemView } from './HistoryItemView.js';
|
|
17
|
+
export type { HistoryItem } from './HistoryItemView.js';
|
|
18
|
+
|
|
19
|
+
// UI enhancement components
|
|
20
|
+
export { StatusBar } from './StatusBar.js';
|
|
21
|
+
export { CommandMenu } from './CommandMenu.js';
|
|
22
|
+
export { HelpView, ShortcutsView } from './HelpView.js';
|
|
23
|
+
export { ErrorBox } from './ErrorBox.js';
|
|
@@ -0,0 +1,378 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* End-to-end integration tests for the agent flow.
|
|
3
|
+
* Tests the complete event sequence with mocked LLM + tools.
|
|
4
|
+
*/
|
|
5
|
+
import { describe, test, expect, mock, beforeAll, afterAll, beforeEach } from 'bun:test';
|
|
6
|
+
import { createMockAIMessage, createMockTool, createTempDir } from '../test-utils/mocks.js';
|
|
7
|
+
import type { AgentEvent, DoneEvent, ToolEndEvent, ToolErrorEvent, ContextClearedEvent } from '../agent/types.js';
|
|
8
|
+
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Mock all external dependencies
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
// Stand-in for `callLlm`. Unless a test scripts something else, every call
// resolves to a plain-text 'default' response with fixed token usage.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const mockCallLlm = mock(async (): Promise<any> => {
  return {
    response: 'default',
    usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
  };
});

// Replace the LLM module before the agent is imported so the agent binds to the mock.
mock.module('../model/llm.js', () => {
  return {
    callLlm: mockCallLlm,
    DEFAULT_MODEL: 'claude-sonnet-4-5',
  };
});
|
|
23
|
+
|
|
24
|
+
// Fake tools exposed to the agent under test, keyed by tool name.
const testTools = {
  // Echoes the query alongside a canned price string.
  search: createMockTool('search', (args) =>
    JSON.stringify({ query: args.query, result: 'BTC $65,000, ETH $3,500' }),
  ),
  // Echoes the endpoint alongside a canned price payload.
  api: createMockTool('api', (args) =>
    JSON.stringify({ endpoint: args.endpoint, data: { price: 65000 } }),
  ),
  // Always throws.
  broken: createMockTool('broken', () => {
    throw new Error('Connection timeout');
  }),
  // Resolves after a 10 ms delay.
  slow: createMockTool('slow', async () => {
    await new Promise(r => setTimeout(r, 10));
    return 'slow result';
  }),
  // Returns a huge result to trigger context clearing
  bigdata: createMockTool('bigdata', () => 'x'.repeat(200_000)),
};
|
|
43
|
+
|
|
44
|
+
// Tool registry stand-in: by default it hands back every fake tool.
const mockGetTools = mock(() => Object.values(testTools));

mock.module('../tools/registry.js', () => {
  return {
    getTools: mockGetTools,
    buildToolDescriptions: () => 'Test tool descriptions',
  };
});

// Prompt builders are replaced with short, predictable strings.
mock.module('../agent/prompts.js', () => {
  return {
    buildSystemPrompt: () => 'Test system prompt.',
    buildIterationPrompt: (query: string, results: string) => `Query: ${query}\nResults: ${results}`,
    buildFinalAnswerPrompt: (query: string, context: string) => `Answer: ${query}\nContext: ${context}`,
    DEFAULT_SYSTEM_PROMPT: 'Default test prompt.',
  };
});

// Tool descriptions collapse to the bare tool name.
mock.module('../utils/tool-description.js', () => {
  return {
    getToolDescription: (name: string) => name,
  };
});

// No skills are discovered in these tests.
mock.module('../skills/index.js', () => {
  return {
    discoverSkills: () => [],
    buildSkillMetadataSection: () => '',
  };
});

// Imported dynamically, after every mock.module call above has been registered.
const { Agent } = await import('../agent/agent.js');
|
|
68
|
+
|
|
69
|
+
// ---------------------------------------------------------------------------
|
|
70
|
+
// Setup
|
|
71
|
+
// ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
// Working directory to return to once the suite finishes.
let originalCwd: string;
// Disposer for the temp directory created in beforeAll.
let cleanup: () => void;

// Run the whole suite from inside a throwaway directory.
beforeAll(() => {
  originalCwd = process.cwd();
  const { cleanup: removeTempDir, path: tempPath } = createTempDir();
  cleanup = removeTempDir;
  process.chdir(tempPath);
});

// Step back out of the temp directory before deleting it.
afterAll(() => {
  process.chdir(originalCwd);
  cleanup();
});
|
|
87
|
+
|
|
88
|
+
// Give every test a pristine pair of mocks.
//
// mockClear() only wipes call history; an implementation installed with
// mockImplementation() (or unused mockResolvedValueOnce() entries) would
// survive into the next test. mockReset() drops those as well, after which
// the default behaviour is installed again explicitly.
beforeEach(() => {
  mockCallLlm.mockReset();
  mockCallLlm.mockImplementation(async () => ({
    response: 'default',
    usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
  }));

  mockGetTools.mockReset();
  mockGetTools.mockReturnValue(Object.values(testTools));
});
|
|
93
|
+
|
|
94
|
+
// ---------------------------------------------------------------------------
|
|
95
|
+
// Helpers
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
/**
 * Runs a fresh agent on `query` and gathers everything it emits.
 *
 * @param query  Prompt passed to `agent.run`.
 * @param config Extra options spread over the default `{ model: 'claude-sonnet-4-5' }`.
 * @returns Every event yielded by the run, in emission order.
 */
async function collectEvents(query: string, config = {}): Promise<AgentEvent[]> {
  const collected: AgentEvent[] = [];
  const runner = Agent.create({ model: 'claude-sonnet-4-5', ...config });
  for await (const emitted of runner.run(query)) {
    collected.push(emitted);
  }
  return collected;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Returns the first event whose `type` equals `type`, or `undefined` if none matches.
 *
 * The cast to `T` is unchecked: the caller is responsible for passing a type
 * argument that corresponds to the requested `type` string.
 */
function findEvent<T extends AgentEvent>(events: AgentEvent[], type: string): T | undefined {
  for (const candidate of events) {
    if (candidate.type === type) {
      return candidate as T;
    }
  }
  return undefined;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Returns every event whose `type` equals `type`, preserving order.
 *
 * The cast to `T[]` is unchecked: the caller is responsible for passing a type
 * argument that corresponds to the requested `type` string.
 */
function findAllEvents<T extends AgentEvent>(events: AgentEvent[], type: string): T[] {
  const matches: AgentEvent[] = [];
  for (const candidate of events) {
    if (candidate.type === type) {
      matches.push(candidate);
    }
  }
  return matches as T[];
}
|
|
114
|
+
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
// Test: Complete price lookup flow
|
|
117
|
+
// ---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
describe('E2E: price lookup flow', () => {
  test('query → tool_call → tool_end → final answer', async () => {
    // LLM replies, consumed in order by successive callLlm invocations.
    const scriptedReplies = [
      // Step 1: LLM decides to search
      {
        response: createMockAIMessage('Searching for price data', [
          { name: 'search', args: { query: 'BTC price' } },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // Step 2: LLM sees results, decides to answer
      {
        response: 'I have the data.',
        usage: { inputTokens: 200, outputTokens: 50, totalTokens: 250 },
      },
      // Step 3: Final answer generation
      {
        response: 'BTC is currently at $65,000.',
        usage: { inputTokens: 150, outputTokens: 80, totalTokens: 230 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('What is the BTC price?');
    const types = events.map(e => e.type);

    // Every stage of the flow must have been reported.
    for (const expectedType of ['thinking', 'tool_start', 'tool_end', 'answer_start', 'done']) {
      expect(types).toContain(expectedType);
    }

    // The tool must start before it ends.
    expect(types.indexOf('tool_start')).toBeLessThan(types.indexOf('tool_end'));

    // Answering must begin before completion is signalled.
    expect(types.indexOf('answer_start')).toBeLessThan(types.indexOf('done'));

    // Contents of the done event.
    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.answer).toBe('BTC is currently at $65,000.');
    expect(finished.toolCalls.length).toBe(1);
    expect(finished.toolCalls[0].tool).toBe('search');
    expect(finished.iterations).toBeGreaterThanOrEqual(1);
    expect(finished.tokenUsage).toBeDefined();
    expect(finished.tokenUsage!.totalTokens).toBe(630); // 150+250+230
  });
});
|
|
169
|
+
|
|
170
|
+
// ---------------------------------------------------------------------------
|
|
171
|
+
// Test: Tool error recovery
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
|
|
174
|
+
describe('E2E: tool error recovery', () => {
  test('first tool fails → second tool works → answer produced', async () => {
    // LLM replies, consumed in order by successive callLlm invocations.
    const scriptedReplies = [
      // LLM calls the broken tool
      {
        response: createMockAIMessage('', [
          { name: 'broken', args: {} },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // LLM sees error, tries different tool
      {
        response: createMockAIMessage('Let me try a different approach', [
          { name: 'search', args: { query: 'BTC' } },
        ]),
        usage: { inputTokens: 200, outputTokens: 50, totalTokens: 250 },
      },
      // LLM generates answer
      {
        response: 'Got the data now.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // Final answer
      {
        response: 'Despite initial error, BTC is $65k.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('BTC with error recovery');

    const types = events.map(e => e.type);
    expect(types).toContain('tool_error');
    expect(types).toContain('done');

    // The failure message from the broken tool is surfaced on the error event.
    const failure = findEvent<ToolErrorEvent>(events, 'tool_error')!;
    expect(failure.error).toContain('Connection timeout');

    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.answer).toBeDefined();
    expect(finished.toolCalls.length).toBe(2); // broken + search
  });
});
|
|
215
|
+
|
|
216
|
+
// ---------------------------------------------------------------------------
|
|
217
|
+
// Test: Multi-tool flow
|
|
218
|
+
// ---------------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
describe('E2E: multi-tool flow', () => {
  test('LLM requests multiple tools → all execute → comprehensive answer', async () => {
    // LLM replies, consumed in order by successive callLlm invocations.
    const scriptedReplies = [
      // LLM requests two tools at once
      {
        response: createMockAIMessage('Gathering data from multiple sources', [
          { name: 'search', args: { query: 'BTC price' } },
          { name: 'api', args: { endpoint: 'market-data' } },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // LLM generates answer
      {
        response: 'All data gathered.',
        usage: { inputTokens: 200, outputTokens: 100, totalTokens: 300 },
      },
      // Final answer
      {
        response: 'BTC: $65k, market looks strong.',
        usage: { inputTokens: 150, outputTokens: 80, totalTokens: 230 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('Full market overview');

    // Both requested tools must have completed.
    const completedTools = findAllEvents<ToolEndEvent>(events, 'tool_end');
    expect(completedTools.length).toBe(2);

    const completedNames = completedTools.map(e => e.tool);
    expect(completedNames).toContain('search');
    expect(completedNames).toContain('api');

    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.toolCalls.length).toBe(2);
  });
});
|
|
254
|
+
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
// Test: Direct answer (no tools needed)
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
258
|
+
|
|
259
|
+
describe('E2E: direct answer', () => {
  test('greeting → direct response, no tool calls', async () => {
    // A single plain-text reply: the model answers without asking for tools.
    const greetingReply = {
      response: 'Hello! I can help with crypto research.',
      usage: { inputTokens: 50, outputTokens: 30, totalTokens: 80 },
    };
    mockCallLlm.mockResolvedValueOnce(greetingReply);

    const events = await collectEvents('Hello');

    const types = events.map(e => e.type);
    expect(types).not.toContain('tool_start');
    expect(types).toContain('done');

    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.answer).toBe('Hello! I can help with crypto research.');
    expect(finished.toolCalls.length).toBe(0);
    expect(finished.iterations).toBe(1);
  });
});
|
|
278
|
+
|
|
279
|
+
// ---------------------------------------------------------------------------
|
|
280
|
+
// Test: Max iterations with final answer
|
|
281
|
+
// ---------------------------------------------------------------------------
|
|
282
|
+
|
|
283
|
+
describe('E2E: max iterations', () => {
  test('always calls tools → stops at limit → still produces answer', async () => {
    // The first three LLM calls each request a search; any later call returns plain text.
    let llmCalls = 0;
    mockCallLlm.mockImplementation(async () => {
      llmCalls += 1;
      const usage = { inputTokens: 50, outputTokens: 25, totalTokens: 75 };

      if (llmCalls > 3) {
        return { response: 'Final answer after iterations.', usage };
      }

      return {
        response: createMockAIMessage('', [
          { name: 'search', args: { query: `attempt-${llmCalls}` } },
        ]),
        usage,
      };
    });

    const events = await collectEvents('looping query', { maxIterations: 3 });

    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.iterations).toBe(3);
    expect(finished.answer).toBeDefined();
    // Should have tool calls from all 3 iterations
    expect(finished.toolCalls.length).toBe(3);
  });
});
|
|
311
|
+
|
|
312
|
+
// ---------------------------------------------------------------------------
|
|
313
|
+
// Test: Context overflow → truncation → answer still works
|
|
314
|
+
// ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
describe('E2E: context overflow', () => {
  test('large tool results → context_cleared event → answer produced', async () => {
    // Only the oversized tool and search are offered for this run.
    mockGetTools.mockReturnValue([testTools.bigdata, testTools.search]);

    // LLM replies, consumed in order by successive callLlm invocations.
    const scriptedReplies = [
      // LLM calls bigdata tool (returns 200K chars)
      {
        response: createMockAIMessage('', [
          { name: 'bigdata', args: {} },
        ]),
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // After context check, LLM gives answer
      {
        response: 'Data analyzed.',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
      // Final answer
      {
        response: 'Based on the large dataset analysis...',
        usage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
      },
    ];
    for (const reply of scriptedReplies) {
      mockCallLlm.mockResolvedValueOnce(reply);
    }

    const events = await collectEvents('analyze big data');

    const finished = findEvent<DoneEvent>(events, 'done')!;
    expect(finished.answer).toBeDefined();
    // The bigdata result (200K chars) gets truncated to 50K by scratchpad
    // Whether context_cleared fires depends on estimated token count vs threshold
    // The key assertion is that the flow completes successfully
    expect(finished.toolCalls.length).toBeGreaterThanOrEqual(1);
  });
});
|
|
348
|
+
|
|
349
|
+
// ---------------------------------------------------------------------------
|
|
350
|
+
// Test: Tool limit warnings
|
|
351
|
+
// ---------------------------------------------------------------------------
|
|
352
|
+
|
|
353
|
+
describe('E2E: tool limit warnings', () => {
  test('repeated tool calls generate tool_limit events', async () => {
    // The first four LLM calls each request a search; any later call returns plain text.
    let llmCalls = 0;
    mockCallLlm.mockImplementation(async () => {
      llmCalls += 1;
      const usage = { inputTokens: 50, outputTokens: 25, totalTokens: 75 };

      if (llmCalls > 4) {
        return { response: 'Done with repeated searches.', usage };
      }

      return {
        response: createMockAIMessage('', [
          { name: 'search', args: { query: `search-${llmCalls}` } },
        ]),
        usage,
      };
    });

    const events = await collectEvents('repeated searches', { maxIterations: 5 });

    const limitWarnings = findAllEvents(events, 'tool_limit');
    // After 3 calls (default limit), warnings should appear
    expect(limitWarnings.length).toBeGreaterThanOrEqual(1);
  });
});
|