apteva 0.4.57 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +216 -54
- package/cli.js +35 -0
- package/install.js +92 -0
- package/package.json +15 -76
- package/LICENSE +0 -63
- package/bin/apteva.js +0 -196
- package/dist/ActivityPage.kxzzb4yc.js +0 -3
- package/dist/ApiDocsPage.zq998hbm.js +0 -4
- package/dist/App.55rea8mn.js +0 -61
- package/dist/App.5ywb23z4.js +0 -53
- package/dist/App.6thds120.js +0 -4
- package/dist/App.9tctxzqm.js +0 -8
- package/dist/App.a8r8ttaz.js +0 -4
- package/dist/App.agsv5bje.js +0 -4
- package/dist/App.cepapqmx.js +0 -4
- package/dist/App.dp041gb3.js +0 -221
- package/dist/App.fds72zb5.js +0 -4
- package/dist/App.fg9qj2dq.js +0 -4
- package/dist/App.ndfejbm9.js +0 -4
- package/dist/App.nxmfmq1h.js +0 -13
- package/dist/App.qdfyt8ba.js +0 -4
- package/dist/App.x2d0ygt6.js +0 -4
- package/dist/App.yt9p4nr3.js +0 -20
- package/dist/App.zn4mw16t.js +0 -1
- package/dist/ConnectionsPage.8r96ryw7.js +0 -3
- package/dist/McpPage.3cwh0gnd.js +0 -3
- package/dist/SettingsPage.ykgdh5ev.js +0 -3
- package/dist/SkillsPage.4np1s65b.js +0 -3
- package/dist/TasksPage.4g08t7p6.js +0 -3
- package/dist/TelemetryPage.72w9pwcp.js +0 -3
- package/dist/TestsPage.z4fk3r7r.js +0 -3
- package/dist/ThreadsPage.63tcajeh.js +0 -3
- package/dist/apteva-kit.css +0 -1
- package/dist/icon.png +0 -0
- package/dist/index.html +0 -16
- package/dist/styles.css +0 -1
- package/scripts/postinstall.mjs +0 -102
- package/src/auth/index.ts +0 -394
- package/src/auth/middleware.ts +0 -213
- package/src/binary.ts +0 -536
- package/src/channels/index.ts +0 -40
- package/src/channels/telegram.ts +0 -311
- package/src/crypto.ts +0 -301
- package/src/db-tests.ts +0 -174
- package/src/db.ts +0 -3133
- package/src/integrations/agentdojo.ts +0 -559
- package/src/integrations/composio.ts +0 -437
- package/src/integrations/index.ts +0 -87
- package/src/integrations/skillsmp.ts +0 -318
- package/src/mcp-client.ts +0 -605
- package/src/mcp-handler.ts +0 -394
- package/src/mcp-platform.ts +0 -2403
- package/src/openapi.ts +0 -2410
- package/src/providers.ts +0 -597
- package/src/routes/api/agent-utils.ts +0 -890
- package/src/routes/api/agents.ts +0 -916
- package/src/routes/api/api-keys.ts +0 -95
- package/src/routes/api/channels.ts +0 -182
- package/src/routes/api/helpers.ts +0 -12
- package/src/routes/api/integrations.ts +0 -639
- package/src/routes/api/mcp.ts +0 -574
- package/src/routes/api/meta-agent.ts +0 -195
- package/src/routes/api/projects.ts +0 -112
- package/src/routes/api/providers.ts +0 -424
- package/src/routes/api/skills.ts +0 -537
- package/src/routes/api/system.ts +0 -333
- package/src/routes/api/telemetry.ts +0 -203
- package/src/routes/api/tests.ts +0 -148
- package/src/routes/api/triggers.ts +0 -518
- package/src/routes/api/users.ts +0 -148
- package/src/routes/api/webhooks.ts +0 -171
- package/src/routes/api.ts +0 -53
- package/src/routes/auth.ts +0 -251
- package/src/routes/share.ts +0 -86
- package/src/routes/static.ts +0 -131
- package/src/server.ts +0 -642
- package/src/test-runner.ts +0 -598
- package/src/triggers/agentdojo.ts +0 -253
- package/src/triggers/composio.ts +0 -264
- package/src/triggers/index.ts +0 -71
- package/src/tui/AgentList.tsx +0 -145
- package/src/tui/App.tsx +0 -102
- package/src/tui/Login.tsx +0 -104
- package/src/tui/api.ts +0 -72
- package/src/tui/index.tsx +0 -7
- package/src/web/App.tsx +0 -455
- package/src/web/components/activity/ActivityPage.tsx +0 -314
- package/src/web/components/activity/index.ts +0 -1
- package/src/web/components/agents/AgentCard.tsx +0 -189
- package/src/web/components/agents/AgentPanel.tsx +0 -2244
- package/src/web/components/agents/AgentsView.tsx +0 -180
- package/src/web/components/agents/CreateAgentModal.tsx +0 -475
- package/src/web/components/agents/index.ts +0 -4
- package/src/web/components/api/ApiDocsPage.tsx +0 -842
- package/src/web/components/auth/CreateAccountStep.tsx +0 -176
- package/src/web/components/auth/LoginPage.tsx +0 -91
- package/src/web/components/auth/index.ts +0 -2
- package/src/web/components/common/Icons.tsx +0 -250
- package/src/web/components/common/LoadingSpinner.tsx +0 -44
- package/src/web/components/common/Modal.tsx +0 -199
- package/src/web/components/common/Select.tsx +0 -97
- package/src/web/components/common/index.ts +0 -20
- package/src/web/components/connections/ConnectionsPage.tsx +0 -54
- package/src/web/components/connections/IntegrationsTab.tsx +0 -170
- package/src/web/components/connections/OverviewTab.tsx +0 -137
- package/src/web/components/connections/TriggersTab.tsx +0 -1346
- package/src/web/components/dashboard/Dashboard.tsx +0 -572
- package/src/web/components/dashboard/index.ts +0 -1
- package/src/web/components/index.ts +0 -21
- package/src/web/components/layout/ErrorBanner.tsx +0 -18
- package/src/web/components/layout/Header.tsx +0 -332
- package/src/web/components/layout/Sidebar.tsx +0 -231
- package/src/web/components/layout/index.ts +0 -3
- package/src/web/components/mcp/IntegrationsPanel.tsx +0 -857
- package/src/web/components/mcp/McpPage.tsx +0 -2515
- package/src/web/components/mcp/index.ts +0 -1
- package/src/web/components/meta-agent/MetaAgent.tsx +0 -245
- package/src/web/components/onboarding/OnboardingWizard.tsx +0 -404
- package/src/web/components/onboarding/index.ts +0 -1
- package/src/web/components/settings/SettingsPage.tsx +0 -2776
- package/src/web/components/settings/index.ts +0 -1
- package/src/web/components/skills/SkillsPage.tsx +0 -1200
- package/src/web/components/tasks/TasksPage.tsx +0 -1116
- package/src/web/components/tasks/index.ts +0 -1
- package/src/web/components/telemetry/TelemetryPage.tsx +0 -1129
- package/src/web/components/tests/TestsPage.tsx +0 -594
- package/src/web/components/threads/ThreadsPage.tsx +0 -315
- package/src/web/context/AuthContext.tsx +0 -242
- package/src/web/context/ProjectContext.tsx +0 -214
- package/src/web/context/TelemetryContext.tsx +0 -299
- package/src/web/context/ThemeContext.tsx +0 -90
- package/src/web/context/UIModeContext.tsx +0 -49
- package/src/web/context/index.ts +0 -12
- package/src/web/hooks/index.ts +0 -3
- package/src/web/hooks/useAgents.ts +0 -115
- package/src/web/hooks/useOnboarding.ts +0 -20
- package/src/web/hooks/useProviders.ts +0 -75
- package/src/web/icon.png +0 -0
- package/src/web/index.html +0 -16
- package/src/web/styles.css +0 -118
- package/src/web/themes.ts +0 -162
- package/src/web/types.ts +0 -298
package/src/test-runner.ts
DELETED
|
@@ -1,598 +0,0 @@
|
|
|
1
|
-
import { AgentDB, generateId } from "./db";
|
|
2
|
-
import { TestCaseDB, TestRunDB, type TestCase, type TestRun } from "./db-tests";
|
|
3
|
-
import { agentFetch, startAgentProcess } from "./routes/api/agent-utils";
|
|
4
|
-
import { ProviderKeys, PROVIDERS } from "./providers";
|
|
5
|
-
import { telemetryBroadcaster, type TelemetryEvent } from "./server";
|
|
6
|
-
|
|
7
|
-
// Prefix attached to every console line this module writes.
const TAG = "[test-runner]";

// Verdict produced by the LLM judge for one conversation.
interface JudgeResult {
  // True when the judge decided the success criteria were met.
  pass: boolean;
  // Numeric grade assigned by the judge.
  score: number;
  // Free-text explanation that accompanies the verdict.
  reasoning: string;
}

// Outcome of the planning step for a behavior-driven test.
interface PlanResult {
  // Id of the agent the test will be sent to.
  agent_id: string;
  // Display name of that agent.
  agent_name: string;
  // User message that will be posted to the agent.
  message: string;
  // Planner's explanation for the chosen agent and message.
  reasoning: string;
}

// Safety cap for stream consumption: five minutes, in milliseconds.
const STREAM_SAFETY_TIMEOUT_MS = 5 * 60_000;
|
|
24
|
-
|
|
25
|
-
// Publish one test-lifecycle telemetry event (SSE) through telemetryBroadcaster.
//
// runId      - test run id; stored as the event's trace_id.
// testCaseId - always included in the payload as `test_case_id`.
// type       - event type string, e.g. "test_started".
// data       - extra payload fields; spread after `test_case_id`.
// agentId    - agent the event is attributed to; "system" when omitted/empty.
function broadcastTestEvent(
  runId: string,
  testCaseId: string,
  type: string,
  data: Record<string, unknown> = {},
  agentId?: string,
) {
  const eventId = generateId();
  const emittedAt = new Date().toISOString();
  const payload = { test_case_id: testCaseId, ...data };

  const event: TelemetryEvent = {
    id: eventId,
    agent_id: agentId || "system",
    timestamp: emittedAt,
    category: "test",
    type,
    level: "info",
    trace_id: runId,
    data: payload,
  };

  // The broadcaster takes a batch, so wrap the single event in an array.
  telemetryBroadcaster.broadcast([event]);
}
|
|
45
|
-
|
|
46
|
-
// Plan a behavior-driven test: an LLM picks the agent and writes the user message.
//
// behavior  - natural-language description of the behavior under test.
// projectId - restricts candidate agents to one project; null means all agents.
//
// Returns the chosen agent (id + name), the generated message and the
// planner's reasoning. If the planner names an agent that is not among the
// candidates, the first candidate is used and the reasoning says so.
//
// Throws when no agents exist, when the planner reply contains no JSON object
// or invalid JSON, or when `agent_id` / `message` are missing (message must be
// a non-empty string).
async function planTest(behavior: string, projectId: string | null): Promise<PlanResult> {
  console.log(`${TAG} Planning test for behavior: "${behavior.slice(0, 80)}..." (project: ${projectId || "all"})`);

  // Get available agents
  const agents = projectId
    ? AgentDB.findByProject(projectId)
    : AgentDB.findAll();

  console.log(`${TAG} Found ${agents.length} agent(s) for planning`);

  if (agents.length === 0) {
    throw new Error("No agents available to test");
  }

  const agentDescriptions = agents.map(a => {
    const features: string[] = [];
    if (a.features.memory) features.push("memory");
    if (a.features.tasks) features.push("tasks");
    if (a.features.mcp) features.push("MCP tools");
    if (a.features.operator) features.push("browser");
    if (a.features.vision) features.push("vision");
    if (a.features.realtime) features.push("realtime voice");
    const featureStr = features.length > 0 ? ` | Features: ${features.join(", ")}` : "";
    // Tolerate an agent without a system prompt instead of crashing on `.length`.
    const systemPrompt: string = a.system_prompt ?? "";
    // Only the first 200 characters are shown to the planner to keep the prompt small.
    const promptSnippet = systemPrompt.length > 200
      ? systemPrompt.slice(0, 200) + "..."
      : systemPrompt;
    return `- ID: ${a.id} | Name: ${a.name} | Status: ${a.status}${featureStr}\n System prompt: ${promptSnippet}`;
  }).join("\n");

  const planPrompt = `You are a test planner for an AI agent platform. Given a behavior description, you must:
1. Pick the most appropriate agent to test this behavior
2. Generate a realistic user message that would trigger the described behavior

## Available Agents
${agentDescriptions}

## Behavior to Test
${behavior}

Pick the agent whose capabilities best match this behavior. Prefer running agents when possible.
Generate a natural user message that would test this specific behavior.

Respond with ONLY a JSON object (no markdown, no extra text):
{"agent_id": "the-agent-id", "message": "the message to send", "reasoning": "brief explanation of why this agent and message"}`;

  console.log(`${TAG} Calling LLM planner...`);
  const result = await callLLM(planPrompt);
  console.log(`${TAG} Planner raw response: ${result.slice(0, 300)}`);

  // The model may wrap the JSON in prose or code fences; grab the outermost {...}.
  const jsonMatch = result.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error(`Planner returned unparseable response: ${result.slice(0, 200)}`);
  }

  let parsed: { agent_id?: unknown; message?: unknown; reasoning?: unknown };
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch (err: unknown) {
    // Surface the raw reply so a malformed answer is diagnosable from the run record.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Planner returned invalid JSON (${detail}): ${result.slice(0, 200)}`);
  }

  const plannedMessage = typeof parsed.message === "string" ? parsed.message : "";
  const plannedReasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : "";
  console.log(`${TAG} Planner chose agent_id="${parsed.agent_id}", message="${plannedMessage.slice(0, 80)}"`);

  // A non-string message would crash later when it is sliced or sent, so reject it here.
  if (!parsed.agent_id || !plannedMessage) {
    throw new Error("Planner response missing agent_id or message");
  }
  const requestedId = String(parsed.agent_id);

  // Validate the chosen agent exists
  const chosenAgent = agents.find(a => a.id === requestedId);
  if (!chosenAgent) {
    const fallback = agents[0];
    console.log(`${TAG} Planner picked invalid agent "${requestedId}", falling back to "${fallback.name}" (${fallback.id})`);
    const note = `(Note: planner picked invalid agent ${requestedId}, falling back to ${fallback.name})`;
    return {
      agent_id: fallback.id,
      agent_name: fallback.name,
      message: plannedMessage,
      // Skip an absent reasoning rather than storing the text "undefined".
      reasoning: plannedReasoning ? `${plannedReasoning} ${note}` : note,
    };
  }

  console.log(`${TAG} Plan complete: agent="${chosenAgent.name}" (${chosenAgent.id})`);
  return {
    agent_id: chosenAgent.id,
    agent_name: chosenAgent.name,
    message: plannedMessage,
    reasoning: plannedReasoning,
  };
}
|
|
129
|
-
|
|
130
|
-
// Extract the thread id and the streamed text from a raw /chat response body.
// Each non-blank line is treated as one JSON event, optionally prefixed with
// the SSE field name "data:" (with or without the single following space).
// Lines that are not valid JSON are ignored.
function parseChatStream(streamText: string): { threadId: string | null; contentChunks: string[]; parsedCount: number } {
  let threadId: string | null = null;
  let parsedCount = 0;
  const contentChunks: string[] = [];

  for (const line of streamText.split("\n")) {
    if (!line.trim()) continue;
    const payload = line.replace(/^data: ?/, "");
    let evt: any;
    try {
      evt = JSON.parse(payload);
    } catch {
      // Not valid JSON (e.g. an "event:" line or a keep-alive comment)
      continue;
    }
    parsedCount++;
    // Extract thread_id (agent sends type: "thread_id")
    if (evt?.type === "thread_id" && evt.thread_id) {
      threadId = evt.thread_id;
      console.log(`${TAG} Found thread_id in SSE: ${threadId}`);
    }
    // Accumulate content chunks
    if (evt?.type === "content" && evt.content) {
      contentChunks.push(evt.content);
    }
  }

  return { threadId, contentChunks, parsedCount };
}

// Run a single test case end to end.
//
// Steps: create the run record, optionally plan (pick agent / generate message)
// when the test case is behavior-driven, make sure the agent is running, post
// the message to the agent's /chat endpoint, read the streamed reply, fetch the
// thread's messages when a thread id was announced, and let the LLM judge grade
// the conversation.
//
// Always resolves with the completed TestRun. Every failure is recorded as a
// run with status "error", announced with a "test_completed" telemetry event,
// and keeps whatever planner metadata had been gathered up to that point.
export async function runTest(testCase: TestCase): Promise<TestRun> {
  console.log(`${TAG} ========== Running test "${testCase.name}" (${testCase.id}) ==========`);
  console.log(`${TAG} Test details: behavior=${testCase.behavior ? `"${testCase.behavior.slice(0, 60)}..."` : "null"}, agent_id=${testCase.agent_id || "null"}, input_message=${testCase.input_message ? `"${testCase.input_message.slice(0, 60)}"` : "null"}, project_id=${testCase.project_id || "null"}`);

  const run = TestRunDB.create(testCase.id);
  console.log(`${TAG} Created test run: ${run.id}`);
  const startTime = Date.now();

  broadcastTestEvent(run.id, testCase.id, "test_started", {
    test_name: testCase.name,
    behavior: testCase.behavior || undefined,
  });

  // Declared outside the try block so the catch path can still persist them.
  let agentId = testCase.agent_id;
  let inputMessage = testCase.input_message;
  let plannerReasoning: string | undefined;
  let selectedAgentId: string | undefined;
  let selectedAgentName: string | undefined;
  let generatedMessage: string | undefined;

  // Planner metadata stored on the run record; fields stay undefined when the
  // agent or message was given explicitly by the test case.
  const plannerFields = () => ({
    selected_agent_id: selectedAgentId,
    selected_agent_name: selectedAgentName,
    generated_message: generatedMessage,
    planner_reasoning: plannerReasoning,
  });

  // Single exit for every failure: announce completion to telemetry listeners
  // (so a started test never appears to hang) and persist the error.
  const finishWithError = (error: string): TestRun => {
    const durationMs = Date.now() - startTime;
    broadcastTestEvent(run.id, testCase.id, "test_completed", {
      test_name: testCase.name,
      status: "error",
      duration_ms: durationMs,
      error: error.slice(0, 200),
    }, agentId || undefined);
    return TestRunDB.complete(run.id, {
      status: "error",
      error,
      duration_ms: durationMs,
      ...plannerFields(),
    })!;
  };

  try {
    // Behavior-driven test: use AI planner to pick agent and generate message
    if (testCase.behavior && (!agentId || !inputMessage)) {
      console.log(`${TAG} Behavior-driven test — running planner (agentId=${agentId || "auto"}, inputMessage=${inputMessage ? "set" : "auto"})`);
      broadcastTestEvent(run.id, testCase.id, "test_planning", { test_name: testCase.name });
      const plan = await planTest(testCase.behavior, testCase.project_id);

      // Explicit values on the test case win; the plan only fills the gaps.
      if (!agentId) {
        agentId = plan.agent_id;
        selectedAgentId = plan.agent_id;
        selectedAgentName = plan.agent_name;
        console.log(`${TAG} Planner selected agent: ${plan.agent_name} (${plan.agent_id})`);
      }
      if (!inputMessage) {
        inputMessage = plan.message;
        generatedMessage = plan.message;
        console.log(`${TAG} Planner generated message: "${plan.message.slice(0, 100)}"`);
      }
      plannerReasoning = plan.reasoning;
    }

    if (!agentId || !inputMessage) {
      console.log(`${TAG} ERROR: Missing agentId (${agentId}) or inputMessage (${inputMessage})`);
      return finishWithError("Test requires either behavior description or explicit agent_id + input_message");
    }
    // Freeze the validated values so the rest of the function needs no assertions.
    const targetAgentId = agentId;
    const message = inputMessage;

    console.log(`${TAG} Looking up agent: ${targetAgentId}`);
    const agent = AgentDB.findById(targetAgentId);
    if (!agent) {
      console.log(`${TAG} ERROR: Agent not found: ${targetAgentId}`);
      return finishWithError(`Agent not found: ${targetAgentId}`);
    }

    console.log(`${TAG} Agent "${agent.name}": status=${agent.status}, port=${agent.port}`);

    // Start agent if not running
    if (agent.status !== "running" || !agent.port) {
      console.log(`${TAG} Agent not running, starting...`);
      const startResult = await startAgentProcess(agent, { silent: true });
      console.log(`${TAG} Start result: success=${startResult.success}, port=${startResult.port}, error=${startResult.error || "none"}`);
      if (!startResult.success) {
        return finishWithError(`Failed to start agent: ${startResult.error}`);
      }
    }

    // Re-fetch agent to get updated port
    const runningAgent = AgentDB.findById(targetAgentId);
    if (!runningAgent || runningAgent.status !== "running" || !runningAgent.port) {
      console.log(`${TAG} ERROR: Agent still not running after start. status=${runningAgent?.status}, port=${runningAgent?.port}`);
      return finishWithError("Agent failed to start");
    }
    const port = runningAgent.port;

    console.log(`${TAG} Agent running on port ${port}`);

    // 1. Send message via /chat endpoint (thread created automatically by agent)
    console.log(`${TAG} Step 1: Sending message to /chat: "${message.slice(0, 100)}"`);
    broadcastTestEvent(run.id, testCase.id, "test_executing", {
      test_name: testCase.name,
      agent_name: selectedAgentName || runningAgent.name,
      message: message.slice(0, 100),
    }, targetAgentId);
    const chatBody = { message };

    const chatRes = await agentFetch(runningAgent.id, port, "/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json", "X-Test-Mode": "true" },
      body: JSON.stringify(chatBody),
    });

    if (!chatRes.ok) {
      const errBody = await chatRes.text();
      console.log(`${TAG} ERROR: Chat failed (${chatRes.status}): ${errBody}`);
      return finishWithError(`Chat failed: ${errBody}`);
    }

    console.log(`${TAG} Chat response started (status ${chatRes.status}, content-type: ${chatRes.headers.get("content-type")})`);

    // 2. Consume the streaming response (5-min safety cap)
    console.log(`${TAG} Step 2: Consuming stream...`);
    const streamText = await consumeStream(chatRes);
    console.log(`${TAG} Stream consumed: ${streamText.length} chars (${(Date.now() - startTime) / 1000}s elapsed)`);

    // 3. Parse SSE events from stream — extract thread_id and assemble response text
    const { threadId, contentChunks, parsedCount } = parseChatStream(streamText);
    console.log(`${TAG} Parsed ${parsedCount} SSE event(s), ${contentChunks.length} content chunks, threadId=${threadId || "none"}`);

    // Assemble the agent's full text response from content chunks
    const assembledResponse = contentChunks.join("");

    // If we got a threadId, try fetching full thread messages for a structured view.
    // This is best-effort: any failure falls through to the SSE-based fallback below.
    let messages: any[] = [];
    if (threadId) {
      console.log(`${TAG} Step 3: Fetching thread ${threadId} messages...`);
      try {
        const messagesRes = await agentFetch(runningAgent.id, port, `/threads/${threadId}/messages`, {
          method: "GET",
          headers: { "Accept": "application/json" },
        });

        if (messagesRes.ok) {
          const data = await messagesRes.json();
          // Accept either a bare array or an object wrapping it in `messages`.
          messages = Array.isArray(data) ? data : (Array.isArray(data?.messages) ? data.messages : []);
          console.log(`${TAG} Got ${messages.length} message(s) from thread`);
        } else {
          console.log(`${TAG} WARNING: Failed to fetch messages (${messagesRes.status}): ${await messagesRes.text()}`);
        }
      } catch (fetchErr: unknown) {
        const detail = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
        console.log(`${TAG} WARNING: Failed to fetch messages: ${detail}`);
      }
    }

    // Fallback: build conversation from assembled SSE content
    if (messages.length === 0 && assembledResponse.length > 0) {
      console.log(`${TAG} Building conversation from SSE content (${assembledResponse.length} chars)`);
      messages = [
        { role: "user", content: message },
        { role: "assistant", content: assembledResponse },
      ];
    }

    const agentResponse = JSON.stringify(messages, null, 2);

    // 4. Run LLM judge — use behavior as criteria when available
    const evalCriteria = testCase.behavior || testCase.eval_criteria;
    console.log(`${TAG} Step 4: Running LLM judge with criteria: "${evalCriteria.slice(0, 80)}..."`);
    broadcastTestEvent(run.id, testCase.id, "test_judging", { test_name: testCase.name }, targetAgentId);
    const judgeResult = await judge(messages, evalCriteria);
    console.log(`${TAG} Judge result: pass=${judgeResult.pass}, score=${judgeResult.score}, reasoning="${judgeResult.reasoning.slice(0, 100)}"`);

    const totalMs = Date.now() - startTime;
    const finalStatus = judgeResult.pass ? "passed" : "failed";
    console.log(`${TAG} ========== Test "${testCase.name}" ${judgeResult.pass ? "PASSED" : "FAILED"} (score: ${judgeResult.score}/10, ${(totalMs / 1000).toFixed(1)}s) ==========`);

    broadcastTestEvent(run.id, testCase.id, "test_completed", {
      test_name: testCase.name,
      status: finalStatus,
      score: judgeResult.score,
      duration_ms: totalMs,
      reasoning: judgeResult.reasoning.slice(0, 200),
    }, targetAgentId);

    return TestRunDB.complete(run.id, {
      status: finalStatus,
      score: judgeResult.score,
      agent_response: agentResponse,
      judge_reasoning: judgeResult.reasoning,
      duration_ms: totalMs,
      ...plannerFields(),
    })!;
  } catch (err: unknown) {
    const errorText = (err instanceof Error && err.message) || String(err);
    const stack = (err instanceof Error && err.stack) || "no stack";
    console.log(`${TAG} ========== Test "${testCase.name}" ERROR (${((Date.now() - startTime) / 1000).toFixed(1)}s): ${errorText} ==========`);
    console.log(`${TAG} Stack: ${stack}`);
    return finishWithError(errorText);
  }
}
|
|
375
|
-
|
|
376
|
-
// Execute several test cases one after another and collect their runs.
//
// testCaseIds - ids to run; ids with no matching test case are dropped.
//               When omitted, every stored test case is run.
// Returns the completed runs in execution order.
export async function runAll(testCaseIds?: string[]): Promise<TestRun[]> {
  let testCases: TestCase[];
  if (testCaseIds) {
    testCases = testCaseIds.map(id => TestCaseDB.findById(id)).filter(Boolean) as TestCase[];
  } else {
    testCases = TestCaseDB.findAll();
  }

  console.log(`${TAG} Running ${testCases.length} test(s)`);

  // Deliberately sequential: each test is awaited before the next one starts.
  const results: TestRun[] = [];
  for (const testCase of testCases) {
    const outcome = await runTest(testCase);
    results.push(outcome);
  }

  const countWithStatus = (status: string): number =>
    results.filter(r => r.status === status).length;

  console.log(`${TAG} All ${results.length} test(s) complete: ${countWithStatus("passed")} passed, ${countWithStatus("failed")} failed, ${countWithStatus("error")} errors`);
  return results;
}
|
|
390
|
-
|
|
391
|
-
// Read a streaming response body (SSE or NDJSON) to the end and return it as text.
//
// response - fetch Response whose body is read chunk by chunk.
// Returns the decoded text; "" when the response has no body. If reading fails
// part-way for a reason other than the timeout, the text received so far is
// returned (best effort).
// Throws "Stream safety timeout (5 min)" when the stream has not ended within
// STREAM_SAFETY_TIMEOUT_MS.
async function consumeStream(response: Response): Promise<string> {
  const reader = response.body?.getReader();
  if (!reader) {
    console.log(`${TAG} consumeStream: no body reader available`);
    return "";
  }

  const decoder = new TextDecoder();
  let fullText = "";
  let chunks = 0;
  let timedOut = false;
  let timer: ReturnType<typeof setTimeout> | undefined;

  const timeout = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      timedOut = true;
      reject(new Error("Stream safety timeout (5 min)"));
    }, STREAM_SAFETY_TIMEOUT_MS);
  });

  const consume = async () => {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        // Flush bytes the decoder buffered for an incomplete multi-byte character.
        fullText += decoder.decode();
        console.log(`${TAG} consumeStream: stream ended after ${chunks} chunks, ${fullText.length} chars`);
        break;
      }
      chunks++;
      const chunk = decoder.decode(value, { stream: true });
      fullText += chunk;
      // Log the first few chunks and then every tenth to keep output readable.
      if (chunks <= 3 || chunks % 10 === 0) {
        console.log(`${TAG} consumeStream: chunk #${chunks} (+${chunk.length} chars, total ${fullText.length})`);
      }
    }
  };

  try {
    await Promise.race([consume(), timeout]);
  } catch (err: unknown) {
    const detail = err instanceof Error ? err.message : String(err);
    console.log(`${TAG} consumeStream: error — ${detail}`);
    // cancel() returns a promise; ignore its failure so it cannot surface as an
    // unhandled rejection while we are already handling the original error.
    reader.cancel().catch(() => {});
    if (timedOut) {
      throw err;
    }
  } finally {
    // Without this the timer stays armed for five minutes after every read.
    clearTimeout(timer);
  }

  return fullText;
}
|
|
435
|
-
|
|
436
|
-
// LLM Judge: evaluate a conversation thread against success criteria.
//
// messages - conversation entries; each is expected to carry `role` and
//            `content`, where content is a string or an array of blocks
//            ("text", "tool_use", "tool_result"; anything else is dumped as JSON).
// criteria - the success criteria inserted into the judge prompt.
//
// Returns the verdict. Never throws for LLM or parsing failures: those are
// reported as { pass: false, score: 1 } with the cause in `reasoning`.
// The score is clamped to 1..10; when the model gives no numeric score it
// defaults to 8 for a pass and 3 for a fail.
async function judge(messages: any[], criteria: string): Promise<JudgeResult> {
  // Render arbitrary values as text without collapsing objects to "[object Object]".
  const asText = (value: unknown): string =>
    typeof value === "string" ? value : String(JSON.stringify(value));

  // Format messages for the judge prompt
  const formattedMessages = messages.map((m: any) => {
    const role = m?.role || "unknown";
    let content = "";
    if (typeof m?.content === "string") {
      content = m.content;
    } else if (Array.isArray(m?.content)) {
      content = m.content.map((block: any) => {
        if (block.type === "text") return block.text;
        if (block.type === "tool_use") return `[Tool Call: ${block.name}(${JSON.stringify(block.input)})]`;
        if (block.type === "tool_result") return `[Tool Result: ${asText(block.content)}${block.is_error ? " (ERROR)" : ""}]`;
        return JSON.stringify(block);
      }).join("\n");
    }
    return `${role}: ${content}`;
  }).join("\n\n");

  const judgePrompt = `You are a test evaluator for an AI agent platform. Given a conversation thread between a user and an AI agent, determine if the agent's behavior meets the success criteria.

## Success Criteria
${criteria}

## Conversation Thread
${formattedMessages}

Evaluate whether the agent met the success criteria. Also give a score from 1-10 (10 = perfect).

Respond with ONLY a JSON object (no markdown, no extra text):
{"pass": true, "score": 9, "reasoning": "brief explanation"}
or
{"pass": false, "score": 3, "reasoning": "brief explanation of what failed"}`;

  try {
    console.log(`${TAG} Judge: calling LLM with ${formattedMessages.length} chars of conversation...`);
    const result = await callLLM(judgePrompt);
    console.log(`${TAG} Judge raw response: ${result.slice(0, 200)}`);
    // Parse JSON from response
    const jsonMatch = result.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      const parsed = JSON.parse(jsonMatch[0]);
      // Only a real `true` (or the text "true") counts as a pass; plain
      // truthiness would wrongly accept the string "false".
      const pass = parsed.pass === true
        || (typeof parsed.pass === "string" && parsed.pass.trim().toLowerCase() === "true");
      const score = typeof parsed.score === "number" ? Math.max(1, Math.min(10, parsed.score)) : (pass ? 8 : 3);
      // Callers slice the reasoning, so always hand back a string.
      const reasoning = parsed.reasoning ? asText(parsed.reasoning) : "";
      return { pass, score, reasoning };
    }
    return { pass: false, score: 1, reasoning: `Judge returned unparseable response: ${result.slice(0, 200)}` };
  } catch (err: unknown) {
    const detail = err instanceof Error ? err.message : String(err);
    console.log(`${TAG} Judge error: ${detail}`);
    return { pass: false, score: 1, reasoning: `Judge error: ${detail}` };
  }
}
|
|
487
|
-
|
|
488
|
-
// Chat-completions base URLs for the OpenAI-compatible providers callLLM can reach.
const OPENAI_COMPATIBLE_BASE_URLS: Record<string, string> = {
  openai: "https://api.openai.com/v1",
  groq: "https://api.groq.com/openai/v1",
  xai: "https://api.x.ai/v1",
  together: "https://api.together.xyz/v1",
  fireworks: "https://api.fireworks.ai/inference/v1",
  moonshot: "https://api.moonshot.cn/v1",
};

// True when callLLM has a request format for this provider id.
function isSupportedLLMProvider(providerId: string): boolean {
  return providerId === "anthropic"
    || providerId === "gemini"
    || Object.prototype.hasOwnProperty.call(OPENAI_COMPATIBLE_BASE_URLS, providerId);
}

// Read an LLM HTTP response and return its parsed JSON body.
// The body is read as text first so that a non-JSON error page (for example
// from a proxy) still produces an error carrying the HTTP status.
// Throws on any non-2xx status and on a 2xx body that is not JSON.
async function readLLMResponse(res: Response, label: string): Promise<any> {
  const raw = await res.text();
  let data: any;
  let isJson = true;
  try {
    data = JSON.parse(raw);
  } catch {
    isJson = false;
  }

  if (!res.ok) {
    const dump = isJson ? JSON.stringify(data) : raw;
    console.log(`${TAG} callLLM: ${label} error: ${dump.slice(0, 300)}`);
    throw new Error(`${label} API error ${res.status}: ${(isJson && data?.error?.message) || dump}`);
  }
  if (!isJson) {
    throw new Error(`${label} API returned a non-JSON response: ${raw.slice(0, 200)}`);
  }
  return data;
}

// Get an LLM provider + key, or throw.
//
// Among the configured providers that are of type "llm" and list at least one
// model, the first one callLLM knows how to call is preferred. If none of them
// is supported, the first LLM provider is still returned so that callLLM
// reports it as unsupported.
// Throws when no LLM provider is configured or its API key cannot be read.
function getLLMProvider(): { providerId: string; apiKey: string; provider: any } {
  const configuredProviders = ProviderKeys.getConfiguredProviders();
  console.log(`${TAG} Configured providers: ${configuredProviders.join(", ") || "none"}`);

  const llmCandidates = configuredProviders.filter(id => {
    const p = PROVIDERS[id as keyof typeof PROVIDERS];
    return p && p.type === "llm" && p.models.length > 0;
  });
  const llmProvider = llmCandidates.find(isSupportedLLMProvider) ?? llmCandidates[0];

  if (!llmProvider) {
    throw new Error("No LLM provider configured");
  }

  const provider = PROVIDERS[llmProvider as keyof typeof PROVIDERS];
  const apiKey = ProviderKeys.getDecrypted(llmProvider);
  if (!apiKey) {
    throw new Error("Failed to retrieve API key for LLM");
  }

  console.log(`${TAG} Using LLM provider: ${llmProvider}`);
  return { providerId: llmProvider, apiKey, provider };
}

// Call LLM provider API with a single user prompt and return the reply text.
//
// prompt - sent as the only user message; replies are capped at 512 tokens.
// Returns the first text part of the reply, or the JSON-serialised response
// when the expected text field is absent.
// Throws when no provider/model is available, the provider is unsupported,
// the HTTP status is not 2xx, or the body is not JSON.
async function callLLM(prompt: string): Promise<string> {
  const { providerId, apiKey, provider } = getLLMProvider();

  // Pick a fast model if available
  const model = provider.models.find((m: any) => m.label?.toLowerCase().includes("fast"))?.value
    || provider.models.find((m: any) => m.label?.toLowerCase().includes("mini"))?.value
    || provider.models[0]?.value;

  console.log(`${TAG} callLLM: provider=${providerId}, model=${model}, prompt=${prompt.length} chars`);

  if (!model) {
    throw new Error(`No model available for provider: ${providerId}`);
  }

  if (providerId === "anthropic") {
    const res = await fetch("https://api.anthropic.com/v1/messages", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-api-key": apiKey,
        "anthropic-version": "2023-06-01",
      },
      body: JSON.stringify({
        model,
        max_tokens: 512,
        messages: [{ role: "user", content: prompt }],
      }),
    });
    console.log(`${TAG} callLLM: Anthropic response status=${res.status}`);
    const data = await readLLMResponse(res, "Anthropic");
    return data.content?.[0]?.text || JSON.stringify(data);
  }

  if (providerId === "gemini") {
    // The key travels in a header rather than the query string so it cannot
    // end up in URL logs.
    const res = await fetch(`https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`, {
      method: "POST",
      headers: { "Content-Type": "application/json", "x-goog-api-key": apiKey },
      body: JSON.stringify({
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { maxOutputTokens: 512 },
      }),
    });
    console.log(`${TAG} callLLM: Gemini response status=${res.status}`);
    const data = await readLLMResponse(res, "Gemini");
    return data.candidates?.[0]?.content?.parts?.[0]?.text || JSON.stringify(data);
  }

  // OpenAI-compatible (openai, groq, xai, together, fireworks, moonshot)
  const baseUrl = Object.prototype.hasOwnProperty.call(OPENAI_COMPATIBLE_BASE_URLS, providerId)
    ? OPENAI_COMPATIBLE_BASE_URLS[providerId]
    : undefined;
  if (!baseUrl) {
    throw new Error(`Unsupported provider: ${providerId}`);
  }

  const res = await fetch(`${baseUrl}/chat/completions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Authorization": `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      model,
      max_tokens: 512,
      messages: [{ role: "user", content: prompt }],
    }),
  });
  console.log(`${TAG} callLLM: ${providerId} response status=${res.status}`);
  const data = await readLLMResponse(res, providerId);
  return data.choices?.[0]?.message?.content || JSON.stringify(data);
}
|