apteva 0.4.57 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. package/README.md +216 -54
  2. package/cli.js +35 -0
  3. package/install.js +92 -0
  4. package/package.json +15 -76
  5. package/LICENSE +0 -63
  6. package/bin/apteva.js +0 -196
  7. package/dist/ActivityPage.kxzzb4yc.js +0 -3
  8. package/dist/ApiDocsPage.zq998hbm.js +0 -4
  9. package/dist/App.55rea8mn.js +0 -61
  10. package/dist/App.5ywb23z4.js +0 -53
  11. package/dist/App.6thds120.js +0 -4
  12. package/dist/App.9tctxzqm.js +0 -8
  13. package/dist/App.a8r8ttaz.js +0 -4
  14. package/dist/App.agsv5bje.js +0 -4
  15. package/dist/App.cepapqmx.js +0 -4
  16. package/dist/App.dp041gb3.js +0 -221
  17. package/dist/App.fds72zb5.js +0 -4
  18. package/dist/App.fg9qj2dq.js +0 -4
  19. package/dist/App.ndfejbm9.js +0 -4
  20. package/dist/App.nxmfmq1h.js +0 -13
  21. package/dist/App.qdfyt8ba.js +0 -4
  22. package/dist/App.x2d0ygt6.js +0 -4
  23. package/dist/App.yt9p4nr3.js +0 -20
  24. package/dist/App.zn4mw16t.js +0 -1
  25. package/dist/ConnectionsPage.8r96ryw7.js +0 -3
  26. package/dist/McpPage.3cwh0gnd.js +0 -3
  27. package/dist/SettingsPage.ykgdh5ev.js +0 -3
  28. package/dist/SkillsPage.4np1s65b.js +0 -3
  29. package/dist/TasksPage.4g08t7p6.js +0 -3
  30. package/dist/TelemetryPage.72w9pwcp.js +0 -3
  31. package/dist/TestsPage.z4fk3r7r.js +0 -3
  32. package/dist/ThreadsPage.63tcajeh.js +0 -3
  33. package/dist/apteva-kit.css +0 -1
  34. package/dist/icon.png +0 -0
  35. package/dist/index.html +0 -16
  36. package/dist/styles.css +0 -1
  37. package/scripts/postinstall.mjs +0 -102
  38. package/src/auth/index.ts +0 -394
  39. package/src/auth/middleware.ts +0 -213
  40. package/src/binary.ts +0 -536
  41. package/src/channels/index.ts +0 -40
  42. package/src/channels/telegram.ts +0 -311
  43. package/src/crypto.ts +0 -301
  44. package/src/db-tests.ts +0 -174
  45. package/src/db.ts +0 -3133
  46. package/src/integrations/agentdojo.ts +0 -559
  47. package/src/integrations/composio.ts +0 -437
  48. package/src/integrations/index.ts +0 -87
  49. package/src/integrations/skillsmp.ts +0 -318
  50. package/src/mcp-client.ts +0 -605
  51. package/src/mcp-handler.ts +0 -394
  52. package/src/mcp-platform.ts +0 -2403
  53. package/src/openapi.ts +0 -2410
  54. package/src/providers.ts +0 -597
  55. package/src/routes/api/agent-utils.ts +0 -890
  56. package/src/routes/api/agents.ts +0 -916
  57. package/src/routes/api/api-keys.ts +0 -95
  58. package/src/routes/api/channels.ts +0 -182
  59. package/src/routes/api/helpers.ts +0 -12
  60. package/src/routes/api/integrations.ts +0 -639
  61. package/src/routes/api/mcp.ts +0 -574
  62. package/src/routes/api/meta-agent.ts +0 -195
  63. package/src/routes/api/projects.ts +0 -112
  64. package/src/routes/api/providers.ts +0 -424
  65. package/src/routes/api/skills.ts +0 -537
  66. package/src/routes/api/system.ts +0 -333
  67. package/src/routes/api/telemetry.ts +0 -203
  68. package/src/routes/api/tests.ts +0 -148
  69. package/src/routes/api/triggers.ts +0 -518
  70. package/src/routes/api/users.ts +0 -148
  71. package/src/routes/api/webhooks.ts +0 -171
  72. package/src/routes/api.ts +0 -53
  73. package/src/routes/auth.ts +0 -251
  74. package/src/routes/share.ts +0 -86
  75. package/src/routes/static.ts +0 -131
  76. package/src/server.ts +0 -642
  77. package/src/test-runner.ts +0 -598
  78. package/src/triggers/agentdojo.ts +0 -253
  79. package/src/triggers/composio.ts +0 -264
  80. package/src/triggers/index.ts +0 -71
  81. package/src/tui/AgentList.tsx +0 -145
  82. package/src/tui/App.tsx +0 -102
  83. package/src/tui/Login.tsx +0 -104
  84. package/src/tui/api.ts +0 -72
  85. package/src/tui/index.tsx +0 -7
  86. package/src/web/App.tsx +0 -455
  87. package/src/web/components/activity/ActivityPage.tsx +0 -314
  88. package/src/web/components/activity/index.ts +0 -1
  89. package/src/web/components/agents/AgentCard.tsx +0 -189
  90. package/src/web/components/agents/AgentPanel.tsx +0 -2244
  91. package/src/web/components/agents/AgentsView.tsx +0 -180
  92. package/src/web/components/agents/CreateAgentModal.tsx +0 -475
  93. package/src/web/components/agents/index.ts +0 -4
  94. package/src/web/components/api/ApiDocsPage.tsx +0 -842
  95. package/src/web/components/auth/CreateAccountStep.tsx +0 -176
  96. package/src/web/components/auth/LoginPage.tsx +0 -91
  97. package/src/web/components/auth/index.ts +0 -2
  98. package/src/web/components/common/Icons.tsx +0 -250
  99. package/src/web/components/common/LoadingSpinner.tsx +0 -44
  100. package/src/web/components/common/Modal.tsx +0 -199
  101. package/src/web/components/common/Select.tsx +0 -97
  102. package/src/web/components/common/index.ts +0 -20
  103. package/src/web/components/connections/ConnectionsPage.tsx +0 -54
  104. package/src/web/components/connections/IntegrationsTab.tsx +0 -170
  105. package/src/web/components/connections/OverviewTab.tsx +0 -137
  106. package/src/web/components/connections/TriggersTab.tsx +0 -1346
  107. package/src/web/components/dashboard/Dashboard.tsx +0 -572
  108. package/src/web/components/dashboard/index.ts +0 -1
  109. package/src/web/components/index.ts +0 -21
  110. package/src/web/components/layout/ErrorBanner.tsx +0 -18
  111. package/src/web/components/layout/Header.tsx +0 -332
  112. package/src/web/components/layout/Sidebar.tsx +0 -231
  113. package/src/web/components/layout/index.ts +0 -3
  114. package/src/web/components/mcp/IntegrationsPanel.tsx +0 -857
  115. package/src/web/components/mcp/McpPage.tsx +0 -2515
  116. package/src/web/components/mcp/index.ts +0 -1
  117. package/src/web/components/meta-agent/MetaAgent.tsx +0 -245
  118. package/src/web/components/onboarding/OnboardingWizard.tsx +0 -404
  119. package/src/web/components/onboarding/index.ts +0 -1
  120. package/src/web/components/settings/SettingsPage.tsx +0 -2776
  121. package/src/web/components/settings/index.ts +0 -1
  122. package/src/web/components/skills/SkillsPage.tsx +0 -1200
  123. package/src/web/components/tasks/TasksPage.tsx +0 -1116
  124. package/src/web/components/tasks/index.ts +0 -1
  125. package/src/web/components/telemetry/TelemetryPage.tsx +0 -1129
  126. package/src/web/components/tests/TestsPage.tsx +0 -594
  127. package/src/web/components/threads/ThreadsPage.tsx +0 -315
  128. package/src/web/context/AuthContext.tsx +0 -242
  129. package/src/web/context/ProjectContext.tsx +0 -214
  130. package/src/web/context/TelemetryContext.tsx +0 -299
  131. package/src/web/context/ThemeContext.tsx +0 -90
  132. package/src/web/context/UIModeContext.tsx +0 -49
  133. package/src/web/context/index.ts +0 -12
  134. package/src/web/hooks/index.ts +0 -3
  135. package/src/web/hooks/useAgents.ts +0 -115
  136. package/src/web/hooks/useOnboarding.ts +0 -20
  137. package/src/web/hooks/useProviders.ts +0 -75
  138. package/src/web/icon.png +0 -0
  139. package/src/web/index.html +0 -16
  140. package/src/web/styles.css +0 -118
  141. package/src/web/themes.ts +0 -162
  142. package/src/web/types.ts +0 -298
@@ -1,598 +0,0 @@
1
- import { AgentDB, generateId } from "./db";
2
- import { TestCaseDB, TestRunDB, type TestCase, type TestRun } from "./db-tests";
3
- import { agentFetch, startAgentProcess } from "./routes/api/agent-utils";
4
- import { ProviderKeys, PROVIDERS } from "./providers";
5
- import { telemetryBroadcaster, type TelemetryEvent } from "./server";
6
-
7
/** Prefix placed in front of every console line this module writes. */
const TAG = "[test-runner]";
8
-
9
/** Verdict produced by the LLM judge for one conversation. */
interface JudgeResult {
  /** Whether the judge decided the success criteria were met. */
  pass: boolean;
  /** Numeric rating that accompanies the verdict. */
  score: number;
  /** Free-text explanation that accompanies the verdict. */
  reasoning: string;
}
14
-
15
/** Outcome of planning a behavior-driven test. */
interface PlanResult {
  /** Id of the agent the test message will be sent to. */
  agent_id: string;
  /** Display name of that agent. */
  agent_name: string;
  /** User message to send to the agent. */
  message: string;
  /** Planner's explanation for the agent and message choice. */
  reasoning: string;
}
21
-
22
/** Safety cap on stream consumption: five minutes, expressed in milliseconds. */
const STREAM_SAFETY_TIMEOUT_MS = 1000 * 60 * 5;
24
-
25
/**
 * Publish one test telemetry event through `telemetryBroadcaster`.
 *
 * @param runId - Test run id; becomes the event's `trace_id`.
 * @param testCaseId - Stored in the payload as `test_case_id`.
 * @param type - Event type string.
 * @param data - Extra payload fields. They are spread after `test_case_id`,
 *   so a `test_case_id` key in `data` wins.
 * @param agentId - Agent the event is attributed to; a falsy value is reported as "system".
 */
function broadcastTestEvent(
  runId: string,
  testCaseId: string,
  type: string,
  data: Record<string, unknown> = {},
  agentId?: string,
) {
  // Evaluated in the same order as the fields appear in the event.
  const id = generateId();
  const source = agentId || "system";
  const timestamp = new Date().toISOString();
  const payload = { test_case_id: testCaseId, ...data };

  const event: TelemetryEvent = {
    id,
    agent_id: source,
    timestamp,
    category: "test",
    type,
    level: "info",
    trace_id: runId,
    data: payload,
  };

  // The broadcaster takes a batch; a single event is sent as a one-element array.
  telemetryBroadcaster.broadcast([event]);
}
45
-
46
/**
 * Plan a behavior-driven test: the LLM picks the agent and writes the user message.
 *
 * @param behavior - Natural-language description of the behavior to test.
 * @param projectId - Limits candidate agents to one project; a falsy value considers all agents.
 * @returns The chosen agent (id and name), the message to send, and the planner's reasoning.
 *   If the planner names an agent that is not in the candidate list, the first
 *   candidate is used instead and the reasoning says so.
 * @throws Error when there are no agents, when the planner reply contains no JSON
 *   object or invalid JSON, or when `agent_id` / `message` is missing.
 */
async function planTest(behavior: string, projectId: string | null): Promise<PlanResult> {
  console.log(`${TAG} Planning test for behavior: "${behavior.slice(0, 80)}..." (project: ${projectId || "all"})`);

  // Get available agents
  const agents = projectId
    ? AgentDB.findByProject(projectId)
    : AgentDB.findAll();

  console.log(`${TAG} Found ${agents.length} agent(s) for planning`);

  if (agents.length === 0) {
    throw new Error("No agents available to test");
  }

  const agentDescriptions = agents.map(a => {
    const features: string[] = [];
    if (a.features.memory) features.push("memory");
    if (a.features.tasks) features.push("tasks");
    if (a.features.mcp) features.push("MCP tools");
    if (a.features.operator) features.push("browser");
    if (a.features.vision) features.push("vision");
    if (a.features.realtime) features.push("realtime voice");
    const featureStr = features.length > 0 ? ` | Features: ${features.join(", ")}` : "";
    // Tolerate an agent without a system prompt instead of crashing on `.length`.
    const systemPrompt = a.system_prompt ?? "";
    const promptSnippet = systemPrompt.length > 200
      ? systemPrompt.slice(0, 200) + "..."
      : systemPrompt;
    return `- ID: ${a.id} | Name: ${a.name} | Status: ${a.status}${featureStr}\n System prompt: ${promptSnippet}`;
  }).join("\n");

  const planPrompt = `You are a test planner for an AI agent platform. Given a behavior description, you must:
1. Pick the most appropriate agent to test this behavior
2. Generate a realistic user message that would trigger the described behavior

## Available Agents
${agentDescriptions}

## Behavior to Test
${behavior}

Pick the agent whose capabilities best match this behavior. Prefer running agents when possible.
Generate a natural user message that would test this specific behavior.

Respond with ONLY a JSON object (no markdown, no extra text):
{"agent_id": "the-agent-id", "message": "the message to send", "reasoning": "brief explanation of why this agent and message"}`;

  console.log(`${TAG} Calling LLM planner...`);
  const result = await callLLM(planPrompt);
  console.log(`${TAG} Planner raw response: ${result.slice(0, 300)}`);

  // The model may wrap the JSON in extra text; take the outermost {...} span.
  const jsonMatch = result.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    throw new Error(`Planner returned unparseable response: ${result.slice(0, 200)}`);
  }

  let parsed: { agent_id?: unknown; message?: unknown; reasoning?: unknown };
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch (err: unknown) {
    // Surface a planner-specific error rather than a bare SyntaxError.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Planner returned invalid JSON (${detail}): ${jsonMatch[0].slice(0, 200)}`);
  }

  // Normalize the untrusted fields once so the rest of the function works with strings.
  const plannedAgentId = parsed.agent_id == null ? "" : String(parsed.agent_id);
  const plannedMessage = typeof parsed.message === "string" ? parsed.message : "";
  const plannedReasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : "";
  console.log(`${TAG} Planner chose agent_id="${plannedAgentId}", message="${plannedMessage.slice(0, 80)}"`);

  if (!plannedAgentId || !plannedMessage) {
    throw new Error("Planner response missing agent_id or message");
  }

  // Validate the chosen agent exists
  const chosenAgent = agents.find(a => a.id === plannedAgentId);
  if (!chosenAgent) {
    const fallback = agents[0];
    console.log(`${TAG} Planner picked invalid agent "${plannedAgentId}", falling back to "${fallback.name}" (${fallback.id})`);
    const note = `(Note: planner picked invalid agent ${plannedAgentId}, falling back to ${fallback.name})`;
    return {
      agent_id: fallback.id,
      agent_name: fallback.name,
      message: plannedMessage,
      // Skip an absent reasoning instead of rendering the text "undefined".
      reasoning: [plannedReasoning, note].filter(Boolean).join(" "),
    };
  }

  console.log(`${TAG} Plan complete: agent="${chosenAgent.name}" (${chosenAgent.id})`);
  return {
    agent_id: chosenAgent.id,
    agent_name: chosenAgent.name,
    message: plannedMessage,
    reasoning: plannedReasoning,
  };
}
129
-
130
/** Thread id and assistant text recovered from a raw /chat stream body. */
interface ParsedChatStream {
  threadId: string | null;
  responseText: string;
  eventCount: number;
  chunkCount: number;
}

/** Parse a raw /chat stream (SSE "data:" lines or NDJSON) into a thread id and the joined "content" text. */
function parseChatStream(streamText: string): ParsedChatStream {
  let threadId: string | null = null;
  let eventCount = 0;
  const contentChunks: string[] = [];

  for (const line of streamText.split("\n")) {
    if (!line.trim()) continue;
    // SSE allows "data:" with or without a following space; anything else is tried as a bare JSON line.
    const payload = line.startsWith("data:") ? line.slice(5).trimStart() : line;
    let evt: any;
    try {
      evt = JSON.parse(payload);
    } catch {
      // Not valid JSON (e.g. an "event:" line or a "[DONE]" marker) — skip it.
      continue;
    }
    eventCount++;
    if (evt === null || typeof evt !== "object") continue;

    // Extract thread_id (agent sends type: "thread_id")
    if (evt.type === "thread_id" && evt.thread_id) {
      threadId = String(evt.thread_id);
      console.log(`${TAG} Found thread_id in SSE: ${threadId}`);
    }
    // Accumulate content chunks
    if (evt.type === "content" && evt.content) {
      contentChunks.push(String(evt.content));
    }
  }

  return {
    threadId,
    responseText: contentChunks.join(""),
    eventCount,
    chunkCount: contentChunks.length,
  };
}

/**
 * Run a single test case end to end and persist the outcome.
 *
 * Steps: optionally plan (pick the agent / write the message), make sure the agent
 * is running, send the message to the agent's /chat endpoint, consume the stream,
 * fetch the thread messages (falling back to the streamed text), then let the LLM
 * judge the conversation.
 *
 * Failures are recorded, not thrown: every error path completes the run with
 * status "error" (including any planner metadata gathered so far) and broadcasts
 * a "test_completed" event.
 *
 * @param testCase - The test case to execute.
 * @returns The completed test run as returned by `TestRunDB.complete`.
 */
export async function runTest(testCase: TestCase): Promise<TestRun> {
  console.log(`${TAG} ========== Running test "${testCase.name}" (${testCase.id}) ==========`);
  console.log(`${TAG} Test details: behavior=${testCase.behavior ? `"${testCase.behavior.slice(0, 60)}..."` : "null"}, agent_id=${testCase.agent_id || "null"}, input_message=${testCase.input_message ? `"${testCase.input_message.slice(0, 60)}"` : "null"}, project_id=${testCase.project_id || "null"}`);

  const run = TestRunDB.create(testCase.id);
  console.log(`${TAG} Created test run: ${run.id}`);
  const startTime = Date.now();

  broadcastTestEvent(run.id, testCase.id, "test_started", {
    test_name: testCase.name,
    behavior: testCase.behavior || undefined,
  });

  // Declared outside the try block so the catch path can report planner metadata too.
  let agentId = testCase.agent_id;
  let inputMessage = testCase.input_message;
  let plannerReasoning: string | undefined;
  let selectedAgentId: string | undefined;
  let selectedAgentName: string | undefined;
  let generatedMessage: string | undefined;

  // Single exit for every failure: announce completion and persist the error.
  const finishWithError = (error: string): TestRun => {
    const durationMs = Date.now() - startTime;
    broadcastTestEvent(run.id, testCase.id, "test_completed", {
      test_name: testCase.name,
      status: "error",
      duration_ms: durationMs,
      error: error.slice(0, 200),
    }, agentId || undefined);
    return TestRunDB.complete(run.id, {
      status: "error",
      error,
      duration_ms: durationMs,
      selected_agent_id: selectedAgentId,
      selected_agent_name: selectedAgentName,
      generated_message: generatedMessage,
      planner_reasoning: plannerReasoning,
    })!;
  };

  try {
    // Behavior-driven test: use AI planner to pick agent and generate message
    if (testCase.behavior && (!agentId || !inputMessage)) {
      console.log(`${TAG} Behavior-driven test — running planner (agentId=${agentId || "auto"}, inputMessage=${inputMessage ? "set" : "auto"})`);
      broadcastTestEvent(run.id, testCase.id, "test_planning", { test_name: testCase.name });
      const plan = await planTest(testCase.behavior, testCase.project_id);

      if (!agentId) {
        agentId = plan.agent_id;
        selectedAgentId = plan.agent_id;
        selectedAgentName = plan.agent_name;
        console.log(`${TAG} Planner selected agent: ${plan.agent_name} (${plan.agent_id})`);
      }
      if (!inputMessage) {
        inputMessage = plan.message;
        generatedMessage = plan.message;
        console.log(`${TAG} Planner generated message: "${plan.message.slice(0, 100)}"`);
      }
      plannerReasoning = plan.reasoning;
    }

    if (!agentId || !inputMessage) {
      console.log(`${TAG} ERROR: Missing agentId (${agentId}) or inputMessage (${inputMessage})`);
      return finishWithError("Test requires either behavior description or explicit agent_id + input_message");
    }
    // Both values are known to be non-empty from here on.
    const targetAgentId: string = agentId;
    const message: string = inputMessage;

    console.log(`${TAG} Looking up agent: ${targetAgentId}`);
    const agent = AgentDB.findById(targetAgentId);
    if (!agent) {
      console.log(`${TAG} ERROR: Agent not found: ${targetAgentId}`);
      return finishWithError(`Agent not found: ${targetAgentId}`);
    }

    console.log(`${TAG} Agent "${agent.name}": status=${agent.status}, port=${agent.port}`);

    // Start agent if not running
    if (agent.status !== "running" || !agent.port) {
      console.log(`${TAG} Agent not running, starting...`);
      const startResult = await startAgentProcess(agent, { silent: true });
      console.log(`${TAG} Start result: success=${startResult.success}, port=${startResult.port}, error=${startResult.error || "none"}`);
      if (!startResult.success) {
        return finishWithError(`Failed to start agent: ${startResult.error}`);
      }
    }

    // Re-fetch agent to get updated port
    const runningAgent = AgentDB.findById(targetAgentId);
    if (!runningAgent || runningAgent.status !== "running" || !runningAgent.port) {
      console.log(`${TAG} ERROR: Agent still not running after start. status=${runningAgent?.status}, port=${runningAgent?.port}`);
      return finishWithError("Agent failed to start");
    }
    const agentPort = runningAgent.port;

    console.log(`${TAG} Agent running on port ${agentPort}`);

    // 1. Send message via /chat endpoint (thread created automatically by agent)
    console.log(`${TAG} Step 1: Sending message to /chat: "${message.slice(0, 100)}"`);
    broadcastTestEvent(run.id, testCase.id, "test_executing", {
      test_name: testCase.name,
      agent_name: selectedAgentName || runningAgent.name,
      message: message.slice(0, 100),
    }, targetAgentId);

    const chatRes = await agentFetch(runningAgent.id, agentPort, "/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json", "X-Test-Mode": "true" },
      body: JSON.stringify({ message }),
    });

    if (!chatRes.ok) {
      const errBody = await chatRes.text();
      console.log(`${TAG} ERROR: Chat failed (${chatRes.status}): ${errBody}`);
      return finishWithError(`Chat failed: ${errBody}`);
    }

    console.log(`${TAG} Chat response started (status ${chatRes.status}, content-type: ${chatRes.headers.get("content-type")})`);

    // 2. Consume the streaming response (5-min safety cap)
    console.log(`${TAG} Step 2: Consuming stream...`);
    const streamText = await consumeStream(chatRes);
    console.log(`${TAG} Stream consumed: ${streamText.length} chars (${(Date.now() - startTime) / 1000}s elapsed)`);

    // 3. Parse SSE events from stream — extract thread_id and assemble response text
    const stream = parseChatStream(streamText);
    console.log(`${TAG} Parsed ${stream.eventCount} SSE event(s), ${stream.chunkCount} content chunks, threadId=${stream.threadId || "none"}`);

    let messages: unknown[] = [];

    // If we got a threadId, try fetching full thread messages for a structured view
    if (stream.threadId) {
      console.log(`${TAG} Step 3: Fetching thread ${stream.threadId} messages...`);
      try {
        const messagesRes = await agentFetch(runningAgent.id, agentPort, `/threads/${stream.threadId}/messages`, {
          method: "GET",
          headers: { "Accept": "application/json" },
        });

        if (messagesRes.ok) {
          const data = await messagesRes.json();
          messages = Array.isArray(data)
            ? data
            : (Array.isArray(data?.messages) ? data.messages : []);
          console.log(`${TAG} Got ${messages.length} message(s) from thread`);
        } else {
          console.log(`${TAG} WARNING: Failed to fetch messages (${messagesRes.status}): ${await messagesRes.text()}`);
        }
      } catch (fetchErr: unknown) {
        // The structured view is optional; the streamed text below is the fallback.
        const detail = fetchErr instanceof Error ? fetchErr.message : String(fetchErr);
        console.log(`${TAG} WARNING: Could not fetch thread messages: ${detail}`);
      }
    }

    // Fallback: build conversation from assembled SSE content
    if (messages.length === 0 && stream.responseText.length > 0) {
      console.log(`${TAG} Building conversation from SSE content (${stream.responseText.length} chars)`);
      messages = [
        { role: "user", content: message },
        { role: "assistant", content: stream.responseText },
      ];
    }

    const agentResponse = JSON.stringify(messages, null, 2);

    // 4. Run LLM judge — use behavior as criteria when available
    const evalCriteria = testCase.behavior || testCase.eval_criteria;
    if (!evalCriteria) {
      // Without criteria the judge has nothing to evaluate; report that explicitly.
      console.log(`${TAG} ERROR: Test has no behavior or eval_criteria`);
      return finishWithError("Test has no behavior or eval_criteria to judge against");
    }
    console.log(`${TAG} Step 4: Running LLM judge with criteria: "${evalCriteria.slice(0, 80)}..."`);
    broadcastTestEvent(run.id, testCase.id, "test_judging", { test_name: testCase.name }, targetAgentId);
    const judgeResult = await judge(messages, evalCriteria);
    console.log(`${TAG} Judge result: pass=${judgeResult.pass}, score=${judgeResult.score}, reasoning="${judgeResult.reasoning.slice(0, 100)}"`);

    const totalMs = Date.now() - startTime;
    console.log(`${TAG} ========== Test "${testCase.name}" ${judgeResult.pass ? "PASSED" : "FAILED"} (score: ${judgeResult.score}/10, ${(totalMs / 1000).toFixed(1)}s) ==========`);

    broadcastTestEvent(run.id, testCase.id, "test_completed", {
      test_name: testCase.name,
      status: judgeResult.pass ? "passed" : "failed",
      score: judgeResult.score,
      duration_ms: totalMs,
      reasoning: judgeResult.reasoning.slice(0, 200),
    }, targetAgentId);

    return TestRunDB.complete(run.id, {
      status: judgeResult.pass ? "passed" : "failed",
      score: judgeResult.score,
      agent_response: agentResponse,
      judge_reasoning: judgeResult.reasoning,
      duration_ms: totalMs,
      selected_agent_id: selectedAgentId,
      selected_agent_name: selectedAgentName,
      generated_message: generatedMessage,
      planner_reasoning: plannerReasoning,
    })!;
  } catch (err: unknown) {
    const errorMessage = err instanceof Error ? (err.message || String(err)) : String(err);
    const stack = err instanceof Error && err.stack ? err.stack : "no stack";
    const totalMs = Date.now() - startTime;
    console.log(`${TAG} ========== Test "${testCase.name}" ERROR (${(totalMs / 1000).toFixed(1)}s): ${errorMessage} ==========`);
    console.log(`${TAG} Stack: ${stack}`);

    return finishWithError(errorMessage);
  }
}
375
-
376
/**
 * Run several test cases one after another.
 *
 * @param testCaseIds - Ids of the test cases to run; ids with no matching test case
 *   are dropped. When omitted, every stored test case is run.
 * @returns One test run per executed test case, in execution order.
 */
export async function runAll(testCaseIds?: string[]): Promise<TestRun[]> {
  let testCases: TestCase[];
  if (testCaseIds) {
    testCases = [];
    for (const id of testCaseIds) {
      const found = TestCaseDB.findById(id);
      if (found) testCases.push(found as TestCase);
    }
  } else {
    testCases = TestCaseDB.findAll();
  }

  console.log(`${TAG} Running ${testCases.length} test(s)`);

  // Each test is awaited before the next one starts.
  const results: TestRun[] = [];
  for (const testCase of testCases) {
    const outcome = await runTest(testCase);
    results.push(outcome);
  }

  // Tally the three statuses in one pass for the summary line.
  let passed = 0;
  let failed = 0;
  let errors = 0;
  for (const outcome of results) {
    if (outcome.status === "passed") passed++;
    else if (outcome.status === "failed") failed++;
    else if (outcome.status === "error") errors++;
  }
  console.log(`${TAG} All ${results.length} test(s) complete: ${passed} passed, ${failed} failed, ${errors} errors`);
  return results;
}
390
-
391
/**
 * Read a streaming response body (SSE or NDJSON) to the end and return it as text.
 *
 * Reading is raced against a safety timer of `STREAM_SAFETY_TIMEOUT_MS`.
 *
 * @param response - The fetch response whose body should be drained.
 * @returns Everything decoded from the body. Returns "" when the response has no
 *   body, and the text received so far when reading fails for a reason other than
 *   the safety timeout (best effort).
 * @throws Error with a "safety timeout" message when the timer fires first.
 */
async function consumeStream(response: Response): Promise<string> {
  const reader = response.body?.getReader();
  if (!reader) {
    console.log(`${TAG} consumeStream: no body reader available`);
    return "";
  }

  const decoder = new TextDecoder();
  let fullText = "";
  let chunks = 0;

  // Keep the timer handle so it can be cleared once reading settles; otherwise
  // every consumed stream would leave a 5-minute timer pending.
  let rejectOnTimeout: (reason: Error) => void = () => {};
  const timeout = new Promise<never>((_, reject) => {
    rejectOnTimeout = reject;
  });
  const timer = setTimeout(
    () => rejectOnTimeout(new Error("Stream safety timeout (5 min)")),
    STREAM_SAFETY_TIMEOUT_MS,
  );

  const consume = async (): Promise<void> => {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        // Flush bytes the streaming decoder may still be holding (a split multi-byte character).
        fullText += decoder.decode();
        console.log(`${TAG} consumeStream: stream ended after ${chunks} chunks, ${fullText.length} chars`);
        break;
      }
      chunks++;
      const chunk = decoder.decode(value, { stream: true });
      fullText += chunk;
      // Log the first few chunks and then every tenth to keep the output readable.
      if (chunks <= 3 || chunks % 10 === 0) {
        console.log(`${TAG} consumeStream: chunk #${chunks} (+${chunk.length} chars, total ${fullText.length})`);
      }
    }
  };

  try {
    await Promise.race([consume(), timeout]);
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(`${TAG} consumeStream: error — ${message}`);
    // cancel() returns a promise; swallow its rejection so it cannot surface as unhandled.
    void reader.cancel().catch(() => {});
    if (message.includes("safety timeout")) {
      throw err;
    }
    // Any other read error is best-effort: fall through and return what was received.
  } finally {
    clearTimeout(timer);
  }

  return fullText;
}
435
-
436
/** Render one block of a structured message content array as text for the judge prompt. */
function formatContentBlockForJudge(block: any): string {
  if (block?.type === "text") return String(block.text ?? "");
  if (block?.type === "tool_use") return `[Tool Call: ${block.name}(${JSON.stringify(block.input)})]`;
  if (block?.type === "tool_result") {
    // Tool results can be structured; stringify them rather than printing "[object Object]".
    const resultText = typeof block.content === "string" ? block.content : JSON.stringify(block.content);
    return `[Tool Result: ${resultText}${block.is_error ? " (ERROR)" : ""}]`;
  }
  return JSON.stringify(block);
}

/** Render one thread message as "role: content" for the judge prompt. */
function formatMessageForJudge(m: any): string {
  const role = m?.role || "unknown";
  let content = "";
  if (typeof m?.content === "string") {
    content = m.content;
  } else if (Array.isArray(m?.content)) {
    content = m.content.map(formatContentBlockForJudge).join("\n");
  }
  return `${role}: ${content}`;
}

/**
 * LLM judge: evaluate a conversation thread against success criteria.
 *
 * Never throws for LLM or parsing problems; those are reported as a failing
 * result whose `reasoning` describes the problem.
 *
 * @param messages - Conversation messages; `content` may be a string or an array of
 *   blocks (`text`, `tool_use`, `tool_result`).
 * @param criteria - Success criteria the agent's behavior is judged against.
 * @returns The verdict, a score clamped to 1–10, and the judge's reasoning. When the
 *   reply has no usable score, 8 is used for a pass and 3 for a fail.
 */
async function judge(messages: any[], criteria: string): Promise<JudgeResult> {
  // Format messages for the judge prompt
  const formattedMessages = messages.map(formatMessageForJudge).join("\n\n");

  const judgePrompt = `You are a test evaluator for an AI agent platform. Given a conversation thread between a user and an AI agent, determine if the agent's behavior meets the success criteria.

## Success Criteria
${criteria}

## Conversation Thread
${formattedMessages}

Evaluate whether the agent met the success criteria. Also give a score from 1-10 (10 = perfect).

Respond with ONLY a JSON object (no markdown, no extra text):
{"pass": true, "score": 9, "reasoning": "brief explanation"}
or
{"pass": false, "score": 3, "reasoning": "brief explanation of what failed"}`;

  try {
    console.log(`${TAG} Judge: calling LLM with ${formattedMessages.length} chars of conversation...`);
    const result = await callLLM(judgePrompt);
    console.log(`${TAG} Judge raw response: ${result.slice(0, 200)}`);

    // Parse JSON from response
    const jsonMatch = result.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      return { pass: false, score: 1, reasoning: `Judge returned unparseable response: ${result.slice(0, 200)}` };
    }

    const parsed = JSON.parse(jsonMatch[0]);

    // Only a real `true` (or the text "true") is a pass; plain truthiness would
    // turn the string "false" into a pass.
    const pass = parsed.pass === true
      || (typeof parsed.pass === "string" && parsed.pass.trim().toLowerCase() === "true");

    const rawScore = typeof parsed.score === "string" ? Number(parsed.score) : parsed.score;
    const score = typeof rawScore === "number" && Number.isFinite(rawScore)
      ? Math.max(1, Math.min(10, rawScore))
      : (pass ? 8 : 3);

    // Callers slice `reasoning`, so it must always be a string.
    const reasoning = typeof parsed.reasoning === "string"
      ? parsed.reasoning
      : (parsed.reasoning == null ? "" : JSON.stringify(parsed.reasoning));

    return { pass, score, reasoning };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    console.log(`${TAG} Judge error: ${message}`);
    return { pass: false, score: 1, reasoning: `Judge error: ${message}` };
  }
}
487
-
488
/**
 * Pick the LLM provider to use, together with its API key.
 *
 * The first configured provider whose `PROVIDERS` entry has type "llm" and at
 * least one model is chosen.
 *
 * @returns The provider id, its decrypted API key, and the `PROVIDERS` entry.
 * @throws Error when no such provider is configured or its key cannot be retrieved.
 */
function getLLMProvider(): { providerId: string; apiKey: string; provider: any } {
  const configured = ProviderKeys.getConfiguredProviders();
  console.log(`${TAG} Configured providers: ${configured.join(", ") || "none"}`);

  // Walk the configured ids in order and stop at the first usable LLM entry.
  let chosenId;
  for (const id of configured) {
    const candidate = PROVIDERS[id as keyof typeof PROVIDERS];
    if (candidate && candidate.type === "llm" && candidate.models.length > 0) {
      chosenId = id;
      break;
    }
  }

  if (!chosenId) {
    throw new Error("No LLM provider configured");
  }

  const provider = PROVIDERS[chosenId as keyof typeof PROVIDERS];
  const apiKey = ProviderKeys.getDecrypted(chosenId);
  if (!apiKey) {
    throw new Error("Failed to retrieve API key for LLM");
  }

  console.log(`${TAG} Using LLM provider: ${chosenId}`);
  return { providerId: chosenId, apiKey, provider };
}
511
-
512
/** POST a JSON body to an LLM endpoint and return the decoded JSON reply; throws on non-2xx or a non-JSON success body. */
async function postLLMJson(
  label: string,
  url: string,
  headers: Record<string, string>,
  body: unknown,
): Promise<any> {
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json", ...headers },
    body: JSON.stringify(body),
  });
  console.log(`${TAG} callLLM: ${label} response status=${res.status}`);

  // Read as text first: error pages from proxies/gateways are often not JSON, and
  // calling res.json() on them would hide the HTTP status behind a SyntaxError.
  const raw = await res.text();
  let data: any;
  try {
    data = raw ? JSON.parse(raw) : {};
  } catch {
    data = undefined;
  }

  if (!res.ok) {
    const dump = data !== undefined ? JSON.stringify(data) : raw;
    console.log(`${TAG} callLLM: ${label} error: ${dump.slice(0, 300)}`);
    throw new Error(`${label} API error ${res.status}: ${data?.error?.message || dump}`);
  }
  if (data === undefined) {
    throw new Error(`${label} API returned a non-JSON response: ${raw.slice(0, 200)}`);
  }
  return data;
}

/**
 * Send a single-turn prompt to the configured LLM provider and return its text reply.
 *
 * Supports Anthropic, Gemini, and the OpenAI-compatible providers listed in
 * `baseUrls`. A model whose label contains "fast" is preferred, then "mini",
 * then the provider's first model. Replies are capped at 512 output tokens.
 *
 * @param prompt - The full prompt, sent as one user message.
 * @returns The reply text, or the JSON-serialized response when no text is found.
 * @throws Error when no provider/key/model is available, the provider is not
 *   supported, or the provider API responds with a non-2xx status.
 */
async function callLLM(prompt: string): Promise<string> {
  const { providerId, apiKey, provider } = getLLMProvider();

  // Pick a fast model if available
  const model = provider.models.find((m: any) => m.label?.toLowerCase().includes("fast"))?.value
    || provider.models.find((m: any) => m.label?.toLowerCase().includes("mini"))?.value
    || provider.models[0]?.value;

  console.log(`${TAG} callLLM: provider=${providerId}, model=${model}, prompt=${prompt.length} chars`);

  if (!model) {
    throw new Error(`No model available for provider: ${providerId}`);
  }

  if (providerId === "anthropic") {
    const data = await postLLMJson(
      "Anthropic",
      "https://api.anthropic.com/v1/messages",
      { "x-api-key": apiKey, "anthropic-version": "2023-06-01" },
      { model, max_tokens: 512, messages: [{ role: "user", content: prompt }] },
    );
    // The reply is a list of content blocks; use the first text block, which is not
    // guaranteed to be the first block.
    const textBlock = Array.isArray(data.content)
      ? data.content.find((block: any) => block?.type === "text" && typeof block.text === "string")
      : undefined;
    return textBlock?.text || JSON.stringify(data);
  }

  if (providerId === "gemini") {
    // The key goes in a header rather than the query string so it does not end up
    // in URLs that get logged.
    const data = await postLLMJson(
      "Gemini",
      `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`,
      { "x-goog-api-key": apiKey },
      {
        contents: [{ parts: [{ text: prompt }] }],
        generationConfig: { maxOutputTokens: 512 },
      },
    );
    const parts = data.candidates?.[0]?.content?.parts;
    const text = Array.isArray(parts)
      ? parts.map((part: any) => (typeof part?.text === "string" ? part.text : "")).join("")
      : "";
    return text || JSON.stringify(data);
  }

  // OpenAI-compatible (openai, groq, xai, together, fireworks, moonshot)
  const baseUrls: Record<string, string> = {
    openai: "https://api.openai.com/v1",
    groq: "https://api.groq.com/openai/v1",
    xai: "https://api.x.ai/v1",
    together: "https://api.together.xyz/v1",
    fireworks: "https://api.fireworks.ai/inference/v1",
    moonshot: "https://api.moonshot.cn/v1",
  };

  const baseUrl = baseUrls[providerId];
  if (!baseUrl) {
    throw new Error(`Unsupported provider: ${providerId}`);
  }

  const data = await postLLMJson(
    providerId,
    `${baseUrl}/chat/completions`,
    { "Authorization": `Bearer ${apiKey}` },
    { model, max_tokens: 512, messages: [{ role: "user", content: prompt }] },
  );
  return data.choices?.[0]?.message?.content || JSON.stringify(data);
}