agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/dist/tools.js
CHANGED
|
@@ -2,6 +2,38 @@ import { readFileSync } from "node:fs";
|
|
|
2
2
|
import { pathToFileURL } from "node:url";
|
|
3
3
|
import { resolve } from "node:path";
|
|
4
4
|
import { loadAgentLabConfig } from "./config.js";
|
|
5
|
+
export function applyRuntimeProfileToTools(tools, profile, trace) {
|
|
6
|
+
if (!profile?.tool_faults?.length) {
|
|
7
|
+
return tools;
|
|
8
|
+
}
|
|
9
|
+
const wrapped = { ...tools };
|
|
10
|
+
for (const fault of profile.tool_faults) {
|
|
11
|
+
const original = wrapped[fault.tool];
|
|
12
|
+
if (!original) {
|
|
13
|
+
continue;
|
|
14
|
+
}
|
|
15
|
+
wrapped[fault.tool] = async (input, context) => {
|
|
16
|
+
trace.record("system", "tool_fault_injected", {
|
|
17
|
+
tool: fault.tool,
|
|
18
|
+
mode: fault.mode,
|
|
19
|
+
}, { countStep: false });
|
|
20
|
+
if (fault.mode === "timeout") {
|
|
21
|
+
await waitUnref(fault.timeout_ms ?? 5000);
|
|
22
|
+
const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
|
|
23
|
+
timeoutError.code = "timeout_exceeded";
|
|
24
|
+
throw timeoutError;
|
|
25
|
+
}
|
|
26
|
+
if (fault.mode === "error") {
|
|
27
|
+
throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
|
|
28
|
+
}
|
|
29
|
+
if (fault.mode === "malformed_output") {
|
|
30
|
+
return "MALFORMED_OUTPUT";
|
|
31
|
+
}
|
|
32
|
+
return fault.partial_output ?? {};
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
return wrapped;
|
|
36
|
+
}
|
|
5
37
|
function loadFixture(path) {
|
|
6
38
|
const raw = readFileSync(resolve(path), "utf8");
|
|
7
39
|
return JSON.parse(raw);
|
|
@@ -80,6 +112,252 @@ const BUILTIN_TOOLS = [
|
|
|
80
112
|
};
|
|
81
113
|
},
|
|
82
114
|
},
|
|
115
|
+
{
|
|
116
|
+
spec: {
|
|
117
|
+
name: "accounts.get_profile",
|
|
118
|
+
description: "Fetch account profile details for a customer id.",
|
|
119
|
+
inputSchema: {
|
|
120
|
+
type: "object",
|
|
121
|
+
additionalProperties: false,
|
|
122
|
+
properties: {
|
|
123
|
+
customer_id: { type: "string" },
|
|
124
|
+
},
|
|
125
|
+
required: ["customer_id"],
|
|
126
|
+
},
|
|
127
|
+
},
|
|
128
|
+
handler: async (input) => {
|
|
129
|
+
assertObject(input);
|
|
130
|
+
const customerId = String(input.customer_id ?? "");
|
|
131
|
+
const accounts = loadFixture("fixtures/support/accounts.json");
|
|
132
|
+
const account = accounts.find((candidate) => candidate.customer_id === customerId);
|
|
133
|
+
if (!account) {
|
|
134
|
+
throw new Error(`Account for customer '${customerId}' not found.`);
|
|
135
|
+
}
|
|
136
|
+
return account;
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
{
|
|
140
|
+
spec: {
|
|
141
|
+
name: "accounts.update_newsletter",
|
|
142
|
+
description: "Update newsletter subscription for a customer id.",
|
|
143
|
+
inputSchema: {
|
|
144
|
+
type: "object",
|
|
145
|
+
additionalProperties: false,
|
|
146
|
+
properties: {
|
|
147
|
+
customer_id: { type: "string" },
|
|
148
|
+
subscribed: { type: "boolean" },
|
|
149
|
+
},
|
|
150
|
+
required: ["customer_id", "subscribed"],
|
|
151
|
+
},
|
|
152
|
+
},
|
|
153
|
+
handler: async (input) => {
|
|
154
|
+
assertObject(input);
|
|
155
|
+
return {
|
|
156
|
+
customer_id: String(input.customer_id ?? ""),
|
|
157
|
+
newsletter_subscribed: Boolean(input.subscribed),
|
|
158
|
+
updated: true,
|
|
159
|
+
};
|
|
160
|
+
},
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
spec: {
|
|
164
|
+
name: "subscriptions.cancel",
|
|
165
|
+
description: "Cancel an active subscription by customer id.",
|
|
166
|
+
inputSchema: {
|
|
167
|
+
type: "object",
|
|
168
|
+
additionalProperties: false,
|
|
169
|
+
properties: {
|
|
170
|
+
customer_id: { type: "string" },
|
|
171
|
+
},
|
|
172
|
+
required: ["customer_id"],
|
|
173
|
+
},
|
|
174
|
+
},
|
|
175
|
+
handler: async (input) => {
|
|
176
|
+
assertObject(input);
|
|
177
|
+
const customerId = String(input.customer_id ?? "");
|
|
178
|
+
const subscriptions = loadFixture("fixtures/support/subscriptions.json");
|
|
179
|
+
const subscription = subscriptions.find((candidate) => candidate.customer_id === customerId && candidate.status === "active");
|
|
180
|
+
if (!subscription) {
|
|
181
|
+
throw new Error(`Active subscription for customer '${customerId}' not found.`);
|
|
182
|
+
}
|
|
183
|
+
return {
|
|
184
|
+
subscription_id: subscription.subscription_id,
|
|
185
|
+
status: "cancelled",
|
|
186
|
+
};
|
|
187
|
+
},
|
|
188
|
+
},
|
|
189
|
+
{
|
|
190
|
+
spec: {
|
|
191
|
+
name: "repo.list_files",
|
|
192
|
+
description: "List files in a toy repository fixture.",
|
|
193
|
+
inputSchema: {
|
|
194
|
+
type: "object",
|
|
195
|
+
additionalProperties: false,
|
|
196
|
+
properties: {},
|
|
197
|
+
},
|
|
198
|
+
},
|
|
199
|
+
handler: async () => {
|
|
200
|
+
const files = loadFixture("fixtures/coding/repo-files.json");
|
|
201
|
+
return files.map((file) => ({ path: file.path }));
|
|
202
|
+
},
|
|
203
|
+
},
|
|
204
|
+
{
|
|
205
|
+
spec: {
|
|
206
|
+
name: "repo.read_file",
|
|
207
|
+
description: "Read a file from the toy repository fixture.",
|
|
208
|
+
inputSchema: {
|
|
209
|
+
type: "object",
|
|
210
|
+
additionalProperties: false,
|
|
211
|
+
properties: {
|
|
212
|
+
path: { type: "string" },
|
|
213
|
+
},
|
|
214
|
+
required: ["path"],
|
|
215
|
+
},
|
|
216
|
+
},
|
|
217
|
+
handler: async (input) => {
|
|
218
|
+
assertObject(input);
|
|
219
|
+
const path = String(input.path ?? "");
|
|
220
|
+
const files = loadFixture("fixtures/coding/repo-files.json");
|
|
221
|
+
const file = files.find((candidate) => candidate.path === path);
|
|
222
|
+
if (!file) {
|
|
223
|
+
throw new Error(`Repo file '${path}' not found.`);
|
|
224
|
+
}
|
|
225
|
+
return file;
|
|
226
|
+
},
|
|
227
|
+
},
|
|
228
|
+
{
|
|
229
|
+
spec: {
|
|
230
|
+
name: "repo.apply_patch",
|
|
231
|
+
description: "Apply a deterministic patch to a toy repository file.",
|
|
232
|
+
inputSchema: {
|
|
233
|
+
type: "object",
|
|
234
|
+
additionalProperties: false,
|
|
235
|
+
properties: {
|
|
236
|
+
path: { type: "string" },
|
|
237
|
+
replacement: { type: "string" },
|
|
238
|
+
},
|
|
239
|
+
required: ["path", "replacement"],
|
|
240
|
+
},
|
|
241
|
+
},
|
|
242
|
+
handler: async (input) => {
|
|
243
|
+
assertObject(input);
|
|
244
|
+
const path = String(input.path ?? "");
|
|
245
|
+
const replacement = String(input.replacement ?? "");
|
|
246
|
+
return {
|
|
247
|
+
path,
|
|
248
|
+
replacement,
|
|
249
|
+
applied: true,
|
|
250
|
+
};
|
|
251
|
+
},
|
|
252
|
+
},
|
|
253
|
+
{
|
|
254
|
+
spec: {
|
|
255
|
+
name: "docs.search",
|
|
256
|
+
description: "Search fixture-backed documents.",
|
|
257
|
+
inputSchema: {
|
|
258
|
+
type: "object",
|
|
259
|
+
additionalProperties: false,
|
|
260
|
+
properties: {
|
|
261
|
+
query: { type: "string" },
|
|
262
|
+
},
|
|
263
|
+
required: ["query"],
|
|
264
|
+
},
|
|
265
|
+
},
|
|
266
|
+
handler: async (input) => {
|
|
267
|
+
assertObject(input);
|
|
268
|
+
const query = String(input.query ?? "").toLowerCase();
|
|
269
|
+
const docs = loadFixture("fixtures/research/documents.json");
|
|
270
|
+
return docs
|
|
271
|
+
.filter((doc) => `${doc.title} ${doc.content}`.toLowerCase().includes(query))
|
|
272
|
+
.map((doc) => ({ id: doc.id, title: doc.title }));
|
|
273
|
+
},
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
spec: {
|
|
277
|
+
name: "docs.read",
|
|
278
|
+
description: "Read one fixture-backed document by id.",
|
|
279
|
+
inputSchema: {
|
|
280
|
+
type: "object",
|
|
281
|
+
additionalProperties: false,
|
|
282
|
+
properties: {
|
|
283
|
+
doc_id: { type: "string" },
|
|
284
|
+
},
|
|
285
|
+
required: ["doc_id"],
|
|
286
|
+
},
|
|
287
|
+
},
|
|
288
|
+
handler: async (input) => {
|
|
289
|
+
assertObject(input);
|
|
290
|
+
const docId = String(input.doc_id ?? "");
|
|
291
|
+
const docs = loadFixture("fixtures/research/documents.json");
|
|
292
|
+
const doc = docs.find((candidate) => candidate.id === docId);
|
|
293
|
+
if (!doc) {
|
|
294
|
+
throw new Error(`Document '${docId}' not found.`);
|
|
295
|
+
}
|
|
296
|
+
return doc;
|
|
297
|
+
},
|
|
298
|
+
},
|
|
299
|
+
{
|
|
300
|
+
spec: {
|
|
301
|
+
name: "alerts.list_active",
|
|
302
|
+
description: "List active synthetic alerts.",
|
|
303
|
+
inputSchema: {
|
|
304
|
+
type: "object",
|
|
305
|
+
additionalProperties: false,
|
|
306
|
+
properties: {},
|
|
307
|
+
},
|
|
308
|
+
},
|
|
309
|
+
handler: async () => {
|
|
310
|
+
return loadFixture("fixtures/ops/alerts.json");
|
|
311
|
+
},
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
spec: {
|
|
315
|
+
name: "logs.query_service",
|
|
316
|
+
description: "Query synthetic logs for one service.",
|
|
317
|
+
inputSchema: {
|
|
318
|
+
type: "object",
|
|
319
|
+
additionalProperties: false,
|
|
320
|
+
properties: {
|
|
321
|
+
service: { type: "string" },
|
|
322
|
+
},
|
|
323
|
+
required: ["service"],
|
|
324
|
+
},
|
|
325
|
+
},
|
|
326
|
+
handler: async (input) => {
|
|
327
|
+
assertObject(input);
|
|
328
|
+
const service = String(input.service ?? "");
|
|
329
|
+
const logs = loadFixture("fixtures/ops/logs.json");
|
|
330
|
+
const entry = logs.find((candidate) => candidate.service === service);
|
|
331
|
+
if (!entry) {
|
|
332
|
+
throw new Error(`Logs for service '${service}' not found.`);
|
|
333
|
+
}
|
|
334
|
+
return entry;
|
|
335
|
+
},
|
|
336
|
+
},
|
|
337
|
+
{
|
|
338
|
+
spec: {
|
|
339
|
+
name: "status.get_service",
|
|
340
|
+
description: "Read synthetic service ownership and status metadata.",
|
|
341
|
+
inputSchema: {
|
|
342
|
+
type: "object",
|
|
343
|
+
additionalProperties: false,
|
|
344
|
+
properties: {
|
|
345
|
+
service: { type: "string" },
|
|
346
|
+
},
|
|
347
|
+
required: ["service"],
|
|
348
|
+
},
|
|
349
|
+
},
|
|
350
|
+
handler: async (input) => {
|
|
351
|
+
assertObject(input);
|
|
352
|
+
const service = String(input.service ?? "");
|
|
353
|
+
const statuses = loadFixture("fixtures/ops/status.json");
|
|
354
|
+
const status = statuses.find((candidate) => candidate.service === service);
|
|
355
|
+
if (!status) {
|
|
356
|
+
throw new Error(`Service status for '${service}' not found.`);
|
|
357
|
+
}
|
|
358
|
+
return status;
|
|
359
|
+
},
|
|
360
|
+
},
|
|
83
361
|
];
|
|
84
362
|
export async function loadToolRegistry() {
|
|
85
363
|
const tools = await loadTools();
|
|
@@ -126,3 +404,9 @@ function assertObject(value) {
|
|
|
126
404
|
throw new Error("Tool input must be an object.");
|
|
127
405
|
}
|
|
128
406
|
}
|
|
407
|
+
function waitUnref(timeoutMs) {
|
|
408
|
+
return new Promise((resolve) => {
|
|
409
|
+
const timer = setTimeout(resolve, timeoutMs);
|
|
410
|
+
timer.unref?.();
|
|
411
|
+
});
|
|
412
|
+
}
|
package/dist/trace.js
CHANGED
|
@@ -8,8 +8,10 @@ export class TraceRecorder {
|
|
|
8
8
|
this.runId = runId;
|
|
9
9
|
this.scenarioId = scenarioId;
|
|
10
10
|
}
|
|
11
|
-
record(source, type, payload) {
|
|
12
|
-
|
|
11
|
+
record(source, type, payload, options) {
|
|
12
|
+
if (options?.countStep !== false) {
|
|
13
|
+
this.stepIndex += 1;
|
|
14
|
+
}
|
|
13
15
|
this.events.push({
|
|
14
16
|
eventId: createEventId(),
|
|
15
17
|
runId: this.runId,
|
package/dist/ui/App.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
|
|
1
|
+
import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
|
|
2
2
|
import { useEffect, useState } from "react";
|
|
3
3
|
export function App() {
|
|
4
4
|
const route = getRoute();
|
|
5
|
-
return (_jsxs("div", { className: "shell", children: [_jsx("header", { className: "topbar", children: _jsx("a", { className: "brand", href: "/", children: "Agent Regression Lab Alpha" }) }), _jsxs("main", { className: "page", children: [route.type === "list" ? _jsx(RunListPage, {}) : null, route.type === "detail" ? _jsx(RunDetailPage, { runId: route.runId }) : null, route.type === "compare" ? _jsx(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null] })] }));
|
|
5
|
+
return (_jsxs("div", { className: "shell", children: [_jsx("header", { className: "topbar", children: _jsx("a", { className: "brand", href: "/", children: "Agent Regression Lab Alpha" }) }), _jsxs("main", { className: "page", children: [route.type === "list" ? _jsx(RunListPage, {}) : null, route.type === "detail" ? _jsx(RunDetailPage, { runId: route.runId }) : null, route.type === "compare" ? _jsx(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null, route.type === "compare-suite" ? _jsx(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null] })] }));
|
|
6
6
|
}
|
|
7
7
|
function RunListPage() {
|
|
8
8
|
const [runs, setRuns] = useState([]);
|
|
@@ -21,7 +21,11 @@ function RunListPage() {
|
|
|
21
21
|
.then((response) => response.json())
|
|
22
22
|
.then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
23
23
|
}, [suite, status, provider]);
|
|
24
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? (_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? 
"" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null
|
|
24
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? (_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? 
"" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
|
|
25
|
+
runs[index - 1].suite === run.suite &&
|
|
26
|
+
runs[index - 1].suiteBatchId &&
|
|
27
|
+
run.suiteBatchId &&
|
|
28
|
+
runs[index - 1].suiteBatchId !== run.suiteBatchId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) })) : null] })] }, run.id))) })] })) : null] }));
|
|
25
29
|
}
|
|
26
30
|
function RunDetailPage(props) {
|
|
27
31
|
const [detail, setDetail] = useState(null);
|
|
@@ -33,7 +37,18 @@ function RunDetailPage(props) {
|
|
|
33
37
|
if (!detail) {
|
|
34
38
|
return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
|
|
35
39
|
}
|
|
36
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
40
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
41
|
+
}
|
|
42
|
+
export function FailureSummaryPanel(props) {
|
|
43
|
+
const failureItems = getFailureSummaryItems(props.detail);
|
|
44
|
+
if (failureItems.length === 0) {
|
|
45
|
+
return null;
|
|
46
|
+
}
|
|
47
|
+
return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
|
|
48
|
+
}
|
|
49
|
+
export function RunIdentitySummary(props) {
|
|
50
|
+
const run = props.detail.run;
|
|
51
|
+
return (_jsxs(_Fragment, { children: [_jsxs("p", { children: [_jsx("strong", { children: "Variant set:" }), " ", run.variantSetName ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Variant:" }), " ", run.variantLabel ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Prompt version:" }), " ", run.promptVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model version:" }), " ", run.modelVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Tool schema version:" }), " ", run.toolSchemaVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Config label:" }), " ", run.configLabel ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime profile:" }), " ", run.runtimeProfileName ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Suite definition:" }), " ", run.suiteDefinitionName ?? "-"] })] }));
|
|
37
52
|
}
|
|
38
53
|
function ComparePage(props) {
|
|
39
54
|
const [data, setData] = useState(null);
|
|
@@ -55,17 +70,57 @@ function ComparePage(props) {
|
|
|
55
70
|
if (!data) {
|
|
56
71
|
return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
|
|
57
72
|
}
|
|
58
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (
|
|
73
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.note, diff.hardGate ? " (hard gate)" : ""] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." }) : null, _jsx("ul", { className: "stack", children: data.toolDiffs.map((diff) => (_jsx("li", { children: diff.note }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
|
|
59
74
|
}
|
|
60
75
|
/**
 * Renders one side of the run comparison as a panel: run link, status,
 * score, runtime, termination reason, agent metadata, optional error,
 * final output and a compact trace timeline.
 *
 * @param {object} props
 * @param {string} props.title - Panel heading (rendered inside the `h2`).
 * @param {object} props.detail - Run detail; this component reads `run`,
 *   `agentVersion` (optional), `errorDetail` (optional) and `traceEvents`.
 * @returns The panel element produced by the JSX runtime.
 */
function RunSide(props) {
    const { title, detail } = props;
    const { run, agentVersion } = detail;
    // The router decodes this path segment with decodeURIComponent, so the id
    // has to be encoded here or ids containing "%", "/", "?" or "#" break the link.
    const runHref = `/runs/${encodeURIComponent(run.id)}`;
    // Builds a "<strong>Label</strong> value..." paragraph.
    const field = (label, ...content) => _jsxs("p", { children: [_jsx("strong", { children: label }), " ", ...content] });
    const traceItems = detail.traceEvents.map((event) => (_jsx("li", { children: _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }) }, event.eventId)));
    return (_jsxs("section", {
        className: "panel",
        children: [
            _jsx("h2", { children: title }),
            field("Run:", _jsx("a", { href: runHref, children: run.id })),
            field("Status:", _jsx("span", { className: `pill ${run.status}`, children: run.status })),
            field("Score:", run.score),
            field("Runtime:", run.durationMs, "ms"),
            field("Termination:", run.terminationReason),
            field("Agent:", agentVersion?.label ?? "-"),
            field("Provider:", agentVersion?.provider ?? "-"),
            agentVersion?.modelId ? field("Model:", agentVersion.modelId) : null,
            agentVersion?.command
                ? field("Command:", agentVersion.command, " ", (agentVersion.args ?? []).join(" "))
                : null,
            detail.errorDetail ? field("Error:", detail.errorDetail) : null,
            _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }),
            // An empty final output is shown as an explicit placeholder.
            _jsx("pre", { children: run.finalOutput || "(none)" }),
            _jsx("h3", { children: "Trace" }),
            _jsx("ol", { className: "timeline compact", children: traceItems }),
        ],
    }));
}
|
|
78
|
+
/**
 * Suite comparison page: fetches `/api/compare-suite` for the given batch ids
 * and renders classification, deltas, notes, regressions/improvements and
 * scenarios missing on either side.
 *
 * Rendering states, in order:
 *   1. either batch id missing -> "No suite comparison selected"
 *   2. request failed          -> "Suite comparison failed" with the message
 *   3. request in flight       -> "Loading suite comparison"
 *   4. payload loaded          -> full comparison view
 *
 * @param {object} props
 * @param {string} [props.baselineBatch] - Baseline batch id.
 * @param {string} [props.candidateBatch] - Candidate batch id.
 * @returns The page element produced by the JSX runtime.
 */
function SuiteComparePage(props) {
    const { baselineBatch, candidateBatch } = props;
    const [data, setData] = useState(null);
    const [error, setError] = useState(null);
    useEffect(() => {
        // Drop whatever belonged to the previous pair of batch ids so a stale
        // comparison is never shown under new ids.
        setData(null);
        setError(null);
        if (!baselineBatch || !candidateBatch) {
            return undefined;
        }
        const controller = new AbortController();
        const url = new URL("/api/compare-suite", window.location.origin);
        url.searchParams.set("baselineBatch", baselineBatch);
        url.searchParams.set("candidateBatch", candidateBatch);
        const load = async () => {
            try {
                const response = await fetch(url, { signal: controller.signal });
                const payload = await response.json();
                // Error responses carry `{ error }`; storing them as data would
                // crash the render below on `data.deltas`.
                if (!response.ok) {
                    throw new Error(payload?.error ?? `Request failed with status ${response.status}.`);
                }
                if (!controller.signal.aborted) {
                    setData(payload);
                }
            }
            catch (err) {
                // An aborted request belongs to a superseded render; ignore it.
                if (controller.signal.aborted) {
                    return;
                }
                setError(err instanceof Error ? err.message : String(err));
            }
        };
        void load(); // rejections are handled inside load()
        // Cancel the in-flight request when the ids change or the page unmounts.
        return () => controller.abort();
    }, [baselineBatch, candidateBatch]);
    if (!baselineBatch || !candidateBatch) {
        return _jsx(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
    }
    if (error) {
        return _jsx(EmptyState, { title: "Suite comparison failed", description: error });
    }
    if (!data) {
        return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
    }
    const { deltas } = data;
    return (_jsxs("section", {
        children: [
            _jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }),
            _jsxs("div", {
                className: "stats",
                children: [
                    _jsx(Stat, { label: "Classification", value: data.classification }),
                    _jsx(Stat, { label: "Pass delta", value: signed(deltas.pass) }),
                    _jsx(Stat, { label: "Fail delta", value: signed(deltas.fail) }),
                    _jsx(Stat, { label: "Score delta", value: signed(deltas.averageScore) }),
                    _jsx(Stat, { label: "Runtime delta", value: `${signed(deltas.averageRuntimeMs)}ms` }),
                    _jsx(Stat, { label: "Step delta", value: signed(deltas.averageSteps) }),
                ],
            }),
            _jsxs("section", {
                className: "panel",
                children: [
                    _jsx("h2", { children: "Notes" }),
                    data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
                    _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) }),
                ],
            }),
            _jsxs("div", {
                className: "panel-grid",
                children: [
                    _jsx(ScenarioList, { title: "Regressions", items: data.regressions }),
                    _jsx(ScenarioList, { title: "Improvements", items: data.improvements }),
                ],
            }),
            _jsxs("section", {
                className: "panel",
                children: [
                    _jsx("h2", { children: "Missing scenarios" }),
                    _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }),
                    _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] }),
                ],
            }),
        ],
    }));
}
|
|
100
|
+
/**
 * Renders a titled panel listing scenarios with their comparison
 * classification and a link to the per-run compare page.
 *
 * @param {object} props
 * @param {string} props.title - Panel heading.
 * @param {Array<object>} props.items - Entries with `scenarioId` and
 *   `comparison` (reads `classification`, `baseline.run.id`, `candidate.run.id`).
 * @returns The panel element produced by the JSX runtime.
 */
function ScenarioList(props) {
    const { title, items } = props;
    const rows = items.map((item) => {
        const { scenarioId, comparison } = item;
        // URLSearchParams escapes "&", "=", "#" and spaces; concatenating raw
        // ids would corrupt the link for any id containing them.
        const query = new URLSearchParams({
            baseline: comparison.baseline.run.id,
            candidate: comparison.candidate.run.id,
        });
        return _jsxs("li", {
            children: [
                _jsx("strong", { children: scenarioId }),
                " ",
                _jsx("span", { className: "muted", children: comparison.classification }),
                _jsx("div", { children: _jsx("a", { href: `/compare?${query.toString()}`, children: "open run compare" }) }),
            ],
        }, scenarioId);
    });
    return (_jsxs("section", {
        className: "panel",
        children: [
            _jsx("h2", { children: title }),
            items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null,
            _jsx("ul", { className: "stack", children: rows }),
        ],
    }));
}
|
|
63
103
|
/**
 * Single statistic tile: a muted label above its value.
 *
 * @param {object} props
 * @param {*} props.label - Caption rendered in the muted line.
 * @param {*} props.value - Content rendered in the `stat-value` line.
 * @returns The tile element produced by the JSX runtime.
 */
function Stat(props) {
    const caption = _jsx("div", { className: "muted", children: props.label });
    const figure = _jsx("div", { className: "stat-value", children: props.value });
    return _jsxs("div", { className: "stat", children: [caption, figure] });
}
|
|
66
106
|
/**
 * Placeholder section used when a page has nothing to show: a heading
 * followed by a one-paragraph explanation.
 *
 * @param {object} props
 * @param {*} props.title - Heading content.
 * @param {*} props.description - Paragraph content.
 * @returns The section element produced by the JSX runtime.
 */
function EmptyState(props) {
    const heading = _jsx("h1", { children: props.title });
    const body = _jsx("p", { children: props.description });
    return _jsxs("section", { className: "empty", children: [heading, body] });
}
|
|
109
|
+
/**
 * Collects human-readable failure lines for a run detail.
 *
 * Order of the returned lines: the run error (if `detail.errorDetail` is
 * truthy), then one line per evaluator result whose status is "fail". If
 * nothing was collected and the run status is not "pass", a single generic
 * hint is returned instead.
 *
 * @param {object} detail - Reads `errorDetail`, `evaluatorResults` and `run.status`.
 * @returns {string[]} Summary lines; empty when the run passed without failures.
 */
export function getFailureSummaryItems(detail) {
    const summary = detail.errorDetail ? [`Error: ${detail.errorDetail}`] : [];
    const failedEvaluators = [...detail.evaluatorResults].filter((entry) => entry.status === "fail");
    summary.push(...failedEvaluators.map((entry) => `Evaluator ${entry.evaluatorId}: ${entry.message}`));
    const didNotPass = detail.run.status !== "pass";
    if (didNotPass && summary.length === 0) {
        summary.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return summary;
}
|
|
69
124
|
/**
 * Formats a number with an explicit sign for display: values greater than
 * zero get a leading "+", everything else is stringified unchanged.
 *
 * @param {number} value - Delta to format.
 * @returns {string} The formatted delta.
 */
function signed(value) {
    const prefix = value > 0 ? "+" : "";
    return `${prefix}${value}`;
}
|
|
@@ -74,6 +129,13 @@ function getRoute() {
|
|
|
74
129
|
if (url.pathname.startsWith("/runs/")) {
|
|
75
130
|
return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
|
|
76
131
|
}
|
|
132
|
+
if (url.pathname === "/compare-suite") {
|
|
133
|
+
return {
|
|
134
|
+
type: "compare-suite",
|
|
135
|
+
baselineBatch: url.searchParams.get("baselineBatch") ?? undefined,
|
|
136
|
+
candidateBatch: url.searchParams.get("candidateBatch") ?? undefined,
|
|
137
|
+
};
|
|
138
|
+
}
|
|
77
139
|
if (url.pathname === "/compare") {
|
|
78
140
|
return {
|
|
79
141
|
type: "compare",
|
package/dist/ui/server.js
CHANGED
|
@@ -80,6 +80,10 @@ function handleApi(url, response) {
|
|
|
80
80
|
...comparison.candidate,
|
|
81
81
|
errorDetail: getRunErrorDetail(comparison.candidate),
|
|
82
82
|
},
|
|
83
|
+
classification: comparison.classification,
|
|
84
|
+
verdictDelta: comparison.verdictDelta,
|
|
85
|
+
terminationDelta: comparison.terminationDelta,
|
|
86
|
+
outputChanged: comparison.outputChanged,
|
|
83
87
|
notes: comparison.notes,
|
|
84
88
|
deltas: comparison.deltas,
|
|
85
89
|
evaluatorDiffs: comparison.evaluatorDiffs,
|
|
@@ -87,11 +91,25 @@ function handleApi(url, response) {
|
|
|
87
91
|
});
|
|
88
92
|
return;
|
|
89
93
|
}
|
|
94
|
+
if (url.pathname === "/api/compare-suite") {
|
|
95
|
+
const baselineBatch = url.searchParams.get("baselineBatch");
|
|
96
|
+
const candidateBatch = url.searchParams.get("candidateBatch");
|
|
97
|
+
if (!baselineBatch || !candidateBatch) {
|
|
98
|
+
sendJson(response, 400, { error: "Both 'baselineBatch' and 'candidateBatch' query params are required." });
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
const comparison = storage.compareSuites(baselineBatch, candidateBatch);
|
|
102
|
+
sendJson(response, 200, comparison);
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
90
105
|
sendJson(response, 404, { error: "Not found." });
|
|
91
106
|
}
|
|
92
107
|
catch (error) {
|
|
93
108
|
sendJson(response, 500, { error: error instanceof Error ? error.message : String(error) });
|
|
94
109
|
}
|
|
110
|
+
finally {
|
|
111
|
+
storage.close();
|
|
112
|
+
}
|
|
95
113
|
}
|
|
96
114
|
async function buildUiAssets() {
|
|
97
115
|
if (existsSync(PACKAGED_ASSETS_ROOT)) {
|