agent-regression-lab 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -4
- package/bin/agentlab.js +2 -0
- package/dist/config.js +13 -9
- package/dist/index.js +14 -0
- package/dist/init.js +88 -0
- package/dist/tools.js +18 -2
- package/dist/ui/App.js +49 -7
- package/dist/ui-assets/client.css +1108 -116
- package/dist/ui-assets/client.js +863 -426
- package/docs/coding-agents.md +74 -0
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/plans/2026-04-16-regression-atlas-ui-redesign.md +1010 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/superpowers/specs/2026-04-16-regression-atlas-ui-redesign-design.md +417 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +55 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +6 -4
package/README.md
CHANGED
|
@@ -29,7 +29,7 @@ This is a local-first alpha for early technical teams. It is strongest when used
|
|
|
29
29
|
## What It Supports Today
|
|
30
30
|
|
|
31
31
|
- YAML scenarios under `scenarios/`
|
|
32
|
-
- deterministic built-in tools plus
|
|
32
|
+
- deterministic built-in tools plus custom tools from `agentlab.config.yaml`
|
|
33
33
|
- named agents from `agentlab.config.yaml`
|
|
34
34
|
- built-in `mock`, `openai`, `external_process`, and `http` agent modes
|
|
35
35
|
- `type: conversation` multi-turn dialog scenarios for HTTP agents
|
|
@@ -46,6 +46,26 @@ Use this as the default product story:
|
|
|
46
46
|
3. run curated golden suites before release
|
|
47
47
|
4. keep incident-derived scenarios as permanent regression assets
|
|
48
48
|
|
|
49
|
+
## Start Here
|
|
50
|
+
|
|
51
|
+
If your agent runs as an HTTP service:
|
|
52
|
+
|
|
53
|
+
- use `provider: http`
|
|
54
|
+
- start with [arl-test](arl-test)
|
|
55
|
+
- read [docs/agents.md](docs/agents.md) and [docs/scenarios.md](docs/scenarios.md)
|
|
56
|
+
|
|
57
|
+
If you are validating coding-agent changes:
|
|
58
|
+
|
|
59
|
+
- start with the coding scenarios under `scenarios/coding/`
|
|
60
|
+
- read [docs/coding-agents.md](docs/coding-agents.md)
|
|
61
|
+
- use deterministic tool-loop runs first, then compare before/after behavior
|
|
62
|
+
|
|
63
|
+
If you want pre-merge regression checks in CI:
|
|
64
|
+
|
|
65
|
+
- use `suite_definitions`
|
|
66
|
+
- start with `.github/workflows/agentlab-pre-merge.yml`
|
|
67
|
+
- run `agentlab run --suite-def pre_merge --agent mock-default`
|
|
68
|
+
|
|
49
69
|
## First 10 Minutes
|
|
50
70
|
|
|
51
71
|
The fastest path is to run the CLI from a local checkout.
|
|
@@ -180,7 +200,7 @@ Use this as the default mental model:
|
|
|
180
200
|
3. note the run id or suite batch id
|
|
181
201
|
4. inspect the run in CLI or UI
|
|
182
202
|
5. compare two runs or two suite batches
|
|
183
|
-
6. extend the setup with a named agent or repo-local
|
|
203
|
+
6. extend the setup with a named agent or custom tools from repo-local files or installed packages when needed
|
|
184
204
|
|
|
185
205
|
## Canonical Live HTTP Fixture
|
|
186
206
|
|
|
@@ -202,7 +222,7 @@ The `arl-test` scenarios are intended to behave like a real internal-team regres
|
|
|
202
222
|
`agentlab.config.yaml` is the public extension point for:
|
|
203
223
|
|
|
204
224
|
- named agents
|
|
205
|
-
- repo-local
|
|
225
|
+
- custom tools from repo-local files or installed npm packages
|
|
206
226
|
|
|
207
227
|
Supported agent providers:
|
|
208
228
|
|
|
@@ -215,6 +235,7 @@ Working sample assets already live in this repo:
|
|
|
215
235
|
|
|
216
236
|
- external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
|
|
217
237
|
- custom tool: `user_tools/findDuplicateCharge.ts`
|
|
238
|
+
- package-style tool examples: `examples/support-tools`, `examples/coding-tools`
|
|
218
239
|
- sample config: `agentlab.config.yaml`
|
|
219
240
|
|
|
220
241
|
See:
|
|
@@ -251,7 +272,7 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
|
|
|
251
272
|
## Limitations
|
|
252
273
|
|
|
253
274
|
- this is a local-first alpha, not a hosted platform
|
|
254
|
-
-
|
|
275
|
+
- the published package/example ecosystem is still small
|
|
255
276
|
- external agents integrate through the local stdin/stdout protocol only
|
|
256
277
|
- the UI is intentionally minimal and optimized for debugging
|
|
257
278
|
- SQLite-backed local storage still makes sequential live verification the safest path when reusing the same local artifacts DB
|
package/bin/agentlab.js
ADDED
package/dist/config.js
CHANGED
|
@@ -90,8 +90,10 @@ function validateToolRegistration(value) {
|
|
|
90
90
|
if (typeof value.name !== "string" || value.name.length === 0) {
|
|
91
91
|
throw new Error("Each tool registration must define a non-empty 'name'.");
|
|
92
92
|
}
|
|
93
|
-
|
|
94
|
-
|
|
93
|
+
const hasModulePath = typeof value.modulePath === "string" && value.modulePath.length > 0;
|
|
94
|
+
const hasPackage = typeof value.package === "string" && value.package.length > 0;
|
|
95
|
+
if ((hasModulePath ? 1 : 0) + (hasPackage ? 1 : 0) !== 1) {
|
|
96
|
+
throw new Error(`Tool '${value.name}' must define exactly one of 'modulePath' or 'package'.`);
|
|
95
97
|
}
|
|
96
98
|
if (typeof value.exportName !== "string" || value.exportName.length === 0) {
|
|
97
99
|
throw new Error(`Tool '${value.name}' must define a non-empty 'exportName'.`);
|
|
@@ -102,13 +104,15 @@ function validateToolRegistration(value) {
|
|
|
102
104
|
if (!isObject(value.inputSchema)) {
|
|
103
105
|
throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
|
|
104
106
|
}
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
107
|
+
if (hasModulePath) {
|
|
108
|
+
const resolved = resolve(value.modulePath);
|
|
109
|
+
const root = `${process.cwd()}${sep}`;
|
|
110
|
+
if (!(resolved === process.cwd() || resolved.startsWith(root))) {
|
|
111
|
+
throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
|
|
112
|
+
}
|
|
113
|
+
if (!exists(resolved)) {
|
|
114
|
+
throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
|
|
115
|
+
}
|
|
112
116
|
}
|
|
113
117
|
}
|
|
114
118
|
function validateAgentRegistration(value) {
|
package/dist/index.js
CHANGED
|
@@ -5,6 +5,7 @@ import { createAgentFactory } from "./agent/factory.js";
|
|
|
5
5
|
import { getAgentRegistration, getVariantSet } from "./config.js";
|
|
6
6
|
import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
|
|
7
7
|
import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
|
|
8
|
+
import { initProject } from "./init.js";
|
|
8
9
|
async function main() {
|
|
9
10
|
const [, , command, ...args] = process.argv;
|
|
10
11
|
switch (command) {
|
|
@@ -33,12 +34,16 @@ async function main() {
|
|
|
33
34
|
case "ui":
|
|
34
35
|
await handleUi();
|
|
35
36
|
break;
|
|
37
|
+
case "init":
|
|
38
|
+
await handleInit(args);
|
|
39
|
+
break;
|
|
36
40
|
default:
|
|
37
41
|
printUsage();
|
|
38
42
|
}
|
|
39
43
|
}
|
|
40
44
|
function printUsage() {
|
|
41
45
|
console.log(`Usage:
|
|
46
|
+
agentlab init <project-name>
|
|
42
47
|
agentlab list scenarios
|
|
43
48
|
agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
44
49
|
agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
@@ -64,6 +69,15 @@ async function handleList(args) {
|
|
|
64
69
|
console.log(`${scenario.id}\t${scenario.suite}\t${scenario.difficulty ?? "-"}\t${scenario.description ?? ""}`);
|
|
65
70
|
}
|
|
66
71
|
}
|
|
72
|
+
/**
 * CLI handler for `agentlab init <project-name>`.
 *
 * Takes the project name from the first positional argument. When it is
 * missing (or otherwise falsy), an error line and the usage line are written
 * to stderr and the process exits with status 1. Otherwise the work is
 * delegated to `initProject`.
 *
 * @param {string[]} args - Arguments that follow the `init` command.
 * @returns {Promise<void>}
 */
async function handleInit(args) {
    const [projectName] = args;
    const isMissingName = !projectName;
    if (isMissingName) {
        const complaints = [
            "Error: project-name is required.",
            "Usage: agentlab init <project-name>",
        ];
        for (const line of complaints) {
            console.error(line);
        }
        process.exit(1);
    }
    await initProject(projectName);
}
|
|
67
81
|
async function handleRun(args) {
|
|
68
82
|
const parsed = parseRunArgs(args);
|
|
69
83
|
const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
|
package/dist/init.js
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
const SAMPLE_SCENARIO = `id: sample.hello-world
|
|
4
|
+
name: Hello World Sample
|
|
5
|
+
suite: sample
|
|
6
|
+
description: A minimal example to verify your setup.
|
|
7
|
+
difficulty: easy
|
|
8
|
+
tags:
|
|
9
|
+
- smoke
|
|
10
|
+
- sample
|
|
11
|
+
task:
|
|
12
|
+
instructions: |
|
|
13
|
+
Say hello to the user and confirm the system is working.
|
|
14
|
+
context:
|
|
15
|
+
user_name: Alice
|
|
16
|
+
tools:
|
|
17
|
+
allowed: []
|
|
18
|
+
runtime:
|
|
19
|
+
max_steps: 5
|
|
20
|
+
evaluators:
|
|
21
|
+
- id: greeting-output
|
|
22
|
+
type: final_answer_contains
|
|
23
|
+
mode: hard_gate
|
|
24
|
+
config:
|
|
25
|
+
required_substrings:
|
|
26
|
+
- "Hello"
|
|
27
|
+
`;
|
|
28
|
+
const SAMPLE_FIXTURE = `{
|
|
29
|
+
"users": [
|
|
30
|
+
{ "id": "user_001", "name": "Alice", "email": "alice@example.com" }
|
|
31
|
+
]
|
|
32
|
+
}
|
|
33
|
+
`;
|
|
34
|
+
const SAMPLE_CONFIG = `# Agent Regression Lab Configuration
|
|
35
|
+
# Docs: https://github.com/YakshithK/agent-regression-lab#readme
|
|
36
|
+
|
|
37
|
+
agents:
|
|
38
|
+
- name: mock-default
|
|
39
|
+
provider: mock
|
|
40
|
+
label: mock-default
|
|
41
|
+
|
|
42
|
+
# Uncomment and configure to test with OpenAI:
|
|
43
|
+
# - name: openai-test
|
|
44
|
+
# provider: openai
|
|
45
|
+
# model: gpt-4o-mini
|
|
46
|
+
# label: openai-test
|
|
47
|
+
|
|
48
|
+
# Tools can be registered from either:
|
|
49
|
+
# 1. repo-local files
|
|
50
|
+
# 2. installed npm packages
|
|
51
|
+
#
|
|
52
|
+
# tools:
|
|
53
|
+
# - name: my.local_tool
|
|
54
|
+
# modulePath: ./tools/customTool.ts
|
|
55
|
+
# exportName: customTool
|
|
56
|
+
# description: My repo-local custom tool.
|
|
57
|
+
# inputSchema:
|
|
58
|
+
# type: object
|
|
59
|
+
#
|
|
60
|
+
# - name: support.find_duplicate_charge
|
|
61
|
+
# package: "@agentlab/example-support-tools"
|
|
62
|
+
# exportName: findDuplicateCharge
|
|
63
|
+
# description: Find the duplicated charge order id for a given customer.
|
|
64
|
+
# inputSchema:
|
|
65
|
+
# type: object
|
|
66
|
+
`;
|
|
67
|
+
/**
 * Scaffolds a new project directory beneath the current working directory.
 *
 * Creates `<projectName>/scenarios/sample` and `<projectName>/fixtures`,
 * writes the sample scenario, the sample fixture and the sample
 * `agentlab.config.yaml`, then prints the follow-up commands to stdout.
 *
 * @param {string} projectName - Directory name, joined onto `process.cwd()`.
 * @returns {Promise<void>}
 * @throws {Error} When the target directory already exists.
 */
export async function initProject(projectName) {
    const projectRoot = join(process.cwd(), projectName);
    if (existsSync(projectRoot)) {
        throw new Error(`Directory '${projectName}' already exists.`);
    }

    // Directory layout, parents listed before children.
    const directories = [
        projectRoot,
        join(projectRoot, "scenarios"),
        join(projectRoot, "scenarios", "sample"),
        join(projectRoot, "fixtures"),
    ];
    for (const directory of directories) {
        mkdirSync(directory, { recursive: true });
    }

    // Starter files as [destination, contents] pairs.
    const starterFiles = [
        [join(projectRoot, "scenarios", "sample", "hello-world.yaml"), SAMPLE_SCENARIO],
        [join(projectRoot, "fixtures", "users.json"), SAMPLE_FIXTURE],
        [join(projectRoot, "agentlab.config.yaml"), SAMPLE_CONFIG],
    ];
    for (const [destination, contents] of starterFiles) {
        writeFileSync(destination, contents);
    }

    // Summary and next steps, one console.log call per line.
    const report = [
        `Created '${projectName}' with sample scenario.`,
        "",
        "Next steps:",
        ` cd ${projectName}`,
        " npm install @agentlab/example-support-tools",
        " # then register package-backed tools in agentlab.config.yaml if needed",
        " agentlab run sample.hello-world --agent mock-default",
    ];
    for (const line of report) {
        console.log(line);
    }
}
|
package/dist/tools.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { readFileSync } from "node:fs";
|
|
2
|
+
import { createRequire } from "node:module";
|
|
2
3
|
import { pathToFileURL } from "node:url";
|
|
3
4
|
import { resolve } from "node:path";
|
|
4
5
|
import { loadAgentLabConfig } from "./config.js";
|
|
@@ -384,8 +385,7 @@ async function loadTools() {
|
|
|
384
385
|
return merged;
|
|
385
386
|
}
|
|
386
387
|
async function loadConfiguredTool(tool) {
|
|
387
|
-
const
|
|
388
|
-
const module = await import(moduleUrl);
|
|
388
|
+
const module = tool.package ? await importConfiguredPackageTool(tool) : await importConfiguredFileTool(tool);
|
|
389
389
|
const candidate = module[tool.exportName];
|
|
390
390
|
if (typeof candidate !== "function") {
|
|
391
391
|
throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
|
|
@@ -399,6 +399,22 @@ async function loadConfiguredTool(tool) {
|
|
|
399
399
|
handler: candidate,
|
|
400
400
|
};
|
|
401
401
|
}
|
|
402
|
+
/**
 * Dynamically imports the module behind a file-based tool registration.
 *
 * `tool.modulePath` is passed through `path.resolve`, so a relative path is
 * taken from the current working directory. The result is converted to a
 * `file://` URL before being handed to `import()`.
 *
 * @param {{ modulePath: string }} tool - Tool registration with a module path.
 * @returns {Promise<object>} The imported module namespace.
 */
async function importConfiguredFileTool(tool) {
    const absolutePath = resolve(tool.modulePath);
    const { href } = pathToFileURL(absolutePath);
    const loadedModule = await import(href);
    return loadedModule;
}
|
|
406
|
+
/**
 * Dynamically imports the module behind a package-based tool registration.
 *
 * The package specifier is resolved as if it were required from a
 * `package.json` in the current working directory, so the lookup starts in
 * the user's project rather than in this library's install location. The
 * resolved path is converted to a `file://` URL and loaded with `import()`.
 *
 * NOTE(review): resolution goes through `require.resolve`, i.e. CommonJS
 * resolution. A package whose `exports` map only offers an `import`
 * condition will presumably fail to resolve here. Confirm whether ESM-only
 * tool packages need to be supported.
 *
 * @param {{ name: string, package: string }} tool - Tool registration naming a package.
 * @returns {Promise<object>} The imported module namespace.
 * @throws {Error} When the package cannot be resolved or imported. The
 *   message names the tool and the package, and the underlying failure is
 *   attached as `cause`.
 */
async function importConfiguredPackageTool(tool) {
    try {
        const requireFromCwd = createRequire(resolve(process.cwd(), "package.json"));
        const resolved = requireFromCwd.resolve(tool.package);
        const moduleUrl = pathToFileURL(resolved).href;
        return (await import(moduleUrl));
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        // Keep the original error reachable so callers can still read its
        // `code` (e.g. MODULE_NOT_FOUND) and stack.
        throw new Error(`Tool '${tool.name}' failed to load package '${tool.package}': ${message}`, { cause: error });
    }
}
|
|
402
418
|
function assertObject(value) {
|
|
403
419
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
404
420
|
throw new Error("Tool input must be an object.");
|
package/dist/ui/App.js
CHANGED
|
@@ -21,7 +21,8 @@ function RunListPage() {
|
|
|
21
21
|
.then((response) => response.json())
|
|
22
22
|
.then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
23
23
|
}, [suite, status, provider]);
|
|
24
|
-
|
|
24
|
+
const stats = summarizeRuns(runs);
|
|
25
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), runs.length > 0 ? (_jsxs("div", { className: "stats dashboard-stats", children: [_jsx(Stat, { label: "Runs shown", value: stats.total }), _jsx(Stat, { label: "Passing", value: _jsx("span", { className: "pass-text", children: stats.pass }) }), _jsx(Stat, { label: "Failing", value: _jsx("span", { className: "fail-text", children: stats.fail }) }), _jsx(Stat, { label: "Errors", value: _jsx("span", { className: "error-text", children: stats.error }) }), _jsx(Stat, { label: "Latest suite", value: stats.latestSuite }), _jsx(Stat, { label: "Latest provider", value: stats.latestProvider })] })) : null, _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? 
(_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
|
|
25
26
|
runs[index - 1].suite === run.suite &&
|
|
26
27
|
runs[index - 1].suiteBatchId &&
|
|
27
28
|
run.suiteBatchId &&
|
|
@@ -37,14 +38,14 @@ function RunDetailPage(props) {
|
|
|
37
38
|
if (!detail) {
|
|
38
39
|
return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
|
|
39
40
|
}
|
|
40
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("
|
|
41
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => (_jsxs("li", { className: "timeline-item", children: [_jsxs("div", { className: "timeline-head", children: [_jsxs("span", { className: "timeline-step", children: ["Step ", event.stepIndex] }), _jsx("span", { className: "event-chip", children: formatEventLabel(event.type) }), _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
41
42
|
}
|
|
42
43
|
export function FailureSummaryPanel(props) {
|
|
43
44
|
const failureItems = getFailureSummaryItems(props.detail);
|
|
44
45
|
if (failureItems.length === 0) {
|
|
45
46
|
return null;
|
|
46
47
|
}
|
|
47
|
-
return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
|
|
48
|
+
return (_jsxs("section", { className: "panel failure-panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
|
|
48
49
|
}
|
|
49
50
|
export function RunIdentitySummary(props) {
|
|
50
51
|
const run = props.detail.run;
|
|
@@ -70,10 +71,10 @@ function ComparePage(props) {
|
|
|
70
71
|
if (!data) {
|
|
71
72
|
return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
|
|
72
73
|
}
|
|
73
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.
|
|
74
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsx(ComparisonHero, { comparison: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel emphasis-panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.evaluatorId }), diff.hardGate ? _jsx("span", { className: "event-chip", children: "hard gate" }) : null] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." 
}) : null, _jsx("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.toolName }), _jsx("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
|
|
74
75
|
}
|
|
75
76
|
function RunSide(props) {
|
|
76
|
-
return (_jsxs("section", { className:
|
|
77
|
+
return (_jsxs("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { className: "timeline-item compact-item", children: _jsxs("strong", { children: [event.stepIndex, ". 
", formatEventLabel(event.type)] }) }, event.eventId))) })] }));
|
|
77
78
|
}
|
|
78
79
|
function SuiteComparePage(props) {
|
|
79
80
|
const [data, setData] = useState(null);
|
|
@@ -95,10 +96,10 @@ function SuiteComparePage(props) {
|
|
|
95
96
|
if (!data) {
|
|
96
97
|
return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
|
|
97
98
|
}
|
|
98
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
|
|
99
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsx(SuiteComparisonHero, { data: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
|
|
99
100
|
}
|
|
100
101
|
/**
 * Panel listing the scenarios that moved in one direction (for example
 * regressions or improvements), each with a link to the run-level comparison.
 *
 * @param {object} props
 * @param {*} props.title - Heading rendered at the top of the panel.
 * @param {Array<object>} props.items - Entries exposing `scenarioId` and
 *   `comparison` (`classification`, `baseline.run.id`, `candidate.run.id`).
 * @returns {*} A `section.panel` element.
 */
function ScenarioList(props) {
    const cards = props.items.map((item) => {
        const { comparison } = item;
        // Percent-encode the ids so characters such as "&", "#" or spaces
        // cannot truncate or corrupt the query string of the compare link.
        const baselineId = encodeURIComponent(comparison.baseline.run.id);
        const candidateId = encodeURIComponent(comparison.candidate.run.id);
        const compareHref = `/compare?baseline=${baselineId}&candidate=${candidateId}`;
        const head = _jsxs("div", {
            className: "diff-card-head",
            children: [
                _jsx("strong", { children: item.scenarioId }),
                " ",
                _jsx("span", { className: "muted", children: comparison.classification }),
            ],
        });
        const link = _jsx("div", {
            children: _jsx("a", { href: compareHref, children: "open run compare" }),
        });
        return _jsxs("li", { className: "diff-card", children: [head, link] }, item.scenarioId);
    });
    // The "None." note is rendered in addition to the (empty) list.
    const emptyNote = props.items.length === 0
        ? _jsx("p", { className: "muted", children: "None." })
        : null;
    return (_jsxs("section", {
        className: "panel",
        children: [
            _jsx("h2", { children: props.title }),
            emptyNote,
            _jsx("ul", { className: "stack diff-list", children: cards }),
        ],
    }));
}
|
|
103
104
|
function Stat(props) {
|
|
104
105
|
return (_jsxs("div", { className: "stat", children: [_jsx("div", { className: "muted", children: props.label }), _jsx("div", { className: "stat-value", children: props.value })] }));
|
|
@@ -106,6 +107,13 @@ function Stat(props) {
|
|
|
106
107
|
/**
 * Placeholder section for views that have nothing to show.
 *
 * @param {object} props
 * @param {*} props.title - Content of the heading.
 * @param {*} props.description - Content of the paragraph under the heading.
 * @returns {*} A `section.empty` element wrapping an `h1` and a `p`.
 */
function EmptyState(props) {
    const heading = _jsx("h1", { children: props.title });
    const copy = _jsx("p", { children: props.description });
    return _jsxs("section", { className: "empty", children: [heading, copy] });
}
|
|
110
|
+
/**
 * Headline panel for a single run comparison: shows the classification, the
 * verdict delta as a pill, and a one-line summary of output/termination change.
 *
 * @param {object} props
 * @param {object} props.comparison - Reads `classification`, `verdictDelta`,
 *   `outputChanged` and the optional `terminationDelta`.
 * @returns {*} A `section` element whose class list carries the tone derived
 *   from the classification.
 */
export function ComparisonHero(props) {
    const tone = mapClassificationToTone(props.comparison.classification);
    const head = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: props.comparison.classification }),
            _jsx("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta }),
        ],
    });
    const outputLabel = props.comparison.outputChanged ? "yes" : "no";
    // The termination suffix is appended only when a delta is present.
    const terminationSuffix = props.comparison.terminationDelta
        ? ` • termination: ${props.comparison.terminationDelta}`
        : "";
    const summary = _jsxs("p", {
        className: "muted",
        children: ["Output changed: ", outputLabel, terminationSuffix],
    });
    return _jsxs("section", { className: `panel compare-hero ${tone}`, children: [head, summary] });
}
|
|
114
|
+
/**
 * Summary panel for a suite comparison: a "Suite movement" heading with the
 * suite classification as a chip, followed by counts of regressed, improved
 * and unchanged scenarios.
 *
 * @param {object} props
 * @param {object} props.data - Reads `classification` plus the `length` of
 *   `regressions`, `improvements` and `unchanged`.
 * @returns {*} A `section` element with the fixed "neutral" tone class.
 */
export function SuiteComparisonHero(props) {
    const head = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: "Suite movement" }),
            _jsx("span", { className: "event-chip", children: props.data.classification }),
        ],
    });
    const regressionStat = _jsx(Stat, { label: "Regressions", value: props.data.regressions.length });
    const improvementStat = _jsx(Stat, { label: "Improvements", value: props.data.improvements.length });
    const unchangedStat = _jsx(Stat, { label: "Unchanged", value: props.data.unchanged.length });
    const counts = _jsxs("div", {
        className: "stats compact-stats",
        children: [regressionStat, improvementStat, unchangedStat],
    });
    return _jsxs("section", { className: "panel compare-hero neutral", children: [head, counts] });
}
|
|
109
117
|
export function getFailureSummaryItems(detail) {
|
|
110
118
|
const items = [];
|
|
111
119
|
if (detail.errorDetail) {
|
|
@@ -121,6 +129,40 @@ export function getFailureSummaryItems(detail) {
|
|
|
121
129
|
}
|
|
122
130
|
return items;
|
|
123
131
|
}
|
|
132
|
+
/**
 * Build headline numbers for a list of runs.
 *
 * @param {Array<object>} runs - Run records; `status`, `suite` and `provider`
 *   are read.
 * @returns {{ total: number, pass: number, fail: number, error: number, latestSuite: *, latestProvider: * }}
 *   Total count, per-status counts for "pass" / "fail" / "error", and the
 *   suite and provider of the first run ("-" when absent).
 */
export function summarizeRuns(runs) {
    // Tally the three tracked statuses in one pass; any other status is ignored.
    const tally = runs.reduce((acc, run) => {
        switch (run.status) {
            case "pass":
                acc.pass += 1;
                break;
            case "fail":
                acc.fail += 1;
                break;
            case "error":
                acc.error += 1;
                break;
            default:
                break;
        }
        return acc;
    }, { pass: 0, fail: 0, error: 0 });
    const first = runs[0];
    return {
        total: runs.length,
        pass: tally.pass,
        fail: tally.fail,
        error: tally.error,
        latestSuite: first?.suite ?? "-",
        latestProvider: first?.provider ?? "-",
    };
}
|
|
142
|
+
/**
 * Turn an event type identifier into a display label by replacing every
 * underscore with a space.
 *
 * @param {string} type - Event type identifier.
 * @returns {string} The same text with "_" swapped for " ".
 */
function formatEventLabel(type) {
    const words = type.split("_");
    return words.join(" ");
}
|
|
145
|
+
/**
 * Choose the pill class name for a risk level.
 *
 * @param {*} risk - Risk level; only "high" and "medium" are special-cased.
 * @returns {string} "fail" for "high", "error" for "medium", otherwise "pass".
 */
function mapRiskToPill(risk) {
    switch (risk) {
        case "high":
            return "fail";
        case "medium":
            return "error";
        default:
            return "pass";
    }
}
|
|
154
|
+
/**
 * Map a comparison classification to a tone class name.
 *
 * Matching is by substring, checked in priority order: "regress" gives "fail",
 * "improv" gives "pass", "changed" gives "error"; anything else is "neutral".
 *
 * @param {string} classification - Comparison classification label.
 * @returns {string} One of "fail", "pass", "error" or "neutral".
 */
function mapClassificationToTone(classification) {
    if (classification.includes("regress")) {
        return "fail";
    }
    if (classification.includes("improv")) {
        return "pass";
    }
    // "unchanged" contains "changed" as a substring, so it has to be ruled out
    // first; otherwise an unchanged comparison would be given the "error" tone.
    if (classification.includes("unchanged")) {
        return "neutral";
    }
    if (classification.includes("changed")) {
        return "error";
    }
    return "neutral";
}
|
|
124
166
|
/**
 * Format a delta with an explicit sign: positive values get a leading "+",
 * everything else is stringified unchanged (negative numbers already carry
 * their "-").
 *
 * @param {*} value - Value to format, normally a number.
 * @returns {string} The formatted text.
 */
function signed(value) {
    const prefix = value > 0 ? "+" : "";
    return `${prefix}${value}`;
}
|