agent-regression-lab 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -29,7 +29,7 @@ This is a local-first alpha for early technical teams. It is strongest when used
29
29
  ## What It Supports Today
30
30
 
31
31
  - YAML scenarios under `scenarios/`
32
- - deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
32
+ - deterministic built-in tools plus custom tools from `agentlab.config.yaml`
33
33
  - named agents from `agentlab.config.yaml`
34
34
  - built-in `mock`, `openai`, `external_process`, and `http` agent modes
35
35
  - `type: conversation` multi-turn dialog scenarios for HTTP agents
@@ -46,6 +46,26 @@ Use this as the default product story:
46
46
  3. run curated golden suites before release
47
47
  4. keep incident-derived scenarios as permanent regression assets
48
48
 
49
+ ## Start Here
50
+
51
+ If your agent runs as an HTTP service:
52
+
53
+ - use `provider: http`
54
+ - start with [arl-test](arl-test)
55
+ - read [docs/agents.md](docs/agents.md) and [docs/scenarios.md](docs/scenarios.md)
56
+
57
+ If you are validating coding-agent changes:
58
+
59
+ - start with the coding scenarios under `scenarios/coding/`
60
+ - read [docs/coding-agents.md](docs/coding-agents.md)
61
+ - use deterministic tool-loop runs first, then compare before/after behavior
62
+
63
+ If you want pre-merge regression checks in CI:
64
+
65
+ - use `suite_definitions`
66
+ - start with `.github/workflows/agentlab-pre-merge.yml`
67
+ - run `agentlab run --suite-def pre_merge --agent mock-default`
68
+
49
69
  ## First 10 Minutes
50
70
 
51
71
  The fastest path is to run the CLI from a local checkout.
@@ -180,7 +200,7 @@ Use this as the default mental model:
180
200
  3. note the run id or suite batch id
181
201
  4. inspect the run in CLI or UI
182
202
  5. compare two runs or two suite batches
183
- 6. extend the setup with a named agent or repo-local tool when needed
203
+ 6. extend the setup with a named agent or custom tools from repo-local files or installed packages when needed
184
204
 
185
205
  ## Canonical Live HTTP Fixture
186
206
 
@@ -202,7 +222,7 @@ The `arl-test` scenarios are intended to behave like a real internal-team regres
202
222
  `agentlab.config.yaml` is the public extension point for:
203
223
 
204
224
  - named agents
205
- - repo-local custom tools
225
+ - custom tools from repo-local files or installed npm packages
206
226
 
207
227
  Supported agent providers:
208
228
 
@@ -215,6 +235,7 @@ Working sample assets already live in this repo:
215
235
 
216
236
  - external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
217
237
  - custom tool: `user_tools/findDuplicateCharge.ts`
238
+ - package-style tool examples: `examples/support-tools`, `examples/coding-tools`
218
239
  - sample config: `agentlab.config.yaml`
219
240
 
220
241
  See:
@@ -251,7 +272,7 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
251
272
  ## Limitations
252
273
 
253
274
  - this is a local-first alpha, not a hosted platform
254
- - custom tool loading is limited to repo-local module paths
275
+ - the published package/example ecosystem is still small
255
276
  - external agents integrate through the local stdin/stdout protocol only
256
277
  - the UI is intentionally minimal and optimized for debugging
257
278
  - SQLite-backed local storage still makes sequential live verification the safest path when reusing the same local artifacts DB
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ import "../dist/index.js";
package/dist/config.js CHANGED
@@ -90,8 +90,10 @@ function validateToolRegistration(value) {
90
90
  if (typeof value.name !== "string" || value.name.length === 0) {
91
91
  throw new Error("Each tool registration must define a non-empty 'name'.");
92
92
  }
93
- if (typeof value.modulePath !== "string" || value.modulePath.length === 0) {
94
- throw new Error(`Tool '${value.name}' must define a non-empty 'modulePath'.`);
93
+ const hasModulePath = typeof value.modulePath === "string" && value.modulePath.length > 0;
94
+ const hasPackage = typeof value.package === "string" && value.package.length > 0;
95
+ if ((hasModulePath ? 1 : 0) + (hasPackage ? 1 : 0) !== 1) {
96
+ throw new Error(`Tool '${value.name}' must define exactly one of 'modulePath' or 'package'.`);
95
97
  }
96
98
  if (typeof value.exportName !== "string" || value.exportName.length === 0) {
97
99
  throw new Error(`Tool '${value.name}' must define a non-empty 'exportName'.`);
@@ -102,13 +104,15 @@ function validateToolRegistration(value) {
102
104
  if (!isObject(value.inputSchema)) {
103
105
  throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
104
106
  }
105
- const resolved = resolve(value.modulePath);
106
- const root = `${process.cwd()}${sep}`;
107
- if (!(resolved === process.cwd() || resolved.startsWith(root))) {
108
- throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
109
- }
110
- if (!exists(resolved)) {
111
- throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
107
+ if (hasModulePath) {
108
+ const resolved = resolve(value.modulePath);
109
+ const root = `${process.cwd()}${sep}`;
110
+ if (!(resolved === process.cwd() || resolved.startsWith(root))) {
111
+ throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
112
+ }
113
+ if (!exists(resolved)) {
114
+ throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
115
+ }
112
116
  }
113
117
  }
114
118
  function validateAgentRegistration(value) {
package/dist/index.js CHANGED
@@ -5,6 +5,7 @@ import { createAgentFactory } from "./agent/factory.js";
5
5
  import { getAgentRegistration, getVariantSet } from "./config.js";
6
6
  import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
7
7
  import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
8
+ import { initProject } from "./init.js";
8
9
  async function main() {
9
10
  const [, , command, ...args] = process.argv;
10
11
  switch (command) {
@@ -33,12 +34,16 @@ async function main() {
33
34
  case "ui":
34
35
  await handleUi();
35
36
  break;
37
+ case "init":
38
+ await handleInit(args);
39
+ break;
36
40
  default:
37
41
  printUsage();
38
42
  }
39
43
  }
40
44
  function printUsage() {
41
45
  console.log(`Usage:
46
+ agentlab init <project-name>
42
47
  agentlab list scenarios
43
48
  agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
44
49
  agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
@@ -64,6 +69,15 @@ async function handleList(args) {
64
69
  console.log(`${scenario.id}\t${scenario.suite}\t${scenario.difficulty ?? "-"}\t${scenario.description ?? ""}`);
65
70
  }
66
71
  }
72
/**
 * CLI handler for `agentlab init <project-name>`.
 *
 * Takes the project name from the first positional argument. When it is
 * missing, an error line and a usage line are written to stderr and the
 * process exits with status 1; otherwise the name is handed to `initProject`.
 *
 * @param {string[]} args - Positional arguments that followed the `init` command.
 * @returns {Promise<void>} Settles once `initProject` has finished.
 */
async function handleInit(args) {
    const [projectName] = args;
    if (!projectName) {
        const complaints = [
            "Error: project-name is required.",
            "Usage: agentlab init <project-name>",
        ];
        for (const complaint of complaints) {
            console.error(complaint);
        }
        process.exit(1);
    }
    await initProject(projectName);
}
67
81
  async function handleRun(args) {
68
82
  const parsed = parseRunArgs(args);
69
83
  const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
package/dist/init.js ADDED
@@ -0,0 +1,88 @@
1
+ import { existsSync, mkdirSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ const SAMPLE_SCENARIO = `id: sample.hello-world
4
+ name: Hello World Sample
5
+ suite: sample
6
+ description: A minimal example to verify your setup.
7
+ difficulty: easy
8
+ tags:
9
+ - smoke
10
+ - sample
11
+ task:
12
+ instructions: |
13
+ Say hello to the user and confirm the system is working.
14
+ context:
15
+ user_name: Alice
16
+ tools:
17
+ allowed: []
18
+ runtime:
19
+ max_steps: 5
20
+ evaluators:
21
+ - id: greeting-output
22
+ type: final_answer_contains
23
+ mode: hard_gate
24
+ config:
25
+ required_substrings:
26
+ - "Hello"
27
+ `;
28
+ const SAMPLE_FIXTURE = `{
29
+ "users": [
30
+ { "id": "user_001", "name": "Alice", "email": "alice@example.com" }
31
+ ]
32
+ }
33
+ `;
34
+ const SAMPLE_CONFIG = `# Agent Regression Lab Configuration
35
+ # Docs: https://github.com/YakshithK/agent-regression-lab#readme
36
+
37
+ agents:
38
+ - name: mock-default
39
+ provider: mock
40
+ label: mock-default
41
+
42
+ # Uncomment and configure to test with OpenAI:
43
+ # - name: openai-test
44
+ # provider: openai
45
+ # model: gpt-4o-mini
46
+ # label: openai-test
47
+
48
+ # Tools can be registered from either:
49
+ # 1. repo-local files
50
+ # 2. installed npm packages
51
+ #
52
+ # tools:
53
+ # - name: my.local_tool
54
+ # modulePath: ./tools/customTool.ts
55
+ # exportName: customTool
56
+ # description: My repo-local custom tool.
57
+ # inputSchema:
58
+ # type: object
59
+ #
60
+ # - name: support.find_duplicate_charge
61
+ # package: "@agentlab/example-support-tools"
62
+ # exportName: findDuplicateCharge
63
+ # description: Find the duplicated charge order id for a given customer.
64
+ # inputSchema:
65
+ # type: object
66
+ `;
67
/**
 * Scaffold a new project directory beneath the current working directory.
 *
 * Creates `<projectName>/scenarios/sample/` and `<projectName>/fixtures/`,
 * writes the sample scenario, sample fixture and sample config held in the
 * module-level `SAMPLE_*` constants, then prints follow-up instructions to
 * stdout.
 *
 * @param {string} projectName - Directory name, joined onto `process.cwd()`.
 * @returns {Promise<void>}
 * @throws {Error} When the target directory already exists.
 */
export async function initProject(projectName) {
    const projectRoot = join(process.cwd(), projectName);
    if (existsSync(projectRoot)) {
        throw new Error(`Directory '${projectName}' already exists.`);
    }
    const scenariosDir = join(projectRoot, "scenarios");
    const sampleSuiteDir = join(scenariosDir, "sample");
    const fixturesDir = join(projectRoot, "fixtures");
    // Directory layout, parents first.
    for (const dir of [projectRoot, scenariosDir, sampleSuiteDir, fixturesDir]) {
        mkdirSync(dir, { recursive: true });
    }
    // Starter files: [destination, contents].
    const starterFiles = [
        [join(sampleSuiteDir, "hello-world.yaml"), SAMPLE_SCENARIO],
        [join(fixturesDir, "users.json"), SAMPLE_FIXTURE],
        [join(projectRoot, "agentlab.config.yaml"), SAMPLE_CONFIG],
    ];
    for (const [destination, contents] of starterFiles) {
        writeFileSync(destination, contents);
    }
    const guidance = [
        `Created '${projectName}' with sample scenario.`,
        "",
        "Next steps:",
        `  cd ${projectName}`,
        "  npm install @agentlab/example-support-tools",
        "  # then register package-backed tools in agentlab.config.yaml if needed",
        "  agentlab run sample.hello-world --agent mock-default",
    ];
    for (const line of guidance) {
        console.log(line);
    }
}
package/dist/tools.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import { readFileSync } from "node:fs";
2
+ import { createRequire } from "node:module";
2
3
  import { pathToFileURL } from "node:url";
3
4
  import { resolve } from "node:path";
4
5
  import { loadAgentLabConfig } from "./config.js";
@@ -384,8 +385,7 @@ async function loadTools() {
384
385
  return merged;
385
386
  }
386
387
  async function loadConfiguredTool(tool) {
387
- const moduleUrl = pathToFileURL(resolve(tool.modulePath)).href;
388
- const module = await import(moduleUrl);
388
+ const module = tool.package ? await importConfiguredPackageTool(tool) : await importConfiguredFileTool(tool);
389
389
  const candidate = module[tool.exportName];
390
390
  if (typeof candidate !== "function") {
391
391
  throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
@@ -399,6 +399,22 @@ async function loadConfiguredTool(tool) {
399
399
  handler: candidate,
400
400
  };
401
401
  }
402
/**
 * Load the module behind a tool registered with `modulePath`.
 *
 * The path is resolved with `path.resolve` (so a relative path is taken
 * against the current working directory), converted to a `file://` URL and
 * loaded with dynamic `import()`.
 *
 * @param {{ modulePath: string }} tool - Tool registration carrying a file path.
 * @returns {Promise<object>} The imported module namespace.
 */
async function importConfiguredFileTool(tool) {
    const absolutePath = resolve(tool.modulePath);
    const { href } = pathToFileURL(absolutePath);
    const loadedModule = await import(href);
    return loadedModule;
}
406
/**
 * Load the module behind a tool registered with `package`.
 *
 * The package specifier is resolved as if required from a `package.json`
 * sitting in the current working directory, so the lookup walks the
 * project's own `node_modules` rather than the ones next to this file.
 * The resolved entry file is then loaded with dynamic `import()`.
 *
 * @param {{ name: string, package: string }} tool - Tool registration naming an installed package.
 * @returns {Promise<object>} The imported module namespace.
 * @throws {Error} When resolution or import fails. The message names the tool
 *   and the package, and the underlying failure is attached as `cause` so its
 *   stack and error code are not lost.
 */
async function importConfiguredPackageTool(tool) {
    try {
        // The anchor path only serves as the resolution base directory.
        const requireFromCwd = createRequire(resolve(process.cwd(), "package.json"));
        // NOTE(review): require.resolve follows CommonJS resolution rules; a package
        // that only exposes an "import" export condition may fail here — confirm
        // whether ESM-only tool packages need to be supported.
        const entryPath = requireFromCwd.resolve(tool.package);
        return await import(pathToFileURL(entryPath).href);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        throw new Error(`Tool '${tool.name}' failed to load package '${tool.package}': ${message}`, { cause: error });
    }
}
402
418
  function assertObject(value) {
403
419
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
404
420
  throw new Error("Tool input must be an object.");
package/dist/ui/App.js CHANGED
@@ -21,7 +21,8 @@ function RunListPage() {
21
21
  .then((response) => response.json())
22
22
  .then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
23
23
  }, [suite, status, provider]);
24
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? (_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? 
"" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
24
+ const stats = summarizeRuns(runs);
25
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), runs.length > 0 ? (_jsxs("div", { className: "stats dashboard-stats", children: [_jsx(Stat, { label: "Runs shown", value: stats.total }), _jsx(Stat, { label: "Passing", value: _jsx("span", { className: "pass-text", children: stats.pass }) }), _jsx(Stat, { label: "Failing", value: _jsx("span", { className: "fail-text", children: stats.fail }) }), _jsx(Stat, { label: "Errors", value: _jsx("span", { className: "error-text", children: stats.error }) }), _jsx(Stat, { label: "Latest suite", value: stats.latestSuite }), _jsx(Stat, { label: "Latest provider", value: stats.latestProvider })] })) : null, _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? 
(_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
25
26
  runs[index - 1].suite === run.suite &&
26
27
  runs[index - 1].suiteBatchId &&
27
28
  run.suiteBatchId &&
@@ -37,14 +38,14 @@ function RunDetailPage(props) {
37
38
  if (!detail) {
38
39
  return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
39
40
  }
40
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
41
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => (_jsxs("li", { className: "timeline-item", children: [_jsxs("div", { className: "timeline-head", children: [_jsxs("span", { className: "timeline-step", children: ["Step ", event.stepIndex] }), _jsx("span", { className: "event-chip", children: formatEventLabel(event.type) }), _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
41
42
  }
42
43
  export function FailureSummaryPanel(props) {
43
44
  const failureItems = getFailureSummaryItems(props.detail);
44
45
  if (failureItems.length === 0) {
45
46
  return null;
46
47
  }
47
- return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
48
+ return (_jsxs("section", { className: "panel failure-panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
48
49
  }
49
50
  export function RunIdentitySummary(props) {
50
51
  const run = props.detail.run;
@@ -70,10 +71,10 @@ function ComparePage(props) {
70
71
  if (!data) {
71
72
  return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
72
73
  }
73
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.note, diff.hardGate ? " (hard gate)" : ""] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." }) : null, _jsx("ul", { className: "stack", children: data.toolDiffs.map((diff) => (_jsx("li", { children: diff.note }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
74
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsx(ComparisonHero, { comparison: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel emphasis-panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.evaluatorId }), diff.hardGate ? _jsx("span", { className: "event-chip", children: "hard gate" }) : null] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." 
}) : null, _jsx("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.toolName }), _jsx("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
74
75
  }
75
76
  function RunSide(props) {
76
- return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { children: _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }) }, event.eventId))) })] }));
77
+ return (_jsxs("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { className: "timeline-item compact-item", children: _jsxs("strong", { children: [event.stepIndex, ". 
", formatEventLabel(event.type)] }) }, event.eventId))) })] }));
77
78
  }
78
79
  function SuiteComparePage(props) {
79
80
  const [data, setData] = useState(null);
@@ -95,10 +96,10 @@ function SuiteComparePage(props) {
95
96
  if (!data) {
96
97
  return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
97
98
  }
98
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
99
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsx(SuiteComparisonHero, { data: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
99
100
  }
100
101
  function ScenarioList(props) {
101
- return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack", children: props.items.map((item) => (_jsxs("li", { children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
102
+ return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack diff-list", children: props.items.map((item) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification })] }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
102
103
  }
103
104
  function Stat(props) {
104
105
  return (_jsxs("div", { className: "stat", children: [_jsx("div", { className: "muted", children: props.label }), _jsx("div", { className: "stat-value", children: props.value })] }));
@@ -106,6 +107,13 @@ function Stat(props) {
106
107
  function EmptyState(props) {
107
108
  return (_jsxs("section", { className: "empty", children: [_jsx("h1", { children: props.title }), _jsx("p", { children: props.description })] }));
108
109
  }
110
+ export function ComparisonHero(props) {
111
+ const tone = mapClassificationToTone(props.comparison.classification);
112
+ return (_jsxs("section", { className: `panel compare-hero ${tone}`, children: [_jsxs("div", { className: "compare-hero-head", children: [_jsx("h2", { children: props.comparison.classification }), _jsx("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta })] }), _jsxs("p", { className: "muted", children: ["Output changed: ", props.comparison.outputChanged ? "yes" : "no", props.comparison.terminationDelta ? ` • termination: ${props.comparison.terminationDelta}` : ""] })] }));
113
+ }
114
+ export function SuiteComparisonHero(props) {
115
+ return (_jsxs("section", { className: "panel compare-hero neutral", children: [_jsxs("div", { className: "compare-hero-head", children: [_jsx("h2", { children: "Suite movement" }), _jsx("span", { className: "event-chip", children: props.data.classification })] }), _jsxs("div", { className: "stats compact-stats", children: [_jsx(Stat, { label: "Regressions", value: props.data.regressions.length }), _jsx(Stat, { label: "Improvements", value: props.data.improvements.length }), _jsx(Stat, { label: "Unchanged", value: props.data.unchanged.length })] })] }));
116
+ }
109
117
  export function getFailureSummaryItems(detail) {
110
118
  const items = [];
111
119
  if (detail.errorDetail) {
@@ -121,6 +129,40 @@ export function getFailureSummaryItems(detail) {
121
129
  }
122
130
  return items;
123
131
  }
132
+ export function summarizeRuns(runs) {
133
+ return {
134
+ total: runs.length,
135
+ pass: runs.filter((run) => run.status === "pass").length,
136
+ fail: runs.filter((run) => run.status === "fail").length,
137
+ error: runs.filter((run) => run.status === "error").length,
138
+ latestSuite: runs[0]?.suite ?? "-",
139
+ latestProvider: runs[0]?.provider ?? "-",
140
+ };
141
+ }
142
+ function formatEventLabel(type) {
143
+ return type.replaceAll("_", " ");
144
+ }
145
+ function mapRiskToPill(risk) {
146
+ if (risk === "high") {
147
+ return "fail";
148
+ }
149
+ if (risk === "medium") {
150
+ return "error";
151
+ }
152
+ return "pass";
153
+ }
154
+ function mapClassificationToTone(classification) {
155
+ if (classification.includes("regress")) {
156
+ return "fail";
157
+ }
158
+ if (classification.includes("improv")) {
159
+ return "pass";
160
+ }
161
+ if (classification.includes("changed")) {
162
+ return "error";
163
+ }
164
+ return "neutral";
165
+ }
124
166
  function signed(value) {
125
167
  return value > 0 ? `+${value}` : `${value}`;
126
168
  }
@@ -10,6 +10,7 @@
10
10
  --pass: #1e6a42;
11
11
  --fail: #9a2c1f;
12
12
  --error: #5b1e72;
13
+ --shadow: 0 16px 40px rgba(76, 58, 26, 0.08);
13
14
  }
14
15
  * {
15
16
  box-sizing: border-box;
@@ -104,6 +105,7 @@ select {
104
105
  border: 1px solid var(--line);
105
106
  border-radius: 16px;
106
107
  padding: 1rem;
108
+ box-shadow: var(--shadow);
107
109
  }
108
110
  .stat-value {
109
111
  font-size: 1.4rem;
@@ -114,6 +116,18 @@ select {
114
116
  grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
115
117
  margin-bottom: 1rem;
116
118
  }
119
+ .dashboard-stats .stat {
120
+ border-top: 4px solid var(--line);
121
+ }
122
+ .pass-text {
123
+ color: var(--pass);
124
+ }
125
+ .fail-text {
126
+ color: var(--fail);
127
+ }
128
+ .error-text {
129
+ color: var(--error);
130
+ }
117
131
  .table {
118
132
  width: 100%;
119
133
  border-collapse: collapse;
@@ -157,6 +171,16 @@ select {
157
171
  background: rgba(91, 30, 114, 0.12);
158
172
  color: var(--error);
159
173
  }
174
+ .pill.neutral {
175
+ background: rgba(102, 95, 84, 0.14);
176
+ color: var(--muted);
177
+ }
178
+ .failure-panel {
179
+ border-left: 6px solid var(--fail);
180
+ }
181
+ .emphasis-panel {
182
+ border-left: 6px solid var(--accent);
183
+ }
160
184
  .stack,
161
185
  .timeline {
162
186
  display: grid;
@@ -166,6 +190,74 @@ select {
166
190
  .timeline.compact {
167
191
  gap: 0.35rem;
168
192
  }
193
+ .timeline-detailed {
194
+ padding-left: 0;
195
+ list-style: none;
196
+ }
197
+ .timeline-item {
198
+ border-left: 3px solid var(--line);
199
+ padding-left: 0.9rem;
200
+ margin-left: 0.35rem;
201
+ }
202
+ .timeline-head,
203
+ .diff-card-head,
204
+ .compare-hero-head {
205
+ display: flex;
206
+ gap: 0.6rem;
207
+ align-items: center;
208
+ flex-wrap: wrap;
209
+ }
210
+ .timeline-step,
211
+ .event-chip {
212
+ display: inline-block;
213
+ padding: 0.2rem 0.55rem;
214
+ border-radius: 999px;
215
+ background: #efe5d5;
216
+ color: var(--ink);
217
+ font-size: 0.78rem;
218
+ font-family: "IBM Plex Mono", monospace;
219
+ text-transform: uppercase;
220
+ }
221
+ .diff-list {
222
+ padding-left: 0;
223
+ list-style: none;
224
+ }
225
+ .diff-card {
226
+ border: 1px solid var(--line);
227
+ border-radius: 12px;
228
+ padding: 0.8rem;
229
+ background: #faf5ec;
230
+ }
231
+ .compare-hero {
232
+ margin-bottom: 1rem;
233
+ }
234
+ .compare-hero.pass {
235
+ border-left: 6px solid var(--pass);
236
+ }
237
+ .compare-hero.fail {
238
+ border-left: 6px solid var(--fail);
239
+ }
240
+ .compare-hero.error {
241
+ border-left: 6px solid var(--error);
242
+ }
243
+ .compare-hero.neutral {
244
+ border-left: 6px solid var(--muted);
245
+ }
246
+ .compact-stats {
247
+ margin-top: 1rem;
248
+ margin-bottom: 0;
249
+ }
250
+ .compare-side.baseline-side {
251
+ border-top: 4px solid #b89d67;
252
+ }
253
+ .compare-side.candidate-side {
254
+ border-top: 4px solid var(--accent);
255
+ }
256
+ .compact-item {
257
+ border-left: none;
258
+ padding-left: 0;
259
+ margin-left: 0;
260
+ }
169
261
  @media (max-width: 720px) {
170
262
  .table {
171
263
  display: block;