npm - agent-regression-lab - Versions diffs - 0.3.0 → 0.4.0 - Mend

agent-regression-lab 0.3.0 → 0.4.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +25 -4
package/bin/agentlab.js +2 -0
package/dist/config.js +13 -9
package/dist/index.js +14 -0
package/dist/init.js +88 -0
package/dist/tools.js +18 -2
package/dist/ui/App.js +49 -7
package/dist/ui-assets/client.css +92 -0
package/dist/ui-assets/client.js +102 -20
package/docs/coding-agents.md +74 -0
package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
package/docs/tools.md +34 -3
package/docs/troubleshooting.md +55 -0
package/examples/coding-tools/README.md +21 -0
package/examples/coding-tools/index.js +11 -0
package/examples/coding-tools/package.json +8 -0
package/examples/support-tools/README.md +21 -0
package/examples/support-tools/index.js +8 -0
package/examples/support-tools/package.json +8 -0
package/package.json +6 -4

package/README.md CHANGED

@@ -29,7 +29,7 @@ This is a local-first alpha for early technical teams. It is strongest when used
 ## What It Supports Today
 - YAML scenarios under `scenarios/`
-- deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
+- deterministic built-in tools plus custom tools from `agentlab.config.yaml`
 - named agents from `agentlab.config.yaml`
 - built-in `mock`, `openai`, `external_process`, and `http` agent modes
 - `type: conversation` multi-turn dialog scenarios for HTTP agents
@@ -46,6 +46,26 @@ Use this as the default product story:
 3. run curated golden suites before release
 4. keep incident-derived scenarios as permanent regression assets
+## Start Here
+If your agent runs as an HTTP service:
+- use `provider: http`
+- start with [arl-test](arl-test)
+- read [docs/agents.md](docs/agents.md) and [docs/scenarios.md](docs/scenarios.md)
+If you are validating coding-agent changes:
+- start with the coding scenarios under `scenarios/coding/`
+- read [docs/coding-agents.md](docs/coding-agents.md)
+- use deterministic tool-loop runs first, then compare before/after behavior
+If you want pre-merge regression checks in CI:
+- use `suite_definitions`
+- start with `.github/workflows/agentlab-pre-merge.yml`
+- run `agentlab run --suite-def pre_merge --agent mock-default`
 ## First 10 Minutes
 The fastest path is to run the CLI from a local checkout.
@@ -180,7 +200,7 @@ Use this as the default mental model:
 3. note the run id or suite batch id
 4. inspect the run in CLI or UI
 5. compare two runs or two suite batches
-6. extend the setup with a named agent or repo-local tool when needed
+6. extend the setup with a named agent or custom tools from repo-local files or installed packages when needed
 ## Canonical Live HTTP Fixture
@@ -202,7 +222,7 @@ The `arl-test` scenarios are intended to behave like a real internal-team regres
 `agentlab.config.yaml` is the public extension point for:
 - named agents
-- repo-local custom tools
+- custom tools from repo-local files or installed npm packages
 Supported agent providers:
@@ -215,6 +235,7 @@ Working sample assets already live in this repo:
 - external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
 - custom tool: `user_tools/findDuplicateCharge.ts`
+- package-style tool examples: `examples/support-tools`, `examples/coding-tools`
 - sample config: `agentlab.config.yaml`
 See:
@@ -251,7 +272,7 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
 ## Limitations
 - this is a local-first alpha, not a hosted platform
-- custom tool loading is limited to repo-local module paths
+- the published package/example ecosystem is still small
 - external agents integrate through the local stdin/stdout protocol only
 - the UI is intentionally minimal and optimized for debugging
 - SQLite-backed local storage still makes sequential live verification the safest path when reusing the same local artifacts DB

package/bin/agentlab.js ADDED

@@ -0,0 +1,2 @@
+#!/usr/bin/env node
+import "../dist/index.js";

package/dist/config.js CHANGED

@@ -90,8 +90,10 @@ function validateToolRegistration(value) {
     if (typeof value.name !== "string" || value.name.length === 0) {
         throw new Error("Each tool registration must define a non-empty 'name'.");
     }
-    if (typeof value.modulePath !== "string" || value.modulePath.length === 0) {
-        throw new Error(`Tool '${value.name}' must define a non-empty 'modulePath'.`);
+    const hasModulePath = typeof value.modulePath === "string" && value.modulePath.length > 0;
+    const hasPackage = typeof value.package === "string" && value.package.length > 0;
+    if ((hasModulePath ? 1 : 0) + (hasPackage ? 1 : 0) !== 1) {
+        throw new Error(`Tool '${value.name}' must define exactly one of 'modulePath' or 'package'.`);
     }
     if (typeof value.exportName !== "string" || value.exportName.length === 0) {
         throw new Error(`Tool '${value.name}' must define a non-empty 'exportName'.`);
@@ -102,13 +104,15 @@ function validateToolRegistration(value) {
     if (!isObject(value.inputSchema)) {
         throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
     }
-    const resolved = resolve(value.modulePath);
-    const root = `${process.cwd()}${sep}`;
-    if (!(resolved === process.cwd() || resolved.startsWith(root))) {
-        throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
-    }
-    if (!exists(resolved)) {
-        throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
+    if (hasModulePath) {
+        const resolved = resolve(value.modulePath);
+        const root = `${process.cwd()}${sep}`;
+        if (!(resolved === process.cwd() || resolved.startsWith(root))) {
+            throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
+        }
+        if (!exists(resolved)) {
+            throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
+        }
     }
 }
 function validateAgentRegistration(value) {

package/dist/index.js CHANGED

@@ -5,6 +5,7 @@ import { createAgentFactory } from "./agent/factory.js";
 import { getAgentRegistration, getVariantSet } from "./config.js";
 import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
 import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
+import { initProject } from "./init.js";
 async function main() {
     const [, , command, ...args] = process.argv;
     switch (command) {
@@ -33,12 +34,16 @@ async function main() {
         case "ui":
             await handleUi();
             break;
+        case "init":
+            await handleInit(args);
+            break;
         default:
             printUsage();
     }
 }
 function printUsage() {
     console.log(`Usage:
+  agentlab init <project-name>
   agentlab list scenarios
   agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
   agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
@@ -64,6 +69,15 @@ async function handleList(args) {
         console.log(`${scenario.id}\t${scenario.suite}\t${scenario.difficulty ?? "-"}\t${scenario.description ?? ""}`);
     }
 }
+async function handleInit(args) {
+    const projectName = args[0];
+    if (!projectName) {
+        console.error("Error: project-name is required.");
+        console.error("Usage: agentlab init <project-name>");
+        process.exit(1);
+    }
+    await initProject(projectName);
+}
 async function handleRun(args) {
     const parsed = parseRunArgs(args);
     const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);

package/dist/init.js ADDED

@@ -0,0 +1,88 @@
+import { existsSync, mkdirSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+const SAMPLE_SCENARIO = `id: sample.hello-world
+name: Hello World Sample
+suite: sample
+description: A minimal example to verify your setup.
+difficulty: easy
+tags:
+  - smoke
+  - sample
+task:
+  instructions: |
+    Say hello to the user and confirm the system is working.
+  context:
+    user_name: Alice
+tools:
+  allowed: []
+runtime:
+  max_steps: 5
+evaluators:
+  - id: greeting-output
+    type: final_answer_contains
+    mode: hard_gate
+    config:
+      required_substrings:
+        - "Hello"
+`;
+const SAMPLE_FIXTURE = `{
+  "users": [
+    { "id": "user_001", "name": "Alice", "email": "alice@example.com" }
+  ]
+}
+`;
+const SAMPLE_CONFIG = `# Agent Regression Lab Configuration
+# Docs: https://github.com/YakshithK/agent-regression-lab#readme
+agents:
+  - name: mock-default
+    provider: mock
+    label: mock-default
+  # Uncomment and configure to test with OpenAI:
+  # - name: openai-test
+  #   provider: openai
+  #   model: gpt-4o-mini
+  #   label: openai-test
+# Tools can be registered from either:
+# 1. repo-local files
+# 2. installed npm packages
+#
+# tools:
+#   - name: my.local_tool
+#     modulePath: ./tools/customTool.ts
+#     exportName: customTool
+#     description: My repo-local custom tool.
+#     inputSchema:
+#       type: object
+#
+#   - name: support.find_duplicate_charge
+#     package: "@agentlab/example-support-tools"
+#     exportName: findDuplicateCharge
+#     description: Find the duplicated charge order id for a given customer.
+#     inputSchema:
+#       type: object
+`;
+export async function initProject(projectName) {
+    const targetDir = join(process.cwd(), projectName);
+    if (existsSync(targetDir)) {
+        throw new Error(`Directory '${projectName}' already exists.`);
+    }
+    // Create directory structure
+    mkdirSync(targetDir, { recursive: true });
+    mkdirSync(join(targetDir, "scenarios"), { recursive: true });
+    mkdirSync(join(targetDir, "scenarios", "sample"), { recursive: true });
+    mkdirSync(join(targetDir, "fixtures"), { recursive: true });
+    // Write files
+    writeFileSync(join(targetDir, "scenarios", "sample", "hello-world.yaml"), SAMPLE_SCENARIO);
+    writeFileSync(join(targetDir, "fixtures", "users.json"), SAMPLE_FIXTURE);
+    writeFileSync(join(targetDir, "agentlab.config.yaml"), SAMPLE_CONFIG);
+    console.log(`Created '${projectName}' with sample scenario.`);
+    console.log("");
+    console.log("Next steps:");
+    console.log(`  cd ${projectName}`);
+    console.log("  npm install @agentlab/example-support-tools");
+    console.log("  # then register package-backed tools in agentlab.config.yaml if needed");
+    console.log("  agentlab run sample.hello-world --agent mock-default");
+}

package/dist/tools.js CHANGED

@@ -1,4 +1,5 @@
 import { readFileSync } from "node:fs";
+import { createRequire } from "node:module";
 import { pathToFileURL } from "node:url";
 import { resolve } from "node:path";
 import { loadAgentLabConfig } from "./config.js";
@@ -384,8 +385,7 @@ async function loadTools() {
     return merged;
 }
 async function loadConfiguredTool(tool) {
-    const moduleUrl = pathToFileURL(resolve(tool.modulePath)).href;
-    const module = await import(moduleUrl);
+    const module = tool.package ? await importConfiguredPackageTool(tool) : await importConfiguredFileTool(tool);
     const candidate = module[tool.exportName];
     if (typeof candidate !== "function") {
         throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
@@ -399,6 +399,22 @@ async function loadConfiguredTool(tool) {
         handler: candidate,
     };
 }
+async function importConfiguredFileTool(tool) {
+    const moduleUrl = pathToFileURL(resolve(tool.modulePath)).href;
+    return (await import(moduleUrl));
+}
+async function importConfiguredPackageTool(tool) {
+    try {
+        const requireFromCwd = createRequire(resolve(process.cwd(), "package.json"));
+        const resolved = requireFromCwd.resolve(tool.package);
+        const moduleUrl = pathToFileURL(resolved).href;
+        return (await import(moduleUrl));
+    }
+    catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        throw new Error(`Tool '${tool.name}' failed to load package '${tool.package}': ${message}`);
+    }
+}
 function assertObject(value) {
     if (typeof value !== "object" || value === null || Array.isArray(value)) {
         throw new Error("Tool input must be an object.");

package/dist/ui/App.js CHANGED

@@ -21,7 +21,8 @@ function RunListPage() {
             .then((response) => response.json())
             .then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
     }, [suite, status, provider]);
-    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? (_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? 
"" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
+    const stats = summarizeRuns(runs);
+    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), runs.length > 0 ? (_jsxs("div", { className: "stats dashboard-stats", children: [_jsx(Stat, { label: "Runs shown", value: stats.total }), _jsx(Stat, { label: "Passing", value: _jsx("span", { className: "pass-text", children: stats.pass }) }), _jsx(Stat, { label: "Failing", value: _jsx("span", { className: "fail-text", children: stats.fail }) }), _jsx(Stat, { label: "Errors", value: _jsx("span", { className: "error-text", children: stats.error }) }), _jsx(Stat, { label: "Latest suite", value: stats.latestSuite }), _jsx(Stat, { label: "Latest provider", value: stats.latestProvider })] })) : null, _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? 
(_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
                                             runs[index - 1].suite === run.suite &&
                                             runs[index - 1].suiteBatchId &&
                                             run.suiteBatchId &&
@@ -37,14 +38,14 @@ function RunDetailPage(props) {
     if (!detail) {
         return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
     }
-    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
+    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => (_jsxs("li", { className: "timeline-item", children: [_jsxs("div", { className: "timeline-head", children: [_jsxs("span", { className: "timeline-step", children: ["Step ", event.stepIndex] }), _jsx("span", { className: "event-chip", children: formatEventLabel(event.type) }), _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
 }
 export function FailureSummaryPanel(props) {
     const failureItems = getFailureSummaryItems(props.detail);
     if (failureItems.length === 0) {
         return null;
     }
-    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
+    return (_jsxs("section", { className: "panel failure-panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
 }
 export function RunIdentitySummary(props) {
     const run = props.detail.run;
@@ -70,10 +71,10 @@ function ComparePage(props) {
     if (!data) {
         return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
     }
-    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.note, diff.hardGate ? " (hard gate)" : ""] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." }) : null, _jsx("ul", { className: "stack", children: data.toolDiffs.map((diff) => (_jsx("li", { children: diff.note }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
+    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsx(ComparisonHero, { comparison: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel emphasis-panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.evaluatorId }), diff.hardGate ? _jsx("span", { className: "event-chip", children: "hard gate" }) : null] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." 
}) : null, _jsx("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.toolName }), _jsx("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
 }
 function RunSide(props) {
-    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { children: _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }) }, event.eventId))) })] }));
+    return (_jsxs("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { className: "timeline-item compact-item", children: _jsxs("strong", { children: [event.stepIndex, ". 
", formatEventLabel(event.type)] }) }, event.eventId))) })] }));
 }
 function SuiteComparePage(props) {
     const [data, setData] = useState(null);
@@ -95,10 +96,10 @@ function SuiteComparePage(props) {
     if (!data) {
         return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
     }
-    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
+    return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsx(SuiteComparisonHero, { data: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
 }
 /**
  * Titled panel listing scenarios from props.items.
  * Each entry shows the scenario id, its comparison classification and an
  * "open run compare" link built from the baseline and candidate run ids;
  * a muted "None." paragraph is rendered when props.items is empty.
  * The removed (-) and added (+) lines below differ only in markup: the added
  * version puts "diff-list" / "diff-card" class names on the list and its items
  * and wraps the id and classification in a "diff-card-head" div.
  */
 function ScenarioList(props) {
-    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack", children: props.items.map((item) => (_jsxs("li", { children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
+    return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack diff-list", children: props.items.map((item) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification })] }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
 }
 function Stat(props) {
     return (_jsxs("div", { className: "stat", children: [_jsx("div", { className: "muted", children: props.label }), _jsx("div", { className: "stat-value", children: props.value })] }));
@@ -106,6 +107,13 @@ function Stat(props) {
 /**
  * Placeholder section with the "empty" class: an <h1> carrying props.title
  * followed by a <p> carrying props.description.
  */
 function EmptyState(props) {
     const { title, description } = props;
     const heading = _jsx("h1", { children: title });
     const body = _jsx("p", { children: description });
     return _jsxs("section", { className: "empty", children: [heading, body] });
 }
+export function ComparisonHero(props) {
+    const tone = mapClassificationToTone(props.comparison.classification);
+    return (_jsxs("section", { className: `panel compare-hero ${tone}`, children: [_jsxs("div", { className: "compare-hero-head", children: [_jsx("h2", { children: props.comparison.classification }), _jsx("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta })] }), _jsxs("p", { className: "muted", children: ["Output changed: ", props.comparison.outputChanged ? "yes" : "no", props.comparison.terminationDelta ? ` • termination: ${props.comparison.terminationDelta}` : ""] })] }));
+}
+export function SuiteComparisonHero(props) {
+    return (_jsxs("section", { className: "panel compare-hero neutral", children: [_jsxs("div", { className: "compare-hero-head", children: [_jsx("h2", { children: "Suite movement" }), _jsx("span", { className: "event-chip", children: props.data.classification })] }), _jsxs("div", { className: "stats compact-stats", children: [_jsx(Stat, { label: "Regressions", value: props.data.regressions.length }), _jsx(Stat, { label: "Improvements", value: props.data.improvements.length }), _jsx(Stat, { label: "Unchanged", value: props.data.unchanged.length })] })] }));
+}
 export function getFailureSummaryItems(detail) {
     const items = [];
     if (detail.errorDetail) {
@@ -121,6 +129,40 @@ export function getFailureSummaryItems(detail) {
     }
     return items;
 }
+export function summarizeRuns(runs) {
+    return {
+        total: runs.length,
+        pass: runs.filter((run) => run.status === "pass").length,
+        fail: runs.filter((run) => run.status === "fail").length,
+        error: runs.filter((run) => run.status === "error").length,
+        latestSuite: runs[0]?.suite ?? "-",
+        latestProvider: runs[0]?.provider ?? "-",
+    };
+}
+function formatEventLabel(type) {
+    return type.replaceAll("_", " ");
+}
+function mapRiskToPill(risk) {
+    if (risk === "high") {
+        return "fail";
+    }
+    if (risk === "medium") {
+        return "error";
+    }
+    return "pass";
+}
+function mapClassificationToTone(classification) {
+    if (classification.includes("regress")) {
+        return "fail";
+    }
+    if (classification.includes("improv")) {
+        return "pass";
+    }
+    if (classification.includes("changed")) {
+        return "error";
+    }
+    return "neutral";
+}
 /**
  * Stringify a delta with an explicit sign: values greater than zero gain a
  * leading "+", everything else is stringified as-is.
  */
 function signed(value) {
     const prefix = value > 0 ? "+" : "";
     return `${prefix}${value}`;
 }

package/dist/ui-assets/client.css CHANGED Viewed

@@ -10,6 +10,7 @@
   --pass: #1e6a42;
   --fail: #9a2c1f;
   --error: #5b1e72;
+  --shadow: 0 16px 40px rgba(76, 58, 26, 0.08);
 }
 * {
   box-sizing: border-box;
@@ -104,6 +105,7 @@ select {
   border: 1px solid var(--line);
   border-radius: 16px;
   padding: 1rem;
+  box-shadow: var(--shadow);
 }
 .stat-value {
   font-size: 1.4rem;
@@ -114,6 +116,18 @@ select {
   grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
   margin-bottom: 1rem;
 }
+.dashboard-stats .stat { /* stat tiles inside .dashboard-stats get a 4px top rule in the --line colour */
+  border-top: 4px solid var(--line);
+}
+.pass-text { /* text-colour helpers, one per status colour variable */
+  color: var(--pass);
+}
+.fail-text {
+  color: var(--fail);
+}
+.error-text {
+  color: var(--error);
+}
 .table {
   width: 100%;
   border-collapse: collapse;
@@ -157,6 +171,16 @@ select {
   background: rgba(91, 30, 114, 0.12);
   color: var(--error);
 }
+.pill.neutral { /* neutral pill variant: translucent background with --muted text */
+  background: rgba(102, 95, 84, 0.14);
+  color: var(--muted);
+}
+.failure-panel { /* 6px left border in the --fail colour */
+  border-left: 6px solid var(--fail);
+}
+.emphasis-panel { /* same left-border treatment, in the --accent colour */
+  border-left: 6px solid var(--accent);
+}
 .stack,
 .timeline {
   display: grid;
@@ -166,6 +190,74 @@ select {
 .timeline.compact {
   gap: 0.35rem;
 }
+.timeline-detailed { /* strip list markers and left padding */
+  padding-left: 0;
+  list-style: none;
+}
+.timeline-item { /* each item hangs off a 3px left rail in the --line colour */
+  border-left: 3px solid var(--line);
+  padding-left: 0.9rem;
+  margin-left: 0.35rem;
+}
+.timeline-head,
+.diff-card-head,
+.compare-hero-head { /* shared header row: wrapping flex line, vertically centred */
+  display: flex;
+  gap: 0.6rem;
+  align-items: center;
+  flex-wrap: wrap;
+}
+.timeline-step,
+.event-chip { /* small uppercase monospace chip with fully rounded ends */
+  display: inline-block;
+  padding: 0.2rem 0.55rem;
+  border-radius: 999px;
+  background: #efe5d5;
+  color: var(--ink);
+  font-size: 0.78rem;
+  font-family: "IBM Plex Mono", monospace;
+  text-transform: uppercase;
+}
+.diff-list { /* strip list markers and left padding */
+  padding-left: 0;
+  list-style: none;
+}
+.diff-card { /* bordered, rounded card */
+  border: 1px solid var(--line);
+  border-radius: 12px;
+  padding: 0.8rem;
+  background: #faf5ec;
+}
+.compare-hero {
+  margin-bottom: 1rem;
+}
+.compare-hero.pass { /* tone modifiers: 6px left border in the colour variable named by the tone */
+  border-left: 6px solid var(--pass);
+}
+.compare-hero.fail {
+  border-left: 6px solid var(--fail);
+}
+.compare-hero.error {
+  border-left: 6px solid var(--error);
+}
+.compare-hero.neutral {
+  border-left: 6px solid var(--muted);
+}
+.compact-stats {
+  margin-top: 1rem;
+  margin-bottom: 0;
+}
+.compare-side.baseline-side { /* baseline and candidate panels are told apart by top-border colour */
+  border-top: 4px solid #b89d67;
+}
+.compare-side.candidate-side {
+  border-top: 4px solid var(--accent);
+}
+.compact-item { /* resets the left rail, padding and margin that .timeline-item sets */
+  border-left: none;
+  padding-left: 0;
+  margin-left: 0;
+}
 @media (max-width: 720px) {
   .table {
     display: block;