agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +78 -11
  2. package/bin/agentlab.js +2 -0
  3. package/dist/agent/factory.js +20 -6
  4. package/dist/agent/httpAdapter.js +5 -4
  5. package/dist/config.js +199 -12
  6. package/dist/evaluators.js +56 -1
  7. package/dist/index.js +157 -11
  8. package/dist/init.js +88 -0
  9. package/dist/lib/id.js +3 -0
  10. package/dist/runOutput.js +46 -0
  11. package/dist/runner.js +31 -9
  12. package/dist/scenarios.js +90 -2
  13. package/dist/scoring.js +2 -2
  14. package/dist/storage.js +117 -7
  15. package/dist/tools.js +56 -2
  16. package/dist/trace.js +4 -2
  17. package/dist/ui/App.js +75 -7
  18. package/dist/ui-assets/client.css +92 -0
  19. package/dist/ui-assets/client.js +183 -19
  20. package/docs/agents.md +143 -8
  21. package/docs/coding-agents.md +74 -0
  22. package/docs/golden-suites.md +74 -0
  23. package/docs/integrations-and-live-services.md +58 -0
  24. package/docs/memory-and-stateful-agents.md +51 -0
  25. package/docs/release-checklist.md +30 -0
  26. package/docs/runtime-profiles.md +67 -0
  27. package/docs/scenarios.md +303 -56
  28. package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
  29. package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
  30. package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
  31. package/docs/tools.md +34 -3
  32. package/docs/troubleshooting.md +193 -0
  33. package/docs/variant-sets.md +63 -0
  34. package/examples/coding-tools/README.md +21 -0
  35. package/examples/coding-tools/index.js +11 -0
  36. package/examples/coding-tools/package.json +8 -0
  37. package/examples/support-tools/README.md +21 -0
  38. package/examples/support-tools/index.js +8 -0
  39. package/examples/support-tools/package.json +8 -0
  40. package/package.json +7 -5
package/dist/storage.js CHANGED
@@ -10,6 +10,10 @@ export class Storage {
10
10
  ensureParentDir(DB_PATH);
11
11
  this.db = new DatabaseSync(DB_PATH);
12
12
  this.db.exec(`
13
+ PRAGMA journal_mode = WAL;
14
+ PRAGMA busy_timeout = 5000;
15
+ `);
16
+ this.db.exec(`
13
17
  CREATE TABLE IF NOT EXISTS metadata (
14
18
  key TEXT PRIMARY KEY,
15
19
  value TEXT NOT NULL
@@ -35,6 +39,15 @@ export class Storage {
35
39
  provider TEXT,
36
40
  command TEXT,
37
41
  args_json TEXT,
42
+ variant_set_name TEXT,
43
+ variant_label TEXT,
44
+ prompt_version TEXT,
45
+ model_version TEXT,
46
+ tool_schema_version TEXT,
47
+ config_label TEXT,
48
+ config_hash TEXT,
49
+ runtime_profile_name TEXT,
50
+ suite_definition_name TEXT,
38
51
  config_json TEXT NOT NULL,
39
52
  created_at TEXT NOT NULL
40
53
  );
@@ -45,6 +58,15 @@ export class Storage {
45
58
  scenario_file_hash TEXT NOT NULL,
46
59
  agent_version_id TEXT NOT NULL,
47
60
  suite_batch_id TEXT,
61
+ variant_set_name TEXT,
62
+ variant_label TEXT,
63
+ prompt_version TEXT,
64
+ model_version TEXT,
65
+ tool_schema_version TEXT,
66
+ config_label TEXT,
67
+ config_hash TEXT,
68
+ runtime_profile_name TEXT,
69
+ suite_definition_name TEXT,
48
70
  status TEXT NOT NULL,
49
71
  termination_reason TEXT NOT NULL,
50
72
  final_output TEXT NOT NULL,
@@ -120,25 +142,41 @@ export class Storage {
120
142
  upsertAgentVersion(agentVersion) {
121
143
  const now = new Date().toISOString();
122
144
  this.db
123
- .prepare(`INSERT INTO agent_versions (id, label, model_id, provider, command, args_json, config_json, created_at)
124
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
145
+ .prepare(`INSERT INTO agent_versions (
146
+ id, label, model_id, provider, command, args_json,
147
+ variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
148
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
149
+ config_json, created_at
150
+ )
151
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
125
152
  ON CONFLICT(id) DO UPDATE SET
126
153
  label = excluded.label,
127
154
  model_id = excluded.model_id,
128
155
  provider = excluded.provider,
129
156
  command = excluded.command,
130
157
  args_json = excluded.args_json,
158
+ variant_set_name = excluded.variant_set_name,
159
+ variant_label = excluded.variant_label,
160
+ prompt_version = excluded.prompt_version,
161
+ model_version = excluded.model_version,
162
+ tool_schema_version = excluded.tool_schema_version,
163
+ config_label = excluded.config_label,
164
+ config_hash = excluded.config_hash,
165
+ runtime_profile_name = excluded.runtime_profile_name,
166
+ suite_definition_name = excluded.suite_definition_name,
131
167
  config_json = excluded.config_json`)
132
- .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
168
+ .run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
133
169
  }
134
170
  saveRun(bundle) {
135
171
  const run = bundle.run;
136
172
  this.db
137
173
  .prepare(`INSERT INTO runs (
138
174
  id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
139
- suite_batch_id, total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
140
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
141
- .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
175
+ suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
176
+ config_label, config_hash, runtime_profile_name, suite_definition_name,
177
+ total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
178
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
179
+ .run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
142
180
  const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
143
181
  VALUES (?, ?, ?, ?, ?, ?, ?)`);
144
182
  const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
@@ -183,6 +221,7 @@ export class Storage {
183
221
  return this.db
184
222
  .prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
185
223
  r.suite_batch_id as suiteBatchId,
224
+ r.variant_set_name as variantSetName, r.variant_label as variantLabel,
186
225
  av.label as agentLabel, av.provider, av.model_id as modelId,
187
226
  r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
188
227
  r.started_at as startedAt
@@ -244,6 +283,11 @@ export class Storage {
244
283
  }));
245
284
  const agentVersion = this.db
246
285
  .prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
286
+ , variant_set_name as variantSetName, variant_label as variantLabel,
287
+ prompt_version as promptVersion, model_version as modelVersion,
288
+ tool_schema_version as toolSchemaVersion, config_label as configLabel,
289
+ config_hash as configHash, runtime_profile_name as runtimeProfileName,
290
+ suite_definition_name as suiteDefinitionName
247
291
  FROM agent_versions WHERE id = ?`)
248
292
  .get(run.agentVersionId);
249
293
  return {
@@ -259,6 +303,15 @@ export class Storage {
259
303
  provider: agentVersion.provider ?? undefined,
260
304
  command: agentVersion.command ?? undefined,
261
305
  args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
306
+ variantSetName: agentVersion.variantSetName ?? undefined,
307
+ variantLabel: agentVersion.variantLabel ?? undefined,
308
+ promptVersion: agentVersion.promptVersion ?? undefined,
309
+ modelVersion: agentVersion.modelVersion ?? undefined,
310
+ toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
311
+ configLabel: agentVersion.configLabel ?? undefined,
312
+ configHash: agentVersion.configHash ?? undefined,
313
+ runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
314
+ suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
262
315
  config: JSON.parse(agentVersion.config_json),
263
316
  }
264
317
  : undefined,
@@ -348,7 +401,10 @@ export class Storage {
348
401
  getRunRecord(runId) {
349
402
  return (this.db
350
403
  .prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
351
- suite_batch_id as suiteBatchId,
404
+ suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
405
+ prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
406
+ config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
407
+ suite_definition_name as suiteDefinitionName,
352
408
  status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
353
409
  total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
354
410
  total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
@@ -388,6 +444,33 @@ export class Storage {
388
444
  if (!names.has("args_json")) {
389
445
  this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
390
446
  }
447
+ if (!names.has("variant_set_name")) {
448
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
449
+ }
450
+ if (!names.has("variant_label")) {
451
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
452
+ }
453
+ if (!names.has("prompt_version")) {
454
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
455
+ }
456
+ if (!names.has("model_version")) {
457
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
458
+ }
459
+ if (!names.has("tool_schema_version")) {
460
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
461
+ }
462
+ if (!names.has("config_label")) {
463
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
464
+ }
465
+ if (!names.has("config_hash")) {
466
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
467
+ }
468
+ if (!names.has("runtime_profile_name")) {
469
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
470
+ }
471
+ if (!names.has("suite_definition_name")) {
472
+ this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
473
+ }
391
474
  }
392
475
  ensureRunColumns() {
393
476
  const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
@@ -395,6 +478,33 @@ export class Storage {
395
478
  if (!names.has("suite_batch_id")) {
396
479
  this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
397
480
  }
481
+ if (!names.has("variant_set_name")) {
482
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
483
+ }
484
+ if (!names.has("variant_label")) {
485
+ this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
486
+ }
487
+ if (!names.has("prompt_version")) {
488
+ this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
489
+ }
490
+ if (!names.has("model_version")) {
491
+ this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
492
+ }
493
+ if (!names.has("tool_schema_version")) {
494
+ this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
495
+ }
496
+ if (!names.has("config_label")) {
497
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
498
+ }
499
+ if (!names.has("config_hash")) {
500
+ this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
501
+ }
502
+ if (!names.has("runtime_profile_name")) {
503
+ this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
504
+ }
505
+ if (!names.has("suite_definition_name")) {
506
+ this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
507
+ }
398
508
  }
399
509
  getRunsBySuiteBatchId(suiteBatchId) {
400
510
  const runIds = this.db
package/dist/tools.js CHANGED
@@ -1,7 +1,40 @@
1
1
  import { readFileSync } from "node:fs";
2
+ import { createRequire } from "node:module";
2
3
  import { pathToFileURL } from "node:url";
3
4
  import { resolve } from "node:path";
4
5
  import { loadAgentLabConfig } from "./config.js";
6
/**
 * Applies the fault-injection rules of a runtime profile to a tool map.
 *
 * Every tool named in `profile.tool_faults` is replaced by an async handler
 * that records a `tool_fault_injected` trace event (not counted as a step)
 * and then simulates the configured fault instead of running the real tool:
 *
 * - `"timeout"`: waits `fault.timeout_ms` (default 5000 ms) and throws an
 *   error whose `code` is `"timeout_exceeded"`.
 * - `"error"`: throws with `fault.error_message`, or a default message.
 * - `"malformed_output"`: resolves to the string `"MALFORMED_OUTPUT"`.
 * - any other mode: resolves to `fault.partial_output`, or `{}` when unset.
 *
 * The input map is never mutated. When the profile declares no faults the
 * original `tools` object itself is returned.
 *
 * @param {object} tools - Map of tool name to tool entry.
 * @param {{ tool_faults?: Array<object> } | null | undefined} profile - Runtime profile.
 * @param {{ record: Function }} trace - Recorder that receives the fault events.
 * @returns {object} `tools` itself, or a shallow copy with faulted entries replaced.
 */
export function applyRuntimeProfileToTools(tools, profile, trace) {
    const faults = profile?.tool_faults;
    if (!faults?.length) {
        return tools;
    }
    const wrapped = { ...tools };
    for (const fault of faults) {
        // Own-property check: a plain lookup would also find inherited members
        // such as "toString" or "constructor" and inject a tool that never existed.
        if (!Object.hasOwn(wrapped, fault.tool) || !wrapped[fault.tool]) {
            continue;
        }
        wrapped[fault.tool] = async (input, context) => {
            trace.record("system", "tool_fault_injected", {
                tool: fault.tool,
                mode: fault.mode,
            }, { countStep: false });
            if (fault.mode === "timeout") {
                await waitUnref(fault.timeout_ms ?? 5000);
                const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
                timeoutError.code = "timeout_exceeded";
                throw timeoutError;
            }
            if (fault.mode === "error") {
                throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
            }
            if (fault.mode === "malformed_output") {
                return "MALFORMED_OUTPUT";
            }
            // Remaining modes fall back to the configured partial output.
            return fault.partial_output ?? {};
        };
    }
    return wrapped;
}
5
38
  function loadFixture(path) {
6
39
  const raw = readFileSync(resolve(path), "utf8");
7
40
  return JSON.parse(raw);
@@ -352,8 +385,7 @@ async function loadTools() {
352
385
  return merged;
353
386
  }
354
387
  async function loadConfiguredTool(tool) {
355
- const moduleUrl = pathToFileURL(resolve(tool.modulePath)).href;
356
- const module = await import(moduleUrl);
388
+ const module = tool.package ? await importConfiguredPackageTool(tool) : await importConfiguredFileTool(tool);
357
389
  const candidate = module[tool.exportName];
358
390
  if (typeof candidate !== "function") {
359
391
  throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
@@ -367,8 +399,30 @@ async function loadConfiguredTool(tool) {
367
399
  handler: candidate,
368
400
  };
369
401
  }
402
/**
 * Dynamically imports the module behind a file-based tool definition.
 *
 * `tool.modulePath` is resolved with `path.resolve` (so relative paths are
 * taken from the current working directory) and imported via its file URL.
 *
 * @param {{ modulePath: string }} tool - Configured tool entry.
 * @returns {Promise<object>} The imported module namespace.
 */
async function importConfiguredFileTool(tool) {
    const absolutePath = resolve(tool.modulePath);
    const { href } = pathToFileURL(absolutePath);
    const loadedModule = await import(href);
    return loadedModule;
}
406
/**
 * Dynamically imports the module behind a package-based tool definition.
 *
 * The package specifier is resolved as if required from a `package.json` in
 * the current working directory, and the resolved file is then imported via
 * its file URL.
 *
 * @param {{ name: string, package: string }} tool - Configured tool entry.
 * @returns {Promise<object>} The imported module namespace.
 * @throws {Error} When resolution or import fails. The message names the tool
 *   and the package, and the underlying failure is attached as `cause`.
 */
async function importConfiguredPackageTool(tool) {
    try {
        const requireFromCwd = createRequire(resolve(process.cwd(), "package.json"));
        const resolved = requireFromCwd.resolve(tool.package);
        const moduleUrl = pathToFileURL(resolved).href;
        return (await import(moduleUrl));
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        // Keep the original error reachable so its stack and `code` are not lost.
        throw new Error(`Tool '${tool.name}' failed to load package '${tool.package}': ${message}`, { cause: error });
    }
}
370
418
  function assertObject(value) {
371
419
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
372
420
  throw new Error("Tool input must be an object.");
373
421
  }
374
422
  }
423
/**
 * Returns a promise that resolves (with no value) after `timeoutMs`
 * milliseconds.
 *
 * When the timer handle exposes `unref`, it is called so the pending timer
 * does not by itself keep the process running.
 *
 * @param {number} timeoutMs - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function waitUnref(timeoutMs) {
    const schedule = (done) => {
        const handle = setTimeout(done, timeoutMs);
        handle.unref?.();
    };
    return new Promise(schedule);
}
package/dist/trace.js CHANGED
@@ -8,8 +8,10 @@ export class TraceRecorder {
8
8
  this.runId = runId;
9
9
  this.scenarioId = scenarioId;
10
10
  }
11
- record(source, type, payload) {
12
- this.stepIndex += 1;
11
+ record(source, type, payload, options) {
12
+ if (options?.countStep !== false) {
13
+ this.stepIndex += 1;
14
+ }
13
15
  this.events.push({
14
16
  eventId: createEventId(),
15
17
  runId: this.runId,
package/dist/ui/App.js CHANGED
@@ -1,4 +1,4 @@
1
- import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
1
+ import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
2
2
  import { useEffect, useState } from "react";
3
3
  export function App() {
4
4
  const route = getRoute();
@@ -21,7 +21,8 @@ function RunListPage() {
21
21
  .then((response) => response.json())
22
22
  .then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
23
23
  }, [suite, status, provider]);
24
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? (_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? 
"" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
24
+ const stats = summarizeRuns(runs);
25
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), runs.length > 0 ? (_jsxs("div", { className: "stats dashboard-stats", children: [_jsx(Stat, { label: "Runs shown", value: stats.total }), _jsx(Stat, { label: "Passing", value: _jsx("span", { className: "pass-text", children: stats.pass }) }), _jsx(Stat, { label: "Failing", value: _jsx("span", { className: "fail-text", children: stats.fail }) }), _jsx(Stat, { label: "Errors", value: _jsx("span", { className: "error-text", children: stats.error }) }), _jsx(Stat, { label: "Latest suite", value: stats.latestSuite }), _jsx(Stat, { label: "Latest provider", value: stats.latestProvider })] })) : null, _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? 
(_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
25
26
  runs[index - 1].suite === run.suite &&
26
27
  runs[index - 1].suiteBatchId &&
27
28
  run.suiteBatchId &&
@@ -37,7 +38,18 @@ function RunDetailPage(props) {
37
38
  if (!detail) {
38
39
  return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
39
40
  }
40
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("strong", { children: [event.stepIndex, ". ", event.type] }), " ", _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
41
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => (_jsxs("li", { className: "timeline-item", children: [_jsxs("div", { className: "timeline-head", children: [_jsxs("span", { className: "timeline-step", children: ["Step ", event.stepIndex] }), _jsx("span", { className: "event-chip", children: formatEventLabel(event.type) }), _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
42
+ }
43
/**
 * Panel that lists a run's failure summary items along with its status and
 * termination reason.
 *
 * Renders nothing (`null`) when `getFailureSummaryItems` yields no items.
 *
 * @param {{ detail: object }} props - `detail` is the run detail; its `run.status`
 *   and `run.terminationReason` are displayed.
 * @returns {object | null} The panel element, or `null` when there is nothing to show.
 */
export function FailureSummaryPanel(props) {
    const { detail } = props;
    const failureItems = getFailureSummaryItems(detail);
    if (failureItems.length === 0) {
        return null;
    }
    return (_jsxs("section", { className: "panel failure-panel", children: [
            _jsx("h2", { children: "Failures First" }),
            _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status })] }),
            _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }),
            // Key on index plus text: two identical messages would otherwise share a key.
            _jsx("ul", { className: "stack", children: failureItems.map((item, index) => (_jsx("li", { children: item }, `${index}:${item}`))) }),
        ] }));
}
50
/**
 * Renders the identity metadata of a run as a sequence of labelled
 * paragraphs: variant set, variant, prompt/model/tool-schema versions,
 * config label, runtime profile and suite definition.
 *
 * A value that is `null` or `undefined` is shown as "-".
 *
 * @param {{ detail: { run: object } }} props - Run detail whose `run` carries the fields.
 * @returns {object} A fragment element containing one paragraph per field.
 */
export function RunIdentitySummary(props) {
    const { run } = props.detail;
    // Label/value pairs in display order.
    const identityRows = [
        ["Variant set:", run.variantSetName],
        ["Variant:", run.variantLabel],
        ["Prompt version:", run.promptVersion],
        ["Model version:", run.modelVersion],
        ["Tool schema version:", run.toolSchemaVersion],
        ["Config label:", run.configLabel],
        ["Runtime profile:", run.runtimeProfileName],
        ["Suite definition:", run.suiteDefinitionName],
    ];
    const paragraphs = identityRows.map(([label, value]) => _jsxs("p", { children: [_jsx("strong", { children: label }), " ", value ?? "-"] }));
    return (_jsxs(_Fragment, { children: paragraphs }));
}
42
54
  function ComparePage(props) {
43
55
  const [data, setData] = useState(null);
@@ -59,10 +71,10 @@ function ComparePage(props) {
59
71
  if (!data) {
60
72
  return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
61
73
  }
62
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.note, diff.hardGate ? " (hard gate)" : ""] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." }) : null, _jsx("ul", { className: "stack", children: data.toolDiffs.map((diff) => (_jsx("li", { children: diff.note }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
74
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsx(ComparisonHero, { comparison: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel emphasis-panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.evaluatorId }), diff.hardGate ? _jsx("span", { className: "event-chip", children: "hard gate" }) : null] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." 
}) : null, _jsx("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.toolName }), _jsx("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
63
75
  }
64
76
  function RunSide(props) {
65
- return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { children: _jsxs("strong", { children: [event.stepIndex, ". ", event.type] }) }, event.eventId))) })] }));
77
+ return (_jsxs("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { className: "timeline-item compact-item", children: _jsxs("strong", { children: [event.stepIndex, ". 
", formatEventLabel(event.type)] }) }, event.eventId))) })] }));
66
78
  }
67
79
  function SuiteComparePage(props) {
68
80
  const [data, setData] = useState(null);
@@ -84,10 +96,10 @@ function SuiteComparePage(props) {
84
96
  if (!data) {
85
97
  return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
86
98
  }
87
- return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
99
+ return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsx(SuiteComparisonHero, { data: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
88
100
  }
89
101
  function ScenarioList(props) {
90
- return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack", children: props.items.map((item) => (_jsxs("li", { children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
102
+ return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack diff-list", children: props.items.map((item) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification })] }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
91
103
  }
92
104
  function Stat(props) {
93
105
  return (_jsxs("div", { className: "stat", children: [_jsx("div", { className: "muted", children: props.label }), _jsx("div", { className: "stat-value", children: props.value })] }));
@@ -95,6 +107,62 @@ function Stat(props) {
95
107
/**
 * Placeholder section made of a heading and a one-line description.
 *
 * @param {{ title: *, description: * }} props - `title` becomes the `<h1>`
 *   content and `description` the `<p>` content.
 * @returns The `section.empty` element.
 */
function EmptyState(props) {
    const { title, description } = props;
    const heading = _jsx("h1", { children: title });
    const body = _jsx("p", { children: description });
    return _jsxs("section", { className: "empty", children: [heading, body] });
}
110
/**
 * Headline panel for a run-to-run comparison.
 *
 * The tone returned by `mapClassificationToTone` is used as a CSS class on
 * both the panel and the verdict pill.
 *
 * @param {{ comparison: object }} props - Reads `classification`,
 *   `verdictDelta`, `outputChanged` and `terminationDelta` from
 *   `props.comparison`.
 * @returns The hero section element.
 */
export function ComparisonHero(props) {
    const { comparison } = props;
    const tone = mapClassificationToTone(comparison.classification);
    const heading = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: comparison.classification }),
            _jsx("span", { className: `pill ${tone}`, children: comparison.verdictDelta }),
        ],
    });
    const outputChangedText = comparison.outputChanged ? "yes" : "no";
    // The termination suffix is appended only when a delta is present.
    const terminationSuffix = comparison.terminationDelta
        ? ` • termination: ${comparison.terminationDelta}`
        : "";
    const summary = _jsxs("p", {
        className: "muted",
        children: ["Output changed: ", outputChangedText, terminationSuffix],
    });
    return _jsxs("section", {
        className: `panel compare-hero ${tone}`,
        children: [heading, summary],
    });
}
114
/**
 * Headline panel for a suite comparison: the classification chip followed
 * by counts of regressed, improved and unchanged entries.
 *
 * @param {{ data: object }} props - Reads `classification` and the lengths
 *   of `regressions`, `improvements` and `unchanged` from `props.data`.
 * @returns The hero section element (always rendered with the "neutral"
 *   class).
 */
export function SuiteComparisonHero(props) {
    const { data } = props;
    const heading = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: "Suite movement" }),
            _jsx("span", { className: "event-chip", children: data.classification }),
        ],
    });
    // One Stat per bucket, in display order.
    const buckets = [
        ["Regressions", data.regressions.length],
        ["Improvements", data.improvements.length],
        ["Unchanged", data.unchanged.length],
    ];
    const counters = _jsxs("div", {
        className: "stats compact-stats",
        children: buckets.map(([label, value]) => _jsx(Stat, { label, value })),
    });
    return _jsxs("section", {
        className: "panel compare-hero neutral",
        children: [heading, counters],
    });
}
117
/**
 * Builds the list of human-readable failure reasons for a run.
 *
 * Order of entries: the run-level error (if any), then one entry per
 * evaluator result whose status is "fail". If neither produced an entry but
 * the run status is not "pass", a single generic hint is returned instead.
 *
 * @param {object} detail - Run detail; reads `errorDetail`,
 *   `evaluatorResults` (each with `status`, `evaluatorId`, `message`) and
 *   `run.status`.
 * @returns {string[]} Failure descriptions; empty when there is nothing to
 *   report.
 */
export function getFailureSummaryItems(detail) {
    const items = [];
    if (detail.errorDetail) {
        items.push(`Error: ${detail.errorDetail}`);
    }
    for (const result of detail.evaluatorResults) {
        if (result.status !== "fail") {
            continue;
        }
        const label = `Evaluator ${result.evaluatorId}`;
        // A result without a message would otherwise be rendered as
        // "Evaluator <id>: undefined"; show just the evaluator in that case.
        items.push(result.message == null ? label : `${label}: ${result.message}`);
    }
    if (detail.run.status !== "pass" && items.length === 0) {
        items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return items;
}
132
/**
 * Aggregates a list of runs into dashboard counters.
 *
 * @param {Array<{ status?: string, suite?: string, provider?: string }>} runs
 *   Runs to summarize. `latestSuite` and `latestProvider` are taken from the
 *   first element of the list.
 * @returns {{ total: number, pass: number, fail: number, error: number,
 *   latestSuite: string, latestProvider: string }} `total` is the list
 *   length; `pass`/`fail`/`error` count runs with exactly that status (any
 *   other status only contributes to `total`); the two `latest*` fields fall
 *   back to "-" when missing.
 */
export function summarizeRuns(runs) {
    const tally = { pass: 0, fail: 0, error: 0 };
    runs.forEach((run) => {
        switch (run.status) {
            case "pass":
                tally.pass += 1;
                break;
            case "fail":
                tally.fail += 1;
                break;
            case "error":
                tally.error += 1;
                break;
            default:
                // Other statuses are not counted in any bucket.
                break;
        }
    });
    const first = runs[0];
    return {
        total: runs.length,
        pass: tally.pass,
        fail: tally.fail,
        error: tally.error,
        latestSuite: first?.suite ?? "-",
        latestProvider: first?.provider ?? "-",
    };
}
142
/**
 * Turns an event type identifier into a display label by replacing every
 * underscore with a space.
 *
 * @param {string} type - Event type, e.g. "tool_call_started".
 * @returns {string} The same text with "_" replaced by " ".
 */
function formatEventLabel(type) {
    const words = type.split("_");
    return words.join(" ");
}
145
/**
 * Maps a risk level to the pill CSS modifier used to colour it.
 *
 * @param {*} risk - Risk level; only the exact strings "high" and "medium"
 *   are special-cased.
 * @returns {string} "fail" for "high", "error" for "medium", "pass" for
 *   anything else.
 */
function mapRiskToPill(risk) {
    switch (risk) {
        case "high":
            return "fail";
        case "medium":
            return "error";
        default:
            return "pass";
    }
}
154
/**
 * Maps a comparison classification to the tone (CSS modifier) used for the
 * comparison hero and its pill.
 *
 * Matching is by substring, checked in this order:
 *   "regress" -> "fail", "improv" -> "pass", "changed" -> "error",
 *   otherwise "neutral".
 *
 * @param {string} classification - Classification text of a comparison.
 * @returns {"fail" | "pass" | "error" | "neutral"} Tone name.
 */
function mapClassificationToTone(classification) {
    if (classification.includes("regress")) {
        return "fail";
    }
    if (classification.includes("improv")) {
        return "pass";
    }
    // "unchanged" contains "changed" as a substring. Without removing it
    // first, an unchanged comparison would be given the "error" tone instead
    // of falling through to "neutral".
    const withoutUnchanged = classification.replaceAll("unchanged", "");
    if (withoutUnchanged.includes("changed")) {
        return "error";
    }
    return "neutral";
}
98
166
/**
 * Formats a delta with an explicit sign.
 *
 * @param {number} value - Delta to format.
 * @returns {string} The value as text, prefixed with "+" when it is greater
 *   than zero; zero and negative values are returned without a prefix.
 */
function signed(value) {
    if (value > 0) {
        return `+${value}`;
    }
    return `${value}`;
}
@@ -10,6 +10,7 @@
10
10
  --pass: #1e6a42;
11
11
  --fail: #9a2c1f;
12
12
  --error: #5b1e72;
13
+ --shadow: 0 16px 40px rgba(76, 58, 26, 0.08);
13
14
  }
14
15
  * {
15
16
  box-sizing: border-box;
@@ -104,6 +105,7 @@ select {
104
105
  border: 1px solid var(--line);
105
106
  border-radius: 16px;
106
107
  padding: 1rem;
108
+ box-shadow: var(--shadow);
107
109
  }
108
110
  .stat-value {
109
111
  font-size: 1.4rem;
@@ -114,6 +116,18 @@ select {
114
116
  grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
115
117
  margin-bottom: 1rem;
116
118
  }
119
+ .dashboard-stats .stat {
120
+ border-top: 4px solid var(--line);
121
+ }
122
+ .pass-text {
123
+ color: var(--pass);
124
+ }
125
+ .fail-text {
126
+ color: var(--fail);
127
+ }
128
+ .error-text {
129
+ color: var(--error);
130
+ }
117
131
  .table {
118
132
  width: 100%;
119
133
  border-collapse: collapse;
@@ -157,6 +171,16 @@ select {
157
171
  background: rgba(91, 30, 114, 0.12);
158
172
  color: var(--error);
159
173
  }
174
+ .pill.neutral {
175
+ background: rgba(102, 95, 84, 0.14);
176
+ color: var(--muted);
177
+ }
178
+ .failure-panel {
179
+ border-left: 6px solid var(--fail);
180
+ }
181
+ .emphasis-panel {
182
+ border-left: 6px solid var(--accent);
183
+ }
160
184
  .stack,
161
185
  .timeline {
162
186
  display: grid;
@@ -166,6 +190,74 @@ select {
166
190
  .timeline.compact {
167
191
  gap: 0.35rem;
168
192
  }
193
+ .timeline-detailed {
194
+ padding-left: 0;
195
+ list-style: none;
196
+ }
197
+ .timeline-item {
198
+ border-left: 3px solid var(--line);
199
+ padding-left: 0.9rem;
200
+ margin-left: 0.35rem;
201
+ }
202
+ .timeline-head,
203
+ .diff-card-head,
204
+ .compare-hero-head {
205
+ display: flex;
206
+ gap: 0.6rem;
207
+ align-items: center;
208
+ flex-wrap: wrap;
209
+ }
210
+ .timeline-step,
211
+ .event-chip {
212
+ display: inline-block;
213
+ padding: 0.2rem 0.55rem;
214
+ border-radius: 999px;
215
+ background: #efe5d5;
216
+ color: var(--ink);
217
+ font-size: 0.78rem;
218
+ font-family: "IBM Plex Mono", monospace;
219
+ text-transform: uppercase;
220
+ }
221
+ .diff-list {
222
+ padding-left: 0;
223
+ list-style: none;
224
+ }
225
+ .diff-card {
226
+ border: 1px solid var(--line);
227
+ border-radius: 12px;
228
+ padding: 0.8rem;
229
+ background: #faf5ec;
230
+ }
231
+ .compare-hero {
232
+ margin-bottom: 1rem;
233
+ }
234
+ .compare-hero.pass {
235
+ border-left: 6px solid var(--pass);
236
+ }
237
+ .compare-hero.fail {
238
+ border-left: 6px solid var(--fail);
239
+ }
240
+ .compare-hero.error {
241
+ border-left: 6px solid var(--error);
242
+ }
243
+ .compare-hero.neutral {
244
+ border-left: 6px solid var(--muted);
245
+ }
246
+ .compact-stats {
247
+ margin-top: 1rem;
248
+ margin-bottom: 0;
249
+ }
250
+ .compare-side.baseline-side {
251
+ border-top: 4px solid #b89d67;
252
+ }
253
+ .compare-side.candidate-side {
254
+ border-top: 4px solid var(--accent);
255
+ }
256
+ .compact-item {
257
+ border-left: none;
258
+ padding-left: 0;
259
+ margin-left: 0;
260
+ }
169
261
  @media (max-width: 720px) {
170
262
  .table {
171
263
  display: block;