agent-regression-lab 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -11
- package/bin/agentlab.js +2 -0
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +199 -12
- package/dist/evaluators.js +56 -1
- package/dist/index.js +157 -11
- package/dist/init.js +88 -0
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +56 -2
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +75 -7
- package/dist/ui-assets/client.css +92 -0
- package/dist/ui-assets/client.js +183 -19
- package/docs/agents.md +143 -8
- package/docs/coding-agents.md +74 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +193 -0
- package/docs/variant-sets.md +63 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +7 -5
package/dist/storage.js
CHANGED
|
@@ -10,6 +10,10 @@ export class Storage {
|
|
|
10
10
|
ensureParentDir(DB_PATH);
|
|
11
11
|
this.db = new DatabaseSync(DB_PATH);
|
|
12
12
|
this.db.exec(`
|
|
13
|
+
PRAGMA journal_mode = WAL;
|
|
14
|
+
PRAGMA busy_timeout = 5000;
|
|
15
|
+
`);
|
|
16
|
+
this.db.exec(`
|
|
13
17
|
CREATE TABLE IF NOT EXISTS metadata (
|
|
14
18
|
key TEXT PRIMARY KEY,
|
|
15
19
|
value TEXT NOT NULL
|
|
@@ -35,6 +39,15 @@ export class Storage {
|
|
|
35
39
|
provider TEXT,
|
|
36
40
|
command TEXT,
|
|
37
41
|
args_json TEXT,
|
|
42
|
+
variant_set_name TEXT,
|
|
43
|
+
variant_label TEXT,
|
|
44
|
+
prompt_version TEXT,
|
|
45
|
+
model_version TEXT,
|
|
46
|
+
tool_schema_version TEXT,
|
|
47
|
+
config_label TEXT,
|
|
48
|
+
config_hash TEXT,
|
|
49
|
+
runtime_profile_name TEXT,
|
|
50
|
+
suite_definition_name TEXT,
|
|
38
51
|
config_json TEXT NOT NULL,
|
|
39
52
|
created_at TEXT NOT NULL
|
|
40
53
|
);
|
|
@@ -45,6 +58,15 @@ export class Storage {
|
|
|
45
58
|
scenario_file_hash TEXT NOT NULL,
|
|
46
59
|
agent_version_id TEXT NOT NULL,
|
|
47
60
|
suite_batch_id TEXT,
|
|
61
|
+
variant_set_name TEXT,
|
|
62
|
+
variant_label TEXT,
|
|
63
|
+
prompt_version TEXT,
|
|
64
|
+
model_version TEXT,
|
|
65
|
+
tool_schema_version TEXT,
|
|
66
|
+
config_label TEXT,
|
|
67
|
+
config_hash TEXT,
|
|
68
|
+
runtime_profile_name TEXT,
|
|
69
|
+
suite_definition_name TEXT,
|
|
48
70
|
status TEXT NOT NULL,
|
|
49
71
|
termination_reason TEXT NOT NULL,
|
|
50
72
|
final_output TEXT NOT NULL,
|
|
@@ -120,25 +142,41 @@ export class Storage {
|
|
|
120
142
|
upsertAgentVersion(agentVersion) {
|
|
121
143
|
const now = new Date().toISOString();
|
|
122
144
|
this.db
|
|
123
|
-
.prepare(`INSERT INTO agent_versions (
|
|
124
|
-
|
|
145
|
+
.prepare(`INSERT INTO agent_versions (
|
|
146
|
+
id, label, model_id, provider, command, args_json,
|
|
147
|
+
variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
148
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
149
|
+
config_json, created_at
|
|
150
|
+
)
|
|
151
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
125
152
|
ON CONFLICT(id) DO UPDATE SET
|
|
126
153
|
label = excluded.label,
|
|
127
154
|
model_id = excluded.model_id,
|
|
128
155
|
provider = excluded.provider,
|
|
129
156
|
command = excluded.command,
|
|
130
157
|
args_json = excluded.args_json,
|
|
158
|
+
variant_set_name = excluded.variant_set_name,
|
|
159
|
+
variant_label = excluded.variant_label,
|
|
160
|
+
prompt_version = excluded.prompt_version,
|
|
161
|
+
model_version = excluded.model_version,
|
|
162
|
+
tool_schema_version = excluded.tool_schema_version,
|
|
163
|
+
config_label = excluded.config_label,
|
|
164
|
+
config_hash = excluded.config_hash,
|
|
165
|
+
runtime_profile_name = excluded.runtime_profile_name,
|
|
166
|
+
suite_definition_name = excluded.suite_definition_name,
|
|
131
167
|
config_json = excluded.config_json`)
|
|
132
|
-
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), JSON.stringify(agentVersion.config), now);
|
|
168
|
+
.run(agentVersion.id, agentVersion.label, agentVersion.modelId ?? null, agentVersion.provider ?? null, agentVersion.command ?? null, JSON.stringify(agentVersion.args ?? []), agentVersion.variantSetName ?? null, agentVersion.variantLabel ?? null, agentVersion.promptVersion ?? null, agentVersion.modelVersion ?? null, agentVersion.toolSchemaVersion ?? null, agentVersion.configLabel ?? null, agentVersion.configHash ?? null, agentVersion.runtimeProfileName ?? null, agentVersion.suiteDefinitionName ?? null, JSON.stringify(agentVersion.config), now);
|
|
133
169
|
}
|
|
134
170
|
saveRun(bundle) {
|
|
135
171
|
const run = bundle.run;
|
|
136
172
|
this.db
|
|
137
173
|
.prepare(`INSERT INTO runs (
|
|
138
174
|
id, scenario_id, scenario_file_hash, agent_version_id, status, termination_reason, final_output,
|
|
139
|
-
suite_batch_id,
|
|
140
|
-
|
|
141
|
-
|
|
175
|
+
suite_batch_id, variant_set_name, variant_label, prompt_version, model_version, tool_schema_version,
|
|
176
|
+
config_label, config_hash, runtime_profile_name, suite_definition_name,
|
|
177
|
+
total_steps, total_tool_calls, duration_ms, total_tokens, total_cost_usd, score, started_at, finished_at
|
|
178
|
+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
|
179
|
+
.run(run.id, run.scenarioId, run.scenarioFileHash, run.agentVersionId, run.status, run.terminationReason, run.finalOutput, run.suiteBatchId ?? null, run.variantSetName ?? null, run.variantLabel ?? null, run.promptVersion ?? null, run.modelVersion ?? null, run.toolSchemaVersion ?? null, run.configLabel ?? null, run.configHash ?? null, run.runtimeProfileName ?? null, run.suiteDefinitionName ?? null, run.totalSteps, run.totalToolCalls, run.durationMs, run.totalTokens ?? null, run.totalCostUsd ?? null, run.score, run.startedAt, run.finishedAt);
|
|
142
180
|
const insertStep = this.db.prepare(`INSERT INTO run_steps (id, run_id, step_index, timestamp, source, type, payload_json)
|
|
143
181
|
VALUES (?, ?, ?, ?, ?, ?, ?)`);
|
|
144
182
|
const insertTool = this.db.prepare(`INSERT INTO tool_calls (id, run_id, step_index, tool_name, input_json, output_json, status, duration_ms, error_message)
|
|
@@ -183,6 +221,7 @@ export class Storage {
|
|
|
183
221
|
return this.db
|
|
184
222
|
.prepare(`SELECT r.id, r.scenario_id as scenarioId, s.suite, r.agent_version_id as agentVersionId,
|
|
185
223
|
r.suite_batch_id as suiteBatchId,
|
|
224
|
+
r.variant_set_name as variantSetName, r.variant_label as variantLabel,
|
|
186
225
|
av.label as agentLabel, av.provider, av.model_id as modelId,
|
|
187
226
|
r.status, r.score, r.duration_ms as durationMs, r.total_steps as totalSteps,
|
|
188
227
|
r.started_at as startedAt
|
|
@@ -244,6 +283,11 @@ export class Storage {
|
|
|
244
283
|
}));
|
|
245
284
|
const agentVersion = this.db
|
|
246
285
|
.prepare(`SELECT id, label, model_id as modelId, provider, command, args_json, config_json
|
|
286
|
+
, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
287
|
+
prompt_version as promptVersion, model_version as modelVersion,
|
|
288
|
+
tool_schema_version as toolSchemaVersion, config_label as configLabel,
|
|
289
|
+
config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
290
|
+
suite_definition_name as suiteDefinitionName
|
|
247
291
|
FROM agent_versions WHERE id = ?`)
|
|
248
292
|
.get(run.agentVersionId);
|
|
249
293
|
return {
|
|
@@ -259,6 +303,15 @@ export class Storage {
|
|
|
259
303
|
provider: agentVersion.provider ?? undefined,
|
|
260
304
|
command: agentVersion.command ?? undefined,
|
|
261
305
|
args: agentVersion.args_json ? JSON.parse(agentVersion.args_json) : undefined,
|
|
306
|
+
variantSetName: agentVersion.variantSetName ?? undefined,
|
|
307
|
+
variantLabel: agentVersion.variantLabel ?? undefined,
|
|
308
|
+
promptVersion: agentVersion.promptVersion ?? undefined,
|
|
309
|
+
modelVersion: agentVersion.modelVersion ?? undefined,
|
|
310
|
+
toolSchemaVersion: agentVersion.toolSchemaVersion ?? undefined,
|
|
311
|
+
configLabel: agentVersion.configLabel ?? undefined,
|
|
312
|
+
configHash: agentVersion.configHash ?? undefined,
|
|
313
|
+
runtimeProfileName: agentVersion.runtimeProfileName ?? undefined,
|
|
314
|
+
suiteDefinitionName: agentVersion.suiteDefinitionName ?? undefined,
|
|
262
315
|
config: JSON.parse(agentVersion.config_json),
|
|
263
316
|
}
|
|
264
317
|
: undefined,
|
|
@@ -348,7 +401,10 @@ export class Storage {
|
|
|
348
401
|
getRunRecord(runId) {
|
|
349
402
|
return (this.db
|
|
350
403
|
.prepare(`SELECT id, scenario_id as scenarioId, scenario_file_hash as scenarioFileHash, agent_version_id as agentVersionId,
|
|
351
|
-
suite_batch_id as suiteBatchId,
|
|
404
|
+
suite_batch_id as suiteBatchId, variant_set_name as variantSetName, variant_label as variantLabel,
|
|
405
|
+
prompt_version as promptVersion, model_version as modelVersion, tool_schema_version as toolSchemaVersion,
|
|
406
|
+
config_label as configLabel, config_hash as configHash, runtime_profile_name as runtimeProfileName,
|
|
407
|
+
suite_definition_name as suiteDefinitionName,
|
|
352
408
|
status, termination_reason as terminationReason, final_output as finalOutput, total_steps as totalSteps,
|
|
353
409
|
total_tool_calls as totalToolCalls, duration_ms as durationMs, total_tokens as totalTokens,
|
|
354
410
|
total_cost_usd as totalCostUsd, score, started_at as startedAt, finished_at as finishedAt
|
|
@@ -388,6 +444,33 @@ export class Storage {
|
|
|
388
444
|
if (!names.has("args_json")) {
|
|
389
445
|
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN args_json TEXT`);
|
|
390
446
|
}
|
|
447
|
+
if (!names.has("variant_set_name")) {
|
|
448
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_set_name TEXT`);
|
|
449
|
+
}
|
|
450
|
+
if (!names.has("variant_label")) {
|
|
451
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN variant_label TEXT`);
|
|
452
|
+
}
|
|
453
|
+
if (!names.has("prompt_version")) {
|
|
454
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN prompt_version TEXT`);
|
|
455
|
+
}
|
|
456
|
+
if (!names.has("model_version")) {
|
|
457
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN model_version TEXT`);
|
|
458
|
+
}
|
|
459
|
+
if (!names.has("tool_schema_version")) {
|
|
460
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN tool_schema_version TEXT`);
|
|
461
|
+
}
|
|
462
|
+
if (!names.has("config_label")) {
|
|
463
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_label TEXT`);
|
|
464
|
+
}
|
|
465
|
+
if (!names.has("config_hash")) {
|
|
466
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN config_hash TEXT`);
|
|
467
|
+
}
|
|
468
|
+
if (!names.has("runtime_profile_name")) {
|
|
469
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN runtime_profile_name TEXT`);
|
|
470
|
+
}
|
|
471
|
+
if (!names.has("suite_definition_name")) {
|
|
472
|
+
this.db.exec(`ALTER TABLE agent_versions ADD COLUMN suite_definition_name TEXT`);
|
|
473
|
+
}
|
|
391
474
|
}
|
|
392
475
|
ensureRunColumns() {
|
|
393
476
|
const columns = this.db.prepare(`PRAGMA table_info(runs)`).all();
|
|
@@ -395,6 +478,33 @@ export class Storage {
|
|
|
395
478
|
if (!names.has("suite_batch_id")) {
|
|
396
479
|
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_batch_id TEXT`);
|
|
397
480
|
}
|
|
481
|
+
if (!names.has("variant_set_name")) {
|
|
482
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_set_name TEXT`);
|
|
483
|
+
}
|
|
484
|
+
if (!names.has("variant_label")) {
|
|
485
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN variant_label TEXT`);
|
|
486
|
+
}
|
|
487
|
+
if (!names.has("prompt_version")) {
|
|
488
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN prompt_version TEXT`);
|
|
489
|
+
}
|
|
490
|
+
if (!names.has("model_version")) {
|
|
491
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN model_version TEXT`);
|
|
492
|
+
}
|
|
493
|
+
if (!names.has("tool_schema_version")) {
|
|
494
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN tool_schema_version TEXT`);
|
|
495
|
+
}
|
|
496
|
+
if (!names.has("config_label")) {
|
|
497
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_label TEXT`);
|
|
498
|
+
}
|
|
499
|
+
if (!names.has("config_hash")) {
|
|
500
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN config_hash TEXT`);
|
|
501
|
+
}
|
|
502
|
+
if (!names.has("runtime_profile_name")) {
|
|
503
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN runtime_profile_name TEXT`);
|
|
504
|
+
}
|
|
505
|
+
if (!names.has("suite_definition_name")) {
|
|
506
|
+
this.db.exec(`ALTER TABLE runs ADD COLUMN suite_definition_name TEXT`);
|
|
507
|
+
}
|
|
398
508
|
}
|
|
399
509
|
getRunsBySuiteBatchId(suiteBatchId) {
|
|
400
510
|
const runIds = this.db
|
package/dist/tools.js
CHANGED
|
@@ -1,7 +1,40 @@
|
|
|
1
1
|
import { readFileSync } from "node:fs";
|
|
2
|
+
import { createRequire } from "node:module";
|
|
2
3
|
import { pathToFileURL } from "node:url";
|
|
3
4
|
import { resolve } from "node:path";
|
|
4
5
|
import { loadAgentLabConfig } from "./config.js";
|
|
6
|
+
export function applyRuntimeProfileToTools(tools, profile, trace) {
|
|
7
|
+
if (!profile?.tool_faults?.length) {
|
|
8
|
+
return tools;
|
|
9
|
+
}
|
|
10
|
+
const wrapped = { ...tools };
|
|
11
|
+
for (const fault of profile.tool_faults) {
|
|
12
|
+
const original = wrapped[fault.tool];
|
|
13
|
+
if (!original) {
|
|
14
|
+
continue;
|
|
15
|
+
}
|
|
16
|
+
wrapped[fault.tool] = async (input, context) => {
|
|
17
|
+
trace.record("system", "tool_fault_injected", {
|
|
18
|
+
tool: fault.tool,
|
|
19
|
+
mode: fault.mode,
|
|
20
|
+
}, { countStep: false });
|
|
21
|
+
if (fault.mode === "timeout") {
|
|
22
|
+
await waitUnref(fault.timeout_ms ?? 5000);
|
|
23
|
+
const timeoutError = new Error(`Injected timeout for ${fault.tool}`);
|
|
24
|
+
timeoutError.code = "timeout_exceeded";
|
|
25
|
+
throw timeoutError;
|
|
26
|
+
}
|
|
27
|
+
if (fault.mode === "error") {
|
|
28
|
+
throw new Error(fault.error_message ?? `Injected failure for ${fault.tool}`);
|
|
29
|
+
}
|
|
30
|
+
if (fault.mode === "malformed_output") {
|
|
31
|
+
return "MALFORMED_OUTPUT";
|
|
32
|
+
}
|
|
33
|
+
return fault.partial_output ?? {};
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
return wrapped;
|
|
37
|
+
}
|
|
5
38
|
function loadFixture(path) {
|
|
6
39
|
const raw = readFileSync(resolve(path), "utf8");
|
|
7
40
|
return JSON.parse(raw);
|
|
@@ -352,8 +385,7 @@ async function loadTools() {
|
|
|
352
385
|
return merged;
|
|
353
386
|
}
|
|
354
387
|
async function loadConfiguredTool(tool) {
|
|
355
|
-
const
|
|
356
|
-
const module = await import(moduleUrl);
|
|
388
|
+
const module = tool.package ? await importConfiguredPackageTool(tool) : await importConfiguredFileTool(tool);
|
|
357
389
|
const candidate = module[tool.exportName];
|
|
358
390
|
if (typeof candidate !== "function") {
|
|
359
391
|
throw new Error(`Tool '${tool.name}' export '${tool.exportName}' is not a function.`);
|
|
@@ -367,8 +399,30 @@ async function loadConfiguredTool(tool) {
|
|
|
367
399
|
handler: candidate,
|
|
368
400
|
};
|
|
369
401
|
}
|
|
402
|
+
async function importConfiguredFileTool(tool) {
|
|
403
|
+
const moduleUrl = pathToFileURL(resolve(tool.modulePath)).href;
|
|
404
|
+
return (await import(moduleUrl));
|
|
405
|
+
}
|
|
406
|
+
async function importConfiguredPackageTool(tool) {
|
|
407
|
+
try {
|
|
408
|
+
const requireFromCwd = createRequire(resolve(process.cwd(), "package.json"));
|
|
409
|
+
const resolved = requireFromCwd.resolve(tool.package);
|
|
410
|
+
const moduleUrl = pathToFileURL(resolved).href;
|
|
411
|
+
return (await import(moduleUrl));
|
|
412
|
+
}
|
|
413
|
+
catch (error) {
|
|
414
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
415
|
+
throw new Error(`Tool '${tool.name}' failed to load package '${tool.package}': ${message}`);
|
|
416
|
+
}
|
|
417
|
+
}
|
|
370
418
|
function assertObject(value) {
|
|
371
419
|
if (typeof value !== "object" || value === null || Array.isArray(value)) {
|
|
372
420
|
throw new Error("Tool input must be an object.");
|
|
373
421
|
}
|
|
374
422
|
}
|
|
423
|
+
function waitUnref(timeoutMs) {
|
|
424
|
+
return new Promise((resolve) => {
|
|
425
|
+
const timer = setTimeout(resolve, timeoutMs);
|
|
426
|
+
timer.unref?.();
|
|
427
|
+
});
|
|
428
|
+
}
|
package/dist/trace.js
CHANGED
|
@@ -8,8 +8,10 @@ export class TraceRecorder {
|
|
|
8
8
|
this.runId = runId;
|
|
9
9
|
this.scenarioId = scenarioId;
|
|
10
10
|
}
|
|
11
|
-
record(source, type, payload) {
|
|
12
|
-
|
|
11
|
+
record(source, type, payload, options) {
|
|
12
|
+
if (options?.countStep !== false) {
|
|
13
|
+
this.stepIndex += 1;
|
|
14
|
+
}
|
|
13
15
|
this.events.push({
|
|
14
16
|
eventId: createEventId(),
|
|
15
17
|
runId: this.runId,
|
package/dist/ui/App.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { jsx as _jsx, jsxs as _jsxs } from "react/jsx-runtime";
|
|
1
|
+
import { jsx as _jsx, jsxs as _jsxs, Fragment as _Fragment } from "react/jsx-runtime";
|
|
2
2
|
import { useEffect, useState } from "react";
|
|
3
3
|
export function App() {
|
|
4
4
|
const route = getRoute();
|
|
@@ -21,7 +21,8 @@ function RunListPage() {
|
|
|
21
21
|
.then((response) => response.json())
|
|
22
22
|
.then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
23
23
|
}, [suite, status, provider]);
|
|
24
|
-
|
|
24
|
+
const stats = summarizeRuns(runs);
|
|
25
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Runs" }), _jsx("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })] }), runs.length > 0 ? (_jsxs("div", { className: "stats dashboard-stats", children: [_jsx(Stat, { label: "Runs shown", value: stats.total }), _jsx(Stat, { label: "Passing", value: _jsx("span", { className: "pass-text", children: stats.pass }) }), _jsx(Stat, { label: "Failing", value: _jsx("span", { className: "fail-text", children: stats.fail }) }), _jsx(Stat, { label: "Errors", value: _jsx("span", { className: "error-text", children: stats.error }) }), _jsx(Stat, { label: "Latest suite", value: stats.latestSuite }), _jsx(Stat, { label: "Latest provider", value: stats.latestProvider })] })) : null, _jsxs("div", { className: "filters", children: [_jsx("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }), _jsxs("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [_jsx("option", { value: "", children: "All statuses" }), _jsx("option", { value: "pass", children: "Pass" }), _jsx("option", { value: "fail", children: "Fail" }), _jsx("option", { value: "error", children: "Error" })] }), _jsxs("select", { value: provider, onChange: (event) => setProvider(event.target.value), children: [_jsx("option", { value: "", children: "All providers" }), _jsx("option", { value: "mock", children: "Mock" }), _jsx("option", { value: "openai", children: "OpenAI" }), _jsx("option", { value: "external_process", children: "External process" })] })] }), runs.length === 0 ? _jsx(EmptyState, { title: "No runs yet", description: "Run a scenario from the CLI to populate the lab." }) : null, runs.length > 0 ? 
(_jsxs("table", { className: "table", children: [_jsx("thead", { children: _jsxs("tr", { children: [_jsx("th", { children: "Run" }), _jsx("th", { children: "Scenario" }), _jsx("th", { children: "Provider" }), _jsx("th", { children: "Status" }), _jsx("th", { children: "Score" }), _jsx("th", { children: "Runtime" }), _jsx("th", { children: "Steps" }), _jsx("th", { children: "Started" })] }) }), _jsx("tbody", { children: runs.map((run, index) => (_jsxs("tr", { children: [_jsx("td", { children: _jsx("a", { href: `/runs/${run.id}`, children: run.id }) }), _jsx("td", { children: run.scenarioId }), _jsxs("td", { children: [run.provider ?? "-", _jsx("div", { className: "muted", children: run.modelId ?? run.agentLabel ?? "" })] }), _jsx("td", { children: _jsx("span", { className: `pill ${run.status}`, children: run.status }) }), _jsx("td", { children: run.score }), _jsxs("td", { children: [run.durationMs, "ms"] }), _jsx("td", { children: run.totalSteps }), _jsxs("td", { children: [new Date(run.startedAt).toLocaleString(), index > 0 && runs[index - 1].scenarioId === run.scenarioId ? (_jsx("div", { className: "muted", children: _jsx("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) })) : null, index > 0 &&
|
|
25
26
|
runs[index - 1].suite === run.suite &&
|
|
26
27
|
runs[index - 1].suiteBatchId &&
|
|
27
28
|
run.suiteBatchId &&
|
|
@@ -37,7 +38,18 @@ function RunDetailPage(props) {
|
|
|
37
38
|
if (!detail) {
|
|
38
39
|
return _jsx(EmptyState, { title: "Loading run", description: "Fetching run detail from the local lab." });
|
|
39
40
|
}
|
|
40
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? 
_jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline", children: detail.traceEvents.map((event) => (_jsxs("li", { children: [_jsxs("div", { children: [_jsxs("
|
|
41
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: detail.run.id }), _jsx("p", { children: detail.run.scenarioId })] }), _jsx(FailureSummaryPanel, { detail: detail }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Status", value: _jsx("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }), _jsx(Stat, { label: "Score", value: detail.run.score }), _jsx(Stat, { label: "Runtime", value: `${detail.run.durationMs}ms` }), _jsx(Stat, { label: "Steps", value: detail.run.totalSteps })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Summary" }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", detail.agentVersion?.provider ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", detail.agentVersion?.modelId ?? "-"] }), _jsx(RunIdentitySummary, { detail: detail }), detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", detail.agentVersion.command, " ", (detail.agentVersion.args ?? []).join(" ")] })) : null, _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", detail.run.terminationReason] }), detail.errorDetail ? 
_jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: detail.run.finalOutput || "(none)" })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluators" }), _jsx("ul", { className: "stack", children: detail.evaluatorResults.map((result) => (_jsxs("li", { children: [_jsx("span", { className: `pill ${result.status}`, children: result.status }), " ", result.evaluatorId, _jsx("div", { className: "muted", children: result.message })] }, result.evaluatorId))) })] })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool Calls" }), detail.toolCalls.length === 0 ? _jsx("p", { className: "muted", children: "No tool calls recorded." }) : null, _jsx("ul", { className: "stack", children: detail.toolCalls.map((call) => (_jsxs("li", { children: [_jsx("strong", { children: call.toolName }), " ", _jsx("span", { className: `pill ${call.status}`, children: call.status }), _jsx("pre", { children: JSON.stringify({ input: call.input, output: call.output }, null, 2) })] }, call.id))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Trace" }), _jsx("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => (_jsxs("li", { className: "timeline-item", children: [_jsxs("div", { className: "timeline-head", children: [_jsxs("span", { className: "timeline-step", children: ["Step ", event.stepIndex] }), _jsx("span", { className: "event-chip", children: formatEventLabel(event.type) }), _jsx("span", { className: "muted", children: event.source })] }), _jsx("pre", { children: JSON.stringify(event.payload, null, 2) })] }, event.eventId))) })] })] }));
|
|
42
|
+
}
|
|
43
|
+
export function FailureSummaryPanel(props) {
|
|
44
|
+
const failureItems = getFailureSummaryItems(props.detail);
|
|
45
|
+
if (failureItems.length === 0) {
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
return (_jsxs("section", { className: "panel failure-panel", children: [_jsx("h2", { children: "Failures First" }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsx("ul", { className: "stack", children: failureItems.map((item) => (_jsx("li", { children: item }, item))) })] }));
|
|
49
|
+
}
|
|
50
|
+
export function RunIdentitySummary(props) {
|
|
51
|
+
const run = props.detail.run;
|
|
52
|
+
return (_jsxs(_Fragment, { children: [_jsxs("p", { children: [_jsx("strong", { children: "Variant set:" }), " ", run.variantSetName ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Variant:" }), " ", run.variantLabel ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Prompt version:" }), " ", run.promptVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Model version:" }), " ", run.modelVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Tool schema version:" }), " ", run.toolSchemaVersion ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Config label:" }), " ", run.configLabel ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime profile:" }), " ", run.runtimeProfileName ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Suite definition:" }), " ", run.suiteDefinitionName ?? "-"] })] }));
|
|
41
53
|
}
|
|
42
54
|
function ComparePage(props) {
|
|
43
55
|
const [data, setData] = useState(null);
|
|
@@ -59,10 +71,10 @@ function ComparePage(props) {
|
|
|
59
71
|
if (!data) {
|
|
60
72
|
return _jsx(EmptyState, { title: "Loading comparison", description: "Fetching both runs and computing deltas." });
|
|
61
73
|
}
|
|
62
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { children: [diff.
|
|
74
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Compare" }), _jsx("p", { children: data.baseline.run.scenarioId })] }), _jsx(ComparisonHero, { comparison: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.score) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.steps) })] }), _jsxs("section", { className: "panel emphasis-panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No material differences recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Evaluator diffs" }), data.evaluatorDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No evaluator changes." }) : null, _jsx("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.evaluatorId }), diff.hardGate ? _jsx("span", { className: "event-chip", children: "hard gate" }) : null] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.evaluatorId))) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Tool diffs" }), data.toolDiffs.length === 0 ? _jsx("p", { className: "muted", children: "No tool usage changes." 
}) : null, _jsx("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: diff.toolName }), _jsx("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })] }), _jsx("div", { className: "muted", children: diff.note })] }, diff.toolName))) })] })] }), _jsxs("div", { className: "compare-grid", children: [_jsx(RunSide, { title: "Baseline", detail: data.baseline }), _jsx(RunSide, { title: "Candidate", detail: data.candidate })] })] }));
|
|
63
75
|
}
|
|
64
76
|
/**
 * Renders one side of a run comparison as a panel section.
 *
 * `props.title` is the heading; in the added version a title of "Candidate"
 * selects the `candidate-side` class and any other title selects
 * `baseline-side`. `props.detail` supplies the run fields (id, status, score,
 * durationMs, terminationReason, finalOutput), the optional `agentVersion`
 * fields (label, provider, modelId, command, args), the optional
 * `errorDetail`, and `traceEvents`, which are listed as a compact timeline
 * labelled through formatEventLabel.
 */
function RunSide(props) {
|
|
65
|
-
return (_jsxs("section", { className:
|
|
77
|
+
return (_jsxs("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [_jsx("h2", { children: props.title }), _jsxs("p", { children: [_jsx("strong", { children: "Run:" }), " ", _jsx("a", { href: `/runs/${props.detail.run.id}`, children: props.detail.run.id })] }), _jsxs("p", { children: [_jsx("strong", { children: "Status:" }), " ", _jsx("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })] }), _jsxs("p", { children: [_jsx("strong", { children: "Score:" }), " ", props.detail.run.score] }), _jsxs("p", { children: [_jsx("strong", { children: "Runtime:" }), " ", props.detail.run.durationMs, "ms"] }), _jsxs("p", { children: [_jsx("strong", { children: "Termination:" }), " ", props.detail.run.terminationReason] }), _jsxs("p", { children: [_jsx("strong", { children: "Agent:" }), " ", props.detail.agentVersion?.label ?? "-"] }), _jsxs("p", { children: [_jsx("strong", { children: "Provider:" }), " ", props.detail.agentVersion?.provider ?? "-"] }), props.detail.agentVersion?.modelId ? _jsxs("p", { children: [_jsx("strong", { children: "Model:" }), " ", props.detail.agentVersion.modelId] }) : null, props.detail.agentVersion?.command ? (_jsxs("p", { children: [_jsx("strong", { children: "Command:" }), " ", props.detail.agentVersion.command, " ", (props.detail.agentVersion.args ?? []).join(" ")] })) : null, props.detail.errorDetail ? _jsxs("p", { children: [_jsx("strong", { children: "Error:" }), " ", props.detail.errorDetail] }) : null, _jsx("p", { children: _jsx("strong", { children: "Final output:" }) }), _jsx("pre", { children: props.detail.run.finalOutput || "(none)" }), _jsx("h3", { children: "Trace" }), _jsx("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => (_jsx("li", { className: "timeline-item compact-item", children: _jsxs("strong", { children: [event.stepIndex, ". 
", formatEventLabel(event.type)] }) }, event.eventId))) })] }));
|
|
66
78
|
}
|
|
67
79
|
function SuiteComparePage(props) {
|
|
68
80
|
const [data, setData] = useState(null);
|
|
@@ -84,10 +96,10 @@ function SuiteComparePage(props) {
|
|
|
84
96
|
if (!data) {
|
|
85
97
|
return _jsx(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
|
|
86
98
|
}
|
|
87
|
-
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
|
|
99
|
+
return (_jsxs("section", { children: [_jsxs("div", { className: "hero", children: [_jsx("h1", { children: "Suite Compare" }), _jsx("p", { children: data.suite })] }), _jsx(SuiteComparisonHero, { data: data }), _jsxs("div", { className: "stats", children: [_jsx(Stat, { label: "Classification", value: data.classification }), _jsx(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }), _jsx(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }), _jsx(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }), _jsx(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }), _jsx(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Notes" }), data.notes.length === 0 ? _jsx("p", { className: "muted", children: "No suite-level notes recorded." }) : null, _jsx("ul", { className: "stack", children: data.notes.map((note) => (_jsx("li", { children: note }, note))) })] }), _jsxs("div", { className: "panel-grid", children: [_jsx(ScenarioList, { title: "Regressions", items: data.regressions }), _jsx(ScenarioList, { title: "Improvements", items: data.improvements })] }), _jsxs("section", { className: "panel", children: [_jsx("h2", { children: "Missing scenarios" }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from candidate:" }), " ", data.missingFromCandidate.join(", ") || "None"] }), _jsxs("p", { children: [_jsx("strong", { children: "Missing from baseline:" }), " ", data.missingFromBaseline.join(", ") || "None"] })] })] }));
|
|
88
100
|
}
|
|
89
101
|
/**
 * Panel listing per-scenario comparison entries.
 *
 * Shows `props.title` as the heading and a muted "None." paragraph when
 * `props.items` is empty. Each item renders its `scenarioId`, its
 * `comparison.classification`, and a link to `/compare` carrying the baseline
 * and candidate run ids as query parameters. Items are keyed by `scenarioId`.
 */
function ScenarioList(props) {
|
|
90
|
-
return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack", children: props.items.map((item) => (_jsxs("li", { children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
|
|
102
|
+
return (_jsxs("section", { className: "panel", children: [_jsx("h2", { children: props.title }), props.items.length === 0 ? _jsx("p", { className: "muted", children: "None." }) : null, _jsx("ul", { className: "stack diff-list", children: props.items.map((item) => (_jsxs("li", { className: "diff-card", children: [_jsxs("div", { className: "diff-card-head", children: [_jsx("strong", { children: item.scenarioId }), " ", _jsx("span", { className: "muted", children: item.comparison.classification })] }), _jsx("div", { children: _jsx("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })] }, item.scenarioId))) })] }));
|
|
91
103
|
}
|
|
92
104
|
function Stat(props) {
|
|
93
105
|
return (_jsxs("div", { className: "stat", children: [_jsx("div", { className: "muted", children: props.label }), _jsx("div", { className: "stat-value", children: props.value })] }));
|
|
@@ -95,6 +107,62 @@ function Stat(props) {
|
|
|
95
107
|
/**
 * Placeholder section made of a heading and a short description.
 *
 * @param {object} props
 * @param {*} props.title - Rendered inside the <h1>.
 * @param {*} props.description - Rendered inside the <p>.
 * @returns The "empty" section element.
 */
function EmptyState(props) {
    const { title, description } = props;
    const heading = _jsx("h1", { children: title });
    const body = _jsx("p", { children: description });
    return _jsxs("section", { className: "empty", children: [heading, body] });
}
|
|
110
|
+
/**
 * Headline panel for a single run-to-run comparison.
 *
 * The classification is the heading, the verdict delta sits in a pill, and a
 * muted line states whether the output changed, followed by the termination
 * delta when one is present. The tone class applied to the panel and the pill
 * comes from mapClassificationToTone.
 *
 * @param {object} props
 * @param {object} props.comparison - Reads `classification`, `verdictDelta`,
 *   `outputChanged` and `terminationDelta`.
 * @returns The hero section element.
 */
export function ComparisonHero(props) {
    const { comparison } = props;
    const tone = mapClassificationToTone(comparison.classification);

    const heading = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: comparison.classification }),
            _jsx("span", { className: `pill ${tone}`, children: comparison.verdictDelta }),
        ],
    });

    const outputChangedText = comparison.outputChanged ? "yes" : "no";
    // An empty string keeps the child slot in place when there is no termination delta.
    const terminationText = comparison.terminationDelta
        ? ` • termination: ${comparison.terminationDelta}`
        : "";
    const summary = _jsxs("p", {
        className: "muted",
        children: ["Output changed: ", outputChangedText, terminationText],
    });

    return _jsxs("section", {
        className: `panel compare-hero ${tone}`,
        children: [heading, summary],
    });
}
|
|
114
|
+
/**
 * Headline panel for a suite comparison, always rendered with the neutral tone.
 *
 * Displays the suite classification in an event chip and three compact stats
 * holding the number of regressions, improvements and unchanged entries.
 *
 * @param {object} props
 * @param {object} props.data - Reads `classification` and the `length` of
 *   `regressions`, `improvements` and `unchanged`.
 * @returns The hero section element.
 */
export function SuiteComparisonHero(props) {
    const suite = props.data;

    const header = _jsxs("div", {
        className: "compare-hero-head",
        children: [
            _jsx("h2", { children: "Suite movement" }),
            _jsx("span", { className: "event-chip", children: suite.classification }),
        ],
    });

    const counts = _jsxs("div", {
        className: "stats compact-stats",
        children: [
            _jsx(Stat, { label: "Regressions", value: suite.regressions.length }),
            _jsx(Stat, { label: "Improvements", value: suite.improvements.length }),
            _jsx(Stat, { label: "Unchanged", value: suite.unchanged.length }),
        ],
    });

    return _jsxs("section", {
        className: "panel compare-hero neutral",
        children: [header, counts],
    });
}
|
|
117
|
+
/**
 * Collects human-readable failure lines for a run detail.
 *
 * Order of the result: the error detail (when truthy), then one line per
 * evaluator result whose status is "fail". When nothing was collected and the
 * run status is not "pass", a single generic hint is returned instead.
 *
 * @param {object} detail - Reads `errorDetail`, `evaluatorResults` (iterable)
 *   and `run.status`.
 * @returns {string[]} Summary lines; empty when there is nothing to report.
 */
export function getFailureSummaryItems(detail) {
    const errorLines = detail.errorDetail ? [`Error: ${detail.errorDetail}`] : [];
    // Spread goes through the iterator protocol, the same way a for...of loop does.
    const evaluatorLines = [...detail.evaluatorResults]
        .filter((result) => result.status === "fail")
        .map((result) => `Evaluator ${result.evaluatorId}: ${result.message}`);

    const items = errorLines.concat(evaluatorLines);
    const runPassed = detail.run.status === "pass";
    if (!runPassed && items.length === 0) {
        items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
    }
    return items;
}
|
|
132
|
+
/**
 * Builds the headline counters for a list of runs.
 *
 * @param {Array<object>} runs - Run records; each is read for `status`, and
 *   the first one additionally for `suite` and `provider`.
 * @returns {{total: number, pass: number, fail: number, error: number,
 *   latestSuite: *, latestProvider: *}} Counts per status plus the suite and
 *   provider of the first run, each falling back to "-" when nullish.
 */
export function summarizeRuns(runs) {
    const countWithStatus = (status) =>
        runs.reduce((count, run) => (run.status === status ? count + 1 : count), 0);
    const first = runs[0];

    return {
        total: runs.length,
        pass: countWithStatus("pass"),
        fail: countWithStatus("fail"),
        error: countWithStatus("error"),
        latestSuite: first?.suite ?? "-",
        latestProvider: first?.provider ?? "-",
    };
}
|
|
142
|
+
/**
 * Turns an event type identifier into a display label by replacing every
 * underscore with a space.
 *
 * @param {string} type - Event type, e.g. "tool_call".
 * @returns {string} The label with spaces in place of underscores.
 */
function formatEventLabel(type) {
    const words = type.split("_");
    return words.join(" ");
}
|
|
145
|
+
/**
 * Maps a tool-diff risk level to the pill class used to colour it.
 *
 * @param {*} risk - Risk level; "high" and "medium" are the recognised values.
 * @returns {string} "fail" for "high", "error" for "medium", otherwise "pass".
 */
function mapRiskToPill(risk) {
    switch (risk) {
        case "high":
            return "fail";
        case "medium":
            return "error";
        default:
            return "pass";
    }
}
|
|
154
|
+
/**
 * Maps a comparison classification to the tone class used by the compare
 * hero and its pill.
 *
 * Matching is by substring, checked in this order:
 *   - contains "regress"   -> "fail"
 *   - contains "improv"    -> "pass"
 *   - contains "unchanged" -> "neutral"
 *   - contains "changed"   -> "error"
 *   - anything else        -> "neutral"
 *
 * @param {*} classification - Classification label; a nullish value is
 *   treated as an empty label instead of throwing.
 * @returns {"fail"|"pass"|"error"|"neutral"} The tone class name.
 */
function mapClassificationToTone(classification) {
    const label = String(classification ?? "");
    if (label.includes("regress")) {
        return "fail";
    }
    if (label.includes("improv")) {
        return "pass";
    }
    // "unchanged" contains the substring "changed", so it has to be tested
    // first; otherwise unchanged comparisons get the "error" tone.
    if (label.includes("unchanged")) {
        return "neutral";
    }
    if (label.includes("changed")) {
        return "error";
    }
    return "neutral";
}
|
|
98
166
|
/**
 * Formats a delta with an explicit plus sign when it is positive.
 *
 * @param {*} value - The delta to format; compared against 0 and stringified.
 * @returns {string} "+<value>" when value > 0, otherwise the plain
 *   stringified value (negative numbers already carry their minus sign).
 */
function signed(value) {
    const prefix = value > 0 ? "+" : "";
    return `${prefix}${value}`;
}
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
--pass: #1e6a42;
|
|
11
11
|
--fail: #9a2c1f;
|
|
12
12
|
--error: #5b1e72;
|
|
13
|
+
--shadow: 0 16px 40px rgba(76, 58, 26, 0.08);
|
|
13
14
|
}
|
|
14
15
|
* {
|
|
15
16
|
box-sizing: border-box;
|
|
@@ -104,6 +105,7 @@ select {
|
|
|
104
105
|
border: 1px solid var(--line);
|
|
105
106
|
border-radius: 16px;
|
|
106
107
|
padding: 1rem;
|
|
108
|
+
box-shadow: var(--shadow);
|
|
107
109
|
}
|
|
108
110
|
.stat-value {
|
|
109
111
|
font-size: 1.4rem;
|
|
@@ -114,6 +116,18 @@ select {
|
|
|
114
116
|
grid-template-columns: repeat(auto-fit, minmax(320px, 1fr));
|
|
115
117
|
margin-bottom: 1rem;
|
|
116
118
|
}
|
|
119
|
+
.dashboard-stats .stat {
|
|
120
|
+
border-top: 4px solid var(--line);
|
|
121
|
+
}
|
|
122
|
+
.pass-text {
|
|
123
|
+
color: var(--pass);
|
|
124
|
+
}
|
|
125
|
+
.fail-text {
|
|
126
|
+
color: var(--fail);
|
|
127
|
+
}
|
|
128
|
+
.error-text {
|
|
129
|
+
color: var(--error);
|
|
130
|
+
}
|
|
117
131
|
.table {
|
|
118
132
|
width: 100%;
|
|
119
133
|
border-collapse: collapse;
|
|
@@ -157,6 +171,16 @@ select {
|
|
|
157
171
|
background: rgba(91, 30, 114, 0.12);
|
|
158
172
|
color: var(--error);
|
|
159
173
|
}
|
|
174
|
+
.pill.neutral {
|
|
175
|
+
background: rgba(102, 95, 84, 0.14);
|
|
176
|
+
color: var(--muted);
|
|
177
|
+
}
|
|
178
|
+
.failure-panel {
|
|
179
|
+
border-left: 6px solid var(--fail);
|
|
180
|
+
}
|
|
181
|
+
.emphasis-panel {
|
|
182
|
+
border-left: 6px solid var(--accent);
|
|
183
|
+
}
|
|
160
184
|
.stack,
|
|
161
185
|
.timeline {
|
|
162
186
|
display: grid;
|
|
@@ -166,6 +190,74 @@ select {
|
|
|
166
190
|
.timeline.compact {
|
|
167
191
|
gap: 0.35rem;
|
|
168
192
|
}
|
|
193
|
+
.timeline-detailed {
|
|
194
|
+
padding-left: 0;
|
|
195
|
+
list-style: none;
|
|
196
|
+
}
|
|
197
|
+
.timeline-item {
|
|
198
|
+
border-left: 3px solid var(--line);
|
|
199
|
+
padding-left: 0.9rem;
|
|
200
|
+
margin-left: 0.35rem;
|
|
201
|
+
}
|
|
202
|
+
.timeline-head,
|
|
203
|
+
.diff-card-head,
|
|
204
|
+
.compare-hero-head {
|
|
205
|
+
display: flex;
|
|
206
|
+
gap: 0.6rem;
|
|
207
|
+
align-items: center;
|
|
208
|
+
flex-wrap: wrap;
|
|
209
|
+
}
|
|
210
|
+
.timeline-step,
|
|
211
|
+
.event-chip {
|
|
212
|
+
display: inline-block;
|
|
213
|
+
padding: 0.2rem 0.55rem;
|
|
214
|
+
border-radius: 999px;
|
|
215
|
+
background: #efe5d5;
|
|
216
|
+
color: var(--ink);
|
|
217
|
+
font-size: 0.78rem;
|
|
218
|
+
font-family: "IBM Plex Mono", monospace;
|
|
219
|
+
text-transform: uppercase;
|
|
220
|
+
}
|
|
221
|
+
.diff-list {
|
|
222
|
+
padding-left: 0;
|
|
223
|
+
list-style: none;
|
|
224
|
+
}
|
|
225
|
+
.diff-card {
|
|
226
|
+
border: 1px solid var(--line);
|
|
227
|
+
border-radius: 12px;
|
|
228
|
+
padding: 0.8rem;
|
|
229
|
+
background: #faf5ec;
|
|
230
|
+
}
|
|
231
|
+
.compare-hero {
|
|
232
|
+
margin-bottom: 1rem;
|
|
233
|
+
}
|
|
234
|
+
.compare-hero.pass {
|
|
235
|
+
border-left: 6px solid var(--pass);
|
|
236
|
+
}
|
|
237
|
+
.compare-hero.fail {
|
|
238
|
+
border-left: 6px solid var(--fail);
|
|
239
|
+
}
|
|
240
|
+
.compare-hero.error {
|
|
241
|
+
border-left: 6px solid var(--error);
|
|
242
|
+
}
|
|
243
|
+
.compare-hero.neutral {
|
|
244
|
+
border-left: 6px solid var(--muted);
|
|
245
|
+
}
|
|
246
|
+
.compact-stats {
|
|
247
|
+
margin-top: 1rem;
|
|
248
|
+
margin-bottom: 0;
|
|
249
|
+
}
|
|
250
|
+
.compare-side.baseline-side {
|
|
251
|
+
border-top: 4px solid #b89d67;
|
|
252
|
+
}
|
|
253
|
+
.compare-side.candidate-side {
|
|
254
|
+
border-top: 4px solid var(--accent);
|
|
255
|
+
}
|
|
256
|
+
.compact-item {
|
|
257
|
+
border-left: none;
|
|
258
|
+
padding-left: 0;
|
|
259
|
+
margin-left: 0;
|
|
260
|
+
}
|
|
169
261
|
@media (max-width: 720px) {
|
|
170
262
|
.table {
|
|
171
263
|
display: block;
|