agent-regression-lab 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21748,11 +21748,20 @@ function RunListPage() {
21748
21748
  if (provider) url.searchParams.set("provider", provider);
21749
21749
  void fetch(url).then((response) => response.json()).then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
21750
21750
  }, [suite, status, provider]);
21751
+ const stats = summarizeRuns(runs);
21751
21752
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
21752
21753
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
21753
21754
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Runs" }),
21754
21755
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })
21755
21756
  ] }),
21757
+ runs.length > 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats dashboard-stats", children: [
21758
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runs shown", value: stats.total }),
21759
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Passing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "pass-text", children: stats.pass }) }),
21760
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Failing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "fail-text", children: stats.fail }) }),
21761
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Errors", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "error-text", children: stats.error }) }),
21762
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest suite", value: stats.latestSuite }),
21763
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest provider", value: stats.latestProvider })
21764
+ ] }) : null,
21756
21765
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "filters", children: [
21757
21766
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
21758
21767
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [
@@ -21879,14 +21888,13 @@ function RunDetailPage(props) {
21879
21888
  ] }),
21880
21889
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21881
21890
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Trace" }),
21882
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
21883
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { children: [
21884
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
21885
- event.stepIndex,
21886
- ". ",
21887
- event.type
21891
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "timeline-item", children: [
21892
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "timeline-head", children: [
21893
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("span", { className: "timeline-step", children: [
21894
+ "Step ",
21895
+ event.stepIndex
21888
21896
  ] }),
21889
- " ",
21897
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: formatEventLabel(event.type) }),
21890
21898
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: event.source })
21891
21899
  ] }),
21892
21900
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: JSON.stringify(event.payload, null, 2) })
@@ -21899,7 +21907,7 @@ function FailureSummaryPanel(props) {
21899
21907
  if (failureItems.length === 0) {
21900
21908
  return null;
21901
21909
  }
21902
- return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21910
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel failure-panel", children: [
21903
21911
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
21904
21912
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21905
21913
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
@@ -21982,13 +21990,14 @@ function ComparePage(props) {
21982
21990
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Compare" }),
21983
21991
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
21984
21992
  ] }),
21993
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparisonHero, { comparison: data }),
21985
21994
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21986
21995
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
21987
21996
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
21988
21997
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
21989
21998
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
21990
21999
  ] }),
21991
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22000
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel emphasis-panel", children: [
21992
22001
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
21993
22002
  data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No material differences recorded." }) : null,
21994
22003
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
@@ -21997,15 +22006,24 @@ function ComparePage(props) {
21997
22006
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21998
22007
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
21999
22008
  data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
22000
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22001
- diff.note,
22002
- diff.hardGate ? " (hard gate)" : ""
22009
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22010
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22011
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.evaluatorId }),
22012
+ diff.hardGate ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: "hard gate" }) : null
22013
+ ] }),
22014
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
22003
22015
  ] }, diff.evaluatorId)) })
22004
22016
  ] }),
22005
22017
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22006
22018
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
22007
22019
  data.toolDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No tool usage changes." }) : null,
22008
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: diff.note }, diff.toolName)) })
22020
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22021
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22022
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.toolName }),
22023
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })
22024
+ ] }),
22025
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
22026
+ ] }, diff.toolName)) })
22009
22027
  ] })
22010
22028
  ] }),
22011
22029
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-grid", children: [
@@ -22015,7 +22033,7 @@ function ComparePage(props) {
22015
22033
  ] });
22016
22034
  }
22017
22035
  function RunSide(props) {
22018
- return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22036
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [
22019
22037
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22020
22038
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22021
22039
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Run:" }),
@@ -22073,10 +22091,10 @@ function RunSide(props) {
22073
22091
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Final output:" }) }),
22074
22092
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: props.detail.run.finalOutput || "(none)" }),
22075
22093
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h3", { children: "Trace" }),
22076
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
22094
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { className: "timeline-item compact-item", children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
22077
22095
  event.stepIndex,
22078
22096
  ". ",
22079
- event.type
22097
+ formatEventLabel(event.type)
22080
22098
  ] }) }, event.eventId)) })
22081
22099
  ] });
22082
22100
  }
@@ -22103,6 +22121,7 @@ function SuiteComparePage(props) {
22103
22121
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
22104
22122
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
22105
22123
  ] }),
22124
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparisonHero, { data }),
22106
22125
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
22107
22126
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
22108
22127
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
@@ -22139,10 +22158,12 @@ function ScenarioList(props) {
22139
22158
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22140
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22141
22160
  props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
22142
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22143
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22144
- " ",
22145
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
22161
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22162
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22163
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22164
+ " ",
22165
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification })
22166
+ ] }),
22146
22167
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
22147
22168
  ] }, item.scenarioId)) })
22148
22169
  ] });
@@ -22159,6 +22180,33 @@ function EmptyState(props) {
22159
22180
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22160
22181
  ] });
22161
22182
  }
22183
/**
 * Headline banner for a single run comparison.
 *
 * Renders a panel whose tone class is derived from the comparison's
 * classification, showing the classification as a heading, the verdict delta
 * as a pill, and a muted line stating whether the output changed (plus the
 * termination delta when one is present).
 *
 * @param {object} props
 * @param {object} props.comparison - Comparison payload; reads `classification`,
 *   `verdictDelta`, `outputChanged` and `terminationDelta`.
 * @returns The rendered section element.
 */
function ComparisonHero(props) {
  const { comparison } = props;
  const { jsx, jsxs } = import_jsx_runtime;
  const tone = mapClassificationToTone(comparison.classification);

  const head = jsxs("div", { className: "compare-hero-head", children: [
    jsx("h2", { children: comparison.classification }),
    jsx("span", { className: `pill ${tone}`, children: comparison.verdictDelta })
  ] });

  // The termination suffix is only appended when a delta was recorded.
  const outputChangedText = comparison.outputChanged ? "yes" : "no";
  const terminationText = comparison.terminationDelta ? ` \u2022 termination: ${comparison.terminationDelta}` : "";
  const summary = jsxs("p", { className: "muted", children: [
    "Output changed: ",
    outputChangedText,
    terminationText
  ] });

  return jsxs("section", { className: `panel compare-hero ${tone}`, children: [head, summary] });
}
22197
/**
 * Headline banner for a suite comparison.
 *
 * Shows a fixed "Suite movement" heading, the suite classification as a chip,
 * and three compact stats with the number of regressions, improvements and
 * unchanged entries.
 *
 * @param {object} props
 * @param {object} props.data - Suite comparison payload; reads `classification`
 *   and the lengths of `regressions`, `improvements` and `unchanged`.
 * @returns The rendered section element.
 */
function SuiteComparisonHero(props) {
  const { data } = props;
  const { jsx, jsxs } = import_jsx_runtime;

  const head = jsxs("div", { className: "compare-hero-head", children: [
    jsx("h2", { children: "Suite movement" }),
    jsx("span", { className: "event-chip", children: data.classification })
  ] });

  const counts = jsxs("div", { className: "stats compact-stats", children: [
    jsx(Stat, { label: "Regressions", value: data.regressions.length }),
    jsx(Stat, { label: "Improvements", value: data.improvements.length }),
    jsx(Stat, { label: "Unchanged", value: data.unchanged.length })
  ] });

  return jsxs("section", { className: "panel compare-hero neutral", children: [head, counts] });
}
22162
22210
  function getFailureSummaryItems(detail) {
22163
22211
  const items = [];
22164
22212
  if (detail.errorDetail) {
@@ -22174,6 +22222,40 @@ function getFailureSummaryItems(detail) {
22174
22222
  }
22175
22223
  return items;
22176
22224
  }
22225
/**
 * Builds the headline numbers for the runs dashboard.
 *
 * @param {Array<object>} runs - Runs to summarize; each run's `status`, and the
 *   first run's `suite` and `provider`, are read.
 * @returns {{total: number, pass: number, fail: number, error: number,
 *   latestSuite: *, latestProvider: *}} Total count, per-status counts for
 *   "pass" / "fail" / "error", and the first run's suite and provider
 *   ("-" when there is no first run or the field is null/undefined).
 */
function summarizeRuns(runs) {
  const total = runs.length;

  // Single pass over the list; statuses other than the three tracked ones
  // only contribute to `total`.
  const tally = runs.reduce(
    (acc, run) => {
      const status = run.status;
      if (status === "pass") {
        acc.pass += 1;
      } else if (status === "fail") {
        acc.fail += 1;
      } else if (status === "error") {
        acc.error += 1;
      }
      return acc;
    },
    { pass: 0, fail: 0, error: 0 }
  );

  const first = runs[0];
  return {
    total,
    pass: tally.pass,
    fail: tally.fail,
    error: tally.error,
    latestSuite: first?.suite ?? "-",
    latestProvider: first?.provider ?? "-"
  };
}
22235
/**
 * Turns a trace event type such as "tool_call_started" into a display label
 * by replacing every underscore with a space.
 *
 * A missing type yields an empty label instead of throwing, so one malformed
 * trace event cannot fail the render of the surrounding timeline. split/join
 * is used rather than String.prototype.replaceAll to avoid requiring an
 * ES2021 runtime.
 *
 * @param {*} type - Event type; normally a string. Other non-null values are
 *   converted with String().
 * @returns {string} The label with underscores replaced by spaces, or "" when
 *   `type` is null or undefined.
 */
function formatEventLabel(type) {
  if (type == null) {
    return "";
  }
  return String(type).split("_").join(" ");
}
22238
/**
 * Chooses the pill modifier class for a tool diff's risk level.
 *
 * @param {*} risk - Risk level; "high" and "medium" are recognized.
 * @returns {string} "fail" for "high", "error" for "medium", and "pass" for
 *   every other value.
 */
function mapRiskToPill(risk) {
  switch (risk) {
    case "high":
      return "fail";
    case "medium":
      return "error";
    default:
      return "pass";
  }
}
22247
/**
 * Maps a comparison classification to the tone class used by the compare
 * banner and its pill.
 *
 * Matching is by case-sensitive substring, checked in this order:
 *   - contains "regress"   -> "fail"
 *   - contains "improv"    -> "pass"
 *   - contains "unchanged" -> "neutral"
 *   - contains "changed"   -> "error"
 *   - anything else        -> "neutral"
 *
 * @param {*} classification - Classification label; normally a string.
 * @returns {string} One of "fail", "pass", "error" or "neutral". Non-string
 *   input yields "neutral" rather than throwing.
 */
function mapClassificationToTone(classification) {
  if (typeof classification !== "string") {
    return "neutral";
  }
  if (classification.includes("regress")) {
    return "fail";
  }
  if (classification.includes("improv")) {
    return "pass";
  }
  // "unchanged" itself contains the substring "changed", so it has to be
  // ruled out before the "changed" test or unchanged comparisons would be
  // given the "error" tone.
  if (classification.includes("unchanged")) {
    return "neutral";
  }
  if (classification.includes("changed")) {
    return "error";
  }
  return "neutral";
}
22177
22259
  function signed(value) {
22178
22260
  return value > 0 ? `+${value}` : `${value}`;
22179
22261
  }
@@ -0,0 +1,74 @@
1
+ # Coding Agents
2
+
3
+ ARL (agent-regression-lab) supports coding-agent regression workflows through deterministic task scenarios.
4
+
5
+ Use this path when the runner should remain authoritative for:
6
+
7
+ - file inspection tools
8
+ - patch application tools
9
+ - step limits
10
+ - regression scoring
11
+
12
+ ## Start With The Built-In Coding Scenarios
13
+
14
+ This repo already includes two coding scenarios:
15
+
16
+ - `coding.fix-add-function`
17
+ - `coding.update-greeting`
18
+
19
+ Run one directly:
20
+
21
+ ```bash
22
+ agentlab run coding.fix-add-function --agent mock-default
23
+ ```
24
+
25
+ These scenarios use fixture-backed repo tools, which makes them useful for:
26
+
27
+ - prompt changes
28
+ - model comparisons
29
+ - patch-discipline checks
30
+ - pre-merge behavioral regression checks
31
+
32
+ ## Why This Matters
33
+
34
+ Coding agents often regress in subtle ways:
35
+
36
+ - they inspect too much of the repo
37
+ - they patch the wrong file
38
+ - they over-edit instead of making a narrow change
39
+ - they stop naming the changed file clearly
40
+
41
+ ARL helps by making those expectations explicit in scenario evaluators.
42
+
43
+ ## Minimal Workflow
44
+
45
+ 1. run one coding scenario locally
46
+ 2. inspect the run output and trace
47
+ 3. run it again against a changed prompt/model/agent variant
48
+ 4. compare the two runs
49
+
50
+ Example (for the second run, substitute the changed prompt, model, or agent variant you want to evaluate):
51
+
52
+ ```bash
53
+ agentlab run coding.fix-add-function --agent mock-default
54
+ agentlab run coding.fix-add-function --agent mock-default
55
+ agentlab compare <baseline-run-id> <candidate-run-id>
56
+ ```
57
+
58
+ ## When To Use Task Scenarios Versus HTTP
59
+
60
+ Use task scenarios for coding agents when:
61
+
62
+ - you want deterministic fixture-backed tools
63
+ - you want ARL to own the tool loop
64
+ - you want reproducible patch-evaluator behavior
65
+
66
+ Use HTTP/conversation scenarios only when the coding agent already exists as a running service and owns its own orchestration internally.
67
+
68
+ ## Next Step
69
+
70
+ If you want coding-agent checks in team workflows, pair these scenarios with suite definitions and CI:
71
+
72
+ ```bash
73
+ agentlab run --suite-def pre_merge --agent mock-default
74
+ ```
@@ -0,0 +1,160 @@
1
+ # Phase 2 Lite And Phase 3 Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Deliver a minimal integration story for new users, then improve the UI enough that ARL is easier to demo, screenshot, and understand visually.
6
+
7
+ **Architecture:** Keep Phase 2-lite focused on assets that clarify adoption: README routing, one coding-agent path, and one CI path. Keep Phase 3 focused on UI clarity instead of new product surface area by improving the runs dashboard, comparison screens, and trace presentation inside the existing React UI.
8
+
9
+ **Tech Stack:** TypeScript, React, node:test, esbuild, Markdown, GitHub Actions YAML
10
+
11
+ ---
12
+
13
+ ## File Map
14
+
15
+ **Roadmap and product docs**
16
+ - Modify: `.claude/active-tasks.md`
17
+ - Modify: `.claude/project.md`
18
+ - Modify: `README.md`
19
+
20
+ **Phase 2-lite assets**
21
+ - Create: `docs/coding-agents.md`
22
+ - Create: `.github/workflows/agentlab-pre-merge.yml`
23
+
24
+ **UI**
25
+ - Modify: `src/ui/App.tsx`
26
+ - Modify: `src/ui/styles.css`
27
+
28
+ **Tests**
29
+ - Modify: `tests/launch/ui-smoke.test.ts`
30
+
31
+ ---
32
+
33
+ ### Task 1: Reframe Roadmap To Phase 2-lite Then Phase 3
34
+
35
+ **Files:**
36
+ - Modify: `.claude/active-tasks.md`
37
+ - Modify: `.claude/project.md`
38
+
39
+ - [ ] Update active task tracking so the current next phase is `Phase 2-lite`, not the original full Phase 2.
40
+ - [ ] Update project memory so Phase 2-lite is the minimal integration-story pass and Phase 3 is the main visual/demo workstream.
41
+ - [ ] Keep the scope explicit:
42
+ - HTTP example via `arl-test`
43
+ - CI example
44
+ - coding-agent example
45
+ - then UI polish
46
+
47
+ Verification:
48
+
49
+ ```bash
50
+ rg -n "Phase 2-lite|Phase 3" .claude/active-tasks.md .claude/project.md
51
+ ```
52
+
53
+ ---
54
+
55
+ ### Task 2: Add Phase 2-lite Integration Assets
56
+
57
+ **Files:**
58
+ - Modify: `README.md`
59
+ - Create: `docs/coding-agents.md`
60
+ - Create: `.github/workflows/agentlab-pre-merge.yml`
61
+
62
+ - [ ] Add README routing sections:
63
+ - if your agent runs as an HTTP service
64
+ - if you are validating coding-agent changes
65
+ - if you want pre-merge regression checks in CI
66
+ - [ ] Add one coding-agent guide using the existing coding scenarios and current tool-loop model.
67
+ - [ ] Add one GitHub Actions example that runs:
68
+
69
+ ```bash
70
+ npm ci
71
+ npm run build
72
+ node dist/index.js run --suite-def pre_merge --agent mock-default
73
+ ```
74
+
75
+ - [ ] Keep this section narrow and copy-pasteable. No broad framework matrix.
76
+
77
+ Verification:
78
+
79
+ ```bash
80
+ rg -n "HTTP service|coding-agent|pre-merge|GitHub Actions" README.md docs/coding-agents.md .github/workflows/agentlab-pre-merge.yml
81
+ ```
82
+
83
+ ---
84
+
85
+ ### Task 3: Improve Runs Dashboard And Comparison UX
86
+
87
+ **Files:**
88
+ - Modify: `src/ui/App.tsx`
89
+ - Modify: `src/ui/styles.css`
90
+ - Modify: `tests/launch/ui-smoke.test.ts`
91
+
92
+ - [ ] Add a stronger runs dashboard summary at the top of the list page:
93
+ - total runs shown
94
+ - pass/fail/error counts
95
+ - most recent suite/context hint
96
+ - [ ] Redesign the compare page to make regressions visually obvious:
97
+ - top classification banner
98
+ - clearer delta cards
99
+ - evaluator/tool diff blocks with stronger hierarchy
100
+ - more obvious baseline vs candidate sections
101
+ - [ ] Make the suite compare page easier to scan:
102
+ - headline regression/improvement counts
103
+ - clearer scenario groupings
104
+
105
+ Verification:
106
+
107
+ ```bash
108
+ npx tsx --test tests/launch/ui-smoke.test.ts
109
+ ```
110
+
111
+ ---
112
+
113
+ ### Task 4: Improve Trace And Detail Presentation
114
+
115
+ **Files:**
116
+ - Modify: `src/ui/App.tsx`
117
+ - Modify: `src/ui/styles.css`
118
+ - Modify: `tests/launch/ui-smoke.test.ts`
119
+
120
+ - [ ] Replace the plain trace list with a more intentional timeline treatment:
121
+ - event badges or type labels
122
+ - stronger step grouping
123
+ - clearer source metadata
124
+ - [ ] Keep failure-first behavior intact.
125
+ - [ ] Preserve readability on narrow screens.
126
+
127
+ Verification:
128
+
129
+ ```bash
130
+ npx tsx --test tests/launch/ui-smoke.test.ts
131
+ ```
132
+
133
+ ---
134
+
135
+ ### Task 5: Full Verification
136
+
137
+ **Files:**
138
+ - Modify only if verification exposes issues
139
+
140
+ - [ ] Run focused UI/docs-related verification:
141
+
142
+ ```bash
143
+ npx tsx --test tests/launch/ui-smoke.test.ts tests/cliPackaging.test.ts
144
+ ```
145
+
146
+ - [ ] Run full suite:
147
+
148
+ ```bash
149
+ npm test
150
+ ```
151
+
152
+ - [ ] Run release gates:
153
+
154
+ ```bash
155
+ npm run check
156
+ npm run build
157
+ npm run smoke:cli
158
+ npm_config_cache=/tmp/agentlab-npm-cache npm pack --dry-run
159
+ ```
160
+