agent-regression-lab 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -4
- package/bin/agentlab.js +2 -0
- package/dist/config.js +13 -9
- package/dist/index.js +14 -0
- package/dist/init.js +88 -0
- package/dist/tools.js +18 -2
- package/dist/ui/App.js +49 -7
- package/dist/ui-assets/client.css +92 -0
- package/dist/ui-assets/client.js +102 -20
- package/docs/coding-agents.md +74 -0
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +55 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +6 -4
package/dist/ui-assets/client.js
CHANGED
|
@@ -21748,11 +21748,20 @@ function RunListPage() {
|
|
|
21748
21748
|
if (provider) url.searchParams.set("provider", provider);
|
|
21749
21749
|
void fetch(url).then((response) => response.json()).then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
21750
21750
|
}, [suite, status, provider]);
|
|
21751
|
+
const stats = summarizeRuns(runs);
|
|
21751
21752
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
|
|
21752
21753
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
|
|
21753
21754
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Runs" }),
|
|
21754
21755
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })
|
|
21755
21756
|
] }),
|
|
21757
|
+
runs.length > 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats dashboard-stats", children: [
|
|
21758
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runs shown", value: stats.total }),
|
|
21759
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Passing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "pass-text", children: stats.pass }) }),
|
|
21760
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Failing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "fail-text", children: stats.fail }) }),
|
|
21761
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Errors", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "error-text", children: stats.error }) }),
|
|
21762
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest suite", value: stats.latestSuite }),
|
|
21763
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest provider", value: stats.latestProvider })
|
|
21764
|
+
] }) : null,
|
|
21756
21765
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "filters", children: [
|
|
21757
21766
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
|
|
21758
21767
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [
|
|
@@ -21879,14 +21888,13 @@ function RunDetailPage(props) {
|
|
|
21879
21888
|
] }),
|
|
21880
21889
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21881
21890
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Trace" }),
|
|
21882
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21883
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { children: [
|
|
21884
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("
|
|
21885
|
-
|
|
21886
|
-
|
|
21887
|
-
event.type
|
|
21891
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "timeline-item", children: [
|
|
21892
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "timeline-head", children: [
|
|
21893
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("span", { className: "timeline-step", children: [
|
|
21894
|
+
"Step ",
|
|
21895
|
+
event.stepIndex
|
|
21888
21896
|
] }),
|
|
21889
|
-
" ",
|
|
21897
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: formatEventLabel(event.type) }),
|
|
21890
21898
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: event.source })
|
|
21891
21899
|
] }),
|
|
21892
21900
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: JSON.stringify(event.payload, null, 2) })
|
|
@@ -21899,7 +21907,7 @@ function FailureSummaryPanel(props) {
|
|
|
21899
21907
|
if (failureItems.length === 0) {
|
|
21900
21908
|
return null;
|
|
21901
21909
|
}
|
|
21902
|
-
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21910
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel failure-panel", children: [
|
|
21903
21911
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
|
|
21904
21912
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21905
21913
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
|
|
@@ -21982,13 +21990,14 @@ function ComparePage(props) {
|
|
|
21982
21990
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Compare" }),
|
|
21983
21991
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
|
|
21984
21992
|
] }),
|
|
21993
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparisonHero, { comparison: data }),
|
|
21985
21994
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21986
21995
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
21987
21996
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
|
|
21988
21997
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
|
|
21989
21998
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
|
|
21990
21999
|
] }),
|
|
21991
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22000
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel emphasis-panel", children: [
|
|
21992
22001
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
|
|
21993
22002
|
data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No material differences recorded." }) : null,
|
|
21994
22003
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
|
|
@@ -21997,15 +22006,24 @@ function ComparePage(props) {
|
|
|
21997
22006
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21998
22007
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
|
|
21999
22008
|
data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
|
|
22000
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22001
|
-
|
|
22002
|
-
|
|
22009
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22010
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22011
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.evaluatorId }),
|
|
22012
|
+
diff.hardGate ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: "hard gate" }) : null
|
|
22013
|
+
] }),
|
|
22014
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
22003
22015
|
] }, diff.evaluatorId)) })
|
|
22004
22016
|
] }),
|
|
22005
22017
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22006
22018
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
|
|
22007
22019
|
data.toolDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No tool usage changes." }) : null,
|
|
22008
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.
|
|
22020
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22021
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22022
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.toolName }),
|
|
22023
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })
|
|
22024
|
+
] }),
|
|
22025
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
22026
|
+
] }, diff.toolName)) })
|
|
22009
22027
|
] })
|
|
22010
22028
|
] }),
|
|
22011
22029
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-grid", children: [
|
|
@@ -22015,7 +22033,7 @@ function ComparePage(props) {
|
|
|
22015
22033
|
] });
|
|
22016
22034
|
}
|
|
22017
22035
|
function RunSide(props) {
|
|
22018
|
-
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className:
|
|
22036
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [
|
|
22019
22037
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
22020
22038
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
22021
22039
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Run:" }),
|
|
@@ -22073,10 +22091,10 @@ function RunSide(props) {
|
|
|
22073
22091
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Final output:" }) }),
|
|
22074
22092
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: props.detail.run.finalOutput || "(none)" }),
|
|
22075
22093
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h3", { children: "Trace" }),
|
|
22076
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22094
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { className: "timeline-item compact-item", children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22077
22095
|
event.stepIndex,
|
|
22078
22096
|
". ",
|
|
22079
|
-
event.type
|
|
22097
|
+
formatEventLabel(event.type)
|
|
22080
22098
|
] }) }, event.eventId)) })
|
|
22081
22099
|
] });
|
|
22082
22100
|
}
|
|
@@ -22103,6 +22121,7 @@ function SuiteComparePage(props) {
|
|
|
22103
22121
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
|
|
22104
22122
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
|
|
22105
22123
|
] }),
|
|
22124
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparisonHero, { data }),
|
|
22106
22125
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
22107
22126
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
22108
22127
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
|
|
@@ -22139,10 +22158,12 @@ function ScenarioList(props) {
|
|
|
22139
22158
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22140
22159
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
22141
22160
|
props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
|
|
22142
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22143
|
-
/* @__PURE__ */ (0, import_jsx_runtime.
|
|
22144
|
-
|
|
22145
|
-
|
|
22161
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22162
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22163
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
|
|
22164
|
+
" ",
|
|
22165
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification })
|
|
22166
|
+
] }),
|
|
22146
22167
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
|
|
22147
22168
|
] }, item.scenarioId)) })
|
|
22148
22169
|
] });
|
|
@@ -22159,6 +22180,33 @@ function EmptyState(props) {
|
|
|
22159
22180
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
|
|
22160
22181
|
] });
|
|
22161
22182
|
}
|
|
22183
|
+
/**
 * Banner shown at the top of the run-compare page.
 *
 * Renders the comparison's classification as a heading, its verdict delta as
 * a pill, and a muted summary line saying whether the output changed. When a
 * termination delta is present it is appended to that summary line.
 *
 * @param {object} props
 * @param {object} props.comparison - Supplies `classification`,
 *   `verdictDelta`, `outputChanged` and `terminationDelta`.
 * @returns The rendered `<section>` element.
 */
function ComparisonHero(props) {
  const { comparison } = props;
  // The same tone class is applied to both the panel and the verdict pill.
  const tone = mapClassificationToTone(comparison.classification);

  const heading = /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", {
    className: "compare-hero-head",
    children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: comparison.classification }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${tone}`, children: comparison.verdictDelta })
    ]
  });

  const outputChangedLabel = comparison.outputChanged ? "yes" : "no";
  let terminationLabel = "";
  if (comparison.terminationDelta) {
    terminationLabel = ` \u2022 termination: ${comparison.terminationDelta}`;
  }
  const summary = /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", {
    className: "muted",
    children: ["Output changed: ", outputChangedLabel, terminationLabel]
  });

  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", {
    className: `panel compare-hero ${tone}`,
    children: [heading, summary]
  });
}
|
|
22197
|
+
/**
 * Banner shown at the top of the suite-compare page.
 *
 * Renders a fixed "Suite movement" heading with the suite classification as a
 * chip, followed by three counters: how many entries are in the suite
 * comparison's `regressions`, `improvements` and `unchanged` lists.
 *
 * @param {object} props
 * @param {object} props.data - Supplies `classification` plus the
 *   `regressions`, `improvements` and `unchanged` arrays.
 * @returns The rendered `<section>` element.
 */
function SuiteComparisonHero(props) {
  const { data } = props;

  const heading = /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", {
    className: "compare-hero-head",
    children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Suite movement" }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: data.classification })
    ]
  });

  const regressionsStat = /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Regressions", value: data.regressions.length });
  const improvementsStat = /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Improvements", value: data.improvements.length });
  const unchangedStat = /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Unchanged", value: data.unchanged.length });
  const counters = /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", {
    className: "stats compact-stats",
    children: [regressionsStat, improvementsStat, unchangedStat]
  });

  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", {
    className: "panel compare-hero neutral",
    children: [heading, counters]
  });
}
|
|
22162
22210
|
function getFailureSummaryItems(detail) {
|
|
22163
22211
|
const items = [];
|
|
22164
22212
|
if (detail.errorDetail) {
|
|
@@ -22174,6 +22222,40 @@ function getFailureSummaryItems(detail) {
|
|
|
22174
22222
|
}
|
|
22175
22223
|
return items;
|
|
22176
22224
|
}
|
|
22225
|
+
/**
 * Build the headline numbers for a list of runs.
 *
 * Counts runs by `status` in a single pass (previously three separate
 * `filter` passes). Statuses other than "pass", "fail" and "error" count
 * towards `total` only.
 *
 * @param {Array<object>} runs - Run records; each is read for `status`, and
 *   the first one for `suite` and `provider`.
 * @returns {{ total: number, pass: number, fail: number, error: number,
 *   latestSuite: *, latestProvider: * }} Summary; `latestSuite` and
 *   `latestProvider` fall back to "-" when the list is empty or the first
 *   run lacks the field.
 */
function summarizeRuns(runs) {
  let pass = 0;
  let fail = 0;
  let error = 0;
  for (const run of runs) {
    switch (run.status) {
      case "pass":
        pass += 1;
        break;
      case "fail":
        fail += 1;
        break;
      case "error":
        error += 1;
        break;
      default:
        // Any other status is counted in `total` only.
        break;
    }
  }
  // "Latest" is simply the first element — presumably the list arrives
  // newest-first; confirm against the code that supplies `runs`.
  const latest = runs[0];
  return {
    total: runs.length,
    pass,
    fail,
    error,
    latestSuite: latest?.suite ?? "-",
    latestProvider: latest?.provider ?? "-"
  };
}
|
|
22235
|
+
/**
 * Turn a trace event type into a display label by replacing every underscore
 * with a space (e.g. "tool_call_started" becomes "tool call started").
 *
 * A missing type (`null`/`undefined`) yields an empty label and any other
 * non-string value is stringified first, so one malformed event cannot throw
 * while the trace timeline is being rendered.
 *
 * @param {*} type - Event type, normally a string.
 * @returns {string} The label text.
 */
function formatEventLabel(type) {
  return String(type ?? "").replaceAll("_", " ");
}
|
|
22238
|
+
/**
 * Map a tool-diff risk level to the name of a pill style.
 *
 * @param {*} risk - Risk level; only "high" and "medium" are special-cased.
 * @returns {string} "fail" for "high", "error" for "medium", and "pass" for
 *   every other value.
 */
function mapRiskToPill(risk) {
  switch (risk) {
    case "high":
      return "fail";
    case "medium":
      return "error";
    default:
      return "pass";
  }
}
|
|
22247
|
+
/**
 * Map a comparison classification to a tone name, used as a CSS class on the
 * compare banner and its verdict pill.
 *
 * Matching is by substring, checked in this order:
 *   contains "regress"   -> "fail"
 *   contains "improv"    -> "pass"
 *   contains "unchanged" -> "neutral"
 *   contains "changed"   -> "error"
 *   anything else        -> "neutral"
 *
 * "unchanged" must be tested before "changed": it contains "changed" as a
 * substring, so a bare "changed" test would give unchanged comparisons the
 * "error" tone. A non-string classification is treated as unknown and maps
 * to "neutral" rather than throwing.
 *
 * @param {*} classification - Classification label, normally a string.
 * @returns {string} One of "fail", "pass", "error" or "neutral".
 */
function mapClassificationToTone(classification) {
  if (typeof classification !== "string") {
    return "neutral";
  }
  if (classification.includes("regress")) {
    return "fail";
  }
  if (classification.includes("improv")) {
    return "pass";
  }
  if (classification.includes("unchanged")) {
    return "neutral";
  }
  if (classification.includes("changed")) {
    return "error";
  }
  return "neutral";
}
|
|
22177
22259
|
/**
 * Format a delta with an explicit sign: values greater than zero get a
 * leading "+", everything else is stringified unchanged (negative numbers
 * already carry their "-").
 *
 * @param {number} value - The delta to format.
 * @returns {string} The signed text, e.g. "+3", "-2" or "0".
 */
function signed(value) {
  const prefix = value > 0 ? "+" : "";
  return `${prefix}${value}`;
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Coding Agents
|
|
2
|
+
|
|
3
|
+
ARL supports coding-agent regression workflows through deterministic task scenarios.
|
|
4
|
+
|
|
5
|
+
Use this path when the runner should remain authoritative for:
|
|
6
|
+
|
|
7
|
+
- file inspection tools
|
|
8
|
+
- patch application tools
|
|
9
|
+
- step limits
|
|
10
|
+
- regression scoring
|
|
11
|
+
|
|
12
|
+
## Start With The Built-In Coding Scenarios
|
|
13
|
+
|
|
14
|
+
This repo already includes two coding scenarios:
|
|
15
|
+
|
|
16
|
+
- `coding.fix-add-function`
|
|
17
|
+
- `coding.update-greeting`
|
|
18
|
+
|
|
19
|
+
Run one directly:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
agentlab run coding.fix-add-function --agent mock-default
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
These scenarios use fixture-backed repo tools, which makes them useful for:
|
|
26
|
+
|
|
27
|
+
- prompt changes
|
|
28
|
+
- model comparisons
|
|
29
|
+
- patch-discipline checks
|
|
30
|
+
- pre-merge behavioral regression checks
|
|
31
|
+
|
|
32
|
+
## Why This Matters
|
|
33
|
+
|
|
34
|
+
Coding agents often regress in subtle ways:
|
|
35
|
+
|
|
36
|
+
- they inspect too much of the repo
|
|
37
|
+
- they patch the wrong file
|
|
38
|
+
- they over-edit instead of making a narrow change
|
|
39
|
+
- they stop naming the changed file clearly
|
|
40
|
+
|
|
41
|
+
ARL helps by making those expectations explicit in scenario evaluators.
|
|
42
|
+
|
|
43
|
+
## Minimal Workflow
|
|
44
|
+
|
|
45
|
+
1. run one coding scenario locally
|
|
46
|
+
2. inspect the run output and trace
|
|
47
|
+
3. run it again against a changed prompt/model/agent variant
|
|
48
|
+
4. compare the two runs
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
agentlab run coding.fix-add-function --agent mock-default
|
|
54
|
+
# change the prompt, model, or agent variant, then run the scenario again
agentlab run coding.fix-add-function --agent mock-default
|
|
55
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## When To Use Task Scenarios Versus HTTP
|
|
59
|
+
|
|
60
|
+
Use task scenarios for coding agents when:
|
|
61
|
+
|
|
62
|
+
- you want deterministic fixture-backed tools
|
|
63
|
+
- you want ARL to own the tool loop
|
|
64
|
+
- you want reproducible patch-evaluator behavior
|
|
65
|
+
|
|
66
|
+
Use HTTP/conversation scenarios only when the coding agent already exists as a running service and owns its own orchestration internally.
|
|
67
|
+
|
|
68
|
+
## Next Step
|
|
69
|
+
|
|
70
|
+
If you want coding-agent checks in team workflows, pair these scenarios with suite definitions and CI:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
agentlab run --suite-def pre_merge --agent mock-default
|
|
74
|
+
```
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Phase 2 Lite And Phase 3 Implementation Plan
|
|
2
|
+
|
|
3
|
+
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
4
|
+
|
|
5
|
+
**Goal:** Deliver a minimal integration story for new users, then improve the UI enough that ARL is easier to demo, screenshot, and understand visually.
|
|
6
|
+
|
|
7
|
+
**Architecture:** Keep Phase 2-lite focused on assets that clarify adoption: README routing, one coding-agent path, and one CI path. Keep Phase 3 focused on UI clarity instead of new product surface area by improving the runs dashboard, comparison screens, and trace presentation inside the existing React UI.
|
|
8
|
+
|
|
9
|
+
**Tech Stack:** TypeScript, React, node:test, esbuild, Markdown, GitHub Actions YAML
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## File Map
|
|
14
|
+
|
|
15
|
+
**Roadmap and product docs**
|
|
16
|
+
- Modify: `.claude/active-tasks.md`
|
|
17
|
+
- Modify: `.claude/project.md`
|
|
18
|
+
- Modify: `README.md`
|
|
19
|
+
|
|
20
|
+
**Phase 2-lite assets**
|
|
21
|
+
- Create: `docs/coding-agents.md`
|
|
22
|
+
- Create: `.github/workflows/agentlab-pre-merge.yml`
|
|
23
|
+
|
|
24
|
+
**UI**
|
|
25
|
+
- Modify: `src/ui/App.tsx`
|
|
26
|
+
- Modify: `src/ui/styles.css`
|
|
27
|
+
|
|
28
|
+
**Tests**
|
|
29
|
+
- Modify: `tests/launch/ui-smoke.test.ts`
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
### Task 1: Reframe Roadmap To Phase 2-lite Then Phase 3
|
|
34
|
+
|
|
35
|
+
**Files:**
|
|
36
|
+
- Modify: `.claude/active-tasks.md`
|
|
37
|
+
- Modify: `.claude/project.md`
|
|
38
|
+
|
|
39
|
+
- [ ] Update active task tracking so the current next phase is `Phase 2-lite`, not the original full Phase 2.
|
|
40
|
+
- [ ] Update project memory so Phase 2-lite is the minimal integration-story pass and Phase 3 is the main visual/demo workstream.
|
|
41
|
+
- [ ] Keep the scope explicit:
|
|
42
|
+
- HTTP example via `arl-test`
|
|
43
|
+
- CI example
|
|
44
|
+
- coding-agent example
|
|
45
|
+
- then UI polish
|
|
46
|
+
|
|
47
|
+
Verification:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
rg -n "Phase 2-lite|Phase 3" .claude/active-tasks.md .claude/project.md
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
### Task 2: Add Phase 2-lite Integration Assets
|
|
56
|
+
|
|
57
|
+
**Files:**
|
|
58
|
+
- Modify: `README.md`
|
|
59
|
+
- Create: `docs/coding-agents.md`
|
|
60
|
+
- Create: `.github/workflows/agentlab-pre-merge.yml`
|
|
61
|
+
|
|
62
|
+
- [ ] Add README routing sections:
|
|
63
|
+
- if your agent runs as an HTTP service
|
|
64
|
+
- if you are validating coding-agent changes
|
|
65
|
+
- if you want pre-merge regression checks in CI
|
|
66
|
+
- [ ] Add one coding-agent guide using the existing coding scenarios and current tool-loop model.
|
|
67
|
+
- [ ] Add one GitHub Actions example that runs:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
npm ci
|
|
71
|
+
npm run build
|
|
72
|
+
node dist/index.js run --suite-def pre_merge --agent mock-default
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
- [ ] Keep this section narrow and copy-pasteable. No broad framework matrix.
|
|
76
|
+
|
|
77
|
+
Verification:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
rg -n "HTTP service|coding-agent|pre-merge|GitHub Actions" README.md docs/coding-agents.md .github/workflows/agentlab-pre-merge.yml
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
### Task 3: Improve Runs Dashboard And Comparison UX
|
|
86
|
+
|
|
87
|
+
**Files:**
|
|
88
|
+
- Modify: `src/ui/App.tsx`
|
|
89
|
+
- Modify: `src/ui/styles.css`
|
|
90
|
+
- Modify: `tests/launch/ui-smoke.test.ts`
|
|
91
|
+
|
|
92
|
+
- [ ] Add a stronger runs dashboard summary at the top of the list page:
|
|
93
|
+
- total runs shown
|
|
94
|
+
- pass/fail/error counts
|
|
95
|
+
- most recent suite/context hint
|
|
96
|
+
- [ ] Redesign the compare page to make regressions visually obvious:
|
|
97
|
+
- top classification banner
|
|
98
|
+
- clearer delta cards
|
|
99
|
+
- evaluator/tool diff blocks with stronger hierarchy
|
|
100
|
+
- more obvious baseline vs candidate sections
|
|
101
|
+
- [ ] Make the suite compare page easier to scan:
|
|
102
|
+
- headline regression/improvement counts
|
|
103
|
+
- clearer scenario groupings
|
|
104
|
+
|
|
105
|
+
Verification:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
npx tsx --test tests/launch/ui-smoke.test.ts
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
### Task 4: Improve Trace And Detail Presentation
|
|
114
|
+
|
|
115
|
+
**Files:**
|
|
116
|
+
- Modify: `src/ui/App.tsx`
|
|
117
|
+
- Modify: `src/ui/styles.css`
|
|
118
|
+
- Modify: `tests/launch/ui-smoke.test.ts`
|
|
119
|
+
|
|
120
|
+
- [ ] Replace the plain trace list with a more intentional timeline treatment:
|
|
121
|
+
- event badges or type labels
|
|
122
|
+
- stronger step grouping
|
|
123
|
+
- clearer source metadata
|
|
124
|
+
- [ ] Keep failure-first behavior intact.
|
|
125
|
+
- [ ] Preserve readability on narrow screens.
|
|
126
|
+
|
|
127
|
+
Verification:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
npx tsx --test tests/launch/ui-smoke.test.ts
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
### Task 5: Full Verification
|
|
136
|
+
|
|
137
|
+
**Files:**
|
|
138
|
+
- Modify only if verification exposes issues
|
|
139
|
+
|
|
140
|
+
- [ ] Run focused UI/docs-related verification:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
npx tsx --test tests/launch/ui-smoke.test.ts tests/cliPackaging.test.ts
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
- [ ] Run full suite:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
npm test
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
- [ ] Run release gates:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
npm run check
|
|
156
|
+
npm run build
|
|
157
|
+
npm run smoke:cli
|
|
158
|
+
npm_config_cache=/tmp/agentlab-npm-cache npm pack --dry-run
|
|
159
|
+
```
|
|
160
|
+
|