agent-regression-lab 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -11
- package/bin/agentlab.js +2 -0
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +199 -12
- package/dist/evaluators.js +56 -1
- package/dist/index.js +157 -11
- package/dist/init.js +88 -0
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +56 -2
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +75 -7
- package/dist/ui-assets/client.css +92 -0
- package/dist/ui-assets/client.js +183 -19
- package/docs/agents.md +143 -8
- package/docs/coding-agents.md +74 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +193 -0
- package/docs/variant-sets.md +63 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +7 -5
package/dist/ui-assets/client.js
CHANGED
|
@@ -21748,11 +21748,20 @@ function RunListPage() {
|
|
|
21748
21748
|
if (provider) url.searchParams.set("provider", provider);
|
|
21749
21749
|
void fetch(url).then((response) => response.json()).then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
21750
21750
|
}, [suite, status, provider]);
|
|
21751
|
+
const stats = summarizeRuns(runs);
|
|
21751
21752
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
|
|
21752
21753
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
|
|
21753
21754
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Runs" }),
|
|
21754
21755
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })
|
|
21755
21756
|
] }),
|
|
21757
|
+
runs.length > 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats dashboard-stats", children: [
|
|
21758
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runs shown", value: stats.total }),
|
|
21759
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Passing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "pass-text", children: stats.pass }) }),
|
|
21760
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Failing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "fail-text", children: stats.fail }) }),
|
|
21761
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Errors", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "error-text", children: stats.error }) }),
|
|
21762
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest suite", value: stats.latestSuite }),
|
|
21763
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest provider", value: stats.latestProvider })
|
|
21764
|
+
] }) : null,
|
|
21756
21765
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "filters", children: [
|
|
21757
21766
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
|
|
21758
21767
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [
|
|
@@ -21816,6 +21825,7 @@ function RunDetailPage(props) {
|
|
|
21816
21825
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
|
|
21817
21826
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
|
|
21818
21827
|
] }),
|
|
21828
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
|
|
21819
21829
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21820
21830
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
|
|
21821
21831
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
|
|
@@ -21835,6 +21845,7 @@ function RunDetailPage(props) {
|
|
|
21835
21845
|
" ",
|
|
21836
21846
|
detail.agentVersion?.modelId ?? "-"
|
|
21837
21847
|
] }),
|
|
21848
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
|
|
21838
21849
|
detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21839
21850
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
|
|
21840
21851
|
" ",
|
|
@@ -21877,14 +21888,13 @@ function RunDetailPage(props) {
|
|
|
21877
21888
|
] }),
|
|
21878
21889
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21879
21890
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Trace" }),
|
|
21880
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21881
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { children: [
|
|
21882
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("
|
|
21883
|
-
|
|
21884
|
-
|
|
21885
|
-
event.type
|
|
21891
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "timeline-item", children: [
|
|
21892
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "timeline-head", children: [
|
|
21893
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("span", { className: "timeline-step", children: [
|
|
21894
|
+
"Step ",
|
|
21895
|
+
event.stepIndex
|
|
21886
21896
|
] }),
|
|
21887
|
-
" ",
|
|
21897
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: formatEventLabel(event.type) }),
|
|
21888
21898
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: event.source })
|
|
21889
21899
|
] }),
|
|
21890
21900
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: JSON.stringify(event.payload, null, 2) })
|
|
@@ -21892,6 +21902,71 @@ function RunDetailPage(props) {
|
|
|
21892
21902
|
] })
|
|
21893
21903
|
] });
|
|
21894
21904
|
}
|
|
21905
|
+
function FailureSummaryPanel(props) {
|
|
21906
|
+
const failureItems = getFailureSummaryItems(props.detail);
|
|
21907
|
+
if (failureItems.length === 0) {
|
|
21908
|
+
return null;
|
|
21909
|
+
}
|
|
21910
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel failure-panel", children: [
|
|
21911
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
|
|
21912
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21913
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
|
|
21914
|
+
" ",
|
|
21915
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
|
|
21916
|
+
] }),
|
|
21917
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21918
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
|
|
21919
|
+
" ",
|
|
21920
|
+
props.detail.run.terminationReason
|
|
21921
|
+
] }),
|
|
21922
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
|
|
21923
|
+
] });
|
|
21924
|
+
}
|
|
21925
|
+
function RunIdentitySummary(props) {
|
|
21926
|
+
const run = props.detail.run;
|
|
21927
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
|
|
21928
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21929
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant set:" }),
|
|
21930
|
+
" ",
|
|
21931
|
+
run.variantSetName ?? "-"
|
|
21932
|
+
] }),
|
|
21933
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21934
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant:" }),
|
|
21935
|
+
" ",
|
|
21936
|
+
run.variantLabel ?? "-"
|
|
21937
|
+
] }),
|
|
21938
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21939
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Prompt version:" }),
|
|
21940
|
+
" ",
|
|
21941
|
+
run.promptVersion ?? "-"
|
|
21942
|
+
] }),
|
|
21943
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21944
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Model version:" }),
|
|
21945
|
+
" ",
|
|
21946
|
+
run.modelVersion ?? "-"
|
|
21947
|
+
] }),
|
|
21948
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21949
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Tool schema version:" }),
|
|
21950
|
+
" ",
|
|
21951
|
+
run.toolSchemaVersion ?? "-"
|
|
21952
|
+
] }),
|
|
21953
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21954
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Config label:" }),
|
|
21955
|
+
" ",
|
|
21956
|
+
run.configLabel ?? "-"
|
|
21957
|
+
] }),
|
|
21958
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21959
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Runtime profile:" }),
|
|
21960
|
+
" ",
|
|
21961
|
+
run.runtimeProfileName ?? "-"
|
|
21962
|
+
] }),
|
|
21963
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21964
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Suite definition:" }),
|
|
21965
|
+
" ",
|
|
21966
|
+
run.suiteDefinitionName ?? "-"
|
|
21967
|
+
] })
|
|
21968
|
+
] });
|
|
21969
|
+
}
|
|
21895
21970
|
function ComparePage(props) {
|
|
21896
21971
|
const [data, setData] = (0, import_react.useState)(null);
|
|
21897
21972
|
(0, import_react.useEffect)(() => {
|
|
@@ -21915,13 +21990,14 @@ function ComparePage(props) {
|
|
|
21915
21990
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Compare" }),
|
|
21916
21991
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
|
|
21917
21992
|
] }),
|
|
21993
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparisonHero, { comparison: data }),
|
|
21918
21994
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21919
21995
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
21920
21996
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
|
|
21921
21997
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
|
|
21922
21998
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
|
|
21923
21999
|
] }),
|
|
21924
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22000
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel emphasis-panel", children: [
|
|
21925
22001
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
|
|
21926
22002
|
data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No material differences recorded." }) : null,
|
|
21927
22003
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
|
|
@@ -21930,15 +22006,24 @@ function ComparePage(props) {
|
|
|
21930
22006
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21931
22007
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
|
|
21932
22008
|
data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
|
|
21933
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21934
|
-
|
|
21935
|
-
|
|
22009
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22010
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22011
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.evaluatorId }),
|
|
22012
|
+
diff.hardGate ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: "hard gate" }) : null
|
|
22013
|
+
] }),
|
|
22014
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
21936
22015
|
] }, diff.evaluatorId)) })
|
|
21937
22016
|
] }),
|
|
21938
22017
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21939
22018
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
|
|
21940
22019
|
data.toolDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No tool usage changes." }) : null,
|
|
21941
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.
|
|
22020
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22021
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22022
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.toolName }),
|
|
22023
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })
|
|
22024
|
+
] }),
|
|
22025
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
22026
|
+
] }, diff.toolName)) })
|
|
21942
22027
|
] })
|
|
21943
22028
|
] }),
|
|
21944
22029
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-grid", children: [
|
|
@@ -21948,7 +22033,7 @@ function ComparePage(props) {
|
|
|
21948
22033
|
] });
|
|
21949
22034
|
}
|
|
21950
22035
|
function RunSide(props) {
|
|
21951
|
-
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className:
|
|
22036
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [
|
|
21952
22037
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
21953
22038
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21954
22039
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Run:" }),
|
|
@@ -22006,10 +22091,10 @@ function RunSide(props) {
|
|
|
22006
22091
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Final output:" }) }),
|
|
22007
22092
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: props.detail.run.finalOutput || "(none)" }),
|
|
22008
22093
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h3", { children: "Trace" }),
|
|
22009
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22094
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { className: "timeline-item compact-item", children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22010
22095
|
event.stepIndex,
|
|
22011
22096
|
". ",
|
|
22012
|
-
event.type
|
|
22097
|
+
formatEventLabel(event.type)
|
|
22013
22098
|
] }) }, event.eventId)) })
|
|
22014
22099
|
] });
|
|
22015
22100
|
}
|
|
@@ -22036,6 +22121,7 @@ function SuiteComparePage(props) {
|
|
|
22036
22121
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
|
|
22037
22122
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
|
|
22038
22123
|
] }),
|
|
22124
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparisonHero, { data }),
|
|
22039
22125
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
22040
22126
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
22041
22127
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
|
|
@@ -22072,10 +22158,12 @@ function ScenarioList(props) {
|
|
|
22072
22158
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22073
22159
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
22074
22160
|
props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
|
|
22075
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22076
|
-
/* @__PURE__ */ (0, import_jsx_runtime.
|
|
22077
|
-
|
|
22078
|
-
|
|
22161
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22162
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22163
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
|
|
22164
|
+
" ",
|
|
22165
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification })
|
|
22166
|
+
] }),
|
|
22079
22167
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
|
|
22080
22168
|
] }, item.scenarioId)) })
|
|
22081
22169
|
] });
|
|
@@ -22092,6 +22180,82 @@ function EmptyState(props) {
|
|
|
22092
22180
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
|
|
22093
22181
|
] });
|
|
22094
22182
|
}
|
|
22183
|
+
function ComparisonHero(props) {
|
|
22184
|
+
const tone = mapClassificationToTone(props.comparison.classification);
|
|
22185
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-hero ${tone}`, children: [
|
|
22186
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
|
|
22187
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.comparison.classification }),
|
|
22188
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta })
|
|
22189
|
+
] }),
|
|
22190
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { className: "muted", children: [
|
|
22191
|
+
"Output changed: ",
|
|
22192
|
+
props.comparison.outputChanged ? "yes" : "no",
|
|
22193
|
+
props.comparison.terminationDelta ? ` \u2022 termination: ${props.comparison.terminationDelta}` : ""
|
|
22194
|
+
] })
|
|
22195
|
+
] });
|
|
22196
|
+
}
|
|
22197
|
+
function SuiteComparisonHero(props) {
|
|
22198
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel compare-hero neutral", children: [
|
|
22199
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
|
|
22200
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Suite movement" }),
|
|
22201
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: props.data.classification })
|
|
22202
|
+
] }),
|
|
22203
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats compact-stats", children: [
|
|
22204
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Regressions", value: props.data.regressions.length }),
|
|
22205
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Improvements", value: props.data.improvements.length }),
|
|
22206
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Unchanged", value: props.data.unchanged.length })
|
|
22207
|
+
] })
|
|
22208
|
+
] });
|
|
22209
|
+
}
|
|
22210
|
+
function getFailureSummaryItems(detail) {
|
|
22211
|
+
const items = [];
|
|
22212
|
+
if (detail.errorDetail) {
|
|
22213
|
+
items.push(`Error: ${detail.errorDetail}`);
|
|
22214
|
+
}
|
|
22215
|
+
for (const result of detail.evaluatorResults) {
|
|
22216
|
+
if (result.status === "fail") {
|
|
22217
|
+
items.push(`Evaluator ${result.evaluatorId}: ${result.message}`);
|
|
22218
|
+
}
|
|
22219
|
+
}
|
|
22220
|
+
if (detail.run.status !== "pass" && items.length === 0) {
|
|
22221
|
+
items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
|
|
22222
|
+
}
|
|
22223
|
+
return items;
|
|
22224
|
+
}
|
|
22225
|
+
function summarizeRuns(runs) {
|
|
22226
|
+
return {
|
|
22227
|
+
total: runs.length,
|
|
22228
|
+
pass: runs.filter((run) => run.status === "pass").length,
|
|
22229
|
+
fail: runs.filter((run) => run.status === "fail").length,
|
|
22230
|
+
error: runs.filter((run) => run.status === "error").length,
|
|
22231
|
+
latestSuite: runs[0]?.suite ?? "-",
|
|
22232
|
+
latestProvider: runs[0]?.provider ?? "-"
|
|
22233
|
+
};
|
|
22234
|
+
}
|
|
22235
|
+
function formatEventLabel(type) {
|
|
22236
|
+
return type.replaceAll("_", " ");
|
|
22237
|
+
}
|
|
22238
|
+
function mapRiskToPill(risk) {
|
|
22239
|
+
if (risk === "high") {
|
|
22240
|
+
return "fail";
|
|
22241
|
+
}
|
|
22242
|
+
if (risk === "medium") {
|
|
22243
|
+
return "error";
|
|
22244
|
+
}
|
|
22245
|
+
return "pass";
|
|
22246
|
+
}
|
|
22247
|
+
function mapClassificationToTone(classification) {
|
|
22248
|
+
if (classification.includes("regress")) {
|
|
22249
|
+
return "fail";
|
|
22250
|
+
}
|
|
22251
|
+
if (classification.includes("improv")) {
|
|
22252
|
+
return "pass";
|
|
22253
|
+
}
|
|
22254
|
+
if (classification.includes("changed")) {
|
|
22255
|
+
return "error";
|
|
22256
|
+
}
|
|
22257
|
+
return "neutral";
|
|
22258
|
+
}
|
|
22095
22259
|
function signed(value) {
|
|
22096
22260
|
return value > 0 ? `+${value}` : `${value}`;
|
|
22097
22261
|
}
|
package/docs/agents.md
CHANGED
|
@@ -2,15 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
Named agents are configured in `agentlab.config.yaml`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
|
|
6
|
+
|
|
7
|
+
This repo supports four provider modes:
|
|
6
8
|
|
|
7
9
|
- `mock`
|
|
8
10
|
- `openai`
|
|
9
11
|
- `external_process`
|
|
12
|
+
- `http`
|
|
13
|
+
|
|
14
|
+
Choose the simplest provider that answers the engineering question you actually have:
|
|
15
|
+
|
|
16
|
+
- `mock` for deterministic harness verification
|
|
17
|
+
- `openai` for real model behavior on deterministic tools
|
|
18
|
+
- `external_process` for local agents where the runner should still own the tool loop
|
|
19
|
+
- `http` for real running services that own their own memory and internal orchestration
|
|
10
20
|
|
|
11
21
|
## Named Agent Config
|
|
12
22
|
|
|
13
|
-
Example:
|
|
23
|
+
Example covering all providers:
|
|
14
24
|
|
|
15
25
|
```yaml
|
|
16
26
|
agents:
|
|
@@ -29,14 +39,31 @@ agents:
|
|
|
29
39
|
args:
|
|
30
40
|
- custom_agents/node_agent.mjs
|
|
31
41
|
label: custom-node-agent
|
|
42
|
+
|
|
43
|
+
- name: my-production-agent
|
|
44
|
+
provider: http
|
|
45
|
+
url: http://localhost:3000/api/chat
|
|
46
|
+
label: my-production-agent
|
|
32
47
|
```
|
|
33
48
|
|
|
34
49
|
Run a named agent with:
|
|
35
50
|
|
|
36
51
|
```bash
|
|
37
52
|
agentlab run support.refund-correct-order --agent mock-default
|
|
53
|
+
agentlab run internal-teams.memory-followup-recall --agent my-production-agent
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
|
|
60
|
+
agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
|
|
38
61
|
```
|
|
39
62
|
|
|
63
|
+
Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
40
67
|
## Mock
|
|
41
68
|
|
|
42
69
|
The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
|
|
@@ -47,6 +74,8 @@ Use it when you want:
|
|
|
47
74
|
- stable docs examples
|
|
48
75
|
- predictable benchmark behavior
|
|
49
76
|
|
|
77
|
+
---
|
|
78
|
+
|
|
50
79
|
## OpenAI
|
|
51
80
|
|
|
52
81
|
The OpenAI path uses your API key and a configured model.
|
|
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
|
|
|
65
94
|
|
|
66
95
|
The OpenAI path is useful, but less deterministic than the mock path.
|
|
67
96
|
|
|
97
|
+
---
|
|
98
|
+
|
|
68
99
|
## External Process
|
|
69
100
|
|
|
70
101
|
External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
|
|
@@ -110,14 +141,12 @@ Run one of them with:
|
|
|
110
141
|
agentlab run support.refund-via-config-tool --agent custom-node-agent
|
|
111
142
|
```
|
|
112
143
|
|
|
113
|
-
|
|
144
|
+
### Environment Allowlist
|
|
114
145
|
|
|
115
146
|
External-process agents can optionally define `envAllowlist`.
|
|
116
147
|
|
|
117
148
|
Use it when a child process needs specific environment variables passed through.
|
|
118
149
|
|
|
119
|
-
Example shape:
|
|
120
|
-
|
|
121
150
|
```yaml
|
|
122
151
|
agents:
|
|
123
152
|
- name: custom-agent
|
|
@@ -131,13 +160,117 @@ agents:
|
|
|
131
160
|
|
|
132
161
|
Only allow through what the child actually needs.
|
|
133
162
|
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## HTTP
|
|
166
|
+
|
|
167
|
+
The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
|
|
168
|
+
|
|
169
|
+
Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
|
|
170
|
+
|
|
171
|
+
Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
|
|
172
|
+
|
|
173
|
+
This is the default choice when validating memoryful or stateful agents that already run as a service.
|
|
174
|
+
|
|
175
|
+
HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept internal HTTP-agent tools.
|
|
176
|
+
|
|
177
|
+
### Minimal Config
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
agents:
|
|
181
|
+
- name: my-agent
|
|
182
|
+
provider: http
|
|
183
|
+
url: http://localhost:3000/api/chat
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
|
|
187
|
+
|
|
188
|
+
### Custom Field Names
|
|
189
|
+
|
|
190
|
+
If your agent uses different field names:
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
agents:
|
|
194
|
+
- name: my-agent-custom
|
|
195
|
+
provider: http
|
|
196
|
+
url: http://localhost:3000/api/chat
|
|
197
|
+
request_template:
|
|
198
|
+
query: "{{message}}"
|
|
199
|
+
session_id: "{{conversation_id}}"
|
|
200
|
+
response_field: reply
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`request_template` values support three placeholders:
|
|
204
|
+
|
|
205
|
+
- `{{message}}` — the current step message
|
|
206
|
+
- `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
|
|
207
|
+
- `{{env.VAR_NAME}}` — reads from the environment at runtime
|
|
208
|
+
|
|
209
|
+
Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
|
|
210
|
+
|
|
211
|
+
### Auth and Timeout
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
agents:
|
|
215
|
+
- name: my-agent-auth
|
|
216
|
+
provider: http
|
|
217
|
+
url: http://localhost:3000/api/chat
|
|
218
|
+
headers:
|
|
219
|
+
Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
|
|
220
|
+
timeout_ms: 10000
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
`timeout_ms` defaults to 30000 (30 seconds) if not set.
|
|
224
|
+
|
|
225
|
+
Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
|
|
226
|
+
|
|
227
|
+
### Full Config Reference
|
|
228
|
+
|
|
229
|
+
| Field | Required | Default | Description |
|
|
230
|
+
|-------|----------|---------|-------------|
|
|
231
|
+
| `url` | yes | — | HTTP endpoint to POST to |
|
|
232
|
+
| `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
|
|
233
|
+
| `response_field` | no | `message` | Field to read the reply from |
|
|
234
|
+
| `headers` | no | `{}` | Additional HTTP headers |
|
|
235
|
+
| `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
|
|
236
|
+
| `label` | no | agent name | Display label in CLI output and run history |
|
|
237
|
+
|
|
238
|
+
### How It Works
|
|
239
|
+
|
|
240
|
+
For each step in a conversation scenario:
|
|
241
|
+
|
|
242
|
+
1. agentlab generates a UUID `conversation_id` once at the start of the run
|
|
243
|
+
2. for every step, it POSTs the current message and `conversation_id` to your agent
|
|
244
|
+
3. your agent is responsible for maintaining conversation history using that id
|
|
245
|
+
4. agentlab reads the reply, measures latency, and runs per-step evaluators
|
|
246
|
+
5. if a hard-gate evaluator fails, the run stops immediately
|
|
247
|
+
|
|
248
|
+
### Error Handling
|
|
249
|
+
|
|
250
|
+
HTTP provider runs can end with these termination reasons:
|
|
251
|
+
|
|
252
|
+
| Reason | Cause |
|
|
253
|
+
|--------|-------|
|
|
254
|
+
| `http_connection_failed` | Could not connect to the URL |
|
|
255
|
+
| `http_error` | Agent returned HTTP 4xx or 5xx |
|
|
256
|
+
| `timeout_exceeded` | Request exceeded `timeout_ms` |
|
|
257
|
+
| `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
|
|
258
|
+
| `evaluator_failed` | A per-step hard-gate evaluator failed |
|
|
259
|
+
|
|
260
|
+
Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
134
264
|
## Best Practices
|
|
135
265
|
|
|
136
|
-
- use named agents instead of ad hoc
|
|
266
|
+
- use named agents instead of ad hoc provider flags
|
|
137
267
|
- keep labels stable so compare output stays readable
|
|
138
268
|
- prefer the mock path for smoke tests and docs
|
|
139
|
-
- use external-process agents when you want to wrap a local Node or Python agent
|
|
140
|
-
-
|
|
269
|
+
- use external-process agents when you want to wrap a local Node or Python agent
|
|
270
|
+
- use http agents when your agent is already running as a service
|
|
271
|
+
- keep the runner authoritative for tools and termination (external_process and mock)
|
|
272
|
+
- keep your agent authoritative for tools and history (http)
|
|
273
|
+
- choose the simplest provider that answers the engineering question you actually have
|
|
141
274
|
|
|
142
275
|
## Common Errors
|
|
143
276
|
|
|
@@ -148,5 +281,7 @@ Typical failures:
|
|
|
148
281
|
- missing external-process `command`
|
|
149
282
|
- invalid `args` or `envAllowlist`
|
|
150
283
|
- child process returning invalid JSON
|
|
284
|
+
- http agent URL unreachable because the service isn't running when the test starts
|
|
285
|
+
- http agent returning a field name that doesn't match `response_field`
|
|
151
286
|
|
|
152
287
|
See [troubleshooting.md](troubleshooting.md) for fixes.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Coding Agents
|
|
2
|
+
|
|
3
|
+
ARL supports coding-agent regression workflows through deterministic task scenarios.
|
|
4
|
+
|
|
5
|
+
Use this path when the runner should remain authoritative for:
|
|
6
|
+
|
|
7
|
+
- file inspection tools
|
|
8
|
+
- patch application tools
|
|
9
|
+
- step limits
|
|
10
|
+
- regression scoring
|
|
11
|
+
|
|
12
|
+
## Start With The Built-In Coding Scenarios
|
|
13
|
+
|
|
14
|
+
ARL ships with two built-in coding scenarios:
|
|
15
|
+
|
|
16
|
+
- `coding.fix-add-function`
|
|
17
|
+
- `coding.update-greeting`
|
|
18
|
+
|
|
19
|
+
Run one directly:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
agentlab run coding.fix-add-function --agent mock-default
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
These scenarios use fixture-backed repo tools, which makes them useful for:
|
|
26
|
+
|
|
27
|
+
- prompt changes
|
|
28
|
+
- model comparisons
|
|
29
|
+
- patch-discipline checks
|
|
30
|
+
- pre-merge behavioral regression checks
|
|
31
|
+
|
|
32
|
+
## Why This Matters
|
|
33
|
+
|
|
34
|
+
Coding agents often regress in subtle ways:
|
|
35
|
+
|
|
36
|
+
- they inspect too much of the repo
|
|
37
|
+
- they patch the wrong file
|
|
38
|
+
- they over-edit instead of making a narrow change
|
|
39
|
+
- they stop naming the changed file clearly
|
|
40
|
+
|
|
41
|
+
ARL helps by making those expectations explicit in scenario evaluators.
|
|
42
|
+
|
|
43
|
+
## Minimal Workflow
|
|
44
|
+
|
|
45
|
+
1. run one coding scenario locally
|
|
46
|
+
2. inspect the run output and trace
|
|
47
|
+
3. run it again against a changed prompt/model/agent variant
|
|
48
|
+
4. compare the two runs
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
agentlab run coding.fix-add-function --agent mock-default   # baseline run
|
|
54
|
+
agentlab run coding.fix-add-function --agent mock-default   # candidate run — point this at your changed prompt/model/agent variant
|
|
55
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## When To Use Task Scenarios Versus HTTP
|
|
59
|
+
|
|
60
|
+
Use task scenarios for coding agents when:
|
|
61
|
+
|
|
62
|
+
- you want deterministic fixture-backed tools
|
|
63
|
+
- you want ARL to own the tool loop
|
|
64
|
+
- you want reproducible patch-evaluator behavior
|
|
65
|
+
|
|
66
|
+
Use HTTP/conversation scenarios only when the coding agent already exists as a running service and owns its own orchestration internally.
|
|
67
|
+
|
|
68
|
+
## Next Step
|
|
69
|
+
|
|
70
|
+
If you want coding-agent checks in team workflows, pair these scenarios with suite definitions and CI:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
agentlab run --suite-def pre_merge --agent mock-default
|
|
74
|
+
```
|