agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +78 -11
  2. package/bin/agentlab.js +2 -0
  3. package/dist/agent/factory.js +20 -6
  4. package/dist/agent/httpAdapter.js +5 -4
  5. package/dist/config.js +199 -12
  6. package/dist/evaluators.js +56 -1
  7. package/dist/index.js +157 -11
  8. package/dist/init.js +88 -0
  9. package/dist/lib/id.js +3 -0
  10. package/dist/runOutput.js +46 -0
  11. package/dist/runner.js +31 -9
  12. package/dist/scenarios.js +90 -2
  13. package/dist/scoring.js +2 -2
  14. package/dist/storage.js +117 -7
  15. package/dist/tools.js +56 -2
  16. package/dist/trace.js +4 -2
  17. package/dist/ui/App.js +75 -7
  18. package/dist/ui-assets/client.css +92 -0
  19. package/dist/ui-assets/client.js +183 -19
  20. package/docs/agents.md +143 -8
  21. package/docs/coding-agents.md +74 -0
  22. package/docs/golden-suites.md +74 -0
  23. package/docs/integrations-and-live-services.md +58 -0
  24. package/docs/memory-and-stateful-agents.md +51 -0
  25. package/docs/release-checklist.md +30 -0
  26. package/docs/runtime-profiles.md +67 -0
  27. package/docs/scenarios.md +303 -56
  28. package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
  29. package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
  30. package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
  31. package/docs/tools.md +34 -3
  32. package/docs/troubleshooting.md +193 -0
  33. package/docs/variant-sets.md +63 -0
  34. package/examples/coding-tools/README.md +21 -0
  35. package/examples/coding-tools/index.js +11 -0
  36. package/examples/coding-tools/package.json +8 -0
  37. package/examples/support-tools/README.md +21 -0
  38. package/examples/support-tools/index.js +8 -0
  39. package/examples/support-tools/package.json +8 -0
  40. package/package.json +7 -5
@@ -21748,11 +21748,20 @@ function RunListPage() {
21748
21748
  if (provider) url.searchParams.set("provider", provider);
21749
21749
  void fetch(url).then((response) => response.json()).then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
21750
21750
  }, [suite, status, provider]);
21751
+ const stats = summarizeRuns(runs);
21751
21752
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
21752
21753
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
21753
21754
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Runs" }),
21754
21755
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })
21755
21756
  ] }),
21757
+ runs.length > 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats dashboard-stats", children: [
21758
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runs shown", value: stats.total }),
21759
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Passing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "pass-text", children: stats.pass }) }),
21760
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Failing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "fail-text", children: stats.fail }) }),
21761
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Errors", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "error-text", children: stats.error }) }),
21762
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest suite", value: stats.latestSuite }),
21763
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest provider", value: stats.latestProvider })
21764
+ ] }) : null,
21756
21765
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "filters", children: [
21757
21766
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
21758
21767
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [
@@ -21816,6 +21825,7 @@ function RunDetailPage(props) {
21816
21825
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
21817
21826
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
21818
21827
  ] }),
21828
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
21819
21829
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21820
21830
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
21821
21831
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
@@ -21835,6 +21845,7 @@ function RunDetailPage(props) {
21835
21845
  " ",
21836
21846
  detail.agentVersion?.modelId ?? "-"
21837
21847
  ] }),
21848
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
21838
21849
  detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21839
21850
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
21840
21851
  " ",
@@ -21877,14 +21888,13 @@ function RunDetailPage(props) {
21877
21888
  ] }),
21878
21889
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21879
21890
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Trace" }),
21880
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
21881
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { children: [
21882
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
21883
- event.stepIndex,
21884
- ". ",
21885
- event.type
21891
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "timeline-item", children: [
21892
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "timeline-head", children: [
21893
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("span", { className: "timeline-step", children: [
21894
+ "Step ",
21895
+ event.stepIndex
21886
21896
  ] }),
21887
- " ",
21897
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: formatEventLabel(event.type) }),
21888
21898
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: event.source })
21889
21899
  ] }),
21890
21900
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: JSON.stringify(event.payload, null, 2) })
@@ -21892,6 +21902,71 @@ function RunDetailPage(props) {
21892
21902
  ] })
21893
21903
  ] });
21894
21904
  }
21905
+ function FailureSummaryPanel(props) {
21906
+ const failureItems = getFailureSummaryItems(props.detail);
21907
+ if (failureItems.length === 0) {
21908
+ return null;
21909
+ }
21910
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel failure-panel", children: [
21911
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
21912
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21913
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
21914
+ " ",
21915
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
21916
+ ] }),
21917
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21918
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
21919
+ " ",
21920
+ props.detail.run.terminationReason
21921
+ ] }),
21922
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
21923
+ ] });
21924
+ }
21925
+ function RunIdentitySummary(props) {
21926
+ const run = props.detail.run;
21927
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
21928
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21929
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant set:" }),
21930
+ " ",
21931
+ run.variantSetName ?? "-"
21932
+ ] }),
21933
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21934
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant:" }),
21935
+ " ",
21936
+ run.variantLabel ?? "-"
21937
+ ] }),
21938
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21939
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Prompt version:" }),
21940
+ " ",
21941
+ run.promptVersion ?? "-"
21942
+ ] }),
21943
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21944
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Model version:" }),
21945
+ " ",
21946
+ run.modelVersion ?? "-"
21947
+ ] }),
21948
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21949
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Tool schema version:" }),
21950
+ " ",
21951
+ run.toolSchemaVersion ?? "-"
21952
+ ] }),
21953
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21954
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Config label:" }),
21955
+ " ",
21956
+ run.configLabel ?? "-"
21957
+ ] }),
21958
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21959
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Runtime profile:" }),
21960
+ " ",
21961
+ run.runtimeProfileName ?? "-"
21962
+ ] }),
21963
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21964
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Suite definition:" }),
21965
+ " ",
21966
+ run.suiteDefinitionName ?? "-"
21967
+ ] })
21968
+ ] });
21969
+ }
21895
21970
  function ComparePage(props) {
21896
21971
  const [data, setData] = (0, import_react.useState)(null);
21897
21972
  (0, import_react.useEffect)(() => {
@@ -21915,13 +21990,14 @@ function ComparePage(props) {
21915
21990
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Compare" }),
21916
21991
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
21917
21992
  ] }),
21993
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparisonHero, { comparison: data }),
21918
21994
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21919
21995
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
21920
21996
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
21921
21997
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
21922
21998
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
21923
21999
  ] }),
21924
- /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22000
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel emphasis-panel", children: [
21925
22001
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
21926
22002
  data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No material differences recorded." }) : null,
21927
22003
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
@@ -21930,15 +22006,24 @@ function ComparePage(props) {
21930
22006
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21931
22007
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
21932
22008
  data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
21933
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
21934
- diff.note,
21935
- diff.hardGate ? " (hard gate)" : ""
22009
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22010
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22011
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.evaluatorId }),
22012
+ diff.hardGate ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: "hard gate" }) : null
22013
+ ] }),
22014
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
21936
22015
  ] }, diff.evaluatorId)) })
21937
22016
  ] }),
21938
22017
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21939
22018
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
21940
22019
  data.toolDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No tool usage changes." }) : null,
21941
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: diff.note }, diff.toolName)) })
22020
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22021
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22022
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.toolName }),
22023
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })
22024
+ ] }),
22025
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
22026
+ ] }, diff.toolName)) })
21942
22027
  ] })
21943
22028
  ] }),
21944
22029
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-grid", children: [
@@ -21948,7 +22033,7 @@ function ComparePage(props) {
21948
22033
  ] });
21949
22034
  }
21950
22035
  function RunSide(props) {
21951
- return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22036
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [
21952
22037
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
21953
22038
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21954
22039
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Run:" }),
@@ -22006,10 +22091,10 @@ function RunSide(props) {
22006
22091
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Final output:" }) }),
22007
22092
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: props.detail.run.finalOutput || "(none)" }),
22008
22093
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h3", { children: "Trace" }),
22009
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
22094
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { className: "timeline-item compact-item", children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
22010
22095
  event.stepIndex,
22011
22096
  ". ",
22012
- event.type
22097
+ formatEventLabel(event.type)
22013
22098
  ] }) }, event.eventId)) })
22014
22099
  ] });
22015
22100
  }
@@ -22036,6 +22121,7 @@ function SuiteComparePage(props) {
22036
22121
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
22037
22122
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
22038
22123
  ] }),
22124
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparisonHero, { data }),
22039
22125
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
22040
22126
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
22041
22127
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
@@ -22072,10 +22158,12 @@ function ScenarioList(props) {
22072
22158
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22073
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22074
22160
  props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
22075
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22076
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22077
- " ",
22078
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
22161
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
22162
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
22163
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22164
+ " ",
22165
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification })
22166
+ ] }),
22079
22167
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
22080
22168
  ] }, item.scenarioId)) })
22081
22169
  ] });
@@ -22092,6 +22180,82 @@ function EmptyState(props) {
22092
22180
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22093
22181
  ] });
22094
22182
  }
22183
+ function ComparisonHero(props) {
22184
+ const tone = mapClassificationToTone(props.comparison.classification);
22185
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-hero ${tone}`, children: [
22186
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
22187
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.comparison.classification }),
22188
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta })
22189
+ ] }),
22190
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { className: "muted", children: [
22191
+ "Output changed: ",
22192
+ props.comparison.outputChanged ? "yes" : "no",
22193
+ props.comparison.terminationDelta ? ` \u2022 termination: ${props.comparison.terminationDelta}` : ""
22194
+ ] })
22195
+ ] });
22196
+ }
22197
+ function SuiteComparisonHero(props) {
22198
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel compare-hero neutral", children: [
22199
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
22200
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Suite movement" }),
22201
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: props.data.classification })
22202
+ ] }),
22203
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats compact-stats", children: [
22204
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Regressions", value: props.data.regressions.length }),
22205
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Improvements", value: props.data.improvements.length }),
22206
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Unchanged", value: props.data.unchanged.length })
22207
+ ] })
22208
+ ] });
22209
+ }
22210
+ function getFailureSummaryItems(detail) {
22211
+ const items = [];
22212
+ if (detail.errorDetail) {
22213
+ items.push(`Error: ${detail.errorDetail}`);
22214
+ }
22215
+ for (const result of detail.evaluatorResults) {
22216
+ if (result.status === "fail") {
22217
+ items.push(`Evaluator ${result.evaluatorId}: ${result.message}`);
22218
+ }
22219
+ }
22220
+ if (detail.run.status !== "pass" && items.length === 0) {
22221
+ items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
22222
+ }
22223
+ return items;
22224
+ }
22225
+ function summarizeRuns(runs) {
22226
+ return {
22227
+ total: runs.length,
22228
+ pass: runs.filter((run) => run.status === "pass").length,
22229
+ fail: runs.filter((run) => run.status === "fail").length,
22230
+ error: runs.filter((run) => run.status === "error").length,
22231
+ latestSuite: runs[0]?.suite ?? "-",
22232
+ latestProvider: runs[0]?.provider ?? "-"
22233
+ };
22234
+ }
22235
+ function formatEventLabel(type) {
22236
+ return type.replaceAll("_", " ");
22237
+ }
22238
+ function mapRiskToPill(risk) {
22239
+ if (risk === "high") {
22240
+ return "fail";
22241
+ }
22242
+ if (risk === "medium") {
22243
+ return "error";
22244
+ }
22245
+ return "pass";
22246
+ }
22247
+ function mapClassificationToTone(classification) {
22248
+ if (classification.includes("regress")) {
22249
+ return "fail";
22250
+ }
22251
+ if (classification.includes("improv")) {
22252
+ return "pass";
22253
+ }
22254
+ if (classification.includes("changed")) {
22255
+ return "error";
22256
+ }
22257
+ return "neutral";
22258
+ }
22095
22259
  function signed(value) {
22096
22260
  return value > 0 ? `+${value}` : `${value}`;
22097
22261
  }
package/docs/agents.md CHANGED
@@ -2,15 +2,25 @@
2
2
 
3
3
  Named agents are configured in `agentlab.config.yaml`.
4
4
 
5
- This repo currently supports three provider modes:
5
+ Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
6
+
7
+ This repo supports four provider modes:
6
8
 
7
9
  - `mock`
8
10
  - `openai`
9
11
  - `external_process`
12
+ - `http`
13
+
14
+ Choose the simplest provider that answers the engineering question you actually have:
15
+
16
+ - `mock` for deterministic harness verification
17
+ - `openai` for real model behavior on deterministic tools
18
+ - `external_process` for local agents where the runner should still own the tool loop
19
+ - `http` for real running services that own their own memory and internal orchestration
10
20
 
11
21
  ## Named Agent Config
12
22
 
13
- Example:
23
+ Example covering all providers:
14
24
 
15
25
  ```yaml
16
26
  agents:
@@ -29,14 +39,31 @@ agents:
29
39
  args:
30
40
  - custom_agents/node_agent.mjs
31
41
  label: custom-node-agent
42
+
43
+ - name: my-production-agent
44
+ provider: http
45
+ url: http://localhost:3000/api/chat
46
+ label: my-production-agent
32
47
  ```
33
48
 
34
49
  Run a named agent with:
35
50
 
36
51
  ```bash
37
52
  agentlab run support.refund-correct-order --agent mock-default
53
+ agentlab run internal-teams.memory-followup-recall --agent my-production-agent
54
+ ```
55
+
56
+ Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
57
+
58
+ ```bash
59
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
60
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
38
61
  ```
39
62
 
63
+ Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
64
+
65
+ ---
66
+
40
67
  ## Mock
41
68
 
42
69
  The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
@@ -47,6 +74,8 @@ Use it when you want:
47
74
  - stable docs examples
48
75
  - predictable benchmark behavior
49
76
 
77
+ ---
78
+
50
79
  ## OpenAI
51
80
 
52
81
  The OpenAI path uses your API key and a configured model.
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
65
94
 
66
95
  The OpenAI path is useful, but less deterministic than the mock path.
67
96
 
97
+ ---
98
+
68
99
  ## External Process
69
100
 
70
101
  External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
@@ -110,14 +141,12 @@ Run one of them with:
110
141
  agentlab run support.refund-via-config-tool --agent custom-node-agent
111
142
  ```
112
143
 
113
- ## Environment Allowlist
144
+ ### Environment Allowlist
114
145
 
115
146
  External-process agents can optionally define `envAllowlist`.
116
147
 
117
148
  Use it when a child process needs specific environment variables passed through.
118
149
 
119
- Example shape:
120
-
121
150
  ```yaml
122
151
  agents:
123
152
  - name: custom-agent
@@ -131,13 +160,117 @@ agents:
131
160
 
132
161
  Only allow through what the child actually needs.
133
162
 
163
+ ---
164
+
165
+ ## HTTP
166
+
167
+ The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
168
+
169
+ Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
170
+
171
+ Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
172
+
173
+ This is the default choice when validating memoryful or stateful agents that already run as a service.
174
+
175
+ HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept internal HTTP-agent tools.
176
+
177
+ ### Minimal Config
178
+
179
+ ```yaml
180
+ agents:
181
+ - name: my-agent
182
+ provider: http
183
+ url: http://localhost:3000/api/chat
184
+ ```
185
+
186
+ Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
187
+
188
+ ### Custom Field Names
189
+
190
+ If your agent uses different field names:
191
+
192
+ ```yaml
193
+ agents:
194
+ - name: my-agent-custom
195
+ provider: http
196
+ url: http://localhost:3000/api/chat
197
+ request_template:
198
+ query: "{{message}}"
199
+ session_id: "{{conversation_id}}"
200
+ response_field: reply
201
+ ```
202
+
203
+ `request_template` values support three placeholders:
204
+
205
+ - `{{message}}` — the current step message
206
+ - `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
207
+ - `{{env.VAR_NAME}}` — reads from the environment at runtime
208
+
209
+ Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
210
+
211
+ ### Auth and Timeout
212
+
213
+ ```yaml
214
+ agents:
215
+ - name: my-agent-auth
216
+ provider: http
217
+ url: http://localhost:3000/api/chat
218
+ headers:
219
+ Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
220
+ timeout_ms: 10000
221
+ ```
222
+
223
+ `timeout_ms` defaults to 30000 (30 seconds) if not set.
224
+
225
+ Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
226
+
227
+ ### Full Config Reference
228
+
229
+ | Field | Required | Default | Description |
230
+ |-------|----------|---------|-------------|
231
+ | `url` | yes | — | HTTP endpoint to POST to |
232
+ | `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
233
+ | `response_field` | no | `message` | Field to read the reply from |
234
+ | `headers` | no | `{}` | Additional HTTP headers |
235
+ | `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
236
+ | `label` | no | agent name | Display label in CLI output and run history |
237
+
238
+ ### How It Works
239
+
240
+ For each step in a conversation scenario:
241
+
242
+ 1. agentlab generates a UUID `conversation_id` once at the start of the run
243
+ 2. for every step, it POSTs the current message and `conversation_id` to your agent
244
+ 3. your agent is responsible for maintaining conversation history using that id
245
+ 4. agentlab reads the reply, measures latency, and runs per-step evaluators
246
+ 5. if a hard-gate evaluator fails, the run stops immediately
247
+
248
+ ### Error Handling
249
+
250
+ HTTP provider runs can end with these termination reasons:
251
+
252
+ | Reason | Cause |
253
+ |--------|-------|
254
+ | `http_connection_failed` | Could not connect to the URL |
255
+ | `http_error` | Agent returned HTTP 4xx or 5xx |
256
+ | `timeout_exceeded` | Request exceeded `timeout_ms` |
257
+ | `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
258
+ | `evaluator_failed` | A per-step hard-gate evaluator failed |
259
+
260
+ Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
261
+
262
+ ---
263
+
134
264
  ## Best Practices
135
265
 
136
- - use named agents instead of ad hoc local command strings
266
+ - use named agents instead of ad hoc provider flags
137
267
  - keep labels stable so compare output stays readable
138
268
  - prefer the mock path for smoke tests and docs
139
- - use external-process agents when you want to wrap a local Node or Python agent implementation
140
- - keep the runner authoritative for tools and termination
269
+ - use external-process agents when you want to wrap a local Node or Python agent
270
+ - use http agents when your agent is already running as a service
271
+ - keep the runner authoritative for tools and termination (mock, openai, and external_process)
272
+ - keep your agent authoritative for tools and history (http)
273
+ - choose the simplest provider that answers the engineering question you actually have
141
274
 
142
275
  ## Common Errors
143
276
 
@@ -148,5 +281,7 @@ Typical failures:
148
281
  - missing external-process `command`
149
282
  - invalid `args` or `envAllowlist`
150
283
  - child process returning invalid JSON
284
+ - http agent service not listening at the configured url when the test starts
285
+ - http agent returning a response whose field name doesn't match `response_field`
151
286
 
152
287
  See [troubleshooting.md](troubleshooting.md) for fixes.
@@ -0,0 +1,74 @@
1
+ # Coding Agents
2
+
3
+ ARL supports coding-agent regression workflows through deterministic task scenarios.
4
+
5
+ Use this path when the runner should remain authoritative for:
6
+
7
+ - file inspection tools
8
+ - patch application tools
9
+ - step limits
10
+ - regression scoring
11
+
12
+ ## Start With The Built-In Coding Scenarios
13
+
14
+ This repo already includes two coding scenarios:
15
+
16
+ - `coding.fix-add-function`
17
+ - `coding.update-greeting`
18
+
19
+ Run one directly:
20
+
21
+ ```bash
22
+ agentlab run coding.fix-add-function --agent mock-default
23
+ ```
24
+
25
+ These scenarios use fixture-backed repo tools, which makes them useful for:
26
+
27
+ - prompt changes
28
+ - model comparisons
29
+ - patch-discipline checks
30
+ - pre-merge behavioral regression checks
31
+
32
+ ## Why This Matters
33
+
34
+ Coding agents often regress in subtle ways:
35
+
36
+ - they inspect too much of the repo
37
+ - they patch the wrong file
38
+ - they over-edit instead of making a narrow change
39
+ - they stop naming the changed file clearly
40
+
41
+ ARL helps by making those expectations explicit in scenario evaluators.
42
+
43
+ ## Minimal Workflow
44
+
45
+ 1. run one coding scenario locally
46
+ 2. inspect the run output and trace
47
+ 3. run it again against a changed prompt/model/agent variant
48
+ 4. compare the two runs
49
+
50
+ Example:
51
+
52
+ ```bash
53
+ agentlab run coding.fix-add-function --agent mock-default
54
+ agentlab run coding.fix-add-function --agent mock-default
55
+ agentlab compare <baseline-run-id> <candidate-run-id>
56
+ ```
57
+
58
+ ## When To Use Task Scenarios Versus HTTP
59
+
60
+ Use task scenarios for coding agents when:
61
+
62
+ - you want deterministic fixture-backed tools
63
+ - you want ARL to own the tool loop
64
+ - you want reproducible patch-evaluator behavior
65
+
66
+ Use HTTP/conversation scenarios only when the coding agent already exists as a running service and owns its own orchestration internally.
67
+
68
+ ## Next Step
69
+
70
+ If you want coding-agent checks in team workflows, pair these scenarios with suite definitions and CI:
71
+
72
+ ```bash
73
+ agentlab run --suite-def pre_merge --agent mock-default
74
+ ```