agent-regression-lab 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -11
- package/bin/agentlab.js +2 -0
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +199 -12
- package/dist/evaluators.js +56 -1
- package/dist/index.js +157 -11
- package/dist/init.js +88 -0
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +56 -2
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +75 -7
- package/dist/ui-assets/client.css +92 -0
- package/dist/ui-assets/client.js +183 -19
- package/docs/agents.md +143 -8
- package/docs/coding-agents.md +74 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +193 -0
- package/docs/variant-sets.md +63 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +7 -5
package/dist/ui-assets/client.js
CHANGED
|
@@ -21748,11 +21748,20 @@ function RunListPage() {
|
|
|
21748
21748
|
if (provider) url.searchParams.set("provider", provider);
|
|
21749
21749
|
void fetch(url).then((response) => response.json()).then((data) => setRuns(Array.isArray(data.runs) ? data.runs : []));
|
|
21750
21750
|
}, [suite, status, provider]);
|
|
21751
|
+
const stats = summarizeRuns(runs);
|
|
21751
21752
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
|
|
21752
21753
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
|
|
21753
21754
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Runs" }),
|
|
21754
21755
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: "Inspect local alpha runs, filter failures, and compare behavior changes." })
|
|
21755
21756
|
] }),
|
|
21757
|
+
runs.length > 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats dashboard-stats", children: [
|
|
21758
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runs shown", value: stats.total }),
|
|
21759
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Passing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "pass-text", children: stats.pass }) }),
|
|
21760
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Failing", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "fail-text", children: stats.fail }) }),
|
|
21761
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Errors", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "error-text", children: stats.error }) }),
|
|
21762
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest suite", value: stats.latestSuite }),
|
|
21763
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Latest provider", value: stats.latestProvider })
|
|
21764
|
+
] }) : null,
|
|
21756
21765
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "filters", children: [
|
|
21757
21766
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("input", { value: suite, onChange: (event) => setSuite(event.target.value), placeholder: "Suite" }),
|
|
21758
21767
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", { value: status, onChange: (event) => setStatus(event.target.value), children: [
|
|
@@ -21816,6 +21825,7 @@ function RunDetailPage(props) {
|
|
|
21816
21825
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
|
|
21817
21826
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
|
|
21818
21827
|
] }),
|
|
21828
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
|
|
21819
21829
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21820
21830
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
|
|
21821
21831
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
|
|
@@ -21835,6 +21845,7 @@ function RunDetailPage(props) {
|
|
|
21835
21845
|
" ",
|
|
21836
21846
|
detail.agentVersion?.modelId ?? "-"
|
|
21837
21847
|
] }),
|
|
21848
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
|
|
21838
21849
|
detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21839
21850
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
|
|
21840
21851
|
" ",
|
|
@@ -21877,14 +21888,13 @@ function RunDetailPage(props) {
|
|
|
21877
21888
|
] }),
|
|
21878
21889
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21879
21890
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Trace" }),
|
|
21880
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21881
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { children: [
|
|
21882
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("
|
|
21883
|
-
|
|
21884
|
-
|
|
21885
|
-
event.type
|
|
21891
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline timeline-detailed", children: detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "timeline-item", children: [
|
|
21892
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "timeline-head", children: [
|
|
21893
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("span", { className: "timeline-step", children: [
|
|
21894
|
+
"Step ",
|
|
21895
|
+
event.stepIndex
|
|
21886
21896
|
] }),
|
|
21887
|
-
" ",
|
|
21897
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: formatEventLabel(event.type) }),
|
|
21888
21898
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: event.source })
|
|
21889
21899
|
] }),
|
|
21890
21900
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: JSON.stringify(event.payload, null, 2) })
|
|
@@ -21892,6 +21902,71 @@ function RunDetailPage(props) {
|
|
|
21892
21902
|
] })
|
|
21893
21903
|
] });
|
|
21894
21904
|
}
|
|
21905
|
+
function FailureSummaryPanel(props) {
|
|
21906
|
+
const failureItems = getFailureSummaryItems(props.detail);
|
|
21907
|
+
if (failureItems.length === 0) {
|
|
21908
|
+
return null;
|
|
21909
|
+
}
|
|
21910
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel failure-panel", children: [
|
|
21911
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
|
|
21912
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21913
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
|
|
21914
|
+
" ",
|
|
21915
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
|
|
21916
|
+
] }),
|
|
21917
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21918
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
|
|
21919
|
+
" ",
|
|
21920
|
+
props.detail.run.terminationReason
|
|
21921
|
+
] }),
|
|
21922
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
|
|
21923
|
+
] });
|
|
21924
|
+
}
|
|
21925
|
+
function RunIdentitySummary(props) {
|
|
21926
|
+
const run = props.detail.run;
|
|
21927
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
|
|
21928
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21929
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant set:" }),
|
|
21930
|
+
" ",
|
|
21931
|
+
run.variantSetName ?? "-"
|
|
21932
|
+
] }),
|
|
21933
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21934
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant:" }),
|
|
21935
|
+
" ",
|
|
21936
|
+
run.variantLabel ?? "-"
|
|
21937
|
+
] }),
|
|
21938
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21939
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Prompt version:" }),
|
|
21940
|
+
" ",
|
|
21941
|
+
run.promptVersion ?? "-"
|
|
21942
|
+
] }),
|
|
21943
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21944
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Model version:" }),
|
|
21945
|
+
" ",
|
|
21946
|
+
run.modelVersion ?? "-"
|
|
21947
|
+
] }),
|
|
21948
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21949
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Tool schema version:" }),
|
|
21950
|
+
" ",
|
|
21951
|
+
run.toolSchemaVersion ?? "-"
|
|
21952
|
+
] }),
|
|
21953
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21954
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Config label:" }),
|
|
21955
|
+
" ",
|
|
21956
|
+
run.configLabel ?? "-"
|
|
21957
|
+
] }),
|
|
21958
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21959
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Runtime profile:" }),
|
|
21960
|
+
" ",
|
|
21961
|
+
run.runtimeProfileName ?? "-"
|
|
21962
|
+
] }),
|
|
21963
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21964
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Suite definition:" }),
|
|
21965
|
+
" ",
|
|
21966
|
+
run.suiteDefinitionName ?? "-"
|
|
21967
|
+
] })
|
|
21968
|
+
] });
|
|
21969
|
+
}
|
|
21895
21970
|
function ComparePage(props) {
|
|
21896
21971
|
const [data, setData] = (0, import_react.useState)(null);
|
|
21897
21972
|
(0, import_react.useEffect)(() => {
|
|
@@ -21915,13 +21990,14 @@ function ComparePage(props) {
|
|
|
21915
21990
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Compare" }),
|
|
21916
21991
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
|
|
21917
21992
|
] }),
|
|
21993
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparisonHero, { comparison: data }),
|
|
21918
21994
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21919
21995
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
21920
21996
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
|
|
21921
21997
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
|
|
21922
21998
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
|
|
21923
21999
|
] }),
|
|
21924
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22000
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel emphasis-panel", children: [
|
|
21925
22001
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
|
|
21926
22002
|
data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No material differences recorded." }) : null,
|
|
21927
22003
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
|
|
@@ -21930,15 +22006,24 @@ function ComparePage(props) {
|
|
|
21930
22006
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21931
22007
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
|
|
21932
22008
|
data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
|
|
21933
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21934
|
-
|
|
21935
|
-
|
|
22009
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22010
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22011
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.evaluatorId }),
|
|
22012
|
+
diff.hardGate ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: "hard gate" }) : null
|
|
22013
|
+
] }),
|
|
22014
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
21936
22015
|
] }, diff.evaluatorId)) })
|
|
21937
22016
|
] }),
|
|
21938
22017
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21939
22018
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
|
|
21940
22019
|
data.toolDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No tool usage changes." }) : null,
|
|
21941
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.
|
|
22020
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: data.toolDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22021
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22022
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: diff.toolName }),
|
|
22023
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${mapRiskToPill(diff.risk)}`, children: diff.risk })
|
|
22024
|
+
] }),
|
|
22025
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: diff.note })
|
|
22026
|
+
] }, diff.toolName)) })
|
|
21942
22027
|
] })
|
|
21943
22028
|
] }),
|
|
21944
22029
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-grid", children: [
|
|
@@ -21948,7 +22033,7 @@ function ComparePage(props) {
|
|
|
21948
22033
|
] });
|
|
21949
22034
|
}
|
|
21950
22035
|
function RunSide(props) {
|
|
21951
|
-
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className:
|
|
22036
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-side ${props.title === "Candidate" ? "candidate-side" : "baseline-side"}`, children: [
|
|
21952
22037
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
21953
22038
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21954
22039
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Run:" }),
|
|
@@ -22006,10 +22091,10 @@ function RunSide(props) {
|
|
|
22006
22091
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Final output:" }) }),
|
|
22007
22092
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("pre", { children: props.detail.run.finalOutput || "(none)" }),
|
|
22008
22093
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h3", { children: "Trace" }),
|
|
22009
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22094
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ol", { className: "timeline compact", children: props.detail.traceEvents.map((event) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { className: "timeline-item compact-item", children: /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("strong", { children: [
|
|
22010
22095
|
event.stepIndex,
|
|
22011
22096
|
". ",
|
|
22012
|
-
event.type
|
|
22097
|
+
formatEventLabel(event.type)
|
|
22013
22098
|
] }) }, event.eventId)) })
|
|
22014
22099
|
] });
|
|
22015
22100
|
}
|
|
@@ -22036,6 +22121,7 @@ function SuiteComparePage(props) {
|
|
|
22036
22121
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
|
|
22037
22122
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
|
|
22038
22123
|
] }),
|
|
22124
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparisonHero, { data }),
|
|
22039
22125
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
22040
22126
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
22041
22127
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
|
|
@@ -22072,10 +22158,12 @@ function ScenarioList(props) {
|
|
|
22072
22158
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22073
22159
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
22074
22160
|
props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
|
|
22075
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22076
|
-
/* @__PURE__ */ (0, import_jsx_runtime.
|
|
22077
|
-
|
|
22078
|
-
|
|
22161
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack diff-list", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { className: "diff-card", children: [
|
|
22162
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "diff-card-head", children: [
|
|
22163
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
|
|
22164
|
+
" ",
|
|
22165
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification })
|
|
22166
|
+
] }),
|
|
22079
22167
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
|
|
22080
22168
|
] }, item.scenarioId)) })
|
|
22081
22169
|
] });
|
|
@@ -22092,6 +22180,82 @@ function EmptyState(props) {
|
|
|
22092
22180
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
|
|
22093
22181
|
] });
|
|
22094
22182
|
}
|
|
22183
|
+
function ComparisonHero(props) {
|
|
22184
|
+
const tone = mapClassificationToTone(props.comparison.classification);
|
|
22185
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: `panel compare-hero ${tone}`, children: [
|
|
22186
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
|
|
22187
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.comparison.classification }),
|
|
22188
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${tone}`, children: props.comparison.verdictDelta })
|
|
22189
|
+
] }),
|
|
22190
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { className: "muted", children: [
|
|
22191
|
+
"Output changed: ",
|
|
22192
|
+
props.comparison.outputChanged ? "yes" : "no",
|
|
22193
|
+
props.comparison.terminationDelta ? ` \u2022 termination: ${props.comparison.terminationDelta}` : ""
|
|
22194
|
+
] })
|
|
22195
|
+
] });
|
|
22196
|
+
}
|
|
22197
|
+
function SuiteComparisonHero(props) {
|
|
22198
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel compare-hero neutral", children: [
|
|
22199
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "compare-hero-head", children: [
|
|
22200
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Suite movement" }),
|
|
22201
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "event-chip", children: props.data.classification })
|
|
22202
|
+
] }),
|
|
22203
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats compact-stats", children: [
|
|
22204
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Regressions", value: props.data.regressions.length }),
|
|
22205
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Improvements", value: props.data.improvements.length }),
|
|
22206
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Unchanged", value: props.data.unchanged.length })
|
|
22207
|
+
] })
|
|
22208
|
+
] });
|
|
22209
|
+
}
|
|
22210
|
+
function getFailureSummaryItems(detail) {
|
|
22211
|
+
const items = [];
|
|
22212
|
+
if (detail.errorDetail) {
|
|
22213
|
+
items.push(`Error: ${detail.errorDetail}`);
|
|
22214
|
+
}
|
|
22215
|
+
for (const result of detail.evaluatorResults) {
|
|
22216
|
+
if (result.status === "fail") {
|
|
22217
|
+
items.push(`Evaluator ${result.evaluatorId}: ${result.message}`);
|
|
22218
|
+
}
|
|
22219
|
+
}
|
|
22220
|
+
if (detail.run.status !== "pass" && items.length === 0) {
|
|
22221
|
+
items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
|
|
22222
|
+
}
|
|
22223
|
+
return items;
|
|
22224
|
+
}
|
|
22225
|
+
function summarizeRuns(runs) {
|
|
22226
|
+
return {
|
|
22227
|
+
total: runs.length,
|
|
22228
|
+
pass: runs.filter((run) => run.status === "pass").length,
|
|
22229
|
+
fail: runs.filter((run) => run.status === "fail").length,
|
|
22230
|
+
error: runs.filter((run) => run.status === "error").length,
|
|
22231
|
+
latestSuite: runs[0]?.suite ?? "-",
|
|
22232
|
+
latestProvider: runs[0]?.provider ?? "-"
|
|
22233
|
+
};
|
|
22234
|
+
}
|
|
22235
|
+
function formatEventLabel(type) {
|
|
22236
|
+
return type.replaceAll("_", " ");
|
|
22237
|
+
}
|
|
22238
|
+
function mapRiskToPill(risk) {
|
|
22239
|
+
if (risk === "high") {
|
|
22240
|
+
return "fail";
|
|
22241
|
+
}
|
|
22242
|
+
if (risk === "medium") {
|
|
22243
|
+
return "error";
|
|
22244
|
+
}
|
|
22245
|
+
return "pass";
|
|
22246
|
+
}
|
|
22247
|
+
function mapClassificationToTone(classification) {
|
|
22248
|
+
if (classification.includes("regress")) {
|
|
22249
|
+
return "fail";
|
|
22250
|
+
}
|
|
22251
|
+
if (classification.includes("improv")) {
|
|
22252
|
+
return "pass";
|
|
22253
|
+
}
|
|
22254
|
+
if (classification.includes("changed")) {
|
|
22255
|
+
return "error";
|
|
22256
|
+
}
|
|
22257
|
+
return "neutral";
|
|
22258
|
+
}
|
|
22095
22259
|
function signed(value) {
|
|
22096
22260
|
return value > 0 ? `+${value}` : `${value}`;
|
|
22097
22261
|
}
|
package/docs/agents.md
CHANGED
|
@@ -2,15 +2,25 @@
|
|
|
2
2
|
|
|
3
3
|
Named agents are configured in `agentlab.config.yaml`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
|
|
6
|
+
|
|
7
|
+
This repo supports four provider modes:
|
|
6
8
|
|
|
7
9
|
- `mock`
|
|
8
10
|
- `openai`
|
|
9
11
|
- `external_process`
|
|
12
|
+
- `http`
|
|
13
|
+
|
|
14
|
+
Choose the simplest provider that answers the engineering question you actually have:
|
|
15
|
+
|
|
16
|
+
- `mock` for deterministic harness verification
|
|
17
|
+
- `openai` for real model behavior on deterministic tools
|
|
18
|
+
- `external_process` for local agents where the runner should still own the tool loop
|
|
19
|
+
- `http` for real running services that own their own memory and internal orchestration
|
|
10
20
|
|
|
11
21
|
## Named Agent Config
|
|
12
22
|
|
|
13
|
-
Example:
|
|
23
|
+
Example covering all providers:
|
|
14
24
|
|
|
15
25
|
```yaml
|
|
16
26
|
agents:
|
|
@@ -29,14 +39,31 @@ agents:
|
|
|
29
39
|
args:
|
|
30
40
|
- custom_agents/node_agent.mjs
|
|
31
41
|
label: custom-node-agent
|
|
42
|
+
|
|
43
|
+
- name: my-production-agent
|
|
44
|
+
provider: http
|
|
45
|
+
url: http://localhost:3000/api/chat
|
|
46
|
+
label: my-production-agent
|
|
32
47
|
```
|
|
33
48
|
|
|
34
49
|
Run a named agent with:
|
|
35
50
|
|
|
36
51
|
```bash
|
|
37
52
|
agentlab run support.refund-correct-order --agent mock-default
|
|
53
|
+
agentlab run internal-teams.memory-followup-recall --agent my-production-agent
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
|
|
60
|
+
agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
|
|
38
61
|
```
|
|
39
62
|
|
|
63
|
+
Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
40
67
|
## Mock
|
|
41
68
|
|
|
42
69
|
The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
|
|
@@ -47,6 +74,8 @@ Use it when you want:
|
|
|
47
74
|
- stable docs examples
|
|
48
75
|
- predictable benchmark behavior
|
|
49
76
|
|
|
77
|
+
---
|
|
78
|
+
|
|
50
79
|
## OpenAI
|
|
51
80
|
|
|
52
81
|
The OpenAI path uses your API key and a configured model.
|
|
@@ -65,6 +94,8 @@ agentlab run support.refund-correct-order --agent openai-cheap
|
|
|
65
94
|
|
|
66
95
|
The OpenAI path is useful, but less deterministic than the mock path.
|
|
67
96
|
|
|
97
|
+
---
|
|
98
|
+
|
|
68
99
|
## External Process
|
|
69
100
|
|
|
70
101
|
External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
|
|
@@ -110,14 +141,12 @@ Run one of them with:
|
|
|
110
141
|
agentlab run support.refund-via-config-tool --agent custom-node-agent
|
|
111
142
|
```
|
|
112
143
|
|
|
113
|
-
|
|
144
|
+
### Environment Allowlist
|
|
114
145
|
|
|
115
146
|
External-process agents can optionally define `envAllowlist`.
|
|
116
147
|
|
|
117
148
|
Use it when a child process needs specific environment variables passed through.
|
|
118
149
|
|
|
119
|
-
Example shape:
|
|
120
|
-
|
|
121
150
|
```yaml
|
|
122
151
|
agents:
|
|
123
152
|
- name: custom-agent
|
|
@@ -131,13 +160,117 @@ agents:
|
|
|
131
160
|
|
|
132
161
|
Only allow through what the child actually needs.
|
|
133
162
|
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## HTTP
|
|
166
|
+
|
|
167
|
+
The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
|
|
168
|
+
|
|
169
|
+
Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
|
|
170
|
+
|
|
171
|
+
Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
|
|
172
|
+
|
|
173
|
+
This is the default choice when validating memoryful or stateful agents that already run as a service.
|
|
174
|
+
|
|
175
|
+
HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept internal HTTP-agent tools.
|
|
176
|
+
|
|
177
|
+
### Minimal Config
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
agents:
|
|
181
|
+
- name: my-agent
|
|
182
|
+
provider: http
|
|
183
|
+
url: http://localhost:3000/api/chat
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
|
|
187
|
+
|
|
188
|
+
### Custom Field Names
|
|
189
|
+
|
|
190
|
+
If your agent uses different field names:
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
agents:
|
|
194
|
+
- name: my-agent-custom
|
|
195
|
+
provider: http
|
|
196
|
+
url: http://localhost:3000/api/chat
|
|
197
|
+
request_template:
|
|
198
|
+
query: "{{message}}"
|
|
199
|
+
session_id: "{{conversation_id}}"
|
|
200
|
+
response_field: reply
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`request_template` values support three placeholders:
|
|
204
|
+
|
|
205
|
+
- `{{message}}` — the current step message
|
|
206
|
+
- `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
|
|
207
|
+
- `{{env.VAR_NAME}}` — reads from the environment at runtime
|
|
208
|
+
|
|
209
|
+
Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
|
|
210
|
+
|
|
211
|
+
### Auth and Timeout
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
agents:
|
|
215
|
+
- name: my-agent-auth
|
|
216
|
+
provider: http
|
|
217
|
+
url: http://localhost:3000/api/chat
|
|
218
|
+
headers:
|
|
219
|
+
Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
|
|
220
|
+
timeout_ms: 10000
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
`timeout_ms` defaults to 30000 (30 seconds) if not set.
|
|
224
|
+
|
|
225
|
+
Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
|
|
226
|
+
|
|
227
|
+
### Full Config Reference
|
|
228
|
+
|
|
229
|
+
| Field | Required | Default | Description |
|
|
230
|
+
|-------|----------|---------|-------------|
|
|
231
|
+
| `url` | yes | — | HTTP endpoint to POST to |
|
|
232
|
+
| `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
|
|
233
|
+
| `response_field` | no | `message` | Field to read the reply from |
|
|
234
|
+
| `headers` | no | `{}` | Additional HTTP headers |
|
|
235
|
+
| `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
|
|
236
|
+
| `label` | no | agent name | Display label in CLI output and run history |
|
|
237
|
+
|
|
238
|
+
### How It Works
|
|
239
|
+
|
|
240
|
+
For each step in a conversation scenario:
|
|
241
|
+
|
|
242
|
+
1. agentlab generates a UUID `conversation_id` once at the start of the run
|
|
243
|
+
2. for every step, it POSTs the current message and `conversation_id` to your agent
|
|
244
|
+
3. your agent is responsible for maintaining conversation history using that id
|
|
245
|
+
4. agentlab reads the reply, measures latency, and runs per-step evaluators
|
|
246
|
+
5. if a hard-gate evaluator fails, the run stops immediately
|
|
247
|
+
|
|
248
|
+
### Error Handling
|
|
249
|
+
|
|
250
|
+
HTTP provider runs can end with these termination reasons:
|
|
251
|
+
|
|
252
|
+
| Reason | Cause |
|
|
253
|
+
|--------|-------|
|
|
254
|
+
| `http_connection_failed` | Could not connect to the URL |
|
|
255
|
+
| `http_error` | Agent returned HTTP 4xx or 5xx |
|
|
256
|
+
| `timeout_exceeded` | Request exceeded `timeout_ms` |
|
|
257
|
+
| `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
|
|
258
|
+
| `evaluator_failed` | A per-step hard-gate evaluator failed |
|
|
259
|
+
|
|
260
|
+
Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
134
264
|
## Best Practices
|
|
135
265
|
|
|
136
|
-
- use named agents instead of ad hoc
|
|
266
|
+
- use named agents instead of ad hoc provider flags
|
|
137
267
|
- keep labels stable so compare output stays readable
|
|
138
268
|
- prefer the mock path for smoke tests and docs
|
|
139
|
-
- use external-process agents when you want to wrap a local Node or Python agent
|
|
140
|
-
-
|
|
269
|
+
- use external-process agents when you want to wrap a local Node or Python agent
|
|
270
|
+
- use http agents when your agent is already running as a service
|
|
271
|
+
- keep the runner authoritative for tools and termination (external_process and mock)
|
|
272
|
+
- keep your agent authoritative for tools and history (http)
|
|
273
|
+
- choose the simplest provider that answers the engineering question you actually have
|
|
141
274
|
|
|
142
275
|
## Common Errors
|
|
143
276
|
|
|
@@ -148,5 +281,7 @@ Typical failures:
|
|
|
148
281
|
- missing external-process `command`
|
|
149
282
|
- invalid `args` or `envAllowlist`
|
|
150
283
|
- child process returning invalid JSON
|
|
284
|
+
- http agent URL unreachable because the service isn't running when the test starts
|
|
285
|
+
- http agent returning a field name that doesn't match `response_field`
|
|
151
286
|
|
|
152
287
|
See [troubleshooting.md](troubleshooting.md) for fixes.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Coding Agents
|
|
2
|
+
|
|
3
|
+
ARL supports coding-agent regression workflows through deterministic task scenarios.
|
|
4
|
+
|
|
5
|
+
Use this path when the runner should remain authoritative for:
|
|
6
|
+
|
|
7
|
+
- file inspection tools
|
|
8
|
+
- patch application tools
|
|
9
|
+
- step limits
|
|
10
|
+
- regression scoring
|
|
11
|
+
|
|
12
|
+
## Start With The Built-In Coding Scenarios
|
|
13
|
+
|
|
14
|
+
ARL ships with two built-in coding scenarios:
|
|
15
|
+
|
|
16
|
+
- `coding.fix-add-function`
|
|
17
|
+
- `coding.update-greeting`
|
|
18
|
+
|
|
19
|
+
Run one directly:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
agentlab run coding.fix-add-function --agent mock-default
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
These scenarios use fixture-backed repo tools, which makes them useful for:
|
|
26
|
+
|
|
27
|
+
- prompt changes
|
|
28
|
+
- model comparisons
|
|
29
|
+
- patch-discipline checks
|
|
30
|
+
- pre-merge behavioral regression checks
|
|
31
|
+
|
|
32
|
+
## Why This Matters
|
|
33
|
+
|
|
34
|
+
Coding agents often regress in subtle ways:
|
|
35
|
+
|
|
36
|
+
- they inspect too much of the repo
|
|
37
|
+
- they patch the wrong file
|
|
38
|
+
- they over-edit instead of making a narrow change
|
|
39
|
+
- they stop naming the changed file clearly
|
|
40
|
+
|
|
41
|
+
ARL helps by making those expectations explicit in scenario evaluators.
|
|
42
|
+
|
|
43
|
+
## Minimal Workflow
|
|
44
|
+
|
|
45
|
+
1. run one coding scenario locally
|
|
46
|
+
2. inspect the run output and trace
|
|
47
|
+
3. run it again against a changed prompt/model/agent variant
|
|
48
|
+
4. compare the two runs
|
|
49
|
+
|
|
50
|
+
Example:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
agentlab run coding.fix-add-function --agent mock-default   # baseline run
|
|
54
|
+
agentlab run coding.fix-add-function --agent mock-default   # candidate run — point this at your changed prompt/model/agent variant
|
|
55
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## When To Use Task Scenarios Versus HTTP
|
|
59
|
+
|
|
60
|
+
Use task scenarios for coding agents when:
|
|
61
|
+
|
|
62
|
+
- you want deterministic fixture-backed tools
|
|
63
|
+
- you want ARL to own the tool loop
|
|
64
|
+
- you want reproducible patch-evaluator behavior
|
|
65
|
+
|
|
66
|
+
Use HTTP/conversation scenarios only when the coding agent already exists as a running service and owns its own orchestration internally.
|
|
67
|
+
|
|
68
|
+
## Next Step
|
|
69
|
+
|
|
70
|
+
If you want coding-agent checks in team workflows, pair these scenarios with suite definitions and CI:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
agentlab run --suite-def pre_merge --agent mock-default
|
|
74
|
+
```
|