agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21731,7 +21731,8 @@ function App() {
21731
21731
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("main", { className: "page", children: [
21732
21732
  route.type === "list" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunListPage, {}) : null,
21733
21733
  route.type === "detail" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunDetailPage, { runId: route.runId }) : null,
21734
- route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null
21734
+ route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null,
21735
+ route.type === "compare-suite" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null
21735
21736
  ] })
21736
21737
  ] });
21737
21738
  }
@@ -21795,7 +21796,8 @@ function RunListPage() {
21795
21796
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("td", { children: run.totalSteps }),
21796
21797
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("td", { children: [
21797
21798
  new Date(run.startedAt).toLocaleString(),
21798
- index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null
21799
+ index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null,
21800
+ index > 0 && runs[index - 1].suite === run.suite && runs[index - 1].suiteBatchId && run.suiteBatchId && runs[index - 1].suiteBatchId !== run.suiteBatchId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) }) : null
21799
21801
  ] })
21800
21802
  ] }, run.id)) })
21801
21803
  ] }) : null
@@ -21814,6 +21816,7 @@ function RunDetailPage(props) {
21814
21816
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
21815
21817
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
21816
21818
  ] }),
21819
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
21817
21820
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21818
21821
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
21819
21822
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
@@ -21833,6 +21836,7 @@ function RunDetailPage(props) {
21833
21836
  " ",
21834
21837
  detail.agentVersion?.modelId ?? "-"
21835
21838
  ] }),
21839
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
21836
21840
  detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21837
21841
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
21838
21842
  " ",
@@ -21890,6 +21894,71 @@ function RunDetailPage(props) {
21890
21894
  ] })
21891
21895
  ] });
21892
21896
  }
21897
+ function FailureSummaryPanel(props) {
21898
+ const failureItems = getFailureSummaryItems(props.detail);
21899
+ if (failureItems.length === 0) {
21900
+ return null;
21901
+ }
21902
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21903
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
21904
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21905
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
21906
+ " ",
21907
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
21908
+ ] }),
21909
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21910
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
21911
+ " ",
21912
+ props.detail.run.terminationReason
21913
+ ] }),
21914
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
21915
+ ] });
21916
+ }
21917
+ function RunIdentitySummary(props) {
21918
+ const run = props.detail.run;
21919
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
21920
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21921
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant set:" }),
21922
+ " ",
21923
+ run.variantSetName ?? "-"
21924
+ ] }),
21925
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21926
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant:" }),
21927
+ " ",
21928
+ run.variantLabel ?? "-"
21929
+ ] }),
21930
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21931
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Prompt version:" }),
21932
+ " ",
21933
+ run.promptVersion ?? "-"
21934
+ ] }),
21935
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21936
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Model version:" }),
21937
+ " ",
21938
+ run.modelVersion ?? "-"
21939
+ ] }),
21940
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21941
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Tool schema version:" }),
21942
+ " ",
21943
+ run.toolSchemaVersion ?? "-"
21944
+ ] }),
21945
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21946
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Config label:" }),
21947
+ " ",
21948
+ run.configLabel ?? "-"
21949
+ ] }),
21950
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21951
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Runtime profile:" }),
21952
+ " ",
21953
+ run.runtimeProfileName ?? "-"
21954
+ ] }),
21955
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21956
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Suite definition:" }),
21957
+ " ",
21958
+ run.suiteDefinitionName ?? "-"
21959
+ ] })
21960
+ ] });
21961
+ }
21893
21962
  function ComparePage(props) {
21894
21963
  const [data, setData] = (0, import_react.useState)(null);
21895
21964
  (0, import_react.useEffect)(() => {
@@ -21914,6 +21983,7 @@ function ComparePage(props) {
21914
21983
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
21915
21984
  ] }),
21916
21985
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21986
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
21917
21987
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
21918
21988
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
21919
21989
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
@@ -21927,7 +21997,10 @@ function ComparePage(props) {
21927
21997
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21928
21998
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
21929
21999
  data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
21930
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: diff.note }, diff.evaluatorId)) })
22000
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22001
+ diff.note,
22002
+ diff.hardGate ? " (hard gate)" : ""
22003
+ ] }, diff.evaluatorId)) })
21931
22004
  ] }),
21932
22005
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21933
22006
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
@@ -22007,6 +22080,73 @@ function RunSide(props) {
22007
22080
  ] }) }, event.eventId)) })
22008
22081
  ] });
22009
22082
  }
22083
+ function SuiteComparePage(props) {
22084
+ const [data, setData] = (0, import_react.useState)(null);
22085
+ (0, import_react.useEffect)(() => {
22086
+ if (!props.baselineBatch || !props.candidateBatch) {
22087
+ setData(null);
22088
+ return;
22089
+ }
22090
+ const url = new URL("/api/compare-suite", window.location.origin);
22091
+ url.searchParams.set("baselineBatch", props.baselineBatch);
22092
+ url.searchParams.set("candidateBatch", props.candidateBatch);
22093
+ void fetch(url).then((response) => response.json()).then((payload) => setData(payload));
22094
+ }, [props.baselineBatch, props.candidateBatch]);
22095
+ if (!props.baselineBatch || !props.candidateBatch) {
22096
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
22097
+ }
22098
+ if (!data) {
22099
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
22100
+ }
22101
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
22102
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
22103
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
22104
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
22105
+ ] }),
22106
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
22107
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
22108
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
22109
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }),
22110
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }),
22111
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }),
22112
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })
22113
+ ] }),
22114
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22115
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
22116
+ data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
22117
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
22118
+ ] }),
22119
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "panel-grid", children: [
22120
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Regressions", items: data.regressions }),
22121
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Improvements", items: data.improvements })
22122
+ ] }),
22123
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22124
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Missing scenarios" }),
22125
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22126
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from candidate:" }),
22127
+ " ",
22128
+ data.missingFromCandidate.join(", ") || "None"
22129
+ ] }),
22130
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22131
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from baseline:" }),
22132
+ " ",
22133
+ data.missingFromBaseline.join(", ") || "None"
22134
+ ] })
22135
+ ] })
22136
+ ] });
22137
+ }
22138
+ function ScenarioList(props) {
22139
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22140
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22141
+ props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
22142
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22143
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22144
+ " ",
22145
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
22146
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
22147
+ ] }, item.scenarioId)) })
22148
+ ] });
22149
+ }
22010
22150
  function Stat(props) {
22011
22151
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stat", children: [
22012
22152
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: props.label }),
@@ -22019,6 +22159,21 @@ function EmptyState(props) {
22019
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22020
22160
  ] });
22021
22161
  }
22162
+ function getFailureSummaryItems(detail) {
22163
+ const items = [];
22164
+ if (detail.errorDetail) {
22165
+ items.push(`Error: ${detail.errorDetail}`);
22166
+ }
22167
+ for (const result of detail.evaluatorResults) {
22168
+ if (result.status === "fail") {
22169
+ items.push(`Evaluator ${result.evaluatorId}: ${result.message}`);
22170
+ }
22171
+ }
22172
+ if (detail.run.status !== "pass" && items.length === 0) {
22173
+ items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
22174
+ }
22175
+ return items;
22176
+ }
22022
22177
  function signed(value) {
22023
22178
  return value > 0 ? `+${value}` : `${value}`;
22024
22179
  }
@@ -22027,6 +22182,13 @@ function getRoute() {
22027
22182
  if (url.pathname.startsWith("/runs/")) {
22028
22183
  return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
22029
22184
  }
22185
+ if (url.pathname === "/compare-suite") {
22186
+ return {
22187
+ type: "compare-suite",
22188
+ baselineBatch: url.searchParams.get("baselineBatch") ?? void 0,
22189
+ candidateBatch: url.searchParams.get("candidateBatch") ?? void 0
22190
+ };
22191
+ }
22030
22192
  if (url.pathname === "/compare") {
22031
22193
  return {
22032
22194
  type: "compare",
package/docs/agents.md ADDED
@@ -0,0 +1,287 @@
1
+ # Agents
2
+
3
+ Named agents are configured in `agentlab.config.yaml`.
4
+
5
+ Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
6
+
7
+ This repo supports four provider modes:
8
+
9
+ - `mock`
10
+ - `openai`
11
+ - `external_process`
12
+ - `http`
13
+
14
+ Choose the simplest provider that answers the engineering question you actually have:
15
+
16
+ - `mock` for deterministic harness verification
17
+ - `openai` for real model behavior on deterministic tools
18
+ - `external_process` for local agents where the runner should still own the tool loop
19
+ - `http` for real running services that own their own memory and internal orchestration
20
+
21
+ ## Named Agent Config
22
+
23
+ Example covering all providers:
24
+
25
+ ```yaml
26
+ agents:
27
+ - name: mock-default
28
+ provider: mock
29
+ label: mock-default
30
+
31
+ - name: openai-cheap
32
+ provider: openai
33
+ model: gpt-4o-mini
34
+ label: openai-cheap
35
+
36
+ - name: custom-node-agent
37
+ provider: external_process
38
+ command: node
39
+ args:
40
+ - custom_agents/node_agent.mjs
41
+ label: custom-node-agent
42
+
43
+ - name: my-production-agent
44
+ provider: http
45
+ url: http://localhost:3000/api/chat
46
+ label: my-production-agent
47
+ ```
48
+
49
+ Run a named agent with:
50
+
51
+ ```bash
52
+ agentlab run support.refund-correct-order --agent mock-default
53
+ agentlab run internal-teams.memory-followup-recall --agent my-production-agent
54
+ ```
55
+
56
+ Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
57
+
58
+ ```bash
59
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
60
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
61
+ ```
62
+
63
+ Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, the `show` command, stored run history, and the UI.
64
+
65
+ ---
66
+
67
+ ## Mock
68
+
69
+ The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
70
+
71
+ Use it when you want:
72
+
73
+ - fast local verification
74
+ - stable docs examples
75
+ - predictable benchmark behavior
76
+
77
+ ---
78
+
79
+ ## OpenAI
80
+
81
+ The OpenAI path uses your API key and a configured model.
82
+
83
+ Requirements:
84
+
85
+ - `OPENAI_API_KEY` in the environment
86
+ - a named `openai` agent in `agentlab.config.yaml`, or equivalent CLI runtime settings
87
+
88
+ Example:
89
+
90
+ ```bash
91
+ export OPENAI_API_KEY=...
92
+ agentlab run support.refund-correct-order --agent openai-cheap
93
+ ```
94
+
95
+ The OpenAI path is useful, but less deterministic than the mock path.
96
+
97
+ ---
98
+
99
+ ## External Process
100
+
101
+ External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
102
+
103
+ The runner stays in control of:
104
+
105
+ - tool execution
106
+ - stopping conditions
107
+ - runtime limits
108
+ - persisted run state
109
+
110
+ The external agent decides what tool to call next or when to return a final answer.
111
+
112
+ ### Protocol
113
+
114
+ Runner events:
115
+
116
+ - `run_started`
117
+ - `tool_result`
118
+ - `runner_error`
119
+
120
+ Agent responses:
121
+
122
+ - `tool_call`
123
+ - `final`
124
+ - `error`
125
+
126
+ Minimal flow:
127
+
128
+ 1. the runner sends `run_started`
129
+ 2. the agent returns `tool_call` or `final`
130
+ 3. the runner executes the tool and sends `tool_result`
131
+ 4. the agent continues until it returns `final` or `error`
132
+
133
+ Working examples:
134
+
135
+ - `custom_agents/node_agent.mjs`
136
+ - `custom_agents/python_agent.py`
137
+
138
+ Run one of them with:
139
+
140
+ ```bash
141
+ agentlab run support.refund-via-config-tool --agent custom-node-agent
142
+ ```
143
+
144
+ ### Environment Allowlist
145
+
146
+ External-process agents can optionally define `envAllowlist`.
147
+
148
+ Use it when a child process needs specific environment variables passed through.
149
+
150
+ ```yaml
151
+ agents:
152
+ - name: custom-agent
153
+ provider: external_process
154
+ command: node
155
+ args:
156
+ - custom_agents/node_agent.mjs
157
+ envAllowlist:
158
+ - OPENAI_API_KEY
159
+ ```
160
+
161
+ Only pass through the environment variables the child process actually needs.
162
+
163
+ ---
164
+
165
+ ## HTTP
166
+
167
+ The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
168
+
169
+ Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
170
+
171
+ Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
172
+
173
+ This is the default choice when validating memoryful or stateful agents that already run as a service.
174
+
175
+ HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept internal HTTP-agent tools.
176
+
177
+ ### Minimal Config
178
+
179
+ ```yaml
180
+ agents:
181
+ - name: my-agent
182
+ provider: http
183
+ url: http://localhost:3000/api/chat
184
+ ```
185
+
186
+ Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
187
+
188
+ ### Custom Field Names
189
+
190
+ If your agent uses different field names:
191
+
192
+ ```yaml
193
+ agents:
194
+ - name: my-agent-custom
195
+ provider: http
196
+ url: http://localhost:3000/api/chat
197
+ request_template:
198
+ query: "{{message}}"
199
+ session_id: "{{conversation_id}}"
200
+ response_field: reply
201
+ ```
202
+
203
+ `request_template` values support three placeholders:
204
+
205
+ - `{{message}}` — the current step message
206
+ - `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
207
+ - `{{env.VAR_NAME}}` — reads from the environment at runtime
208
+
209
+ Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
210
+
211
+ ### Auth and Timeout
212
+
213
+ ```yaml
214
+ agents:
215
+ - name: my-agent-auth
216
+ provider: http
217
+ url: http://localhost:3000/api/chat
218
+ headers:
219
+ Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
220
+ timeout_ms: 10000
221
+ ```
222
+
223
+ `timeout_ms` defaults to 30000 (30 seconds) if not set.
224
+
225
+ Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
226
+
227
+ ### Full Config Reference
228
+
229
+ | Field | Required | Default | Description |
230
+ |-------|----------|---------|-------------|
231
+ | `url` | yes | — | HTTP endpoint to POST to |
232
+ | `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
233
+ | `response_field` | no | `message` | Field to read the reply from |
234
+ | `headers` | no | `{}` | Additional HTTP headers |
235
+ | `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
236
+ | `label` | no | agent name | Display label in CLI output and run history |
237
+
238
+ ### How It Works
239
+
240
+ For each step in a conversation scenario:
241
+
242
+ 1. agentlab generates a UUID `conversation_id` once at the start of the run
243
+ 2. for every step, it POSTs the current message and `conversation_id` to your agent
244
+ 3. your agent is responsible for maintaining conversation history using that id
245
+ 4. agentlab reads the reply, measures latency, and runs per-step evaluators
246
+ 5. if a hard-gate evaluator fails, the run stops immediately
247
+
248
+ ### Error Handling
249
+
250
+ HTTP provider runs can end with these termination reasons:
251
+
252
+ | Reason | Cause |
253
+ |--------|-------|
254
+ | `http_connection_failed` | Could not connect to the URL |
255
+ | `http_error` | Agent returned HTTP 4xx or 5xx |
256
+ | `timeout_exceeded` | Request exceeded `timeout_ms` |
257
+ | `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
258
+ | `evaluator_failed` | A per-step hard-gate evaluator failed |
259
+
260
+ Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
261
+
262
+ ---
263
+
264
+ ## Best Practices
265
+
266
+ - use named agents instead of ad hoc provider flags
267
+ - keep labels stable so compare output stays readable
268
+ - prefer the mock path for smoke tests and docs
269
+ - use external-process agents when you want to wrap a local Node or Python agent
270
+ - use http agents when your agent is already running as a service
271
+ - keep the runner authoritative for tools and termination (external_process and mock)
272
+ - keep your agent authoritative for tools and history (http)
273
+ - choose the simplest provider that answers the engineering question you actually have
274
+
275
+ ## Common Errors
276
+
277
+ Typical failures:
278
+
279
+ - missing `OPENAI_API_KEY`
280
+ - unsupported provider name
281
+ - missing external-process `command`
282
+ - invalid `args` or `envAllowlist`
283
+ - child process returning invalid JSON
284
+ - http agent url not running when the test starts
285
+ - http agent returning a field name that doesn't match `response_field`
286
+
287
+ See [troubleshooting.md](troubleshooting.md) for fixes.
@@ -0,0 +1,74 @@
1
+ # Golden Suites
2
+
3
+ Golden suites are the scenario portfolio internal engineering teams should keep as long-lived regression assets.
4
+
5
+ They are not just demos. They are engineering memory for the behaviors that matter before merge and before release.
6
+
7
+ ## Required Launch Categories
8
+
9
+ - coding agent regressions
10
+ - support and policy agents
11
+ - incident / ops agents
12
+ - memoryful multi-turn agents
13
+ - tool-failure recovery
14
+ - ambiguity and escalation
15
+ - adversarial or malformed tool output
16
+ - cost / latency / step-discipline checks
17
+
18
+ ## Recommended Portfolio Composition
19
+
20
+ - 5 golden workflows
21
+ - 5 historical regressions
22
+ - 5 ugly edge failures
23
+ - 3 degraded-tool scenarios
24
+ - 2 policy or escalation scenarios
25
+
26
+ ## How To Use Golden Suites
27
+
28
+ 1. Keep one or two scenarios for the happy path that must always work.
29
+ 2. Add scenarios from real incidents as soon as a failure is understood.
30
+ 3. Add edge-case scenarios for ambiguity, degraded tools, malformed outputs, and multi-turn drift.
31
+ 4. Group launch-critical workflows into config-level `suite_definitions`.
32
+ 5. Run one scenario while debugging locally.
33
+ 6. Run a `pre_merge` suite definition before merge.
34
+ 7. Run curated `release` and `incident_regressions` suite definitions before release.
35
+
36
+ ## Suggested Initial Internal-Team Scenarios
37
+
38
+ - coding destructive edit guardrails
39
+ - incident triage under noisy alerts
40
+ - escalation on ambiguity instead of guessing
41
+ - malformed tool output or partial tool output
42
+ - cross-session memory leakage
43
+ - follow-up recall across turns
44
+
45
+ ## Design Rule
46
+
47
+ Treat suite composition as a product artifact.
48
+
49
+ The suite is part of the system design, not a disposable test folder.
50
+
51
+ ## Recommended Suite Definitions
52
+
53
+ Use first-class `suite_definitions` instead of ad hoc tags alone:
54
+
55
+ ```yaml
56
+ suite_definitions:
57
+ - name: smoke
58
+ include:
59
+ tags: [smoke]
60
+
61
+ - name: pre_merge
62
+ include:
63
+ tags: [smoke, regression]
64
+
65
+ - name: release
66
+ include:
67
+ suites: [support, internal-teams]
68
+
69
+ - name: incident_regressions
70
+ include:
71
+ tags: [incident, regression]
72
+ ```
73
+
74
+ These become the operational units you wire into local verification, pre-merge checks, and release readiness.
@@ -0,0 +1,58 @@
1
+ # Integrations And Live Services
2
+
3
+ Use this guide to choose the right ARL provider path for the engineering question you are trying to answer.
4
+
5
+ ## Provider Matrix
6
+
7
+ ### `mock`
8
+
9
+ Use when you want:
10
+
11
+ - deterministic smoke tests
12
+ - stable docs examples
13
+ - baseline verification while changing the harness itself
14
+
15
+ ### `openai`
16
+
17
+ Use when you want:
18
+
19
+ - real model behavior against deterministic tool surfaces
20
+ - prompt and model validation before merge
21
+ - quick local comparisons where the model is the variable
22
+
23
+ ### `external_process`
24
+
25
+ Use when you want:
26
+
27
+ - a local Node or Python agent to participate in the runner-controlled tool loop
28
+ - the runner to remain authoritative for tools, step limits, and storage
29
+ - a thin adapter around an existing local agent implementation
30
+
31
+ ### `http`
32
+
33
+ Use when you want:
34
+
35
+ - production-like multi-turn validation against a running service
36
+ - the agent to own memory, conversation history, and internal tool execution
37
+ - live verification of a real app instead of a deterministic wrapper
38
+
39
+ `arl-test/` is the canonical example of this path in this repo.
40
+
41
+ ## Live-Service Verification
42
+
43
+ Default workflow:
44
+
45
+ 1. start the service
46
+ 2. run `agentlab` from the project containing the relevant scenarios and `agentlab.config.yaml`
47
+ 3. run one scenario while debugging
48
+ 4. run a suite before merge
49
+ 5. compare candidate runs or suite batches against a known baseline
50
+
51
+ ## Integration Design Rule
52
+
53
+ Choose the simplest provider that answers the engineering question you have.
54
+
55
+ - If you only need deterministic regression evidence, prefer `mock`.
56
+ - If you need real model behavior but deterministic tools, prefer `openai`.
57
+ - If you need a local agent implementation but still want runner-owned tools, prefer `external_process`.
58
+ - If you need the real running service with its own memory and orchestration, use `http`.