agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21731,7 +21731,8 @@ function App() {
21731
21731
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("main", { className: "page", children: [
21732
21732
  route.type === "list" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunListPage, {}) : null,
21733
21733
  route.type === "detail" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunDetailPage, { runId: route.runId }) : null,
21734
- route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null
21734
+ route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null,
21735
+ route.type === "compare-suite" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null
21735
21736
  ] })
21736
21737
  ] });
21737
21738
  }
@@ -21795,7 +21796,8 @@ function RunListPage() {
21795
21796
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("td", { children: run.totalSteps }),
21796
21797
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("td", { children: [
21797
21798
  new Date(run.startedAt).toLocaleString(),
21798
- index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null
21799
+ index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null,
21800
+ index > 0 && runs[index - 1].suite === run.suite && runs[index - 1].suiteBatchId && run.suiteBatchId && runs[index - 1].suiteBatchId !== run.suiteBatchId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) }) : null
21799
21801
  ] })
21800
21802
  ] }, run.id)) })
21801
21803
  ] }) : null
@@ -21814,6 +21816,7 @@ function RunDetailPage(props) {
21814
21816
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
21815
21817
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
21816
21818
  ] }),
21819
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
21817
21820
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21818
21821
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
21819
21822
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
@@ -21833,6 +21836,7 @@ function RunDetailPage(props) {
21833
21836
  " ",
21834
21837
  detail.agentVersion?.modelId ?? "-"
21835
21838
  ] }),
21839
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
21836
21840
  detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21837
21841
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
21838
21842
  " ",
@@ -21890,6 +21894,71 @@ function RunDetailPage(props) {
21890
21894
  ] })
21891
21895
  ] });
21892
21896
  }
21897
+ function FailureSummaryPanel(props) {
21898
+ const failureItems = getFailureSummaryItems(props.detail);
21899
+ if (failureItems.length === 0) {
21900
+ return null;
21901
+ }
21902
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21903
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
21904
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21905
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
21906
+ " ",
21907
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
21908
+ ] }),
21909
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21910
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
21911
+ " ",
21912
+ props.detail.run.terminationReason
21913
+ ] }),
21914
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, item)) })
21915
+ ] });
21916
+ }
21917
+ function RunIdentitySummary(props) {
21918
+ const run = props.detail.run;
21919
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [
21920
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21921
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant set:" }),
21922
+ " ",
21923
+ run.variantSetName ?? "-"
21924
+ ] }),
21925
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21926
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Variant:" }),
21927
+ " ",
21928
+ run.variantLabel ?? "-"
21929
+ ] }),
21930
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21931
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Prompt version:" }),
21932
+ " ",
21933
+ run.promptVersion ?? "-"
21934
+ ] }),
21935
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21936
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Model version:" }),
21937
+ " ",
21938
+ run.modelVersion ?? "-"
21939
+ ] }),
21940
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21941
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Tool schema version:" }),
21942
+ " ",
21943
+ run.toolSchemaVersion ?? "-"
21944
+ ] }),
21945
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21946
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Config label:" }),
21947
+ " ",
21948
+ run.configLabel ?? "-"
21949
+ ] }),
21950
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21951
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Runtime profile:" }),
21952
+ " ",
21953
+ run.runtimeProfileName ?? "-"
21954
+ ] }),
21955
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
21956
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Suite definition:" }),
21957
+ " ",
21958
+ run.suiteDefinitionName ?? "-"
21959
+ ] })
21960
+ ] });
21961
+ }
21893
21962
  function ComparePage(props) {
21894
21963
  const [data, setData] = (0, import_react.useState)(null);
21895
21964
  (0, import_react.useEffect)(() => {
@@ -21914,6 +21983,7 @@ function ComparePage(props) {
21914
21983
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
21915
21984
  ] }),
21916
21985
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21986
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
21917
21987
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
21918
21988
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
21919
21989
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
@@ -21927,7 +21997,10 @@ function ComparePage(props) {
21927
21997
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21928
21998
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
21929
21999
  data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
21930
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: diff.note }, diff.evaluatorId)) })
22000
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22001
+ diff.note,
22002
+ diff.hardGate ? " (hard gate)" : ""
22003
+ ] }, diff.evaluatorId)) })
21931
22004
  ] }),
21932
22005
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21933
22006
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
@@ -22007,6 +22080,73 @@ function RunSide(props) {
22007
22080
  ] }) }, event.eventId)) })
22008
22081
  ] });
22009
22082
  }
22083
+ function SuiteComparePage(props) {
22084
+ const [data, setData] = (0, import_react.useState)(null);
22085
+ (0, import_react.useEffect)(() => {
22086
+ if (!props.baselineBatch || !props.candidateBatch) {
22087
+ setData(null);
22088
+ return;
22089
+ }
22090
+ const url = new URL("/api/compare-suite", window.location.origin);
22091
+ url.searchParams.set("baselineBatch", props.baselineBatch);
22092
+ url.searchParams.set("candidateBatch", props.candidateBatch);
22093
+ void fetch(url).then((response) => response.json()).then((payload) => setData(payload));
22094
+ }, [props.baselineBatch, props.candidateBatch]);
22095
+ if (!props.baselineBatch || !props.candidateBatch) {
22096
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
22097
+ }
22098
+ if (!data) {
22099
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
22100
+ }
22101
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
22102
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
22103
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
22104
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
22105
+ ] }),
22106
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
22107
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
22108
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
22109
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }),
22110
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }),
22111
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }),
22112
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })
22113
+ ] }),
22114
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22115
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
22116
+ data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
22117
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
22118
+ ] }),
22119
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "panel-grid", children: [
22120
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Regressions", items: data.regressions }),
22121
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Improvements", items: data.improvements })
22122
+ ] }),
22123
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22124
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Missing scenarios" }),
22125
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22126
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from candidate:" }),
22127
+ " ",
22128
+ data.missingFromCandidate.join(", ") || "None"
22129
+ ] }),
22130
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22131
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from baseline:" }),
22132
+ " ",
22133
+ data.missingFromBaseline.join(", ") || "None"
22134
+ ] })
22135
+ ] })
22136
+ ] });
22137
+ }
22138
+ function ScenarioList(props) {
22139
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22140
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22141
+ props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
22142
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22143
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22144
+ " ",
22145
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
22146
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
22147
+ ] }, item.scenarioId)) })
22148
+ ] });
22149
+ }
22010
22150
  function Stat(props) {
22011
22151
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stat", children: [
22012
22152
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: props.label }),
@@ -22019,6 +22159,21 @@ function EmptyState(props) {
22019
22159
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
22020
22160
  ] });
22021
22161
  }
22162
+ function getFailureSummaryItems(detail) {
22163
+ const items = [];
22164
+ if (detail.errorDetail) {
22165
+ items.push(`Error: ${detail.errorDetail}`);
22166
+ }
22167
+ for (const result of detail.evaluatorResults) {
22168
+ if (result.status === "fail") {
22169
+ items.push(`Evaluator ${result.evaluatorId}: ${result.message}`);
22170
+ }
22171
+ }
22172
+ if (detail.run.status !== "pass" && items.length === 0) {
22173
+ items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
22174
+ }
22175
+ return items;
22176
+ }
22022
22177
  function signed(value) {
22023
22178
  return value > 0 ? `+${value}` : `${value}`;
22024
22179
  }
@@ -22027,6 +22182,13 @@ function getRoute() {
22027
22182
  if (url.pathname.startsWith("/runs/")) {
22028
22183
  return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
22029
22184
  }
22185
+ if (url.pathname === "/compare-suite") {
22186
+ return {
22187
+ type: "compare-suite",
22188
+ baselineBatch: url.searchParams.get("baselineBatch") ?? void 0,
22189
+ candidateBatch: url.searchParams.get("candidateBatch") ?? void 0
22190
+ };
22191
+ }
22030
22192
  if (url.pathname === "/compare") {
22031
22193
  return {
22032
22194
  type: "compare",
package/docs/agents.md ADDED
@@ -0,0 +1,287 @@
1
+ # Agents
2
+
3
+ Named agents are configured in `agentlab.config.yaml`.
4
+
5
+ Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
6
+
7
+ This repo supports four provider modes:
8
+
9
+ - `mock`
10
+ - `openai`
11
+ - `external_process`
12
+ - `http`
13
+
14
+ Choose the simplest provider that answers the engineering question you actually have:
15
+
16
+ - `mock` for deterministic harness verification
17
+ - `openai` for real model behavior on deterministic tools
18
+ - `external_process` for local agents where the runner should still own the tool loop
19
+ - `http` for real running services that own their own memory and internal orchestration
20
+
21
+ ## Named Agent Config
22
+
23
+ Example covering all providers:
24
+
25
+ ```yaml
26
+ agents:
27
+ - name: mock-default
28
+ provider: mock
29
+ label: mock-default
30
+
31
+ - name: openai-cheap
32
+ provider: openai
33
+ model: gpt-4o-mini
34
+ label: openai-cheap
35
+
36
+ - name: custom-node-agent
37
+ provider: external_process
38
+ command: node
39
+ args:
40
+ - custom_agents/node_agent.mjs
41
+ label: custom-node-agent
42
+
43
+ - name: my-production-agent
44
+ provider: http
45
+ url: http://localhost:3000/api/chat
46
+ label: my-production-agent
47
+ ```
48
+
49
+ Run a named agent with:
50
+
51
+ ```bash
52
+ agentlab run support.refund-correct-order --agent mock-default
53
+ agentlab run internal-teams.memory-followup-recall --agent my-production-agent
54
+ ```
55
+
56
+ Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
57
+
58
+ ```bash
59
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
60
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
61
+ ```
62
+
63
+ Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, the `show` command, stored run history, and the UI.
64
+
65
+ ---
66
+
67
+ ## Mock
68
+
69
+ The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
70
+
71
+ Use it when you want:
72
+
73
+ - fast local verification
74
+ - stable docs examples
75
+ - predictable benchmark behavior
76
+
77
+ ---
78
+
79
+ ## OpenAI
80
+
81
+ The OpenAI path uses your API key and a configured model.
82
+
83
+ Requirements:
84
+
85
+ - `OPENAI_API_KEY` in the environment
86
+ - a named `openai` agent in `agentlab.config.yaml`, or equivalent CLI runtime settings
87
+
88
+ Example:
89
+
90
+ ```bash
91
+ export OPENAI_API_KEY=...
92
+ agentlab run support.refund-correct-order --agent openai-cheap
93
+ ```
94
+
95
+ The OpenAI path is useful, but less deterministic than the mock path.
96
+
97
+ ---
98
+
99
+ ## External Process
100
+
101
+ External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
102
+
103
+ The runner stays in control of:
104
+
105
+ - tool execution
106
+ - stopping conditions
107
+ - runtime limits
108
+ - persisted run state
109
+
110
+ The external agent decides what tool to call next or when to return a final answer.
111
+
112
+ ### Protocol
113
+
114
+ Runner events:
115
+
116
+ - `run_started`
117
+ - `tool_result`
118
+ - `runner_error`
119
+
120
+ Agent responses:
121
+
122
+ - `tool_call`
123
+ - `final`
124
+ - `error`
125
+
126
+ Minimal flow:
127
+
128
+ 1. the runner sends `run_started`
129
+ 2. the agent returns `tool_call` or `final`
130
+ 3. the runner executes the tool and sends `tool_result`
131
+ 4. the agent continues until it returns `final` or `error`
132
+
133
+ Working examples:
134
+
135
+ - `custom_agents/node_agent.mjs`
136
+ - `custom_agents/python_agent.py`
137
+
138
+ Run one of them with:
139
+
140
+ ```bash
141
+ agentlab run support.refund-via-config-tool --agent custom-node-agent
142
+ ```
143
+
144
+ ### Environment Allowlist
145
+
146
+ External-process agents can optionally define `envAllowlist`.
147
+
148
+ Use it when a child process needs specific environment variables passed through.
149
+
150
+ ```yaml
151
+ agents:
152
+ - name: custom-agent
153
+ provider: external_process
154
+ command: node
155
+ args:
156
+ - custom_agents/node_agent.mjs
157
+ envAllowlist:
158
+ - OPENAI_API_KEY
159
+ ```
160
+
161
+ Only pass through the environment variables the child process actually needs.
162
+
163
+ ---
164
+
165
+ ## HTTP
166
+
167
+ The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
168
+
169
+ Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
170
+
171
+ Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
172
+
173
+ This is the default choice when validating memoryful or stateful agents that already run as a service.
174
+
175
+ HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but ARL does not currently intercept internal HTTP-agent tools.
176
+
177
+ ### Minimal Config
178
+
179
+ ```yaml
180
+ agents:
181
+ - name: my-agent
182
+ provider: http
183
+ url: http://localhost:3000/api/chat
184
+ ```
185
+
186
+ Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
187
+
188
+ ### Custom Field Names
189
+
190
+ If your agent uses different field names:
191
+
192
+ ```yaml
193
+ agents:
194
+ - name: my-agent-custom
195
+ provider: http
196
+ url: http://localhost:3000/api/chat
197
+ request_template:
198
+ query: "{{message}}"
199
+ session_id: "{{conversation_id}}"
200
+ response_field: reply
201
+ ```
202
+
203
+ `request_template` values support three placeholders:
204
+
205
+ - `{{message}}` — the current step message
206
+ - `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
207
+ - `{{env.VAR_NAME}}` — reads from the environment at runtime
208
+
209
+ Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
210
+
211
+ ### Auth and Timeout
212
+
213
+ ```yaml
214
+ agents:
215
+ - name: my-agent-auth
216
+ provider: http
217
+ url: http://localhost:3000/api/chat
218
+ headers:
219
+ Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
220
+ timeout_ms: 10000
221
+ ```
222
+
223
+ `timeout_ms` defaults to 30000 (30 seconds) if not set.
224
+
225
+ Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
226
+
227
+ ### Full Config Reference
228
+
229
+ | Field | Required | Default | Description |
230
+ |-------|----------|---------|-------------|
231
+ | `url` | yes | — | HTTP endpoint to POST to |
232
+ | `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
233
+ | `response_field` | no | `message` | Field to read the reply from |
234
+ | `headers` | no | `{}` | Additional HTTP headers |
235
+ | `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
236
+ | `label` | no | agent name | Display label in CLI output and run history |
237
+
238
+ ### How It Works
239
+
240
+ For each step in a conversation scenario:
241
+
242
+ 1. agentlab generates a UUID `conversation_id` once at the start of the run
243
+ 2. for every step, it POSTs the current message and `conversation_id` to your agent
244
+ 3. your agent is responsible for maintaining conversation history using that id
245
+ 4. agentlab reads the reply, measures latency, and runs per-step evaluators
246
+ 5. if a hard-gate evaluator fails, the run stops immediately
247
+
248
+ ### Error Handling
249
+
250
+ HTTP provider runs can end with these termination reasons:
251
+
252
+ | Reason | Cause |
253
+ |--------|-------|
254
+ | `http_connection_failed` | Could not connect to the URL |
255
+ | `http_error` | Agent returned HTTP 4xx or 5xx |
256
+ | `timeout_exceeded` | Request exceeded `timeout_ms` |
257
+ | `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
258
+ | `evaluator_failed` | A per-step hard-gate evaluator failed |
259
+
260
+ Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
261
+
262
+ ---
263
+
264
+ ## Best Practices
265
+
266
+ - use named agents instead of ad hoc provider flags
267
+ - keep labels stable so compare output stays readable
268
+ - prefer the mock path for smoke tests and docs
269
+ - use external-process agents when you want to wrap a local Node or Python agent
270
+ - use http agents when your agent is already running as a service
271
+ - keep the runner authoritative for tools and termination (external_process and mock)
272
+ - keep your agent authoritative for tools and history (http)
273
+ - choose the simplest provider that answers the engineering question you actually have
274
+
275
+ ## Common Errors
276
+
277
+ Typical failures:
278
+
279
+ - missing `OPENAI_API_KEY`
280
+ - unsupported provider name
281
+ - missing external-process `command`
282
+ - invalid `args` or `envAllowlist`
283
+ - child process returning invalid JSON
284
+ - http agent url not running when the test starts
285
+ - http agent returning a field name that doesn't match `response_field`
286
+
287
+ See [troubleshooting.md](troubleshooting.md) for fixes.
@@ -0,0 +1,74 @@
1
+ # Golden Suites
2
+
3
+ Golden suites are the scenario portfolio internal engineering teams should keep as long-lived regression assets.
4
+
5
+ They are not just demos. They are engineering memory for the behaviors that matter before merge and before release.
6
+
7
+ ## Required Launch Categories
8
+
9
+ - coding agent regressions
10
+ - support and policy agents
11
+ - incident / ops agents
12
+ - memoryful multi-turn agents
13
+ - tool-failure recovery
14
+ - ambiguity and escalation
15
+ - adversarial or malformed tool output
16
+ - cost / latency / step-discipline checks
17
+
18
+ ## Recommended Portfolio Composition
19
+
20
+ - 5 golden workflows
21
+ - 5 historical regressions
22
+ - 5 ugly edge failures
23
+ - 3 degraded-tool scenarios
24
+ - 2 policy or escalation scenarios
25
+
26
+ ## How To Use Golden Suites
27
+
28
+ 1. Keep one or two scenarios for the happy path that must always work.
29
+ 2. Add scenarios from real incidents as soon as a failure is understood.
30
+ 3. Add edge-case scenarios for ambiguity, degraded tools, malformed outputs, and multi-turn drift.
31
+ 4. Group launch-critical workflows into config-level `suite_definitions`.
32
+ 5. Run one scenario while debugging locally.
33
+ 6. Run a `pre_merge` suite definition before merge.
34
+ 7. Run curated `release` and `incident_regressions` suite definitions before release.
35
+
36
+ ## Suggested Initial Internal-Team Scenarios
37
+
38
+ - coding destructive edit guardrails
39
+ - incident triage under noisy alerts
40
+ - escalation on ambiguity instead of guessing
41
+ - malformed tool output or partial tool output
42
+ - cross-session memory leakage
43
+ - follow-up recall across turns
44
+
45
+ ## Design Rule
46
+
47
+ Treat suite composition as a product artifact.
48
+
49
+ The suite is part of the system design, not a disposable test folder.
50
+
51
+ ## Recommended Suite Definitions
52
+
53
+ Use first-class `suite_definitions` instead of ad hoc tags alone:
54
+
55
+ ```yaml
56
+ suite_definitions:
57
+ - name: smoke
58
+ include:
59
+ tags: [smoke]
60
+
61
+ - name: pre_merge
62
+ include:
63
+ tags: [smoke, regression]
64
+
65
+ - name: release
66
+ include:
67
+ suites: [support, internal-teams]
68
+
69
+ - name: incident_regressions
70
+ include:
71
+ tags: [incident, regression]
72
+ ```
73
+
74
+ These become the operational units you wire into local verification, pre-merge checks, and release readiness.
@@ -0,0 +1,58 @@
1
+ # Integrations And Live Services
2
+
3
+ Use this guide to choose the right ARL provider path for the engineering question you are trying to answer.
4
+
5
+ ## Provider Matrix
6
+
7
+ ### `mock`
8
+
9
+ Use when you want:
10
+
11
+ - deterministic smoke tests
12
+ - stable docs examples
13
+ - baseline verification while changing the harness itself
14
+
15
+ ### `openai`
16
+
17
+ Use when you want:
18
+
19
+ - real model behavior against deterministic tool surfaces
20
+ - prompt and model validation before merge
21
+ - quick local comparisons where the model is the variable
22
+
23
+ ### `external_process`
24
+
25
+ Use when you want:
26
+
27
+ - a local Node or Python agent to participate in the runner-controlled tool loop
28
+ - the runner to remain authoritative for tools, step limits, and storage
29
+ - a thin adapter around an existing local agent implementation
30
+
31
+ ### `http`
32
+
33
+ Use when you want:
34
+
35
+ - production-like multi-turn validation against a running service
36
+ - the agent to own memory, conversation history, and internal tool execution
37
+ - live verification of a real app instead of a deterministic wrapper
38
+
39
+ `arl-test/` is the canonical example of this path in this repo.
40
+
41
+ ## Live-Service Verification
42
+
43
+ Default workflow:
44
+
45
+ 1. start the service
46
+ 2. run `agentlab` from the project containing the relevant scenarios and `agentlab.config.yaml`
47
+ 3. run one scenario while debugging
48
+ 4. run a suite before merge
49
+ 5. compare candidate runs or suite batches against a known baseline
50
+
51
+ ## Integration Design Rule
52
+
53
+ Choose the simplest provider that answers the engineering question you have.
54
+
55
+ - If you only need deterministic regression evidence, prefer `mock`.
56
+ - If you need real model behavior but deterministic tools, prefer `openai`.
57
+ - If you need a local agent implementation but still want runner-owned tools, prefer `external_process`.
58
+ - If you need the real running service with its own memory and orchestration, use `http`.