agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/dist/ui-assets/client.js
CHANGED
|
@@ -21731,7 +21731,8 @@ function App() {
|
|
|
21731
21731
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("main", { className: "page", children: [
|
|
21732
21732
|
route.type === "list" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunListPage, {}) : null,
|
|
21733
21733
|
route.type === "detail" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunDetailPage, { runId: route.runId }) : null,
|
|
21734
|
-
route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null
|
|
21734
|
+
route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null,
|
|
21735
|
+
route.type === "compare-suite" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null
|
|
21735
21736
|
] })
|
|
21736
21737
|
] });
|
|
21737
21738
|
}
|
|
@@ -21795,7 +21796,8 @@ function RunListPage() {
|
|
|
21795
21796
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("td", { children: run.totalSteps }),
|
|
21796
21797
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("td", { children: [
|
|
21797
21798
|
new Date(run.startedAt).toLocaleString(),
|
|
21798
|
-
index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null
|
|
21799
|
+
index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null,
|
|
21800
|
+
index > 0 && runs[index - 1].suite === run.suite && runs[index - 1].suiteBatchId && run.suiteBatchId && runs[index - 1].suiteBatchId !== run.suiteBatchId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) }) : null
|
|
21799
21801
|
] })
|
|
21800
21802
|
] }, run.id)) })
|
|
21801
21803
|
] }) : null
|
|
@@ -21814,6 +21816,7 @@ function RunDetailPage(props) {
|
|
|
21814
21816
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: detail.run.id }),
|
|
21815
21817
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: detail.run.scenarioId })
|
|
21816
21818
|
] }),
|
|
21819
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(FailureSummaryPanel, { detail }),
|
|
21817
21820
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21818
21821
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Status", value: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${detail.run.status}`, children: detail.run.status }) }),
|
|
21819
21822
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score", value: detail.run.score }),
|
|
@@ -21833,6 +21836,7 @@ function RunDetailPage(props) {
|
|
|
21833
21836
|
" ",
|
|
21834
21837
|
detail.agentVersion?.modelId ?? "-"
|
|
21835
21838
|
] }),
|
|
21839
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunIdentitySummary, { detail }),
|
|
21836
21840
|
detail.agentVersion?.command ? /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
21837
21841
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Command:" }),
|
|
21838
21842
|
" ",
|
|
@@ -21890,6 +21894,71 @@ function RunDetailPage(props) {
|
|
|
21890
21894
|
] })
|
|
21891
21895
|
] });
|
|
21892
21896
|
}
|
|
21897
|
+
/**
 * Renders the "Failures First" panel shown on the run detail page.
 *
 * The panel lists the failure lines returned by getFailureSummaryItems, along
 * with the run status pill and the termination reason. When there are no
 * failure lines the component renders nothing.
 *
 * @param {{ detail: object }} props - `detail` is the run detail payload. This
 *   component reads `detail.run.status` and `detail.run.terminationReason`.
 * @returns {object|null} A React element, or null when there is nothing to report.
 */
function FailureSummaryPanel(props) {
  const failureItems = getFailureSummaryItems(props.detail);
  if (failureItems.length === 0) {
    return null;
  }
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Failures First" }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Status:" }),
      " ",
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: `pill ${props.detail.run.status}`, children: props.detail.run.status })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Termination:" }),
      " ",
      props.detail.run.terminationReason
    ] }),
    // The key includes the position because two failure lines can carry the
    // same text, and React requires sibling keys to be unique.
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: failureItems.map((item, index) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: item }, `${index}:${item}`)) })
  ] });
}
|
|
21917
|
+
/**
 * Renders the identity metadata rows for a run as a fragment of paragraphs.
 *
 * Each row is a bold label followed by the matching field of
 * `props.detail.run`; a field that is null or undefined is shown as "-".
 *
 * @param {{ detail: { run: object } }} props - Run detail payload.
 * @returns {object} A React fragment containing one `<p>` per identity field.
 */
function RunIdentitySummary(props) {
  const { run } = props.detail;
  // Label/value pairs in display order.
  const fields = [
    ["Variant set:", run.variantSetName],
    ["Variant:", run.variantLabel],
    ["Prompt version:", run.promptVersion],
    ["Model version:", run.modelVersion],
    ["Tool schema version:", run.toolSchemaVersion],
    ["Config label:", run.configLabel],
    ["Runtime profile:", run.runtimeProfileName],
    ["Suite definition:", run.suiteDefinitionName]
  ];
  const rows = fields.map(([label, value]) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: label }),
    " ",
    value ?? "-"
  ] }));
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: rows });
}
|
|
21893
21962
|
function ComparePage(props) {
|
|
21894
21963
|
const [data, setData] = (0, import_react.useState)(null);
|
|
21895
21964
|
(0, import_react.useEffect)(() => {
|
|
@@ -21914,6 +21983,7 @@ function ComparePage(props) {
|
|
|
21914
21983
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
|
|
21915
21984
|
] }),
|
|
21916
21985
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21986
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
21917
21987
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
|
|
21918
21988
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
|
|
21919
21989
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
|
|
@@ -21927,7 +21997,10 @@ function ComparePage(props) {
|
|
|
21927
21997
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21928
21998
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
|
|
21929
21999
|
data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
|
|
21930
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.
|
|
22000
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22001
|
+
diff.note,
|
|
22002
|
+
diff.hardGate ? " (hard gate)" : ""
|
|
22003
|
+
] }, diff.evaluatorId)) })
|
|
21931
22004
|
] }),
|
|
21932
22005
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21933
22006
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
|
|
@@ -22007,6 +22080,73 @@ function RunSide(props) {
|
|
|
22007
22080
|
] }) }, event.eventId)) })
|
|
22008
22081
|
] });
|
|
22009
22082
|
}
|
|
22083
|
+
/**
 * Page that compares two suite batches.
 *
 * Fetches `/api/compare-suite?baselineBatch=…&candidateBatch=…` whenever either
 * batch id changes and renders the classification, deltas, notes, regressions,
 * improvements and missing scenarios from the response.
 *
 * States rendered:
 * - either batch id missing  → "No suite comparison selected"
 * - request failed           → "Suite comparison failed" with the error text
 * - request in flight        → "Loading suite comparison"
 * - payload received         → the comparison view
 *
 * @param {{ baselineBatch?: string, candidateBatch?: string }} props
 * @returns {object} A React element.
 */
function SuiteComparePage(props) {
  const [data, setData] = (0, import_react.useState)(null);
  const [error, setError] = (0, import_react.useState)(null);
  (0, import_react.useEffect)(() => {
    // Drop whatever belonged to the previous pair of batch ids so the page never
    // shows an old comparison (or an old error) under new ids.
    setData(null);
    setError(null);
    if (!props.baselineBatch || !props.candidateBatch) {
      return void 0;
    }
    // Set by the cleanup function; a response that arrives after the ids changed
    // or the page unmounted must not touch state.
    let cancelled = false;
    const url = new URL("/api/compare-suite", window.location.origin);
    url.searchParams.set("baselineBatch", props.baselineBatch);
    url.searchParams.set("candidateBatch", props.candidateBatch);
    fetch(url).then((response) => {
      // A non-2xx body is not a comparison payload; rendering it would throw
      // on `data.deltas`, so treat it as a failure instead.
      if (!response.ok) {
        throw new Error(`Suite comparison request failed with HTTP ${response.status}.`);
      }
      return response.json();
    }).then((payload) => {
      if (!cancelled) {
        setData(payload);
      }
    }).catch((cause) => {
      if (!cancelled) {
        setError(cause instanceof Error ? cause.message : String(cause));
      }
    });
    return () => {
      cancelled = true;
    };
  }, [props.baselineBatch, props.candidateBatch]);
  if (!props.baselineBatch || !props.candidateBatch) {
    return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
  }
  if (error) {
    return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Suite comparison failed", description: error });
  }
  if (!data) {
    return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
  }
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
      data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
      // Position is part of the key because two notes may have identical text.
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note, index) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, `${index}:${note}`)) })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "panel-grid", children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Regressions", items: data.regressions }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Improvements", items: data.improvements })
    ] }),
    /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
      /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Missing scenarios" }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
        /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from candidate:" }),
        " ",
        data.missingFromCandidate.join(", ") || "None"
      ] }),
      /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
        /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from baseline:" }),
        " ",
        data.missingFromBaseline.join(", ") || "None"
      ] })
    ] })
  ] });
}
|
|
22138
|
+
/**
 * Renders a titled panel listing per-scenario comparisons.
 *
 * Each entry shows the scenario id, its comparison classification, and a link
 * to the single-run compare page for the baseline/candidate pair. An empty
 * list renders a muted "None." line.
 *
 * @param {{ title: string, items: Array<object> }} props - Each item is read as
 *   `{ scenarioId, comparison: { classification, baseline: { run: { id } }, candidate: { run: { id } } } }`.
 * @returns {object} A React element.
 */
function ScenarioList(props) {
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
    props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
    /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => {
      // Build the query string with URLSearchParams so run ids containing
      // reserved characters (&, #, +, spaces) survive the round trip.
      const query = new URLSearchParams({
        baseline: item.comparison.baseline.run.id,
        candidate: item.comparison.candidate.run.id
      });
      return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
        /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
        " ",
        /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
        /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?${query.toString()}`, children: "open run compare" }) })
      ] }, item.scenarioId);
    }) })
  ] });
}
|
|
22010
22150
|
function Stat(props) {
|
|
22011
22151
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stat", children: [
|
|
22012
22152
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: props.label }),
|
|
@@ -22019,6 +22159,21 @@ function EmptyState(props) {
|
|
|
22019
22159
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: props.description })
|
|
22020
22160
|
] });
|
|
22021
22161
|
}
|
|
22162
|
+
/**
 * Builds the list of human-readable failure lines for a run detail payload.
 *
 * Order of the returned lines:
 * 1. `Error: …` when `detail.errorDetail` is set.
 * 2. One `Evaluator <id>: <message>` line per evaluator result whose status is "fail".
 * 3. A generic hint when the run did not pass but nothing above produced a line.
 *
 * @param {object} detail - Run detail payload; reads `errorDetail`,
 *   `evaluatorResults` and `run.status`.
 * @returns {string[]} Failure lines; empty when there is nothing to report.
 */
function getFailureSummaryItems(detail) {
  const items = [];
  if (detail.errorDetail) {
    items.push(`Error: ${detail.errorDetail}`);
  }
  // Treat an absent evaluatorResults field as "no results" rather than
  // throwing, so the detail page can still render.
  for (const result of detail.evaluatorResults ?? []) {
    if (result.status === "fail") {
      // Avoid printing the literal text "undefined" when a result has no message.
      items.push(`Evaluator ${result.evaluatorId}: ${result.message ?? "no message recorded"}`);
    }
  }
  if (detail.run.status !== "pass" && items.length === 0) {
    items.push("Run did not pass. Inspect evaluator results and trace for the first divergence.");
  }
  return items;
}
|
|
22022
22177
|
/**
 * Formats a delta for display with an explicit sign.
 *
 * Values greater than zero get a leading "+"; every other value is rendered
 * with its default string form.
 *
 * @param {number} value - The delta to format.
 * @returns {string} The formatted delta.
 */
function signed(value) {
  const prefix = value > 0 ? "+" : "";
  return `${prefix}${value}`;
}
|
|
@@ -22027,6 +22182,13 @@ function getRoute() {
|
|
|
22027
22182
|
if (url.pathname.startsWith("/runs/")) {
|
|
22028
22183
|
return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
|
|
22029
22184
|
}
|
|
22185
|
+
if (url.pathname === "/compare-suite") {
|
|
22186
|
+
return {
|
|
22187
|
+
type: "compare-suite",
|
|
22188
|
+
baselineBatch: url.searchParams.get("baselineBatch") ?? void 0,
|
|
22189
|
+
candidateBatch: url.searchParams.get("candidateBatch") ?? void 0
|
|
22190
|
+
};
|
|
22191
|
+
}
|
|
22030
22192
|
if (url.pathname === "/compare") {
|
|
22031
22193
|
return {
|
|
22032
22194
|
type: "compare",
|
package/docs/agents.md
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
Named agents are configured in `agentlab.config.yaml`.
|
|
4
|
+
|
|
5
|
+
Agents remain the stable execution unit even when you introduce Tier 1 comparison features. You still run one named agent at a time, but you can now group multiple named agents into a `variant_set` for prompt/model/config comparisons.
|
|
6
|
+
|
|
7
|
+
This repo supports four provider modes:
|
|
8
|
+
|
|
9
|
+
- `mock`
|
|
10
|
+
- `openai`
|
|
11
|
+
- `external_process`
|
|
12
|
+
- `http`
|
|
13
|
+
|
|
14
|
+
Choose the simplest provider that answers the engineering question you actually have:
|
|
15
|
+
|
|
16
|
+
- `mock` for deterministic harness verification
|
|
17
|
+
- `openai` for real model behavior on deterministic tools
|
|
18
|
+
- `external_process` for local agents where the runner should still own the tool loop
|
|
19
|
+
- `http` for real running services that own their own memory and internal orchestration
|
|
20
|
+
|
|
21
|
+
## Named Agent Config
|
|
22
|
+
|
|
23
|
+
Example covering all providers:
|
|
24
|
+
|
|
25
|
+
```yaml
|
|
26
|
+
agents:
|
|
27
|
+
- name: mock-default
|
|
28
|
+
provider: mock
|
|
29
|
+
label: mock-default
|
|
30
|
+
|
|
31
|
+
- name: openai-cheap
|
|
32
|
+
provider: openai
|
|
33
|
+
model: gpt-4o-mini
|
|
34
|
+
label: openai-cheap
|
|
35
|
+
|
|
36
|
+
- name: custom-node-agent
|
|
37
|
+
provider: external_process
|
|
38
|
+
command: node
|
|
39
|
+
args:
|
|
40
|
+
- custom_agents/node_agent.mjs
|
|
41
|
+
label: custom-node-agent
|
|
42
|
+
|
|
43
|
+
- name: my-production-agent
|
|
44
|
+
provider: http
|
|
45
|
+
url: http://localhost:3000/api/chat
|
|
46
|
+
label: my-production-agent
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Run a named agent with:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
53
|
+
agentlab run internal-teams.memory-followup-recall --agent my-production-agent
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Use a named variant set when you want to run one scenario or one suite against multiple agent variants and compare the results later:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
|
|
60
|
+
agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Each run records the underlying agent plus richer identity metadata such as `variant_label`, `prompt_version`, `model_version`, `tool_schema_version`, and `config_label`. Those fields appear in CLI summaries, `show`, stored run history, and the UI.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Mock
|
|
68
|
+
|
|
69
|
+
The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
|
|
70
|
+
|
|
71
|
+
Use it when you want:
|
|
72
|
+
|
|
73
|
+
- fast local verification
|
|
74
|
+
- stable docs examples
|
|
75
|
+
- predictable benchmark behavior
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## OpenAI
|
|
80
|
+
|
|
81
|
+
The OpenAI path uses your API key and a configured model.
|
|
82
|
+
|
|
83
|
+
Requirements:
|
|
84
|
+
|
|
85
|
+
- `OPENAI_API_KEY` in the environment
|
|
86
|
+
- a named `openai` agent in `agentlab.config.yaml`, or equivalent CLI runtime settings
|
|
87
|
+
|
|
88
|
+
Example:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
export OPENAI_API_KEY=...
|
|
92
|
+
agentlab run support.refund-correct-order --agent openai-cheap
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The OpenAI path is useful, but less deterministic than the mock path.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## External Process
|
|
100
|
+
|
|
101
|
+
External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
|
|
102
|
+
|
|
103
|
+
The runner stays in control of:
|
|
104
|
+
|
|
105
|
+
- tool execution
|
|
106
|
+
- stopping conditions
|
|
107
|
+
- runtime limits
|
|
108
|
+
- persisted run state
|
|
109
|
+
|
|
110
|
+
The external agent decides what tool to call next or when to return a final answer.
|
|
111
|
+
|
|
112
|
+
### Protocol
|
|
113
|
+
|
|
114
|
+
Runner events:
|
|
115
|
+
|
|
116
|
+
- `run_started`
|
|
117
|
+
- `tool_result`
|
|
118
|
+
- `runner_error`
|
|
119
|
+
|
|
120
|
+
Agent responses:
|
|
121
|
+
|
|
122
|
+
- `tool_call`
|
|
123
|
+
- `final`
|
|
124
|
+
- `error`
|
|
125
|
+
|
|
126
|
+
Minimal flow:
|
|
127
|
+
|
|
128
|
+
1. the runner sends `run_started`
|
|
129
|
+
2. the agent returns `tool_call` or `final`
|
|
130
|
+
3. the runner executes the tool and sends `tool_result`
|
|
131
|
+
4. the agent continues until it returns `final` or `error`
|
|
132
|
+
|
|
133
|
+
Working examples:
|
|
134
|
+
|
|
135
|
+
- `custom_agents/node_agent.mjs`
|
|
136
|
+
- `custom_agents/python_agent.py`
|
|
137
|
+
|
|
138
|
+
Run one of them with:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
agentlab run support.refund-via-config-tool --agent custom-node-agent
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Environment Allowlist
|
|
145
|
+
|
|
146
|
+
External-process agents can optionally define `envAllowlist`.
|
|
147
|
+
|
|
148
|
+
Use it when a child process needs specific environment variables passed through.
|
|
149
|
+
|
|
150
|
+
```yaml
|
|
151
|
+
agents:
|
|
152
|
+
- name: custom-agent
|
|
153
|
+
provider: external_process
|
|
154
|
+
command: node
|
|
155
|
+
args:
|
|
156
|
+
- custom_agents/node_agent.mjs
|
|
157
|
+
envAllowlist:
|
|
158
|
+
- OPENAI_API_KEY
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Only allow through what the child actually needs.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## HTTP
|
|
166
|
+
|
|
167
|
+
The `http` provider is for testing real production agents that run as HTTP services — Express, FastAPI, Next.js API routes, or any service that accepts a POST and returns a JSON response.
|
|
168
|
+
|
|
169
|
+
Unlike the other providers, HTTP agents manage their own conversation history and tool execution internally. agentlab sends the current message and a `conversation_id` each turn, then evaluates the reply.
|
|
170
|
+
|
|
171
|
+
Use HTTP agents with `type: conversation` scenarios. See [scenarios.md](scenarios.md) for the conversation scenario format.
|
|
172
|
+
|
|
173
|
+
This is the default choice when validating memoryful or stateful agents that already run as a service.
|
|
174
|
+
|
|
175
|
+
HTTP agents can be included inside a `variant_set` the same way as other named agents. Runtime-profile fault injection is currently applied only to task/tool-loop runs. Conversation scenarios may still reference a runtime profile for reusable authoring, but agentlab does not currently intercept the tools that an HTTP agent executes internally.
|
|
176
|
+
|
|
177
|
+
### Minimal Config
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
agents:
|
|
181
|
+
- name: my-agent
|
|
182
|
+
provider: http
|
|
183
|
+
url: http://localhost:3000/api/chat
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Default contract: agentlab posts `{ message, conversation_id }` and expects `{ message }` in the response.
|
|
187
|
+
|
|
188
|
+
### Custom Field Names
|
|
189
|
+
|
|
190
|
+
If your agent uses different field names:
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
agents:
|
|
194
|
+
- name: my-agent-custom
|
|
195
|
+
provider: http
|
|
196
|
+
url: http://localhost:3000/api/chat
|
|
197
|
+
request_template:
|
|
198
|
+
query: "{{message}}"
|
|
199
|
+
session_id: "{{conversation_id}}"
|
|
200
|
+
response_field: reply
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
`request_template` values support three placeholders:
|
|
204
|
+
|
|
205
|
+
- `{{message}}` — the current step message
|
|
206
|
+
- `{{conversation_id}}` — the UUID generated for this run (consistent across all steps)
|
|
207
|
+
- `{{env.VAR_NAME}}` — reads from the environment at runtime
|
|
208
|
+
|
|
209
|
+
Whitespace inside `{{ }}` is ignored: `{{ message }}` and `{{message}}` are identical.
|
|
210
|
+
|
|
211
|
+
### Auth and Timeout
|
|
212
|
+
|
|
213
|
+
```yaml
|
|
214
|
+
agents:
|
|
215
|
+
- name: my-agent-auth
|
|
216
|
+
provider: http
|
|
217
|
+
url: http://localhost:3000/api/chat
|
|
218
|
+
headers:
|
|
219
|
+
Authorization: "Bearer {{env.MY_AGENT_TOKEN}}"
|
|
220
|
+
timeout_ms: 10000
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
`timeout_ms` defaults to 30000 (30 seconds) if not set.
|
|
224
|
+
|
|
225
|
+
Header values also support `{{message}}`, `{{conversation_id}}`, and `{{env.VAR_NAME}}` placeholders.
|
|
226
|
+
|
|
227
|
+
### Full Config Reference
|
|
228
|
+
|
|
229
|
+
| Field | Required | Default | Description |
|
|
230
|
+
|-------|----------|---------|-------------|
|
|
231
|
+
| `url` | yes | — | HTTP endpoint to POST to |
|
|
232
|
+
| `request_template` | no | `{ message, conversation_id }` | Custom request body shape |
|
|
233
|
+
| `response_field` | no | `message` | Field to read the reply from |
|
|
234
|
+
| `headers` | no | `{}` | Additional HTTP headers |
|
|
235
|
+
| `timeout_ms` | no | `30000` | Per-request timeout in milliseconds |
|
|
236
|
+
| `label` | no | agent name | Display label in CLI output and run history |
|
|
237
|
+
|
|
238
|
+
### How It Works
|
|
239
|
+
|
|
240
|
+
For each run of a conversation scenario:
|
|
241
|
+
|
|
242
|
+
1. agentlab generates a UUID `conversation_id` once at the start of the run
|
|
243
|
+
2. for every step, it POSTs the current message and `conversation_id` to your agent
|
|
244
|
+
3. your agent is responsible for maintaining conversation history using that id
|
|
245
|
+
4. agentlab reads the reply, measures latency, and runs per-step evaluators
|
|
246
|
+
5. if a hard-gate evaluator fails, the run stops immediately
|
|
247
|
+
|
|
248
|
+
### Error Handling
|
|
249
|
+
|
|
250
|
+
HTTP provider runs can end with these termination reasons:
|
|
251
|
+
|
|
252
|
+
| Reason | Cause |
|
|
253
|
+
|--------|-------|
|
|
254
|
+
| `http_connection_failed` | Could not connect to the URL |
|
|
255
|
+
| `http_error` | Agent returned HTTP 4xx or 5xx |
|
|
256
|
+
| `timeout_exceeded` | Request exceeded `timeout_ms` |
|
|
257
|
+
| `invalid_response_format` | Response is not valid JSON, or the expected field is missing |
|
|
258
|
+
| `evaluator_failed` | A per-step hard-gate evaluator failed |
|
|
259
|
+
|
|
260
|
+
Infrastructure errors (`http_connection_failed`, `http_error`, `timeout_exceeded`, `invalid_response_format`) always produce `status: error` and `score: 0`.
|
|
261
|
+
|
|
262
|
+
---
|
|
263
|
+
|
|
264
|
+
## Best Practices
|
|
265
|
+
|
|
266
|
+
- use named agents instead of ad hoc provider flags
|
|
267
|
+
- keep labels stable so compare output stays readable
|
|
268
|
+
- prefer the mock path for smoke tests and docs
|
|
269
|
+
- use external-process agents when you want to wrap a local Node or Python agent
|
|
270
|
+
- use http agents when your agent is already running as a service
|
|
271
|
+
- keep the runner authoritative for tools and termination (external_process and mock)
|
|
272
|
+
- keep your agent authoritative for tools and history (http)
|
|
273
|
+
- choose the simplest provider that answers the engineering question you actually have
|
|
274
|
+
|
|
275
|
+
## Common Errors
|
|
276
|
+
|
|
277
|
+
Typical failures:
|
|
278
|
+
|
|
279
|
+
- missing `OPENAI_API_KEY`
|
|
280
|
+
- unsupported provider name
|
|
281
|
+
- missing external-process `command`
|
|
282
|
+
- invalid `args` or `envAllowlist`
|
|
283
|
+
- child process returning invalid JSON
|
|
284
|
+
- http agent url not running when the test starts
|
|
285
|
+
- http agent returning a field name that doesn't match `response_field`
|
|
286
|
+
|
|
287
|
+
See [troubleshooting.md](troubleshooting.md) for fixes.
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Golden Suites
|
|
2
|
+
|
|
3
|
+
Golden suites are the scenario portfolio internal engineering teams should keep as long-lived regression assets.
|
|
4
|
+
|
|
5
|
+
They are not just demos. They are engineering memory for the behaviors that matter before merge and before release.
|
|
6
|
+
|
|
7
|
+
## Required Launch Categories
|
|
8
|
+
|
|
9
|
+
- coding agent regressions
|
|
10
|
+
- support and policy agents
|
|
11
|
+
- incident / ops agents
|
|
12
|
+
- memoryful multi-turn agents
|
|
13
|
+
- tool-failure recovery
|
|
14
|
+
- ambiguity and escalation
|
|
15
|
+
- adversarial or malformed tool output
|
|
16
|
+
- cost / latency / step-discipline checks
|
|
17
|
+
|
|
18
|
+
## Recommended Portfolio Composition
|
|
19
|
+
|
|
20
|
+
- 5 golden workflows
|
|
21
|
+
- 5 historical regressions
|
|
22
|
+
- 5 ugly edge failures
|
|
23
|
+
- 3 degraded-tool scenarios
|
|
24
|
+
- 2 policy or escalation scenarios
|
|
25
|
+
|
|
26
|
+
## How To Use Golden Suites
|
|
27
|
+
|
|
28
|
+
1. Keep one or two scenarios for the happy path that must always work.
|
|
29
|
+
2. Add scenarios from real incidents as soon as a failure is understood.
|
|
30
|
+
3. Add edge-case scenarios for ambiguity, degraded tools, malformed outputs, and multi-turn drift.
|
|
31
|
+
4. Group launch-critical workflows into config-level `suite_definitions`.
|
|
32
|
+
5. Run one scenario while debugging locally.
|
|
33
|
+
6. Run a `pre_merge` suite definition before merge.
|
|
34
|
+
7. Run curated `release` and `incident_regressions` suite definitions before release.
|
|
35
|
+
|
|
36
|
+
## Suggested Initial Internal-Team Scenarios
|
|
37
|
+
|
|
38
|
+
- coding destructive edit guardrails
|
|
39
|
+
- incident triage under noisy alerts
|
|
40
|
+
- escalation on ambiguity instead of guessing
|
|
41
|
+
- malformed tool output or partial tool output
|
|
42
|
+
- cross-session memory leakage
|
|
43
|
+
- follow-up recall across turns
|
|
44
|
+
|
|
45
|
+
## Design Rule
|
|
46
|
+
|
|
47
|
+
Treat suite composition as a product artifact.
|
|
48
|
+
|
|
49
|
+
The suite is part of the system design, not a disposable test folder.
|
|
50
|
+
|
|
51
|
+
## Recommended Suite Definitions
|
|
52
|
+
|
|
53
|
+
Use first-class `suite_definitions` instead of ad hoc tags alone:
|
|
54
|
+
|
|
55
|
+
```yaml
|
|
56
|
+
suite_definitions:
|
|
57
|
+
- name: smoke
|
|
58
|
+
include:
|
|
59
|
+
tags: [smoke]
|
|
60
|
+
|
|
61
|
+
- name: pre_merge
|
|
62
|
+
include:
|
|
63
|
+
tags: [smoke, regression]
|
|
64
|
+
|
|
65
|
+
- name: release
|
|
66
|
+
include:
|
|
67
|
+
suites: [support, internal-teams]
|
|
68
|
+
|
|
69
|
+
- name: incident_regressions
|
|
70
|
+
include:
|
|
71
|
+
tags: [incident, regression]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
These become the operational units you wire into local verification, pre-merge checks, and release readiness.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Integrations And Live Services
|
|
2
|
+
|
|
3
|
+
Use this guide to choose the right Agent Regression Lab (ARL) provider path for the engineering question you are trying to answer.
|
|
4
|
+
|
|
5
|
+
## Provider Matrix
|
|
6
|
+
|
|
7
|
+
### `mock`
|
|
8
|
+
|
|
9
|
+
Use when you want:
|
|
10
|
+
|
|
11
|
+
- deterministic smoke tests
|
|
12
|
+
- stable docs examples
|
|
13
|
+
- baseline verification while changing the harness itself
|
|
14
|
+
|
|
15
|
+
### `openai`
|
|
16
|
+
|
|
17
|
+
Use when you want:
|
|
18
|
+
|
|
19
|
+
- real model behavior against deterministic tool surfaces
|
|
20
|
+
- prompt and model validation before merge
|
|
21
|
+
- quick local comparisons where the model is the variable
|
|
22
|
+
|
|
23
|
+
### `external_process`
|
|
24
|
+
|
|
25
|
+
Use when you want:
|
|
26
|
+
|
|
27
|
+
- a local Node or Python agent to participate in the runner-controlled tool loop
|
|
28
|
+
- the runner to remain authoritative for tools, step limits, and storage
|
|
29
|
+
- a thin adapter around an existing local agent implementation
|
|
30
|
+
|
|
31
|
+
### `http`
|
|
32
|
+
|
|
33
|
+
Use when you want:
|
|
34
|
+
|
|
35
|
+
- production-like multi-turn validation against a running service
|
|
36
|
+
- the agent to own memory, conversation history, and internal tool execution
|
|
37
|
+
- live verification of a real app instead of a deterministic wrapper
|
|
38
|
+
|
|
39
|
+
`arl-test/` is the canonical example of this path in this repo.
|
|
40
|
+
|
|
41
|
+
## Live-Service Verification
|
|
42
|
+
|
|
43
|
+
Default workflow:
|
|
44
|
+
|
|
45
|
+
1. start the service
|
|
46
|
+
2. run `agentlab` from the project containing the relevant scenarios and `agentlab.config.yaml`
|
|
47
|
+
3. run one scenario while debugging
|
|
48
|
+
4. run a suite before merge
|
|
49
|
+
5. compare candidate runs or suite batches against a known baseline
|
|
50
|
+
|
|
51
|
+
## Integration Design Rule
|
|
52
|
+
|
|
53
|
+
Choose the simplest provider that answers the engineering question you have.
|
|
54
|
+
|
|
55
|
+
- If you only need deterministic regression evidence, prefer `mock`.
|
|
56
|
+
- If you need real model behavior but deterministic tools, prefer `openai`.
|
|
57
|
+
- If you need a local agent implementation but still want runner-owned tools, prefer `external_process`.
|
|
58
|
+
- If you need the real running service with its own memory and orchestration, use `http`.
|