agent-regression-lab 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -123
- package/dist/agent/httpAdapter.js +78 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +37 -1
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/index.js +287 -102
- package/dist/lib/id.js +3 -0
- package/dist/scenarios.js +121 -9
- package/dist/storage.js +193 -29
- package/dist/tools.js +246 -0
- package/dist/ui/App.js +39 -3
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +83 -3
- package/docs/agents.md +152 -0
- package/docs/release-checklist.md +64 -0
- package/docs/scenarios.md +172 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +158 -0
- package/package.json +3 -2
package/dist/ui/server.js
CHANGED
|
@@ -80,6 +80,10 @@ function handleApi(url, response) {
|
|
|
80
80
|
...comparison.candidate,
|
|
81
81
|
errorDetail: getRunErrorDetail(comparison.candidate),
|
|
82
82
|
},
|
|
83
|
+
classification: comparison.classification,
|
|
84
|
+
verdictDelta: comparison.verdictDelta,
|
|
85
|
+
terminationDelta: comparison.terminationDelta,
|
|
86
|
+
outputChanged: comparison.outputChanged,
|
|
83
87
|
notes: comparison.notes,
|
|
84
88
|
deltas: comparison.deltas,
|
|
85
89
|
evaluatorDiffs: comparison.evaluatorDiffs,
|
|
@@ -87,11 +91,25 @@ function handleApi(url, response) {
|
|
|
87
91
|
});
|
|
88
92
|
return;
|
|
89
93
|
}
|
|
94
|
+
if (url.pathname === "/api/compare-suite") {
|
|
95
|
+
const baselineBatch = url.searchParams.get("baselineBatch");
|
|
96
|
+
const candidateBatch = url.searchParams.get("candidateBatch");
|
|
97
|
+
if (!baselineBatch || !candidateBatch) {
|
|
98
|
+
sendJson(response, 400, { error: "Both 'baselineBatch' and 'candidateBatch' query params are required." });
|
|
99
|
+
return;
|
|
100
|
+
}
|
|
101
|
+
const comparison = storage.compareSuites(baselineBatch, candidateBatch);
|
|
102
|
+
sendJson(response, 200, comparison);
|
|
103
|
+
return;
|
|
104
|
+
}
|
|
90
105
|
sendJson(response, 404, { error: "Not found." });
|
|
91
106
|
}
|
|
92
107
|
catch (error) {
|
|
93
108
|
sendJson(response, 500, { error: error instanceof Error ? error.message : String(error) });
|
|
94
109
|
}
|
|
110
|
+
finally {
|
|
111
|
+
storage.close();
|
|
112
|
+
}
|
|
95
113
|
}
|
|
96
114
|
async function buildUiAssets() {
|
|
97
115
|
if (existsSync(PACKAGED_ASSETS_ROOT)) {
|
package/dist/ui-assets/client.js
CHANGED
|
@@ -21731,7 +21731,8 @@ function App() {
|
|
|
21731
21731
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("main", { className: "page", children: [
|
|
21732
21732
|
route.type === "list" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunListPage, {}) : null,
|
|
21733
21733
|
route.type === "detail" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunDetailPage, { runId: route.runId }) : null,
|
|
21734
|
-
route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null
|
|
21734
|
+
route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null,
|
|
21735
|
+
route.type === "compare-suite" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null
|
|
21735
21736
|
] })
|
|
21736
21737
|
] });
|
|
21737
21738
|
}
|
|
@@ -21795,7 +21796,8 @@ function RunListPage() {
|
|
|
21795
21796
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("td", { children: run.totalSteps }),
|
|
21796
21797
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("td", { children: [
|
|
21797
21798
|
new Date(run.startedAt).toLocaleString(),
|
|
21798
|
-
index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null
|
|
21799
|
+
index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null,
|
|
21800
|
+
index > 0 && runs[index - 1].suite === run.suite && runs[index - 1].suiteBatchId && run.suiteBatchId && runs[index - 1].suiteBatchId !== run.suiteBatchId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) }) : null
|
|
21799
21801
|
] })
|
|
21800
21802
|
] }, run.id)) })
|
|
21801
21803
|
] }) : null
|
|
@@ -21914,6 +21916,7 @@ function ComparePage(props) {
|
|
|
21914
21916
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
|
|
21915
21917
|
] }),
|
|
21916
21918
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
21919
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
21917
21920
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
|
|
21918
21921
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
|
|
21919
21922
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
|
|
@@ -21927,7 +21930,10 @@ function ComparePage(props) {
|
|
|
21927
21930
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21928
21931
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
|
|
21929
21932
|
data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
|
|
21930
|
-
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.
|
|
21933
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
21934
|
+
diff.note,
|
|
21935
|
+
diff.hardGate ? " (hard gate)" : ""
|
|
21936
|
+
] }, diff.evaluatorId)) })
|
|
21931
21937
|
] }),
|
|
21932
21938
|
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
21933
21939
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
|
|
@@ -22007,6 +22013,73 @@ function RunSide(props) {
|
|
|
22007
22013
|
] }) }, event.eventId)) })
|
|
22008
22014
|
] });
|
|
22009
22015
|
}
|
|
22016
|
+
function SuiteComparePage(props) {
|
|
22017
|
+
const [data, setData] = (0, import_react.useState)(null);
|
|
22018
|
+
(0, import_react.useEffect)(() => {
|
|
22019
|
+
if (!props.baselineBatch || !props.candidateBatch) {
|
|
22020
|
+
setData(null);
|
|
22021
|
+
return;
|
|
22022
|
+
}
|
|
22023
|
+
const url = new URL("/api/compare-suite", window.location.origin);
|
|
22024
|
+
url.searchParams.set("baselineBatch", props.baselineBatch);
|
|
22025
|
+
url.searchParams.set("candidateBatch", props.candidateBatch);
|
|
22026
|
+
void fetch(url).then((response) => response.json()).then((payload) => setData(payload));
|
|
22027
|
+
}, [props.baselineBatch, props.candidateBatch]);
|
|
22028
|
+
if (!props.baselineBatch || !props.candidateBatch) {
|
|
22029
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
|
|
22030
|
+
}
|
|
22031
|
+
if (!data) {
|
|
22032
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
|
|
22033
|
+
}
|
|
22034
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
|
|
22035
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
|
|
22036
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
|
|
22037
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
|
|
22038
|
+
] }),
|
|
22039
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
|
|
22040
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
|
|
22041
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
|
|
22042
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }),
|
|
22043
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }),
|
|
22044
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }),
|
|
22045
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })
|
|
22046
|
+
] }),
|
|
22047
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22048
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
|
|
22049
|
+
data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
|
|
22050
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
|
|
22051
|
+
] }),
|
|
22052
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "panel-grid", children: [
|
|
22053
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Regressions", items: data.regressions }),
|
|
22054
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Improvements", items: data.improvements })
|
|
22055
|
+
] }),
|
|
22056
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22057
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Missing scenarios" }),
|
|
22058
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
22059
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from candidate:" }),
|
|
22060
|
+
" ",
|
|
22061
|
+
data.missingFromCandidate.join(", ") || "None"
|
|
22062
|
+
] }),
|
|
22063
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
|
|
22064
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from baseline:" }),
|
|
22065
|
+
" ",
|
|
22066
|
+
data.missingFromBaseline.join(", ") || "None"
|
|
22067
|
+
] })
|
|
22068
|
+
] })
|
|
22069
|
+
] });
|
|
22070
|
+
}
|
|
22071
|
+
function ScenarioList(props) {
|
|
22072
|
+
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
|
|
22073
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
|
|
22074
|
+
props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
|
|
22075
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
|
|
22076
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
|
|
22077
|
+
" ",
|
|
22078
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
|
|
22079
|
+
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
|
|
22080
|
+
] }, item.scenarioId)) })
|
|
22081
|
+
] });
|
|
22082
|
+
}
|
|
22010
22083
|
function Stat(props) {
|
|
22011
22084
|
return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stat", children: [
|
|
22012
22085
|
/* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: props.label }),
|
|
@@ -22027,6 +22100,13 @@ function getRoute() {
|
|
|
22027
22100
|
if (url.pathname.startsWith("/runs/")) {
|
|
22028
22101
|
return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
|
|
22029
22102
|
}
|
|
22103
|
+
if (url.pathname === "/compare-suite") {
|
|
22104
|
+
return {
|
|
22105
|
+
type: "compare-suite",
|
|
22106
|
+
baselineBatch: url.searchParams.get("baselineBatch") ?? void 0,
|
|
22107
|
+
candidateBatch: url.searchParams.get("candidateBatch") ?? void 0
|
|
22108
|
+
};
|
|
22109
|
+
}
|
|
22030
22110
|
if (url.pathname === "/compare") {
|
|
22031
22111
|
return {
|
|
22032
22112
|
type: "compare",
|
package/docs/agents.md
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Agents
|
|
2
|
+
|
|
3
|
+
Named agents are configured in `agentlab.config.yaml`.
|
|
4
|
+
|
|
5
|
+
This repo currently supports three provider modes:
|
|
6
|
+
|
|
7
|
+
- `mock`
|
|
8
|
+
- `openai`
|
|
9
|
+
- `external_process`
|
|
10
|
+
|
|
11
|
+
## Named Agent Config
|
|
12
|
+
|
|
13
|
+
Example:
|
|
14
|
+
|
|
15
|
+
```yaml
|
|
16
|
+
agents:
|
|
17
|
+
- name: mock-default
|
|
18
|
+
provider: mock
|
|
19
|
+
label: mock-default
|
|
20
|
+
|
|
21
|
+
- name: openai-cheap
|
|
22
|
+
provider: openai
|
|
23
|
+
model: gpt-4o-mini
|
|
24
|
+
label: openai-cheap
|
|
25
|
+
|
|
26
|
+
- name: custom-node-agent
|
|
27
|
+
provider: external_process
|
|
28
|
+
command: node
|
|
29
|
+
args:
|
|
30
|
+
- custom_agents/node_agent.mjs
|
|
31
|
+
label: custom-node-agent
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Run a named agent with:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Mock
|
|
41
|
+
|
|
42
|
+
The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
|
|
43
|
+
|
|
44
|
+
Use it when you want:
|
|
45
|
+
|
|
46
|
+
- fast local verification
|
|
47
|
+
- stable docs examples
|
|
48
|
+
- predictable benchmark behavior
|
|
49
|
+
|
|
50
|
+
## OpenAI
|
|
51
|
+
|
|
52
|
+
The OpenAI path uses your API key and a configured model.
|
|
53
|
+
|
|
54
|
+
Requirements:
|
|
55
|
+
|
|
56
|
+
- `OPENAI_API_KEY` in the environment
|
|
57
|
+
- a named `openai` agent in `agentlab.config.yaml`, or equivalent CLI runtime settings
|
|
58
|
+
|
|
59
|
+
Example:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
export OPENAI_API_KEY=...
|
|
63
|
+
agentlab run support.refund-correct-order --agent openai-cheap
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
The OpenAI path is useful, but less deterministic than the mock path.
|
|
67
|
+
|
|
68
|
+
## External Process
|
|
69
|
+
|
|
70
|
+
External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
|
|
71
|
+
|
|
72
|
+
The runner stays in control of:
|
|
73
|
+
|
|
74
|
+
- tool execution
|
|
75
|
+
- stopping conditions
|
|
76
|
+
- runtime limits
|
|
77
|
+
- persisted run state
|
|
78
|
+
|
|
79
|
+
The external agent decides what tool to call next or when to return a final answer.
|
|
80
|
+
|
|
81
|
+
### Protocol
|
|
82
|
+
|
|
83
|
+
Runner events:
|
|
84
|
+
|
|
85
|
+
- `run_started`
|
|
86
|
+
- `tool_result`
|
|
87
|
+
- `runner_error`
|
|
88
|
+
|
|
89
|
+
Agent responses:
|
|
90
|
+
|
|
91
|
+
- `tool_call`
|
|
92
|
+
- `final`
|
|
93
|
+
- `error`
|
|
94
|
+
|
|
95
|
+
Minimal flow:
|
|
96
|
+
|
|
97
|
+
1. the runner sends `run_started`
|
|
98
|
+
2. the agent returns `tool_call` or `final`
|
|
99
|
+
3. the runner executes the tool and sends `tool_result`
|
|
100
|
+
4. the agent continues until it returns `final` or `error`
|
|
101
|
+
|
|
102
|
+
Working examples:
|
|
103
|
+
|
|
104
|
+
- `custom_agents/node_agent.mjs`
|
|
105
|
+
- `custom_agents/python_agent.py`
|
|
106
|
+
|
|
107
|
+
Run one of them with:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
agentlab run support.refund-via-config-tool --agent custom-node-agent
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Environment Allowlist
|
|
114
|
+
|
|
115
|
+
External-process agents can optionally define `envAllowlist`.
|
|
116
|
+
|
|
117
|
+
Use it when a child process needs specific environment variables passed through.
|
|
118
|
+
|
|
119
|
+
Example shape:
|
|
120
|
+
|
|
121
|
+
```yaml
|
|
122
|
+
agents:
|
|
123
|
+
- name: custom-agent
|
|
124
|
+
provider: external_process
|
|
125
|
+
command: node
|
|
126
|
+
args:
|
|
127
|
+
- custom_agents/node_agent.mjs
|
|
128
|
+
envAllowlist:
|
|
129
|
+
- OPENAI_API_KEY
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Only allow through what the child actually needs.
|
|
133
|
+
|
|
134
|
+
## Best Practices
|
|
135
|
+
|
|
136
|
+
- use named agents instead of ad hoc local command strings
|
|
137
|
+
- keep labels stable so compare output stays readable
|
|
138
|
+
- prefer the mock path for smoke tests and docs
|
|
139
|
+
- use external-process agents when you want to wrap a local Node or Python agent implementation
|
|
140
|
+
- keep the runner authoritative for tools and termination
|
|
141
|
+
|
|
142
|
+
## Common Errors
|
|
143
|
+
|
|
144
|
+
Typical failures:
|
|
145
|
+
|
|
146
|
+
- missing `OPENAI_API_KEY`
|
|
147
|
+
- unsupported provider name
|
|
148
|
+
- missing external-process `command`
|
|
149
|
+
- invalid `args` or `envAllowlist`
|
|
150
|
+
- child process returning invalid JSON
|
|
151
|
+
|
|
152
|
+
See [troubleshooting.md](troubleshooting.md) for fixes.
|
package/docs/release-checklist.md
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Release Checklist
|
|
2
|
+
|
|
3
|
+
Use this before publishing a new npm version or telling users to upgrade.
|
|
4
|
+
|
|
5
|
+
## Verification
|
|
6
|
+
|
|
7
|
+
Run the full release gate:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm run check
|
|
11
|
+
npm test
|
|
12
|
+
npm run build
|
|
13
|
+
npm run smoke:cli
|
|
14
|
+
npm pack --dry-run
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Manual CLI Flow
|
|
18
|
+
|
|
19
|
+
Verify the canonical workflow:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
agentlab list scenarios
|
|
23
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
24
|
+
agentlab show <run-id>
|
|
25
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
26
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
27
|
+
agentlab run --suite support --agent mock-default
|
|
28
|
+
agentlab run --suite support --agent mock-default
|
|
29
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
30
|
+
agentlab ui
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Extension Smoke
|
|
34
|
+
|
|
35
|
+
Verify at least one extension path:
|
|
36
|
+
|
|
37
|
+
- run `support.refund-via-config-tool` with `custom-node-agent`, or
|
|
38
|
+
- verify a repo-local custom tool still loads from `agentlab.config.yaml`
|
|
39
|
+
|
|
40
|
+
## Docs Verification
|
|
41
|
+
|
|
42
|
+
Confirm these files match current behavior:
|
|
43
|
+
|
|
44
|
+
- `README.md`
|
|
45
|
+
- `docs/scenarios.md`
|
|
46
|
+
- `docs/tools.md`
|
|
47
|
+
- `docs/agents.md`
|
|
48
|
+
- `docs/troubleshooting.md`
|
|
49
|
+
|
|
50
|
+
Requirements:
|
|
51
|
+
|
|
52
|
+
- every command works as written
|
|
53
|
+
- every referenced path exists
|
|
54
|
+
- limitations are stated honestly
|
|
55
|
+
- `compare --suite` is documented using suite batch ids, not run ids
|
|
56
|
+
|
|
57
|
+
## Publish Hygiene
|
|
58
|
+
|
|
59
|
+
Before `npm publish`:
|
|
60
|
+
|
|
61
|
+
- confirm the package version is correct
|
|
62
|
+
- confirm the git tree contains the intended release changes
|
|
63
|
+
- confirm packaged UI assets are included in the tarball
|
|
64
|
+
- confirm the npm metadata still points at the correct repo, homepage, and issues URL
|
package/docs/scenarios.md
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# Scenarios
|
|
2
|
+
|
|
3
|
+
Scenarios are YAML files under `scenarios/`. They are the core authoring interface for the product.
|
|
4
|
+
|
|
5
|
+
Each scenario should describe one narrow job for the agent, not a vague capability test.
|
|
6
|
+
|
|
7
|
+
## Required Shape
|
|
8
|
+
|
|
9
|
+
Each scenario should define:
|
|
10
|
+
|
|
11
|
+
- `id`
|
|
12
|
+
- `name`
|
|
13
|
+
- `suite`
|
|
14
|
+
- `task`
|
|
15
|
+
- `tools`
|
|
16
|
+
- `runtime`
|
|
17
|
+
- `evaluators`
|
|
18
|
+
|
|
19
|
+
Common optional fields already used in this repo:
|
|
20
|
+
|
|
21
|
+
- `description`
|
|
22
|
+
- `difficulty`
|
|
23
|
+
- `tags`
|
|
24
|
+
- task `context`
|
|
25
|
+
|
|
26
|
+
## Example
|
|
27
|
+
|
|
28
|
+
```yaml
|
|
29
|
+
id: support.refund-correct-order
|
|
30
|
+
name: Refund The Correct Order
|
|
31
|
+
suite: support
|
|
32
|
+
difficulty: easy
|
|
33
|
+
description: Refund only the duplicated charge.
|
|
34
|
+
tags:
|
|
35
|
+
- refund
|
|
36
|
+
- support
|
|
37
|
+
task:
|
|
38
|
+
instructions: |
|
|
39
|
+
The customer says they were charged twice.
|
|
40
|
+
Find the duplicated charge and refund only that order.
|
|
41
|
+
context:
|
|
42
|
+
customer_email: alice@example.com
|
|
43
|
+
tools:
|
|
44
|
+
allowed:
|
|
45
|
+
- crm.search_customer
|
|
46
|
+
- orders.list
|
|
47
|
+
- orders.refund
|
|
48
|
+
runtime:
|
|
49
|
+
max_steps: 8
|
|
50
|
+
timeout_seconds: 60
|
|
51
|
+
evaluators:
|
|
52
|
+
- id: refund-created
|
|
53
|
+
type: tool_call_assertion
|
|
54
|
+
mode: hard_gate
|
|
55
|
+
config:
|
|
56
|
+
tool: orders.refund
|
|
57
|
+
match:
|
|
58
|
+
order_id: ord_1024
|
|
59
|
+
- id: mentions-order
|
|
60
|
+
type: final_answer_contains
|
|
61
|
+
mode: weighted
|
|
62
|
+
weight: 1
|
|
63
|
+
config:
|
|
64
|
+
required_substrings:
|
|
65
|
+
- ord_1024
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Suites In This Repo
|
|
69
|
+
|
|
70
|
+
Current benchmark domains:
|
|
71
|
+
|
|
72
|
+
- `support`
|
|
73
|
+
- `coding`
|
|
74
|
+
- `research`
|
|
75
|
+
- `ops`
|
|
76
|
+
|
|
77
|
+
Use a suite when scenarios belong to one behavior family and should be runnable together with:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
agentlab run --suite support --agent mock-default
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
`run --suite` creates a suite batch id. That id is later used for:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Suite comparison is strict. Only compare batches from the same suite.
|
|
90
|
+
|
|
91
|
+
## Tools
|
|
92
|
+
|
|
93
|
+
Each scenario declares its allowed tools:
|
|
94
|
+
|
|
95
|
+
```yaml
|
|
96
|
+
tools:
|
|
97
|
+
allowed:
|
|
98
|
+
- crm.search_customer
|
|
99
|
+
- orders.list
|
|
100
|
+
- orders.refund
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Keep the tool allowlist as narrow as possible. A broad allowlist weakens the benchmark and makes regressions harder to interpret.
|
|
104
|
+
|
|
105
|
+
This repo supports both:
|
|
106
|
+
|
|
107
|
+
- built-in deterministic tools
|
|
108
|
+
- repo-local custom tools registered in `agentlab.config.yaml`
|
|
109
|
+
|
|
110
|
+
The launch benchmark now includes built-in tools for:
|
|
111
|
+
|
|
112
|
+
- support
|
|
113
|
+
- coding
|
|
114
|
+
- research
|
|
115
|
+
- ops
|
|
116
|
+
|
|
117
|
+
See [tools.md](tools.md) for custom tool registration.
|
|
118
|
+
|
|
119
|
+
## Runtime Limits
|
|
120
|
+
|
|
121
|
+
Scenarios can enforce:
|
|
122
|
+
|
|
123
|
+
- `max_steps`
|
|
124
|
+
- `timeout_seconds`
|
|
125
|
+
|
|
126
|
+
Example:
|
|
127
|
+
|
|
128
|
+
```yaml
|
|
129
|
+
runtime:
|
|
130
|
+
max_steps: 8
|
|
131
|
+
timeout_seconds: 60
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
These limits are enforced by the runner. Use them to keep runs bounded and comparisons meaningful.
|
|
135
|
+
|
|
136
|
+
## Evaluators
|
|
137
|
+
|
|
138
|
+
Use deterministic evaluators only.
|
|
139
|
+
|
|
140
|
+
The current evaluator set includes:
|
|
141
|
+
|
|
142
|
+
- `tool_call_assertion`
|
|
143
|
+
- `forbidden_tool`
|
|
144
|
+
- `final_answer_contains`
|
|
145
|
+
- `exact_final_answer`
|
|
146
|
+
- `step_count_max`
|
|
147
|
+
|
|
148
|
+
Guidance:
|
|
149
|
+
|
|
150
|
+
- use hard gates for non-negotiable behavior
|
|
151
|
+
- use weighted evaluators for softer quality checks
|
|
152
|
+
- prefer tool assertions or exact output checks over vague answer checks when possible
|
|
153
|
+
|
|
154
|
+
## Authoring Conventions
|
|
155
|
+
|
|
156
|
+
Use these defaults:
|
|
157
|
+
|
|
158
|
+
- `id` format: `<suite>.<short-name>`
|
|
159
|
+
- keep scenario jobs narrow and concrete
|
|
160
|
+
- keep fixture-backed context in `task.context`
|
|
161
|
+
- prefer deterministic fixture references over open-ended prompts
|
|
162
|
+
- include `difficulty`, `description`, and `tags` for every launch scenario
|
|
163
|
+
|
|
164
|
+
## Current Examples
|
|
165
|
+
|
|
166
|
+
Useful scenario references in this repo:
|
|
167
|
+
|
|
168
|
+
- support: `scenarios/support/refund-correct-order.yaml`
|
|
169
|
+
- support with config tool: `scenarios/support/refund-via-config-tool.yaml`
|
|
170
|
+
- coding: `scenarios/coding/fix-add-function.yaml`
|
|
171
|
+
- research: `scenarios/research/remote-work-policy.yaml`
|
|
172
|
+
- ops: `scenarios/ops/payments-api-alert.yaml`
|
package/docs/tools.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Custom Tools
|
|
2
|
+
|
|
3
|
+
Custom tools are registered in `agentlab.config.yaml` and loaded from repo-local JS or TS modules.
|
|
4
|
+
|
|
5
|
+
This is the main extension point when built-in tools are not enough.
|
|
6
|
+
|
|
7
|
+
## What A Tool Registration Needs
|
|
8
|
+
|
|
9
|
+
Each tool entry must define:
|
|
10
|
+
|
|
11
|
+
- `name`
|
|
12
|
+
- `modulePath`
|
|
13
|
+
- `exportName`
|
|
14
|
+
- `description`
|
|
15
|
+
- `inputSchema`
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
|
|
19
|
+
```yaml
|
|
20
|
+
tools:
|
|
21
|
+
- name: support.find_duplicate_charge
|
|
22
|
+
modulePath: user_tools/findDuplicateCharge.ts
|
|
23
|
+
exportName: findDuplicateCharge
|
|
24
|
+
description: Find the duplicated charge order id for a given customer.
|
|
25
|
+
inputSchema:
|
|
26
|
+
type: object
|
|
27
|
+
additionalProperties: false
|
|
28
|
+
properties:
|
|
29
|
+
customer_id:
|
|
30
|
+
type: string
|
|
31
|
+
description: Customer id to inspect for duplicated charges.
|
|
32
|
+
required:
|
|
33
|
+
- customer_id
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Tool Module Shape
|
|
37
|
+
|
|
38
|
+
The exported function should be async and should return JSON-serializable output.
|
|
39
|
+
|
|
40
|
+
Minimal example:
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
export async function myTool(input: unknown): Promise<{ ok: boolean }> {
|
|
44
|
+
return { ok: true };
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
The existing working example is:
|
|
49
|
+
|
|
50
|
+
- `user_tools/findDuplicateCharge.ts`
|
|
51
|
+
|
|
52
|
+
## Important Constraints
|
|
53
|
+
|
|
54
|
+
- `modulePath` must stay within the repo
|
|
55
|
+
- the module must exist at load time
|
|
56
|
+
- the named export must exist
|
|
57
|
+
- tool input should be validated defensively inside the tool
|
|
58
|
+
- tool output should be deterministic and JSON-serializable
|
|
59
|
+
|
|
60
|
+
For launch usage, treat tools as fixture-backed local functions, not live integrations.
|
|
61
|
+
|
|
62
|
+
## Recommended Pattern
|
|
63
|
+
|
|
64
|
+
Use this approach:
|
|
65
|
+
|
|
66
|
+
1. read fixture data from `fixtures/`
|
|
67
|
+
2. validate the input shape
|
|
68
|
+
3. return a small structured result
|
|
69
|
+
4. throw a clear error for missing fixture state or invalid input
|
|
70
|
+
|
|
71
|
+
The current `findDuplicateCharge` tool shows that pattern.
|
|
72
|
+
|
|
73
|
+
## Wiring A Tool Into A Scenario
|
|
74
|
+
|
|
75
|
+
1. register the tool in `agentlab.config.yaml`
|
|
76
|
+
2. add the tool name to the scenario allowlist
|
|
77
|
+
3. add an evaluator that confirms the tool was used correctly if the behavior is important
|
|
78
|
+
|
|
79
|
+
Example scenario:
|
|
80
|
+
|
|
81
|
+
- `scenarios/support/refund-via-config-tool.yaml`
|
|
82
|
+
|
|
83
|
+
## Best Practices
|
|
84
|
+
|
|
85
|
+
- keep tool names stable and descriptive
|
|
86
|
+
- keep tools scenario-agnostic where possible
|
|
87
|
+
- prefer read-only or sandboxed behavior
|
|
88
|
+
- do not mutate global machine state
|
|
89
|
+
- do not call live external systems in benchmark paths
|
|
90
|
+
- keep schemas narrow so agent tool calls are easy to validate and compare
|
|
91
|
+
|
|
92
|
+
## Common Errors
|
|
93
|
+
|
|
94
|
+
Typical config failures:
|
|
95
|
+
|
|
96
|
+
- duplicate tool names
|
|
97
|
+
- repo-external module paths
|
|
98
|
+
- missing module files
|
|
99
|
+
- missing exports
|
|
100
|
+
- invalid `inputSchema` shape
|
|
101
|
+
|
|
102
|
+
See [troubleshooting.md](troubleshooting.md) for failure examples and fixes.
|