agent-regression-lab 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ui/server.js CHANGED
@@ -80,6 +80,10 @@ function handleApi(url, response) {
80
80
  ...comparison.candidate,
81
81
  errorDetail: getRunErrorDetail(comparison.candidate),
82
82
  },
83
+ classification: comparison.classification,
84
+ verdictDelta: comparison.verdictDelta,
85
+ terminationDelta: comparison.terminationDelta,
86
+ outputChanged: comparison.outputChanged,
83
87
  notes: comparison.notes,
84
88
  deltas: comparison.deltas,
85
89
  evaluatorDiffs: comparison.evaluatorDiffs,
@@ -87,11 +91,25 @@ function handleApi(url, response) {
87
91
  });
88
92
  return;
89
93
  }
94
+ if (url.pathname === "/api/compare-suite") {
95
+ const baselineBatch = url.searchParams.get("baselineBatch");
96
+ const candidateBatch = url.searchParams.get("candidateBatch");
97
+ if (!baselineBatch || !candidateBatch) {
98
+ sendJson(response, 400, { error: "Both 'baselineBatch' and 'candidateBatch' query params are required." });
99
+ return;
100
+ }
101
+ const comparison = storage.compareSuites(baselineBatch, candidateBatch);
102
+ sendJson(response, 200, comparison);
103
+ return;
104
+ }
90
105
  sendJson(response, 404, { error: "Not found." });
91
106
  }
92
107
  catch (error) {
93
108
  sendJson(response, 500, { error: error instanceof Error ? error.message : String(error) });
94
109
  }
110
+ finally {
111
+ storage.close();
112
+ }
95
113
  }
96
114
  async function buildUiAssets() {
97
115
  if (existsSync(PACKAGED_ASSETS_ROOT)) {
@@ -21731,7 +21731,8 @@ function App() {
21731
21731
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("main", { className: "page", children: [
21732
21732
  route.type === "list" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunListPage, {}) : null,
21733
21733
  route.type === "detail" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(RunDetailPage, { runId: route.runId }) : null,
21734
- route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null
21734
+ route.type === "compare" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ComparePage, { baseline: route.baseline, candidate: route.candidate }) : null,
21735
+ route.type === "compare-suite" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(SuiteComparePage, { baselineBatch: route.baselineBatch, candidateBatch: route.candidateBatch }) : null
21735
21736
  ] })
21736
21737
  ] });
21737
21738
  }
@@ -21795,7 +21796,8 @@ function RunListPage() {
21795
21796
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("td", { children: run.totalSteps }),
21796
21797
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("td", { children: [
21797
21798
  new Date(run.startedAt).toLocaleString(),
21798
- index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null
21799
+ index > 0 && runs[index - 1].scenarioId === run.scenarioId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${runs[index - 1].id}&candidate=${run.id}`, children: "compare previous" }) }) : null,
21800
+ index > 0 && runs[index - 1].suite === run.suite && runs[index - 1].suiteBatchId && run.suiteBatchId && runs[index - 1].suiteBatchId !== run.suiteBatchId ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare-suite?baselineBatch=${runs[index - 1].suiteBatchId}&candidateBatch=${run.suiteBatchId}`, children: "compare suite batch" }) }) : null
21799
21801
  ] })
21800
21802
  ] }, run.id)) })
21801
21803
  ] }) : null
@@ -21914,6 +21916,7 @@ function ComparePage(props) {
21914
21916
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.baseline.run.scenarioId })
21915
21917
  ] }),
21916
21918
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
21919
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
21917
21920
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.score) }),
21918
21921
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.runtimeMs)}ms` }),
21919
21922
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.steps) })
@@ -21927,7 +21930,10 @@ function ComparePage(props) {
21927
21930
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21928
21931
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Evaluator diffs" }),
21929
21932
  data.evaluatorDiffs.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No evaluator changes." }) : null,
21930
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: diff.note }, diff.evaluatorId)) })
21933
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.evaluatorDiffs.map((diff) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
21934
+ diff.note,
21935
+ diff.hardGate ? " (hard gate)" : ""
21936
+ ] }, diff.evaluatorId)) })
21931
21937
  ] }),
21932
21938
  /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
21933
21939
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Tool diffs" }),
@@ -22007,6 +22013,73 @@ function RunSide(props) {
22007
22013
  ] }) }, event.eventId)) })
22008
22014
  ] });
22009
22015
  }
22016
/**
 * SuiteComparePage: React component that renders the comparison of two suite batches.
 *
 * props.baselineBatch / props.candidateBatch are the batch ids to compare. If either
 * is falsy, no request is made and a "No suite comparison selected" empty state is shown.
 *
 * Whenever either id changes, the effect requests `/api/compare-suite` (both ids as
 * query params, resolved against window.location.origin) and stores the parsed JSON
 * in `data`. While `data` is null a loading empty state is rendered; afterwards the
 * page shows the suite name, classification, delta stats, notes, regression and
 * improvement lists, and the scenarios missing from either batch.
 */
+ function SuiteComparePage(props) {
22017
+ const [data, setData] = (0, import_react.useState)(null);
22018
+ (0, import_react.useEffect)(() => {
22019
+ if (!props.baselineBatch || !props.candidateBatch) {
22020
+ setData(null);
22021
+ return;
22022
+ }
22023
+ const url = new URL("/api/compare-suite", window.location.origin);
22024
+ url.searchParams.set("baselineBatch", props.baselineBatch);
22025
+ url.searchParams.set("candidateBatch", props.candidateBatch);
// NOTE(review): the payload is stored without checking response.ok, and the promise
// chain has no rejection handler. An error body such as { error: "..." } is truthy,
// so it would pass the `!data` guard below and the render would dereference
// data.deltas on it. Confirm and add explicit error handling.
22026
+ void fetch(url).then((response) => response.json()).then((payload) => setData(payload));
22027
+ }, [props.baselineBatch, props.candidateBatch]);
22028
+ if (!props.baselineBatch || !props.candidateBatch) {
22029
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "No suite comparison selected", description: "Open the suite compare page with baseline and candidate batch ids." });
22030
+ }
// `data` stays null until the fetch above resolves, so this doubles as the loading state.
22031
+ if (!data) {
22032
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsx)(EmptyState, { title: "Loading suite comparison", description: "Fetching suite batches and computing regressions." });
22033
+ }
22034
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { children: [
22035
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "hero", children: [
22036
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h1", { children: "Suite Compare" }),
22037
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { children: data.suite })
22038
+ ] }),
22039
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stats", children: [
22040
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Classification", value: data.classification }),
22041
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Pass delta", value: signed(data.deltas.pass) }),
22042
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Fail delta", value: signed(data.deltas.fail) }),
22043
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Score delta", value: signed(data.deltas.averageScore) }),
22044
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Runtime delta", value: `${signed(data.deltas.averageRuntimeMs)}ms` }),
22045
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Stat, { label: "Step delta", value: signed(data.deltas.averageSteps) })
22046
+ ] }),
22047
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22048
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Notes" }),
22049
+ data.notes.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "No suite-level notes recorded." }) : null,
// Each note's own text is passed as its React key, so two identical notes would share a key.
22050
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: data.notes.map((note) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("li", { children: note }, note)) })
22051
+ ] }),
22052
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "panel-grid", children: [
22053
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Regressions", items: data.regressions }),
22054
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(ScenarioList, { title: "Improvements", items: data.improvements })
22055
+ ] }),
22056
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22057
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: "Missing scenarios" }),
22058
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22059
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from candidate:" }),
22060
+ " ",
// An empty list joins to "", which is falsy, so the text falls back to "None".
22061
+ data.missingFromCandidate.join(", ") || "None"
22062
+ ] }),
22063
+ /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("p", { children: [
22064
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: "Missing from baseline:" }),
22065
+ " ",
22066
+ data.missingFromBaseline.join(", ") || "None"
22067
+ ] })
22068
+ ] })
22069
+ ] });
22070
+ }
22071
/**
 * ScenarioList: panel that lists per-scenario comparison results under a heading.
 *
 * props.title is the heading text. props.items is an array whose entries are read
 * as { scenarioId, comparison }, where comparison exposes `classification`,
 * `baseline.run.id` and `candidate.run.id`.
 *
 * Shows "None." when the array is empty. Otherwise each entry renders its scenario
 * id, its classification, and a link to the run-level compare page.
 */
+ function ScenarioList(props) {
22072
+ return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("section", { className: "panel", children: [
22073
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("h2", { children: props.title }),
22074
+ props.items.length === 0 ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("p", { className: "muted", children: "None." }) : null,
22075
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("ul", { className: "stack", children: props.items.map((item) => /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("li", { children: [
22076
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("strong", { children: item.scenarioId }),
22077
+ " ",
22078
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("span", { className: "muted", children: item.comparison.classification }),
// NOTE(review): the run ids are interpolated into the query string without
// encodeURIComponent. Presumably ids are URL-safe; confirm against how ids are generated.
22079
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { children: /* @__PURE__ */ (0, import_jsx_runtime.jsx)("a", { href: `/compare?baseline=${item.comparison.baseline.run.id}&candidate=${item.comparison.candidate.run.id}`, children: "open run compare" }) })
// scenarioId is used as the React key for each list item.
22080
+ ] }, item.scenarioId)) })
22081
+ ] });
22082
+ }
22010
22083
  function Stat(props) {
22011
22084
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", { className: "stat", children: [
22012
22085
  /* @__PURE__ */ (0, import_jsx_runtime.jsx)("div", { className: "muted", children: props.label }),
@@ -22027,6 +22100,13 @@ function getRoute() {
22027
22100
  if (url.pathname.startsWith("/runs/")) {
22028
22101
  return { type: "detail", runId: decodeURIComponent(url.pathname.slice("/runs/".length)) };
22029
22102
  }
22103
+ if (url.pathname === "/compare-suite") {
22104
+ return {
22105
+ type: "compare-suite",
22106
+ baselineBatch: url.searchParams.get("baselineBatch") ?? void 0,
22107
+ candidateBatch: url.searchParams.get("candidateBatch") ?? void 0
22108
+ };
22109
+ }
22030
22110
  if (url.pathname === "/compare") {
22031
22111
  return {
22032
22112
  type: "compare",
package/docs/agents.md ADDED
@@ -0,0 +1,152 @@
1
+ # Agents
2
+
3
+ Named agents are configured in `agentlab.config.yaml`.
4
+
5
+ This repo currently supports three provider modes:
6
+
7
+ - `mock`
8
+ - `openai`
9
+ - `external_process`
10
+
11
+ ## Named Agent Config
12
+
13
+ Example:
14
+
15
+ ```yaml
16
+ agents:
17
+ - name: mock-default
18
+ provider: mock
19
+ label: mock-default
20
+
21
+ - name: openai-cheap
22
+ provider: openai
23
+ model: gpt-4o-mini
24
+ label: openai-cheap
25
+
26
+ - name: custom-node-agent
27
+ provider: external_process
28
+ command: node
29
+ args:
30
+ - custom_agents/node_agent.mjs
31
+ label: custom-node-agent
32
+ ```
33
+
34
+ Run a named agent with:
35
+
36
+ ```bash
37
+ agentlab run support.refund-correct-order --agent mock-default
38
+ ```
39
+
40
+ ## Mock
41
+
42
+ The built-in mock adapter is the best path for deterministic smoke tests and baseline examples.
43
+
44
+ Use it when you want:
45
+
46
+ - fast local verification
47
+ - stable docs examples
48
+ - predictable benchmark behavior
49
+
50
+ ## OpenAI
51
+
52
+ The OpenAI path uses your API key and a configured model.
53
+
54
+ Requirements:
55
+
56
+ - `OPENAI_API_KEY` in the environment
57
+ - a named `openai` agent in `agentlab.config.yaml`, or equivalent CLI runtime settings
58
+
59
+ Example:
60
+
61
+ ```bash
62
+ export OPENAI_API_KEY=...
63
+ agentlab run support.refund-correct-order --agent openai-cheap
64
+ ```
65
+
66
+ The OpenAI path is useful, but less deterministic than the mock path.
67
+
68
+ ## External Process
69
+
70
+ External-process agents communicate with the runner over line-delimited JSON on stdin/stdout.
71
+
72
+ The runner stays in control of:
73
+
74
+ - tool execution
75
+ - stopping conditions
76
+ - runtime limits
77
+ - persisted run state
78
+
79
+ The external agent decides what tool to call next or when to return a final answer.
80
+
81
+ ### Protocol
82
+
83
+ Runner events:
84
+
85
+ - `run_started`
86
+ - `tool_result`
87
+ - `runner_error`
88
+
89
+ Agent responses:
90
+
91
+ - `tool_call`
92
+ - `final`
93
+ - `error`
94
+
95
+ Minimal flow:
96
+
97
+ 1. the runner sends `run_started`
98
+ 2. the agent returns `tool_call` or `final`
99
+ 3. if the agent returned `tool_call`, the runner executes the tool and sends `tool_result`
100
+ 4. the agent continues until it returns `final` or `error`
101
+
102
+ Working examples:
103
+
104
+ - `custom_agents/node_agent.mjs`
105
+ - `custom_agents/python_agent.py`
106
+
107
+ Run one of them with:
108
+
109
+ ```bash
110
+ agentlab run support.refund-via-config-tool --agent custom-node-agent
111
+ ```
112
+
113
+ ## Environment Allowlist
114
+
115
+ External-process agents can optionally define `envAllowlist`.
116
+
117
+ Use it when a child process needs specific environment variables passed through.
118
+
119
+ Example shape:
120
+
121
+ ```yaml
122
+ agents:
123
+ - name: custom-agent
124
+ provider: external_process
125
+ command: node
126
+ args:
127
+ - custom_agents/node_agent.mjs
128
+ envAllowlist:
129
+ - OPENAI_API_KEY
130
+ ```
131
+
132
+ Only allow through what the child actually needs.
133
+
134
+ ## Best Practices
135
+
136
+ - use named agents instead of ad hoc local command strings
137
+ - keep labels stable so compare output stays readable
138
+ - prefer the mock path for smoke tests and docs
139
+ - use external-process agents when you want to wrap a local Node or Python agent implementation
140
+ - keep the runner authoritative for tools and termination
141
+
142
+ ## Common Errors
143
+
144
+ Typical failures:
145
+
146
+ - missing `OPENAI_API_KEY`
147
+ - unsupported provider name
148
+ - missing external-process `command`
149
+ - invalid `args` or `envAllowlist`
150
+ - child process returning invalid JSON
151
+
152
+ See [troubleshooting.md](troubleshooting.md) for fixes.
@@ -0,0 +1,64 @@
1
+ # Release Checklist
2
+
3
+ Use this before publishing a new npm version or telling users to upgrade.
4
+
5
+ ## Verification
6
+
7
+ Run the full release gate:
8
+
9
+ ```bash
10
+ npm run check
11
+ npm test
12
+ npm run build
13
+ npm run smoke:cli
14
+ npm pack --dry-run
15
+ ```
16
+
17
+ ## Manual CLI Flow
18
+
19
+ Verify the canonical workflow:
20
+
21
+ ```bash
22
+ agentlab list scenarios
23
+ agentlab run support.refund-correct-order --agent mock-default
24
+ agentlab show <run-id>
25
+ agentlab run support.refund-correct-order --agent mock-default
26
+ agentlab compare <baseline-run-id> <candidate-run-id>
27
+ agentlab run --suite support --agent mock-default
28
+ agentlab run --suite support --agent mock-default
29
+ agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
30
+ agentlab ui
31
+ ```
32
+
33
+ ## Extension Smoke
34
+
35
+ Verify at least one extension path:
36
+
37
+ - run `support.refund-via-config-tool` with `custom-node-agent`, or
38
+ - verify a repo-local custom tool still loads from `agentlab.config.yaml`
39
+
40
+ ## Docs Verification
41
+
42
+ Confirm these files match current behavior:
43
+
44
+ - `README.md`
45
+ - `docs/scenarios.md`
46
+ - `docs/tools.md`
47
+ - `docs/agents.md`
48
+ - `docs/troubleshooting.md`
49
+
50
+ Requirements:
51
+
52
+ - every command works as written
53
+ - every referenced path exists
54
+ - limitations are stated honestly
55
+ - `compare --suite` is documented using suite batch ids, not run ids
56
+
57
+ ## Publish Hygiene
58
+
59
+ Before `npm publish`:
60
+
61
+ - confirm the package version is correct
62
+ - confirm the git tree contains the intended release changes
63
+ - confirm packaged UI assets are included in the tarball
64
+ - confirm the npm metadata still points at the correct repo, homepage, and issues URL
package/docs/scenarios.md ADDED
@@ -0,0 +1,172 @@
1
+ # Scenarios
2
+
3
+ Scenarios are YAML files under `scenarios/`. They are the core authoring interface for the product.
4
+
5
+ Each scenario should describe one narrow job for the agent, not a vague capability test.
6
+
7
+ ## Required Shape
8
+
9
+ Each scenario should define:
10
+
11
+ - `id`
12
+ - `name`
13
+ - `suite`
14
+ - `task`
15
+ - `tools`
16
+ - `runtime`
17
+ - `evaluators`
18
+
19
+ Common optional fields already used in this repo:
20
+
21
+ - `description`
22
+ - `difficulty`
23
+ - `tags`
24
+ - task `context`
25
+
26
+ ## Example
27
+
28
+ ```yaml
29
+ id: support.refund-correct-order
30
+ name: Refund The Correct Order
31
+ suite: support
32
+ difficulty: easy
33
+ description: Refund only the duplicated charge.
34
+ tags:
35
+ - refund
36
+ - support
37
+ task:
38
+ instructions: |
39
+ The customer says they were charged twice.
40
+ Find the duplicated charge and refund only that order.
41
+ context:
42
+ customer_email: alice@example.com
43
+ tools:
44
+ allowed:
45
+ - crm.search_customer
46
+ - orders.list
47
+ - orders.refund
48
+ runtime:
49
+ max_steps: 8
50
+ timeout_seconds: 60
51
+ evaluators:
52
+ - id: refund-created
53
+ type: tool_call_assertion
54
+ mode: hard_gate
55
+ config:
56
+ tool: orders.refund
57
+ match:
58
+ order_id: ord_1024
59
+ - id: mentions-order
60
+ type: final_answer_contains
61
+ mode: weighted
62
+ weight: 1
63
+ config:
64
+ required_substrings:
65
+ - ord_1024
66
+ ```
67
+
68
+ ## Suites In This Repo
69
+
70
+ Current benchmark domains:
71
+
72
+ - `support`
73
+ - `coding`
74
+ - `research`
75
+ - `ops`
76
+
77
+ Use a suite when scenarios belong to one behavior family and should be runnable together with:
78
+
79
+ ```bash
80
+ agentlab run --suite support --agent mock-default
81
+ ```
82
+
83
+ Each `run --suite` invocation creates a suite batch id. Pass a baseline and a candidate batch id to compare two batches:
84
+
85
+ ```bash
86
+ agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
87
+ ```
88
+
89
+ Suite comparison is strict. Only compare batches from the same suite.
90
+
91
+ ## Tools
92
+
93
+ Each scenario declares its allowed tools:
94
+
95
+ ```yaml
96
+ tools:
97
+ allowed:
98
+ - crm.search_customer
99
+ - orders.list
100
+ - orders.refund
101
+ ```
102
+
103
+ Keep the tool allowlist as narrow as possible. A broad allowlist weakens the benchmark and makes regressions harder to interpret.
104
+
105
+ This repo supports both:
106
+
107
+ - built-in deterministic tools
108
+ - repo-local custom tools registered in `agentlab.config.yaml`
109
+
110
+ The launch benchmark now includes built-in tools for:
111
+
112
+ - support
113
+ - coding
114
+ - research
115
+ - ops
116
+
117
+ See [tools.md](tools.md) for custom tool registration.
118
+
119
+ ## Runtime Limits
120
+
121
+ Scenarios can enforce:
122
+
123
+ - `max_steps`
124
+ - `timeout_seconds`
125
+
126
+ Example:
127
+
128
+ ```yaml
129
+ runtime:
130
+ max_steps: 8
131
+ timeout_seconds: 60
132
+ ```
133
+
134
+ These limits are enforced by the runner. Use them to keep runs bounded and comparisons meaningful.
135
+
136
+ ## Evaluators
137
+
138
+ Use deterministic evaluators only.
139
+
140
+ The current evaluator set includes:
141
+
142
+ - `tool_call_assertion`
143
+ - `forbidden_tool`
144
+ - `final_answer_contains`
145
+ - `exact_final_answer`
146
+ - `step_count_max`
147
+
148
+ Guidance:
149
+
150
+ - use hard gates for non-negotiable behavior
151
+ - use weighted evaluators for softer quality checks
152
+ - prefer tool assertions or exact output checks over vague answer checks when possible
153
+
154
+ ## Authoring Conventions
155
+
156
+ Use these defaults:
157
+
158
+ - `id` format: `<suite>.<short-name>`
159
+ - keep scenario jobs narrow and concrete
160
+ - keep fixture-backed context in `task.context`
161
+ - prefer deterministic fixture references over open-ended prompts
162
+ - include `difficulty`, `description`, and `tags` for every launch scenario
163
+
164
+ ## Current Examples
165
+
166
+ Useful scenario references in this repo:
167
+
168
+ - support: `scenarios/support/refund-correct-order.yaml`
169
+ - support with config tool: `scenarios/support/refund-via-config-tool.yaml`
170
+ - coding: `scenarios/coding/fix-add-function.yaml`
171
+ - research: `scenarios/research/remote-work-policy.yaml`
172
+ - ops: `scenarios/ops/payments-api-alert.yaml`
package/docs/tools.md ADDED
@@ -0,0 +1,102 @@
1
+ # Custom Tools
2
+
3
+ Custom tools are registered in `agentlab.config.yaml` and loaded from repo-local JS or TS modules.
4
+
5
+ This is the main extension point when built-in tools are not enough.
6
+
7
+ ## What A Tool Registration Needs
8
+
9
+ Each tool entry must define:
10
+
11
+ - `name`
12
+ - `modulePath`
13
+ - `exportName`
14
+ - `description`
15
+ - `inputSchema`
16
+
17
+ Example:
18
+
19
+ ```yaml
20
+ tools:
21
+ - name: support.find_duplicate_charge
22
+ modulePath: user_tools/findDuplicateCharge.ts
23
+ exportName: findDuplicateCharge
24
+ description: Find the duplicated charge order id for a given customer.
25
+ inputSchema:
26
+ type: object
27
+ additionalProperties: false
28
+ properties:
29
+ customer_id:
30
+ type: string
31
+ description: Customer id to inspect for duplicated charges.
32
+ required:
33
+ - customer_id
34
+ ```
35
+
36
+ ## Tool Module Shape
37
+
38
+ The exported function should be async and should return JSON-serializable output.
39
+
40
+ Minimal example:
41
+
42
+ ```ts
43
+ export async function myTool(input: unknown): Promise<{ ok: boolean }> {
44
+ return { ok: true };
45
+ }
46
+ ```
47
+
48
+ The existing working example is:
49
+
50
+ - `user_tools/findDuplicateCharge.ts`
51
+
52
+ ## Important Constraints
53
+
54
+ - `modulePath` must stay within the repo
55
+ - the module must exist at load time
56
+ - the named export must exist
57
+ - tool input should be validated defensively inside the tool
58
+ - tool output should be deterministic and JSON-serializable
59
+
60
+ For launch usage, treat tools as fixture-backed local functions, not live integrations.
61
+
62
+ ## Recommended Pattern
63
+
64
+ Use this approach:
65
+
66
+ 1. read fixture data from `fixtures/`
67
+ 2. validate the input shape
68
+ 3. return a small structured result
69
+ 4. throw a clear error for missing fixture state or invalid input
70
+
71
+ The current `findDuplicateCharge` tool shows that pattern.
72
+
73
+ ## Wiring A Tool Into A Scenario
74
+
75
+ 1. register the tool in `agentlab.config.yaml`
76
+ 2. add the tool name to the scenario allowlist
77
+ 3. add an evaluator that confirms the tool was used correctly if the behavior is important
78
+
79
+ Example scenario:
80
+
81
+ - `scenarios/support/refund-via-config-tool.yaml`
82
+
83
+ ## Best Practices
84
+
85
+ - keep tool names stable and descriptive
86
+ - keep tools scenario-agnostic where possible
87
+ - prefer read-only or sandboxed behavior
88
+ - do not mutate global machine state
89
+ - do not call live external systems in benchmark paths
90
+ - keep schemas narrow so agent tool calls are easy to validate and compare
91
+
92
+ ## Common Errors
93
+
94
+ Typical config failures:
95
+
96
+ - duplicate tool names
97
+ - repo-external module paths
98
+ - missing module files
99
+ - missing exports
100
+ - invalid `inputSchema` shape
101
+
102
+ See [troubleshooting.md](troubleshooting.md) for failure examples and fixes.