agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/docs/tools.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Custom Tools
|
|
2
|
+
|
|
3
|
+
Custom tools are registered in `agentlab.config.yaml` and loaded from repo-local JS or TS modules.
|
|
4
|
+
|
|
5
|
+
This is the main extension point when built-in tools are not enough.
|
|
6
|
+
|
|
7
|
+
## What A Tool Registration Needs
|
|
8
|
+
|
|
9
|
+
Each tool entry must define:
|
|
10
|
+
|
|
11
|
+
- `name`
|
|
12
|
+
- `modulePath`
|
|
13
|
+
- `exportName`
|
|
14
|
+
- `description`
|
|
15
|
+
- `inputSchema`
|
|
16
|
+
|
|
17
|
+
Example:
|
|
18
|
+
|
|
19
|
+
```yaml
|
|
20
|
+
tools:
|
|
21
|
+
- name: support.find_duplicate_charge
|
|
22
|
+
modulePath: user_tools/findDuplicateCharge.ts
|
|
23
|
+
exportName: findDuplicateCharge
|
|
24
|
+
description: Find the duplicated charge order id for a given customer.
|
|
25
|
+
inputSchema:
|
|
26
|
+
type: object
|
|
27
|
+
additionalProperties: false
|
|
28
|
+
properties:
|
|
29
|
+
customer_id:
|
|
30
|
+
type: string
|
|
31
|
+
description: Customer id to inspect for duplicated charges.
|
|
32
|
+
required:
|
|
33
|
+
- customer_id
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Tool Module Shape
|
|
37
|
+
|
|
38
|
+
The exported function should be async and should return JSON-serializable output.
|
|
39
|
+
|
|
40
|
+
Minimal example:
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
export async function myTool(input: unknown): Promise<{ ok: boolean }> {
|
|
44
|
+
return { ok: true };
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
The existing working example is:
|
|
49
|
+
|
|
50
|
+
- `user_tools/findDuplicateCharge.ts`
|
|
51
|
+
|
|
52
|
+
## Important Constraints
|
|
53
|
+
|
|
54
|
+
- `modulePath` must stay within the repo
|
|
55
|
+
- the module must exist at load time
|
|
56
|
+
- the named export must exist
|
|
57
|
+
- tool input should be validated defensively inside the tool
|
|
58
|
+
- tool output should be deterministic and JSON-serializable
|
|
59
|
+
|
|
60
|
+
For launch usage, treat tools as fixture-backed local functions, not live integrations.
|
|
61
|
+
|
|
62
|
+
## Recommended Pattern
|
|
63
|
+
|
|
64
|
+
Use this approach:
|
|
65
|
+
|
|
66
|
+
1. read fixture data from `fixtures/`
|
|
67
|
+
2. validate the input shape
|
|
68
|
+
3. return a small structured result
|
|
69
|
+
4. throw a clear error for missing fixture state or invalid input
|
|
70
|
+
|
|
71
|
+
The current `findDuplicateCharge` tool shows that pattern.
|
|
72
|
+
|
|
73
|
+
## Wiring A Tool Into A Scenario
|
|
74
|
+
|
|
75
|
+
1. register the tool in `agentlab.config.yaml`
|
|
76
|
+
2. add the tool name to the scenario allowlist
|
|
77
|
+
3. add an evaluator that confirms the tool was used correctly if the behavior is important
|
|
78
|
+
|
|
79
|
+
Example scenario:
|
|
80
|
+
|
|
81
|
+
- `scenarios/support/refund-via-config-tool.yaml`
|
|
82
|
+
|
|
83
|
+
## Best Practices
|
|
84
|
+
|
|
85
|
+
- keep tool names stable and descriptive
|
|
86
|
+
- keep tools scenario-agnostic where possible
|
|
87
|
+
- prefer read-only or sandboxed behavior
|
|
88
|
+
- do not mutate global machine state
|
|
89
|
+
- do not call live external systems in benchmark paths
|
|
90
|
+
- keep schemas narrow so agent tool calls are easy to validate and compare
|
|
91
|
+
|
|
92
|
+
## Common Errors
|
|
93
|
+
|
|
94
|
+
Typical config failures:
|
|
95
|
+
|
|
96
|
+
- duplicate tool names
|
|
97
|
+
- repo-external module paths
|
|
98
|
+
- missing module files
|
|
99
|
+
- missing exports
|
|
100
|
+
- invalid `inputSchema` shape
|
|
101
|
+
|
|
102
|
+
See [troubleshooting.md](troubleshooting.md) for failure examples and fixes.
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# Troubleshooting
|
|
2
|
+
|
|
3
|
+
This page covers the main failure modes users hit during install, first run, and comparison.
|
|
4
|
+
|
|
5
|
+
## `agentlab: command not found`
|
|
6
|
+
|
|
7
|
+
You are probably in one of these states:
|
|
8
|
+
|
|
9
|
+
- the package is not installed globally
|
|
10
|
+
- you have not run `npm link` from the repo
|
|
11
|
+
- your shell path does not include npm global bins
|
|
12
|
+
|
|
13
|
+
Fast fixes:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npm install
|
|
17
|
+
npm run build
|
|
18
|
+
npm link
|
|
19
|
+
agentlab --help
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Or skip linking and use:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
npm run start -- --help
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## `OPENAI_API_KEY is required`
|
|
31
|
+
|
|
32
|
+
You used an OpenAI-backed agent without exporting the API key.
|
|
33
|
+
|
|
34
|
+
Fix:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
export OPENAI_API_KEY=...
|
|
38
|
+
agentlab run support.refund-correct-order --agent openai-cheap
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## `No scenarios found for suite ...`
|
|
44
|
+
|
|
45
|
+
The suite id must match a suite under `scenarios/`.
|
|
46
|
+
|
|
47
|
+
List valid options:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
agentlab list scenarios
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Current built-in suites in this repo include:
|
|
54
|
+
|
|
55
|
+
- `support`
|
|
56
|
+
- `coding`
|
|
57
|
+
- `research`
|
|
58
|
+
- `ops`
|
|
59
|
+
- `internal-teams`
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## `Run '<id>' not found`
|
|
64
|
+
|
|
65
|
+
`show` and run-to-run `compare` require run ids from completed runs.
|
|
66
|
+
|
|
67
|
+
Get a fresh run id by executing a scenario:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Then use:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
agentlab show <run-id>
|
|
77
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## `Missing baseline or candidate suite batch id`
|
|
83
|
+
|
|
84
|
+
`compare --suite` does not use run ids. It uses suite batch ids printed by `run --suite`.
|
|
85
|
+
|
|
86
|
+
Example:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
agentlab run --suite support --agent mock-default
|
|
90
|
+
agentlab run --suite support --agent mock-default
|
|
91
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Cross-suite comparison errors
|
|
97
|
+
|
|
98
|
+
Suite batch comparison is strict. Compare batches from the same suite only.
|
|
99
|
+
|
|
100
|
+
This is valid:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
agentlab compare --suite suite_...support_batch_a suite_...support_batch_b
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
This is not valid:
|
|
107
|
+
|
|
108
|
+
- a `support` batch compared against an `ops` batch
|
|
109
|
+
- mixed or malformed suite batch selections
|
|
110
|
+
|
|
111
|
+
If you are unsure which batch came from which suite, rerun the suite and record the printed batch ids.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## `agentlab ui` fails to load assets
|
|
116
|
+
|
|
117
|
+
Installed packages should already include the built UI assets.
|
|
118
|
+
|
|
119
|
+
If you are running from a repo checkout, build first:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
npm install
|
|
123
|
+
npm run build
|
|
124
|
+
agentlab ui
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
If the problem persists, verify that these files exist:
|
|
128
|
+
|
|
129
|
+
- `dist/ui-assets/client.js`
|
|
130
|
+
- `dist/ui-assets/client.css`
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Config tool or agent not found
|
|
135
|
+
|
|
136
|
+
Typical reasons:
|
|
137
|
+
|
|
138
|
+
- `agentlab.config.yaml` is missing
|
|
139
|
+
- the configured `name` does not match the CLI `--agent` value
|
|
140
|
+
- `modulePath` points outside the repo
|
|
141
|
+
- the configured export or command does not exist
|
|
142
|
+
|
|
143
|
+
Working references in this repo:
|
|
144
|
+
|
|
145
|
+
- tool config: `agentlab.config.yaml`
|
|
146
|
+
- custom tool: `user_tools/findDuplicateCharge.ts`
|
|
147
|
+
- external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## HTTP agent errors
|
|
152
|
+
|
|
153
|
+
### `HTTP agents require a configured url`
|
|
154
|
+
|
|
155
|
+
You ran a conversation scenario with `--provider http` but no HTTP agent config was found.
|
|
156
|
+
|
|
157
|
+
Fix: define a named http agent in `agentlab.config.yaml`:
|
|
158
|
+
|
|
159
|
+
```yaml
|
|
160
|
+
agents:
|
|
161
|
+
- name: my-agent
|
|
162
|
+
provider: http
|
|
163
|
+
url: http://localhost:3000/api/chat
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Then run with:
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
agentlab run internal-teams.memory-followup-recall --agent my-agent
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### `termination_reason: http_connection_failed`
|
|
173
|
+
|
|
174
|
+
agentlab could not connect to your agent's URL. The most common cause is that the agent service is not running.
|
|
175
|
+
|
|
176
|
+
Check:
|
|
177
|
+
|
|
178
|
+
- is the service running on the configured port?
|
|
179
|
+
- is the URL in `agentlab.config.yaml` correct?
|
|
180
|
+
- is there a firewall or proxy blocking the connection?
|
|
181
|
+
|
|
182
|
+
### `termination_reason: http_error`
|
|
183
|
+
|
|
184
|
+
Your agent returned an HTTP 4xx or 5xx response.
|
|
185
|
+
|
|
186
|
+
Check:
|
|
187
|
+
|
|
188
|
+
- is the route path correct?
|
|
189
|
+
- does your agent expect a different request shape? Use `request_template` if so.
|
|
190
|
+
- are there auth errors? Check `headers` config.
|
|
191
|
+
|
|
192
|
+
### `termination_reason: timeout_exceeded`
|
|
193
|
+
|
|
194
|
+
Your agent did not respond within `timeout_ms` (default: 30000 ms, i.e. 30 seconds).
|
|
195
|
+
|
|
196
|
+
Fix options:
|
|
197
|
+
|
|
198
|
+
- increase `timeout_ms` in the agent config
|
|
199
|
+
- investigate why the agent is slow for the given input
|
|
200
|
+
|
|
201
|
+
### `termination_reason: invalid_response_format`
|
|
202
|
+
|
|
203
|
+
Your agent either returned non-JSON or did not include the expected field.
|
|
204
|
+
|
|
205
|
+
By default, agentlab reads the `message` field from the JSON response. If your agent returns its reply under a different field name, override it with `response_field`:
|
|
206
|
+
|
|
207
|
+
```yaml
|
|
208
|
+
agents:
|
|
209
|
+
- name: my-agent
|
|
210
|
+
provider: http
|
|
211
|
+
url: http://localhost:3000/api/chat
|
|
212
|
+
response_field: reply
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## `database is locked`
|
|
218
|
+
|
|
219
|
+
You hit SQLite write contention on the local artifacts DB.
|
|
220
|
+
|
|
221
|
+
Most common cause:
|
|
222
|
+
|
|
223
|
+
- multiple `agentlab` runs writing to the same `artifacts/agentlab.db` at the same time
|
|
224
|
+
|
|
225
|
+
Fix:
|
|
226
|
+
|
|
227
|
+
- wait for the current run to finish
|
|
228
|
+
- rerun sequentially instead of in parallel
|
|
229
|
+
- keep live HTTP fixture verification serialized when using the same local project directory
|
|
230
|
+
|
|
231
|
+
The product now uses a busy timeout, but sequential execution is still the safest path for local live verification.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Conversation scenario errors
|
|
236
|
+
|
|
237
|
+
### `Scenario '...' is a conversation scenario and requires provider: http`
|
|
238
|
+
|
|
239
|
+
You tried to run a `type: conversation` scenario with a non-HTTP agent (`mock`, `openai`, or `external_process`).
|
|
240
|
+
|
|
241
|
+
Conversation scenarios only work with `provider: http`. Configure an HTTP agent in `agentlab.config.yaml` and use `--agent <name>`.
|
|
242
|
+
|
|
243
|
+
### `Conversation scenario '...' must not define 'tools'`
|
|
244
|
+
|
|
245
|
+
Your conversation scenario YAML has a `tools:` field. HTTP agents manage their own tools internally — remove the `tools:` block.
|
|
246
|
+
|
|
247
|
+
### `Conversation scenario '...' must define at least one step`
|
|
248
|
+
|
|
249
|
+
The `steps:` list is empty or missing. Add at least one step:
|
|
250
|
+
|
|
251
|
+
```yaml
|
|
252
|
+
steps:
|
|
253
|
+
- role: user
|
|
254
|
+
message: "Hello"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
### Per-step evaluator type rejected
|
|
258
|
+
|
|
259
|
+
Only these evaluator types are valid inside `steps[].evaluators`:
|
|
260
|
+
|
|
261
|
+
- `response_contains`
|
|
262
|
+
- `response_not_contains`
|
|
263
|
+
- `response_matches_regex`
|
|
264
|
+
- `response_latency_max`
|
|
265
|
+
|
|
266
|
+
End-of-run types (`step_count_max`, `final_answer_contains`, `exact_final_answer`) belong at the top-level `evaluators:` block, not inside individual steps.
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## Global install behaves differently from repo mode
|
|
271
|
+
|
|
272
|
+
That usually means the current working directory is wrong.
|
|
273
|
+
|
|
274
|
+
The CLI operates on the current working directory and expects:
|
|
275
|
+
|
|
276
|
+
- `scenarios/`
|
|
277
|
+
- `fixtures/`
|
|
278
|
+
- optional `agentlab.config.yaml`
|
|
279
|
+
|
|
280
|
+
Run it from the project root you want to evaluate.
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Release Verification
|
|
285
|
+
|
|
286
|
+
Before publishing or cutting a release, run:
|
|
287
|
+
|
|
288
|
+
```bash
|
|
289
|
+
npm run check
|
|
290
|
+
npm test
|
|
291
|
+
npm run build
|
|
292
|
+
npm run smoke:cli
|
|
293
|
+
npm pack --dry-run
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
For the full pre-launch checklist, see [release-checklist.md](release-checklist.md).
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Variant Sets
|
|
2
|
+
|
|
3
|
+
Variant sets are named comparison groups defined in `agentlab.config.yaml`.
|
|
4
|
+
|
|
5
|
+
They are the Tier 1 mechanism for prompt, model, tool-schema, and config experiments without turning every comparison into manual CLI bookkeeping.
|
|
6
|
+
|
|
7
|
+
## Why They Exist
|
|
8
|
+
|
|
9
|
+
Named agents remain the executable unit.
|
|
10
|
+
|
|
11
|
+
Variant sets sit on top of named agents so you can run the same scenario or suite against multiple variants and compare the results intentionally.
|
|
12
|
+
|
|
13
|
+
## Config Shape
|
|
14
|
+
|
|
15
|
+
```yaml
|
|
16
|
+
variant_sets:
|
|
17
|
+
- name: refund-agent-model-comparison
|
|
18
|
+
variants:
|
|
19
|
+
- agent: mock-default
|
|
20
|
+
label: baseline
|
|
21
|
+
prompt_version: prompt-v3
|
|
22
|
+
model_version: mock-model
|
|
23
|
+
tool_schema_version: support-tools-v1
|
|
24
|
+
config_label: baseline-refund-flow
|
|
25
|
+
- agent: mock-compact
|
|
26
|
+
label: concise
|
|
27
|
+
prompt_version: prompt-v4
|
|
28
|
+
model_version: mock-model
|
|
29
|
+
tool_schema_version: support-tools-v1
|
|
30
|
+
config_label: concise-refund-flow
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## CLI Usage
|
|
34
|
+
|
|
35
|
+
Run one scenario against all variants:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Run one suite definition against all variants:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Stored Identity
|
|
48
|
+
|
|
49
|
+
Each resulting run stores and surfaces:
|
|
50
|
+
|
|
51
|
+
- `variant_set_name`
|
|
52
|
+
- `variant_label`
|
|
53
|
+
- `prompt_version`
|
|
54
|
+
- `model_version`
|
|
55
|
+
- `tool_schema_version`
|
|
56
|
+
- `config_label`
|
|
57
|
+
- `config_hash`
|
|
58
|
+
|
|
59
|
+
Those fields appear in CLI run summaries, `agentlab show`, run history, comparisons, and the UI.
|
|
60
|
+
|
|
61
|
+
## Design Rule
|
|
62
|
+
|
|
63
|
+
Use variant sets for intentional experiments. Keep named agents stable, and treat the variant set as the comparison layer.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-regression-lab",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"description": "Local-first scenario-based evaluation harness for AI agents.",
|
|
6
6
|
"license": "MIT",
|
|
@@ -26,7 +26,8 @@
|
|
|
26
26
|
"files": [
|
|
27
27
|
"dist",
|
|
28
28
|
"dist/ui-assets",
|
|
29
|
-
"README.md"
|
|
29
|
+
"README.md",
|
|
30
|
+
"docs"
|
|
30
31
|
],
|
|
31
32
|
"engines": {
|
|
32
33
|
"node": ">=22"
|
|
@@ -35,7 +36,7 @@
|
|
|
35
36
|
"build": "tsc -p tsconfig.json && npm run build:ui",
|
|
36
37
|
"build:ui": "esbuild src/ui/client.tsx --bundle --format=esm --platform=browser --outdir=dist/ui-assets --loader:.css=css --log-level=warning",
|
|
37
38
|
"check": "tsc -p tsconfig.json --noEmit",
|
|
38
|
-
"test": "tsx --test tests/**/*.test.ts",
|
|
39
|
+
"test": "tsx --test tests/*.test.ts tests/**/*.test.ts",
|
|
39
40
|
"smoke:cli": "npm run build && node dist/index.js --help && node dist/index.js version",
|
|
40
41
|
"start": "tsx src/index.ts",
|
|
41
42
|
"run": "tsx src/index.ts"
|