agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +78 -11
  2. package/bin/agentlab.js +2 -0
  3. package/dist/agent/factory.js +20 -6
  4. package/dist/agent/httpAdapter.js +5 -4
  5. package/dist/config.js +199 -12
  6. package/dist/evaluators.js +56 -1
  7. package/dist/index.js +157 -11
  8. package/dist/init.js +88 -0
  9. package/dist/lib/id.js +3 -0
  10. package/dist/runOutput.js +46 -0
  11. package/dist/runner.js +31 -9
  12. package/dist/scenarios.js +90 -2
  13. package/dist/scoring.js +2 -2
  14. package/dist/storage.js +117 -7
  15. package/dist/tools.js +56 -2
  16. package/dist/trace.js +4 -2
  17. package/dist/ui/App.js +75 -7
  18. package/dist/ui-assets/client.css +92 -0
  19. package/dist/ui-assets/client.js +183 -19
  20. package/docs/agents.md +143 -8
  21. package/docs/coding-agents.md +74 -0
  22. package/docs/golden-suites.md +74 -0
  23. package/docs/integrations-and-live-services.md +58 -0
  24. package/docs/memory-and-stateful-agents.md +51 -0
  25. package/docs/release-checklist.md +30 -0
  26. package/docs/runtime-profiles.md +67 -0
  27. package/docs/scenarios.md +303 -56
  28. package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
  29. package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
  30. package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
  31. package/docs/tools.md +34 -3
  32. package/docs/troubleshooting.md +193 -0
  33. package/docs/variant-sets.md +63 -0
  34. package/examples/coding-tools/README.md +21 -0
  35. package/examples/coding-tools/index.js +11 -0
  36. package/examples/coding-tools/package.json +8 -0
  37. package/examples/support-tools/README.md +21 -0
  38. package/examples/support-tools/index.js +8 -0
  39. package/examples/support-tools/package.json +8 -0
  40. package/package.json +7 -5
package/docs/tools.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Custom Tools
2
2
 
3
- Custom tools are registered in `agentlab.config.yaml` and loaded from repo-local JS or TS modules.
3
+ Custom tools are registered in `agentlab.config.yaml` and can be loaded from repo-local JS/TS modules or installed npm packages.
4
4
 
5
5
  This is the main extension point when built-in tools are not enough.
6
6
 
@@ -9,12 +9,14 @@ This is the main extension point when built-in tools are not enough.
9
9
  Each tool entry must define:
10
10
 
11
11
  - `name`
12
- - `modulePath`
12
+ - exactly one source:
13
+ - `modulePath`, or
14
+ - `package`
13
15
  - `exportName`
14
16
  - `description`
15
17
  - `inputSchema`
16
18
 
17
- Example:
19
+ Repo-local example:
18
20
 
19
21
  ```yaml
20
22
  tools:
@@ -33,6 +35,25 @@ tools:
33
35
  - customer_id
34
36
  ```
35
37
 
38
+ Installed package example:
39
+
40
+ ```yaml
41
+ tools:
42
+ - name: support.find_duplicate_charge
43
+ package: "@agentlab/example-support-tools"
44
+ exportName: findDuplicateCharge
45
+ description: Find the duplicated charge order id for a given customer.
46
+ inputSchema:
47
+ type: object
48
+ additionalProperties: false
49
+ properties:
50
+ customer_id:
51
+ type: string
52
+ description: Customer id to inspect for duplicated charges.
53
+ required:
54
+ - customer_id
55
+ ```
56
+
36
57
  ## Tool Module Shape
37
58
 
38
59
  The exported function should be async and should return JSON-serializable output.
@@ -48,11 +69,15 @@ export async function myTool(input: unknown): Promise<{ ok: boolean }> {
48
69
  The existing working examples are:
49
70
 
50
71
  - `user_tools/findDuplicateCharge.ts`
72
+ - `examples/support-tools`
73
+ - `examples/coding-tools`
51
74
 
52
75
  ## Important Constraints
53
76
 
77
+ - each tool must define exactly one of `modulePath` or `package`
54
78
  - `modulePath` must stay within the repo
55
79
  - the module must exist at load time
80
+ - installed packages must be resolvable from the current project
56
81
  - the named export must exist
57
82
  - tool input should be validated defensively inside the tool
58
83
  - tool output should be deterministic and JSON-serializable
@@ -100,3 +125,9 @@ Typical config failures:
100
125
  - invalid `inputSchema` shape
101
126
 
102
127
  See [troubleshooting.md](troubleshooting.md) for failure examples and fixes.
128
+
129
+ For installed-package workflows, a good local path is:
130
+
131
+ ```bash
132
+ npm install @agentlab/example-support-tools
133
+ ```
package/docs/troubleshooting.md CHANGED
@@ -25,6 +25,8 @@ Or skip linking and use:
25
25
  npm run start -- --help
26
26
  ```
27
27
 
28
+ ---
29
+
28
30
  ## `OPENAI_API_KEY is required`
29
31
 
30
32
  You used an OpenAI-backed agent without exporting the API key.
@@ -36,6 +38,8 @@ export OPENAI_API_KEY=...
36
38
  agentlab run support.refund-correct-order --agent openai-cheap
37
39
  ```
38
40
 
41
+ ---
42
+
39
43
  ## `No scenarios found for suite ...`
40
44
 
41
45
  The suite id must match a suite under `scenarios/`.
@@ -52,6 +56,9 @@ Current built-in suites in this repo include:
52
56
  - `coding`
53
57
  - `research`
54
58
  - `ops`
59
+ - `internal-teams`
60
+
61
+ ---
55
62
 
56
63
  ## `Run '<id>' not found`
57
64
 
@@ -70,6 +77,8 @@ agentlab show <run-id>
70
77
  agentlab compare <baseline-run-id> <candidate-run-id>
71
78
  ```
72
79
 
80
+ ---
81
+
73
82
  ## `Missing baseline or candidate suite batch id`
74
83
 
75
84
  `compare --suite` does not use run ids. It uses suite batch ids printed by `run --suite`.
@@ -82,6 +91,8 @@ agentlab run --suite support --agent mock-default
82
91
  agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
83
92
  ```
84
93
 
94
+ ---
95
+
85
96
  ## Cross-suite suite comparison errors
86
97
 
87
98
  Suite batch comparison is strict. Compare batches from the same suite only.
@@ -99,6 +110,8 @@ This is not valid:
99
110
 
100
111
  If you are unsure which batch came from which suite, rerun the suite and record the printed batch ids.
101
112
 
113
+ ---
114
+
102
115
  ## `agentlab ui` fails to load assets
103
116
 
104
117
  Installed packages should already include the built UI assets.
@@ -116,6 +129,8 @@ If the problem persists, verify that these files exist:
116
129
  - `dist/ui-assets/client.js`
117
130
  - `dist/ui-assets/client.css`
118
131
 
132
+ ---
133
+
119
134
  ## Config tool or agent not found
120
135
 
121
136
  Typical reasons:
@@ -123,14 +138,190 @@ Typical reasons:
123
138
  - `agentlab.config.yaml` is missing
124
139
  - the configured `name` does not match the CLI `--agent` value
125
140
  - `modulePath` points outside the repo
141
+ - both `modulePath` and `package` were provided for the same tool
142
+ - the configured npm package is not installed
126
143
  - the configured export or command does not exist
127
144
 
128
145
  Working references in this repo:
129
146
 
130
147
  - tool config: `agentlab.config.yaml`
131
148
  - custom tool: `user_tools/findDuplicateCharge.ts`
149
+ - package-style tools: `examples/support-tools`, `examples/coding-tools`
132
150
  - external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
133
151
 
152
+ ### `Tool '<name>' must define exactly one of 'modulePath' or 'package'`
153
+
154
+ Your tool registration is ambiguous or incomplete.
155
+
156
+ Valid:
157
+
158
+ ```yaml
159
+ tools:
160
+ - name: support.find_duplicate_charge
161
+ modulePath: ./user_tools/findDuplicateCharge.ts
162
+ exportName: findDuplicateCharge
163
+ ```
164
+
165
+ Also valid:
166
+
167
+ ```yaml
168
+ tools:
169
+ - name: support.find_duplicate_charge
170
+ package: "@agentlab/example-support-tools"
171
+ exportName: findDuplicateCharge
172
+ ```
173
+
174
+ Invalid:
175
+
176
+ - setting both `modulePath` and `package`
177
+ - setting neither of them
178
+
179
+ ### `Tool '<name>' failed to load package '<pkg>'`
180
+
181
+ The package-backed tool could not be resolved from the current project.
182
+
183
+ Check:
184
+
185
+ - the package is installed in the current project
186
+ - the package name is correct
187
+ - the package exports the named function you configured
188
+
189
+ Typical fix:
190
+
191
+ ```bash
192
+ npm install @agentlab/example-support-tools
193
+ ```
194
+
195
+ ### `Tool '<name>' export '<export>' is not a function`
196
+
197
+ The module loaded successfully, but the named export does not exist or is not callable.
198
+
199
+ Check:
200
+
201
+ - `exportName` matches the actual exported function name
202
+ - the package or local module uses ESM exports as expected
203
+
204
+ ---
205
+
206
+ ## HTTP agent errors
207
+
208
+ ### `HTTP agents require a configured url`
209
+
210
+ You ran a conversation scenario with `--provider http` but no HTTP agent config was found.
211
+
212
+ Fix: define a named http agent in `agentlab.config.yaml`:
213
+
214
+ ```yaml
215
+ agents:
216
+ - name: my-agent
217
+ provider: http
218
+ url: http://localhost:3000/api/chat
219
+ ```
220
+
221
+ Then run with:
222
+
223
+ ```bash
224
+ agentlab run internal-teams.memory-followup-recall --agent my-agent
225
+ ```
226
+
227
+ ### `termination_reason: http_connection_failed`
228
+
229
+ agentlab could not connect to your agent's URL. The most common cause is that the agent service is not running.
230
+
231
+ Check:
232
+
233
+ - is the service running on the configured port?
234
+ - is the URL in `agentlab.config.yaml` correct?
235
+ - is there a firewall or proxy blocking the connection?
236
+
237
+ ### `termination_reason: http_error`
238
+
239
+ Your agent returned an HTTP 4xx or 5xx response.
240
+
241
+ Check:
242
+
243
+ - is the route path correct?
244
+ - does your agent expect a different request shape? Use `request_template` if so.
245
+ - are there auth errors? Check `headers` config.
246
+
247
+ ### `termination_reason: timeout_exceeded`
248
+
249
+ Your agent did not respond within `timeout_ms` (default 30 seconds).
250
+
251
+ Fix options:
252
+
253
+ - increase `timeout_ms` in the agent config
254
+ - investigate why the agent is slow for the given input
255
+
256
+ ### `termination_reason: invalid_response_format`
257
+
258
+ Your agent either returned non-JSON or did not include the expected field.
259
+
260
+ By default, agentlab reads the `message` field from the JSON response. Override it with `response_field` if your agent uses a different field name:
261
+
262
+ ```yaml
263
+ agents:
264
+ - name: my-agent
265
+ provider: http
266
+ url: http://localhost:3000/api/chat
267
+ response_field: reply
268
+ ```
269
+
270
+ ---
271
+
272
+ ## `database is locked`
273
+
274
+ You hit SQLite write contention on the local artifacts DB.
275
+
276
+ Most common cause:
277
+
278
+ - multiple `agentlab` runs writing to the same `artifacts/agentlab.db` at the same time
279
+
280
+ Fix:
281
+
282
+ - wait for the current run to finish
283
+ - rerun sequentially instead of in parallel
284
+ - keep live HTTP fixture verification serialized when using the same local project directory
285
+
286
+ The product now uses a busy timeout, but sequential execution is still the safest path for local live verification.
287
+
288
+ ---
289
+
290
+ ## Conversation scenario errors
291
+
292
+ ### `Scenario '...' is a conversation scenario and requires provider: http`
293
+
294
+ You tried to run a `type: conversation` scenario with a non-HTTP agent (`mock`, `openai`, or `external_process`).
295
+
296
+ Conversation scenarios only work with `provider: http`. Configure an HTTP agent in `agentlab.config.yaml` and use `--agent <name>`.
297
+
298
+ ### `Conversation scenario '...' must not define 'tools'`
299
+
300
+ Your conversation scenario YAML has a `tools:` field. HTTP agents manage their own tools internally — remove the `tools:` block.
301
+
302
+ ### `Conversation scenario '...' must define at least one step`
303
+
304
+ The `steps:` list is empty or missing. Add at least one step:
305
+
306
+ ```yaml
307
+ steps:
308
+ - role: user
309
+ message: "Hello"
310
+ ```
311
+
312
+ ### Per-step evaluator type rejected
313
+
314
+ Only these evaluator types are valid inside `steps[].evaluators`:
315
+
316
+ - `response_contains`
317
+ - `response_not_contains`
318
+ - `response_matches_regex`
319
+ - `response_latency_max`
320
+
321
+ End-of-run types (`step_count_max`, `final_answer_contains`, `exact_final_answer`) belong at the top-level `evaluators:` block, not inside individual steps.
322
+
323
+ ---
324
+
134
325
  ## Global install behaves differently from repo mode
135
326
 
136
327
  That usually means the current working directory is wrong.
@@ -143,6 +334,8 @@ The CLI operates on the current working directory and expects:
143
334
 
144
335
  Run it from the project root you want to evaluate.
145
336
 
337
+ ---
338
+
146
339
  ## Release Verification
147
340
 
148
341
  Before publishing or cutting a release, run:
package/docs/variant-sets.md ADDED
@@ -0,0 +1,63 @@
1
+ # Variant Sets
2
+
3
+ Variant sets are named comparison groups defined in `agentlab.config.yaml`.
4
+
5
+ They are the Tier 1 mechanism for prompt, model, tool-schema, and config experiments without turning every comparison into manual CLI bookkeeping.
6
+
7
+ ## Why They Exist
8
+
9
+ Named agents remain the executable unit.
10
+
11
+ Variant sets sit on top of named agents so you can run the same scenario or suite against multiple variants and compare the results intentionally.
12
+
13
+ ## Config Shape
14
+
15
+ ```yaml
16
+ variant_sets:
17
+ - name: refund-agent-model-comparison
18
+ variants:
19
+ - agent: mock-default
20
+ label: baseline
21
+ prompt_version: prompt-v3
22
+ model_version: mock-model
23
+ tool_schema_version: support-tools-v1
24
+ config_label: baseline-refund-flow
25
+ - agent: mock-compact
26
+ label: concise
27
+ prompt_version: prompt-v4
28
+ model_version: mock-model
29
+ tool_schema_version: support-tools-v1
30
+ config_label: concise-refund-flow
31
+ ```
32
+
33
+ ## CLI Usage
34
+
35
+ Run one scenario against all variants:
36
+
37
+ ```bash
38
+ agentlab run support.refund-correct-order --variant-set refund-agent-model-comparison
39
+ ```
40
+
41
+ Run one suite definition against all variants:
42
+
43
+ ```bash
44
+ agentlab run --suite-def pre_merge --variant-set refund-agent-model-comparison
45
+ ```
46
+
47
+ ## Stored Identity
48
+
49
+ Each resulting run stores and surfaces:
50
+
51
+ - `variant_set_name`
52
+ - `variant_label`
53
+ - `prompt_version`
54
+ - `model_version`
55
+ - `tool_schema_version`
56
+ - `config_label`
57
+ - `config_hash`
58
+
59
+ Those fields appear in CLI run summaries, `agentlab show`, run history, comparisons, and the UI.
60
+
61
+ ## Design Rule
62
+
63
+ Use variant sets for intentional experiments. Keep named agents stable, and treat the variant set as the comparison layer.
package/examples/coding-tools/README.md ADDED
@@ -0,0 +1,21 @@
1
+ # Example Coding Tools
2
+
3
+ Minimal package-style coding-tool example for Agent Regression Lab.
4
+
5
+ Register it in `agentlab.config.yaml` like this:
6
+
7
+ ```yaml
8
+ tools:
9
+ - name: coding.read_repo_hint
10
+ package: "@agentlab/example-coding-tools"
11
+ exportName: readRepoHint
12
+ description: Return a small repo hint for the target path.
13
+ inputSchema:
14
+ type: object
15
+ additionalProperties: false
16
+ properties:
17
+ path:
18
+ type: string
19
+ required:
20
+ - path
21
+ ```
package/examples/coding-tools/index.js ADDED
@@ -0,0 +1,11 @@
1
+ export async function readRepoHint(input) {
2
+ const path = String(input?.path ?? "");
3
+ if (!path) {
4
+ throw new Error("path is required");
5
+ }
6
+
7
+ return {
8
+ path,
9
+ hint: "Check the target file before editing.",
10
+ };
11
+ }
package/examples/coding-tools/package.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "name": "@agentlab/example-coding-tools",
3
+ "private": true,
4
+ "type": "module",
5
+ "exports": {
6
+ ".": "./index.js"
7
+ }
8
+ }
package/examples/support-tools/README.md ADDED
@@ -0,0 +1,21 @@
1
+ # Example Support Tools
2
+
3
+ Minimal package-style tool example for Agent Regression Lab.
4
+
5
+ Register it in `agentlab.config.yaml` like this:
6
+
7
+ ```yaml
8
+ tools:
9
+ - name: support.find_duplicate_charge
10
+ package: "@agentlab/example-support-tools"
11
+ exportName: findDuplicateCharge
12
+ description: Find the duplicated charge order id for a given customer.
13
+ inputSchema:
14
+ type: object
15
+ additionalProperties: false
16
+ properties:
17
+ customer_id:
18
+ type: string
19
+ required:
20
+ - customer_id
21
+ ```
package/examples/support-tools/index.js ADDED
@@ -0,0 +1,8 @@
1
+ export async function findDuplicateCharge(input) {
2
+ const customerId = String(input?.customer_id ?? "");
3
+ if (!customerId) {
4
+ throw new Error("customer_id is required");
5
+ }
6
+
7
+ return { order_id: `dup_${customerId}` };
8
+ }
package/examples/support-tools/package.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "name": "@agentlab/example-support-tools",
3
+ "private": true,
4
+ "type": "module",
5
+ "exports": {
6
+ ".": "./index.js"
7
+ }
8
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-regression-lab",
3
- "version": "0.2.0",
3
+ "version": "0.4.0",
4
4
  "private": false,
5
5
  "description": "Local-first scenario-based evaluation harness for AI agents.",
6
6
  "license": "MIT",
@@ -21,22 +21,24 @@
21
21
  ],
22
22
  "type": "module",
23
23
  "bin": {
24
- "agentlab": "./dist/index.js"
24
+ "agentlab": "bin/agentlab.js"
25
25
  },
26
26
  "files": [
27
+ "bin",
27
28
  "dist",
28
29
  "dist/ui-assets",
29
30
  "README.md",
30
- "docs"
31
+ "docs",
32
+ "examples"
31
33
  ],
32
34
  "engines": {
33
- "node": ">=22"
35
+ "node": ">=18"
34
36
  },
35
37
  "scripts": {
36
38
  "build": "tsc -p tsconfig.json && npm run build:ui",
37
39
  "build:ui": "esbuild src/ui/client.tsx --bundle --format=esm --platform=browser --outdir=dist/ui-assets --loader:.css=css --log-level=warning",
38
40
  "check": "tsc -p tsconfig.json --noEmit",
39
- "test": "tsx --test tests/**/*.test.ts",
41
+ "test": "tsx --test tests/*.test.ts tests/**/*.test.ts",
40
42
  "smoke:cli": "npm run build && node dist/index.js --help && node dist/index.js version",
41
43
  "start": "tsx src/index.ts",
42
44
  "run": "tsx src/index.ts"