agent-regression-lab 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -123
- package/dist/agent/httpAdapter.js +78 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +37 -1
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/index.js +287 -102
- package/dist/lib/id.js +3 -0
- package/dist/scenarios.js +121 -9
- package/dist/storage.js +193 -29
- package/dist/tools.js +246 -0
- package/dist/ui/App.js +39 -3
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +83 -3
- package/docs/agents.md +152 -0
- package/docs/release-checklist.md +64 -0
- package/docs/scenarios.md +172 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +158 -0
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -2,61 +2,114 @@
|
|
|
2
2
|
|
|
3
3
|
Agent Regression Lab is a local-first evaluation harness for AI agents.
|
|
4
4
|
|
|
5
|
-
It
|
|
5
|
+
It gives you a repeatable way to define scenarios in YAML, run agents against deterministic tool surfaces, store traces and scores locally, and compare runs or suite batches over time.
|
|
6
6
|
|
|
7
|
-
This is an alpha developer tool. It is
|
|
7
|
+
This is an alpha developer tool. It is ready for early technical users, but it is not a polished platform.
|
|
8
|
+
|
|
9
|
+
## Who It Is For
|
|
10
|
+
|
|
11
|
+
- engineers building or debugging agent workflows
|
|
12
|
+
- researchers who want repeatable local evals
|
|
13
|
+
- teams that want a simple local regression harness before investing in heavier infrastructure
|
|
8
14
|
|
|
9
15
|
## What It Supports Today
|
|
10
16
|
|
|
11
17
|
- YAML scenarios under `scenarios/`
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
18
|
+
- deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
|
|
19
|
+
- named agents from `agentlab.config.yaml`
|
|
20
|
+
- built-in `mock`, `openai`, and `external_process` agent modes
|
|
15
21
|
- SQLite-backed local run history under `artifacts/agentlab.db`
|
|
16
22
|
- CLI commands to list, run, show, compare, and launch the UI
|
|
17
|
-
-
|
|
23
|
+
- local web UI for run inspection, run comparison, and suite batch comparison
|
|
18
24
|
|
|
19
|
-
##
|
|
25
|
+
## First 10 Minutes
|
|
20
26
|
|
|
21
|
-
|
|
27
|
+
The fastest path is to run the CLI from a local checkout.
|
|
28
|
+
|
|
29
|
+
1. Install dependencies and build:
|
|
22
30
|
|
|
23
31
|
```bash
|
|
24
32
|
npm install
|
|
33
|
+
npm run check
|
|
34
|
+
npm test
|
|
35
|
+
npm run build
|
|
25
36
|
```
|
|
26
37
|
|
|
27
|
-
2.
|
|
38
|
+
2. Verify the CLI:
|
|
28
39
|
|
|
29
40
|
```bash
|
|
30
|
-
|
|
31
|
-
npm test
|
|
32
|
-
npm run build
|
|
41
|
+
agentlab --help
|
|
33
42
|
```
|
|
34
43
|
|
|
35
|
-
|
|
44
|
+
If you have not linked the package locally yet, use:
|
|
36
45
|
|
|
37
46
|
```bash
|
|
38
|
-
npm
|
|
47
|
+
npm link
|
|
48
|
+
agentlab --help
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
3. List scenarios:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
agentlab list scenarios
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
4. Run a deterministic sample scenario:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
agentlab run support.refund-correct-order --agent mock-default
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
5. Inspect the run:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
agentlab show <run-id>
|
|
39
67
|
```
|
|
40
68
|
|
|
41
|
-
|
|
69
|
+
6. Run the same scenario again, then compare the two runs:
|
|
42
70
|
|
|
43
71
|
```bash
|
|
44
|
-
|
|
72
|
+
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
45
73
|
```
|
|
46
74
|
|
|
47
|
-
|
|
75
|
+
7. Launch the local UI:
|
|
48
76
|
|
|
49
77
|
```bash
|
|
50
|
-
|
|
78
|
+
agentlab ui
|
|
51
79
|
```
|
|
52
80
|
|
|
53
81
|
The UI starts on `http://127.0.0.1:4173`.
|
|
54
82
|
|
|
55
|
-
|
|
83
|
+
8. Run a suite and compare two suite batches:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
agentlab run --suite support --agent mock-default
|
|
87
|
+
agentlab run --suite support --agent mock-default
|
|
88
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
89
|
+
```
|
|
56
90
|
|
|
57
|
-
|
|
91
|
+
`run --suite` prints a `Suite batch:` id at the end. That is the id used by `compare --suite`.
|
|
58
92
|
|
|
59
|
-
|
|
93
|
+
## Install
|
|
94
|
+
|
|
95
|
+
### Installed CLI
|
|
96
|
+
|
|
97
|
+
After the package is published:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
npm install -g agent-regression-lab
|
|
101
|
+
agentlab --help
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
You can also use:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
npx agent-regression-lab --help
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Local Development Install
|
|
111
|
+
|
|
112
|
+
From this repo:
|
|
60
113
|
|
|
61
114
|
```bash
|
|
62
115
|
npm install
|
|
@@ -65,72 +118,50 @@ npm link
|
|
|
65
118
|
agentlab --help
|
|
66
119
|
```
|
|
67
120
|
|
|
68
|
-
|
|
121
|
+
### Repo-Local Dev Mode
|
|
122
|
+
|
|
123
|
+
If you do not want to link the package yet:
|
|
69
124
|
|
|
70
125
|
```bash
|
|
71
|
-
npm
|
|
72
|
-
|
|
126
|
+
npm run start -- --help
|
|
127
|
+
npm run start -- run support.refund-correct-order --agent mock-default
|
|
73
128
|
```
|
|
74
129
|
|
|
75
|
-
The CLI operates on the current working directory. Run it from the root of a project that contains `scenarios/`, `fixtures/`, and optional `agentlab.config.yaml`.
|
|
76
|
-
|
|
77
130
|
## CLI
|
|
78
131
|
|
|
132
|
+
Supported command surface:
|
|
133
|
+
|
|
79
134
|
```text
|
|
80
135
|
agentlab list scenarios
|
|
81
136
|
agentlab run <scenario-id> [--agent <name>]
|
|
82
137
|
agentlab run --suite <suite-id> [--agent <name>]
|
|
83
138
|
agentlab show <run-id>
|
|
84
139
|
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
140
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
85
141
|
agentlab ui
|
|
142
|
+
agentlab version
|
|
143
|
+
agentlab help
|
|
86
144
|
```
|
|
87
145
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
##
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
Current scenario features:
|
|
95
|
-
|
|
96
|
-
- task instructions
|
|
97
|
-
- fixture references
|
|
98
|
-
- allowed and forbidden tools
|
|
99
|
-
- `max_steps`
|
|
100
|
-
- `timeout_seconds`
|
|
101
|
-
- evaluator configuration
|
|
102
|
-
|
|
103
|
-
Example scenario shape:
|
|
104
|
-
|
|
105
|
-
```yaml
|
|
106
|
-
id: support.refund-correct-order
|
|
107
|
-
name: Refund The Correct Order
|
|
108
|
-
suite: support
|
|
109
|
-
task:
|
|
110
|
-
instructions: |
|
|
111
|
-
The customer says they were charged twice.
|
|
112
|
-
Find the duplicated charge and refund only that order.
|
|
113
|
-
tools:
|
|
114
|
-
allowed:
|
|
115
|
-
- crm.search_customer
|
|
116
|
-
- orders.list
|
|
117
|
-
- orders.refund
|
|
118
|
-
runtime:
|
|
119
|
-
max_steps: 8
|
|
120
|
-
timeout_seconds: 60
|
|
121
|
-
evaluators:
|
|
122
|
-
- id: refund-created
|
|
123
|
-
type: tool_call_assertion
|
|
124
|
-
mode: hard_gate
|
|
125
|
-
config:
|
|
126
|
-
tool: orders.refund
|
|
127
|
-
match:
|
|
128
|
-
order_id: ord_1024
|
|
129
|
-
```
|
|
146
|
+
The CLI operates on the current working directory. Run it from the root of a project that contains `scenarios/`, `fixtures/`, and optional `agentlab.config.yaml`.
|
|
147
|
+
|
|
148
|
+
## Canonical Workflow
|
|
149
|
+
|
|
150
|
+
Use this as the default mental model:
|
|
130
151
|
|
|
131
|
-
|
|
152
|
+
1. list scenarios
|
|
153
|
+
2. run one scenario or one suite
|
|
154
|
+
3. note the run id or suite batch id
|
|
155
|
+
4. inspect the run in CLI or UI
|
|
156
|
+
5. compare two runs or two suite batches
|
|
157
|
+
6. extend the setup with a named agent or repo-local tool when needed
|
|
132
158
|
|
|
133
|
-
|
|
159
|
+
## Config And Extension Points
|
|
160
|
+
|
|
161
|
+
`agentlab.config.yaml` is the public extension point for:
|
|
162
|
+
|
|
163
|
+
- named agents
|
|
164
|
+
- repo-local custom tools
|
|
134
165
|
|
|
135
166
|
Supported agent providers:
|
|
136
167
|
|
|
@@ -138,68 +169,54 @@ Supported agent providers:
|
|
|
138
169
|
- `openai`
|
|
139
170
|
- `external_process`
|
|
140
171
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
-
|
|
144
|
-
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
tools:
|
|
158
|
-
- name: support.find_duplicate_charge
|
|
159
|
-
modulePath: user_tools/findDuplicateCharge.ts
|
|
160
|
-
exportName: findDuplicateCharge
|
|
161
|
-
description: Find the duplicated charge order id for a given customer.
|
|
162
|
-
inputSchema:
|
|
163
|
-
type: object
|
|
164
|
-
additionalProperties: false
|
|
165
|
-
properties:
|
|
166
|
-
customer_id:
|
|
167
|
-
type: string
|
|
168
|
-
required:
|
|
169
|
-
- customer_id
|
|
170
|
-
```
|
|
172
|
+
Working sample assets already live in this repo:
|
|
173
|
+
|
|
174
|
+
- external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
|
|
175
|
+
- custom tool: `user_tools/findDuplicateCharge.ts`
|
|
176
|
+
- sample config: `agentlab.config.yaml`
|
|
177
|
+
|
|
178
|
+
See:
|
|
179
|
+
|
|
180
|
+
- [docs/scenarios.md](docs/scenarios.md)
|
|
181
|
+
- [docs/tools.md](docs/tools.md)
|
|
182
|
+
- [docs/agents.md](docs/agents.md)
|
|
183
|
+
- [docs/troubleshooting.md](docs/troubleshooting.md)
|
|
184
|
+
- [docs/release-checklist.md](docs/release-checklist.md)
|
|
185
|
+
|
|
186
|
+
## Local Data And Artifacts
|
|
171
187
|
|
|
172
|
-
|
|
188
|
+
By default the product writes local state under `artifacts/`.
|
|
173
189
|
|
|
174
|
-
|
|
190
|
+
Important paths:
|
|
175
191
|
|
|
176
|
-
|
|
192
|
+
- SQLite DB: `artifacts/agentlab.db`
|
|
193
|
+
- per-run trace output: `artifacts/<run-id>/trace.json`
|
|
194
|
+
- local UI assets at runtime: served from packaged `dist/ui-assets` or built into `artifacts/ui/` in repo mode
|
|
177
195
|
|
|
178
|
-
|
|
179
|
-
- `tool_result`
|
|
180
|
-
- `runner_error`
|
|
196
|
+
If you delete `artifacts/`, you remove stored run history and generated local outputs.
|
|
181
197
|
|
|
182
|
-
|
|
198
|
+
## Determinism
|
|
183
199
|
|
|
184
|
-
|
|
185
|
-
- `final`
|
|
186
|
-
- `error`
|
|
200
|
+
The benchmark is designed to be deterministic enough for repeated local evaluation:
|
|
187
201
|
|
|
188
|
-
|
|
202
|
+
- built-in tools read from local fixtures
|
|
203
|
+
- scenarios declare fixed tool allowlists and evaluator rules
|
|
204
|
+
- scoring is rule-based
|
|
205
|
+
- suite comparison is based on stored local runs and suite batch ids
|
|
189
206
|
|
|
190
|
-
|
|
207
|
+
Agent behavior can still vary depending on the provider path. The built-in `mock` path is the most deterministic path for smoke tests and baseline examples.
|
|
191
208
|
|
|
192
|
-
|
|
193
|
-
2. agent sends back a `tool_call` or `final`
|
|
194
|
-
3. runner executes the tool and sends `tool_result`
|
|
195
|
-
4. agent sends the next `tool_call` or `final`
|
|
209
|
+
## Limitations
|
|
196
210
|
|
|
197
|
-
|
|
211
|
+
- this is a local-first alpha, not a hosted platform
|
|
212
|
+
- custom tool loading is limited to repo-local module paths
|
|
213
|
+
- external agents integrate through the local stdin/stdout protocol only
|
|
214
|
+
- the UI is intentionally minimal and optimized for debugging
|
|
215
|
+
- the benchmark is broader than before, but still small compared to a mature benchmark product
|
|
198
216
|
|
|
199
|
-
##
|
|
217
|
+
## Next Docs
|
|
200
218
|
|
|
201
|
-
-
|
|
202
|
-
-
|
|
203
|
-
-
|
|
204
|
-
-
|
|
205
|
-
- the benchmark suite is still small
|
|
219
|
+
- scenario authoring: [docs/scenarios.md](docs/scenarios.md)
|
|
220
|
+
- custom tools: [docs/tools.md](docs/tools.md)
|
|
221
|
+
- named agents and external-process protocol: [docs/agents.md](docs/agents.md)
|
|
222
|
+
- common failure modes: [docs/troubleshooting.md](docs/troubleshooting.md)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { performance } from "node:perf_hooks";
|
|
2
|
+
/**
 * Expand `{{...}}` placeholders inside a template string.
 *
 * Recognised placeholders:
 *   - `{{message}}`          -> the `message` argument
 *   - `{{conversation_id}}`  -> the `conversationId` argument
 *   - `{{env.NAME}}`         -> `process.env.NAME`
 * Any other placeholder, and any placeholder whose value is null/undefined,
 * expands to an empty string.
 *
 * @param {*} template - Template text. Non-string values are returned
 *   unchanged so callers can pass through numeric/boolean config values.
 * @param {*} message - Value substituted for `{{message}}`.
 * @param {*} conversationId - Value substituted for `{{conversation_id}}`.
 * @returns {*} The expanded string, or `template` itself when it is not a string.
 */
export function interpolateTemplate(template, message, conversationId) {
    if (typeof template !== "string") {
        return template;
    }
    // A replacer function is used so that `$`-sequences inside the substituted
    // values are inserted literally instead of being treated as patterns.
    return template.replace(/\{\{([^}]+)\}\}/g, (_match, rawKey) => {
        // Tolerate `{{ message }}`-style spacing inside the braces.
        const key = rawKey.trim();
        if (key === "message") {
            return message ?? "";
        }
        if (key === "conversation_id") {
            return conversationId ?? "";
        }
        if (key.startsWith("env.")) {
            return process.env[key.slice(4)] ?? "";
        }
        return "";
    });
}
|
|
13
|
+
/**
 * Build the JSON-serialisable request body sent to an HTTP agent.
 *
 * Without a template the body is `{ message, conversation_id }`. With a
 * template, every string found in it (at any nesting depth, including inside
 * arrays) is passed through `interpolateTemplate`; numbers, booleans and null
 * are copied through unchanged.
 *
 * @param {object|null|undefined} template - Field name -> value template map.
 * @param {*} message - Message for the current turn.
 * @param {*} conversationId - Identifier of the conversation.
 * @returns {object} A new object; the template itself is not mutated.
 */
export function buildRequestBody(template, message, conversationId) {
    if (!template) {
        return { message, conversation_id: conversationId };
    }
    // Recursively expand string leaves; leave every other primitive as-is.
    const fill = (value) => {
        if (typeof value === "string") {
            return interpolateTemplate(value, message, conversationId);
        }
        if (Array.isArray(value)) {
            return value.map(fill);
        }
        if (value !== null && typeof value === "object") {
            return Object.fromEntries(Object.entries(value).map(([key, inner]) => [key, fill(inner)]));
        }
        return value;
    };
    return Object.fromEntries(Object.entries(template).map(([field, valueTemplate]) => [field, fill(valueTemplate)]));
}
|
|
23
|
+
/**
 * Pull the agent's textual reply out of a parsed response body.
 *
 * @param {*} body - Parsed response payload.
 * @param {string} [responseField] - Property holding the reply; defaults to "message".
 * @returns {string|null} The reply when `body` is a non-null object carrying
 *   a string under that property, otherwise null.
 */
export function extractReply(body, responseField) {
    const key = responseField ?? "message";
    const isLookupable = body !== null && typeof body === "object";
    if (!isLookupable || !(key in body)) {
        return null;
    }
    const candidate = body[key];
    if (typeof candidate !== "string") {
        return null;
    }
    return candidate;
}
|
|
31
|
+
/**
 * POST one conversation turn to an HTTP agent and return its reply.
 *
 * @param {object} input
 * @param {string} input.url - Endpoint that receives the POST.
 * @param {*} input.message - Message for this turn.
 * @param {*} input.conversationId - Conversation identifier.
 * @param {object} [input.request_template] - Optional body template (see buildRequestBody).
 * @param {string} [input.response_field] - Response property holding the reply (default "message").
 * @param {object} [input.headers] - Extra headers; values are run through interpolateTemplate.
 * @param {number} [input.timeout_ms=30000] - Budget for the whole exchange, including reading the body.
 * @returns {Promise<{reply: string, latencyMs: number}>} `latencyMs` is the rounded time
 *   from sending the request until the response headers arrived.
 * @throws {Error} with a `code` property:
 *   - "timeout_exceeded" when the request or body read is aborted by the timer,
 *   - "http_connection_failed" when fetch rejects for any other reason,
 *   - "http_error" (plus `httpStatus`) for a non-2xx status,
 *   - "invalid_response_format" for a non-JSON body or a missing/non-string reply field.
 */
export async function callHttpAgent(input) {
    const { url, message, conversationId, request_template, response_field, headers = {}, timeout_ms = 30000 } = input;
    const body = buildRequestBody(request_template, message, conversationId);
    const interpolatedHeaders = {};
    for (const [key, value] of Object.entries(headers)) {
        interpolatedHeaders[key] = interpolateTemplate(value, message, conversationId);
    }
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeout_ms);
    const wasAborted = (error) => controller.signal.aborted || (error instanceof Error && error.name === "AbortError");
    const timeoutError = (cause) => Object.assign(new Error(`Request to ${url} timed out after ${timeout_ms}ms`, { cause }), { code: "timeout_exceeded" });
    const start = performance.now();
    let latencyMs;
    let parsed;
    try {
        let response;
        try {
            response = await fetch(url, {
                method: "POST",
                headers: { "Content-Type": "application/json", ...interpolatedHeaders },
                body: JSON.stringify(body),
                signal: controller.signal,
            });
        }
        catch (error) {
            if (wasAborted(error)) {
                throw timeoutError(error);
            }
            throw Object.assign(new Error(`Connection to ${url} failed: ${error instanceof Error ? error.message : String(error)}`, { cause: error }), { code: "http_connection_failed" });
        }
        latencyMs = Math.round(performance.now() - start);
        if (!response.ok) {
            throw Object.assign(new Error(`HTTP ${response.status} from ${url}`), {
                code: "http_error",
                httpStatus: response.status,
            });
        }
        try {
            // The abort timer is still armed here so a stalled body cannot hang forever.
            parsed = await response.json();
        }
        catch (error) {
            if (wasAborted(error)) {
                throw timeoutError(error);
            }
            throw Object.assign(new Error(`Response from ${url} is not valid JSON`, { cause: error }), { code: "invalid_response_format" });
        }
    }
    finally {
        // Single cleanup point: runs on success and on every error path above.
        clearTimeout(timeoutHandle);
    }
    const reply = extractReply(parsed, response_field);
    if (reply === null) {
        const field = response_field ?? "message";
        throw Object.assign(new Error(`Response from ${url} missing expected field '${field}'`), { code: "invalid_response_format" });
    }
    return { reply, latencyMs };
}
|