agent-regression-lab 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,205 @@
1
+ # Agent Regression Lab
2
+
3
+ Agent Regression Lab is a local-first evaluation harness for AI agents.
4
+
5
+ It lets you define fixed scenarios in YAML, run an agent against them repeatedly, capture a structured trace, score the result, and compare runs over time.
6
+
7
+ This is an alpha developer tool. It is useful now for local benchmarking and debugging, but it is not yet a polished platform.
8
+
9
+ ## What It Supports Today
10
+
11
+ - YAML scenarios under `scenarios/`
12
+ - Deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
13
+ - Named agents from `agentlab.config.yaml`
14
+ - Built-in `mock`, `openai`, and `external_process` agent modes
15
+ - SQLite-backed local run history under `artifacts/agentlab.db`
16
+ - CLI commands to list, run, show, compare, and launch the UI
17
+ - Local web UI for run inspection and direct run-to-run comparison
18
+
19
+ ## Quickstart
20
+
21
+ 1. Install dependencies:
22
+
23
+ ```bash
24
+ npm install
25
+ ```
26
+
27
+ 2. Run the typecheck, tests, and build:
28
+
29
+ ```bash
30
+ npm run check
31
+ npm test
32
+ npm run build
33
+ ```
34
+
35
+ 3. Run a scenario:
36
+
37
+ ```bash
38
+ npm run start -- run support.refund-correct-order --agent mock-default
39
+ ```
40
+
41
+ 4. Inspect a run:
42
+
43
+ ```bash
44
+ npm run start -- show <run-id>
45
+ ```
46
+
47
+ 5. Launch the local UI:
48
+
49
+ ```bash
50
+ npm run start -- ui
51
+ ```
52
+
53
+ The UI starts on `http://127.0.0.1:4173`.
54
+
55
+ ## Installable CLI
56
+
57
+ The package can be installed as a Node CLI.
58
+
59
+ Local development install:
60
+
61
+ ```bash
62
+ npm install
63
+ npm run build
64
+ npm link
65
+ agentlab --help
66
+ ```
67
+
68
+ Packed or published install:
69
+
70
+ ```bash
71
+ npm install -g agent-regression-lab
72
+ agentlab --help
73
+ ```
74
+
75
+ The CLI operates on the current working directory. Run it from the root of a project that contains `scenarios/`, `fixtures/`, and, optionally, an `agentlab.config.yaml`.
76
+
77
+ ## CLI
78
+
79
+ ```text
80
+ agentlab list scenarios
81
+ agentlab run <scenario-id> [--agent <name>]
82
+ agentlab run --suite <suite-id> [--agent <name>]
83
+ agentlab show <run-id>
84
+ agentlab compare <baseline-run-id> <candidate-run-id>
85
+ agentlab ui
86
+ ```
87
+
88
+ You can also run these through `npm run start -- ...` during local development.
89
+
90
+ ## Scenarios
91
+
92
+ Scenarios are YAML files under `scenarios/`.
93
+
94
+ Current scenario features:
95
+
96
+ - task instructions
97
+ - fixture references
98
+ - allowed and forbidden tools
99
+ - `max_steps`
100
+ - `timeout_seconds`
101
+ - evaluator configuration
102
+
103
+ Example scenario shape:
104
+
105
+ ```yaml
106
+ id: support.refund-correct-order
107
+ name: Refund The Correct Order
108
+ suite: support
109
+ task:
110
+ instructions: |
111
+ The customer says they were charged twice.
112
+ Find the duplicated charge and refund only that order.
113
+ tools:
114
+ allowed:
115
+ - crm.search_customer
116
+ - orders.list
117
+ - orders.refund
118
+ runtime:
119
+ max_steps: 8
120
+ timeout_seconds: 60
121
+ evaluators:
122
+ - id: refund-created
123
+ type: tool_call_assertion
124
+ mode: hard_gate
125
+ config:
126
+ tool: orders.refund
127
+ match:
128
+ order_id: ord_1024
129
+ ```
130
+
131
+ ## Custom Agents And Tools
132
+
133
+ `agentlab.config.yaml` is the extension point for named agents and repo-local tools.
134
+
135
+ Supported agent providers:
136
+
137
+ - `mock`
138
+ - `openai`
139
+ - `external_process`
140
+
141
+ Supported custom tool model:
142
+
143
+ - repo-local JS/TS module path
144
+ - named export that resolves to an async function
145
+
146
+ Example config:
147
+
148
+ ```yaml
149
+ agents:
150
+ - name: custom-node-agent
151
+ provider: external_process
152
+ command: node
153
+ args:
154
+ - custom_agents/node_agent.mjs
155
+ label: custom-node-agent
156
+
157
+ tools:
158
+ - name: support.find_duplicate_charge
159
+ modulePath: user_tools/findDuplicateCharge.ts
160
+ exportName: findDuplicateCharge
161
+ description: Find the duplicated charge order id for a given customer.
162
+ inputSchema:
163
+ type: object
164
+ additionalProperties: false
165
+ properties:
166
+ customer_id:
167
+ type: string
168
+ required:
169
+ - customer_id
170
+ ```
171
+
172
+ ## External Process Protocol
173
+
174
+ External agents communicate with the runner over line-delimited JSON on stdin/stdout.
175
+
176
+ Runner events:
177
+
178
+ - `run_started`
179
+ - `tool_result`
180
+ - `runner_error`
181
+
182
+ Agent responses:
183
+
184
+ - `tool_call`
185
+ - `final`
186
+ - `error`
187
+
188
+ The runner stays in control of the loop. External agents must not execute tools directly.
189
+
190
+ Minimal flow:
191
+
192
+ 1. runner sends `run_started` with instructions, tool specs, context, and limits
193
+ 2. agent sends back a `tool_call` or `final`
194
+ 3. if the agent sent a `tool_call`, the runner executes the tool and sends `tool_result`
195
+ 4. agent sends the next `tool_call` or `final`
196
+
197
+ See `custom_agents/node_agent.mjs` and `custom_agents/python_agent.py` for working examples.
198
+
199
+ ## Honest Limitations
200
+
201
+ - comparison is run-to-run, not full suite regression analysis yet
202
+ - tool loading is limited to local repo module paths
203
+ - external agents use the local stdin/stdout protocol only
204
+ - the UI is intentionally minimal and optimized for debugging, not dashboards
205
+ - the benchmark suite is still small
@@ -0,0 +1,173 @@
1
+ import { spawn } from "node:child_process";
2
+ import readline from "node:readline";
3
/**
 * One conversation with an external agent process.
 *
 * The session spawns the configured command, writes one JSON document per
 * line to the child's stdin and reads one JSON document per line from its
 * stdout. Every failure mode (spawn failure, broken stdin pipe, early exit,
 * timeout, malformed response) is reported to the caller as an
 * `{ type: "error" }` response from `next()` rather than as a thrown
 * exception or an unhandled `'error'` event.
 */
class ExternalProcessAgentSession {
    input;
    options;
    process;
    // Non-empty stdout lines that arrived while no request was waiting.
    stdoutLines = [];
    // Trimmed stderr chunks, used only to enrich the early-exit error message.
    stderrLines = [];
    // `{ resolve, reject, timer }` of the request currently awaiting a line.
    pendingResolver;
    exited = false;
    closed = false;
    // Message of the first process-level or stdin-level error, if any.
    failure;
    /**
     * @param input   Run input forwarded to the agent with `run_started`.
     * @param options `{ command, args, envAllowlist, responseTimeoutMs }`.
     */
    constructor(input, options) {
        this.input = input;
        this.options = options;
        this.process = spawn(this.options.command, this.options.args, {
            stdio: ["pipe", "pipe", "pipe"],
            env: buildChildEnv(this.options.envAllowlist),
        });
        const stdoutReader = readline.createInterface({ input: this.process.stdout });
        stdoutReader.on("line", (line) => this.handleStdoutLine(line));
        this.process.stderr.on("data", (chunk) => {
            this.stderrLines.push(String(chunk).trim());
        });
        // Without these listeners a failed spawn (e.g. ENOENT) or a write to a
        // dead child raises an unhandled 'error' event and takes the runner down.
        this.process.stdin.on("error", (error) => this.handleProcessFailure(error));
        this.process.on("error", (error) => {
            // No pid means the child never started, so no 'exit' event will follow.
            if (!this.process.pid) {
                this.exited = true;
            }
            this.handleProcessFailure(error);
        });
        this.process.on("exit", (code, signal) => {
            this.exited = true;
            if (this.pendingResolver) {
                const detail = this.stderrLines.filter(Boolean).join(" | ");
                this.pendingResolver.reject(new Error(`External agent exited before responding (code=${String(code)}, signal=${String(signal)}${detail ? `, stderr=${detail}` : ""}).`));
                clearTimeout(this.pendingResolver.timer);
                this.pendingResolver = undefined;
            }
        });
    }
    /**
     * Sends one runner event to the agent and returns its parsed response.
     * Never rejects: any failure is converted into an `error` response and
     * the session is closed. `final` and `error` responses also close it.
     */
    async next(event) {
        if (this.exited || this.closed) {
            const message = this.failure === undefined
                ? "External agent process is no longer running."
                : `External agent process is no longer running (${this.failure}).`;
            return { type: "error", message };
        }
        try {
            const response = await this.sendAndReceive(toProtocolEvent(event, this.input), this.options.responseTimeoutMs);
            const parsed = parseProtocolResponse(response);
            if (parsed.type === "final" || parsed.type === "error") {
                this.close();
            }
            return parsed;
        }
        catch (error) {
            this.close();
            return { type: "error", message: error instanceof Error ? error.message : String(error) };
        }
    }
    /**
     * Writes `event` as a single JSON line and resolves with the next stdout
     * line, taking an already-buffered line first. Rejects after `timeoutMs`.
     */
    sendAndReceive(event, timeoutMs) {
        this.process.stdin.write(`${JSON.stringify(event)}\n`);
        if (this.stdoutLines.length > 0) {
            return Promise.resolve(this.stdoutLines.shift());
        }
        return new Promise((resolve, reject) => {
            const timer = setTimeout(() => {
                this.pendingResolver = undefined;
                reject(new Error(`External agent timed out after ${timeoutMs}ms waiting for a response.`));
            }, timeoutMs);
            this.pendingResolver = { resolve, reject, timer };
        });
    }
    /** Hands a non-empty stdout line to the waiting request, or buffers it. */
    handleStdoutLine(line) {
        const trimmed = line.trim();
        if (!trimmed) {
            return;
        }
        if (this.pendingResolver) {
            const { resolve, timer } = this.pendingResolver;
            clearTimeout(timer);
            this.pendingResolver = undefined;
            resolve(trimmed);
            return;
        }
        this.stdoutLines.push(trimmed);
    }
    /**
     * Records the first process/stdin error and fails the request that is
     * currently waiting, if any, so the caller is not left waiting for the
     * response timeout.
     */
    handleProcessFailure(error) {
        const reason = error instanceof Error ? error.message : String(error);
        if (this.failure === undefined) {
            this.failure = reason;
        }
        if (!this.pendingResolver) {
            return;
        }
        const { reject, timer } = this.pendingResolver;
        clearTimeout(timer);
        this.pendingResolver = undefined;
        reject(new Error(`External agent process failed: ${reason}`));
    }
    /** Marks the session closed and kills the child if it is still running. Idempotent. */
    close() {
        if (this.closed) {
            return;
        }
        this.closed = true;
        if (!this.exited) {
            this.process.kill();
        }
    }
}
88
/**
 * Agent adapter that runs each scenario against a freshly spawned external
 * process described by `options` (`command`, `args`, `envAllowlist`,
 * `responseTimeoutMs`).
 */
export class ExternalProcessAgentAdapter {
    options;
    constructor(options) {
        this.options = options;
    }
    /**
     * Starts a new session for one run.
     *
     * Missing `args` / `envAllowlist` default to empty arrays and a missing
     * `responseTimeoutMs` defaults to 10 seconds.
     *
     * @throws {Error} when no command is configured.
     */
    async startRun(input) {
        const { command, args, envAllowlist, responseTimeoutMs } = this.options;
        if (!command) {
            throw new Error("External process agent requires a command.");
        }
        const sessionOptions = {
            command,
            args: args ?? [],
            envAllowlist: envAllowlist ?? [],
            responseTimeoutMs: responseTimeoutMs ?? 10_000,
        };
        return new ExternalProcessAgentSession(input, sessionOptions);
    }
}
105
/**
 * Converts a runner event into the message written to the agent.
 *
 * `run_started` is rebuilt so that it carries the run input; every other
 * event (including `tool_result`) is forwarded unchanged.
 */
function toProtocolEvent(event, input) {
    const startsRun = event.type === "run_started";
    return startsRun ? { type: "run_started", input } : event;
}
114
/**
 * Parses and validates one line of agent output.
 *
 * Accepted shapes:
 *  - `tool_call`: requires a string `toolName`; `input` defaults to `{}`.
 *  - `final`: requires a string `output`.
 *  - `error`: requires a string `message`; `retryable` is coerced to boolean.
 * `metadata` is kept only when it is a plain (non-array) object.
 *
 * @param raw One line of text received from the agent.
 * @returns The normalized response object.
 * @throws {Error} on invalid JSON, a non-object payload, a missing required
 *   field, or an unknown `type`.
 */
function parseProtocolResponse(raw) {
    let message;
    try {
        message = JSON.parse(raw);
    }
    catch {
        throw new Error(`External agent returned invalid JSON: ${raw}`);
    }
    const hasEnvelope = typeof message === "object"
        && message !== null
        && !Array.isArray(message)
        && typeof message.type === "string";
    if (!hasEnvelope) {
        throw new Error("External agent returned an invalid protocol message.");
    }
    const kind = message.type;
    switch (kind) {
        case "tool_call": {
            if (typeof message.toolName !== "string") {
                throw new Error("External agent tool_call response is missing toolName.");
            }
            const metadata = isObject(message.metadata) ? message.metadata : undefined;
            return {
                type: "tool_call",
                toolName: message.toolName,
                input: message.input ?? {},
                metadata,
            };
        }
        case "final": {
            if (typeof message.output !== "string") {
                throw new Error("External agent final response is missing output.");
            }
            const metadata = isObject(message.metadata) ? message.metadata : undefined;
            return { type: "final", output: message.output, metadata };
        }
        case "error": {
            if (typeof message.message !== "string") {
                throw new Error("External agent error response is missing message.");
            }
            return {
                type: "error",
                message: message.message,
                retryable: Boolean(message.retryable),
            };
        }
        default:
            throw new Error(`External agent returned unsupported response type '${String(kind)}'.`);
    }
}
159
/**
 * Builds the environment passed to the external agent process.
 *
 * Only variables named in `allowlist` are copied from the parent
 * environment, plus a fixed baseline of `PATH`, `PWD` (the current working
 * directory) and `HOME`. Baseline values win over allowlisted ones.
 * Variables that are unset in the parent are omitted entirely, so the
 * result never contains `undefined` values.
 *
 * @param {string[]} [allowlist=[]] Names of parent variables to pass through.
 * @returns {Record<string, string>} Environment for the child process.
 */
function buildChildEnv(allowlist = []) {
    const env = {};
    for (const key of allowlist) {
        if (process.env[key] !== undefined) {
            env[key] = process.env[key];
        }
    }
    const baseline = {
        PATH: process.env.PATH,
        PWD: process.cwd(),
        HOME: process.env.HOME,
    };
    for (const [key, value] of Object.entries(baseline)) {
        // Skip unset variables (e.g. HOME on some platforms) instead of
        // storing an explicit `undefined` entry.
        if (value !== undefined) {
            env[key] = value;
        }
    }
    return env;
}
171
/** Returns true for non-null, non-array values whose `typeof` is "object". */
function isObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
@@ -0,0 +1,84 @@
1
+ import { ExternalProcessAgentAdapter } from "./externalProcessAdapter.js";
2
+ import { MockAgentAdapter } from "./mockAdapter.js";
3
+ import { OpenAIResponsesAgentAdapter } from "./openaiResponsesAdapter.js";
4
+ import { createAgentVersionId } from "../lib/id.js";
5
/** Factory for the built-in mock agent. */
class MockAgentAdapterFactory {
    /** Creates a new mock adapter; it takes no configuration. */
    createAdapter() {
        return new MockAgentAdapter();
    }
    /**
     * Describes the agent version for this configuration.
     * The label falls back from `config.label` to `config.agentName` to
     * "mock-support-agent-v1"; the id is derived from the label and payload.
     */
    createVersion(config) {
        const { label: explicitLabel, agentName } = config;
        const label = explicitLabel ?? agentName ?? "mock-support-agent-v1";
        const versionConfig = { adapter: "mock", domain: "support", agentName };
        const id = createAgentVersionId(label, versionConfig);
        return {
            id,
            label,
            modelId: "mock-model",
            provider: "mock",
            config: versionConfig,
        };
    }
}
21
/** Factory for agents backed by the OpenAI Responses adapter. */
class OpenAIAdapterFactory {
    /** Creates the adapter, taking the API key from `OPENAI_API_KEY`. */
    createAdapter() {
        const apiKey = process.env.OPENAI_API_KEY;
        return new OpenAIResponsesAgentAdapter({ apiKey });
    }
    /**
     * Describes the agent version for this configuration.
     * The model defaults to "gpt-4o-mini"; the label falls back from
     * `config.label` to `config.agentName` to `openai-<model>`.
     */
    createVersion(config) {
        const { agentName } = config;
        const model = config.model ?? "gpt-4o-mini";
        const label = config.label ?? agentName ?? `openai-${model}`;
        const versionConfig = { provider: "openai", model, agentName };
        const id = createAgentVersionId(label, versionConfig);
        return {
            id,
            label,
            modelId: model,
            provider: "openai",
            config: versionConfig,
        };
    }
}
40
/** Factory for agents implemented as an external process. */
class ExternalProcessAdapterFactory {
    /**
     * Creates the adapter from `config`, defaulting `command` to an empty
     * string and `args` / `envAllowlist` to empty arrays.
     */
    createAdapter(config = {}) {
        const { command, args, envAllowlist } = config;
        const adapterOptions = {
            command: command ?? "",
            args: args ?? [],
            envAllowlist: envAllowlist ?? [],
        };
        return new ExternalProcessAgentAdapter(adapterOptions);
    }
    /**
     * Describes the agent version for this configuration.
     * The label falls back from `config.label` to `config.agentName` to
     * "external-process-agent"; `command` is recorded as given and `args`
     * defaults to an empty array.
     */
    createVersion(config) {
        const { command, agentName } = config;
        const label = config.label ?? agentName ?? "external-process-agent";
        const versionConfig = {
            provider: "external_process",
            command,
            args: config.args ?? [],
            agentName,
        };
        const id = createAgentVersionId(label, versionConfig);
        return {
            id,
            label,
            provider: "external_process",
            command,
            args: config.args ?? [],
            config: versionConfig,
        };
    }
}
66
/**
 * Returns the adapter factory for `config.provider`.
 *
 * For `external_process` the returned factory is bound to `config`: its
 * `createAdapter()` ignores its arguments and always builds the adapter from
 * `config.command`, `config.args` and `config.envAllowlist`.
 *
 * @throws {Error} when the provider is not "mock", "openai" or
 *   "external_process".
 */
export function createAgentFactory(config) {
    const { provider } = config;
    if (provider === "mock") {
        return new MockAgentAdapterFactory();
    }
    if (provider === "openai") {
        return new OpenAIAdapterFactory();
    }
    if (provider === "external_process") {
        const externalFactory = new ExternalProcessAdapterFactory();
        return {
            createAdapter: () => externalFactory.createAdapter(config),
            createVersion: (runtimeConfig) => externalFactory.createVersion(runtimeConfig),
        };
    }
    throw new Error(`Unsupported provider '${String(provider)}'.`);
}
@@ -0,0 +1,96 @@
1
/**
 * Scripted agent used for deterministic runs of the refund scenario.
 *
 * The session is a small state machine driven by `next()`:
 *   start -> listed_customer -> (found_duplicate | listed_orders) -> done
 * Each step issues exactly one tool call and the last one produces the final
 * answer. Any unexpected event or malformed tool result yields an `error`
 * response instead of throwing.
 */
class MockAgentSession {
    input;
    state = { step: "start" };
    /** @param input Run input; `availableTools` and `context` are read from it. */
    constructor(input) {
        this.input = input;
    }
    /** Returns true when a tool with this name is available for the run. */
    hasTool(toolName) {
        return this.input.availableTools.some((tool) => tool.name === toolName);
    }
    /**
     * Consumes one runner event and returns the next agent response
     * (`tool_call`, `final` or `error`).
     */
    async next(event) {
        if (event.type === "runner_error") {
            return { type: "error", message: event.message };
        }
        if (this.state.step === "start") {
            const email = String(this.input.context.customer_email ?? "");
            this.state = { step: "listed_customer" };
            return {
                type: "tool_call",
                toolName: "crm.search_customer",
                input: { email },
                metadata: { message: "Looking up customer." },
            };
        }
        if (this.state.step === "listed_customer") {
            // Validate the result shape like every other state does; a null
            // or missing lookup result would otherwise throw on `result.id`.
            if (event.type !== "tool_result" || typeof event.result !== "object" || event.result === null) {
                return { type: "error", message: "Expected customer lookup result." };
            }
            const result = event.result;
            // Prefer the direct duplicate-charge lookup when it is available.
            if (this.hasTool("support.find_duplicate_charge")) {
                this.state = { step: "found_duplicate" };
                return {
                    type: "tool_call",
                    toolName: "support.find_duplicate_charge",
                    input: { customer_id: String(result.id ?? "") },
                    metadata: { message: "Looking up the duplicated order directly." },
                };
            }
            this.state = { step: "listed_orders" };
            return {
                type: "tool_call",
                toolName: "orders.list",
                input: { customer_id: String(result.id ?? "") },
                metadata: { message: "Listing customer orders." },
            };
        }
        if (this.state.step === "listed_orders") {
            if (event.type !== "tool_result" || !Array.isArray(event.result)) {
                return { type: "error", message: "Expected order list result." };
            }
            // The mock recognizes the duplicate by its fixed order id.
            const duplicate = event.result.find((order) => typeof order === "object" && order !== null && order.id === "ord_1024");
            if (!duplicate?.id) {
                return { type: "error", message: "Could not identify duplicate order." };
            }
            this.state = { step: "done" };
            return {
                type: "tool_call",
                toolName: "orders.refund",
                input: { order_id: duplicate.id },
                metadata: { message: "Refunding the duplicated charge." },
            };
        }
        if (this.state.step === "found_duplicate") {
            if (event.type !== "tool_result" || typeof event.result !== "object" || event.result === null) {
                return { type: "error", message: "Expected duplicate lookup result." };
            }
            const result = event.result;
            if (!result.order_id) {
                return { type: "error", message: "Duplicate lookup did not return an order id." };
            }
            this.state = { step: "done" };
            return {
                type: "tool_call",
                toolName: "orders.refund",
                input: { order_id: result.order_id },
                metadata: { message: "Refunding the duplicated charge." },
            };
        }
        // "done" means the refund call has been issued; the next event is
        // expected to carry its result.
        if (this.state.step === "done") {
            if (event.type !== "tool_result" || typeof event.result !== "object" || event.result === null) {
                return { type: "error", message: "Expected refund result." };
            }
            const refund = event.result;
            return {
                type: "final",
                output: `Refunded duplicated charge on order ${refund.order_id} for ${refund.amount} ${refund.currency}.`,
                metadata: { completed: true },
            };
        }
        return { type: "error", message: "Unexpected session state." };
    }
}
92
/** Adapter for the built-in scripted mock agent. */
export class MockAgentAdapter {
    /** Starts a new mock session for the given run input. */
    async startRun(input) {
        const session = new MockAgentSession(input);
        return session;
    }
}