agent-regression-lab 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -11
- package/bin/agentlab.js +2 -0
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +199 -12
- package/dist/evaluators.js +56 -1
- package/dist/index.js +157 -11
- package/dist/init.js +88 -0
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +56 -2
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +75 -7
- package/dist/ui-assets/client.css +92 -0
- package/dist/ui-assets/client.js +183 -19
- package/docs/agents.md +143 -8
- package/docs/coding-agents.md +74 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
- package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
- package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
- package/docs/tools.md +34 -3
- package/docs/troubleshooting.md +193 -0
- package/docs/variant-sets.md +63 -0
- package/examples/coding-tools/README.md +21 -0
- package/examples/coding-tools/index.js +11 -0
- package/examples/coding-tools/package.json +8 -0
- package/examples/support-tools/README.md +21 -0
- package/examples/support-tools/index.js +8 -0
- package/examples/support-tools/package.json +8 -0
- package/package.json +7 -5
package/README.md
CHANGED
|
@@ -1,27 +1,71 @@
|
|
|
1
1
|
# Agent Regression Lab
|
|
2
2
|
|
|
3
|
-
Agent Regression Lab is
|
|
3
|
+
Agent Regression Lab is the local-first regression spine for agent engineering teams.
|
|
4
4
|
|
|
5
|
-
It gives
|
|
5
|
+
It gives teams a repeatable way to define expected agent behavior in YAML, replay it against deterministic tool surfaces or live HTTP agents, store traces and scores locally, and compare candidate behavior against known baselines over time.
|
|
6
6
|
|
|
7
|
-
This is
|
|
7
|
+
This is a local-first alpha for early technical teams. It is strongest when used across one workflow spine:
|
|
8
|
+
|
|
9
|
+
- debug a single scenario while building
|
|
10
|
+
- validate a branch with a suite before merge
|
|
11
|
+
- run curated golden suites before release
|
|
12
|
+
- keep incident-derived scenarios as engineering memory
|
|
8
13
|
|
|
9
14
|
## Who It Is For
|
|
10
15
|
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
- teams
|
|
16
|
+
- teams shipping prompt, model, tool, workflow, and memory changes
|
|
17
|
+
- engineers who need repeatable before/after evidence instead of vibes
|
|
18
|
+
- teams validating live HTTP agents as well as deterministic local scenarios
|
|
19
|
+
- researchers and technical operators who want local control before adopting heavier hosted infrastructure
|
|
20
|
+
|
|
21
|
+
## Why Teams Use It
|
|
22
|
+
|
|
23
|
+
- catch regressions before merge or release
|
|
24
|
+
- debug subtle behavioral changes with full traces
|
|
25
|
+
- compare model, prompt, tool, and workflow changes against a known baseline
|
|
26
|
+
- build a portfolio of golden workflows, historical regressions, and ugly edge cases
|
|
27
|
+
- preserve engineering memory so old failures do not quietly return
|
|
14
28
|
|
|
15
29
|
## What It Supports Today
|
|
16
30
|
|
|
17
31
|
- YAML scenarios under `scenarios/`
|
|
18
|
-
- deterministic built-in tools plus
|
|
32
|
+
- deterministic built-in tools plus custom tools from `agentlab.config.yaml`
|
|
19
33
|
- named agents from `agentlab.config.yaml`
|
|
20
|
-
- built-in `mock`, `openai`, and `
|
|
34
|
+
- built-in `mock`, `openai`, `external_process`, and `http` agent modes
|
|
35
|
+
- `type: conversation` multi-turn dialog scenarios for HTTP agents
|
|
21
36
|
- SQLite-backed local run history under `artifacts/agentlab.db`
|
|
22
37
|
- CLI commands to list, run, show, compare, and launch the UI
|
|
23
38
|
- local web UI for run inspection, run comparison, and suite batch comparison
|
|
24
39
|
|
|
40
|
+
## Workflow Spine
|
|
41
|
+
|
|
42
|
+
Use this as the default product story:
|
|
43
|
+
|
|
44
|
+
1. debug locally with one scenario
|
|
45
|
+
2. validate a branch with a suite
|
|
46
|
+
3. run curated golden suites before release
|
|
47
|
+
4. keep incident-derived scenarios as permanent regression assets
|
|
48
|
+
|
|
49
|
+
## Start Here
|
|
50
|
+
|
|
51
|
+
If your agent runs as an HTTP service:
|
|
52
|
+
|
|
53
|
+
- use `provider: http`
|
|
54
|
+
- start with [arl-test](arl-test)
|
|
55
|
+
- read [docs/agents.md](docs/agents.md) and [docs/scenarios.md](docs/scenarios.md)
|
|
56
|
+
|
|
57
|
+
If you are validating coding-agent changes:
|
|
58
|
+
|
|
59
|
+
- start with the coding scenarios under `scenarios/coding/`
|
|
60
|
+
- read [docs/coding-agents.md](docs/coding-agents.md)
|
|
61
|
+
- use deterministic tool-loop runs first, then compare before/after behavior
|
|
62
|
+
|
|
63
|
+
If you want pre-merge regression checks in CI:
|
|
64
|
+
|
|
65
|
+
- use `suite_definitions`
|
|
66
|
+
- start with `.github/workflows/agentlab-pre-merge.yml`
|
|
67
|
+
- run `agentlab run --suite-def pre_merge --agent mock-default`
|
|
68
|
+
|
|
25
69
|
## First 10 Minutes
|
|
26
70
|
|
|
27
71
|
The fastest path is to run the CLI from a local checkout.
|
|
@@ -135,6 +179,8 @@ Supported command surface:
|
|
|
135
179
|
agentlab list scenarios
|
|
136
180
|
agentlab run <scenario-id> [--agent <name>]
|
|
137
181
|
agentlab run --suite <suite-id> [--agent <name>]
|
|
182
|
+
agentlab run --suite-def <name> [--agent <name>]
|
|
183
|
+
agentlab run <scenario-id> [--variant-set <name>]
|
|
138
184
|
agentlab show <run-id>
|
|
139
185
|
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
140
186
|
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
@@ -154,25 +200,42 @@ Use this as the default mental model:
|
|
|
154
200
|
3. note the run id or suite batch id
|
|
155
201
|
4. inspect the run in CLI or UI
|
|
156
202
|
5. compare two runs or two suite batches
|
|
157
|
-
6. extend the setup with a named agent or repo-local
|
|
203
|
+
6. extend the setup with a named agent or custom tools from repo-local files or installed packages when needed
|
|
204
|
+
|
|
205
|
+
## Canonical Live HTTP Fixture
|
|
206
|
+
|
|
207
|
+
`arl-test/` is the canonical live HTTP regression fixture in this repo.
|
|
208
|
+
|
|
209
|
+
Use it to verify the production-like HTTP path end to end:
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
cd arl-test
|
|
213
|
+
npm start
|
|
214
|
+
node ../dist/index.js list scenarios
|
|
215
|
+
node ../dist/index.js run order-tracking-in-transit --agent support-agent
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
The `arl-test` scenarios are intended to behave like a real internal-team regression fixture, not just a toy demo.
|
|
158
219
|
|
|
159
220
|
## Config And Extension Points
|
|
160
221
|
|
|
161
222
|
`agentlab.config.yaml` is the public extension point for:
|
|
162
223
|
|
|
163
224
|
- named agents
|
|
164
|
-
- repo-local
|
|
225
|
+
- custom tools from repo-local files or installed npm packages
|
|
165
226
|
|
|
166
227
|
Supported agent providers:
|
|
167
228
|
|
|
168
229
|
- `mock`
|
|
169
230
|
- `openai`
|
|
170
231
|
- `external_process`
|
|
232
|
+
- `http` — point at a running HTTP service for multi-turn conversation testing
|
|
171
233
|
|
|
172
234
|
Working sample assets already live in this repo:
|
|
173
235
|
|
|
174
236
|
- external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
|
|
175
237
|
- custom tool: `user_tools/findDuplicateCharge.ts`
|
|
238
|
+
- package-style tool examples: `examples/support-tools`, `examples/coding-tools`
|
|
176
239
|
- sample config: `agentlab.config.yaml`
|
|
177
240
|
|
|
178
241
|
See:
|
|
@@ -209,14 +272,18 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
|
|
|
209
272
|
## Limitations
|
|
210
273
|
|
|
211
274
|
- this is a local-first alpha, not a hosted platform
|
|
212
|
-
-
|
|
275
|
+
- the published package/example ecosystem is still small
|
|
213
276
|
- external agents integrate through the local stdin/stdout protocol only
|
|
214
277
|
- the UI is intentionally minimal and optimized for debugging
|
|
278
|
+
- SQLite-backed local storage still makes sequential live verification the safest path when reusing the same local artifacts DB
|
|
215
279
|
- the benchmark is broader than before, but still small compared to a mature benchmark product
|
|
216
280
|
|
|
217
281
|
## Next Docs
|
|
218
282
|
|
|
219
283
|
- scenario authoring: [docs/scenarios.md](docs/scenarios.md)
|
|
284
|
+
- golden suites: [docs/golden-suites.md](docs/golden-suites.md)
|
|
285
|
+
- integrations and live services: [docs/integrations-and-live-services.md](docs/integrations-and-live-services.md)
|
|
286
|
+
- memory and stateful agents: [docs/memory-and-stateful-agents.md](docs/memory-and-stateful-agents.md)
|
|
220
287
|
- custom tools: [docs/tools.md](docs/tools.md)
|
|
221
288
|
- named agents and external-process protocol: [docs/agents.md](docs/agents.md)
|
|
222
289
|
- common failure modes: [docs/troubleshooting.md](docs/troubleshooting.md)
|
package/bin/agentlab.js
ADDED
package/dist/agent/factory.js
CHANGED
|
@@ -2,6 +2,20 @@ import { ExternalProcessAgentAdapter } from "./externalProcessAdapter.js";
|
|
|
2
2
|
import { MockAgentAdapter } from "./mockAdapter.js";
|
|
3
3
|
import { OpenAIResponsesAgentAdapter } from "./openaiResponsesAdapter.js";
|
|
4
4
|
import { createAgentVersionId } from "../lib/id.js";
|
|
5
|
+
/**
 * Builds a new agent-version record that carries the run-identity fields of `config`.
 *
 * The returned object is a shallow copy of `version` with the nine identity
 * fields listed below copied from `config` (each is `undefined` when `config`
 * does not define it). Neither argument is mutated.
 *
 * @param {object} version - Base version record produced by an adapter factory.
 * @param {object} config - Agent configuration the identity fields are read from.
 * @returns {object} Copy of `version` extended with the identity fields.
 */
function attachIdentityMetadata(version, config) {
    const identityFields = [
        "variantSetName",
        "variantLabel",
        "promptVersion",
        "modelVersion",
        "toolSchemaVersion",
        "configLabel",
        "configHash",
        "runtimeProfileName",
        "suiteDefinitionName",
    ];
    const enriched = { ...version };
    for (const field of identityFields) {
        // Values from config always win over same-named keys already on version.
        enriched[field] = config[field];
    }
    return enriched;
}
|
|
5
19
|
class MockAgentAdapterFactory {
|
|
6
20
|
createAdapter() {
|
|
7
21
|
return new MockAgentAdapter();
|
|
@@ -9,13 +23,13 @@ class MockAgentAdapterFactory {
|
|
|
9
23
|
createVersion(config) {
|
|
10
24
|
const label = config.label ?? config.agentName ?? "mock-support-agent-v1";
|
|
11
25
|
const payload = { adapter: "mock", domain: "support", agentName: config.agentName };
|
|
12
|
-
return {
|
|
26
|
+
return attachIdentityMetadata({
|
|
13
27
|
id: createAgentVersionId(label, payload),
|
|
14
28
|
label,
|
|
15
29
|
modelId: "mock-model",
|
|
16
30
|
provider: "mock",
|
|
17
31
|
config: payload,
|
|
18
|
-
};
|
|
32
|
+
}, config);
|
|
19
33
|
}
|
|
20
34
|
}
|
|
21
35
|
class OpenAIAdapterFactory {
|
|
@@ -28,13 +42,13 @@ class OpenAIAdapterFactory {
|
|
|
28
42
|
const model = config.model ?? "gpt-4o-mini";
|
|
29
43
|
const label = config.label ?? config.agentName ?? `openai-${model}`;
|
|
30
44
|
const payload = { provider: "openai", model, agentName: config.agentName };
|
|
31
|
-
return {
|
|
45
|
+
return attachIdentityMetadata({
|
|
32
46
|
id: createAgentVersionId(label, payload),
|
|
33
47
|
label,
|
|
34
48
|
modelId: model,
|
|
35
49
|
provider: "openai",
|
|
36
50
|
config: payload,
|
|
37
|
-
};
|
|
51
|
+
}, config);
|
|
38
52
|
}
|
|
39
53
|
}
|
|
40
54
|
class ExternalProcessAdapterFactory {
|
|
@@ -53,14 +67,14 @@ class ExternalProcessAdapterFactory {
|
|
|
53
67
|
args: config.args ?? [],
|
|
54
68
|
agentName: config.agentName,
|
|
55
69
|
};
|
|
56
|
-
return {
|
|
70
|
+
return attachIdentityMetadata({
|
|
57
71
|
id: createAgentVersionId(label, payload),
|
|
58
72
|
label,
|
|
59
73
|
provider: "external_process",
|
|
60
74
|
command: config.command,
|
|
61
75
|
args: config.args ?? [],
|
|
62
76
|
config: payload,
|
|
63
|
-
};
|
|
77
|
+
}, config);
|
|
64
78
|
}
|
|
65
79
|
}
|
|
66
80
|
export function createAgentFactory(config) {
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { performance } from "node:perf_hooks";
|
|
2
2
|
/**
 * Expands `{{ placeholder }}` tokens in a template string.
 *
 * Supported placeholders (whitespace inside the braces is ignored):
 * - `message`          → the `message` argument
 * - `conversation_id`  → the `conversationId` argument
 * - `env.NAME`         → `process.env.NAME`, or "" when the variable is unset
 * Any other placeholder expands to an empty string.
 *
 * @param {string} template - Text containing zero or more `{{ ... }}` tokens.
 * @param {*} message - Value substituted for `{{message}}`.
 * @param {*} conversationId - Value substituted for `{{conversation_id}}`.
 * @returns {string} The template with every token replaced.
 */
export function interpolateTemplate(template, message, conversationId) {
    return template.replace(/\{\{([^}]+)\}\}/g, (_, rawKey) => {
        const key = rawKey.trim();
        if (key === "message") {
            // A replacer that returns undefined/null is rendered as the literal
            // text "undefined"/"null"; treat a missing value as empty instead,
            // the same way unset env variables are handled below.
            return String(message ?? "");
        }
        if (key === "conversation_id") {
            return String(conversationId ?? "");
        }
        if (key.startsWith("env.")) {
            return process.env[key.slice(4)] ?? "";
        }
        return "";
    });
}
|
package/dist/config.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import { statSync, readFileSync } from "node:fs";
|
|
2
2
|
import { resolve, relative, sep } from "node:path";
|
|
3
3
|
import { parse } from "yaml";
|
|
4
|
-
const CONFIG_PATH = resolve("agentlab.config.yaml");
|
|
5
4
|
export function loadAgentLabConfig() {
|
|
6
|
-
|
|
5
|
+
const configPath = resolve("agentlab.config.yaml");
|
|
6
|
+
if (!exists(configPath)) {
|
|
7
7
|
return {};
|
|
8
8
|
}
|
|
9
|
-
const raw = readFileSync(
|
|
9
|
+
const raw = readFileSync(configPath, "utf8");
|
|
10
10
|
const parsed = parse(raw);
|
|
11
11
|
validateConfig(parsed);
|
|
12
12
|
return parsed;
|
|
@@ -41,6 +41,47 @@ function validateConfig(value) {
|
|
|
41
41
|
names.add(agent.name);
|
|
42
42
|
}
|
|
43
43
|
}
|
|
44
|
+
const agents = (value.agents ?? []);
|
|
45
|
+
const agentNames = new Set(agents.map((agent) => agent.name));
|
|
46
|
+
if (value.variant_sets !== undefined) {
|
|
47
|
+
if (!Array.isArray(value.variant_sets)) {
|
|
48
|
+
throw new Error("agentlab.config.yaml field 'variant_sets' must be an array.");
|
|
49
|
+
}
|
|
50
|
+
const names = new Set();
|
|
51
|
+
for (const variantSet of value.variant_sets) {
|
|
52
|
+
validateVariantSetDefinition(variantSet, agentNames);
|
|
53
|
+
if (names.has(variantSet.name)) {
|
|
54
|
+
throw new Error(`agentlab.config.yaml defines duplicate variant set '${variantSet.name}'.`);
|
|
55
|
+
}
|
|
56
|
+
names.add(variantSet.name);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
if (value.runtime_profiles !== undefined) {
|
|
60
|
+
if (!Array.isArray(value.runtime_profiles)) {
|
|
61
|
+
throw new Error("agentlab.config.yaml field 'runtime_profiles' must be an array.");
|
|
62
|
+
}
|
|
63
|
+
const names = new Set();
|
|
64
|
+
for (const runtimeProfile of value.runtime_profiles) {
|
|
65
|
+
validateRuntimeProfileDefinition(runtimeProfile);
|
|
66
|
+
if (names.has(runtimeProfile.name)) {
|
|
67
|
+
throw new Error(`agentlab.config.yaml defines duplicate runtime profile '${runtimeProfile.name}'.`);
|
|
68
|
+
}
|
|
69
|
+
names.add(runtimeProfile.name);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
if (value.suite_definitions !== undefined) {
|
|
73
|
+
if (!Array.isArray(value.suite_definitions)) {
|
|
74
|
+
throw new Error("agentlab.config.yaml field 'suite_definitions' must be an array.");
|
|
75
|
+
}
|
|
76
|
+
const names = new Set();
|
|
77
|
+
for (const suiteDefinition of value.suite_definitions) {
|
|
78
|
+
validateSuiteDefinition(suiteDefinition);
|
|
79
|
+
if (names.has(suiteDefinition.name)) {
|
|
80
|
+
throw new Error(`agentlab.config.yaml defines duplicate suite definition '${suiteDefinition.name}'.`);
|
|
81
|
+
}
|
|
82
|
+
names.add(suiteDefinition.name);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
44
85
|
}
|
|
45
86
|
function validateToolRegistration(value) {
|
|
46
87
|
if (!isObject(value)) {
|
|
@@ -49,8 +90,10 @@ function validateToolRegistration(value) {
|
|
|
49
90
|
if (typeof value.name !== "string" || value.name.length === 0) {
|
|
50
91
|
throw new Error("Each tool registration must define a non-empty 'name'.");
|
|
51
92
|
}
|
|
52
|
-
|
|
53
|
-
|
|
93
|
+
const hasModulePath = typeof value.modulePath === "string" && value.modulePath.length > 0;
|
|
94
|
+
const hasPackage = typeof value.package === "string" && value.package.length > 0;
|
|
95
|
+
if ((hasModulePath ? 1 : 0) + (hasPackage ? 1 : 0) !== 1) {
|
|
96
|
+
throw new Error(`Tool '${value.name}' must define exactly one of 'modulePath' or 'package'.`);
|
|
54
97
|
}
|
|
55
98
|
if (typeof value.exportName !== "string" || value.exportName.length === 0) {
|
|
56
99
|
throw new Error(`Tool '${value.name}' must define a non-empty 'exportName'.`);
|
|
@@ -61,13 +104,15 @@ function validateToolRegistration(value) {
|
|
|
61
104
|
if (!isObject(value.inputSchema)) {
|
|
62
105
|
throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
|
|
63
106
|
}
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
107
|
+
if (hasModulePath) {
|
|
108
|
+
const resolved = resolve(value.modulePath);
|
|
109
|
+
const root = `${process.cwd()}${sep}`;
|
|
110
|
+
if (!(resolved === process.cwd() || resolved.startsWith(root))) {
|
|
111
|
+
throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
|
|
112
|
+
}
|
|
113
|
+
if (!exists(resolved)) {
|
|
114
|
+
throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
|
|
115
|
+
}
|
|
71
116
|
}
|
|
72
117
|
}
|
|
73
118
|
function validateAgentRegistration(value) {
|
|
@@ -145,6 +190,148 @@ export function getAgentRegistration(name) {
|
|
|
145
190
|
}
|
|
146
191
|
return match;
|
|
147
192
|
}
|
|
193
|
+
/**
 * Looks up a variant set by name in the loaded `agentlab.config.yaml`.
 *
 * @param {string} name - Variant set name to find.
 * @returns {object} The matching `variant_sets` entry.
 * @throws {Error} When no variant set with that name is defined.
 */
export function getVariantSet(name) {
    const { variant_sets: variantSets } = loadAgentLabConfig();
    const found = variantSets?.find(({ name: candidate }) => candidate === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define variant set '${name}'.`);
}
|
|
200
|
+
/**
 * Looks up a runtime profile by name in the loaded `agentlab.config.yaml`.
 *
 * @param {string} name - Runtime profile name to find.
 * @returns {object} The matching `runtime_profiles` entry.
 * @throws {Error} When no runtime profile with that name is defined.
 */
export function getRuntimeProfile(name) {
    const { runtime_profiles: runtimeProfiles } = loadAgentLabConfig();
    const found = runtimeProfiles?.find(({ name: candidate }) => candidate === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define runtime profile '${name}'.`);
}
|
|
207
|
+
/**
 * Looks up a suite definition by name in the loaded `agentlab.config.yaml`.
 *
 * @param {string} name - Suite definition name to find.
 * @returns {object} The matching `suite_definitions` entry.
 * @throws {Error} When no suite definition with that name is defined.
 */
export function getSuiteDefinition(name) {
    const { suite_definitions: suiteDefinitions } = loadAgentLabConfig();
    const found = suiteDefinitions?.find(({ name: candidate }) => candidate === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define suite definition '${name}'.`);
}
|
|
214
|
+
/**
 * Validates a single `variant_sets` entry from `agentlab.config.yaml`.
 *
 * The entry must be an object with a non-empty string `name` and a `variants`
 * array. Every variant must be an object whose `agent` is a non-empty string
 * present in `agentNames` and whose `label` is a non-empty string that is
 * unique within the set.
 *
 * @param {*} value - Candidate variant set definition.
 * @param {Set<string>} agentNames - Names of the agents declared in the config.
 * @returns {void}
 * @throws {Error} On the first rule that is violated.
 */
function validateVariantSetDefinition(value, agentNames) {
    if (!isObject(value)) {
        throw new Error("Each variant set definition in agentlab.config.yaml must be an object.");
    }
    const setName = value.name;
    if (!(typeof setName === "string" && setName.length > 0)) {
        throw new Error("Each variant set definition must define a non-empty 'name'.");
    }
    if (!Array.isArray(value.variants)) {
        throw new Error(`Variant set '${setName}' must define a 'variants' array.`);
    }
    const seenLabels = new Set();
    for (const entry of value.variants) {
        if (!isObject(entry)) {
            throw new Error(`Variant set '${setName}' contains a non-object variant definition.`);
        }
        const { agent, label } = entry;
        if (!(typeof agent === "string" && agent.length > 0)) {
            throw new Error(`Variant set '${setName}' contains a variant with a non-empty 'agent' required.`);
        }
        if (!agentNames.has(agent)) {
            throw new Error(`Variant set '${setName}' references unknown agent '${agent}'.`);
        }
        if (!(typeof label === "string" && label.length > 0)) {
            throw new Error(`Variant set '${setName}' contains a variant with a non-empty 'label' required.`);
        }
        if (seenLabels.has(label)) {
            throw new Error(`Variant set '${setName}' defines duplicate variant label '${label}'.`);
        }
        seenLabels.add(label);
    }
}
|
|
244
|
+
/**
 * Validates a single `runtime_profiles` entry from `agentlab.config.yaml`.
 *
 * The entry must be an object with a non-empty string `name`. The optional
 * `tool_faults` array and optional `state` object are checked by the helpers
 * below.
 *
 * @param {*} value - Candidate runtime profile definition.
 * @returns {void}
 * @throws {Error} On the first rule that is violated.
 */
function validateRuntimeProfileDefinition(value) {
    if (!isObject(value)) {
        throw new Error("Each runtime profile definition in agentlab.config.yaml must be an object.");
    }
    if (typeof value.name !== "string" || value.name.length === 0) {
        throw new Error("Each runtime profile definition must define a non-empty 'name'.");
    }
    if (value.tool_faults !== undefined) {
        if (!Array.isArray(value.tool_faults)) {
            throw new Error(`Runtime profile '${value.name}' field 'tool_faults' must be an array.`);
        }
        for (const fault of value.tool_faults) {
            validateRuntimeProfileToolFault(fault, value.name);
        }
    }
    if (value.state !== undefined) {
        validateRuntimeProfileState(value.state, value.name);
    }
}

/** Validates one `tool_faults` item of the runtime profile named `profileName`. */
function validateRuntimeProfileToolFault(fault, profileName) {
    if (!isObject(fault)) {
        throw new Error(`Runtime profile '${profileName}' contains a non-object tool fault definition.`);
    }
    if (typeof fault.tool !== "string" || fault.tool.length === 0) {
        throw new Error(`Runtime profile '${profileName}' contains a tool fault with a non-empty 'tool' required.`);
    }
    if (!["timeout", "error", "malformed_output", "partial_output"].includes(fault.mode)) {
        throw new Error(`Runtime profile '${profileName}' uses invalid tool fault mode '${String(fault.mode)}'.`);
    }
    if (fault.error_message !== undefined && (typeof fault.error_message !== "string" || fault.error_message.length === 0)) {
        throw new Error(`Runtime profile '${profileName}' tool fault for '${fault.tool}' field 'error_message' must be a non-empty string.`);
    }
    // `!(x > 0)` rather than `x <= 0`: NaN (e.g. YAML `.nan`) is a number for
    // which `x <= 0` is false, so it used to pass the "positive number" check.
    if (fault.timeout_ms !== undefined && (typeof fault.timeout_ms !== "number" || !(fault.timeout_ms > 0))) {
        throw new Error(`Runtime profile '${profileName}' tool fault for '${fault.tool}' field 'timeout_ms' must be a positive number.`);
    }
    if (fault.partial_output !== undefined && !isObject(fault.partial_output)) {
        throw new Error(`Runtime profile '${profileName}' tool fault for '${fault.tool}' field 'partial_output' must be an object.`);
    }
}

/** Validates the `state` object of the runtime profile named `profileName`. */
function validateRuntimeProfileState(state, profileName) {
    if (!isObject(state)) {
        throw new Error(`Runtime profile '${profileName}' field 'state' must be an object.`);
    }
    // `reset` is mandatory whenever `state` is present: undefined is not one of the allowed values.
    if (!["per_run", "per_variant_run", "manual"].includes(state.reset)) {
        throw new Error(`Runtime profile '${profileName}' field 'state.reset' must be one of 'per_run', 'per_variant_run', or 'manual'.`);
    }
    if (state.seeded_messages !== undefined) {
        if (!Array.isArray(state.seeded_messages)) {
            throw new Error(`Runtime profile '${profileName}' field 'state.seeded_messages' must be an array.`);
        }
        for (const message of state.seeded_messages) {
            if (!isObject(message)) {
                throw new Error(`Runtime profile '${profileName}' contains a non-object seeded message.`);
            }
            if (message.role !== "user" && message.role !== "assistant") {
                throw new Error(`Runtime profile '${profileName}' seeded message role must be 'user' or 'assistant'.`);
            }
            if (typeof message.message !== "string" || message.message.length === 0) {
                throw new Error(`Runtime profile '${profileName}' seeded message must define a non-empty 'message'.`);
            }
        }
    }
    if (state.memory_blob !== undefined && !isObject(state.memory_blob)) {
        throw new Error(`Runtime profile '${profileName}' field 'state.memory_blob' must be an object.`);
    }
}
|
|
304
|
+
/**
 * Validates a single `suite_definitions` entry from `agentlab.config.yaml`.
 *
 * The entry must be an object with a non-empty string `name` and an object
 * `include`; `exclude` is optional but must be an object when present. In both
 * selectors the `scenarios`, `tags` and `suites` fields are checked with
 * `validateSuiteSelectorArray`.
 *
 * @param {*} value - Candidate suite definition.
 * @returns {void}
 * @throws {Error} On the first rule that is violated.
 */
function validateSuiteDefinition(value) {
    if (!isObject(value)) {
        throw new Error("Each suite definition in agentlab.config.yaml must be an object.");
    }
    const suiteName = value.name;
    if (!(typeof suiteName === "string" && suiteName.length > 0)) {
        throw new Error("Each suite definition must define a non-empty 'name'.");
    }
    const selectorFields = ["scenarios", "tags", "suites"];
    if (!isObject(value.include)) {
        throw new Error(`Suite definition '${suiteName}' must define an object 'include'.`);
    }
    for (const field of selectorFields) {
        validateSuiteSelectorArray(value.include, suiteName, `include.${field}`);
    }
    if (value.exclude === undefined) {
        return;
    }
    if (!isObject(value.exclude)) {
        throw new Error(`Suite definition '${suiteName}' field 'exclude' must be an object.`);
    }
    for (const field of selectorFields) {
        validateSuiteSelectorArray(value.exclude, suiteName, `exclude.${field}`);
    }
}
|
|
326
|
+
/**
 * Checks that one optional selector field is an array of strings.
 *
 * `key` is a dotted path such as "include.tags"; the segment after the first
 * dot names the property read from `value`, while the full key is used in the
 * error message. An absent (undefined) field is accepted.
 *
 * @param {object} value - The `include` or `exclude` selector object.
 * @param {string} suiteName - Suite definition name, used in the error message.
 * @param {string} key - Dotted field path, e.g. "exclude.suites".
 * @returns {void}
 * @throws {Error} When the field is present but is not an array of strings.
 */
function validateSuiteSelectorArray(value, suiteName, key) {
    const [, fieldName] = key.split(".");
    const selector = value[fieldName];
    if (selector === undefined) {
        return;
    }
    const isStringArray = Array.isArray(selector) && selector.every((item) => typeof item === "string");
    if (isStringArray) {
        return;
    }
    throw new Error(`Suite definition '${suiteName}' field '${key}' must be an array of strings.`);
}
|
|
148
335
|
function exists(path) {
|
|
149
336
|
try {
|
|
150
337
|
statSync(path);
|
package/dist/evaluators.js
CHANGED
|
@@ -13,6 +13,12 @@ function evaluateOne(evaluator, bundle) {
|
|
|
13
13
|
return evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput);
|
|
14
14
|
case "step_count_max":
|
|
15
15
|
return evaluateStepCountMax(evaluator, bundle.run.totalSteps);
|
|
16
|
+
case "tool_call_count_max":
|
|
17
|
+
return evaluateToolCallCountMax(evaluator, bundle.run.totalToolCalls);
|
|
18
|
+
case "tool_repeat_max":
|
|
19
|
+
return evaluateToolRepeatMax(evaluator, bundle.toolCalls);
|
|
20
|
+
case "cost_max":
|
|
21
|
+
return evaluateCostMax(evaluator, bundle.run.totalCostUsd);
|
|
16
22
|
default:
|
|
17
23
|
return {
|
|
18
24
|
evaluatorId: evaluator.id,
|
|
@@ -86,7 +92,8 @@ function evaluateExactFinalAnswer(evaluator, finalOutput) {
|
|
|
86
92
|
};
|
|
87
93
|
}
|
|
88
94
|
function evaluateStepCountMax(evaluator, stepCount) {
|
|
89
|
-
const
|
|
95
|
+
const rawMax = evaluator.config.max ?? evaluator.config.max_steps;
|
|
96
|
+
const max = Number(rawMax ?? 0);
|
|
90
97
|
const passed = stepCount <= max;
|
|
91
98
|
return {
|
|
92
99
|
evaluatorId: evaluator.id,
|
|
@@ -98,6 +105,54 @@ function evaluateStepCountMax(evaluator, stepCount) {
|
|
|
98
105
|
message: passed ? `Step count ${stepCount} is within max ${max}.` : `Step count ${stepCount} exceeds max ${max}.`,
|
|
99
106
|
};
|
|
100
107
|
}
|
|
108
|
+
/**
 * Scores a run against an upper bound on its total number of tool calls.
 *
 * The bound is read from `evaluator.config.max` (0 when absent). A `max` that
 * does not convert to a number fails the evaluator with an explicit
 * configuration message rather than the misleading "exceeds max NaN".
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {number} totalToolCalls - Number of tool calls made during the run.
 * @returns {object} Evaluator result with `status` "pass"/"fail" and `rawScore` 1/0.
 */
function evaluateToolCallCountMax(evaluator, totalToolCalls) {
    const max = Number(evaluator.config.max ?? 0);
    const maxIsNumeric = !Number.isNaN(max);
    const passed = maxIsNumeric && totalToolCalls <= max;
    let message;
    if (!maxIsNumeric) {
        message = `Tool call count evaluator has a non-numeric 'max' (${String(evaluator.config.max)}).`;
    }
    else if (passed) {
        message = `Tool call count ${totalToolCalls} is within max ${max}.`;
    }
    else {
        message = `Tool call count ${totalToolCalls} exceeds max ${max}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: passed ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: passed ? 1 : 0,
        message,
    };
}
|
|
123
|
+
/**
 * Scores a run against an upper bound on how often one named tool was called.
 *
 * The tool name is read from `evaluator.config.tool` and the bound from
 * `evaluator.config.max` (0 when absent). Calls are counted by comparing each
 * entry's `toolName` with the configured name.
 *
 * A missing or empty `tool` fails the evaluator: counting calls to the empty
 * name would always give 0 and pass without checking anything. A `max` that
 * does not convert to a number also fails with an explicit message.
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {Array<{toolName: string}>} toolCalls - Tool calls recorded for the run.
 * @returns {object} Evaluator result with `status` "pass"/"fail" and `rawScore` 1/0.
 */
function evaluateToolRepeatMax(evaluator, toolCalls) {
    const tool = String(evaluator.config.tool ?? "");
    const max = Number(evaluator.config.max ?? 0);
    const count = toolCalls.filter((call) => call.toolName === tool).length;
    const toolIsConfigured = tool.length > 0;
    const maxIsNumeric = !Number.isNaN(max);
    const passed = toolIsConfigured && maxIsNumeric && count <= max;
    let message;
    if (!toolIsConfigured) {
        message = "Tool repeat evaluator must define a non-empty 'tool'.";
    }
    else if (!maxIsNumeric) {
        message = `Tool repeat evaluator for '${tool}' has a non-numeric 'max' (${String(evaluator.config.max)}).`;
    }
    else if (passed) {
        message = `Tool '${tool}' usage count ${count} is within max ${max}.`;
    }
    else {
        message = `Tool '${tool}' usage count ${count} exceeds max ${max}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: passed ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: passed ? 1 : 0,
        message,
    };
}
|
|
140
|
+
/**
 * Scores a run against an upper bound on its total cost in USD.
 *
 * The bound is read from `evaluator.config.max_usd` (0 when absent); a
 * null/undefined `totalCostUsd` is treated as 0. A `max_usd` that does not
 * convert to a number fails the evaluator with an explicit configuration
 * message rather than the misleading "exceeds max NaN".
 *
 * @param {object} evaluator - Evaluator definition (`id`, `type`, `mode`, `weight`, `config`).
 * @param {number|null|undefined} totalCostUsd - Recorded total cost of the run.
 * @returns {object} Evaluator result with `status` "pass"/"fail" and `rawScore` 1/0.
 */
function evaluateCostMax(evaluator, totalCostUsd) {
    const maxUsd = Number(evaluator.config.max_usd ?? 0);
    const total = totalCostUsd ?? 0;
    const maxIsNumeric = !Number.isNaN(maxUsd);
    const passed = maxIsNumeric && total <= maxUsd;
    let message;
    if (!maxIsNumeric) {
        message = `Cost evaluator has a non-numeric 'max_usd' (${String(evaluator.config.max_usd)}).`;
    }
    else if (passed) {
        message = `Total cost ${total} is within max ${maxUsd}.`;
    }
    else {
        message = `Total cost ${total} exceeds max ${maxUsd}.`;
    }
    return {
        evaluatorId: evaluator.id,
        evaluatorType: evaluator.type,
        mode: evaluator.mode,
        status: passed ? "pass" : "fail",
        weight: evaluator.weight,
        rawScore: passed ? 1 : 0,
        message,
    };
}
|
|
101
156
|
function matches(input, match) {
|
|
102
157
|
if (!isObject(input)) {
|
|
103
158
|
return false;
|