agent-regression-lab 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -7
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +5 -4
- package/dist/config.js +186 -3
- package/dist/evaluators.js +56 -1
- package/dist/index.js +143 -11
- package/dist/lib/id.js +3 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +90 -2
- package/dist/scoring.js +2 -2
- package/dist/storage.js +117 -7
- package/dist/tools.js +38 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +28 -2
- package/dist/ui-assets/client.js +82 -0
- package/docs/agents.md +143 -8
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +30 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +303 -56
- package/docs/troubleshooting.md +138 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,27 +1,51 @@
|
|
|
1
1
|
# Agent Regression Lab
|
|
2
2
|
|
|
3
|
-
Agent Regression Lab is
|
|
3
|
+
Agent Regression Lab is the local-first regression spine for agent engineering teams.
|
|
4
4
|
|
|
5
|
-
It gives
|
|
5
|
+
It gives teams a repeatable way to define expected agent behavior in YAML, replay it against deterministic tool surfaces or live HTTP agents, store traces and scores locally, and compare candidate behavior against known baselines over time.
|
|
6
6
|
|
|
7
|
-
This is
|
|
7
|
+
This is a local-first alpha for early technical teams. It is strongest when used across one workflow spine:
|
|
8
|
+
|
|
9
|
+
- debug a single scenario while building
|
|
10
|
+
- validate a branch with a suite before merge
|
|
11
|
+
- run curated golden suites before release
|
|
12
|
+
- keep incident-derived scenarios as engineering memory
|
|
8
13
|
|
|
9
14
|
## Who It Is For
|
|
10
15
|
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
- teams
|
|
16
|
+
- teams shipping prompt, model, tool, workflow, and memory changes
|
|
17
|
+
- engineers who need repeatable before/after evidence instead of vibes
|
|
18
|
+
- teams validating live HTTP agents as well as deterministic local scenarios
|
|
19
|
+
- researchers and technical operators who want local control before adopting heavier hosted infrastructure
|
|
20
|
+
|
|
21
|
+
## Why Teams Use It
|
|
22
|
+
|
|
23
|
+
- catch regressions before merge or release
|
|
24
|
+
- debug subtle behavioral changes with full traces
|
|
25
|
+
- compare model, prompt, tool, and workflow changes against a known baseline
|
|
26
|
+
- build a portfolio of golden workflows, historical regressions, and ugly edge cases
|
|
27
|
+
- preserve engineering memory so old failures do not quietly return
|
|
14
28
|
|
|
15
29
|
## What It Supports Today
|
|
16
30
|
|
|
17
31
|
- YAML scenarios under `scenarios/`
|
|
18
32
|
- deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
|
|
19
33
|
- named agents from `agentlab.config.yaml`
|
|
20
|
-
- built-in `mock`, `openai`, and `
|
|
34
|
+
- built-in `mock`, `openai`, `external_process`, and `http` agent modes
|
|
35
|
+
- `type: conversation` multi-turn dialog scenarios for HTTP agents
|
|
21
36
|
- SQLite-backed local run history under `artifacts/agentlab.db`
|
|
22
37
|
- CLI commands to list, run, show, compare, and launch the UI
|
|
23
38
|
- local web UI for run inspection, run comparison, and suite batch comparison
|
|
24
39
|
|
|
40
|
+
## Workflow Spine
|
|
41
|
+
|
|
42
|
+
Use this as the default product story:
|
|
43
|
+
|
|
44
|
+
1. debug locally with one scenario
|
|
45
|
+
2. validate a branch with a suite
|
|
46
|
+
3. run curated golden suites before release
|
|
47
|
+
4. keep incident-derived scenarios as permanent regression assets
|
|
48
|
+
|
|
25
49
|
## First 10 Minutes
|
|
26
50
|
|
|
27
51
|
The fastest path is to run the CLI from a local checkout.
|
|
@@ -135,6 +159,8 @@ Supported command surface:
|
|
|
135
159
|
agentlab list scenarios
|
|
136
160
|
agentlab run <scenario-id> [--agent <name>]
|
|
137
161
|
agentlab run --suite <suite-id> [--agent <name>]
|
|
162
|
+
agentlab run --suite-def <name> [--agent <name>]
|
|
163
|
+
agentlab run <scenario-id> [--variant-set <name>]
|
|
138
164
|
agentlab show <run-id>
|
|
139
165
|
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
140
166
|
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
@@ -156,6 +182,21 @@ Use this as the default mental model:
|
|
|
156
182
|
5. compare two runs or two suite batches
|
|
157
183
|
6. extend the setup with a named agent or repo-local tool when needed
|
|
158
184
|
|
|
185
|
+
## Canonical Live HTTP Fixture
|
|
186
|
+
|
|
187
|
+
`arl-test/` is the canonical live HTTP regression fixture in this repo.
|
|
188
|
+
|
|
189
|
+
Use it to verify the production-like HTTP path end to end:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
cd arl-test
|
|
193
|
+
npm start
|
|
194
|
+
node ../dist/index.js list scenarios
|
|
195
|
+
node ../dist/index.js run order-tracking-in-transit --agent support-agent
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
The `arl-test` scenarios are intended to behave like a real internal-team regression fixture, not just a toy demo.
|
|
199
|
+
|
|
159
200
|
## Config And Extension Points
|
|
160
201
|
|
|
161
202
|
`agentlab.config.yaml` is the public extension point for:
|
|
@@ -168,6 +209,7 @@ Supported agent providers:
|
|
|
168
209
|
- `mock`
|
|
169
210
|
- `openai`
|
|
170
211
|
- `external_process`
|
|
212
|
+
- `http` — point at a running HTTP service for multi-turn conversation testing
|
|
171
213
|
|
|
172
214
|
Working sample assets already live in this repo:
|
|
173
215
|
|
|
@@ -212,11 +254,15 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
|
|
|
212
254
|
- custom tool loading is limited to repo-local module paths
|
|
213
255
|
- external agents integrate through the local stdin/stdout protocol only
|
|
214
256
|
- the UI is intentionally minimal and optimized for debugging
|
|
257
|
+
- SQLite-backed local storage still makes sequential live verification the safest path when reusing the same local artifacts DB
|
|
215
258
|
- the benchmark is broader than before, but still small compared to a mature benchmark product
|
|
216
259
|
|
|
217
260
|
## Next Docs
|
|
218
261
|
|
|
219
262
|
- scenario authoring: [docs/scenarios.md](docs/scenarios.md)
|
|
263
|
+
- golden suites: [docs/golden-suites.md](docs/golden-suites.md)
|
|
264
|
+
- integrations and live services: [docs/integrations-and-live-services.md](docs/integrations-and-live-services.md)
|
|
265
|
+
- memory and stateful agents: [docs/memory-and-stateful-agents.md](docs/memory-and-stateful-agents.md)
|
|
220
266
|
- custom tools: [docs/tools.md](docs/tools.md)
|
|
221
267
|
- named agents and external-process protocol: [docs/agents.md](docs/agents.md)
|
|
222
268
|
- common failure modes: [docs/troubleshooting.md](docs/troubleshooting.md)
|
package/dist/agent/factory.js
CHANGED
|
@@ -2,6 +2,20 @@ import { ExternalProcessAgentAdapter } from "./externalProcessAdapter.js";
|
|
|
2
2
|
import { MockAgentAdapter } from "./mockAdapter.js";
|
|
3
3
|
import { OpenAIResponsesAgentAdapter } from "./openaiResponsesAdapter.js";
|
|
4
4
|
import { createAgentVersionId } from "../lib/id.js";
|
|
5
|
+
function attachIdentityMetadata(version, config) {
|
|
6
|
+
return {
|
|
7
|
+
...version,
|
|
8
|
+
variantSetName: config.variantSetName,
|
|
9
|
+
variantLabel: config.variantLabel,
|
|
10
|
+
promptVersion: config.promptVersion,
|
|
11
|
+
modelVersion: config.modelVersion,
|
|
12
|
+
toolSchemaVersion: config.toolSchemaVersion,
|
|
13
|
+
configLabel: config.configLabel,
|
|
14
|
+
configHash: config.configHash,
|
|
15
|
+
runtimeProfileName: config.runtimeProfileName,
|
|
16
|
+
suiteDefinitionName: config.suiteDefinitionName,
|
|
17
|
+
};
|
|
18
|
+
}
|
|
5
19
|
class MockAgentAdapterFactory {
|
|
6
20
|
createAdapter() {
|
|
7
21
|
return new MockAgentAdapter();
|
|
@@ -9,13 +23,13 @@ class MockAgentAdapterFactory {
|
|
|
9
23
|
createVersion(config) {
|
|
10
24
|
const label = config.label ?? config.agentName ?? "mock-support-agent-v1";
|
|
11
25
|
const payload = { adapter: "mock", domain: "support", agentName: config.agentName };
|
|
12
|
-
return {
|
|
26
|
+
return attachIdentityMetadata({
|
|
13
27
|
id: createAgentVersionId(label, payload),
|
|
14
28
|
label,
|
|
15
29
|
modelId: "mock-model",
|
|
16
30
|
provider: "mock",
|
|
17
31
|
config: payload,
|
|
18
|
-
};
|
|
32
|
+
}, config);
|
|
19
33
|
}
|
|
20
34
|
}
|
|
21
35
|
class OpenAIAdapterFactory {
|
|
@@ -28,13 +42,13 @@ class OpenAIAdapterFactory {
|
|
|
28
42
|
const model = config.model ?? "gpt-4o-mini";
|
|
29
43
|
const label = config.label ?? config.agentName ?? `openai-${model}`;
|
|
30
44
|
const payload = { provider: "openai", model, agentName: config.agentName };
|
|
31
|
-
return {
|
|
45
|
+
return attachIdentityMetadata({
|
|
32
46
|
id: createAgentVersionId(label, payload),
|
|
33
47
|
label,
|
|
34
48
|
modelId: model,
|
|
35
49
|
provider: "openai",
|
|
36
50
|
config: payload,
|
|
37
|
-
};
|
|
51
|
+
}, config);
|
|
38
52
|
}
|
|
39
53
|
}
|
|
40
54
|
class ExternalProcessAdapterFactory {
|
|
@@ -53,14 +67,14 @@ class ExternalProcessAdapterFactory {
|
|
|
53
67
|
args: config.args ?? [],
|
|
54
68
|
agentName: config.agentName,
|
|
55
69
|
};
|
|
56
|
-
return {
|
|
70
|
+
return attachIdentityMetadata({
|
|
57
71
|
id: createAgentVersionId(label, payload),
|
|
58
72
|
label,
|
|
59
73
|
provider: "external_process",
|
|
60
74
|
command: config.command,
|
|
61
75
|
args: config.args ?? [],
|
|
62
76
|
config: payload,
|
|
63
|
-
};
|
|
77
|
+
}, config);
|
|
64
78
|
}
|
|
65
79
|
}
|
|
66
80
|
export function createAgentFactory(config) {
|
package/dist/agent/httpAdapter.js
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import { performance } from "node:perf_hooks";
|
|
2
2
|
export function interpolateTemplate(template, message, conversationId) {
|
|
3
3
|
return template.replace(/\{\{([^}]+)\}\}/g, (_, key) => {
|
|
4
|
-
|
|
4
|
+
const k = key.trim();
|
|
5
|
+
if (k === "message")
|
|
5
6
|
return message;
|
|
6
|
-
if (
|
|
7
|
+
if (k === "conversation_id")
|
|
7
8
|
return conversationId;
|
|
8
|
-
if (
|
|
9
|
-
return process.env[
|
|
9
|
+
if (k.startsWith("env."))
|
|
10
|
+
return process.env[k.slice(4)] ?? "";
|
|
10
11
|
return "";
|
|
11
12
|
});
|
|
12
13
|
}
|
package/dist/config.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import { statSync, readFileSync } from "node:fs";
|
|
2
2
|
import { resolve, relative, sep } from "node:path";
|
|
3
3
|
import { parse } from "yaml";
|
|
4
|
-
const CONFIG_PATH = resolve("agentlab.config.yaml");
|
|
5
4
|
export function loadAgentLabConfig() {
|
|
6
|
-
|
|
5
|
+
const configPath = resolve("agentlab.config.yaml");
|
|
6
|
+
if (!exists(configPath)) {
|
|
7
7
|
return {};
|
|
8
8
|
}
|
|
9
|
-
const raw = readFileSync(
|
|
9
|
+
const raw = readFileSync(configPath, "utf8");
|
|
10
10
|
const parsed = parse(raw);
|
|
11
11
|
validateConfig(parsed);
|
|
12
12
|
return parsed;
|
|
@@ -41,6 +41,47 @@ function validateConfig(value) {
|
|
|
41
41
|
names.add(agent.name);
|
|
42
42
|
}
|
|
43
43
|
}
|
|
44
|
+
const agents = (value.agents ?? []);
|
|
45
|
+
const agentNames = new Set(agents.map((agent) => agent.name));
|
|
46
|
+
if (value.variant_sets !== undefined) {
|
|
47
|
+
if (!Array.isArray(value.variant_sets)) {
|
|
48
|
+
throw new Error("agentlab.config.yaml field 'variant_sets' must be an array.");
|
|
49
|
+
}
|
|
50
|
+
const names = new Set();
|
|
51
|
+
for (const variantSet of value.variant_sets) {
|
|
52
|
+
validateVariantSetDefinition(variantSet, agentNames);
|
|
53
|
+
if (names.has(variantSet.name)) {
|
|
54
|
+
throw new Error(`agentlab.config.yaml defines duplicate variant set '${variantSet.name}'.`);
|
|
55
|
+
}
|
|
56
|
+
names.add(variantSet.name);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
if (value.runtime_profiles !== undefined) {
|
|
60
|
+
if (!Array.isArray(value.runtime_profiles)) {
|
|
61
|
+
throw new Error("agentlab.config.yaml field 'runtime_profiles' must be an array.");
|
|
62
|
+
}
|
|
63
|
+
const names = new Set();
|
|
64
|
+
for (const runtimeProfile of value.runtime_profiles) {
|
|
65
|
+
validateRuntimeProfileDefinition(runtimeProfile);
|
|
66
|
+
if (names.has(runtimeProfile.name)) {
|
|
67
|
+
throw new Error(`agentlab.config.yaml defines duplicate runtime profile '${runtimeProfile.name}'.`);
|
|
68
|
+
}
|
|
69
|
+
names.add(runtimeProfile.name);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
if (value.suite_definitions !== undefined) {
|
|
73
|
+
if (!Array.isArray(value.suite_definitions)) {
|
|
74
|
+
throw new Error("agentlab.config.yaml field 'suite_definitions' must be an array.");
|
|
75
|
+
}
|
|
76
|
+
const names = new Set();
|
|
77
|
+
for (const suiteDefinition of value.suite_definitions) {
|
|
78
|
+
validateSuiteDefinition(suiteDefinition);
|
|
79
|
+
if (names.has(suiteDefinition.name)) {
|
|
80
|
+
throw new Error(`agentlab.config.yaml defines duplicate suite definition '${suiteDefinition.name}'.`);
|
|
81
|
+
}
|
|
82
|
+
names.add(suiteDefinition.name);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
44
85
|
}
|
|
45
86
|
function validateToolRegistration(value) {
|
|
46
87
|
if (!isObject(value)) {
|
|
@@ -145,6 +186,148 @@ export function getAgentRegistration(name) {
|
|
|
145
186
|
}
|
|
146
187
|
return match;
|
|
147
188
|
}
|
|
189
|
+
export function getVariantSet(name) {
|
|
190
|
+
const match = loadAgentLabConfig().variant_sets?.find((variantSet) => variantSet.name === name);
|
|
191
|
+
if (!match) {
|
|
192
|
+
throw new Error(`agentlab.config.yaml does not define variant set '${name}'.`);
|
|
193
|
+
}
|
|
194
|
+
return match;
|
|
195
|
+
}
|
|
196
|
+
export function getRuntimeProfile(name) {
|
|
197
|
+
const match = loadAgentLabConfig().runtime_profiles?.find((runtimeProfile) => runtimeProfile.name === name);
|
|
198
|
+
if (!match) {
|
|
199
|
+
throw new Error(`agentlab.config.yaml does not define runtime profile '${name}'.`);
|
|
200
|
+
}
|
|
201
|
+
return match;
|
|
202
|
+
}
|
|
203
|
+
export function getSuiteDefinition(name) {
|
|
204
|
+
const match = loadAgentLabConfig().suite_definitions?.find((suiteDefinition) => suiteDefinition.name === name);
|
|
205
|
+
if (!match) {
|
|
206
|
+
throw new Error(`agentlab.config.yaml does not define suite definition '${name}'.`);
|
|
207
|
+
}
|
|
208
|
+
return match;
|
|
209
|
+
}
|
|
210
|
+
function validateVariantSetDefinition(value, agentNames) {
|
|
211
|
+
if (!isObject(value)) {
|
|
212
|
+
throw new Error("Each variant set definition in agentlab.config.yaml must be an object.");
|
|
213
|
+
}
|
|
214
|
+
if (typeof value.name !== "string" || value.name.length === 0) {
|
|
215
|
+
throw new Error("Each variant set definition must define a non-empty 'name'.");
|
|
216
|
+
}
|
|
217
|
+
if (!Array.isArray(value.variants)) {
|
|
218
|
+
throw new Error(`Variant set '${value.name}' must define a 'variants' array.`);
|
|
219
|
+
}
|
|
220
|
+
const labels = new Set();
|
|
221
|
+
for (const variant of value.variants) {
|
|
222
|
+
if (!isObject(variant)) {
|
|
223
|
+
throw new Error(`Variant set '${value.name}' contains a non-object variant definition.`);
|
|
224
|
+
}
|
|
225
|
+
if (typeof variant.agent !== "string" || variant.agent.length === 0) {
|
|
226
|
+
throw new Error(`Variant set '${value.name}' contains a variant with a non-empty 'agent' required.`);
|
|
227
|
+
}
|
|
228
|
+
if (!agentNames.has(variant.agent)) {
|
|
229
|
+
throw new Error(`Variant set '${value.name}' references unknown agent '${variant.agent}'.`);
|
|
230
|
+
}
|
|
231
|
+
if (typeof variant.label !== "string" || variant.label.length === 0) {
|
|
232
|
+
throw new Error(`Variant set '${value.name}' contains a variant with a non-empty 'label' required.`);
|
|
233
|
+
}
|
|
234
|
+
if (labels.has(variant.label)) {
|
|
235
|
+
throw new Error(`Variant set '${value.name}' defines duplicate variant label '${variant.label}'.`);
|
|
236
|
+
}
|
|
237
|
+
labels.add(variant.label);
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
function validateRuntimeProfileDefinition(value) {
|
|
241
|
+
if (!isObject(value)) {
|
|
242
|
+
throw new Error("Each runtime profile definition in agentlab.config.yaml must be an object.");
|
|
243
|
+
}
|
|
244
|
+
if (typeof value.name !== "string" || value.name.length === 0) {
|
|
245
|
+
throw new Error("Each runtime profile definition must define a non-empty 'name'.");
|
|
246
|
+
}
|
|
247
|
+
if (value.tool_faults !== undefined) {
|
|
248
|
+
if (!Array.isArray(value.tool_faults)) {
|
|
249
|
+
throw new Error(`Runtime profile '${value.name}' field 'tool_faults' must be an array.`);
|
|
250
|
+
}
|
|
251
|
+
for (const fault of value.tool_faults) {
|
|
252
|
+
if (!isObject(fault)) {
|
|
253
|
+
throw new Error(`Runtime profile '${value.name}' contains a non-object tool fault definition.`);
|
|
254
|
+
}
|
|
255
|
+
if (typeof fault.tool !== "string" || fault.tool.length === 0) {
|
|
256
|
+
throw new Error(`Runtime profile '${value.name}' contains a tool fault with a non-empty 'tool' required.`);
|
|
257
|
+
}
|
|
258
|
+
if (fault.mode !== "timeout" && fault.mode !== "error" && fault.mode !== "malformed_output" && fault.mode !== "partial_output") {
|
|
259
|
+
throw new Error(`Runtime profile '${value.name}' uses invalid tool fault mode '${String(fault.mode)}'.`);
|
|
260
|
+
}
|
|
261
|
+
if (fault.error_message !== undefined && (typeof fault.error_message !== "string" || fault.error_message.length === 0)) {
|
|
262
|
+
throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'error_message' must be a non-empty string.`);
|
|
263
|
+
}
|
|
264
|
+
if (fault.timeout_ms !== undefined && (typeof fault.timeout_ms !== "number" || fault.timeout_ms <= 0)) {
|
|
265
|
+
throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'timeout_ms' must be a positive number.`);
|
|
266
|
+
}
|
|
267
|
+
if (fault.partial_output !== undefined && !isObject(fault.partial_output)) {
|
|
268
|
+
throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'partial_output' must be an object.`);
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
if (value.state !== undefined) {
|
|
273
|
+
if (!isObject(value.state)) {
|
|
274
|
+
throw new Error(`Runtime profile '${value.name}' field 'state' must be an object.`);
|
|
275
|
+
}
|
|
276
|
+
if (value.state.reset !== "per_run" && value.state.reset !== "per_variant_run" && value.state.reset !== "manual") {
|
|
277
|
+
throw new Error(`Runtime profile '${value.name}' field 'state.reset' must be one of 'per_run', 'per_variant_run', or 'manual'.`);
|
|
278
|
+
}
|
|
279
|
+
if (value.state.seeded_messages !== undefined) {
|
|
280
|
+
if (!Array.isArray(value.state.seeded_messages)) {
|
|
281
|
+
throw new Error(`Runtime profile '${value.name}' field 'state.seeded_messages' must be an array.`);
|
|
282
|
+
}
|
|
283
|
+
for (const message of value.state.seeded_messages) {
|
|
284
|
+
if (!isObject(message)) {
|
|
285
|
+
throw new Error(`Runtime profile '${value.name}' contains a non-object seeded message.`);
|
|
286
|
+
}
|
|
287
|
+
if (message.role !== "user" && message.role !== "assistant") {
|
|
288
|
+
throw new Error(`Runtime profile '${value.name}' seeded message role must be 'user' or 'assistant'.`);
|
|
289
|
+
}
|
|
290
|
+
if (typeof message.message !== "string" || message.message.length === 0) {
|
|
291
|
+
throw new Error(`Runtime profile '${value.name}' seeded message must define a non-empty 'message'.`);
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
if (value.state.memory_blob !== undefined && !isObject(value.state.memory_blob)) {
|
|
296
|
+
throw new Error(`Runtime profile '${value.name}' field 'state.memory_blob' must be an object.`);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
function validateSuiteDefinition(value) {
|
|
301
|
+
if (!isObject(value)) {
|
|
302
|
+
throw new Error("Each suite definition in agentlab.config.yaml must be an object.");
|
|
303
|
+
}
|
|
304
|
+
if (typeof value.name !== "string" || value.name.length === 0) {
|
|
305
|
+
throw new Error("Each suite definition must define a non-empty 'name'.");
|
|
306
|
+
}
|
|
307
|
+
if (!isObject(value.include)) {
|
|
308
|
+
throw new Error(`Suite definition '${value.name}' must define an object 'include'.`);
|
|
309
|
+
}
|
|
310
|
+
validateSuiteSelectorArray(value.include, value.name, "include.scenarios");
|
|
311
|
+
validateSuiteSelectorArray(value.include, value.name, "include.tags");
|
|
312
|
+
validateSuiteSelectorArray(value.include, value.name, "include.suites");
|
|
313
|
+
if (value.exclude !== undefined) {
|
|
314
|
+
if (!isObject(value.exclude)) {
|
|
315
|
+
throw new Error(`Suite definition '${value.name}' field 'exclude' must be an object.`);
|
|
316
|
+
}
|
|
317
|
+
validateSuiteSelectorArray(value.exclude, value.name, "exclude.scenarios");
|
|
318
|
+
validateSuiteSelectorArray(value.exclude, value.name, "exclude.tags");
|
|
319
|
+
validateSuiteSelectorArray(value.exclude, value.name, "exclude.suites");
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
function validateSuiteSelectorArray(value, suiteName, key) {
|
|
323
|
+
const fieldName = key.split(".")[1];
|
|
324
|
+
const selector = value[fieldName];
|
|
325
|
+
if (selector !== undefined) {
|
|
326
|
+
if (!Array.isArray(selector) || selector.some((item) => typeof item !== "string")) {
|
|
327
|
+
throw new Error(`Suite definition '${suiteName}' field '${key}' must be an array of strings.`);
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
}
|
|
148
331
|
function exists(path) {
|
|
149
332
|
try {
|
|
150
333
|
statSync(path);
|
package/dist/evaluators.js
CHANGED
|
@@ -13,6 +13,12 @@ function evaluateOne(evaluator, bundle) {
|
|
|
13
13
|
return evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput);
|
|
14
14
|
case "step_count_max":
|
|
15
15
|
return evaluateStepCountMax(evaluator, bundle.run.totalSteps);
|
|
16
|
+
case "tool_call_count_max":
|
|
17
|
+
return evaluateToolCallCountMax(evaluator, bundle.run.totalToolCalls);
|
|
18
|
+
case "tool_repeat_max":
|
|
19
|
+
return evaluateToolRepeatMax(evaluator, bundle.toolCalls);
|
|
20
|
+
case "cost_max":
|
|
21
|
+
return evaluateCostMax(evaluator, bundle.run.totalCostUsd);
|
|
16
22
|
default:
|
|
17
23
|
return {
|
|
18
24
|
evaluatorId: evaluator.id,
|
|
@@ -86,7 +92,8 @@ function evaluateExactFinalAnswer(evaluator, finalOutput) {
|
|
|
86
92
|
};
|
|
87
93
|
}
|
|
88
94
|
function evaluateStepCountMax(evaluator, stepCount) {
|
|
89
|
-
const
|
|
95
|
+
const rawMax = evaluator.config.max ?? evaluator.config.max_steps;
|
|
96
|
+
const max = Number(rawMax ?? 0);
|
|
90
97
|
const passed = stepCount <= max;
|
|
91
98
|
return {
|
|
92
99
|
evaluatorId: evaluator.id,
|
|
@@ -98,6 +105,54 @@ function evaluateStepCountMax(evaluator, stepCount) {
|
|
|
98
105
|
message: passed ? `Step count ${stepCount} is within max ${max}.` : `Step count ${stepCount} exceeds max ${max}.`,
|
|
99
106
|
};
|
|
100
107
|
}
|
|
108
|
+
function evaluateToolCallCountMax(evaluator, totalToolCalls) {
|
|
109
|
+
const max = Number(evaluator.config.max ?? 0);
|
|
110
|
+
const passed = totalToolCalls <= max;
|
|
111
|
+
return {
|
|
112
|
+
evaluatorId: evaluator.id,
|
|
113
|
+
evaluatorType: evaluator.type,
|
|
114
|
+
mode: evaluator.mode,
|
|
115
|
+
status: passed ? "pass" : "fail",
|
|
116
|
+
weight: evaluator.weight,
|
|
117
|
+
rawScore: passed ? 1 : 0,
|
|
118
|
+
message: passed
|
|
119
|
+
? `Tool call count ${totalToolCalls} is within max ${max}.`
|
|
120
|
+
: `Tool call count ${totalToolCalls} exceeds max ${max}.`,
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
function evaluateToolRepeatMax(evaluator, toolCalls) {
|
|
124
|
+
const tool = String(evaluator.config.tool ?? "");
|
|
125
|
+
const max = Number(evaluator.config.max ?? 0);
|
|
126
|
+
const count = toolCalls.filter((call) => call.toolName === tool).length;
|
|
127
|
+
const passed = count <= max;
|
|
128
|
+
return {
|
|
129
|
+
evaluatorId: evaluator.id,
|
|
130
|
+
evaluatorType: evaluator.type,
|
|
131
|
+
mode: evaluator.mode,
|
|
132
|
+
status: passed ? "pass" : "fail",
|
|
133
|
+
weight: evaluator.weight,
|
|
134
|
+
rawScore: passed ? 1 : 0,
|
|
135
|
+
message: passed
|
|
136
|
+
? `Tool '${tool}' usage count ${count} is within max ${max}.`
|
|
137
|
+
: `Tool '${tool}' usage count ${count} exceeds max ${max}.`,
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
function evaluateCostMax(evaluator, totalCostUsd) {
|
|
141
|
+
const maxUsd = Number(evaluator.config.max_usd ?? 0);
|
|
142
|
+
const total = totalCostUsd ?? 0;
|
|
143
|
+
const passed = total <= maxUsd;
|
|
144
|
+
return {
|
|
145
|
+
evaluatorId: evaluator.id,
|
|
146
|
+
evaluatorType: evaluator.type,
|
|
147
|
+
mode: evaluator.mode,
|
|
148
|
+
status: passed ? "pass" : "fail",
|
|
149
|
+
weight: evaluator.weight,
|
|
150
|
+
rawScore: passed ? 1 : 0,
|
|
151
|
+
message: passed
|
|
152
|
+
? `Total cost ${total} is within max ${maxUsd}.`
|
|
153
|
+
: `Total cost ${total} exceeds max ${maxUsd}.`,
|
|
154
|
+
};
|
|
155
|
+
}
|
|
101
156
|
function matches(input, match) {
|
|
102
157
|
if (!isObject(input)) {
|
|
103
158
|
return false;
|