agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +78 -11
  2. package/bin/agentlab.js +2 -0
  3. package/dist/agent/factory.js +20 -6
  4. package/dist/agent/httpAdapter.js +5 -4
  5. package/dist/config.js +199 -12
  6. package/dist/evaluators.js +56 -1
  7. package/dist/index.js +157 -11
  8. package/dist/init.js +88 -0
  9. package/dist/lib/id.js +3 -0
  10. package/dist/runOutput.js +46 -0
  11. package/dist/runner.js +31 -9
  12. package/dist/scenarios.js +90 -2
  13. package/dist/scoring.js +2 -2
  14. package/dist/storage.js +117 -7
  15. package/dist/tools.js +56 -2
  16. package/dist/trace.js +4 -2
  17. package/dist/ui/App.js +75 -7
  18. package/dist/ui-assets/client.css +92 -0
  19. package/dist/ui-assets/client.js +183 -19
  20. package/docs/agents.md +143 -8
  21. package/docs/coding-agents.md +74 -0
  22. package/docs/golden-suites.md +74 -0
  23. package/docs/integrations-and-live-services.md +58 -0
  24. package/docs/memory-and-stateful-agents.md +51 -0
  25. package/docs/release-checklist.md +30 -0
  26. package/docs/runtime-profiles.md +67 -0
  27. package/docs/scenarios.md +303 -56
  28. package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
  29. package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
  30. package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
  31. package/docs/tools.md +34 -3
  32. package/docs/troubleshooting.md +193 -0
  33. package/docs/variant-sets.md +63 -0
  34. package/examples/coding-tools/README.md +21 -0
  35. package/examples/coding-tools/index.js +11 -0
  36. package/examples/coding-tools/package.json +8 -0
  37. package/examples/support-tools/README.md +21 -0
  38. package/examples/support-tools/index.js +8 -0
  39. package/examples/support-tools/package.json +8 -0
  40. package/package.json +7 -5
package/README.md CHANGED
@@ -1,27 +1,71 @@
1
1
  # Agent Regression Lab
2
2
 
3
- Agent Regression Lab is a local-first evaluation harness for AI agents.
3
+ Agent Regression Lab is the local-first regression spine for agent engineering teams.
4
4
 
5
- It gives you a repeatable way to define scenarios in YAML, run agents against deterministic tool surfaces, store traces and scores locally, and compare runs or suite batches over time.
5
+ It gives teams a repeatable way to define expected agent behavior in YAML, replay it against deterministic tool surfaces or live HTTP agents, store traces and scores locally, and compare candidate behavior against known baselines over time.
6
6
 
7
- This is an alpha developer tool. It is ready for early technical users, but it is not a polished platform.
7
+ This is a local-first alpha for early technical teams. It is strongest when used across one workflow spine:
8
+
9
+ - debug a single scenario while building
10
+ - validate a branch with a suite before merge
11
+ - run curated golden suites before release
12
+ - keep incident-derived scenarios as engineering memory
8
13
 
9
14
  ## Who It Is For
10
15
 
11
- - engineers building or debugging agent workflows
12
- - researchers who want repeatable local evals
13
- - teams that want a simple local regression harness before investing in heavier infrastructure
16
+ - teams shipping prompt, model, tool, workflow, and memory changes
17
+ - engineers who need repeatable before/after evidence instead of vibes
18
+ - teams validating live HTTP agents as well as deterministic local scenarios
19
+ - researchers and technical operators who want local control before adopting heavier hosted infrastructure
20
+
21
+ ## Why Teams Use It
22
+
23
+ - catch regressions before merge or release
24
+ - debug subtle behavioral changes with full traces
25
+ - compare model, prompt, tool, and workflow changes against a known baseline
26
+ - build a portfolio of golden workflows, historical regressions, and ugly edge cases
27
+ - preserve engineering memory so old failures do not quietly return
14
28
 
15
29
  ## What It Supports Today
16
30
 
17
31
  - YAML scenarios under `scenarios/`
18
- - deterministic built-in tools plus repo-local custom tools from `agentlab.config.yaml`
32
+ - deterministic built-in tools plus custom tools from `agentlab.config.yaml`
19
33
  - named agents from `agentlab.config.yaml`
20
- - built-in `mock`, `openai`, and `external_process` agent modes
34
+ - built-in `mock`, `openai`, `external_process`, and `http` agent modes
35
+ - `type: conversation` multi-turn dialog scenarios for HTTP agents
21
36
  - SQLite-backed local run history under `artifacts/agentlab.db`
22
37
  - CLI commands to list, run, show, compare, and launch the UI
23
38
  - local web UI for run inspection, run comparison, and suite batch comparison
24
39
 
40
+ ## Workflow Spine
41
+
42
+ Use this as the default product story:
43
+
44
+ 1. debug locally with one scenario
45
+ 2. validate a branch with a suite
46
+ 3. run curated golden suites before release
47
+ 4. keep incident-derived scenarios as permanent regression assets
48
+
49
+ ## Start Here
50
+
51
+ If your agent runs as an HTTP service:
52
+
53
+ - use `provider: http`
54
+ - start with [arl-test](arl-test)
55
+ - read [docs/agents.md](docs/agents.md) and [docs/scenarios.md](docs/scenarios.md)
56
+
57
+ If you are validating coding-agent changes:
58
+
59
+ - start with the coding scenarios under `scenarios/coding/`
60
+ - read [docs/coding-agents.md](docs/coding-agents.md)
61
+ - use deterministic tool-loop runs first, then compare before/after behavior
62
+
63
+ If you want pre-merge regression checks in CI:
64
+
65
+ - use `suite_definitions`
66
+ - start with `.github/workflows/agentlab-pre-merge.yml`
67
+ - run `agentlab run --suite-def pre_merge --agent mock-default`
68
+
25
69
  ## First 10 Minutes
26
70
 
27
71
  The fastest path is to run the CLI from a local checkout.
@@ -135,6 +179,8 @@ Supported command surface:
135
179
  agentlab list scenarios
136
180
  agentlab run <scenario-id> [--agent <name>]
137
181
  agentlab run --suite <suite-id> [--agent <name>]
182
+ agentlab run --suite-def <name> [--agent <name>]
183
+ agentlab run <scenario-id> [--variant-set <name>]
138
184
  agentlab show <run-id>
139
185
  agentlab compare <baseline-run-id> <candidate-run-id>
140
186
  agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
@@ -154,25 +200,42 @@ Use this as the default mental model:
154
200
  3. note the run id or suite batch id
155
201
  4. inspect the run in CLI or UI
156
202
  5. compare two runs or two suite batches
157
- 6. extend the setup with a named agent or repo-local tool when needed
203
+ 6. extend the setup with a named agent or custom tools from repo-local files or installed packages when needed
204
+
205
+ ## Canonical Live HTTP Fixture
206
+
207
+ `arl-test/` is the canonical live HTTP regression fixture in this repo.
208
+
209
+ Use it to verify the production-like HTTP path end to end:
210
+
211
+ ```bash
212
+ cd arl-test
213
+ npm start
214
+ node ../dist/index.js list scenarios
215
+ node ../dist/index.js run order-tracking-in-transit --agent support-agent
216
+ ```
217
+
218
+ The `arl-test` scenarios are intended to behave like a real internal-team regression fixture, not just a toy demo.
158
219
 
159
220
  ## Config And Extension Points
160
221
 
161
222
  `agentlab.config.yaml` is the public extension point for:
162
223
 
163
224
  - named agents
164
- - repo-local custom tools
225
+ - custom tools from repo-local files or installed npm packages
165
226
 
166
227
  Supported agent providers:
167
228
 
168
229
  - `mock`
169
230
  - `openai`
170
231
  - `external_process`
232
+ - `http` — point at a running HTTP service for multi-turn conversation testing
171
233
 
172
234
  Working sample assets already live in this repo:
173
235
 
174
236
  - external agents: `custom_agents/node_agent.mjs`, `custom_agents/python_agent.py`
175
237
  - custom tool: `user_tools/findDuplicateCharge.ts`
238
+ - package-style tool examples: `examples/support-tools`, `examples/coding-tools`
176
239
  - sample config: `agentlab.config.yaml`
177
240
 
178
241
  See:
@@ -209,14 +272,18 @@ Agent behavior can still vary depending on the provider path. The built-in `mock
209
272
  ## Limitations
210
273
 
211
274
  - this is a local-first alpha, not a hosted platform
212
- - custom tool loading is limited to repo-local module paths
275
+ - the published package/example ecosystem is still small
213
276
  - external agents integrate through the local stdin/stdout protocol only
214
277
  - the UI is intentionally minimal and optimized for debugging
278
+ - run history is stored in a local SQLite database, so running live verifications sequentially is still the safest path when they reuse the same local artifacts DB
215
279
  - the benchmark is broader than before, but still small compared to a mature benchmark product
216
280
 
217
281
  ## Next Docs
218
282
 
219
283
  - scenario authoring: [docs/scenarios.md](docs/scenarios.md)
284
+ - golden suites: [docs/golden-suites.md](docs/golden-suites.md)
285
+ - integrations and live services: [docs/integrations-and-live-services.md](docs/integrations-and-live-services.md)
286
+ - memory and stateful agents: [docs/memory-and-stateful-agents.md](docs/memory-and-stateful-agents.md)
220
287
  - custom tools: [docs/tools.md](docs/tools.md)
221
288
  - named agents and external-process protocol: [docs/agents.md](docs/agents.md)
222
289
  - common failure modes: [docs/troubleshooting.md](docs/troubleshooting.md)
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ import "../dist/index.js";
@@ -2,6 +2,20 @@ import { ExternalProcessAgentAdapter } from "./externalProcessAdapter.js";
2
2
  import { MockAgentAdapter } from "./mockAdapter.js";
3
3
  import { OpenAIResponsesAgentAdapter } from "./openaiResponsesAdapter.js";
4
4
  import { createAgentVersionId } from "../lib/id.js";
5
/**
 * Builds a new agent version record that carries the run-identity fields
 * found on the agent config.
 *
 * Every identity field is always written to the result: a field missing
 * from `config` is present with the value `undefined`, and it replaces any
 * same-named property that `version` already had.
 *
 * @param {object} version - Base version record; copied, never mutated.
 * @param {object} config - Agent config the identity fields are read from.
 * @returns {object} Shallow copy of `version` extended with the identity fields.
 */
function attachIdentityMetadata(version, config) {
    const identity = {
        variantSetName: config.variantSetName,
        variantLabel: config.variantLabel,
        promptVersion: config.promptVersion,
        modelVersion: config.modelVersion,
        toolSchemaVersion: config.toolSchemaVersion,
        configLabel: config.configLabel,
        configHash: config.configHash,
        runtimeProfileName: config.runtimeProfileName,
        suiteDefinitionName: config.suiteDefinitionName,
    };
    // Identity fields are spread last so they win over `version`'s own keys.
    return { ...version, ...identity };
}
5
19
  class MockAgentAdapterFactory {
6
20
  createAdapter() {
7
21
  return new MockAgentAdapter();
@@ -9,13 +23,13 @@ class MockAgentAdapterFactory {
9
23
  createVersion(config) {
10
24
  const label = config.label ?? config.agentName ?? "mock-support-agent-v1";
11
25
  const payload = { adapter: "mock", domain: "support", agentName: config.agentName };
12
- return {
26
+ return attachIdentityMetadata({
13
27
  id: createAgentVersionId(label, payload),
14
28
  label,
15
29
  modelId: "mock-model",
16
30
  provider: "mock",
17
31
  config: payload,
18
- };
32
+ }, config);
19
33
  }
20
34
  }
21
35
  class OpenAIAdapterFactory {
@@ -28,13 +42,13 @@ class OpenAIAdapterFactory {
28
42
  const model = config.model ?? "gpt-4o-mini";
29
43
  const label = config.label ?? config.agentName ?? `openai-${model}`;
30
44
  const payload = { provider: "openai", model, agentName: config.agentName };
31
- return {
45
+ return attachIdentityMetadata({
32
46
  id: createAgentVersionId(label, payload),
33
47
  label,
34
48
  modelId: model,
35
49
  provider: "openai",
36
50
  config: payload,
37
- };
51
+ }, config);
38
52
  }
39
53
  }
40
54
  class ExternalProcessAdapterFactory {
@@ -53,14 +67,14 @@ class ExternalProcessAdapterFactory {
53
67
  args: config.args ?? [],
54
68
  agentName: config.agentName,
55
69
  };
56
- return {
70
+ return attachIdentityMetadata({
57
71
  id: createAgentVersionId(label, payload),
58
72
  label,
59
73
  provider: "external_process",
60
74
  command: config.command,
61
75
  args: config.args ?? [],
62
76
  config: payload,
63
- };
77
+ }, config);
64
78
  }
65
79
  }
66
80
  export function createAgentFactory(config) {
@@ -1,12 +1,13 @@
1
1
  import { performance } from "node:perf_hooks";
2
2
  export function interpolateTemplate(template, message, conversationId) {
3
3
  return template.replace(/\{\{([^}]+)\}\}/g, (_, key) => {
4
- if (key === "message")
4
+ const k = key.trim();
5
+ if (k === "message")
5
6
  return message;
6
- if (key === "conversation_id")
7
+ if (k === "conversation_id")
7
8
  return conversationId;
8
- if (key.startsWith("env."))
9
- return process.env[key.slice(4)] ?? "";
9
+ if (k.startsWith("env."))
10
+ return process.env[k.slice(4)] ?? "";
10
11
  return "";
11
12
  });
12
13
  }
package/dist/config.js CHANGED
@@ -1,12 +1,12 @@
1
1
  import { statSync, readFileSync } from "node:fs";
2
2
  import { resolve, relative, sep } from "node:path";
3
3
  import { parse } from "yaml";
4
- const CONFIG_PATH = resolve("agentlab.config.yaml");
5
4
  export function loadAgentLabConfig() {
6
- if (!exists(CONFIG_PATH)) {
5
+ const configPath = resolve("agentlab.config.yaml");
6
+ if (!exists(configPath)) {
7
7
  return {};
8
8
  }
9
- const raw = readFileSync(CONFIG_PATH, "utf8");
9
+ const raw = readFileSync(configPath, "utf8");
10
10
  const parsed = parse(raw);
11
11
  validateConfig(parsed);
12
12
  return parsed;
@@ -41,6 +41,47 @@ function validateConfig(value) {
41
41
  names.add(agent.name);
42
42
  }
43
43
  }
44
+ const agents = (value.agents ?? []);
45
+ const agentNames = new Set(agents.map((agent) => agent.name));
46
+ if (value.variant_sets !== undefined) {
47
+ if (!Array.isArray(value.variant_sets)) {
48
+ throw new Error("agentlab.config.yaml field 'variant_sets' must be an array.");
49
+ }
50
+ const names = new Set();
51
+ for (const variantSet of value.variant_sets) {
52
+ validateVariantSetDefinition(variantSet, agentNames);
53
+ if (names.has(variantSet.name)) {
54
+ throw new Error(`agentlab.config.yaml defines duplicate variant set '${variantSet.name}'.`);
55
+ }
56
+ names.add(variantSet.name);
57
+ }
58
+ }
59
+ if (value.runtime_profiles !== undefined) {
60
+ if (!Array.isArray(value.runtime_profiles)) {
61
+ throw new Error("agentlab.config.yaml field 'runtime_profiles' must be an array.");
62
+ }
63
+ const names = new Set();
64
+ for (const runtimeProfile of value.runtime_profiles) {
65
+ validateRuntimeProfileDefinition(runtimeProfile);
66
+ if (names.has(runtimeProfile.name)) {
67
+ throw new Error(`agentlab.config.yaml defines duplicate runtime profile '${runtimeProfile.name}'.`);
68
+ }
69
+ names.add(runtimeProfile.name);
70
+ }
71
+ }
72
+ if (value.suite_definitions !== undefined) {
73
+ if (!Array.isArray(value.suite_definitions)) {
74
+ throw new Error("agentlab.config.yaml field 'suite_definitions' must be an array.");
75
+ }
76
+ const names = new Set();
77
+ for (const suiteDefinition of value.suite_definitions) {
78
+ validateSuiteDefinition(suiteDefinition);
79
+ if (names.has(suiteDefinition.name)) {
80
+ throw new Error(`agentlab.config.yaml defines duplicate suite definition '${suiteDefinition.name}'.`);
81
+ }
82
+ names.add(suiteDefinition.name);
83
+ }
84
+ }
44
85
  }
45
86
  function validateToolRegistration(value) {
46
87
  if (!isObject(value)) {
@@ -49,8 +90,10 @@ function validateToolRegistration(value) {
49
90
  if (typeof value.name !== "string" || value.name.length === 0) {
50
91
  throw new Error("Each tool registration must define a non-empty 'name'.");
51
92
  }
52
- if (typeof value.modulePath !== "string" || value.modulePath.length === 0) {
53
- throw new Error(`Tool '${value.name}' must define a non-empty 'modulePath'.`);
93
+ const hasModulePath = typeof value.modulePath === "string" && value.modulePath.length > 0;
94
+ const hasPackage = typeof value.package === "string" && value.package.length > 0;
95
+ if ((hasModulePath ? 1 : 0) + (hasPackage ? 1 : 0) !== 1) {
96
+ throw new Error(`Tool '${value.name}' must define exactly one of 'modulePath' or 'package'.`);
54
97
  }
55
98
  if (typeof value.exportName !== "string" || value.exportName.length === 0) {
56
99
  throw new Error(`Tool '${value.name}' must define a non-empty 'exportName'.`);
@@ -61,13 +104,15 @@ function validateToolRegistration(value) {
61
104
  if (!isObject(value.inputSchema)) {
62
105
  throw new Error(`Tool '${value.name}' must define an object 'inputSchema'.`);
63
106
  }
64
- const resolved = resolve(value.modulePath);
65
- const root = `${process.cwd()}${sep}`;
66
- if (!(resolved === process.cwd() || resolved.startsWith(root))) {
67
- throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
68
- }
69
- if (!exists(resolved)) {
70
- throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
107
+ if (hasModulePath) {
108
+ const resolved = resolve(value.modulePath);
109
+ const root = `${process.cwd()}${sep}`;
110
+ if (!(resolved === process.cwd() || resolved.startsWith(root))) {
111
+ throw new Error(`Tool '${value.name}' modulePath must stay within the repo.`);
112
+ }
113
+ if (!exists(resolved)) {
114
+ throw new Error(`Tool '${value.name}' references missing module '${relative(process.cwd(), resolved)}'.`);
115
+ }
71
116
  }
72
117
  }
73
118
  function validateAgentRegistration(value) {
@@ -145,6 +190,148 @@ export function getAgentRegistration(name) {
145
190
  }
146
191
  return match;
147
192
  }
193
/**
 * Finds a variant set by name in the config returned by `loadAgentLabConfig()`.
 *
 * @param {string} name - Name of the variant set to look up.
 * @returns {object} The matching entry of the config's `variant_sets` list.
 * @throws {Error} When `variant_sets` is absent or has no entry with that name.
 */
export function getVariantSet(name) {
    const variantSets = loadAgentLabConfig().variant_sets;
    const found = variantSets?.find((candidate) => candidate.name === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define variant set '${name}'.`);
}
200
/**
 * Finds a runtime profile by name in the config returned by `loadAgentLabConfig()`.
 *
 * @param {string} name - Name of the runtime profile to look up.
 * @returns {object} The matching entry of the config's `runtime_profiles` list.
 * @throws {Error} When `runtime_profiles` is absent or has no entry with that name.
 */
export function getRuntimeProfile(name) {
    const profiles = loadAgentLabConfig().runtime_profiles;
    const found = profiles?.find((candidate) => candidate.name === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define runtime profile '${name}'.`);
}
207
/**
 * Finds a suite definition by name in the config returned by `loadAgentLabConfig()`.
 *
 * @param {string} name - Name of the suite definition to look up.
 * @returns {object} The matching entry of the config's `suite_definitions` list.
 * @throws {Error} When `suite_definitions` is absent or has no entry with that name.
 */
export function getSuiteDefinition(name) {
    const definitions = loadAgentLabConfig().suite_definitions;
    const found = definitions?.find((candidate) => candidate.name === name);
    if (found) {
        return found;
    }
    throw new Error(`agentlab.config.yaml does not define suite definition '${name}'.`);
}
214
/**
 * Validates one variant set definition, throwing on the first problem found.
 *
 * Checks, in order: the definition is an object, it has a non-empty string
 * `name`, and `variants` is an array. Each variant must then be an object
 * with a non-empty `agent` that is a member of `agentNames` and a non-empty
 * `label` that is unique within the set.
 *
 * @param {unknown} value - Candidate variant set definition.
 * @param {Set<string>} agentNames - Known agent names; queried with `.has()`.
 * @returns {void}
 * @throws {Error} Describing the first failed check.
 */
function validateVariantSetDefinition(value, agentNames) {
    const isNonEmptyString = (candidate) => typeof candidate === "string" && candidate.length > 0;
    if (!isObject(value)) {
        throw new Error("Each variant set definition in agentlab.config.yaml must be an object.");
    }
    const setName = value.name;
    if (!isNonEmptyString(setName)) {
        throw new Error("Each variant set definition must define a non-empty 'name'.");
    }
    const variants = value.variants;
    if (!Array.isArray(variants)) {
        throw new Error(`Variant set '${setName}' must define a 'variants' array.`);
    }
    const seenLabels = new Set();
    for (const entry of variants) {
        if (!isObject(entry)) {
            throw new Error(`Variant set '${setName}' contains a non-object variant definition.`);
        }
        const { agent, label } = entry;
        if (!isNonEmptyString(agent)) {
            throw new Error(`Variant set '${setName}' contains a variant with a non-empty 'agent' required.`);
        }
        if (!agentNames.has(agent)) {
            throw new Error(`Variant set '${setName}' references unknown agent '${agent}'.`);
        }
        if (!isNonEmptyString(label)) {
            throw new Error(`Variant set '${setName}' contains a variant with a non-empty 'label' required.`);
        }
        if (seenLabels.has(label)) {
            throw new Error(`Variant set '${setName}' defines duplicate variant label '${label}'.`);
        }
        seenLabels.add(label);
    }
}
244
/**
 * Validates one runtime profile definition, throwing on the first problem found.
 *
 * Required: the definition is an object with a non-empty string `name`.
 *
 * Optional `tool_faults` must be an array of objects, each with:
 *   - `tool`: non-empty string
 *   - `mode`: "timeout" | "error" | "malformed_output" | "partial_output"
 *   - `error_message` (optional): non-empty string
 *   - `timeout_ms` (optional): finite number greater than zero
 *   - `partial_output` (optional): object
 *
 * Optional `state` must be an object with:
 *   - `reset`: "per_run" | "per_variant_run" | "manual" (required once `state` is present)
 *   - `seeded_messages` (optional): array of `{ role: "user" | "assistant", message: non-empty string }`
 *   - `memory_blob` (optional): object
 *
 * @param {unknown} value - Candidate runtime profile definition.
 * @returns {void}
 * @throws {Error} Describing the first failed check.
 */
function validateRuntimeProfileDefinition(value) {
    if (!isObject(value)) {
        throw new Error("Each runtime profile definition in agentlab.config.yaml must be an object.");
    }
    if (typeof value.name !== "string" || value.name.length === 0) {
        throw new Error("Each runtime profile definition must define a non-empty 'name'.");
    }
    if (value.tool_faults !== undefined) {
        if (!Array.isArray(value.tool_faults)) {
            throw new Error(`Runtime profile '${value.name}' field 'tool_faults' must be an array.`);
        }
        for (const fault of value.tool_faults) {
            if (!isObject(fault)) {
                throw new Error(`Runtime profile '${value.name}' contains a non-object tool fault definition.`);
            }
            if (typeof fault.tool !== "string" || fault.tool.length === 0) {
                throw new Error(`Runtime profile '${value.name}' contains a tool fault with a non-empty 'tool' required.`);
            }
            if (fault.mode !== "timeout" && fault.mode !== "error" && fault.mode !== "malformed_output" && fault.mode !== "partial_output") {
                throw new Error(`Runtime profile '${value.name}' uses invalid tool fault mode '${String(fault.mode)}'.`);
            }
            if (fault.error_message !== undefined && (typeof fault.error_message !== "string" || fault.error_message.length === 0)) {
                throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'error_message' must be a non-empty string.`);
            }
            // Number.isFinite is false for non-numbers, NaN and +/-Infinity.
            // A plain `typeof === "number"` check lets NaN through (NaN <= 0
            // is false) as well as Infinity, and YAML can express both
            // (`.nan`, `.inf`).
            if (fault.timeout_ms !== undefined && !(Number.isFinite(fault.timeout_ms) && fault.timeout_ms > 0)) {
                throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'timeout_ms' must be a positive number.`);
            }
            if (fault.partial_output !== undefined && !isObject(fault.partial_output)) {
                throw new Error(`Runtime profile '${value.name}' tool fault for '${fault.tool}' field 'partial_output' must be an object.`);
            }
        }
    }
    if (value.state !== undefined) {
        if (!isObject(value.state)) {
            throw new Error(`Runtime profile '${value.name}' field 'state' must be an object.`);
        }
        // `reset` is mandatory whenever a `state` block is supplied.
        if (value.state.reset !== "per_run" && value.state.reset !== "per_variant_run" && value.state.reset !== "manual") {
            throw new Error(`Runtime profile '${value.name}' field 'state.reset' must be one of 'per_run', 'per_variant_run', or 'manual'.`);
        }
        if (value.state.seeded_messages !== undefined) {
            if (!Array.isArray(value.state.seeded_messages)) {
                throw new Error(`Runtime profile '${value.name}' field 'state.seeded_messages' must be an array.`);
            }
            for (const message of value.state.seeded_messages) {
                if (!isObject(message)) {
                    throw new Error(`Runtime profile '${value.name}' contains a non-object seeded message.`);
                }
                if (message.role !== "user" && message.role !== "assistant") {
                    throw new Error(`Runtime profile '${value.name}' seeded message role must be 'user' or 'assistant'.`);
                }
                if (typeof message.message !== "string" || message.message.length === 0) {
                    throw new Error(`Runtime profile '${value.name}' seeded message must define a non-empty 'message'.`);
                }
            }
        }
        if (value.state.memory_blob !== undefined && !isObject(value.state.memory_blob)) {
            throw new Error(`Runtime profile '${value.name}' field 'state.memory_blob' must be an object.`);
        }
    }
}
304
/**
 * Validates one suite definition, throwing on the first problem found.
 *
 * The definition must be an object with a non-empty string `name` and an
 * object `include`. `exclude` is optional but must be an object when given.
 * The `scenarios`, `tags` and `suites` selectors of both `include` and
 * `exclude` are checked through `validateSuiteSelectorArray`.
 *
 * @param {unknown} value - Candidate suite definition.
 * @returns {void}
 * @throws {Error} Describing the first failed check.
 */
function validateSuiteDefinition(value) {
    if (!isObject(value)) {
        throw new Error("Each suite definition in agentlab.config.yaml must be an object.");
    }
    const { name, include, exclude } = value;
    if (typeof name !== "string" || name.length === 0) {
        throw new Error("Each suite definition must define a non-empty 'name'.");
    }
    if (!isObject(include)) {
        throw new Error(`Suite definition '${name}' must define an object 'include'.`);
    }
    // Selector fields are checked in the same order for include and exclude.
    const selectorFields = ["scenarios", "tags", "suites"];
    for (const field of selectorFields) {
        validateSuiteSelectorArray(include, name, `include.${field}`);
    }
    if (exclude === undefined) {
        return;
    }
    if (!isObject(exclude)) {
        throw new Error(`Suite definition '${name}' field 'exclude' must be an object.`);
    }
    for (const field of selectorFields) {
        validateSuiteSelectorArray(exclude, name, `exclude.${field}`);
    }
}
326
/**
 * Checks that an optional selector field holds an array of strings.
 *
 * @param {object} value - The `include` or `exclude` object being checked.
 * @param {string} suiteName - Suite definition name, used in the error text.
 * @param {string} key - Dotted path such as "include.tags"; the segment after
 *   the first dot is the property read from `value`.
 * @returns {void}
 * @throws {Error} When the property is defined but is not an array of strings.
 */
function validateSuiteSelectorArray(value, suiteName, key) {
    const [, fieldName] = key.split(".");
    const selector = value[fieldName];
    if (selector === undefined) {
        return;
    }
    const isStringArray = Array.isArray(selector) && selector.every((item) => typeof item === "string");
    if (!isStringArray) {
        throw new Error(`Suite definition '${suiteName}' field '${key}' must be an array of strings.`);
    }
}
148
335
  function exists(path) {
149
336
  try {
150
337
  statSync(path);
@@ -13,6 +13,12 @@ function evaluateOne(evaluator, bundle) {
13
13
  return evaluateExactFinalAnswer(evaluator, bundle.run.finalOutput);
14
14
  case "step_count_max":
15
15
  return evaluateStepCountMax(evaluator, bundle.run.totalSteps);
16
+ case "tool_call_count_max":
17
+ return evaluateToolCallCountMax(evaluator, bundle.run.totalToolCalls);
18
+ case "tool_repeat_max":
19
+ return evaluateToolRepeatMax(evaluator, bundle.toolCalls);
20
+ case "cost_max":
21
+ return evaluateCostMax(evaluator, bundle.run.totalCostUsd);
16
22
  default:
17
23
  return {
18
24
  evaluatorId: evaluator.id,
@@ -86,7 +92,8 @@ function evaluateExactFinalAnswer(evaluator, finalOutput) {
86
92
  };
87
93
  }
88
94
  function evaluateStepCountMax(evaluator, stepCount) {
89
- const max = Number(evaluator.config.max_steps ?? 0);
95
+ const rawMax = evaluator.config.max ?? evaluator.config.max_steps;
96
+ const max = Number(rawMax ?? 0);
90
97
  const passed = stepCount <= max;
91
98
  return {
92
99
  evaluatorId: evaluator.id,
@@ -98,6 +105,54 @@ function evaluateStepCountMax(evaluator, stepCount) {
98
105
  message: passed ? `Step count ${stepCount} is within max ${max}.` : `Step count ${stepCount} exceeds max ${max}.`,
99
106
  };
100
107
  }
108
/**
 * Scores a run against an upper bound on its total number of tool calls.
 *
 * The bound is `evaluator.config.max` converted with `Number()`; a missing
 * value is treated as 0. The run passes when `totalToolCalls <= max`.
 *
 * @param {object} evaluator - Evaluator with `id`, `type`, `mode`, `weight`, `config`.
 * @param {number} totalToolCalls - Number of tool calls made during the run.
 * @returns {object} Result record with status "pass"/"fail", rawScore 1/0 and a message.
 */
function evaluateToolCallCountMax(evaluator, totalToolCalls) {
    const limit = Number(evaluator.config.max ?? 0);
    const withinLimit = totalToolCalls <= limit;
    let message;
    if (withinLimit) {
        message = `Tool call count ${totalToolCalls} is within max ${limit}.`;
    }
    else {
        message = `Tool call count ${totalToolCalls} exceeds max ${limit}.`;
    }
    const { id, type, mode, weight } = evaluator;
    return {
        evaluatorId: id,
        evaluatorType: type,
        mode,
        status: withinLimit ? "pass" : "fail",
        weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
123
/**
 * Scores a run against an upper bound on how often one named tool was called.
 *
 * The tool name is `String(evaluator.config.tool ?? "")` and the bound is
 * `Number(evaluator.config.max ?? 0)`. Calls are counted by comparing each
 * entry's `toolName` with the configured name; the run passes when the
 * count is `<= max`.
 *
 * @param {object} evaluator - Evaluator with `id`, `type`, `mode`, `weight`, `config`.
 * @param {Array<{toolName: string}>} toolCalls - Tool calls recorded for the run.
 * @returns {object} Result record with status "pass"/"fail", rawScore 1/0 and a message.
 */
function evaluateToolRepeatMax(evaluator, toolCalls) {
    const toolName = String(evaluator.config.tool ?? "");
    const limit = Number(evaluator.config.max ?? 0);
    const uses = toolCalls.reduce((total, call) => (call.toolName === toolName ? total + 1 : total), 0);
    const withinLimit = uses <= limit;
    let message;
    if (withinLimit) {
        message = `Tool '${toolName}' usage count ${uses} is within max ${limit}.`;
    }
    else {
        message = `Tool '${toolName}' usage count ${uses} exceeds max ${limit}.`;
    }
    const { id, type, mode, weight } = evaluator;
    return {
        evaluatorId: id,
        evaluatorType: type,
        mode,
        status: withinLimit ? "pass" : "fail",
        weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
140
/**
 * Scores a run against an upper bound on its total cost.
 *
 * The bound is `evaluator.config.max_usd` converted with `Number()`; a
 * missing value is treated as 0. A null/undefined `totalCostUsd` is treated
 * as 0. The run passes when the total is `<=` the bound.
 *
 * @param {object} evaluator - Evaluator with `id`, `type`, `mode`, `weight`, `config`.
 * @param {number|null|undefined} totalCostUsd - Recorded total cost of the run.
 * @returns {object} Result record with status "pass"/"fail", rawScore 1/0 and a message.
 */
function evaluateCostMax(evaluator, totalCostUsd) {
    const limitUsd = Number(evaluator.config.max_usd ?? 0);
    const spent = totalCostUsd ?? 0;
    const withinLimit = spent <= limitUsd;
    let message;
    if (withinLimit) {
        message = `Total cost ${spent} is within max ${limitUsd}.`;
    }
    else {
        message = `Total cost ${spent} exceeds max ${limitUsd}.`;
    }
    const { id, type, mode, weight } = evaluator;
    return {
        evaluatorId: id,
        evaluatorType: type,
        mode,
        status: withinLimit ? "pass" : "fail",
        weight,
        rawScore: withinLimit ? 1 : 0,
        message,
    };
}
101
156
  function matches(input, match) {
102
157
  if (!isObject(input)) {
103
158
  return false;