agent-regression-lab 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +78 -11
  2. package/bin/agentlab.js +2 -0
  3. package/dist/agent/factory.js +20 -6
  4. package/dist/agent/httpAdapter.js +5 -4
  5. package/dist/config.js +199 -12
  6. package/dist/evaluators.js +56 -1
  7. package/dist/index.js +157 -11
  8. package/dist/init.js +88 -0
  9. package/dist/lib/id.js +3 -0
  10. package/dist/runOutput.js +46 -0
  11. package/dist/runner.js +31 -9
  12. package/dist/scenarios.js +90 -2
  13. package/dist/scoring.js +2 -2
  14. package/dist/storage.js +117 -7
  15. package/dist/tools.js +56 -2
  16. package/dist/trace.js +4 -2
  17. package/dist/ui/App.js +75 -7
  18. package/dist/ui-assets/client.css +92 -0
  19. package/dist/ui-assets/client.js +183 -19
  20. package/docs/agents.md +143 -8
  21. package/docs/coding-agents.md +74 -0
  22. package/docs/golden-suites.md +74 -0
  23. package/docs/integrations-and-live-services.md +58 -0
  24. package/docs/memory-and-stateful-agents.md +51 -0
  25. package/docs/release-checklist.md +30 -0
  26. package/docs/runtime-profiles.md +67 -0
  27. package/docs/scenarios.md +303 -56
  28. package/docs/superpowers/plans/2026-04-13-phase-2-lite-phase-3-plan.md +160 -0
  29. package/docs/superpowers/plans/2026-04-13-phase-one-npm-tools-plan.md +502 -0
  30. package/docs/superpowers/specs/2026-04-13-phase-2-lite-phase-3-design.md +164 -0
  31. package/docs/tools.md +34 -3
  32. package/docs/troubleshooting.md +193 -0
  33. package/docs/variant-sets.md +63 -0
  34. package/examples/coding-tools/README.md +21 -0
  35. package/examples/coding-tools/index.js +11 -0
  36. package/examples/coding-tools/package.json +8 -0
  37. package/examples/support-tools/README.md +21 -0
  38. package/examples/support-tools/index.js +8 -0
  39. package/examples/support-tools/package.json +8 -0
  40. package/package.json +7 -5
package/dist/index.js CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env node
2
2
  import packageJson from "../package.json" with { type: "json" };
3
+ import { pathToFileURL } from "node:url";
3
4
  import { createAgentFactory } from "./agent/factory.js";
4
- import { getAgentRegistration } from "./config.js";
5
- import { createSuiteBatchId } from "./lib/id.js";
6
- import { getRunErrorDetail } from "./runOutput.js";
5
+ import { getAgentRegistration, getVariantSet } from "./config.js";
6
+ import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
7
+ import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
8
+ import { initProject } from "./init.js";
7
9
  async function main() {
8
10
  const [, , command, ...args] = process.argv;
9
11
  switch (command) {
@@ -32,15 +34,21 @@ async function main() {
32
34
  case "ui":
33
35
  await handleUi();
34
36
  break;
37
+ case "init":
38
+ await handleInit(args);
39
+ break;
35
40
  default:
36
41
  printUsage();
37
42
  }
38
43
  }
39
44
  function printUsage() {
40
45
  console.log(`Usage:
46
+ agentlab init <project-name>
41
47
  agentlab list scenarios
42
48
  agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
43
49
  agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
50
+ agentlab run --suite-def <name> [--agent <name>]
51
+ agentlab run <scenario-id> [--variant-set <name>]
44
52
  agentlab show <run-id>
45
53
  agentlab compare <baseline-run-id> <candidate-run-id>
46
54
  agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
@@ -61,10 +69,25 @@ async function handleList(args) {
61
69
  console.log(`${scenario.id}\t${scenario.suite}\t${scenario.difficulty ?? "-"}\t${scenario.description ?? ""}`);
62
70
  }
63
71
  }
72
+ async function handleInit(args) {
73
+ const projectName = args[0];
74
+ if (!projectName) {
75
+ console.error("Error: project-name is required.");
76
+ console.error("Usage: agentlab init <project-name>");
77
+ process.exit(1);
78
+ }
79
+ await initProject(projectName);
80
+ }
64
81
  async function handleRun(args) {
65
82
  const parsed = parseRunArgs(args);
66
83
  const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
67
- const { loadScenariosBySuite } = await import("./scenarios.js");
84
+ const { loadScenariosBySuite, loadScenariosBySuiteDefinition } = await import("./scenarios.js");
85
+ if (parsed.suite && parsed.suiteDefinition) {
86
+ throw new Error("--suite and --suite-def cannot be used together.");
87
+ }
88
+ if (parsed.runtimeConfig.agentName && parsed.variantSetName) {
89
+ throw new Error("--agent and --variant-set cannot be used together.");
90
+ }
68
91
  if (parsed.suite) {
69
92
  const suite = parsed.suite;
70
93
  const scenarios = loadScenariosBySuite(suite);
@@ -73,16 +96,53 @@ async function handleRun(args) {
73
96
  }
74
97
  const suiteBatchId = createSuiteBatchId();
75
98
  const runs = [];
76
- for (const scenario of scenarios) {
77
- runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
99
+ if (parsed.variantSetName) {
100
+ console.log(`Variant set: ${parsed.variantSetName}`);
101
+ for (const scenario of scenarios) {
102
+ runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId));
103
+ }
104
+ }
105
+ else {
106
+ for (const scenario of scenarios) {
107
+ runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
108
+ }
78
109
  }
79
110
  printSuiteSummary(suite, runs, suiteBatchId);
80
111
  return;
81
112
  }
113
+ if (parsed.suiteDefinition) {
114
+ const suiteDefinition = parsed.suiteDefinition;
115
+ const scenarios = loadScenariosBySuiteDefinition(suiteDefinition);
116
+ if (scenarios.length === 0) {
117
+ throw new Error(`No scenarios found for suite definition '${suiteDefinition}'.`);
118
+ }
119
+ const suiteBatchId = createSuiteBatchId();
120
+ const runs = [];
121
+ console.log(`Suite definition: ${suiteDefinition}`);
122
+ if (parsed.variantSetName) {
123
+ console.log(`Variant set: ${parsed.variantSetName}`);
124
+ for (const scenario of scenarios) {
125
+ runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId, suiteDefinition));
126
+ }
127
+ }
128
+ else {
129
+ const suiteRuntimeConfig = { ...runtimeConfig, suiteDefinitionName: suiteDefinition };
130
+ for (const scenario of scenarios) {
131
+ runs.push(await executeOne(scenario.definition.id, suiteRuntimeConfig, suiteBatchId));
132
+ }
133
+ }
134
+ printSuiteSummary(suiteDefinition, runs, suiteBatchId);
135
+ return;
136
+ }
82
137
  const scenarioId = parsed.scenarioId;
83
138
  if (!scenarioId) {
84
139
  throw new Error("Missing scenario id.");
85
140
  }
141
+ if (parsed.variantSetName) {
142
+ console.log(`Variant set: ${parsed.variantSetName}`);
143
+ await executeVariantSetScenario(scenarioId, parsed.variantSetName);
144
+ return;
145
+ }
86
146
  // Detect scenario type to route to the right runner
87
147
  const { listScenarioFiles } = await import("./scenarios.js");
88
148
  const { parse } = await import("yaml");
@@ -97,6 +157,12 @@ async function handleRun(args) {
97
157
  break;
98
158
  }
99
159
  }
160
+ if (scenarioType === "task" && runtimeConfig.provider === "http") {
161
+ throw new Error(`Scenario '${scenarioId}' is a task scenario. HTTP agents (provider: http) only work with ` +
162
+ `type: conversation scenarios.\n` +
163
+ `To test an HTTP agent, create a conversation scenario (type: conversation) — ` +
164
+ `conversation scenarios do not use a tools: block. See docs/scenarios.md for the format.`);
165
+ }
100
166
  if (scenarioType === "conversation") {
101
167
  if (runtimeConfig.provider !== "http") {
102
168
  throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
@@ -147,6 +213,15 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
147
213
  tools: toolRegistry,
148
214
  });
149
215
  bundle.run.suiteBatchId = suiteBatchId;
216
+ bundle.run.variantSetName = agentVersion.variantSetName;
217
+ bundle.run.variantLabel = agentVersion.variantLabel;
218
+ bundle.run.promptVersion = agentVersion.promptVersion;
219
+ bundle.run.modelVersion = agentVersion.modelVersion;
220
+ bundle.run.toolSchemaVersion = agentVersion.toolSchemaVersion;
221
+ bundle.run.configLabel = agentVersion.configLabel;
222
+ bundle.run.configHash = agentVersion.configHash;
223
+ bundle.run.runtimeProfileName = loaded.definition.runtime_profile;
224
+ bundle.run.suiteDefinitionName = runtimeConfig.suiteDefinitionName;
150
225
  bundle.agentVersion = agentVersion;
151
226
  storage.saveRun(bundle);
152
227
  printRunSummary(bundle);
@@ -156,6 +231,45 @@ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
156
231
  storage.close();
157
232
  }
158
233
  }
234
+ export async function executeVariantSetScenario(scenarioId, variantSetName, suiteBatchId, suiteDefinitionName) {
235
+ const variantSet = getVariantSet(variantSetName);
236
+ const runs = [];
237
+ for (const variant of variantSet.variants) {
238
+ const registration = getAgentRegistration(variant.agent);
239
+ const runtimeConfig = buildVariantRuntimeConfig(registration, variantSet.name, variant, suiteDefinitionName);
240
+ runs.push(await executeOne(scenarioId, runtimeConfig, suiteBatchId));
241
+ }
242
+ return runs;
243
+ }
244
+ function buildVariantRuntimeConfig(registration, variantSetName, variant, suiteDefinitionName) {
245
+ const runtimeConfig = {
246
+ ...registration,
247
+ agentName: registration.name,
248
+ label: registration.label ?? variant.label,
249
+ variantSetName,
250
+ variantLabel: variant.label,
251
+ promptVersion: variant.prompt_version,
252
+ modelVersion: variant.model_version,
253
+ toolSchemaVersion: variant.tool_schema_version,
254
+ configLabel: variant.config_label,
255
+ suiteDefinitionName,
256
+ };
257
+ runtimeConfig.configHash = createConfigHash({
258
+ provider: runtimeConfig.provider,
259
+ agentName: runtimeConfig.agentName,
260
+ label: runtimeConfig.label,
261
+ model: runtimeConfig.model,
262
+ command: runtimeConfig.command,
263
+ args: runtimeConfig.args ?? [],
264
+ variantSetName,
265
+ variantLabel: variant.label,
266
+ promptVersion: variant.prompt_version,
267
+ modelVersion: variant.model_version,
268
+ toolSchemaVersion: variant.tool_schema_version,
269
+ configLabel: variant.config_label,
270
+ });
271
+ return runtimeConfig;
272
+ }
159
273
  export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
160
274
  const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
161
275
  import("./storage.js"),
@@ -257,6 +371,9 @@ function printRunSummary(bundle) {
257
371
  if (bundle.agentVersion?.command) {
258
372
  console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
259
373
  }
374
+ for (const line of formatRunIdentityLines(bundle)) {
375
+ console.log(line);
376
+ }
260
377
  console.log(`Runtime: ${bundle.run.durationMs}ms`);
261
378
  if (bundle.run.status !== "pass") {
262
379
  console.log(`Reason: ${bundle.run.terminationReason}`);
@@ -264,6 +381,13 @@ function printRunSummary(bundle) {
264
381
  if (errorDetail) {
265
382
  console.log(`Error: ${errorDetail}`);
266
383
  }
384
+ const failedEvaluators = getFailedEvaluatorSummaries(bundle);
385
+ if (failedEvaluators.length > 0) {
386
+ console.log("Failed evaluators:");
387
+ for (const summary of failedEvaluators) {
388
+ console.log(`- ${summary}`);
389
+ }
390
+ }
267
391
  }
268
392
  }
269
393
  async function handleShow(args) {
@@ -394,6 +518,8 @@ function parseRunArgs(args) {
394
518
  const runtimeConfig = { provider: "mock" };
395
519
  let scenarioId;
396
520
  let suite;
521
+ let suiteDefinition;
522
+ let variantSetName;
397
523
  for (let index = 0; index < args.length; index += 1) {
398
524
  const arg = args[index];
399
525
  if (arg === "--suite") {
@@ -401,6 +527,16 @@ function parseRunArgs(args) {
401
527
  index += 1;
402
528
  continue;
403
529
  }
530
+ if (arg === "--suite-def") {
531
+ suiteDefinition = args[index + 1];
532
+ index += 1;
533
+ continue;
534
+ }
535
+ if (arg === "--variant-set") {
536
+ variantSetName = args[index + 1];
537
+ index += 1;
538
+ continue;
539
+ }
404
540
  if (arg === "--provider") {
405
541
  const provider = args[index + 1];
406
542
  if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
@@ -431,7 +567,7 @@ function parseRunArgs(args) {
431
567
  }
432
568
  throw new Error(`Unexpected argument '${arg}'.`);
433
569
  }
434
- return { scenarioId, suite, runtimeConfig };
570
+ return { scenarioId, suite, suiteDefinition, variantSetName, runtimeConfig };
435
571
  }
436
572
  function validateRuntimeConfig(config) {
437
573
  if (config.agentName) {
@@ -475,7 +611,17 @@ function validateRuntimeConfig(config) {
475
611
  }
476
612
  return config;
477
613
  }
478
- main().catch((error) => {
479
- console.error(error instanceof Error ? error.message : String(error));
480
- process.exitCode = 1;
481
- });
614
+ if (isEntrypoint()) {
615
+ main().catch((error) => {
616
+ const message = error instanceof Error ? error.message : String(error);
617
+ console.error(formatCliErrorMessage(message));
618
+ process.exitCode = 1;
619
+ });
620
+ }
621
+ function isEntrypoint() {
622
+ const entry = process.argv[1];
623
+ if (!entry) {
624
+ return false;
625
+ }
626
+ return import.meta.url === pathToFileURL(entry).href;
627
+ }
package/dist/init.js ADDED
@@ -0,0 +1,88 @@
1
+ import { existsSync, mkdirSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ const SAMPLE_SCENARIO = `id: sample.hello-world
4
+ name: Hello World Sample
5
+ suite: sample
6
+ description: A minimal example to verify your setup.
7
+ difficulty: easy
8
+ tags:
9
+ - smoke
10
+ - sample
11
+ task:
12
+ instructions: |
13
+ Say hello to the user and confirm the system is working.
14
+ context:
15
+ user_name: Alice
16
+ tools:
17
+ allowed: []
18
+ runtime:
19
+ max_steps: 5
20
+ evaluators:
21
+ - id: greeting-output
22
+ type: final_answer_contains
23
+ mode: hard_gate
24
+ config:
25
+ required_substrings:
26
+ - "Hello"
27
+ `;
28
+ const SAMPLE_FIXTURE = `{
29
+ "users": [
30
+ { "id": "user_001", "name": "Alice", "email": "alice@example.com" }
31
+ ]
32
+ }
33
+ `;
34
+ const SAMPLE_CONFIG = `# Agent Regression Lab Configuration
35
+ # Docs: https://github.com/YakshithK/agent-regression-lab#readme
36
+
37
+ agents:
38
+ - name: mock-default
39
+ provider: mock
40
+ label: mock-default
41
+
42
+ # Uncomment and configure to test with OpenAI:
43
+ # - name: openai-test
44
+ # provider: openai
45
+ # model: gpt-4o-mini
46
+ # label: openai-test
47
+
48
+ # Tools can be registered from either:
49
+ # 1. repo-local files
50
+ # 2. installed npm packages
51
+ #
52
+ # tools:
53
+ # - name: my.local_tool
54
+ # modulePath: ./tools/customTool.ts
55
+ # exportName: customTool
56
+ # description: My repo-local custom tool.
57
+ # inputSchema:
58
+ # type: object
59
+ #
60
+ # - name: support.find_duplicate_charge
61
+ # package: "@agentlab/example-support-tools"
62
+ # exportName: findDuplicateCharge
63
+ # description: Find the duplicated charge order id for a given customer.
64
+ # inputSchema:
65
+ # type: object
66
+ `;
67
+ export async function initProject(projectName) {
68
+ const targetDir = join(process.cwd(), projectName);
69
+ if (existsSync(targetDir)) {
70
+ throw new Error(`Directory '${projectName}' already exists.`);
71
+ }
72
+ // Create directory structure
73
+ mkdirSync(targetDir, { recursive: true });
74
+ mkdirSync(join(targetDir, "scenarios"), { recursive: true });
75
+ mkdirSync(join(targetDir, "scenarios", "sample"), { recursive: true });
76
+ mkdirSync(join(targetDir, "fixtures"), { recursive: true });
77
+ // Write files
78
+ writeFileSync(join(targetDir, "scenarios", "sample", "hello-world.yaml"), SAMPLE_SCENARIO);
79
+ writeFileSync(join(targetDir, "fixtures", "users.json"), SAMPLE_FIXTURE);
80
+ writeFileSync(join(targetDir, "agentlab.config.yaml"), SAMPLE_CONFIG);
81
+ console.log(`Created '${projectName}' with sample scenario.`);
82
+ console.log("");
83
+ console.log("Next steps:");
84
+ console.log(` cd ${projectName}`);
85
+ console.log(" npm install @agentlab/example-support-tools");
86
+ console.log(" # then register package-backed tools in agentlab.config.yaml if needed");
87
+ console.log(" agentlab run sample.hello-world --agent mock-default");
88
+ }
package/dist/lib/id.js CHANGED
@@ -17,3 +17,6 @@ export function createToolCallId() {
17
17
  export function createAgentVersionId(label, config) {
18
18
  return `agent_${hashText(`${label}:${JSON.stringify(config)}`).slice(0, 12)}`;
19
19
  }
20
+ export function createConfigHash(input) {
21
+ return createAgentVersionId("config", input);
22
+ }
package/dist/runOutput.js CHANGED
@@ -1,5 +1,11 @@
1
1
  export function getRunErrorDetail(bundle) {
2
2
  for (const event of [...bundle.traceEvents].reverse()) {
3
+ if (event.type === "conversation_finished") {
4
+ const errorMessage = event.payload.errorMessage;
5
+ if (typeof errorMessage === "string") {
6
+ return errorMessage;
7
+ }
8
+ }
3
9
  if (event.type === "agent_error") {
4
10
  const message = event.payload.message;
5
11
  return typeof message === "string" ? message : undefined;
@@ -11,3 +17,43 @@ export function getRunErrorDetail(bundle) {
11
17
  }
12
18
  return undefined;
13
19
  }
20
+ export function formatCliErrorMessage(message) {
21
+ if (message.includes("database is locked")) {
22
+ return "SQLite database is locked. Retry the run sequentially or wait for the current run to finish.";
23
+ }
24
+ return message;
25
+ }
26
+ export function getFailedEvaluatorSummaries(bundle) {
27
+ return bundle.evaluatorResults
28
+ .filter((result) => result.status === "fail")
29
+ .map((result) => `${result.evaluatorId}: ${result.message}`);
30
+ }
31
+ export function formatRunIdentityLines(bundle) {
32
+ const lines = [];
33
+ const run = bundle.run;
34
+ if (run.variantSetName) {
35
+ lines.push(`Variant set: ${run.variantSetName}`);
36
+ }
37
+ if (run.variantLabel) {
38
+ lines.push(`Variant: ${run.variantLabel}`);
39
+ }
40
+ if (run.promptVersion) {
41
+ lines.push(`Prompt version: ${run.promptVersion}`);
42
+ }
43
+ if (run.modelVersion) {
44
+ lines.push(`Model version: ${run.modelVersion}`);
45
+ }
46
+ if (run.toolSchemaVersion) {
47
+ lines.push(`Tool schema version: ${run.toolSchemaVersion}`);
48
+ }
49
+ if (run.configLabel) {
50
+ lines.push(`Config label: ${run.configLabel}`);
51
+ }
52
+ if (run.runtimeProfileName) {
53
+ lines.push(`Runtime profile: ${run.runtimeProfileName}`);
54
+ }
55
+ if (run.suiteDefinitionName) {
56
+ lines.push(`Suite definition: ${run.suiteDefinitionName}`);
57
+ }
58
+ return lines;
59
+ }
package/dist/runner.js CHANGED
@@ -1,7 +1,9 @@
1
1
  import { performance } from "node:perf_hooks";
2
+ import { getRuntimeProfile } from "./config.js";
2
3
  import { createToolCallId, createRunId } from "./lib/id.js";
3
4
  import { evaluateScenario } from "./evaluators.js";
4
5
  import { computeScore } from "./scoring.js";
6
+ import { applyRuntimeProfileToTools } from "./tools.js";
5
7
  import { TraceRecorder } from "./trace.js";
6
8
  export async function runScenario(deps) {
7
9
  const runId = createRunId();
@@ -9,6 +11,8 @@ export async function runScenario(deps) {
9
11
  const runStart = performance.now();
10
12
  const trace = new TraceRecorder(runId, deps.scenario.id);
11
13
  const toolCalls = [];
14
+ const runtimeProfile = deps.scenario.runtime_profile ? getRuntimeProfile(deps.scenario.runtime_profile) : undefined;
15
+ const tools = applyRuntimeProfileToTools(deps.tools, runtimeProfile, trace);
12
16
  const maxSteps = deps.scenario.runtime?.max_steps ?? 8;
13
17
  const timeoutSeconds = deps.scenario.runtime?.timeout_seconds;
14
18
  const deadline = timeoutSeconds ? Date.now() + timeoutSeconds * 1000 : undefined;
@@ -22,6 +26,9 @@ export async function runScenario(deps) {
22
26
  maxSteps,
23
27
  timeoutSeconds,
24
28
  });
29
+ trace.record("system", "runtime_profile_applied", {
30
+ name: runtimeProfile?.name ?? null,
31
+ }, { countStep: false });
25
32
  const availableTools = deps.toolSpecs.filter((tool) => deps.scenario.tools.allowed.includes(tool.name));
26
33
  const session = await deps.agentAdapter.startRun({
27
34
  instructions: deps.scenario.task.instructions,
@@ -72,7 +79,7 @@ export async function runScenario(deps) {
72
79
  trace.record("runner", "forbidden_tool_attempted", { toolName });
73
80
  break;
74
81
  }
75
- const handler = deps.tools[toolName];
82
+ const handler = tools[toolName];
76
83
  if (!handler) {
77
84
  status = "error";
78
85
  terminationReason = "tool_error";
@@ -98,7 +105,8 @@ export async function runScenario(deps) {
98
105
  }
99
106
  catch (error) {
100
107
  const message = error instanceof Error ? error.message : String(error);
101
- if (deadline && Date.now() >= deadline) {
108
+ const isInjectedTimeout = error instanceof Error && error.code === "timeout_exceeded";
109
+ if (isInjectedTimeout || (deadline && Date.now() >= deadline)) {
102
110
  status = "error";
103
111
  terminationReason = "timeout_exceeded";
104
112
  trace.record("runner", "timeout_exceeded", { timeoutSeconds, message });
@@ -182,18 +190,32 @@ export async function runScenario(deps) {
182
190
  function hasTimedOut(deadline) {
183
191
  return deadline !== undefined && Date.now() >= deadline;
184
192
  }
193
+ function toolRaceTimeoutError(message) {
194
+ const error = new Error(message);
195
+ error.code = "timeout_exceeded";
196
+ return error;
197
+ }
185
198
  async function raceWithTimeout(promise, deadline, message) {
186
199
  if (deadline === undefined) {
187
200
  return promise;
188
201
  }
189
202
  const remainingMs = deadline - Date.now();
190
203
  if (remainingMs <= 0) {
191
- throw new Error(message);
204
+ throw toolRaceTimeoutError(message);
205
+ }
206
+ let timeoutHandle;
207
+ try {
208
+ return await Promise.race([
209
+ promise,
210
+ new Promise((_, reject) => {
211
+ timeoutHandle = setTimeout(() => reject(toolRaceTimeoutError(message)), remainingMs);
212
+ timeoutHandle.unref?.();
213
+ }),
214
+ ]);
215
+ }
216
+ finally {
217
+ if (timeoutHandle !== undefined) {
218
+ clearTimeout(timeoutHandle);
219
+ }
192
220
  }
193
- return await Promise.race([
194
- promise,
195
- new Promise((_, reject) => {
196
- setTimeout(() => reject(new Error(message)), remainingMs);
197
- }),
198
- ]);
199
221
  }
package/dist/scenarios.js CHANGED
@@ -2,9 +2,20 @@ import { readFileSync, readdirSync, statSync } from "node:fs";
2
2
  import { createHash } from "node:crypto";
3
3
  import { join, relative, resolve } from "node:path";
4
4
  import { parse } from "yaml";
5
- import { loadAgentLabConfig } from "./config.js";
5
+ import { getRuntimeProfile, getSuiteDefinition, loadAgentLabConfig } from "./config.js";
6
6
  import { getBuiltinToolSpecs } from "./tools.js";
7
7
  const SCENARIOS_ROOT = resolve("scenarios");
8
+ const VALID_TASK_EVALUATOR_TYPES = new Set([
9
+ "exact_final_answer",
10
+ "final_answer_contains",
11
+ "forbidden_tool",
12
+ "tool_call_assertion",
13
+ "step_count_max",
14
+ "tool_call_count_max",
15
+ "tool_repeat_max",
16
+ "cost_max",
17
+ ]);
18
+ const VALID_EVALUATOR_MODES = new Set(["hard_gate", "weighted"]);
8
19
  export function listScenarioFiles(root = SCENARIOS_ROOT) {
9
20
  if (!safeExists(root)) {
10
21
  return [];
@@ -55,6 +66,8 @@ export function listScenarios() {
55
66
  }
56
67
  export function loadScenarioById(scenarioId) {
57
68
  for (const filePath of listScenarioFiles()) {
69
+ if (getScenarioType(filePath) !== "task")
70
+ continue;
58
71
  const loaded = loadScenarioByPath(filePath, getKnownToolNames());
59
72
  if (loaded.definition.id === scenarioId) {
60
73
  return loaded;
@@ -68,6 +81,20 @@ export function loadScenariosBySuite(suite) {
68
81
  .map((filePath) => loadScenarioByPath(filePath, getKnownToolNames()))
69
82
  .filter(({ definition }) => definition.suite === suite);
70
83
  }
84
+ export function loadScenariosBySuiteDefinition(name) {
85
+ const suiteDefinition = getSuiteDefinition(name);
86
+ const knownToolNames = getKnownToolNames();
87
+ const scenarioFiles = listScenarioFiles(resolve("scenarios"));
88
+ const loadedScenarios = scenarioFiles.map((filePath) => loadScenarioRecordByPath(filePath, knownToolNames));
89
+ const included = loadedScenarios
90
+ .filter(({ definition }) => matchesSuiteDefinitionInclude(definition, suiteDefinition));
91
+ const excludedIds = new Set(loadedScenarios
92
+ .filter(({ definition }) => matchesSuiteDefinitionExclude(definition, suiteDefinition))
93
+ .map(({ definition }) => definition.id));
94
+ return included
95
+ .filter(({ definition }) => !excludedIds.has(definition.id))
96
+ .sort((left, right) => left.definition.id.localeCompare(right.definition.id));
97
+ }
71
98
  export function loadScenarioByPath(filePath, knownToolNames = getKnownToolNames()) {
72
99
  const absolutePath = resolve(filePath);
73
100
  const raw = readFileSync(absolutePath, "utf8");
@@ -107,7 +134,10 @@ function validateScenario(value, filePath, knownToolNames) {
107
134
  throw new Error(`Scenario file '${filePath}' references unknown allowed tool '${toolName}'.`);
108
135
  }
109
136
  }
110
- if (Array.isArray(value.tools.forbidden)) {
137
+ if (value.tools.forbidden !== undefined) {
138
+ if (!Array.isArray(value.tools.forbidden)) {
139
+ throw new Error(`Scenario file '${filePath}' field 'tools.forbidden' must be an array of strings.`);
140
+ }
111
141
  for (const toolName of value.tools.forbidden) {
112
142
  if (typeof toolName !== "string") {
113
143
  throw new Error(`Scenario file '${filePath}' contains a non-string tool name in tools.forbidden.`);
@@ -122,6 +152,17 @@ function validateScenario(value, filePath, knownToolNames) {
122
152
  if (!isObject(evaluator) || typeof evaluator.id !== "string" || typeof evaluator.type !== "string") {
123
153
  throw new Error(`Scenario file '${filePath}' has an invalid evaluator entry.`);
124
154
  }
155
+ if (!VALID_TASK_EVALUATOR_TYPES.has(evaluator.type)) {
156
+ throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid type '${evaluator.type}'. ` +
157
+ `Valid types: ${[...VALID_TASK_EVALUATOR_TYPES].join(", ")}.`);
158
+ }
159
+ if (!VALID_EVALUATOR_MODES.has(evaluator.mode)) {
160
+ throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' has invalid mode '${String(evaluator.mode)}'. ` +
161
+ `Valid modes: hard_gate, weighted.`);
162
+ }
163
+ if (!isObject(evaluator.config)) {
164
+ throw new Error(`Scenario file '${filePath}' evaluator '${evaluator.id}' must define an object config.`);
165
+ }
125
166
  if (evaluatorIds.has(evaluator.id)) {
126
167
  throw new Error(`Scenario file '${filePath}' defines duplicate evaluator id '${evaluator.id}'.`);
127
168
  }
@@ -131,6 +172,12 @@ function validateScenario(value, filePath, knownToolNames) {
131
172
  validatePositiveInt(value.runtime.max_steps, "runtime.max_steps", filePath);
132
173
  validatePositiveInt(value.runtime.timeout_seconds, "runtime.timeout_seconds", filePath);
133
174
  }
175
+ if (value.runtime_profile !== undefined) {
176
+ if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
177
+ throw new Error(`Scenario file '${filePath}' field 'runtime_profile' must be a non-empty string.`);
178
+ }
179
+ getRuntimeProfile(value.runtime_profile);
180
+ }
134
181
  if (isObject(value.context) && Array.isArray(value.context.fixtures)) {
135
182
  for (const fixturePath of value.context.fixtures) {
136
183
  if (typeof fixturePath !== "string") {
@@ -226,6 +273,17 @@ function validateConversationEvaluatorList(evaluators, context, filePath) {
226
273
  if (ev.mode !== "hard_gate" && ev.mode !== "weighted") {
227
274
  throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must have mode: hard_gate or weighted.`);
228
275
  }
276
+ if (ev.type === "response_contains" || ev.type === "response_not_contains") {
277
+ if (!isObject(ev.config)) {
278
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define an object config.`);
279
+ }
280
+ if ("text" in ev.config) {
281
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} uses stale 'config.text'; use 'config.keywords: string[]'.`);
282
+ }
283
+ if (!Array.isArray(ev.config.keywords) || ev.config.keywords.some((kw) => typeof kw !== "string")) {
284
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must define config.keywords as a string array.`);
285
+ }
286
+ }
229
287
  }
230
288
  }
231
289
  function validateConversationScenario(value, filePath) {
@@ -240,6 +298,12 @@ function validateConversationScenario(value, filePath) {
240
298
  if (value.type !== "conversation") {
241
299
  throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
242
300
  }
301
+ if (value.runtime_profile !== undefined) {
302
+ if (typeof value.runtime_profile !== "string" || value.runtime_profile.length === 0) {
303
+ throw new Error(`Conversation scenario '${filePath}' field 'runtime_profile' must be a non-empty string.`);
304
+ }
305
+ getRuntimeProfile(value.runtime_profile);
306
+ }
243
307
  if ("tools" in value) {
244
308
  throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
245
309
  }
@@ -265,3 +329,27 @@ function validateConversationScenario(value, filePath) {
265
329
  validateConversationEvaluatorList(value.evaluators, "end-of-run evaluators", filePath);
266
330
  }
267
331
  }
332
+ function loadScenarioRecordByPath(filePath, knownToolNames = getKnownToolNames()) {
333
+ if (getScenarioType(filePath) === "conversation") {
334
+ return loadConversationScenarioByPath(filePath);
335
+ }
336
+ return loadScenarioByPath(filePath, knownToolNames);
337
+ }
338
+ function matchesSuiteDefinitionInclude(definition, suiteDefinition) {
339
+ return matchesSuiteDefinitionSelectors(definition, suiteDefinition.include);
340
+ }
341
+ function matchesSuiteDefinitionExclude(definition, suiteDefinition) {
342
+ return suiteDefinition.exclude !== undefined && matchesSuiteDefinitionSelectors(definition, suiteDefinition.exclude);
343
+ }
344
+ function matchesSuiteDefinitionSelectors(definition, selectors) {
345
+ if (selectors.scenarios?.includes(definition.id)) {
346
+ return true;
347
+ }
348
+ if (selectors.tags?.some((tag) => definition.tags?.includes(tag) ?? false)) {
349
+ return true;
350
+ }
351
+ if (selectors.suites?.includes(definition.suite)) {
352
+ return true;
353
+ }
354
+ return false;
355
+ }
package/dist/scoring.js CHANGED
@@ -4,9 +4,9 @@ export function computeScore(results) {
4
4
  const weighted = results.filter((result) => result.mode === "weighted");
5
5
  let score = 100;
6
6
  if (weighted.length > 0) {
7
- const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 0), 0);
7
+ const totalWeight = weighted.reduce((sum, result) => sum + (result.weight ?? 1), 0);
8
8
  const earnedWeight = weighted.reduce((sum, result) => {
9
- const weight = result.weight ?? 0;
9
+ const weight = result.weight ?? 1;
10
10
  return sum + (result.status === "pass" ? weight : 0);
11
11
  }, 0);
12
12
  score = totalWeight === 0 ? 100 : Math.round((earnedWeight / totalWeight) * 100);