agent-regression-lab 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,8 @@
1
1
  #!/usr/bin/env node
2
+ import packageJson from "../package.json" with { type: "json" };
2
3
  import { createAgentFactory } from "./agent/factory.js";
3
4
  import { getAgentRegistration } from "./config.js";
5
+ import { createSuiteBatchId } from "./lib/id.js";
4
6
  import { getRunErrorDetail } from "./runOutput.js";
5
7
  async function main() {
6
8
  const [, , command, ...args] = process.argv;
@@ -9,27 +11,27 @@ async function main() {
9
11
  case "--help":
10
12
  case "-h":
11
13
  printUsage();
12
- return;
14
+ break;
13
15
  case "version":
14
16
  case "--version":
15
17
  case "-v":
16
18
  printVersion();
17
- return;
19
+ break;
18
20
  case "list":
19
21
  await handleList(args);
20
- return;
22
+ break;
21
23
  case "run":
22
24
  await handleRun(args);
23
- return;
25
+ break;
24
26
  case "show":
25
27
  await handleShow(args);
26
- return;
28
+ break;
27
29
  case "compare":
28
30
  await handleCompare(args);
29
- return;
31
+ break;
30
32
  case "ui":
31
33
  await handleUi();
32
- return;
34
+ break;
33
35
  default:
34
36
  printUsage();
35
37
  }
@@ -37,16 +39,17 @@ async function main() {
37
39
  function printUsage() {
38
40
  console.log(`Usage:
39
41
  agentlab list scenarios
40
- agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
41
- agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
42
+ agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
43
+ agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
42
44
  agentlab show <run-id>
43
45
  agentlab compare <baseline-run-id> <candidate-run-id>
46
+ agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
44
47
  agentlab ui
45
48
  agentlab help
46
49
  agentlab version`);
47
50
  }
48
51
  function printVersion() {
49
- console.log("0.1.0");
52
+ console.log(packageJson.version);
50
53
  }
51
54
  async function handleList(args) {
52
55
  if (args[0] !== "scenarios") {
@@ -64,35 +67,56 @@ async function handleRun(args) {
64
67
  const { loadScenariosBySuite } = await import("./scenarios.js");
65
68
  if (parsed.suite) {
66
69
  const suite = parsed.suite;
67
- if (!suite) {
68
- throw new Error("Missing suite id.");
69
- }
70
70
  const scenarios = loadScenariosBySuite(suite);
71
71
  if (scenarios.length === 0) {
72
72
  throw new Error(`No scenarios found for suite '${suite}'.`);
73
73
  }
74
+ const suiteBatchId = createSuiteBatchId();
74
75
  const runs = [];
75
76
  for (const scenario of scenarios) {
76
- runs.push(await executeOne(scenario.definition.id, runtimeConfig));
77
+ runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
77
78
  }
78
- const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
79
- const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
80
- const errored = runs.filter((bundle) => bundle.run.status === "error").length;
81
- const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
82
- console.log(`Suite: ${suite}`);
83
- console.log(`Passed: ${passed}/${runs.length}`);
84
- console.log(`Failed: ${failed}/${runs.length}`);
85
- console.log(`Errored: ${errored}/${runs.length}`);
86
- console.log(`Average score: ${avgScore}`);
79
+ printSuiteSummary(suite, runs, suiteBatchId);
87
80
  return;
88
81
  }
89
82
  const scenarioId = parsed.scenarioId;
90
83
  if (!scenarioId) {
91
84
  throw new Error("Missing scenario id.");
92
85
  }
93
- await executeOne(scenarioId, runtimeConfig);
86
+ // Detect scenario type to route to the right runner
87
+ const { listScenarioFiles } = await import("./scenarios.js");
88
+ const { parse } = await import("yaml");
89
+ const { readFileSync } = await import("node:fs");
90
+ const { resolve } = await import("node:path");
91
+ let scenarioType = "task";
92
+ for (const filePath of listScenarioFiles()) {
93
+ const raw = readFileSync(resolve(filePath), "utf8");
94
+ const parsedYaml = parse(raw);
95
+ if (parsedYaml.id === scenarioId) {
96
+ scenarioType = parsedYaml.type === "conversation" ? "conversation" : "task";
97
+ break;
98
+ }
99
+ }
100
+ if (scenarioType === "conversation") {
101
+ if (runtimeConfig.provider !== "http") {
102
+ throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
103
+ }
104
+ const httpConfig = {
105
+ name: runtimeConfig.agentName ?? "http-agent",
106
+ provider: "http",
107
+ url: runtimeConfig.url,
108
+ request_template: runtimeConfig.request_template,
109
+ response_field: runtimeConfig.response_field,
110
+ headers: runtimeConfig.headers,
111
+ timeout_ms: runtimeConfig.timeout_ms,
112
+ };
113
+ await executeConversation(scenarioId, httpConfig, runtimeConfig.label);
114
+ }
115
+ else {
116
+ await executeOne(scenarioId, runtimeConfig);
117
+ }
94
118
  }
95
- async function executeOne(scenarioId, runtimeConfig) {
119
+ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
96
120
  const [{ Storage }, { loadToolRegistry, loadToolSpecs }, { loadScenarioById }, { runScenario }] = await Promise.all([
97
121
  import("./storage.js"),
98
122
  import("./tools.js"),
@@ -100,31 +124,119 @@ async function executeOne(scenarioId, runtimeConfig) {
100
124
  import("./runner.js"),
101
125
  ]);
102
126
  const storage = new Storage();
103
- const toolSpecs = await loadToolSpecs();
104
- const toolRegistry = await loadToolRegistry();
105
- const loaded = loadScenarioById(scenarioId);
106
- storage.upsertScenario({
107
- id: loaded.definition.id,
108
- name: loaded.definition.name,
109
- suite: loaded.definition.suite,
110
- difficulty: loaded.definition.difficulty,
111
- description: loaded.definition.description,
112
- }, loaded.definition, loaded.filePath, loaded.fileHash);
113
- const factory = createAgentFactory(runtimeConfig);
114
- const agentVersion = factory.createVersion(runtimeConfig);
115
- storage.upsertAgentVersion(agentVersion);
116
- const bundle = await runScenario({
117
- agentAdapter: factory.createAdapter(),
118
- agentVersion,
119
- scenario: loaded.definition,
120
- scenarioFileHash: loaded.fileHash,
121
- toolSpecs,
122
- tools: toolRegistry,
123
- });
124
- bundle.agentVersion = agentVersion;
125
- storage.saveRun(bundle);
126
- printRunSummary(bundle);
127
- return bundle;
127
+ try {
128
+ const toolSpecs = await loadToolSpecs();
129
+ const toolRegistry = await loadToolRegistry();
130
+ const loaded = loadScenarioById(scenarioId);
131
+ storage.upsertScenario({
132
+ id: loaded.definition.id,
133
+ name: loaded.definition.name,
134
+ suite: loaded.definition.suite,
135
+ difficulty: loaded.definition.difficulty,
136
+ description: loaded.definition.description,
137
+ }, loaded.definition, loaded.filePath, loaded.fileHash);
138
+ const factory = createAgentFactory(runtimeConfig);
139
+ const agentVersion = factory.createVersion(runtimeConfig);
140
+ storage.upsertAgentVersion(agentVersion);
141
+ const bundle = await runScenario({
142
+ agentAdapter: factory.createAdapter(),
143
+ agentVersion,
144
+ scenario: loaded.definition,
145
+ scenarioFileHash: loaded.fileHash,
146
+ toolSpecs,
147
+ tools: toolRegistry,
148
+ });
149
+ bundle.run.suiteBatchId = suiteBatchId;
150
+ bundle.agentVersion = agentVersion;
151
+ storage.saveRun(bundle);
152
+ printRunSummary(bundle);
153
+ return bundle;
154
+ }
155
+ finally {
156
+ storage.close();
157
+ }
158
+ }
159
+ export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
160
+ const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
161
+ import("./storage.js"),
162
+ import("./scenarios.js"),
163
+ import("./conversationRunner.js"),
164
+ import("./lib/id.js"),
165
+ ]);
166
+ const storage = new Storage();
167
+ try {
168
+ const loaded = loadConversationScenarioById(scenarioId);
169
+ storage.upsertScenario({
170
+ id: loaded.definition.id,
171
+ name: loaded.definition.name,
172
+ suite: loaded.definition.suite,
173
+ difficulty: loaded.definition.difficulty,
174
+ description: loaded.definition.description,
175
+ }, loaded.definition, loaded.filePath, loaded.fileHash);
176
+ const agentLabel = label ?? httpConfig.label ?? httpConfig.name;
177
+ const agentConfig = { provider: "http", url: httpConfig.url, agentName: httpConfig.name };
178
+ const agentVersion = {
179
+ id: createAgentVersionId(agentLabel, agentConfig),
180
+ label: agentLabel,
181
+ provider: "http",
182
+ config: agentConfig,
183
+ };
184
+ storage.upsertAgentVersion(agentVersion);
185
+ const bundle = await runConversation({
186
+ httpConfig,
187
+ agentVersion,
188
+ scenario: loaded.definition,
189
+ scenarioFileHash: loaded.fileHash,
190
+ });
191
+ bundle.run.suiteBatchId = suiteBatchId;
192
+ bundle.agentVersion = agentVersion;
193
+ storage.saveRun(bundle);
194
+ printConversationSummary(bundle, httpConfig.url, loaded.definition.steps.length);
195
+ return bundle;
196
+ }
197
+ finally {
198
+ storage.close();
199
+ }
200
+ }
201
+ function printSuiteSummary(suite, runs, suiteBatchId) {
202
+ const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
203
+ const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
204
+ const errored = runs.filter((bundle) => bundle.run.status === "error").length;
205
+ const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
206
+ console.log(`Suite: ${suite}`);
207
+ console.log(`Passed: ${passed}/${runs.length}`);
208
+ console.log(`Failed: ${failed}/${runs.length}`);
209
+ console.log(`Errored: ${errored}/${runs.length}`);
210
+ console.log(`Average score: ${avgScore}`);
211
+ console.log(`Suite batch: ${suiteBatchId}`);
212
+ }
213
+ function printConversationSummary(bundle, agentUrl, totalSteps) {
214
+ const statusLabel = bundle.run.status.toUpperCase();
215
+ console.log(`run ${bundle.run.scenarioId} — ${statusLabel}`);
216
+ console.log(` agent: ${bundle.agentVersion?.label ?? bundle.run.agentVersionId} (${agentUrl})`);
217
+ console.log(` turns completed: ${bundle.run.totalSteps}/${totalSteps}`);
218
+ const stepEvals = bundle.evaluatorResults.filter((r) => r.evaluatorId.startsWith("step_"));
219
+ const stepIndices = new Set(stepEvals.map((r) => {
220
+ const match = r.evaluatorId.match(/^step_(\d+)_/);
221
+ return match ? parseInt(match[1], 10) : -1;
222
+ }));
223
+ for (const stepIndex of [...stepIndices].sort((a, b) => a - b)) {
224
+ const resultsForStep = stepEvals.filter((r) => r.evaluatorId.startsWith(`step_${stepIndex}_`));
225
+ const allPass = resultsForStep.every((r) => r.status === "pass");
226
+ const stepStatus = allPass ? "pass" : "FAIL";
227
+ const details = resultsForStep.map((r) => {
228
+ if (r.evaluatorType === "response_latency_max") {
229
+ const latencyMatch = r.message.match(/(\d+)ms/);
230
+ return latencyMatch ? `latency ${latencyMatch[1]}ms ✓` : r.message;
231
+ }
232
+ return `${r.evaluatorType} ${r.status === "pass" ? "✓" : "✗"}`;
233
+ });
234
+ console.log(` step ${stepIndex + 1}: ${stepStatus}${details.length > 0 ? ` (${details.join(", ")})` : ""}`);
235
+ }
236
+ if (bundle.run.status !== "pass") {
237
+ console.log(` run stopped (${bundle.run.terminationReason})`);
238
+ }
239
+ console.log(` run id: ${bundle.run.id}`);
128
240
  }
129
241
  async function handleUi() {
130
242
  const { startUiServer } = await import("./ui/server.js");
@@ -161,64 +273,122 @@ async function handleShow(args) {
161
273
  }
162
274
  const { Storage } = await import("./storage.js");
163
275
  const storage = new Storage();
164
- const bundle = storage.getRun(runId);
165
- if (!bundle) {
166
- throw new Error(`Run '${runId}' not found.`);
167
- }
168
- console.log(`Run: ${bundle.run.id}`);
169
- console.log(`Scenario: ${bundle.run.scenarioId}`);
170
- console.log(`Status: ${bundle.run.status.toUpperCase()}`);
171
- console.log(`Score: ${bundle.run.score}/100`);
172
- if (bundle.agentVersion) {
173
- console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
174
- console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
175
- if (bundle.agentVersion.command) {
176
- console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
276
+ try {
277
+ const bundle = storage.getRun(runId);
278
+ if (!bundle) {
279
+ throw new Error(`Run '${runId}' not found.`);
280
+ }
281
+ console.log(`Run: ${bundle.run.id}`);
282
+ console.log(`Scenario: ${bundle.run.scenarioId}`);
283
+ console.log(`Status: ${bundle.run.status.toUpperCase()}`);
284
+ console.log(`Score: ${bundle.run.score}/100`);
285
+ if (bundle.agentVersion) {
286
+ console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
287
+ console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
288
+ if (bundle.agentVersion.command) {
289
+ console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
290
+ }
291
+ }
292
+ console.log(`Termination: ${bundle.run.terminationReason}`);
293
+ const errorDetail = getRunErrorDetail(bundle);
294
+ if (errorDetail) {
295
+ console.log(`Error: ${errorDetail}`);
296
+ }
297
+ console.log(`Final output: ${bundle.run.finalOutput}`);
298
+ console.log("Evaluators:");
299
+ for (const result of bundle.evaluatorResults) {
300
+ console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
177
301
  }
178
302
  }
179
- console.log(`Termination: ${bundle.run.terminationReason}`);
180
- const errorDetail = getRunErrorDetail(bundle);
181
- if (errorDetail) {
182
- console.log(`Error: ${errorDetail}`);
183
- }
184
- console.log(`Final output: ${bundle.run.finalOutput}`);
185
- console.log("Evaluators:");
186
- for (const result of bundle.evaluatorResults) {
187
- console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
303
+ finally {
304
+ storage.close();
188
305
  }
189
306
  }
190
307
  async function handleCompare(args) {
191
- const [baselineRunId, candidateRunId] = args;
192
- if (!baselineRunId || !candidateRunId) {
193
- throw new Error("Missing baseline or candidate run id.");
194
- }
308
+ const isSuiteCompare = args[0] === "--suite";
195
309
  const { Storage } = await import("./storage.js");
196
310
  const storage = new Storage();
197
- const comparison = storage.compareRuns(baselineRunId, candidateRunId);
198
- console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
199
- console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
200
- console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
201
- console.log("Changes:");
202
- if (comparison.notes.length === 0) {
203
- console.log("- No material changes.");
204
- }
205
- else {
206
- for (const note of comparison.notes) {
207
- console.log(`- ${note}`);
311
+ try {
312
+ if (isSuiteCompare) {
313
+ const baselineBatchId = args[1];
314
+ const candidateBatchId = args[2];
315
+ if (!baselineBatchId || !candidateBatchId) {
316
+ throw new Error("Missing baseline or candidate suite batch id.");
317
+ }
318
+ const comparison = storage.compareSuites(baselineBatchId, candidateBatchId);
319
+ console.log(`Suite: ${comparison.suite}`);
320
+ console.log(`Baseline batch: ${comparison.baselineBatchId}`);
321
+ console.log(`Candidate batch: ${comparison.candidateBatchId}`);
322
+ console.log(`Classification: ${comparison.classification.toUpperCase()}`);
323
+ console.log(`Pass delta: ${signedMetric(comparison.deltas.pass)}`);
324
+ console.log(`Fail delta: ${signedMetric(comparison.deltas.fail)}`);
325
+ console.log(`Error delta: ${signedMetric(comparison.deltas.error)}`);
326
+ console.log(`Average score delta: ${signedMetric(comparison.deltas.averageScore)}`);
327
+ console.log(`Average runtime delta: ${signedMetric(comparison.deltas.averageRuntimeMs)}ms`);
328
+ console.log(`Average steps delta: ${signedMetric(comparison.deltas.averageSteps)}`);
329
+ if (comparison.notes.length > 0) {
330
+ console.log("Notes:");
331
+ for (const note of comparison.notes) {
332
+ console.log(`- ${note}`);
333
+ }
334
+ }
335
+ if (comparison.regressions.length > 0) {
336
+ console.log("Regressions:");
337
+ for (const regression of comparison.regressions) {
338
+ console.log(`- ${regression.scenarioId}: ${regression.comparison.classification}`);
339
+ }
340
+ }
341
+ if (comparison.improvements.length > 0) {
342
+ console.log("Improvements:");
343
+ for (const improvement of comparison.improvements) {
344
+ console.log(`- ${improvement.scenarioId}: ${improvement.comparison.classification}`);
345
+ }
346
+ }
347
+ if (comparison.missingFromCandidate.length > 0) {
348
+ console.log(`Missing from candidate: ${comparison.missingFromCandidate.join(", ")}`);
349
+ }
350
+ if (comparison.missingFromBaseline.length > 0) {
351
+ console.log(`Missing from baseline: ${comparison.missingFromBaseline.join(", ")}`);
352
+ }
353
+ return;
208
354
  }
209
- }
210
- if (comparison.evaluatorDiffs.length > 0) {
211
- console.log("Evaluator diffs:");
212
- for (const diff of comparison.evaluatorDiffs) {
213
- console.log(`- ${diff.note}`);
355
+ const [baselineRunId, candidateRunId] = args;
356
+ if (!baselineRunId || !candidateRunId) {
357
+ throw new Error("Missing baseline or candidate run id.");
214
358
  }
215
- }
216
- if (comparison.toolDiffs.length > 0) {
217
- console.log("Tool diffs:");
218
- for (const diff of comparison.toolDiffs) {
219
- console.log(`- ${diff.note}`);
359
+ const comparison = storage.compareRuns(baselineRunId, candidateRunId);
360
+ console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
361
+ console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
362
+ console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
363
+ console.log(`Classification: ${comparison.classification.toUpperCase()}`);
364
+ console.log("Changes:");
365
+ if (comparison.notes.length === 0) {
366
+ console.log("- No material changes.");
367
+ }
368
+ else {
369
+ for (const note of comparison.notes) {
370
+ console.log(`- ${note}`);
371
+ }
372
+ }
373
+ if (comparison.evaluatorDiffs.length > 0) {
374
+ console.log("Evaluator diffs:");
375
+ for (const diff of comparison.evaluatorDiffs) {
376
+ console.log(`- ${diff.note}`);
377
+ }
378
+ }
379
+ if (comparison.toolDiffs.length > 0) {
380
+ console.log("Tool diffs:");
381
+ for (const diff of comparison.toolDiffs) {
382
+ console.log(`- ${diff.note}`);
383
+ }
220
384
  }
221
385
  }
386
+ finally {
387
+ storage.close();
388
+ }
389
+ }
390
+ function signedMetric(value) {
391
+ return value > 0 ? `+${value}` : `${value}`;
222
392
  }
223
393
  function parseRunArgs(args) {
224
394
  const runtimeConfig = { provider: "mock" };
@@ -233,7 +403,7 @@ function parseRunArgs(args) {
233
403
  }
234
404
  if (arg === "--provider") {
235
405
  const provider = args[index + 1];
236
- if (provider !== "mock" && provider !== "openai" && provider !== "external_process") {
406
+ if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
237
407
  throw new Error(`Unsupported provider '${String(provider)}'.`);
238
408
  }
239
409
  runtimeConfig.provider = provider;
@@ -267,11 +437,20 @@ function validateRuntimeConfig(config) {
267
437
  if (config.agentName) {
268
438
  const registration = getAgentRegistration(config.agentName);
269
439
  config.provider = registration.provider;
270
- config.model = config.model ?? registration.model;
271
440
  config.label = config.label ?? registration.label ?? registration.name;
272
- config.command = registration.command;
273
- config.args = registration.args;
274
- config.envAllowlist = registration.envAllowlist;
441
+ if (registration.provider !== "http") {
442
+ config.model = config.model ?? registration.model;
443
+ config.command = registration.command;
444
+ config.args = registration.args;
445
+ config.envAllowlist = registration.envAllowlist;
446
+ }
447
+ else {
448
+ config.url = registration.url;
449
+ config.request_template = registration.request_template;
450
+ config.response_field = registration.response_field;
451
+ config.headers = registration.headers;
452
+ config.timeout_ms = registration.timeout_ms;
453
+ }
275
454
  }
276
455
  if (config.provider === "openai") {
277
456
  if (!process.env.OPENAI_API_KEY) {
@@ -288,6 +467,12 @@ function validateRuntimeConfig(config) {
288
467
  }
289
468
  config.label = config.label ?? config.agentName ?? "external-process-agent";
290
469
  }
470
+ if (config.provider === "http") {
471
+ if (!config.url) {
472
+ throw new Error("HTTP agents require a configured url. Use --agent <name> with provider: http in agentlab.config.yaml.");
473
+ }
474
+ config.label = config.label ?? config.agentName ?? "http-agent";
475
+ }
291
476
  return config;
292
477
  }
293
478
  main().catch((error) => {
package/dist/lib/id.js CHANGED
@@ -5,6 +5,9 @@ export function hashText(text) {
5
5
  export function createRunId() {
6
6
  return `run_${Date.now()}`;
7
7
  }
8
+ export function createSuiteBatchId() {
9
+ return `suite_${Date.now()}_${randomUUID().slice(0, 8)}`;
10
+ }
8
11
  export function createEventId() {
9
12
  return `evt_${randomUUID()}`;
10
13
  }
package/dist/scenarios.js CHANGED
@@ -26,15 +26,31 @@ export function listScenarioFiles(root = SCENARIOS_ROOT) {
26
26
  return results.sort();
27
27
  }
28
28
  export function listScenarios() {
29
- return listScenarioFiles().map((filePath) => {
30
- const { definition } = loadScenarioByPath(filePath, getKnownToolNames());
31
- return {
32
- id: definition.id,
33
- name: definition.name,
34
- suite: definition.suite,
35
- difficulty: definition.difficulty,
36
- description: definition.description,
37
- };
29
+ return listScenarioFiles().flatMap((filePath) => {
30
+ try {
31
+ const scenarioType = getScenarioType(filePath);
32
+ if (scenarioType === "conversation") {
33
+ const { definition } = loadConversationScenarioByPath(filePath);
34
+ return [{
35
+ id: definition.id,
36
+ name: definition.name,
37
+ suite: definition.suite,
38
+ difficulty: definition.difficulty,
39
+ description: definition.description,
40
+ }];
41
+ }
42
+ const { definition } = loadScenarioByPath(filePath, getKnownToolNames());
43
+ return [{
44
+ id: definition.id,
45
+ name: definition.name,
46
+ suite: definition.suite,
47
+ difficulty: definition.difficulty,
48
+ description: definition.description,
49
+ }];
50
+ }
51
+ catch {
52
+ return [];
53
+ }
38
54
  });
39
55
  }
40
56
  export function loadScenarioById(scenarioId) {
@@ -48,6 +64,7 @@ export function loadScenarioById(scenarioId) {
48
64
  }
49
65
  export function loadScenariosBySuite(suite) {
50
66
  return listScenarioFiles()
67
+ .filter((filePath) => getScenarioType(filePath) === "task")
51
68
  .map((filePath) => loadScenarioByPath(filePath, getKnownToolNames()))
52
69
  .filter(({ definition }) => definition.suite === suite);
53
70
  }
@@ -153,3 +170,98 @@ function getKnownToolNames() {
153
170
  }
154
171
  return names;
155
172
  }
173
+ export function getScenarioType(filePath) {
174
+ const absolutePath = resolve(filePath);
175
+ const raw = readFileSync(absolutePath, "utf8");
176
+ const parsed = parse(raw);
177
+ if (isObject(parsed) && parsed.type === "conversation") {
178
+ return "conversation";
179
+ }
180
+ return "task";
181
+ }
182
+ export function loadConversationScenarioByPath(filePath) {
183
+ const absolutePath = resolve(filePath);
184
+ const raw = readFileSync(absolutePath, "utf8");
185
+ const parsed = parse(raw);
186
+ validateConversationScenario(parsed, absolutePath);
187
+ return {
188
+ definition: parsed,
189
+ filePath: relative(process.cwd(), absolutePath),
190
+ fileHash: createHash("sha256").update(raw).digest("hex"),
191
+ };
192
+ }
193
+ export function loadConversationScenarioById(scenarioId) {
194
+ for (const filePath of listScenarioFiles()) {
195
+ const absolutePath = resolve(filePath);
196
+ const raw = readFileSync(absolutePath, "utf8");
197
+ const parsed = parse(raw);
198
+ if (parsed.type === "conversation" && parsed.id === scenarioId) {
199
+ return loadConversationScenarioByPath(filePath);
200
+ }
201
+ }
202
+ throw new Error(`Conversation scenario '${scenarioId}' not found.`);
203
+ }
204
+ const VALID_CONVERSATION_EVALUATOR_TYPES = new Set([
205
+ "response_contains",
206
+ "response_not_contains",
207
+ "response_matches_regex",
208
+ "response_latency_max",
209
+ "step_count_max",
210
+ "exact_final_answer",
211
+ "final_answer_contains",
212
+ ]);
213
+ function validateConversationEvaluatorList(evaluators, context, filePath) {
214
+ if (!Array.isArray(evaluators)) {
215
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluators must be an array.`);
216
+ }
217
+ for (let i = 0; i < evaluators.length; i += 1) {
218
+ const ev = evaluators[i];
219
+ if (!isObject(ev)) {
220
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must be an object.`);
221
+ }
222
+ if (typeof ev.type !== "string" || !VALID_CONVERSATION_EVALUATOR_TYPES.has(ev.type)) {
223
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} has invalid type '${String(ev.type)}'. ` +
224
+ `Valid types: ${[...VALID_CONVERSATION_EVALUATOR_TYPES].join(", ")}.`);
225
+ }
226
+ if (ev.mode !== "hard_gate" && ev.mode !== "weighted") {
227
+ throw new Error(`Conversation scenario '${filePath}' ${context} evaluator ${i} must have mode: hard_gate or weighted.`);
228
+ }
229
+ }
230
+ }
231
+ function validateConversationScenario(value, filePath) {
232
+ if (!isObject(value)) {
233
+ throw new Error(`Scenario file '${filePath}' must contain a YAML object.`);
234
+ }
235
+ for (const field of ["id", "name", "suite"]) {
236
+ if (typeof value[field] !== "string" || value[field].length === 0) {
237
+ throw new Error(`Conversation scenario '${filePath}' is missing required string field '${field}'.`);
238
+ }
239
+ }
240
+ if (value.type !== "conversation") {
241
+ throw new Error(`Scenario file '${filePath}' does not have type: conversation.`);
242
+ }
243
+ if ("tools" in value) {
244
+ throw new Error(`Conversation scenario '${filePath}' must not define 'tools'. HTTP agents manage their own tools internally.`);
245
+ }
246
+ if (!Array.isArray(value.steps) || value.steps.length === 0) {
247
+ throw new Error(`Conversation scenario '${filePath}' must define at least one step.`);
248
+ }
249
+ for (let i = 0; i < value.steps.length; i += 1) {
250
+ const step = value.steps[i];
251
+ if (!isObject(step)) {
252
+ throw new Error(`Conversation scenario '${filePath}' step ${i} must be an object.`);
253
+ }
254
+ if (step.role !== "user") {
255
+ throw new Error(`Conversation scenario '${filePath}' step ${i} must have role: user.`);
256
+ }
257
+ if (typeof step.message !== "string" || step.message.length === 0) {
258
+ throw new Error(`Conversation scenario '${filePath}' step ${i} must have a non-empty message.`);
259
+ }
260
+ if (step.evaluators !== undefined) {
261
+ validateConversationEvaluatorList(step.evaluators, `step ${i}`, filePath);
262
+ }
263
+ }
264
+ if (value.evaluators !== undefined) {
265
+ validateConversationEvaluatorList(value.evaluators, "end-of-run evaluators", filePath);
266
+ }
267
+ }