agent-regression-lab 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,10 @@
1
1
  #!/usr/bin/env node
2
+ import packageJson from "../package.json" with { type: "json" };
3
+ import { pathToFileURL } from "node:url";
2
4
  import { createAgentFactory } from "./agent/factory.js";
3
- import { getAgentRegistration } from "./config.js";
4
- import { getRunErrorDetail } from "./runOutput.js";
5
+ import { getAgentRegistration, getVariantSet } from "./config.js";
6
+ import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
7
+ import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
5
8
  async function main() {
6
9
  const [, , command, ...args] = process.argv;
7
10
  switch (command) {
@@ -9,27 +12,27 @@ async function main() {
9
12
  case "--help":
10
13
  case "-h":
11
14
  printUsage();
12
- return;
15
+ break;
13
16
  case "version":
14
17
  case "--version":
15
18
  case "-v":
16
19
  printVersion();
17
- return;
20
+ break;
18
21
  case "list":
19
22
  await handleList(args);
20
- return;
23
+ break;
21
24
  case "run":
22
25
  await handleRun(args);
23
- return;
26
+ break;
24
27
  case "show":
25
28
  await handleShow(args);
26
- return;
29
+ break;
27
30
  case "compare":
28
31
  await handleCompare(args);
29
- return;
32
+ break;
30
33
  case "ui":
31
34
  await handleUi();
32
- return;
35
+ break;
33
36
  default:
34
37
  printUsage();
35
38
  }
@@ -37,16 +40,19 @@ async function main() {
37
40
  function printUsage() {
38
41
  console.log(`Usage:
39
42
  agentlab list scenarios
40
- agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
41
- agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
43
+ agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
44
+ agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
45
+ agentlab run --suite-def <name> [--agent <name>]
46
+ agentlab run <scenario-id> [--variant-set <name>]
42
47
  agentlab show <run-id>
43
48
  agentlab compare <baseline-run-id> <candidate-run-id>
49
+ agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
44
50
  agentlab ui
45
51
  agentlab help
46
52
  agentlab version`);
47
53
  }
48
54
  function printVersion() {
49
- console.log("0.1.0");
55
+ console.log(packageJson.version);
50
56
  }
51
57
  async function handleList(args) {
52
58
  if (args[0] !== "scenarios") {
@@ -61,38 +67,108 @@ async function handleList(args) {
61
67
  async function handleRun(args) {
62
68
  const parsed = parseRunArgs(args);
63
69
  const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
64
- const { loadScenariosBySuite } = await import("./scenarios.js");
70
+ const { loadScenariosBySuite, loadScenariosBySuiteDefinition } = await import("./scenarios.js");
71
+ if (parsed.suite && parsed.suiteDefinition) {
72
+ throw new Error("--suite and --suite-def cannot be used together.");
73
+ }
74
+ if (parsed.runtimeConfig.agentName && parsed.variantSetName) {
75
+ throw new Error("--agent and --variant-set cannot be used together.");
76
+ }
65
77
  if (parsed.suite) {
66
78
  const suite = parsed.suite;
67
- if (!suite) {
68
- throw new Error("Missing suite id.");
69
- }
70
79
  const scenarios = loadScenariosBySuite(suite);
71
80
  if (scenarios.length === 0) {
72
81
  throw new Error(`No scenarios found for suite '${suite}'.`);
73
82
  }
83
+ const suiteBatchId = createSuiteBatchId();
74
84
  const runs = [];
75
- for (const scenario of scenarios) {
76
- runs.push(await executeOne(scenario.definition.id, runtimeConfig));
77
- }
78
- const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
79
- const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
80
- const errored = runs.filter((bundle) => bundle.run.status === "error").length;
81
- const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
82
- console.log(`Suite: ${suite}`);
83
- console.log(`Passed: ${passed}/${runs.length}`);
84
- console.log(`Failed: ${failed}/${runs.length}`);
85
- console.log(`Errored: ${errored}/${runs.length}`);
86
- console.log(`Average score: ${avgScore}`);
85
+ if (parsed.variantSetName) {
86
+ console.log(`Variant set: ${parsed.variantSetName}`);
87
+ for (const scenario of scenarios) {
88
+ runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId));
89
+ }
90
+ }
91
+ else {
92
+ for (const scenario of scenarios) {
93
+ runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
94
+ }
95
+ }
96
+ printSuiteSummary(suite, runs, suiteBatchId);
97
+ return;
98
+ }
99
+ if (parsed.suiteDefinition) {
100
+ const suiteDefinition = parsed.suiteDefinition;
101
+ const scenarios = loadScenariosBySuiteDefinition(suiteDefinition);
102
+ if (scenarios.length === 0) {
103
+ throw new Error(`No scenarios found for suite definition '${suiteDefinition}'.`);
104
+ }
105
+ const suiteBatchId = createSuiteBatchId();
106
+ const runs = [];
107
+ console.log(`Suite definition: ${suiteDefinition}`);
108
+ if (parsed.variantSetName) {
109
+ console.log(`Variant set: ${parsed.variantSetName}`);
110
+ for (const scenario of scenarios) {
111
+ runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId, suiteDefinition));
112
+ }
113
+ }
114
+ else {
115
+ const suiteRuntimeConfig = { ...runtimeConfig, suiteDefinitionName: suiteDefinition };
116
+ for (const scenario of scenarios) {
117
+ runs.push(await executeOne(scenario.definition.id, suiteRuntimeConfig, suiteBatchId));
118
+ }
119
+ }
120
+ printSuiteSummary(suiteDefinition, runs, suiteBatchId);
87
121
  return;
88
122
  }
89
123
  const scenarioId = parsed.scenarioId;
90
124
  if (!scenarioId) {
91
125
  throw new Error("Missing scenario id.");
92
126
  }
93
- await executeOne(scenarioId, runtimeConfig);
127
+ if (parsed.variantSetName) {
128
+ console.log(`Variant set: ${parsed.variantSetName}`);
129
+ await executeVariantSetScenario(scenarioId, parsed.variantSetName);
130
+ return;
131
+ }
132
+ // Detect scenario type to route to the right runner
133
+ const { listScenarioFiles } = await import("./scenarios.js");
134
+ const { parse } = await import("yaml");
135
+ const { readFileSync } = await import("node:fs");
136
+ const { resolve } = await import("node:path");
137
+ let scenarioType = "task";
138
+ for (const filePath of listScenarioFiles()) {
139
+ const raw = readFileSync(resolve(filePath), "utf8");
140
+ const parsedYaml = parse(raw);
141
+ if (parsedYaml.id === scenarioId) {
142
+ scenarioType = parsedYaml.type === "conversation" ? "conversation" : "task";
143
+ break;
144
+ }
145
+ }
146
+ if (scenarioType === "task" && runtimeConfig.provider === "http") {
147
+ throw new Error(`Scenario '${scenarioId}' is a task scenario. HTTP agents (provider: http) only work with ` +
148
+ `type: conversation scenarios.\n` +
149
+ `To test an HTTP agent, create a conversation scenario (type: conversation) — ` +
150
+ `conversation scenarios do not use a tools: block. See docs/scenarios.md for the format.`);
151
+ }
152
+ if (scenarioType === "conversation") {
153
+ if (runtimeConfig.provider !== "http") {
154
+ throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
155
+ }
156
+ const httpConfig = {
157
+ name: runtimeConfig.agentName ?? "http-agent",
158
+ provider: "http",
159
+ url: runtimeConfig.url,
160
+ request_template: runtimeConfig.request_template,
161
+ response_field: runtimeConfig.response_field,
162
+ headers: runtimeConfig.headers,
163
+ timeout_ms: runtimeConfig.timeout_ms,
164
+ };
165
+ await executeConversation(scenarioId, httpConfig, runtimeConfig.label);
166
+ }
167
+ else {
168
+ await executeOne(scenarioId, runtimeConfig);
169
+ }
94
170
  }
95
- async function executeOne(scenarioId, runtimeConfig) {
171
+ async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
96
172
  const [{ Storage }, { loadToolRegistry, loadToolSpecs }, { loadScenarioById }, { runScenario }] = await Promise.all([
97
173
  import("./storage.js"),
98
174
  import("./tools.js"),
@@ -100,31 +176,167 @@ async function executeOne(scenarioId, runtimeConfig) {
100
176
  import("./runner.js"),
101
177
  ]);
102
178
  const storage = new Storage();
103
- const toolSpecs = await loadToolSpecs();
104
- const toolRegistry = await loadToolRegistry();
105
- const loaded = loadScenarioById(scenarioId);
106
- storage.upsertScenario({
107
- id: loaded.definition.id,
108
- name: loaded.definition.name,
109
- suite: loaded.definition.suite,
110
- difficulty: loaded.definition.difficulty,
111
- description: loaded.definition.description,
112
- }, loaded.definition, loaded.filePath, loaded.fileHash);
113
- const factory = createAgentFactory(runtimeConfig);
114
- const agentVersion = factory.createVersion(runtimeConfig);
115
- storage.upsertAgentVersion(agentVersion);
116
- const bundle = await runScenario({
117
- agentAdapter: factory.createAdapter(),
118
- agentVersion,
119
- scenario: loaded.definition,
120
- scenarioFileHash: loaded.fileHash,
121
- toolSpecs,
122
- tools: toolRegistry,
179
+ try {
180
+ const toolSpecs = await loadToolSpecs();
181
+ const toolRegistry = await loadToolRegistry();
182
+ const loaded = loadScenarioById(scenarioId);
183
+ storage.upsertScenario({
184
+ id: loaded.definition.id,
185
+ name: loaded.definition.name,
186
+ suite: loaded.definition.suite,
187
+ difficulty: loaded.definition.difficulty,
188
+ description: loaded.definition.description,
189
+ }, loaded.definition, loaded.filePath, loaded.fileHash);
190
+ const factory = createAgentFactory(runtimeConfig);
191
+ const agentVersion = factory.createVersion(runtimeConfig);
192
+ storage.upsertAgentVersion(agentVersion);
193
+ const bundle = await runScenario({
194
+ agentAdapter: factory.createAdapter(),
195
+ agentVersion,
196
+ scenario: loaded.definition,
197
+ scenarioFileHash: loaded.fileHash,
198
+ toolSpecs,
199
+ tools: toolRegistry,
200
+ });
201
+ bundle.run.suiteBatchId = suiteBatchId;
202
+ bundle.run.variantSetName = agentVersion.variantSetName;
203
+ bundle.run.variantLabel = agentVersion.variantLabel;
204
+ bundle.run.promptVersion = agentVersion.promptVersion;
205
+ bundle.run.modelVersion = agentVersion.modelVersion;
206
+ bundle.run.toolSchemaVersion = agentVersion.toolSchemaVersion;
207
+ bundle.run.configLabel = agentVersion.configLabel;
208
+ bundle.run.configHash = agentVersion.configHash;
209
+ bundle.run.runtimeProfileName = loaded.definition.runtime_profile;
210
+ bundle.run.suiteDefinitionName = runtimeConfig.suiteDefinitionName;
211
+ bundle.agentVersion = agentVersion;
212
+ storage.saveRun(bundle);
213
+ printRunSummary(bundle);
214
+ return bundle;
215
+ }
216
+ finally {
217
+ storage.close();
218
+ }
219
+ }
220
+ export async function executeVariantSetScenario(scenarioId, variantSetName, suiteBatchId, suiteDefinitionName) {
221
+ const variantSet = getVariantSet(variantSetName);
222
+ const runs = [];
223
+ for (const variant of variantSet.variants) {
224
+ const registration = getAgentRegistration(variant.agent);
225
+ const runtimeConfig = buildVariantRuntimeConfig(registration, variantSet.name, variant, suiteDefinitionName);
226
+ runs.push(await executeOne(scenarioId, runtimeConfig, suiteBatchId));
227
+ }
228
+ return runs;
229
+ }
230
+ function buildVariantRuntimeConfig(registration, variantSetName, variant, suiteDefinitionName) {
231
+ const runtimeConfig = {
232
+ ...registration,
233
+ agentName: registration.name,
234
+ label: registration.label ?? variant.label,
235
+ variantSetName,
236
+ variantLabel: variant.label,
237
+ promptVersion: variant.prompt_version,
238
+ modelVersion: variant.model_version,
239
+ toolSchemaVersion: variant.tool_schema_version,
240
+ configLabel: variant.config_label,
241
+ suiteDefinitionName,
242
+ };
243
+ runtimeConfig.configHash = createConfigHash({
244
+ provider: runtimeConfig.provider,
245
+ agentName: runtimeConfig.agentName,
246
+ label: runtimeConfig.label,
247
+ model: runtimeConfig.model,
248
+ command: runtimeConfig.command,
249
+ args: runtimeConfig.args ?? [],
250
+ variantSetName,
251
+ variantLabel: variant.label,
252
+ promptVersion: variant.prompt_version,
253
+ modelVersion: variant.model_version,
254
+ toolSchemaVersion: variant.tool_schema_version,
255
+ configLabel: variant.config_label,
123
256
  });
124
- bundle.agentVersion = agentVersion;
125
- storage.saveRun(bundle);
126
- printRunSummary(bundle);
127
- return bundle;
257
+ return runtimeConfig;
258
+ }
259
+ export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
260
+ const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
261
+ import("./storage.js"),
262
+ import("./scenarios.js"),
263
+ import("./conversationRunner.js"),
264
+ import("./lib/id.js"),
265
+ ]);
266
+ const storage = new Storage();
267
+ try {
268
+ const loaded = loadConversationScenarioById(scenarioId);
269
+ storage.upsertScenario({
270
+ id: loaded.definition.id,
271
+ name: loaded.definition.name,
272
+ suite: loaded.definition.suite,
273
+ difficulty: loaded.definition.difficulty,
274
+ description: loaded.definition.description,
275
+ }, loaded.definition, loaded.filePath, loaded.fileHash);
276
+ const agentLabel = label ?? httpConfig.label ?? httpConfig.name;
277
+ const agentConfig = { provider: "http", url: httpConfig.url, agentName: httpConfig.name };
278
+ const agentVersion = {
279
+ id: createAgentVersionId(agentLabel, agentConfig),
280
+ label: agentLabel,
281
+ provider: "http",
282
+ config: agentConfig,
283
+ };
284
+ storage.upsertAgentVersion(agentVersion);
285
+ const bundle = await runConversation({
286
+ httpConfig,
287
+ agentVersion,
288
+ scenario: loaded.definition,
289
+ scenarioFileHash: loaded.fileHash,
290
+ });
291
+ bundle.run.suiteBatchId = suiteBatchId;
292
+ bundle.agentVersion = agentVersion;
293
+ storage.saveRun(bundle);
294
+ printConversationSummary(bundle, httpConfig.url, loaded.definition.steps.length);
295
+ return bundle;
296
+ }
297
+ finally {
298
+ storage.close();
299
+ }
300
+ }
301
+ function printSuiteSummary(suite, runs, suiteBatchId) {
302
+ const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
303
+ const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
304
+ const errored = runs.filter((bundle) => bundle.run.status === "error").length;
305
+ const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
306
+ console.log(`Suite: ${suite}`);
307
+ console.log(`Passed: ${passed}/${runs.length}`);
308
+ console.log(`Failed: ${failed}/${runs.length}`);
309
+ console.log(`Errored: ${errored}/${runs.length}`);
310
+ console.log(`Average score: ${avgScore}`);
311
+ console.log(`Suite batch: ${suiteBatchId}`);
312
+ }
313
+ function printConversationSummary(bundle, agentUrl, totalSteps) {
314
+ const statusLabel = bundle.run.status.toUpperCase();
315
+ console.log(`run ${bundle.run.scenarioId} — ${statusLabel}`);
316
+ console.log(` agent: ${bundle.agentVersion?.label ?? bundle.run.agentVersionId} (${agentUrl})`);
317
+ console.log(` turns completed: ${bundle.run.totalSteps}/${totalSteps}`);
318
+ const stepEvals = bundle.evaluatorResults.filter((r) => r.evaluatorId.startsWith("step_"));
319
+ const stepIndices = new Set(stepEvals.map((r) => {
320
+ const match = r.evaluatorId.match(/^step_(\d+)_/);
321
+ return match ? parseInt(match[1], 10) : -1;
322
+ }));
323
+ for (const stepIndex of [...stepIndices].sort((a, b) => a - b)) {
324
+ const resultsForStep = stepEvals.filter((r) => r.evaluatorId.startsWith(`step_${stepIndex}_`));
325
+ const allPass = resultsForStep.every((r) => r.status === "pass");
326
+ const stepStatus = allPass ? "pass" : "FAIL";
327
+ const details = resultsForStep.map((r) => {
328
+ if (r.evaluatorType === "response_latency_max") {
329
+ const latencyMatch = r.message.match(/(\d+)ms/);
330
+ return latencyMatch ? `latency ${latencyMatch[1]}ms ✓` : r.message;
331
+ }
332
+ return `${r.evaluatorType} ${r.status === "pass" ? "✓" : "✗"}`;
333
+ });
334
+ console.log(` step ${stepIndex + 1}: ${stepStatus}${details.length > 0 ? ` (${details.join(", ")})` : ""}`);
335
+ }
336
+ if (bundle.run.status !== "pass") {
337
+ console.log(` run stopped (${bundle.run.terminationReason})`);
338
+ }
339
+ console.log(` run id: ${bundle.run.id}`);
128
340
  }
129
341
  async function handleUi() {
130
342
  const { startUiServer } = await import("./ui/server.js");
@@ -145,6 +357,9 @@ function printRunSummary(bundle) {
145
357
  if (bundle.agentVersion?.command) {
146
358
  console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
147
359
  }
360
+ for (const line of formatRunIdentityLines(bundle)) {
361
+ console.log(line);
362
+ }
148
363
  console.log(`Runtime: ${bundle.run.durationMs}ms`);
149
364
  if (bundle.run.status !== "pass") {
150
365
  console.log(`Reason: ${bundle.run.terminationReason}`);
@@ -152,6 +367,13 @@ function printRunSummary(bundle) {
152
367
  if (errorDetail) {
153
368
  console.log(`Error: ${errorDetail}`);
154
369
  }
370
+ const failedEvaluators = getFailedEvaluatorSummaries(bundle);
371
+ if (failedEvaluators.length > 0) {
372
+ console.log("Failed evaluators:");
373
+ for (const summary of failedEvaluators) {
374
+ console.log(`- ${summary}`);
375
+ }
376
+ }
155
377
  }
156
378
  }
157
379
  async function handleShow(args) {
@@ -161,69 +383,129 @@ async function handleShow(args) {
161
383
  }
162
384
  const { Storage } = await import("./storage.js");
163
385
  const storage = new Storage();
164
- const bundle = storage.getRun(runId);
165
- if (!bundle) {
166
- throw new Error(`Run '${runId}' not found.`);
167
- }
168
- console.log(`Run: ${bundle.run.id}`);
169
- console.log(`Scenario: ${bundle.run.scenarioId}`);
170
- console.log(`Status: ${bundle.run.status.toUpperCase()}`);
171
- console.log(`Score: ${bundle.run.score}/100`);
172
- if (bundle.agentVersion) {
173
- console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
174
- console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
175
- if (bundle.agentVersion.command) {
176
- console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
386
+ try {
387
+ const bundle = storage.getRun(runId);
388
+ if (!bundle) {
389
+ throw new Error(`Run '${runId}' not found.`);
390
+ }
391
+ console.log(`Run: ${bundle.run.id}`);
392
+ console.log(`Scenario: ${bundle.run.scenarioId}`);
393
+ console.log(`Status: ${bundle.run.status.toUpperCase()}`);
394
+ console.log(`Score: ${bundle.run.score}/100`);
395
+ if (bundle.agentVersion) {
396
+ console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
397
+ console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
398
+ if (bundle.agentVersion.command) {
399
+ console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
400
+ }
401
+ }
402
+ console.log(`Termination: ${bundle.run.terminationReason}`);
403
+ const errorDetail = getRunErrorDetail(bundle);
404
+ if (errorDetail) {
405
+ console.log(`Error: ${errorDetail}`);
406
+ }
407
+ console.log(`Final output: ${bundle.run.finalOutput}`);
408
+ console.log("Evaluators:");
409
+ for (const result of bundle.evaluatorResults) {
410
+ console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
177
411
  }
178
412
  }
179
- console.log(`Termination: ${bundle.run.terminationReason}`);
180
- const errorDetail = getRunErrorDetail(bundle);
181
- if (errorDetail) {
182
- console.log(`Error: ${errorDetail}`);
183
- }
184
- console.log(`Final output: ${bundle.run.finalOutput}`);
185
- console.log("Evaluators:");
186
- for (const result of bundle.evaluatorResults) {
187
- console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
413
+ finally {
414
+ storage.close();
188
415
  }
189
416
  }
190
417
  async function handleCompare(args) {
191
- const [baselineRunId, candidateRunId] = args;
192
- if (!baselineRunId || !candidateRunId) {
193
- throw new Error("Missing baseline or candidate run id.");
194
- }
418
+ const isSuiteCompare = args[0] === "--suite";
195
419
  const { Storage } = await import("./storage.js");
196
420
  const storage = new Storage();
197
- const comparison = storage.compareRuns(baselineRunId, candidateRunId);
198
- console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
199
- console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
200
- console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
201
- console.log("Changes:");
202
- if (comparison.notes.length === 0) {
203
- console.log("- No material changes.");
204
- }
205
- else {
206
- for (const note of comparison.notes) {
207
- console.log(`- ${note}`);
421
+ try {
422
+ if (isSuiteCompare) {
423
+ const baselineBatchId = args[1];
424
+ const candidateBatchId = args[2];
425
+ if (!baselineBatchId || !candidateBatchId) {
426
+ throw new Error("Missing baseline or candidate suite batch id.");
427
+ }
428
+ const comparison = storage.compareSuites(baselineBatchId, candidateBatchId);
429
+ console.log(`Suite: ${comparison.suite}`);
430
+ console.log(`Baseline batch: ${comparison.baselineBatchId}`);
431
+ console.log(`Candidate batch: ${comparison.candidateBatchId}`);
432
+ console.log(`Classification: ${comparison.classification.toUpperCase()}`);
433
+ console.log(`Pass delta: ${signedMetric(comparison.deltas.pass)}`);
434
+ console.log(`Fail delta: ${signedMetric(comparison.deltas.fail)}`);
435
+ console.log(`Error delta: ${signedMetric(comparison.deltas.error)}`);
436
+ console.log(`Average score delta: ${signedMetric(comparison.deltas.averageScore)}`);
437
+ console.log(`Average runtime delta: ${signedMetric(comparison.deltas.averageRuntimeMs)}ms`);
438
+ console.log(`Average steps delta: ${signedMetric(comparison.deltas.averageSteps)}`);
439
+ if (comparison.notes.length > 0) {
440
+ console.log("Notes:");
441
+ for (const note of comparison.notes) {
442
+ console.log(`- ${note}`);
443
+ }
444
+ }
445
+ if (comparison.regressions.length > 0) {
446
+ console.log("Regressions:");
447
+ for (const regression of comparison.regressions) {
448
+ console.log(`- ${regression.scenarioId}: ${regression.comparison.classification}`);
449
+ }
450
+ }
451
+ if (comparison.improvements.length > 0) {
452
+ console.log("Improvements:");
453
+ for (const improvement of comparison.improvements) {
454
+ console.log(`- ${improvement.scenarioId}: ${improvement.comparison.classification}`);
455
+ }
456
+ }
457
+ if (comparison.missingFromCandidate.length > 0) {
458
+ console.log(`Missing from candidate: ${comparison.missingFromCandidate.join(", ")}`);
459
+ }
460
+ if (comparison.missingFromBaseline.length > 0) {
461
+ console.log(`Missing from baseline: ${comparison.missingFromBaseline.join(", ")}`);
462
+ }
463
+ return;
208
464
  }
209
- }
210
- if (comparison.evaluatorDiffs.length > 0) {
211
- console.log("Evaluator diffs:");
212
- for (const diff of comparison.evaluatorDiffs) {
213
- console.log(`- ${diff.note}`);
465
+ const [baselineRunId, candidateRunId] = args;
466
+ if (!baselineRunId || !candidateRunId) {
467
+ throw new Error("Missing baseline or candidate run id.");
214
468
  }
215
- }
216
- if (comparison.toolDiffs.length > 0) {
217
- console.log("Tool diffs:");
218
- for (const diff of comparison.toolDiffs) {
219
- console.log(`- ${diff.note}`);
469
+ const comparison = storage.compareRuns(baselineRunId, candidateRunId);
470
+ console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
471
+ console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
472
+ console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
473
+ console.log(`Classification: ${comparison.classification.toUpperCase()}`);
474
+ console.log("Changes:");
475
+ if (comparison.notes.length === 0) {
476
+ console.log("- No material changes.");
477
+ }
478
+ else {
479
+ for (const note of comparison.notes) {
480
+ console.log(`- ${note}`);
481
+ }
482
+ }
483
+ if (comparison.evaluatorDiffs.length > 0) {
484
+ console.log("Evaluator diffs:");
485
+ for (const diff of comparison.evaluatorDiffs) {
486
+ console.log(`- ${diff.note}`);
487
+ }
220
488
  }
489
+ if (comparison.toolDiffs.length > 0) {
490
+ console.log("Tool diffs:");
491
+ for (const diff of comparison.toolDiffs) {
492
+ console.log(`- ${diff.note}`);
493
+ }
494
+ }
495
+ }
496
+ finally {
497
+ storage.close();
221
498
  }
222
499
  }
500
+ function signedMetric(value) {
501
+ return value > 0 ? `+${value}` : `${value}`;
502
+ }
223
503
  function parseRunArgs(args) {
224
504
  const runtimeConfig = { provider: "mock" };
225
505
  let scenarioId;
226
506
  let suite;
507
+ let suiteDefinition;
508
+ let variantSetName;
227
509
  for (let index = 0; index < args.length; index += 1) {
228
510
  const arg = args[index];
229
511
  if (arg === "--suite") {
@@ -231,9 +513,19 @@ function parseRunArgs(args) {
231
513
  index += 1;
232
514
  continue;
233
515
  }
516
+ if (arg === "--suite-def") {
517
+ suiteDefinition = args[index + 1];
518
+ index += 1;
519
+ continue;
520
+ }
521
+ if (arg === "--variant-set") {
522
+ variantSetName = args[index + 1];
523
+ index += 1;
524
+ continue;
525
+ }
234
526
  if (arg === "--provider") {
235
527
  const provider = args[index + 1];
236
- if (provider !== "mock" && provider !== "openai" && provider !== "external_process") {
528
+ if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
237
529
  throw new Error(`Unsupported provider '${String(provider)}'.`);
238
530
  }
239
531
  runtimeConfig.provider = provider;
@@ -261,17 +553,26 @@ function parseRunArgs(args) {
261
553
  }
262
554
  throw new Error(`Unexpected argument '${arg}'.`);
263
555
  }
264
- return { scenarioId, suite, runtimeConfig };
556
+ return { scenarioId, suite, suiteDefinition, variantSetName, runtimeConfig };
265
557
  }
266
558
  function validateRuntimeConfig(config) {
267
559
  if (config.agentName) {
268
560
  const registration = getAgentRegistration(config.agentName);
269
561
  config.provider = registration.provider;
270
- config.model = config.model ?? registration.model;
271
562
  config.label = config.label ?? registration.label ?? registration.name;
272
- config.command = registration.command;
273
- config.args = registration.args;
274
- config.envAllowlist = registration.envAllowlist;
563
+ if (registration.provider !== "http") {
564
+ config.model = config.model ?? registration.model;
565
+ config.command = registration.command;
566
+ config.args = registration.args;
567
+ config.envAllowlist = registration.envAllowlist;
568
+ }
569
+ else {
570
+ config.url = registration.url;
571
+ config.request_template = registration.request_template;
572
+ config.response_field = registration.response_field;
573
+ config.headers = registration.headers;
574
+ config.timeout_ms = registration.timeout_ms;
575
+ }
275
576
  }
276
577
  if (config.provider === "openai") {
277
578
  if (!process.env.OPENAI_API_KEY) {
@@ -288,9 +589,25 @@ function validateRuntimeConfig(config) {
288
589
  }
289
590
  config.label = config.label ?? config.agentName ?? "external-process-agent";
290
591
  }
592
+ if (config.provider === "http") {
593
+ if (!config.url) {
594
+ throw new Error("HTTP agents require a configured url. Use --agent <name> with provider: http in agentlab.config.yaml.");
595
+ }
596
+ config.label = config.label ?? config.agentName ?? "http-agent";
597
+ }
291
598
  return config;
292
599
  }
293
- main().catch((error) => {
294
- console.error(error instanceof Error ? error.message : String(error));
295
- process.exitCode = 1;
296
- });
600
+ if (isEntrypoint()) {
601
+ main().catch((error) => {
602
+ const message = error instanceof Error ? error.message : String(error);
603
+ console.error(formatCliErrorMessage(message));
604
+ process.exitCode = 1;
605
+ });
606
+ }
607
+ function isEntrypoint() {
608
+ const entry = process.argv[1];
609
+ if (!entry) {
610
+ return false;
611
+ }
612
+ return import.meta.url === pathToFileURL(entry).href;
613
+ }