agent-regression-lab 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +186 -123
- package/dist/agent/factory.js +20 -6
- package/dist/agent/httpAdapter.js +79 -0
- package/dist/agent/mockAdapter.js +210 -13
- package/dist/config.js +223 -4
- package/dist/conversationEvaluators.js +167 -0
- package/dist/conversationRunner.js +199 -0
- package/dist/evaluators.js +56 -1
- package/dist/index.js +428 -111
- package/dist/lib/id.js +6 -0
- package/dist/runOutput.js +46 -0
- package/dist/runner.js +31 -9
- package/dist/scenarios.js +211 -11
- package/dist/scoring.js +2 -2
- package/dist/storage.js +305 -31
- package/dist/tools.js +284 -0
- package/dist/trace.js +4 -2
- package/dist/ui/App.js +67 -5
- package/dist/ui/server.js +18 -0
- package/dist/ui-assets/client.js +165 -3
- package/docs/agents.md +287 -0
- package/docs/golden-suites.md +74 -0
- package/docs/integrations-and-live-services.md +58 -0
- package/docs/memory-and-stateful-agents.md +51 -0
- package/docs/release-checklist.md +94 -0
- package/docs/runtime-profiles.md +67 -0
- package/docs/scenarios.md +419 -0
- package/docs/tools.md +102 -0
- package/docs/troubleshooting.md +296 -0
- package/docs/variant-sets.md +63 -0
- package/package.json +4 -3
package/dist/index.js
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import packageJson from "../package.json" with { type: "json" };
|
|
3
|
+
import { pathToFileURL } from "node:url";
|
|
2
4
|
import { createAgentFactory } from "./agent/factory.js";
|
|
3
|
-
import { getAgentRegistration } from "./config.js";
|
|
4
|
-
import {
|
|
5
|
+
import { getAgentRegistration, getVariantSet } from "./config.js";
|
|
6
|
+
import { createConfigHash, createSuiteBatchId } from "./lib/id.js";
|
|
7
|
+
import { formatCliErrorMessage, formatRunIdentityLines, getFailedEvaluatorSummaries, getRunErrorDetail } from "./runOutput.js";
|
|
5
8
|
async function main() {
|
|
6
9
|
const [, , command, ...args] = process.argv;
|
|
7
10
|
switch (command) {
|
|
@@ -9,27 +12,27 @@ async function main() {
|
|
|
9
12
|
case "--help":
|
|
10
13
|
case "-h":
|
|
11
14
|
printUsage();
|
|
12
|
-
|
|
15
|
+
break;
|
|
13
16
|
case "version":
|
|
14
17
|
case "--version":
|
|
15
18
|
case "-v":
|
|
16
19
|
printVersion();
|
|
17
|
-
|
|
20
|
+
break;
|
|
18
21
|
case "list":
|
|
19
22
|
await handleList(args);
|
|
20
|
-
|
|
23
|
+
break;
|
|
21
24
|
case "run":
|
|
22
25
|
await handleRun(args);
|
|
23
|
-
|
|
26
|
+
break;
|
|
24
27
|
case "show":
|
|
25
28
|
await handleShow(args);
|
|
26
|
-
|
|
29
|
+
break;
|
|
27
30
|
case "compare":
|
|
28
31
|
await handleCompare(args);
|
|
29
|
-
|
|
32
|
+
break;
|
|
30
33
|
case "ui":
|
|
31
34
|
await handleUi();
|
|
32
|
-
|
|
35
|
+
break;
|
|
33
36
|
default:
|
|
34
37
|
printUsage();
|
|
35
38
|
}
|
|
@@ -37,16 +40,19 @@ async function main() {
|
|
|
37
40
|
function printUsage() {
|
|
38
41
|
console.log(`Usage:
|
|
39
42
|
agentlab list scenarios
|
|
40
|
-
agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
|
|
41
|
-
agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process] [--model <model>] [--agent-label <label>]
|
|
43
|
+
agentlab run <scenario-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
44
|
+
agentlab run --suite <suite-id> [--agent <name>] [--provider mock|openai|external_process|http] [--model <model>] [--agent-label <label>]
|
|
45
|
+
agentlab run --suite-def <name> [--agent <name>]
|
|
46
|
+
agentlab run <scenario-id> [--variant-set <name>]
|
|
42
47
|
agentlab show <run-id>
|
|
43
48
|
agentlab compare <baseline-run-id> <candidate-run-id>
|
|
49
|
+
agentlab compare --suite <baseline-batch-id> <candidate-batch-id>
|
|
44
50
|
agentlab ui
|
|
45
51
|
agentlab help
|
|
46
52
|
agentlab version`);
|
|
47
53
|
}
|
|
48
54
|
function printVersion() {
|
|
49
|
-
console.log(
|
|
55
|
+
console.log(packageJson.version);
|
|
50
56
|
}
|
|
51
57
|
async function handleList(args) {
|
|
52
58
|
if (args[0] !== "scenarios") {
|
|
@@ -61,38 +67,108 @@ async function handleList(args) {
|
|
|
61
67
|
async function handleRun(args) {
|
|
62
68
|
const parsed = parseRunArgs(args);
|
|
63
69
|
const runtimeConfig = validateRuntimeConfig(parsed.runtimeConfig);
|
|
64
|
-
const { loadScenariosBySuite } = await import("./scenarios.js");
|
|
70
|
+
const { loadScenariosBySuite, loadScenariosBySuiteDefinition } = await import("./scenarios.js");
|
|
71
|
+
if (parsed.suite && parsed.suiteDefinition) {
|
|
72
|
+
throw new Error("--suite and --suite-def cannot be used together.");
|
|
73
|
+
}
|
|
74
|
+
if (parsed.runtimeConfig.agentName && parsed.variantSetName) {
|
|
75
|
+
throw new Error("--agent and --variant-set cannot be used together.");
|
|
76
|
+
}
|
|
65
77
|
if (parsed.suite) {
|
|
66
78
|
const suite = parsed.suite;
|
|
67
|
-
if (!suite) {
|
|
68
|
-
throw new Error("Missing suite id.");
|
|
69
|
-
}
|
|
70
79
|
const scenarios = loadScenariosBySuite(suite);
|
|
71
80
|
if (scenarios.length === 0) {
|
|
72
81
|
throw new Error(`No scenarios found for suite '${suite}'.`);
|
|
73
82
|
}
|
|
83
|
+
const suiteBatchId = createSuiteBatchId();
|
|
74
84
|
const runs = [];
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
if (parsed.variantSetName) {
|
|
86
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
87
|
+
for (const scenario of scenarios) {
|
|
88
|
+
runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId));
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
else {
|
|
92
|
+
for (const scenario of scenarios) {
|
|
93
|
+
runs.push(await executeOne(scenario.definition.id, runtimeConfig, suiteBatchId));
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
printSuiteSummary(suite, runs, suiteBatchId);
|
|
97
|
+
return;
|
|
98
|
+
}
|
|
99
|
+
if (parsed.suiteDefinition) {
|
|
100
|
+
const suiteDefinition = parsed.suiteDefinition;
|
|
101
|
+
const scenarios = loadScenariosBySuiteDefinition(suiteDefinition);
|
|
102
|
+
if (scenarios.length === 0) {
|
|
103
|
+
throw new Error(`No scenarios found for suite definition '${suiteDefinition}'.`);
|
|
104
|
+
}
|
|
105
|
+
const suiteBatchId = createSuiteBatchId();
|
|
106
|
+
const runs = [];
|
|
107
|
+
console.log(`Suite definition: ${suiteDefinition}`);
|
|
108
|
+
if (parsed.variantSetName) {
|
|
109
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
110
|
+
for (const scenario of scenarios) {
|
|
111
|
+
runs.push(...await executeVariantSetScenario(scenario.definition.id, parsed.variantSetName, suiteBatchId, suiteDefinition));
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
else {
|
|
115
|
+
const suiteRuntimeConfig = { ...runtimeConfig, suiteDefinitionName: suiteDefinition };
|
|
116
|
+
for (const scenario of scenarios) {
|
|
117
|
+
runs.push(await executeOne(scenario.definition.id, suiteRuntimeConfig, suiteBatchId));
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
printSuiteSummary(suiteDefinition, runs, suiteBatchId);
|
|
87
121
|
return;
|
|
88
122
|
}
|
|
89
123
|
const scenarioId = parsed.scenarioId;
|
|
90
124
|
if (!scenarioId) {
|
|
91
125
|
throw new Error("Missing scenario id.");
|
|
92
126
|
}
|
|
93
|
-
|
|
127
|
+
if (parsed.variantSetName) {
|
|
128
|
+
console.log(`Variant set: ${parsed.variantSetName}`);
|
|
129
|
+
await executeVariantSetScenario(scenarioId, parsed.variantSetName);
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
132
|
+
// Detect scenario type to route to the right runner
|
|
133
|
+
const { listScenarioFiles } = await import("./scenarios.js");
|
|
134
|
+
const { parse } = await import("yaml");
|
|
135
|
+
const { readFileSync } = await import("node:fs");
|
|
136
|
+
const { resolve } = await import("node:path");
|
|
137
|
+
let scenarioType = "task";
|
|
138
|
+
for (const filePath of listScenarioFiles()) {
|
|
139
|
+
const raw = readFileSync(resolve(filePath), "utf8");
|
|
140
|
+
const parsedYaml = parse(raw);
|
|
141
|
+
if (parsedYaml.id === scenarioId) {
|
|
142
|
+
scenarioType = parsedYaml.type === "conversation" ? "conversation" : "task";
|
|
143
|
+
break;
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
if (scenarioType === "task" && runtimeConfig.provider === "http") {
|
|
147
|
+
throw new Error(`Scenario '${scenarioId}' is a task scenario. HTTP agents (provider: http) only work with ` +
|
|
148
|
+
`type: conversation scenarios.\n` +
|
|
149
|
+
`To test an HTTP agent, create a conversation scenario (type: conversation) — ` +
|
|
150
|
+
`conversation scenarios do not use a tools: block. See docs/scenarios.md for the format.`);
|
|
151
|
+
}
|
|
152
|
+
if (scenarioType === "conversation") {
|
|
153
|
+
if (runtimeConfig.provider !== "http") {
|
|
154
|
+
throw new Error(`Scenario '${scenarioId}' is a conversation scenario and requires provider: http. Use --agent <name> with a configured HTTP agent.`);
|
|
155
|
+
}
|
|
156
|
+
const httpConfig = {
|
|
157
|
+
name: runtimeConfig.agentName ?? "http-agent",
|
|
158
|
+
provider: "http",
|
|
159
|
+
url: runtimeConfig.url,
|
|
160
|
+
request_template: runtimeConfig.request_template,
|
|
161
|
+
response_field: runtimeConfig.response_field,
|
|
162
|
+
headers: runtimeConfig.headers,
|
|
163
|
+
timeout_ms: runtimeConfig.timeout_ms,
|
|
164
|
+
};
|
|
165
|
+
await executeConversation(scenarioId, httpConfig, runtimeConfig.label);
|
|
166
|
+
}
|
|
167
|
+
else {
|
|
168
|
+
await executeOne(scenarioId, runtimeConfig);
|
|
169
|
+
}
|
|
94
170
|
}
|
|
95
|
-
async function executeOne(scenarioId, runtimeConfig) {
|
|
171
|
+
async function executeOne(scenarioId, runtimeConfig, suiteBatchId) {
|
|
96
172
|
const [{ Storage }, { loadToolRegistry, loadToolSpecs }, { loadScenarioById }, { runScenario }] = await Promise.all([
|
|
97
173
|
import("./storage.js"),
|
|
98
174
|
import("./tools.js"),
|
|
@@ -100,31 +176,167 @@ async function executeOne(scenarioId, runtimeConfig) {
|
|
|
100
176
|
import("./runner.js"),
|
|
101
177
|
]);
|
|
102
178
|
const storage = new Storage();
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
179
|
+
try {
|
|
180
|
+
const toolSpecs = await loadToolSpecs();
|
|
181
|
+
const toolRegistry = await loadToolRegistry();
|
|
182
|
+
const loaded = loadScenarioById(scenarioId);
|
|
183
|
+
storage.upsertScenario({
|
|
184
|
+
id: loaded.definition.id,
|
|
185
|
+
name: loaded.definition.name,
|
|
186
|
+
suite: loaded.definition.suite,
|
|
187
|
+
difficulty: loaded.definition.difficulty,
|
|
188
|
+
description: loaded.definition.description,
|
|
189
|
+
}, loaded.definition, loaded.filePath, loaded.fileHash);
|
|
190
|
+
const factory = createAgentFactory(runtimeConfig);
|
|
191
|
+
const agentVersion = factory.createVersion(runtimeConfig);
|
|
192
|
+
storage.upsertAgentVersion(agentVersion);
|
|
193
|
+
const bundle = await runScenario({
|
|
194
|
+
agentAdapter: factory.createAdapter(),
|
|
195
|
+
agentVersion,
|
|
196
|
+
scenario: loaded.definition,
|
|
197
|
+
scenarioFileHash: loaded.fileHash,
|
|
198
|
+
toolSpecs,
|
|
199
|
+
tools: toolRegistry,
|
|
200
|
+
});
|
|
201
|
+
bundle.run.suiteBatchId = suiteBatchId;
|
|
202
|
+
bundle.run.variantSetName = agentVersion.variantSetName;
|
|
203
|
+
bundle.run.variantLabel = agentVersion.variantLabel;
|
|
204
|
+
bundle.run.promptVersion = agentVersion.promptVersion;
|
|
205
|
+
bundle.run.modelVersion = agentVersion.modelVersion;
|
|
206
|
+
bundle.run.toolSchemaVersion = agentVersion.toolSchemaVersion;
|
|
207
|
+
bundle.run.configLabel = agentVersion.configLabel;
|
|
208
|
+
bundle.run.configHash = agentVersion.configHash;
|
|
209
|
+
bundle.run.runtimeProfileName = loaded.definition.runtime_profile;
|
|
210
|
+
bundle.run.suiteDefinitionName = runtimeConfig.suiteDefinitionName;
|
|
211
|
+
bundle.agentVersion = agentVersion;
|
|
212
|
+
storage.saveRun(bundle);
|
|
213
|
+
printRunSummary(bundle);
|
|
214
|
+
return bundle;
|
|
215
|
+
}
|
|
216
|
+
finally {
|
|
217
|
+
storage.close();
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
export async function executeVariantSetScenario(scenarioId, variantSetName, suiteBatchId, suiteDefinitionName) {
|
|
221
|
+
const variantSet = getVariantSet(variantSetName);
|
|
222
|
+
const runs = [];
|
|
223
|
+
for (const variant of variantSet.variants) {
|
|
224
|
+
const registration = getAgentRegistration(variant.agent);
|
|
225
|
+
const runtimeConfig = buildVariantRuntimeConfig(registration, variantSet.name, variant, suiteDefinitionName);
|
|
226
|
+
runs.push(await executeOne(scenarioId, runtimeConfig, suiteBatchId));
|
|
227
|
+
}
|
|
228
|
+
return runs;
|
|
229
|
+
}
|
|
230
|
+
function buildVariantRuntimeConfig(registration, variantSetName, variant, suiteDefinitionName) {
|
|
231
|
+
const runtimeConfig = {
|
|
232
|
+
...registration,
|
|
233
|
+
agentName: registration.name,
|
|
234
|
+
label: registration.label ?? variant.label,
|
|
235
|
+
variantSetName,
|
|
236
|
+
variantLabel: variant.label,
|
|
237
|
+
promptVersion: variant.prompt_version,
|
|
238
|
+
modelVersion: variant.model_version,
|
|
239
|
+
toolSchemaVersion: variant.tool_schema_version,
|
|
240
|
+
configLabel: variant.config_label,
|
|
241
|
+
suiteDefinitionName,
|
|
242
|
+
};
|
|
243
|
+
runtimeConfig.configHash = createConfigHash({
|
|
244
|
+
provider: runtimeConfig.provider,
|
|
245
|
+
agentName: runtimeConfig.agentName,
|
|
246
|
+
label: runtimeConfig.label,
|
|
247
|
+
model: runtimeConfig.model,
|
|
248
|
+
command: runtimeConfig.command,
|
|
249
|
+
args: runtimeConfig.args ?? [],
|
|
250
|
+
variantSetName,
|
|
251
|
+
variantLabel: variant.label,
|
|
252
|
+
promptVersion: variant.prompt_version,
|
|
253
|
+
modelVersion: variant.model_version,
|
|
254
|
+
toolSchemaVersion: variant.tool_schema_version,
|
|
255
|
+
configLabel: variant.config_label,
|
|
123
256
|
});
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
257
|
+
return runtimeConfig;
|
|
258
|
+
}
|
|
259
|
+
export async function executeConversation(scenarioId, httpConfig, label, suiteBatchId) {
|
|
260
|
+
const [{ Storage }, { loadConversationScenarioById }, { runConversation }, { createAgentVersionId }] = await Promise.all([
|
|
261
|
+
import("./storage.js"),
|
|
262
|
+
import("./scenarios.js"),
|
|
263
|
+
import("./conversationRunner.js"),
|
|
264
|
+
import("./lib/id.js"),
|
|
265
|
+
]);
|
|
266
|
+
const storage = new Storage();
|
|
267
|
+
try {
|
|
268
|
+
const loaded = loadConversationScenarioById(scenarioId);
|
|
269
|
+
storage.upsertScenario({
|
|
270
|
+
id: loaded.definition.id,
|
|
271
|
+
name: loaded.definition.name,
|
|
272
|
+
suite: loaded.definition.suite,
|
|
273
|
+
difficulty: loaded.definition.difficulty,
|
|
274
|
+
description: loaded.definition.description,
|
|
275
|
+
}, loaded.definition, loaded.filePath, loaded.fileHash);
|
|
276
|
+
const agentLabel = label ?? httpConfig.label ?? httpConfig.name;
|
|
277
|
+
const agentConfig = { provider: "http", url: httpConfig.url, agentName: httpConfig.name };
|
|
278
|
+
const agentVersion = {
|
|
279
|
+
id: createAgentVersionId(agentLabel, agentConfig),
|
|
280
|
+
label: agentLabel,
|
|
281
|
+
provider: "http",
|
|
282
|
+
config: agentConfig,
|
|
283
|
+
};
|
|
284
|
+
storage.upsertAgentVersion(agentVersion);
|
|
285
|
+
const bundle = await runConversation({
|
|
286
|
+
httpConfig,
|
|
287
|
+
agentVersion,
|
|
288
|
+
scenario: loaded.definition,
|
|
289
|
+
scenarioFileHash: loaded.fileHash,
|
|
290
|
+
});
|
|
291
|
+
bundle.run.suiteBatchId = suiteBatchId;
|
|
292
|
+
bundle.agentVersion = agentVersion;
|
|
293
|
+
storage.saveRun(bundle);
|
|
294
|
+
printConversationSummary(bundle, httpConfig.url, loaded.definition.steps.length);
|
|
295
|
+
return bundle;
|
|
296
|
+
}
|
|
297
|
+
finally {
|
|
298
|
+
storage.close();
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
function printSuiteSummary(suite, runs, suiteBatchId) {
|
|
302
|
+
const passed = runs.filter((bundle) => bundle.run.status === "pass").length;
|
|
303
|
+
const failed = runs.filter((bundle) => bundle.run.status === "fail").length;
|
|
304
|
+
const errored = runs.filter((bundle) => bundle.run.status === "error").length;
|
|
305
|
+
const avgScore = Math.round(runs.reduce((sum, bundle) => sum + bundle.run.score, 0) / runs.length);
|
|
306
|
+
console.log(`Suite: ${suite}`);
|
|
307
|
+
console.log(`Passed: ${passed}/${runs.length}`);
|
|
308
|
+
console.log(`Failed: ${failed}/${runs.length}`);
|
|
309
|
+
console.log(`Errored: ${errored}/${runs.length}`);
|
|
310
|
+
console.log(`Average score: ${avgScore}`);
|
|
311
|
+
console.log(`Suite batch: ${suiteBatchId}`);
|
|
312
|
+
}
|
|
313
|
+
function printConversationSummary(bundle, agentUrl, totalSteps) {
|
|
314
|
+
const statusLabel = bundle.run.status.toUpperCase();
|
|
315
|
+
console.log(`run ${bundle.run.scenarioId} — ${statusLabel}`);
|
|
316
|
+
console.log(` agent: ${bundle.agentVersion?.label ?? bundle.run.agentVersionId} (${agentUrl})`);
|
|
317
|
+
console.log(` turns completed: ${bundle.run.totalSteps}/${totalSteps}`);
|
|
318
|
+
const stepEvals = bundle.evaluatorResults.filter((r) => r.evaluatorId.startsWith("step_"));
|
|
319
|
+
const stepIndices = new Set(stepEvals.map((r) => {
|
|
320
|
+
const match = r.evaluatorId.match(/^step_(\d+)_/);
|
|
321
|
+
return match ? parseInt(match[1], 10) : -1;
|
|
322
|
+
}));
|
|
323
|
+
for (const stepIndex of [...stepIndices].sort((a, b) => a - b)) {
|
|
324
|
+
const resultsForStep = stepEvals.filter((r) => r.evaluatorId.startsWith(`step_${stepIndex}_`));
|
|
325
|
+
const allPass = resultsForStep.every((r) => r.status === "pass");
|
|
326
|
+
const stepStatus = allPass ? "pass" : "FAIL";
|
|
327
|
+
const details = resultsForStep.map((r) => {
|
|
328
|
+
if (r.evaluatorType === "response_latency_max") {
|
|
329
|
+
const latencyMatch = r.message.match(/(\d+)ms/);
|
|
330
|
+
return latencyMatch ? `latency ${latencyMatch[1]}ms ✓` : r.message;
|
|
331
|
+
}
|
|
332
|
+
return `${r.evaluatorType} ${r.status === "pass" ? "✓" : "✗"}`;
|
|
333
|
+
});
|
|
334
|
+
console.log(` step ${stepIndex + 1}: ${stepStatus}${details.length > 0 ? ` (${details.join(", ")})` : ""}`);
|
|
335
|
+
}
|
|
336
|
+
if (bundle.run.status !== "pass") {
|
|
337
|
+
console.log(` run stopped (${bundle.run.terminationReason})`);
|
|
338
|
+
}
|
|
339
|
+
console.log(` run id: ${bundle.run.id}`);
|
|
128
340
|
}
|
|
129
341
|
async function handleUi() {
|
|
130
342
|
const { startUiServer } = await import("./ui/server.js");
|
|
@@ -145,6 +357,9 @@ function printRunSummary(bundle) {
|
|
|
145
357
|
if (bundle.agentVersion?.command) {
|
|
146
358
|
console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
|
|
147
359
|
}
|
|
360
|
+
for (const line of formatRunIdentityLines(bundle)) {
|
|
361
|
+
console.log(line);
|
|
362
|
+
}
|
|
148
363
|
console.log(`Runtime: ${bundle.run.durationMs}ms`);
|
|
149
364
|
if (bundle.run.status !== "pass") {
|
|
150
365
|
console.log(`Reason: ${bundle.run.terminationReason}`);
|
|
@@ -152,6 +367,13 @@ function printRunSummary(bundle) {
|
|
|
152
367
|
if (errorDetail) {
|
|
153
368
|
console.log(`Error: ${errorDetail}`);
|
|
154
369
|
}
|
|
370
|
+
const failedEvaluators = getFailedEvaluatorSummaries(bundle);
|
|
371
|
+
if (failedEvaluators.length > 0) {
|
|
372
|
+
console.log("Failed evaluators:");
|
|
373
|
+
for (const summary of failedEvaluators) {
|
|
374
|
+
console.log(`- ${summary}`);
|
|
375
|
+
}
|
|
376
|
+
}
|
|
155
377
|
}
|
|
156
378
|
}
|
|
157
379
|
async function handleShow(args) {
|
|
@@ -161,69 +383,129 @@ async function handleShow(args) {
|
|
|
161
383
|
}
|
|
162
384
|
const { Storage } = await import("./storage.js");
|
|
163
385
|
const storage = new Storage();
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
386
|
+
try {
|
|
387
|
+
const bundle = storage.getRun(runId);
|
|
388
|
+
if (!bundle) {
|
|
389
|
+
throw new Error(`Run '${runId}' not found.`);
|
|
390
|
+
}
|
|
391
|
+
console.log(`Run: ${bundle.run.id}`);
|
|
392
|
+
console.log(`Scenario: ${bundle.run.scenarioId}`);
|
|
393
|
+
console.log(`Status: ${bundle.run.status.toUpperCase()}`);
|
|
394
|
+
console.log(`Score: ${bundle.run.score}/100`);
|
|
395
|
+
if (bundle.agentVersion) {
|
|
396
|
+
console.log(`Provider: ${bundle.agentVersion.provider ?? "unknown"}`);
|
|
397
|
+
console.log(`Model: ${bundle.agentVersion.modelId ?? "unknown"}`);
|
|
398
|
+
if (bundle.agentVersion.command) {
|
|
399
|
+
console.log(`Command: ${bundle.agentVersion.command} ${(bundle.agentVersion.args ?? []).join(" ")}`.trim());
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
console.log(`Termination: ${bundle.run.terminationReason}`);
|
|
403
|
+
const errorDetail = getRunErrorDetail(bundle);
|
|
404
|
+
if (errorDetail) {
|
|
405
|
+
console.log(`Error: ${errorDetail}`);
|
|
406
|
+
}
|
|
407
|
+
console.log(`Final output: ${bundle.run.finalOutput}`);
|
|
408
|
+
console.log("Evaluators:");
|
|
409
|
+
for (const result of bundle.evaluatorResults) {
|
|
410
|
+
console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
|
|
177
411
|
}
|
|
178
412
|
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
if (errorDetail) {
|
|
182
|
-
console.log(`Error: ${errorDetail}`);
|
|
183
|
-
}
|
|
184
|
-
console.log(`Final output: ${bundle.run.finalOutput}`);
|
|
185
|
-
console.log("Evaluators:");
|
|
186
|
-
for (const result of bundle.evaluatorResults) {
|
|
187
|
-
console.log(`- ${result.evaluatorId}: ${result.status.toUpperCase()} - ${result.message}`);
|
|
413
|
+
finally {
|
|
414
|
+
storage.close();
|
|
188
415
|
}
|
|
189
416
|
}
|
|
190
417
|
async function handleCompare(args) {
|
|
191
|
-
const [
|
|
192
|
-
if (!baselineRunId || !candidateRunId) {
|
|
193
|
-
throw new Error("Missing baseline or candidate run id.");
|
|
194
|
-
}
|
|
418
|
+
const isSuiteCompare = args[0] === "--suite";
|
|
195
419
|
const { Storage } = await import("./storage.js");
|
|
196
420
|
const storage = new Storage();
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
console.log(
|
|
421
|
+
try {
|
|
422
|
+
if (isSuiteCompare) {
|
|
423
|
+
const baselineBatchId = args[1];
|
|
424
|
+
const candidateBatchId = args[2];
|
|
425
|
+
if (!baselineBatchId || !candidateBatchId) {
|
|
426
|
+
throw new Error("Missing baseline or candidate suite batch id.");
|
|
427
|
+
}
|
|
428
|
+
const comparison = storage.compareSuites(baselineBatchId, candidateBatchId);
|
|
429
|
+
console.log(`Suite: ${comparison.suite}`);
|
|
430
|
+
console.log(`Baseline batch: ${comparison.baselineBatchId}`);
|
|
431
|
+
console.log(`Candidate batch: ${comparison.candidateBatchId}`);
|
|
432
|
+
console.log(`Classification: ${comparison.classification.toUpperCase()}`);
|
|
433
|
+
console.log(`Pass delta: ${signedMetric(comparison.deltas.pass)}`);
|
|
434
|
+
console.log(`Fail delta: ${signedMetric(comparison.deltas.fail)}`);
|
|
435
|
+
console.log(`Error delta: ${signedMetric(comparison.deltas.error)}`);
|
|
436
|
+
console.log(`Average score delta: ${signedMetric(comparison.deltas.averageScore)}`);
|
|
437
|
+
console.log(`Average runtime delta: ${signedMetric(comparison.deltas.averageRuntimeMs)}ms`);
|
|
438
|
+
console.log(`Average steps delta: ${signedMetric(comparison.deltas.averageSteps)}`);
|
|
439
|
+
if (comparison.notes.length > 0) {
|
|
440
|
+
console.log("Notes:");
|
|
441
|
+
for (const note of comparison.notes) {
|
|
442
|
+
console.log(`- ${note}`);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
if (comparison.regressions.length > 0) {
|
|
446
|
+
console.log("Regressions:");
|
|
447
|
+
for (const regression of comparison.regressions) {
|
|
448
|
+
console.log(`- ${regression.scenarioId}: ${regression.comparison.classification}`);
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
if (comparison.improvements.length > 0) {
|
|
452
|
+
console.log("Improvements:");
|
|
453
|
+
for (const improvement of comparison.improvements) {
|
|
454
|
+
console.log(`- ${improvement.scenarioId}: ${improvement.comparison.classification}`);
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
if (comparison.missingFromCandidate.length > 0) {
|
|
458
|
+
console.log(`Missing from candidate: ${comparison.missingFromCandidate.join(", ")}`);
|
|
459
|
+
}
|
|
460
|
+
if (comparison.missingFromBaseline.length > 0) {
|
|
461
|
+
console.log(`Missing from baseline: ${comparison.missingFromBaseline.join(", ")}`);
|
|
462
|
+
}
|
|
463
|
+
return;
|
|
208
464
|
}
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
for (const diff of comparison.evaluatorDiffs) {
|
|
213
|
-
console.log(`- ${diff.note}`);
|
|
465
|
+
const [baselineRunId, candidateRunId] = args;
|
|
466
|
+
if (!baselineRunId || !candidateRunId) {
|
|
467
|
+
throw new Error("Missing baseline or candidate run id.");
|
|
214
468
|
}
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
console.log(
|
|
218
|
-
|
|
219
|
-
|
|
469
|
+
const comparison = storage.compareRuns(baselineRunId, candidateRunId);
|
|
470
|
+
console.log(`Scenario: ${comparison.baseline.run.scenarioId}`);
|
|
471
|
+
console.log(`Baseline: ${comparison.baseline.run.id} (${comparison.baseline.run.status.toUpperCase()} ${comparison.baseline.run.score}/100)`);
|
|
472
|
+
console.log(`Candidate: ${comparison.candidate.run.id} (${comparison.candidate.run.status.toUpperCase()} ${comparison.candidate.run.score}/100)`);
|
|
473
|
+
console.log(`Classification: ${comparison.classification.toUpperCase()}`);
|
|
474
|
+
console.log("Changes:");
|
|
475
|
+
if (comparison.notes.length === 0) {
|
|
476
|
+
console.log("- No material changes.");
|
|
477
|
+
}
|
|
478
|
+
else {
|
|
479
|
+
for (const note of comparison.notes) {
|
|
480
|
+
console.log(`- ${note}`);
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
if (comparison.evaluatorDiffs.length > 0) {
|
|
484
|
+
console.log("Evaluator diffs:");
|
|
485
|
+
for (const diff of comparison.evaluatorDiffs) {
|
|
486
|
+
console.log(`- ${diff.note}`);
|
|
487
|
+
}
|
|
220
488
|
}
|
|
489
|
+
if (comparison.toolDiffs.length > 0) {
|
|
490
|
+
console.log("Tool diffs:");
|
|
491
|
+
for (const diff of comparison.toolDiffs) {
|
|
492
|
+
console.log(`- ${diff.note}`);
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
finally {
|
|
497
|
+
storage.close();
|
|
221
498
|
}
|
|
222
499
|
}
|
|
500
|
+
function signedMetric(value) {
|
|
501
|
+
return value > 0 ? `+${value}` : `${value}`;
|
|
502
|
+
}
|
|
223
503
|
function parseRunArgs(args) {
|
|
224
504
|
const runtimeConfig = { provider: "mock" };
|
|
225
505
|
let scenarioId;
|
|
226
506
|
let suite;
|
|
507
|
+
let suiteDefinition;
|
|
508
|
+
let variantSetName;
|
|
227
509
|
for (let index = 0; index < args.length; index += 1) {
|
|
228
510
|
const arg = args[index];
|
|
229
511
|
if (arg === "--suite") {
|
|
@@ -231,9 +513,19 @@ function parseRunArgs(args) {
|
|
|
231
513
|
index += 1;
|
|
232
514
|
continue;
|
|
233
515
|
}
|
|
516
|
+
if (arg === "--suite-def") {
|
|
517
|
+
suiteDefinition = args[index + 1];
|
|
518
|
+
index += 1;
|
|
519
|
+
continue;
|
|
520
|
+
}
|
|
521
|
+
if (arg === "--variant-set") {
|
|
522
|
+
variantSetName = args[index + 1];
|
|
523
|
+
index += 1;
|
|
524
|
+
continue;
|
|
525
|
+
}
|
|
234
526
|
if (arg === "--provider") {
|
|
235
527
|
const provider = args[index + 1];
|
|
236
|
-
if (provider !== "mock" && provider !== "openai" && provider !== "external_process") {
|
|
528
|
+
if (provider !== "mock" && provider !== "openai" && provider !== "external_process" && provider !== "http") {
|
|
237
529
|
throw new Error(`Unsupported provider '${String(provider)}'.`);
|
|
238
530
|
}
|
|
239
531
|
runtimeConfig.provider = provider;
|
|
@@ -261,17 +553,26 @@ function parseRunArgs(args) {
|
|
|
261
553
|
}
|
|
262
554
|
throw new Error(`Unexpected argument '${arg}'.`);
|
|
263
555
|
}
|
|
264
|
-
return { scenarioId, suite, runtimeConfig };
|
|
556
|
+
return { scenarioId, suite, suiteDefinition, variantSetName, runtimeConfig };
|
|
265
557
|
}
|
|
266
558
|
function validateRuntimeConfig(config) {
|
|
267
559
|
if (config.agentName) {
|
|
268
560
|
const registration = getAgentRegistration(config.agentName);
|
|
269
561
|
config.provider = registration.provider;
|
|
270
|
-
config.model = config.model ?? registration.model;
|
|
271
562
|
config.label = config.label ?? registration.label ?? registration.name;
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
563
|
+
if (registration.provider !== "http") {
|
|
564
|
+
config.model = config.model ?? registration.model;
|
|
565
|
+
config.command = registration.command;
|
|
566
|
+
config.args = registration.args;
|
|
567
|
+
config.envAllowlist = registration.envAllowlist;
|
|
568
|
+
}
|
|
569
|
+
else {
|
|
570
|
+
config.url = registration.url;
|
|
571
|
+
config.request_template = registration.request_template;
|
|
572
|
+
config.response_field = registration.response_field;
|
|
573
|
+
config.headers = registration.headers;
|
|
574
|
+
config.timeout_ms = registration.timeout_ms;
|
|
575
|
+
}
|
|
275
576
|
}
|
|
276
577
|
if (config.provider === "openai") {
|
|
277
578
|
if (!process.env.OPENAI_API_KEY) {
|
|
@@ -288,9 +589,25 @@ function validateRuntimeConfig(config) {
|
|
|
288
589
|
}
|
|
289
590
|
config.label = config.label ?? config.agentName ?? "external-process-agent";
|
|
290
591
|
}
|
|
592
|
+
if (config.provider === "http") {
|
|
593
|
+
if (!config.url) {
|
|
594
|
+
throw new Error("HTTP agents require a configured url. Use --agent <name> with provider: http in agentlab.config.yaml.");
|
|
595
|
+
}
|
|
596
|
+
config.label = config.label ?? config.agentName ?? "http-agent";
|
|
597
|
+
}
|
|
291
598
|
return config;
|
|
292
599
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
600
|
+
if (isEntrypoint()) {
|
|
601
|
+
main().catch((error) => {
|
|
602
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
603
|
+
console.error(formatCliErrorMessage(message));
|
|
604
|
+
process.exitCode = 1;
|
|
605
|
+
});
|
|
606
|
+
}
|
|
607
|
+
function isEntrypoint() {
|
|
608
|
+
const entry = process.argv[1];
|
|
609
|
+
if (!entry) {
|
|
610
|
+
return false;
|
|
611
|
+
}
|
|
612
|
+
return import.meta.url === pathToFileURL(entry).href;
|
|
613
|
+
}
|