agentgrader 1.0.7 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +356 -20
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { cac } from 'cac';
|
|
4
|
+
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
4
5
|
import { randomUUID } from 'crypto';
|
|
5
|
-
import { resolve, dirname, isAbsolute } from 'path';
|
|
6
|
+
import { resolve, dirname, isAbsolute, basename } from 'path';
|
|
6
7
|
import { render, Box, Text } from 'ink';
|
|
7
|
-
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
8
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
9
9
|
import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
|
|
10
10
|
import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
@@ -13,9 +13,158 @@ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agen
|
|
|
13
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
14
14
|
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
15
|
import { parse, stringify } from 'yaml';
|
|
16
|
-
import { ZodError } from 'zod';
|
|
16
|
+
import { z, ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
|
+
var CONTENT_PREVIEW_MAX = 200;
|
|
20
|
+
var ANSI = {
|
|
21
|
+
reset: "\x1B[0m",
|
|
22
|
+
gray: "\x1B[90m",
|
|
23
|
+
yellow: "\x1B[33m",
|
|
24
|
+
cyan: "\x1B[36m",
|
|
25
|
+
blue: "\x1B[34m"};
|
|
26
|
+
function paint(text, code) {
|
|
27
|
+
if (!process.stdout.isTTY) return text;
|
|
28
|
+
return `${code}${text}${ANSI.reset}`;
|
|
29
|
+
}
|
|
30
|
+
function truncateContent(content, full) {
|
|
31
|
+
if (full || content.length <= CONTENT_PREVIEW_MAX) return content;
|
|
32
|
+
return `${content.slice(0, CONTENT_PREVIEW_MAX)}...`;
|
|
33
|
+
}
|
|
34
|
+
function normalizeContent(content) {
|
|
35
|
+
return (content ?? "").trim();
|
|
36
|
+
}
|
|
37
|
+
function formatStepSummary(step, full) {
|
|
38
|
+
if (!step) return "(no step)";
|
|
39
|
+
const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
|
|
40
|
+
if (!step.content) return label;
|
|
41
|
+
const preview = truncateContent(step.content.replace(/\n/g, " "), full);
|
|
42
|
+
return `${label} ${preview}`;
|
|
43
|
+
}
|
|
44
|
+
function stepsByIndex(traces) {
|
|
45
|
+
const map = /* @__PURE__ */ new Map();
|
|
46
|
+
for (const step of traces) {
|
|
47
|
+
map.set(step.stepIndex, step);
|
|
48
|
+
}
|
|
49
|
+
return map;
|
|
50
|
+
}
|
|
51
|
+
/**
 * Decides whether the steps two runs recorded at the same step index differ.
 *
 * Steps are compared on `kind`, on `tool` (a missing tool is treated as ""),
 * and on `content` after it has been passed through `normalizeContent`.
 *
 * @param {object | undefined} a - Step from run A at this index, if any.
 * @param {object | undefined} b - Step from run B at this index, if any.
 * @returns {boolean} `true` when the steps differ or only one run has a step
 *   at this index; `false` when they match or when neither run has a step.
 */
function stepsDiverge(a, b) {
  // An index recorded by neither run is a gap in both traces, not a difference.
  // Reporting it as divergent would inflate the diff count and could move the
  // "first divergence" marker onto a step that does not exist in either run.
  if (!a && !b) return false;
  // Present in exactly one run.
  if (!a || !b) return true;
  if (a.kind !== b.kind) return true;
  if ((a.tool ?? "") !== (b.tool ?? "")) return true;
  return normalizeContent(a.content) !== normalizeContent(b.content);
}
|
|
58
|
+
function formatRunStatus(run) {
|
|
59
|
+
const passed = run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : "";
|
|
60
|
+
return `${run.status}${passed}`;
|
|
61
|
+
}
|
|
62
|
+
function printRunHeader(label, run) {
|
|
63
|
+
const tag = label === "A" ? paint(`Run A (${run.id})`, ANSI.cyan) : paint(`Run B (${run.id})`, ANSI.blue);
|
|
64
|
+
console.log(tag);
|
|
65
|
+
console.log(` test case: ${run.testCaseId}`);
|
|
66
|
+
console.log(` agent config: ${run.agentConfigId}`);
|
|
67
|
+
console.log(` status: ${formatRunStatus(run)}`);
|
|
68
|
+
console.log(` steps: ${run.stepsCount}`);
|
|
69
|
+
console.log(` cost: $${run.costUsd.toFixed(4)}`);
|
|
70
|
+
console.log(` duration: ${run.durationMs}ms`);
|
|
71
|
+
if (run.error) console.log(` error: ${run.error}`);
|
|
72
|
+
}
|
|
73
|
+
async function compareCommand(runIdA, runIdB, opts) {
|
|
74
|
+
const db = initDb();
|
|
75
|
+
const [runA, runB, tracesA, tracesB] = await Promise.all([
|
|
76
|
+
getRun(db, runIdA),
|
|
77
|
+
getRun(db, runIdB),
|
|
78
|
+
getTraces(db, runIdA),
|
|
79
|
+
getTraces(db, runIdB)
|
|
80
|
+
]);
|
|
81
|
+
if (!runA) {
|
|
82
|
+
console.error(`Run not found: ${runIdA}`);
|
|
83
|
+
process.exit(1);
|
|
84
|
+
}
|
|
85
|
+
if (!runB) {
|
|
86
|
+
console.error(`Run not found: ${runIdB}`);
|
|
87
|
+
process.exit(1);
|
|
88
|
+
}
|
|
89
|
+
console.log("");
|
|
90
|
+
printRunHeader("A", runA);
|
|
91
|
+
console.log("");
|
|
92
|
+
printRunHeader("B", runB);
|
|
93
|
+
console.log("");
|
|
94
|
+
if (runA.testCaseId !== runB.testCaseId) {
|
|
95
|
+
console.log(
|
|
96
|
+
paint(
|
|
97
|
+
"\u26A0\uFE0F Comparing runs of different test cases - step alignment may not be meaningful.",
|
|
98
|
+
ANSI.yellow
|
|
99
|
+
)
|
|
100
|
+
);
|
|
101
|
+
console.log("");
|
|
102
|
+
}
|
|
103
|
+
const mapA = stepsByIndex(tracesA);
|
|
104
|
+
const mapB = stepsByIndex(tracesB);
|
|
105
|
+
const maxIndex = Math.max(
|
|
106
|
+
tracesA.length > 0 ? Math.max(...tracesA.map((s) => s.stepIndex)) : -1,
|
|
107
|
+
tracesB.length > 0 ? Math.max(...tracesB.map((s) => s.stepIndex)) : -1,
|
|
108
|
+
-1
|
|
109
|
+
);
|
|
110
|
+
if (maxIndex < 0) {
|
|
111
|
+
console.log("No steps recorded for either run.");
|
|
112
|
+
return;
|
|
113
|
+
}
|
|
114
|
+
const divergentIndices = /* @__PURE__ */ new Set();
|
|
115
|
+
for (let i = 0; i <= maxIndex; i++) {
|
|
116
|
+
if (stepsDiverge(mapA.get(i), mapB.get(i))) {
|
|
117
|
+
divergentIndices.add(i);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
const visibleIndices = /* @__PURE__ */ new Set();
|
|
121
|
+
if (opts.onlyDiff) {
|
|
122
|
+
for (const idx of divergentIndices) {
|
|
123
|
+
visibleIndices.add(idx);
|
|
124
|
+
if (idx > 0) visibleIndices.add(idx - 1);
|
|
125
|
+
if (idx < maxIndex) visibleIndices.add(idx + 1);
|
|
126
|
+
}
|
|
127
|
+
} else {
|
|
128
|
+
for (let i = 0; i <= maxIndex; i++) visibleIndices.add(i);
|
|
129
|
+
}
|
|
130
|
+
const sortedVisible = [...visibleIndices].sort((a, b) => a - b);
|
|
131
|
+
if (sortedVisible.length === 0) {
|
|
132
|
+
console.log("No divergent steps (nothing to show with --only-diff).");
|
|
133
|
+
} else {
|
|
134
|
+
console.log("Step comparison:");
|
|
135
|
+
for (const i of sortedVisible) {
|
|
136
|
+
const stepA = mapA.get(i);
|
|
137
|
+
const stepB = mapB.get(i);
|
|
138
|
+
const divergent = stepsDiverge(stepA, stepB);
|
|
139
|
+
if (divergent) {
|
|
140
|
+
console.log(paint(`[step ${i}] DIVERGENT`, ANSI.yellow));
|
|
141
|
+
console.log(` A: ${formatStepSummary(stepA, opts.full ?? false)}`);
|
|
142
|
+
console.log(` B: ${formatStepSummary(stepB, opts.full ?? false)}`);
|
|
143
|
+
} else {
|
|
144
|
+
const line = formatStepSummary(stepA ?? stepB, opts.full ?? false);
|
|
145
|
+
console.log(paint(`[step ${i}] (same)`, ANSI.gray));
|
|
146
|
+
console.log(` ${line}`);
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
const totalSteps = maxIndex + 1;
|
|
151
|
+
const diffCount = divergentIndices.size;
|
|
152
|
+
let firstDivergence = null;
|
|
153
|
+
for (let i = 0; i <= maxIndex; i++) {
|
|
154
|
+
if (divergentIndices.has(i)) {
|
|
155
|
+
firstDivergence = i;
|
|
156
|
+
break;
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
console.log("");
|
|
160
|
+
console.log(`${diffCount} of ${totalSteps} step(s) differ.`);
|
|
161
|
+
if (firstDivergence !== null) {
|
|
162
|
+
console.log(`First divergence at step ${firstDivergence}.`);
|
|
163
|
+
} else {
|
|
164
|
+
console.log("No divergence detected.");
|
|
165
|
+
}
|
|
166
|
+
console.log("");
|
|
167
|
+
}
|
|
19
168
|
var CONFIG_COL_WIDTH = 24;
|
|
20
169
|
var CONFIG_LABEL_MAX = 20;
|
|
21
170
|
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
|
|
@@ -179,6 +328,125 @@ function loadAgentConfig(yamlPath) {
|
|
|
179
328
|
}
|
|
180
329
|
return config;
|
|
181
330
|
}
|
|
331
|
+
|
|
332
|
+
// src/lib/resolve-agent-config-paths.ts
|
|
333
|
+
/**
 * Converts a simple file-name glob into an anchored regular expression.
 *
 * Supported wildcards:
 *   - `*` matches any run of characters (including none).
 *   - `?` matches exactly one character other than `/`.
 * Every other regex metacharacter in the glob is matched literally.
 *
 * @param {string} glob - Glob pattern, e.g. `"*.yaml"` or `"agent-?.yml"`.
 * @returns {RegExp} Expression that matches the whole candidate string.
 */
function globToRegex(glob) {
  const pattern = glob
    // Escape regex metacharacters first so the wildcard expansions below are
    // the only unescaped regex syntax in the result.
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*/g, ".*")
    // `?` used to pass through untouched and act as a regex quantifier, so
    // "agent-?.yaml" failed to match "agent-1.yaml" and a leading "?" threw.
    .replace(/\?/g, "[^/]");
  return new RegExp(`^${pattern}$`);
}
|
|
337
|
+
/**
 * Walks `dir` depth-first and returns the absolute path of every entry whose
 * name ends in `.yaml` or `.yml`.
 *
 * Entries whose name starts with "." are skipped, which also keeps the walk
 * out of hidden directories. Results are in directory-listing order; callers
 * sort them.
 *
 * NOTE(review): `statSync` follows symlinks, so a symlink that points back at
 * an ancestor directory is not detected here - confirm whether that matters.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths of the YAML files found under `dir`.
 * @throws If `dir` itself cannot be listed.
 */
function collectYamlFilesRecursive(dir) {
  const files = [];
  for (const entry of readdirSync(dir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(dir, entry);
    // A dangling symlink, or an entry removed since the directory was listed,
    // has nothing to stat. Skip it instead of aborting the whole scan.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat) continue;
    if (stat.isDirectory()) {
      files.push(...collectYamlFilesRecursive(fullPath));
    } else if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files;
}
|
|
351
|
+
/**
 * Lists the `.yaml` / `.yml` files that sit directly inside `dir`
 * (no recursion).
 *
 * Entries whose name starts with "." and entries that are not regular files
 * are ignored.
 *
 * @param {string} dir - Directory to list; resolved against the current
 *   working directory when relative.
 * @returns {string[]} Absolute file paths, sorted.
 * @throws If `dir` cannot be listed.
 */
function findAgentConfigYamlFilesInDir(dir) {
  const resolvedDir = resolve(dir);
  const files = [];
  for (const entry of readdirSync(resolvedDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(resolvedDir, entry);
    // A dangling symlink, or an entry removed since the directory was listed,
    // has nothing to stat. Treat it as "not a file" instead of throwing.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat || !stat.isFile()) continue;
    if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files.sort();
}
|
|
364
|
+
/**
 * Expands one agent-config glob pattern into a sorted list of absolute paths.
 *
 * Two pattern shapes are handled:
 *   - Recursive (`dir/**` followed by an optional suffix): every YAML file
 *     under `dir` is collected with `collectYamlFilesRecursive`, then
 *     filtered by the part after `**`. A suffix without "/" is matched
 *     against the file name. A suffix containing "/" (for example
 *     `configs/*.yaml`) is matched against the tail of the path relative to
 *     `dir`. With no suffix, `*.yaml` is used.
 *   - Flat (`dir/file-glob`): only the entries directly inside `dir` are
 *     considered; wildcards in the directory part are not expanded.
 *
 * A leading "./" on the pattern is ignored.
 *
 * @param {string} globPattern - Pattern using "/" as the separator.
 * @param {string} baseDir - Directory the pattern is relative to.
 * @returns {string[]} Matching absolute file paths, sorted.
 * @throws If the directory to scan cannot be listed.
 */
function expandAgentConfigGlob(globPattern, baseDir) {
  const base = resolve(baseDir);
  const normalized = globPattern.replace(/^\.\//, "");
  if (normalized.includes("**")) {
    const [prefix, suffixPart] = normalized.split("**");
    const rootPart = prefix.replace(/\/$/, "");
    const searchRoot = rootPart ? resolve(base, rootPart) : base;
    const suffix = (suffixPart ?? "").replace(/^\//, "") || "*.yaml";
    const candidates = collectYamlFilesRecursive(searchRoot);
    if (!suffix.includes("/")) {
      const nameRegex = globToRegex(suffix);
      return candidates.filter((filePath) => nameRegex.test(basename(filePath))).sort();
    }
    // The suffix names directories, so a basename can never match it. Compare
    // against the path relative to the search root instead. The "*/" prefix
    // and the leading "/" on the candidate let `**` stand for zero or more
    // leading directories.
    const pathRegex = globToRegex(`*/${suffix}`);
    const rootLength = /[\\/]$/.test(searchRoot) ? searchRoot.length : searchRoot.length + 1;
    return candidates.filter((filePath) => {
      const relativePath = filePath.slice(rootLength).replace(/\\/g, "/");
      return pathRegex.test(`/${relativePath}`);
    }).sort();
  }
  const slashIdx = normalized.lastIndexOf("/");
  const cwd = slashIdx === -1 ? base : resolve(base, normalized.slice(0, slashIdx));
  const fileGlob = slashIdx === -1 ? normalized : normalized.slice(slashIdx + 1);
  const regex = globToRegex(fileGlob);
  return readdirSync(cwd).filter((entry) => {
    if (entry.startsWith(".")) return false;
    // Entries with nothing to stat (dangling symlink, removed mid-scan) are
    // skipped instead of aborting the expansion.
    const stat = statSync(resolve(cwd, entry), { throwIfNoEntry: false });
    return stat !== undefined && stat.isFile() && regex.test(entry);
  }).map((entry) => resolve(cwd, entry)).sort();
}
|
|
384
|
+
function resolveAgentConfigPathList(input) {
|
|
385
|
+
const paths = /* @__PURE__ */ new Set();
|
|
386
|
+
if (input.commaSeparated) {
|
|
387
|
+
for (const part of input.commaSeparated.split(",")) {
|
|
388
|
+
const trimmed = part.trim();
|
|
389
|
+
if (trimmed) paths.add(resolve(trimmed));
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
if (input.dir) {
|
|
393
|
+
for (const file of findAgentConfigYamlFilesInDir(input.dir)) {
|
|
394
|
+
paths.add(file);
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
const baseDir = input.relativeTo ? resolve(input.relativeTo) : process.cwd();
|
|
398
|
+
if (input.explicitPaths) {
|
|
399
|
+
for (const p of input.explicitPaths) {
|
|
400
|
+
paths.add(resolve(baseDir, p));
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
if (input.globs) {
|
|
404
|
+
for (const pattern of input.globs) {
|
|
405
|
+
for (const file of expandAgentConfigGlob(pattern, baseDir)) {
|
|
406
|
+
paths.add(file);
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
const sorted = [...paths].sort();
|
|
411
|
+
if (sorted.length === 0) {
|
|
412
|
+
throw new Error("No agent config YAML files found.");
|
|
413
|
+
}
|
|
414
|
+
return sorted;
|
|
415
|
+
}
|
|
416
|
+
function loadAgentConfigsFromPaths(paths) {
|
|
417
|
+
return paths.map((p) => loadAgentConfig(p));
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
// src/lib/load-bench-manifest.ts
|
|
421
|
+
var AgentsSchema = z.object({
|
|
422
|
+
paths: z.array(z.string()).optional(),
|
|
423
|
+
glob: z.union([z.string(), z.array(z.string())]).optional()
|
|
424
|
+
}).refine((data) => (data.paths?.length ?? 0) > 0 || data.glob !== void 0, {
|
|
425
|
+
message: "agents must specify at least one of paths or glob"
|
|
426
|
+
});
|
|
427
|
+
var BenchManifestSchema = z.object({
|
|
428
|
+
name: z.string().optional(),
|
|
429
|
+
suite: z.string(),
|
|
430
|
+
agents: AgentsSchema,
|
|
431
|
+
concurrency: z.number().optional()
|
|
432
|
+
});
|
|
433
|
+
function loadBenchManifest(yamlPath) {
|
|
434
|
+
const path = resolve(yamlPath);
|
|
435
|
+
const raw = parse(readFileSync(path, "utf-8"));
|
|
436
|
+
return BenchManifestSchema.parse(raw);
|
|
437
|
+
}
|
|
438
|
+
function resolveManifestAgentConfigPaths(manifest, manifestPath) {
|
|
439
|
+
const manifestDir = dirname(resolve(manifestPath));
|
|
440
|
+
const globs = manifest.agents.glob ? Array.isArray(manifest.agents.glob) ? manifest.agents.glob : [manifest.agents.glob] : void 0;
|
|
441
|
+
return resolveAgentConfigPathList({
|
|
442
|
+
explicitPaths: manifest.agents.paths,
|
|
443
|
+
globs,
|
|
444
|
+
relativeTo: manifestDir
|
|
445
|
+
});
|
|
446
|
+
}
|
|
447
|
+
function resolveManifestSuiteDir(manifest, manifestPath) {
|
|
448
|
+
return resolve(dirname(resolve(manifestPath)), manifest.suite);
|
|
449
|
+
}
|
|
182
450
|
function loadMatrix(yamlPath) {
|
|
183
451
|
const path = resolve(yamlPath);
|
|
184
452
|
const fileContent = readFileSync(path, "utf-8");
|
|
@@ -272,26 +540,57 @@ function findTestCaseYamlFiles(dir) {
|
|
|
272
540
|
return files;
|
|
273
541
|
}
|
|
274
542
|
async function runBenchCommand(opts) {
|
|
275
|
-
|
|
276
|
-
|
|
543
|
+
let suiteDir;
|
|
544
|
+
let concurrency = opts.concurrency ?? 2;
|
|
277
545
|
let agentConfigs;
|
|
278
546
|
let matrixId;
|
|
279
|
-
if (opts.
|
|
280
|
-
const
|
|
281
|
-
|
|
282
|
-
|
|
547
|
+
if (opts.manifest) {
|
|
548
|
+
const manifestPath = resolve(opts.manifest);
|
|
549
|
+
const manifest = loadBenchManifest(manifestPath);
|
|
550
|
+
suiteDir = resolveManifestSuiteDir(manifest, manifestPath);
|
|
551
|
+
if (manifest.concurrency !== void 0 && opts.concurrency === void 0) {
|
|
552
|
+
concurrency = manifest.concurrency;
|
|
553
|
+
}
|
|
554
|
+
if (opts.matrix) {
|
|
555
|
+
throw new Error("Use either --manifest or --matrix, not both.");
|
|
556
|
+
}
|
|
557
|
+
const configPaths = resolveManifestAgentConfigPaths(manifest, manifestPath);
|
|
558
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
283
559
|
console.log(
|
|
284
|
-
`
|
|
560
|
+
`Bench manifest "${manifest.name ?? manifestPath}" loaded ${agentConfigs.length} agent config(s) from ${configPaths.length} file(s).`
|
|
285
561
|
);
|
|
286
|
-
} else if (opts.configs) {
|
|
287
|
-
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
288
|
-
agentConfigs = configPaths.map((p) => loadAgentConfig(p));
|
|
289
562
|
} else {
|
|
290
|
-
|
|
563
|
+
if (!opts.suite) {
|
|
564
|
+
throw new Error("--suite is required unless --manifest is provided.");
|
|
565
|
+
}
|
|
566
|
+
suiteDir = resolve(opts.suite);
|
|
567
|
+
if (opts.matrix) {
|
|
568
|
+
if (opts.configs || opts.configsDir) {
|
|
569
|
+
throw new Error("Use either --matrix or --configs/--configs-dir, not both.");
|
|
570
|
+
}
|
|
571
|
+
const matrix = loadMatrix(opts.matrix);
|
|
572
|
+
agentConfigs = expandMatrix(matrix);
|
|
573
|
+
matrixId = randomUUID();
|
|
574
|
+
console.log(
|
|
575
|
+
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
576
|
+
);
|
|
577
|
+
} else {
|
|
578
|
+
const configPaths = resolveAgentConfigPathList({
|
|
579
|
+
commaSeparated: opts.configs,
|
|
580
|
+
dir: opts.configsDir
|
|
581
|
+
});
|
|
582
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
583
|
+
if (opts.configsDir) {
|
|
584
|
+
console.log(`Loaded ${agentConfigs.length} agent config(s) from ${opts.configsDir}.`);
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
if (agentConfigs.length === 0) {
|
|
589
|
+
throw new Error("No agent configs to benchmark.");
|
|
291
590
|
}
|
|
292
591
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
293
592
|
if (yamlFiles.length === 0) {
|
|
294
|
-
console.error(`No test cases found in suite directory: ${
|
|
593
|
+
console.error(`No test cases found in suite directory: ${suiteDir}`);
|
|
295
594
|
process.exit(1);
|
|
296
595
|
}
|
|
297
596
|
const testCases = [];
|
|
@@ -879,25 +1178,51 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
879
1178
|
process.exit(1);
|
|
880
1179
|
}
|
|
881
1180
|
});
|
|
882
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1181
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1182
|
+
"--configs-dir <dir>",
|
|
1183
|
+
"Directory of AgentConfig YAML files (all .yaml/.yml files in the folder)"
|
|
1184
|
+
).option(
|
|
1185
|
+
"--manifest <manifest>",
|
|
1186
|
+
"Path to a bench manifest YAML (suite + agent paths/glob in one file)"
|
|
1187
|
+
).option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
883
1188
|
"--matrix <matrix>",
|
|
884
1189
|
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
885
|
-
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
1190
|
+
).example("agr bench --manifest bench.yaml").example("agr bench --suite tasks --configs-dir ./agents").example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
886
1191
|
if (!options.configs && options.config) {
|
|
887
1192
|
options.configs = options.config;
|
|
888
1193
|
}
|
|
889
|
-
|
|
1194
|
+
const agentSourceCount = [
|
|
1195
|
+
options.configs,
|
|
1196
|
+
options.configsDir,
|
|
1197
|
+
options.matrix,
|
|
1198
|
+
options.manifest
|
|
1199
|
+
].filter(Boolean).length;
|
|
1200
|
+
if (options.manifest) {
|
|
1201
|
+
if (agentSourceCount > 1) {
|
|
1202
|
+
console.error(
|
|
1203
|
+
"Error: --manifest cannot be combined with --configs, --configs-dir, or --matrix."
|
|
1204
|
+
);
|
|
1205
|
+
process.exit(1);
|
|
1206
|
+
}
|
|
1207
|
+
} else if (!options.suite || agentSourceCount === 0) {
|
|
890
1208
|
console.error(
|
|
891
|
-
"Error: --suite
|
|
1209
|
+
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
|
|
1210
|
+
);
|
|
1211
|
+
process.exit(1);
|
|
1212
|
+
} else if (agentSourceCount > 1) {
|
|
1213
|
+
console.error(
|
|
1214
|
+
"Error: use only one agent source: --configs, --configs-dir, or --matrix."
|
|
892
1215
|
);
|
|
893
1216
|
process.exit(1);
|
|
894
1217
|
}
|
|
895
1218
|
try {
|
|
896
1219
|
await runBenchCommand({
|
|
897
1220
|
configs: options.configs,
|
|
1221
|
+
configsDir: options.configsDir,
|
|
898
1222
|
suite: options.suite,
|
|
899
1223
|
concurrency: Number(options.concurrency),
|
|
900
|
-
matrix: options.matrix
|
|
1224
|
+
matrix: options.matrix,
|
|
1225
|
+
manifest: options.manifest
|
|
901
1226
|
});
|
|
902
1227
|
} catch (err) {
|
|
903
1228
|
console.error(`Error executing benchmark: ${err.message}`);
|
|
@@ -943,6 +1268,17 @@ cli.command("trace <runId>", "Show the step trace and metrics for a single run")
|
|
|
943
1268
|
process.exit(1);
|
|
944
1269
|
}
|
|
945
1270
|
});
|
|
1271
|
+
cli.command("compare <runIdA> <runIdB>", "Compare the step traces of two runs side by side").option("--full", "Print full step content without truncation").option(
|
|
1272
|
+
"--only-diff",
|
|
1273
|
+
"Show only divergent steps plus one step of context before and after each"
|
|
1274
|
+
).example("agr compare <runIdA> <runIdB> --only-diff").action(async (runIdA, runIdB, options) => {
|
|
1275
|
+
try {
|
|
1276
|
+
await compareCommand(runIdA, runIdB, options);
|
|
1277
|
+
} catch (err) {
|
|
1278
|
+
console.error(`Error executing compare: ${err.message}`);
|
|
1279
|
+
process.exit(1);
|
|
1280
|
+
}
|
|
1281
|
+
});
|
|
946
1282
|
cli.help();
|
|
947
1283
|
try {
|
|
948
1284
|
cli.parse();
|