agentgrader 1.0.7 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +195 -19
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { cac } from 'cac';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
|
-
import { resolve, dirname, isAbsolute } from 'path';
|
|
5
|
+
import { resolve, dirname, isAbsolute, basename } from 'path';
|
|
6
6
|
import { render, Box, Text } from 'ink';
|
|
7
7
|
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
8
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
@@ -13,7 +13,7 @@ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agen
|
|
|
13
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
14
14
|
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
15
|
import { parse, stringify } from 'yaml';
|
|
16
|
-
import { ZodError } from 'zod';
|
|
16
|
+
import { z, ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
19
|
var CONFIG_COL_WIDTH = 24;
|
|
@@ -179,6 +179,125 @@ function loadAgentConfig(yamlPath) {
|
|
|
179
179
|
}
|
|
180
180
|
return config;
|
|
181
181
|
}
|
|
182
|
+
|
|
183
|
+
// src/lib/resolve-agent-config-paths.ts
|
|
184
|
+
/**
 * Converts a simple file-name glob into an anchored regular expression.
 *
 * Supported wildcards:
 *   - `*` matches any run of characters (including none)
 *   - `?` matches exactly one character
 * Every other regex metacharacter is escaped and matched literally.
 *
 * @param {string} glob - Glob pattern, e.g. `agent-*.yaml`.
 * @returns {RegExp} Expression that must match the whole candidate string.
 */
function globToRegex(glob) {
  // One pass over every special character: wildcards are translated, the rest
  // are escaped. `?` previously fell through unescaped and acted as a regex
  // quantifier on the preceding character instead of a single-char wildcard.
  const pattern = glob.replace(/[.+^${}()|[\]\\?*]/g, (ch) => {
    if (ch === "*") return ".*";
    if (ch === "?") return ".";
    return `\\${ch}`;
  });
  return new RegExp(`^${pattern}$`);
}
|
|
188
|
+
/**
 * Recursively collects every `.yaml` / `.yml` file beneath `dir`.
 *
 * Entries whose name starts with `.` are skipped (files and directories).
 * Entries that cannot be stat'ed because their target does not exist
 * (e.g. a dangling symlink) are skipped instead of aborting the scan.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Matching file paths, in directory-listing order (unsorted).
 * @throws If `dir` itself cannot be read.
 */
function collectYamlFilesRecursive(dir) {
  const files = [];
  for (const entry of readdirSync(dir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(dir, entry);
    // statSync follows symlinks; with throwIfNoEntry disabled a missing target
    // yields `undefined` rather than an ENOENT exception.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat) continue;
    if (stat.isDirectory()) {
      files.push(...collectYamlFilesRecursive(fullPath));
    } else if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files;
}
|
|
202
|
+
/**
 * Lists the `.yaml` / `.yml` regular files directly inside `dir`
 * (no recursion into sub-directories).
 *
 * Dot-prefixed entries are ignored. Entries whose target does not exist
 * (e.g. a dangling symlink) are ignored instead of raising.
 *
 * @param {string} dir - Directory to scan; resolved against the current working directory.
 * @returns {string[]} Absolute file paths, sorted lexicographically.
 * @throws If `dir` cannot be read.
 */
function findAgentConfigYamlFilesInDir(dir) {
  const resolvedDir = resolve(dir);
  const files = [];
  for (const entry of readdirSync(resolvedDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(resolvedDir, entry);
    // `undefined` when the entry's target is missing; treat that as "not a file".
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat?.isFile()) continue;
    if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files.sort();
}
|
|
215
|
+
/**
 * Expands a small glob dialect into a sorted list of absolute file paths.
 *
 * Two pattern shapes are handled:
 *   - `<dir>/**` + `/<suffix>`: walks `<dir>` recursively (YAML files only) and
 *     keeps files whose trailing path segments match `<suffix>`. An empty
 *     suffix defaults to `*.yaml`. Only the first `**` is recursive; redundant
 *     `**` segments immediately after it are collapsed, and any later `**`
 *     matches a single path segment.
 *   - `<dir>/<fileGlob>`: lists `<dir>` (non-recursive) and keeps regular
 *     files whose name matches `<fileGlob>`. Wildcards in the directory part
 *     are not expanded.
 *
 * @param {string} globPattern - Pattern, optionally prefixed with `./`.
 * @param {string} baseDir - Directory the pattern is resolved against.
 * @returns {string[]} Sorted absolute paths of matching files.
 * @throws If the directory to scan cannot be read.
 */
function expandAgentConfigGlob(globPattern, baseDir) {
  const base = resolve(baseDir);
  const normalized = globPattern.replace(/^\.\//, "");
  const recursiveIdx = normalized.indexOf("**");
  if (recursiveIdx !== -1) {
    const prefix = normalized.slice(0, recursiveIdx).replace(/\/$/, "");
    const searchRoot = prefix ? resolve(base, prefix) : base;
    // Everything after the first `**` (not just up to the next `**`), with
    // redundant leading `/**` groups folded into the recursion already done.
    const suffix = normalized.slice(recursiveIdx + 2).replace(/^(?:\/?\*\*)*\/?/, "") || "*.yaml";
    const suffixRegex = globToRegex(suffix);
    const depth = suffix.split("/").length;
    return collectYamlFilesRecursive(searchRoot).filter((filePath) => {
      if (depth === 1) return suffixRegex.test(basename(filePath));
      // A suffix such as `sub/*.yaml` spans several path segments, so compare
      // it with the same number of trailing segments below the search root;
      // testing only the basename could never match a pattern containing `/`.
      const segments = filePath.slice(searchRoot.length).split(/[\\/]/).filter(Boolean);
      if (segments.length < depth) return false;
      return suffixRegex.test(segments.slice(-depth).join("/"));
    }).sort();
  }
  const slashIdx = normalized.lastIndexOf("/");
  const cwd = slashIdx === -1 ? base : resolve(base, normalized.slice(0, slashIdx));
  const fileGlob = slashIdx === -1 ? normalized : normalized.slice(slashIdx + 1);
  const regex = globToRegex(fileGlob);
  return readdirSync(cwd).filter((entry) => {
    if (entry.startsWith(".")) return false;
    if (!regex.test(entry)) return false;
    // A missing target (dangling symlink) yields `undefined` instead of throwing.
    const stat = statSync(resolve(cwd, entry), { throwIfNoEntry: false });
    return Boolean(stat?.isFile());
  }).map((entry) => resolve(cwd, entry)).sort();
}
|
|
235
|
+
/**
 * Merges every supported way of naming agent-config files into one
 * de-duplicated, sorted list of absolute paths.
 *
 * @param {object} input
 * @param {string|string[]} [input.commaSeparated] - Comma-separated paths,
 *   resolved against the current working directory. An array of such strings
 *   is also accepted (CLI parsers commonly produce one when a flag is repeated).
 * @param {string} [input.dir] - Directory whose top-level YAML files are added.
 * @param {string[]} [input.explicitPaths] - Paths resolved against `relativeTo`.
 * @param {string[]} [input.globs] - Glob patterns expanded against `relativeTo`.
 * @param {string} [input.relativeTo] - Base directory for `explicitPaths` and
 *   `globs`; defaults to the current working directory.
 * @returns {string[]} Sorted, unique absolute paths.
 * @throws {Error} If no path was produced by any source.
 */
function resolveAgentConfigPathList(input) {
  const paths = /* @__PURE__ */ new Set();
  if (input.commaSeparated) {
    // Normalise to a list so a repeated flag (array value) does not crash on
    // `.split`, then split each value on commas as before.
    const rawValues = Array.isArray(input.commaSeparated) ? input.commaSeparated : [input.commaSeparated];
    for (const raw of rawValues) {
      for (const part of String(raw).split(",")) {
        const trimmed = part.trim();
        if (trimmed) paths.add(resolve(trimmed));
      }
    }
  }
  if (input.dir) {
    for (const file of findAgentConfigYamlFilesInDir(input.dir)) {
      paths.add(file);
    }
  }
  const baseDir = input.relativeTo ? resolve(input.relativeTo) : process.cwd();
  if (input.explicitPaths) {
    for (const p of input.explicitPaths) {
      paths.add(resolve(baseDir, p));
    }
  }
  if (input.globs) {
    for (const pattern of input.globs) {
      for (const file of expandAgentConfigGlob(pattern, baseDir)) {
        paths.add(file);
      }
    }
  }
  const sorted = [...paths].sort();
  if (sorted.length === 0) {
    throw new Error("No agent config YAML files found.");
  }
  return sorted;
}
|
|
267
|
+
/**
 * Loads one agent config per path via `loadAgentConfig`, preserving order.
 *
 * @param {string[]} paths - Agent-config YAML file paths.
 * @returns {Array} The loaded configs, in the same order as `paths`.
 */
function loadAgentConfigsFromPaths(paths) {
  const configs = [];
  for (const configPath of paths) {
    configs.push(loadAgentConfig(configPath));
  }
  return configs;
}
|
|
270
|
+
|
|
271
|
+
// src/lib/load-bench-manifest.ts
|
|
272
|
+
// Schema for the `agents` section of a bench manifest: explicit `paths`
// and/or one or more `glob` patterns. At least one of them must actually name
// something; an empty string or empty array for `glob` does not count,
// since it would resolve to no files further down the line.
var AgentsSchema = z.object({
  paths: z.array(z.string()).optional(),
  glob: z.union([z.string(), z.array(z.string())]).optional()
}).refine((data) => {
  const hasPaths = (data.paths?.length ?? 0) > 0;
  const hasGlob = Array.isArray(data.glob) ? data.glob.length > 0 : Boolean(data.glob);
  return hasPaths || hasGlob;
}, {
  message: "agents must specify at least one of paths or glob"
});
|
|
278
|
+
// Schema for a bench manifest file: an optional display `name`, the `suite`
// path, the `agents` selection, and an optional `concurrency`.
var BenchManifestSchema = z.object({
  name: z.string().optional(),
  suite: z.string(),
  agents: AgentsSchema,
  // Reject 0, negative and fractional values at load time rather than passing
  // them on as a parallelism setting.
  concurrency: z.number().int().positive().optional()
});
|
|
284
|
+
/**
 * Reads a bench manifest YAML file and validates it against
 * `BenchManifestSchema`.
 *
 * @param {string} yamlPath - Manifest path; resolved against the current working directory.
 * @returns The parsed manifest as returned by `BenchManifestSchema.parse`.
 * @throws If the file cannot be read, or if YAML parsing or schema validation throws.
 */
function loadBenchManifest(yamlPath) {
  const manifestFile = resolve(yamlPath);
  const yamlText = readFileSync(manifestFile, "utf-8");
  const document = parse(yamlText);
  return BenchManifestSchema.parse(document);
}
|
|
289
|
+
/**
 * Resolves the agent-config files named by a manifest's `agents` section.
 * Explicit paths and globs are both interpreted relative to the directory
 * that contains the manifest file.
 *
 * @param {object} manifest - Parsed bench manifest.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string[]} The list produced by `resolveAgentConfigPathList`.
 */
function resolveManifestAgentConfigPaths(manifest, manifestPath) {
  const relativeTo = dirname(resolve(manifestPath));
  const { glob, paths: explicitPaths } = manifest.agents;
  // `glob` may be a single pattern or a list; a falsy value means "no globs".
  let globs;
  if (glob) {
    globs = Array.isArray(glob) ? glob : [glob];
  }
  return resolveAgentConfigPathList({ explicitPaths, globs, relativeTo });
}
|
|
298
|
+
/**
 * Resolves the manifest's `suite` entry against the directory that contains
 * the manifest file.
 *
 * @param {{ suite: string }} manifest - Parsed bench manifest.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string} Absolute path of the suite directory.
 */
function resolveManifestSuiteDir(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  return resolve(manifestDir, manifest.suite);
}
|
|
182
301
|
function loadMatrix(yamlPath) {
|
|
183
302
|
const path = resolve(yamlPath);
|
|
184
303
|
const fileContent = readFileSync(path, "utf-8");
|
|
@@ -272,26 +391,57 @@ function findTestCaseYamlFiles(dir) {
|
|
|
272
391
|
return files;
|
|
273
392
|
}
|
|
274
393
|
async function runBenchCommand(opts) {
|
|
275
|
-
|
|
276
|
-
|
|
394
|
+
let suiteDir;
|
|
395
|
+
let concurrency = opts.concurrency ?? 2;
|
|
277
396
|
let agentConfigs;
|
|
278
397
|
let matrixId;
|
|
279
|
-
if (opts.
|
|
280
|
-
const
|
|
281
|
-
|
|
282
|
-
|
|
398
|
+
if (opts.manifest) {
|
|
399
|
+
const manifestPath = resolve(opts.manifest);
|
|
400
|
+
const manifest = loadBenchManifest(manifestPath);
|
|
401
|
+
suiteDir = resolveManifestSuiteDir(manifest, manifestPath);
|
|
402
|
+
if (manifest.concurrency !== void 0 && opts.concurrency === void 0) {
|
|
403
|
+
concurrency = manifest.concurrency;
|
|
404
|
+
}
|
|
405
|
+
if (opts.matrix) {
|
|
406
|
+
throw new Error("Use either --manifest or --matrix, not both.");
|
|
407
|
+
}
|
|
408
|
+
const configPaths = resolveManifestAgentConfigPaths(manifest, manifestPath);
|
|
409
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
283
410
|
console.log(
|
|
284
|
-
`
|
|
411
|
+
`Bench manifest "${manifest.name ?? manifestPath}" loaded ${agentConfigs.length} agent config(s) from ${configPaths.length} file(s).`
|
|
285
412
|
);
|
|
286
|
-
} else if (opts.configs) {
|
|
287
|
-
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
288
|
-
agentConfigs = configPaths.map((p) => loadAgentConfig(p));
|
|
289
413
|
} else {
|
|
290
|
-
|
|
414
|
+
if (!opts.suite) {
|
|
415
|
+
throw new Error("--suite is required unless --manifest is provided.");
|
|
416
|
+
}
|
|
417
|
+
suiteDir = resolve(opts.suite);
|
|
418
|
+
if (opts.matrix) {
|
|
419
|
+
if (opts.configs || opts.configsDir) {
|
|
420
|
+
throw new Error("Use either --matrix or --configs/--configs-dir, not both.");
|
|
421
|
+
}
|
|
422
|
+
const matrix = loadMatrix(opts.matrix);
|
|
423
|
+
agentConfigs = expandMatrix(matrix);
|
|
424
|
+
matrixId = randomUUID();
|
|
425
|
+
console.log(
|
|
426
|
+
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
427
|
+
);
|
|
428
|
+
} else {
|
|
429
|
+
const configPaths = resolveAgentConfigPathList({
|
|
430
|
+
commaSeparated: opts.configs,
|
|
431
|
+
dir: opts.configsDir
|
|
432
|
+
});
|
|
433
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
434
|
+
if (opts.configsDir) {
|
|
435
|
+
console.log(`Loaded ${agentConfigs.length} agent config(s) from ${opts.configsDir}.`);
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
if (agentConfigs.length === 0) {
|
|
440
|
+
throw new Error("No agent configs to benchmark.");
|
|
291
441
|
}
|
|
292
442
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
293
443
|
if (yamlFiles.length === 0) {
|
|
294
|
-
console.error(`No test cases found in suite directory: ${
|
|
444
|
+
console.error(`No test cases found in suite directory: ${suiteDir}`);
|
|
295
445
|
process.exit(1);
|
|
296
446
|
}
|
|
297
447
|
const testCases = [];
|
|
@@ -879,25 +1029,51 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
879
1029
|
process.exit(1);
|
|
880
1030
|
}
|
|
881
1031
|
});
|
|
882
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1032
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1033
|
+
"--configs-dir <dir>",
|
|
1034
|
+
"Directory of AgentConfig YAML files (all .yaml/.yml files in the folder)"
|
|
1035
|
+
).option(
|
|
1036
|
+
"--manifest <manifest>",
|
|
1037
|
+
"Path to a bench manifest YAML (suite + agent paths/glob in one file)"
|
|
1038
|
+
).option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
883
1039
|
"--matrix <matrix>",
|
|
884
1040
|
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
885
|
-
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
1041
|
+
).example("agr bench --manifest bench.yaml").example("agr bench --suite tasks --configs-dir ./agents").example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
886
1042
|
if (!options.configs && options.config) {
|
|
887
1043
|
options.configs = options.config;
|
|
888
1044
|
}
|
|
889
|
-
|
|
1045
|
+
const agentSourceCount = [
|
|
1046
|
+
options.configs,
|
|
1047
|
+
options.configsDir,
|
|
1048
|
+
options.matrix,
|
|
1049
|
+
options.manifest
|
|
1050
|
+
].filter(Boolean).length;
|
|
1051
|
+
if (options.manifest) {
|
|
1052
|
+
if (agentSourceCount > 1) {
|
|
1053
|
+
console.error(
|
|
1054
|
+
"Error: --manifest cannot be combined with --configs, --configs-dir, or --matrix."
|
|
1055
|
+
);
|
|
1056
|
+
process.exit(1);
|
|
1057
|
+
}
|
|
1058
|
+
} else if (!options.suite || agentSourceCount === 0) {
|
|
1059
|
+
console.error(
|
|
1060
|
+
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
|
|
1061
|
+
);
|
|
1062
|
+
process.exit(1);
|
|
1063
|
+
} else if (agentSourceCount > 1) {
|
|
890
1064
|
console.error(
|
|
891
|
-
"Error:
|
|
1065
|
+
"Error: use only one agent source: --configs, --configs-dir, or --matrix."
|
|
892
1066
|
);
|
|
893
1067
|
process.exit(1);
|
|
894
1068
|
}
|
|
895
1069
|
try {
|
|
896
1070
|
await runBenchCommand({
|
|
897
1071
|
configs: options.configs,
|
|
1072
|
+
configsDir: options.configsDir,
|
|
898
1073
|
suite: options.suite,
|
|
899
1074
|
concurrency: Number(options.concurrency),
|
|
900
|
-
matrix: options.matrix
|
|
1075
|
+
matrix: options.matrix,
|
|
1076
|
+
manifest: options.manifest
|
|
901
1077
|
});
|
|
902
1078
|
} catch (err) {
|
|
903
1079
|
console.error(`Error executing benchmark: ${err.message}`);
|