agentgrader 1.0.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +195 -19
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import 'dotenv/config';
3
3
  import { cac } from 'cac';
4
4
  import { randomUUID } from 'crypto';
5
- import { resolve, dirname, isAbsolute } from 'path';
5
+ import { resolve, dirname, isAbsolute, basename } from 'path';
6
6
  import { render, Box, Text } from 'ink';
7
7
  import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
8
8
  import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
@@ -13,7 +13,7 @@ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agen
13
13
  import { jsx, jsxs } from 'react/jsx-runtime';
14
14
  import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
15
15
  import { parse, stringify } from 'yaml';
16
- import { ZodError } from 'zod';
16
+ import { z, ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
19
19
  var CONFIG_COL_WIDTH = 24;
@@ -179,6 +179,125 @@ function loadAgentConfig(yamlPath) {
179
179
  }
180
180
  return config;
181
181
  }
182
+
183
+ // src/lib/resolve-agent-config-paths.ts
184
/**
 * Convert a simple file-name glob into an anchored regular expression.
 *
 * Supported wildcards:
 *   - `*` matches any run of characters (including none)
 *   - `?` matches exactly one character
 * Every other regex metacharacter is escaped and matched literally.
 *
 * @param {string} glob - Glob pattern for a single file name, e.g. `*.yaml`.
 * @returns {RegExp} Regex that must match the whole candidate string.
 */
function globToRegex(glob) {
  // Single pass over every special character: wildcards are translated,
  // everything else is backslash-escaped. Previously `?` was left untouched
  // and silently acted as a regex quantifier on the preceding character.
  const source = glob.replace(/[.+^${}()|[\]\\*?]/g, (ch) => {
    if (ch === "*") return ".*";
    if (ch === "?") return ".";
    return `\\${ch}`;
  });
  return new RegExp(`^${source}$`);
}
188
/**
 * Recursively collect every `.yaml` / `.yml` file beneath a directory.
 *
 * Entries whose name starts with `.` are skipped (files and directories alike).
 * Entries that cannot be stat'ed because their target no longer exists
 * (e.g. a dangling symlink) are skipped instead of aborting the whole scan.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Absolute file paths in directory-listing order (unsorted).
 */
function collectYamlFilesRecursive(dir) {
  const files = [];
  for (const entry of readdirSync(dir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(dir, entry);
    // throwIfNoEntry:false makes statSync return undefined on ENOENT so one
    // broken link does not take down the entire discovery pass.
    const stats = statSync(fullPath, { throwIfNoEntry: false });
    if (!stats) continue;
    if (stats.isDirectory()) {
      files.push(...collectYamlFilesRecursive(fullPath));
    } else if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files;
}
202
/**
 * List the `.yaml` / `.yml` files that sit directly inside a directory
 * (no recursion).
 *
 * Dot-prefixed entries and anything that is not a regular file are ignored.
 * Entries whose target is missing (e.g. a dangling symlink) are ignored too,
 * rather than failing the whole listing with ENOENT.
 *
 * @param {string} dir - Directory to scan; resolved against the current working directory.
 * @returns {string[]} Sorted absolute paths of the YAML files found.
 */
function findAgentConfigYamlFilesInDir(dir) {
  const resolvedDir = resolve(dir);
  const files = [];
  for (const entry of readdirSync(resolvedDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(resolvedDir, entry);
    // statSync returns undefined (instead of throwing) when the entry's
    // target does not exist; treat that the same as "not a file".
    const stats = statSync(fullPath, { throwIfNoEntry: false });
    if (!stats?.isFile()) continue;
    if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files.sort();
}
215
/**
 * Expand one agent-config glob pattern into a sorted list of absolute paths.
 *
 * Two pattern shapes are handled:
 *   - Patterns containing `**`: the text before the first `**` names the
 *     directory to walk recursively, and the text between the first and
 *     second `**` (default `*.yaml`) is matched against each file's
 *     basename only.
 *   - Everything else: the part before the last `/` is taken as a literal
 *     directory and the part after it is matched against the names of the
 *     regular, non-dot files in that directory.
 *
 * @param {string} globPattern - Pattern, optionally prefixed with `./`.
 * @param {string} baseDir - Directory the pattern is relative to.
 * @returns {string[]} Sorted absolute paths of the matching files.
 */
function expandAgentConfigGlob(globPattern, baseDir) {
  const root = resolve(baseDir);
  const pattern = globPattern.replace(/^\.\//, "");

  if (pattern.includes("**")) {
    const pieces = pattern.split("**");
    const leading = pieces[0].replace(/\/$/, "");
    const searchRoot = leading ? resolve(root, leading) : root;
    const trailing = (pieces[1] ?? "").replace(/^\//, "");
    const nameMatcher = globToRegex(trailing || "*.yaml");

    const recursiveMatches = [];
    for (const filePath of collectYamlFilesRecursive(searchRoot)) {
      if (nameMatcher.test(basename(filePath))) {
        recursiveMatches.push(filePath);
      }
    }
    return recursiveMatches.sort();
  }

  // Non-recursive form: split "some/dir/name-glob" at the final slash.
  const lastSlash = pattern.lastIndexOf("/");
  let listingDir = root;
  let namePattern = pattern;
  if (lastSlash !== -1) {
    listingDir = resolve(root, pattern.slice(0, lastSlash));
    namePattern = pattern.slice(lastSlash + 1);
  }
  const entryMatcher = globToRegex(namePattern);

  const directMatches = [];
  for (const entry of readdirSync(listingDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(listingDir, entry);
    if (statSync(fullPath).isFile() && entryMatcher.test(entry)) {
      directMatches.push(fullPath);
    }
  }
  return directMatches.sort();
}
235
/**
 * Merge every supported way of naming agent-config files into one sorted,
 * de-duplicated list of absolute paths.
 *
 * @param {object} input
 * @param {string} [input.commaSeparated] - Comma-separated paths, resolved against the current working directory.
 * @param {string} [input.dir] - Directory whose top-level YAML files are all included.
 * @param {string} [input.relativeTo] - Base directory for `explicitPaths` and `globs`; defaults to the current working directory.
 * @param {Iterable<string>} [input.explicitPaths] - Individual paths relative to the base directory.
 * @param {Iterable<string>} [input.globs] - Glob patterns relative to the base directory.
 * @returns {string[]} Sorted unique absolute paths.
 * @throws {Error} When no path at all was produced.
 */
function resolveAgentConfigPathList(input) {
  const collected = /* @__PURE__ */ new Set();

  if (input.commaSeparated) {
    const pieces = input.commaSeparated.split(",");
    for (const piece of pieces) {
      const candidate = piece.trim();
      // Blank segments (e.g. from "a.yaml,,b.yaml") are ignored.
      if (!candidate) continue;
      collected.add(resolve(candidate));
    }
  }

  if (input.dir) {
    for (const found of findAgentConfigYamlFilesInDir(input.dir)) {
      collected.add(found);
    }
  }

  const baseDir = input.relativeTo ? resolve(input.relativeTo) : process.cwd();

  if (input.explicitPaths) {
    for (const explicitPath of input.explicitPaths) {
      collected.add(resolve(baseDir, explicitPath));
    }
  }

  if (input.globs) {
    for (const globPattern of input.globs) {
      for (const match of expandAgentConfigGlob(globPattern, baseDir)) {
        collected.add(match);
      }
    }
  }

  if (collected.size === 0) {
    throw new Error("No agent config YAML files found.");
  }
  return [...collected].sort();
}
267
/**
 * Load one agent config per path, preserving the order of `paths`.
 *
 * @param {string[]} paths - Agent-config YAML file paths.
 * @returns {Array} The value returned by `loadAgentConfig` for each path.
 */
function loadAgentConfigsFromPaths(paths) {
  // Wrap the loader so it only ever receives the path, never map()'s
  // extra (index, array) arguments.
  const loadOne = (configPath) => loadAgentConfig(configPath);
  return paths.map(loadOne);
}
270
+
271
+ // src/lib/load-bench-manifest.ts
272
// Schema for the `agents` section of a bench manifest.
// `paths` is an optional list of config file paths; `glob` is an optional
// pattern or list of patterns. Validation fails unless `paths` is non-empty
// or `glob` is present.
var AgentsSchema = z
  .object({
    paths: z.array(z.string()).optional(),
    glob: z.union([z.string(), z.array(z.string())]).optional()
  })
  .refine(
    ({ paths, glob }) => glob !== undefined || (paths !== undefined && paths.length > 0),
    { message: "agents must specify at least one of paths or glob" }
  );
278
// Schema for a bench manifest YAML file.
//   name        - optional display name for the manifest
//   suite       - path to the test suite directory (required)
//   agents      - where to find agent configs (see AgentsSchema)
//   concurrency - optional number of parallel executions
var BenchManifestSchema = z.object({
  name: z.string().optional(),
  suite: z.string(),
  agents: AgentsSchema,
  // A parallelism count only makes sense as a positive whole number; reject
  // 0, negatives and fractions at load time instead of passing them on.
  concurrency: z.number().int().positive().optional()
});
284
/**
 * Read a bench manifest YAML file and validate it against BenchManifestSchema.
 *
 * @param {string} yamlPath - Path to the manifest; resolved against the current working directory.
 * @returns {object} The parsed manifest as returned by the schema.
 * @throws If the file cannot be read, the YAML cannot be parsed, or schema validation fails.
 */
function loadBenchManifest(yamlPath) {
  const manifestText = readFileSync(resolve(yamlPath), "utf-8");
  const document = parse(manifestText);
  return BenchManifestSchema.parse(document);
}
289
/**
 * Resolve the agent-config files referenced by a bench manifest.
 *
 * Both `agents.paths` and `agents.glob` are interpreted relative to the
 * directory that contains the manifest file.
 *
 * @param {object} manifest - Parsed bench manifest.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string[]} Sorted unique absolute config paths.
 */
function resolveManifestAgentConfigPaths(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));

  // Normalise `glob` to an array; a missing or empty-string glob means "no globs".
  const rawGlob = manifest.agents.glob;
  let globs;
  if (rawGlob) {
    globs = Array.isArray(rawGlob) ? rawGlob : [rawGlob];
  }

  return resolveAgentConfigPathList({
    explicitPaths: manifest.agents.paths,
    globs,
    relativeTo: manifestDir
  });
}
298
/**
 * Resolve a manifest's `suite` entry to an absolute directory path,
 * interpreting it relative to the directory containing the manifest file.
 *
 * @param {{ suite: string }} manifest - Parsed bench manifest.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string} Absolute path of the suite directory.
 */
function resolveManifestSuiteDir(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  return resolve(manifestDir, manifest.suite);
}
182
301
  function loadMatrix(yamlPath) {
183
302
  const path = resolve(yamlPath);
184
303
  const fileContent = readFileSync(path, "utf-8");
@@ -272,26 +391,57 @@ function findTestCaseYamlFiles(dir) {
272
391
  return files;
273
392
  }
274
393
  async function runBenchCommand(opts) {
275
- const suiteDir = resolve(opts.suite);
276
- const concurrency = opts.concurrency || 2;
394
+ let suiteDir;
395
+ let concurrency = opts.concurrency ?? 2;
277
396
  let agentConfigs;
278
397
  let matrixId;
279
- if (opts.matrix) {
280
- const matrix = loadMatrix(opts.matrix);
281
- agentConfigs = expandMatrix(matrix);
282
- matrixId = randomUUID();
398
+ if (opts.manifest) {
399
+ const manifestPath = resolve(opts.manifest);
400
+ const manifest = loadBenchManifest(manifestPath);
401
+ suiteDir = resolveManifestSuiteDir(manifest, manifestPath);
402
+ if (manifest.concurrency !== void 0 && opts.concurrency === void 0) {
403
+ concurrency = manifest.concurrency;
404
+ }
405
+ if (opts.matrix) {
406
+ throw new Error("Use either --manifest or --matrix, not both.");
407
+ }
408
+ const configPaths = resolveManifestAgentConfigPaths(manifest, manifestPath);
409
+ agentConfigs = loadAgentConfigsFromPaths(configPaths);
283
410
  console.log(
284
- `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
411
+ `Bench manifest "${manifest.name ?? manifestPath}" loaded ${agentConfigs.length} agent config(s) from ${configPaths.length} file(s).`
285
412
  );
286
- } else if (opts.configs) {
287
- const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
288
- agentConfigs = configPaths.map((p) => loadAgentConfig(p));
289
413
  } else {
290
- throw new Error("Either --configs or --matrix must be provided.");
414
+ if (!opts.suite) {
415
+ throw new Error("--suite is required unless --manifest is provided.");
416
+ }
417
+ suiteDir = resolve(opts.suite);
418
+ if (opts.matrix) {
419
+ if (opts.configs || opts.configsDir) {
420
+ throw new Error("Use either --matrix or --configs/--configs-dir, not both.");
421
+ }
422
+ const matrix = loadMatrix(opts.matrix);
423
+ agentConfigs = expandMatrix(matrix);
424
+ matrixId = randomUUID();
425
+ console.log(
426
+ `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
427
+ );
428
+ } else {
429
+ const configPaths = resolveAgentConfigPathList({
430
+ commaSeparated: opts.configs,
431
+ dir: opts.configsDir
432
+ });
433
+ agentConfigs = loadAgentConfigsFromPaths(configPaths);
434
+ if (opts.configsDir) {
435
+ console.log(`Loaded ${agentConfigs.length} agent config(s) from ${opts.configsDir}.`);
436
+ }
437
+ }
438
+ }
439
+ if (agentConfigs.length === 0) {
440
+ throw new Error("No agent configs to benchmark.");
291
441
  }
292
442
  const yamlFiles = findTestCaseYamlFiles(suiteDir);
293
443
  if (yamlFiles.length === 0) {
294
- console.error(`No test cases found in suite directory: ${opts.suite}`);
444
+ console.error(`No test cases found in suite directory: ${suiteDir}`);
295
445
  process.exit(1);
296
446
  }
297
447
  const testCases = [];
@@ -879,25 +1029,51 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
879
1029
  process.exit(1);
880
1030
  }
881
1031
  });
882
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
1032
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
1033
+ "--configs-dir <dir>",
1034
+ "Directory of AgentConfig YAML files (all .yaml/.yml files in the folder)"
1035
+ ).option(
1036
+ "--manifest <manifest>",
1037
+ "Path to a bench manifest YAML (suite + agent paths/glob in one file)"
1038
+ ).option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
883
1039
  "--matrix <matrix>",
884
1040
  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
885
- ).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
1041
+ ).example("agr bench --manifest bench.yaml").example("agr bench --suite tasks --configs-dir ./agents").example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
886
1042
  if (!options.configs && options.config) {
887
1043
  options.configs = options.config;
888
1044
  }
889
- if (!options.suite || !options.configs && !options.matrix) {
1045
+ const agentSourceCount = [
1046
+ options.configs,
1047
+ options.configsDir,
1048
+ options.matrix,
1049
+ options.manifest
1050
+ ].filter(Boolean).length;
1051
+ if (options.manifest) {
1052
+ if (agentSourceCount > 1) {
1053
+ console.error(
1054
+ "Error: --manifest cannot be combined with --configs, --configs-dir, or --matrix."
1055
+ );
1056
+ process.exit(1);
1057
+ }
1058
+ } else if (!options.suite || agentSourceCount === 0) {
1059
+ console.error(
1060
+ "Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
1061
+ );
1062
+ process.exit(1);
1063
+ } else if (agentSourceCount > 1) {
890
1064
  console.error(
891
- "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
1065
+ "Error: use only one agent source: --configs, --configs-dir, or --matrix."
892
1066
  );
893
1067
  process.exit(1);
894
1068
  }
895
1069
  try {
896
1070
  await runBenchCommand({
897
1071
  configs: options.configs,
1072
+ configsDir: options.configsDir,
898
1073
  suite: options.suite,
899
1074
  concurrency: Number(options.concurrency),
900
- matrix: options.matrix
1075
+ matrix: options.matrix,
1076
+ manifest: options.manifest
901
1077
  });
902
1078
  } catch (err) {
903
1079
  console.error(`Error executing benchmark: ${err.message}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.7",
3
+ "version": "1.1.0",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",