agentgrader 1.0.6 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +274 -46
  2. package/package.json +3 -3
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import 'dotenv/config';
3
3
  import { cac } from 'cac';
4
4
  import { randomUUID } from 'crypto';
5
- import { resolve, dirname, isAbsolute } from 'path';
5
+ import { resolve, dirname, isAbsolute, basename } from 'path';
6
6
  import { render, Box, Text } from 'ink';
7
7
  import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
8
8
  import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
@@ -13,9 +13,15 @@ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agen
13
13
  import { jsx, jsxs } from 'react/jsx-runtime';
14
14
  import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
15
15
  import { parse, stringify } from 'yaml';
16
- import { ZodError } from 'zod';
16
+ import { z, ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
19
+ var CONFIG_COL_WIDTH = 24;
20
+ var CONFIG_LABEL_MAX = 20;
21
/**
 * Shorten a label so it fits within `max` UTF-16 code units.
 *
 * Labels that already fit are returned unchanged. Longer labels are cut to
 * `max - 1` code units and terminated with a single ellipsis (U+2026).
 *
 * @param {string} name - Label to shorten.
 * @param {number} [max=CONFIG_LABEL_MAX] - Maximum length of the result.
 * @returns {string} The original label, or its truncated form ending in "…".
 */
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
  if (name.length <= max) return name;
  // A negative slice end would count from the end of the string and return
  // almost the whole label, so there is nothing sensible to keep below 1.
  if (max < 1) return "";
  let head = name.slice(0, max - 1);
  // Never end on a high surrogate: cutting an astral character (e.g. an emoji)
  // in half would leave an unpaired surrogate in front of the ellipsis.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    head = head.slice(0, -1);
  }
  return `${head}\u2026`;
}
19
25
  var Dashboard = ({ runs, testCases, configs, isFinished }) => {
20
26
  let totalCost = 0;
21
27
  let totalSteps = 0;
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
67
73
  r.testCaseId
68
74
  ] }),
69
75
  /* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
70
- /* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
76
+ /* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
71
77
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
72
78
  " (Steps: ",
73
79
  r.stepsCount,
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
81
87
  /* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
82
88
  /* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
83
89
  /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
84
- configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", children: cfg }) }, cfg))
90
+ configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
85
91
  ] }),
86
92
  testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
87
- /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
93
+ /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
88
94
  configs.map((cfg) => {
89
95
  const key = `${tc}_${cfg}`;
90
96
  const run = runs[key];
91
97
  if (!run) {
92
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
98
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
93
99
  }
94
100
  if (run.status === "running") {
95
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
101
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
96
102
  }
97
103
  if (run.status === "failed" || !run.passed) {
98
104
  const seconds2 = (run.durationMs / 1e3).toFixed(1);
99
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "red", children: [
105
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
100
106
  "\u2717 ",
101
107
  seconds2,
102
108
  "s ($",
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
105
111
  ] }) }, cfg);
106
112
  }
107
113
  const seconds = (run.durationMs / 1e3).toFixed(1);
108
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "green", children: [
114
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
109
115
  "\u2713 ",
110
116
  seconds,
111
117
  "s ($",
@@ -173,6 +179,125 @@ function loadAgentConfig(yamlPath) {
173
179
  }
174
180
  return config;
175
181
  }
182
+
183
+ // src/lib/resolve-agent-config-paths.ts
184
/**
 * Translate a simple file-name glob into an anchored regular expression.
 *
 * Supported wildcards: `*` (any run of characters, including none) and `?`
 * (exactly one character). Every other regex metacharacter in the glob is
 * escaped and matched literally.
 *
 * @param {string} glob - Glob pattern such as "*.yaml" or "agent-?.yml".
 * @returns {RegExp} Regular expression that must match the whole input.
 */
function globToRegex(glob) {
  const source = glob
    // Escape regex metacharacters first so they are matched literally.
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*/g, ".*")
    // `?` is not part of the escape set above; left alone it would act as a
    // regex quantifier on the preceding character instead of a wildcard.
    .replace(/\?/g, ".");
  return new RegExp(`^${source}$`);
}
188
/**
 * Walk `dir` depth-first and collect every `.yaml` / `.yml` file below it.
 *
 * Entries whose name starts with "." are skipped, so hidden directories are
 * never descended into. Entries that cannot be stat'ed because they no longer
 * exist (for example a dangling symlink) are ignored instead of aborting the
 * whole scan.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Resolved file paths, in directory-listing order.
 */
function collectYamlFilesRecursive(dir) {
  const files = [];
  for (const entry of readdirSync(dir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(dir, entry);
    // throwIfNoEntry: false makes statSync return undefined for a missing
    // target rather than throwing ENOENT.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat) continue;
    if (stat.isDirectory()) {
      for (const nested of collectYamlFilesRecursive(fullPath)) {
        files.push(nested);
      }
    } else if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files;
}
202
/**
 * List the `.yaml` / `.yml` files that sit directly inside `dir`.
 *
 * The scan is not recursive. Entries whose name starts with "." and entries
 * that are not regular files are skipped. An entry that cannot be stat'ed
 * because its target is missing (a dangling symlink) is skipped as well
 * rather than failing the whole listing.
 *
 * @param {string} dir - Directory to scan; resolved against the current working directory.
 * @returns {string[]} Sorted list of resolved file paths.
 */
function findAgentConfigYamlFilesInDir(dir) {
  const resolvedDir = resolve(dir);
  const files = [];
  for (const entry of readdirSync(resolvedDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(resolvedDir, entry);
    // statSync returns undefined (instead of throwing ENOENT) for a missing
    // target, so `?.` turns that case into "not a file".
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat?.isFile()) continue;
    if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files.sort();
}
215
/**
 * Expand one agent-config glob pattern into a sorted list of file paths.
 *
 * Two pattern shapes are handled:
 *  - Patterns containing `**`: the text before the first `**` is the directory
 *    to search recursively, and the text after it is matched against each
 *    file's base name (defaulting to "*.yaml" when nothing follows).
 *  - All other patterns: the part before the last "/" is taken as a literal
 *    directory and the part after it is matched against that directory's
 *    direct entries.
 *
 * A pattern whose directory does not exist expands to an empty list, like a
 * glob with no matches, instead of raising a raw ENOENT error.
 *
 * @param {string} globPattern - Pattern relative to `baseDir`; a leading "./" is ignored.
 * @param {string} baseDir - Directory the pattern is resolved against.
 * @returns {string[]} Sorted list of resolved paths of matching files.
 */
function expandAgentConfigGlob(globPattern, baseDir) {
  const base = resolve(baseDir);
  const normalized = globPattern.replace(/^\.\//, "");
  if (normalized.includes("**")) {
    // Only the segments around the first "**" are considered.
    const [prefix, suffixPart] = normalized.split("**");
    const prefixDir = prefix.replace(/\/$/, "");
    const searchRoot = prefixDir ? resolve(base, prefixDir) : base;
    if (!existsSync(searchRoot)) return [];
    const suffix = (suffixPart ?? "").replace(/^\//, "") || "*.yaml";
    const nameRegex = globToRegex(suffix);
    return collectYamlFilesRecursive(searchRoot).filter((filePath) => nameRegex.test(basename(filePath))).sort();
  }
  const slashIdx = normalized.lastIndexOf("/");
  const cwd = slashIdx === -1 ? base : resolve(base, normalized.slice(0, slashIdx));
  if (!existsSync(cwd)) return [];
  const fileGlob = slashIdx === -1 ? normalized : normalized.slice(slashIdx + 1);
  const regex = globToRegex(fileGlob);
  return readdirSync(cwd).filter((entry) => {
    if (entry.startsWith(".") || !regex.test(entry)) return false;
    // A dangling symlink yields undefined here and is treated as "not a file".
    const stat = statSync(resolve(cwd, entry), { throwIfNoEntry: false });
    return stat?.isFile() === true;
  }).map((entry) => resolve(cwd, entry)).sort();
}
235
/**
 * Merge every configured agent-config source into one sorted, de-duplicated
 * list of paths.
 *
 * Sources, all optional:
 *  - `input.commaSeparated`: comma-separated paths, resolved against the
 *    current working directory; blank items are ignored.
 *  - `input.dir`: a directory whose YAML files are all included.
 *  - `input.explicitPaths`: paths resolved against `input.relativeTo`
 *    (or the current working directory when it is not set).
 *  - `input.globs`: glob patterns expanded against that same base directory.
 *
 * @param {object} input - Source description as outlined above.
 * @returns {string[]} Sorted list of unique paths.
 * @throws {Error} When no path was produced by any source.
 */
function resolveAgentConfigPathList(input) {
  const unique = /* @__PURE__ */ new Set();
  const addEach = (items, toPath) => {
    for (const item of items) {
      unique.add(toPath(item));
    }
  };
  const asIs = (file) => file;

  if (input.commaSeparated) {
    const pieces = input.commaSeparated.split(",").map((piece) => piece.trim()).filter((piece) => piece);
    addEach(pieces, (piece) => resolve(piece));
  }
  if (input.dir) {
    addEach(findAgentConfigYamlFilesInDir(input.dir), asIs);
  }

  // Explicit paths and globs share one base: the manifest directory when
  // given, otherwise the process working directory.
  const baseDir = input.relativeTo ? resolve(input.relativeTo) : process.cwd();
  if (input.explicitPaths) {
    addEach(input.explicitPaths, (explicit) => resolve(baseDir, explicit));
  }
  if (input.globs) {
    for (const pattern of input.globs) {
      addEach(expandAgentConfigGlob(pattern, baseDir), asIs);
    }
  }

  const ordered = Array.from(unique).sort();
  if (ordered.length === 0) {
    throw new Error("No agent config YAML files found.");
  }
  return ordered;
}
267
/**
 * Load one agent config per path, preserving the order of `paths`.
 *
 * @param {string[]} paths - Agent config YAML file paths.
 * @returns {Array} The values returned by `loadAgentConfig` for each path.
 */
function loadAgentConfigsFromPaths(paths) {
  const loadOne = (configPath) => {
    return loadAgentConfig(configPath);
  };
  return paths.map((configPath) => loadOne(configPath));
}
270
+
271
+ // src/lib/load-bench-manifest.ts
272
// Shape of the `agents` section of a bench manifest: an optional list of
// explicit paths and/or an optional glob (one pattern or a list of patterns).
// The refinement rejects a section that has neither a non-empty `paths` list
// nor a `glob` key.
var AgentsSchema = z.object({
  paths: z.array(z.string()).optional(),
  glob: z.union([z.string(), z.array(z.string())]).optional()
}).refine(
  ({ paths, glob }) => glob !== void 0 || (paths?.length ?? 0) > 0,
  { message: "agents must specify at least one of paths or glob" }
);

// Shape of a whole bench manifest file: a required suite path and agents
// section, plus an optional display name and concurrency.
var BenchManifestSchema = z.object({
  name: z.string().optional(),
  suite: z.string(),
  agents: AgentsSchema,
  concurrency: z.number().optional()
});
284
/**
 * Read a bench manifest YAML file and validate it against BenchManifestSchema.
 *
 * @param {string} yamlPath - Path to the manifest; resolved against the current working directory.
 * @returns {object} The parsed manifest as returned by the schema.
 * @throws When the file cannot be read, or when the schema's `parse` rejects the content.
 */
function loadBenchManifest(yamlPath) {
  const manifestText = readFileSync(resolve(yamlPath), "utf-8");
  const document = parse(manifestText);
  return BenchManifestSchema.parse(document);
}
289
/**
 * Resolve the agent config files referenced by a bench manifest.
 *
 * Explicit paths and glob patterns from `manifest.agents` are interpreted
 * relative to the directory that contains the manifest file.
 *
 * @param {object} manifest - Parsed manifest with an `agents` section.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string[]} The list produced by `resolveAgentConfigPathList`.
 */
function resolveManifestAgentConfigPaths(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  const { glob } = manifest.agents;
  // Normalise `glob` to a list: it may be a single pattern or already a list.
  let globs;
  if (glob) {
    globs = Array.isArray(glob) ? glob : [glob];
  }
  return resolveAgentConfigPathList({
    explicitPaths: manifest.agents.paths,
    globs,
    relativeTo: manifestDir
  });
}
298
/**
 * Resolve the manifest's `suite` entry to a directory path.
 *
 * A relative `suite` is interpreted relative to the directory containing the
 * manifest file; an absolute `suite` is returned as-is (normalised).
 *
 * @param {object} manifest - Parsed manifest with a `suite` string.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string} Absolute path of the suite directory.
 */
function resolveManifestSuiteDir(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  const { suite } = manifest;
  return resolve(manifestDir, suite);
}
176
301
  function loadMatrix(yamlPath) {
177
302
  const path = resolve(yamlPath);
178
303
  const fileContent = readFileSync(path, "utf-8");
@@ -266,26 +391,57 @@ function findTestCaseYamlFiles(dir) {
266
391
  return files;
267
392
  }
268
393
  async function runBenchCommand(opts) {
269
- const suiteDir = resolve(opts.suite);
270
- const concurrency = opts.concurrency || 2;
394
+ let suiteDir;
395
+ let concurrency = opts.concurrency ?? 2;
271
396
  let agentConfigs;
272
397
  let matrixId;
273
- if (opts.matrix) {
274
- const matrix = loadMatrix(opts.matrix);
275
- agentConfigs = expandMatrix(matrix);
276
- matrixId = randomUUID();
398
+ if (opts.manifest) {
399
+ const manifestPath = resolve(opts.manifest);
400
+ const manifest = loadBenchManifest(manifestPath);
401
+ suiteDir = resolveManifestSuiteDir(manifest, manifestPath);
402
+ if (manifest.concurrency !== void 0 && opts.concurrency === void 0) {
403
+ concurrency = manifest.concurrency;
404
+ }
405
+ if (opts.matrix) {
406
+ throw new Error("Use either --manifest or --matrix, not both.");
407
+ }
408
+ const configPaths = resolveManifestAgentConfigPaths(manifest, manifestPath);
409
+ agentConfigs = loadAgentConfigsFromPaths(configPaths);
277
410
  console.log(
278
- `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
411
+ `Bench manifest "${manifest.name ?? manifestPath}" loaded ${agentConfigs.length} agent config(s) from ${configPaths.length} file(s).`
279
412
  );
280
- } else if (opts.configs) {
281
- const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
282
- agentConfigs = configPaths.map((p) => loadAgentConfig(p));
283
413
  } else {
284
- throw new Error("Either --configs or --matrix must be provided.");
414
+ if (!opts.suite) {
415
+ throw new Error("--suite is required unless --manifest is provided.");
416
+ }
417
+ suiteDir = resolve(opts.suite);
418
+ if (opts.matrix) {
419
+ if (opts.configs || opts.configsDir) {
420
+ throw new Error("Use either --matrix or --configs/--configs-dir, not both.");
421
+ }
422
+ const matrix = loadMatrix(opts.matrix);
423
+ agentConfigs = expandMatrix(matrix);
424
+ matrixId = randomUUID();
425
+ console.log(
426
+ `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
427
+ );
428
+ } else {
429
+ const configPaths = resolveAgentConfigPathList({
430
+ commaSeparated: opts.configs,
431
+ dir: opts.configsDir
432
+ });
433
+ agentConfigs = loadAgentConfigsFromPaths(configPaths);
434
+ if (opts.configsDir) {
435
+ console.log(`Loaded ${agentConfigs.length} agent config(s) from ${opts.configsDir}.`);
436
+ }
437
+ }
438
+ }
439
+ if (agentConfigs.length === 0) {
440
+ throw new Error("No agent configs to benchmark.");
285
441
  }
286
442
  const yamlFiles = findTestCaseYamlFiles(suiteDir);
287
443
  if (yamlFiles.length === 0) {
288
- console.error(`No test cases found in suite directory: ${opts.suite}`);
444
+ console.error(`No test cases found in suite directory: ${suiteDir}`);
289
445
  process.exit(1);
290
446
  }
291
447
  const testCases = [];
@@ -461,6 +617,11 @@ async function validateCommand(testCasePath, opts) {
461
617
  console.log(
462
618
  "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
463
619
  );
620
+ if (report.ok && !opts?.strict) {
621
+ console.log(
622
+ "Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
623
+ );
624
+ }
464
625
  }
465
626
  console.log("");
466
627
  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
@@ -506,8 +667,8 @@ async function importPrCommand(repo, prNumber, opts) {
506
667
  if (testDiff.trim()) {
507
668
  writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
508
669
  }
670
+ const fixtureDir = resolve(outDir, "fixture");
509
671
  if (opts.cloneFixture) {
510
- const fixtureDir = resolve(outDir, "fixture");
511
672
  console.log(`
512
673
  Cloning ${owner}/${repoName} into ${fixtureDir}...`);
513
674
  execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
@@ -516,8 +677,12 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
516
677
  console.log(`Checking out base commit ${pr.base.sha}...`);
517
678
  execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
518
679
  }
519
- const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
520
- const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
680
+ const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
681
+ const { success, test_command, testCommandHint } = projectTestDefaults(
682
+ projectKind,
683
+ opts.cloneFixture ?? false,
684
+ fixtureDir
685
+ );
521
686
  const yamlDoc = {
522
687
  name: slug,
523
688
  description: pr.title,
@@ -535,7 +700,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
535
700
  if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
536
701
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
537
702
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
538
- writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
703
+ writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
539
704
  console.log(`
540
705
  Imported PR #${pr.number}: "${pr.title}"`);
541
706
  console.log(`Wrote scaffold to: ${outDir}`);
@@ -551,6 +716,9 @@ Imported PR #${pr.number}: "${pr.title}"`);
551
716
  console.log(
552
717
  ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
553
718
  );
719
+ console.log(
720
+ "\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
721
+ );
554
722
  } else {
555
723
  console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
556
724
  console.log(
@@ -570,50 +738,81 @@ function detectProjectKind(fixtureDir) {
570
738
  if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
571
739
  return "unknown";
572
740
  }
573
- function projectTestDefaults(kind, cloned) {
741
+ function projectTestDefaults(kind, cloned, fixtureDir) {
574
742
  if (!cloned) {
575
743
  return {
576
744
  success: [
577
745
  { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
578
746
  ],
579
- test_command: "<TODO: shell command that runs tests with TAP output>"
747
+ test_command: "<TODO: shell command that runs tests with TAP output>",
748
+ testCommandHint: "none"
580
749
  };
581
750
  }
582
751
  switch (kind) {
583
752
  case "python":
584
753
  return {
585
754
  success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
586
- test_command: "pytest --tap-stream"
755
+ test_command: "pytest --tap-stream",
756
+ testCommandHint: "python"
587
757
  };
588
758
  case "node":
589
- return {
590
- success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
591
- test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
592
- };
759
+ return detectNodeTestRunner(fixtureDir);
593
760
  case "go":
594
761
  return {
595
762
  success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
596
- test_command: "<TODO: configure a TAP-producing test command for go>"
763
+ test_command: "<TODO: configure a TAP-producing test command for go>",
764
+ testCommandHint: "go"
597
765
  };
598
766
  default:
599
767
  return {
600
768
  success: [
601
769
  { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
602
770
  ],
603
- test_command: "<TODO: shell command that runs tests with TAP output>"
771
+ test_command: "<TODO: shell command that runs tests with TAP output>",
772
+ testCommandHint: "none"
604
773
  };
605
774
  }
606
775
  }
607
- function buildAgrYaml(doc, projectKind) {
776
/**
 * Pick test defaults for a Node project by inspecting its package.json.
 *
 * The merged `dependencies` + `devDependencies` are checked for a known test
 * runner, in priority order ava, vitest, jest; the first one present decides
 * the `test_command`. When package.json is missing or unreadable, or lists
 * none of those runners, the generic fallback (hint "node-unknown") is
 * returned.
 *
 * @param {string} fixtureDir - Directory expected to contain package.json.
 * @returns {{success: object[], test_command: string, testCommandHint: string}}
 */
function detectNodeTestRunner(fixtureDir) {
  const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
  const fallback = {
    success,
    test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
    testCommandHint: "node-unknown"
  };
  // Order matters: the first runner found in the dependency map wins.
  const knownRunners = [
    ["ava", "npx ava --tap"],
    ["vitest", "npx vitest run --reporter=tap"],
    ["jest", "npx jest --ci"]
  ];
  try {
    const manifestFile = resolve(fixtureDir, "package.json");
    if (!existsSync(manifestFile)) {
      return fallback;
    }
    const manifest = JSON.parse(readFileSync(manifestFile, "utf-8"));
    const deps = { ...manifest.dependencies, ...manifest.devDependencies };
    const detected = knownRunners.find(([runner]) => deps[runner]);
    if (!detected) {
      return fallback;
    }
    const [runner, command] = detected;
    return { success, test_command: command, testCommandHint: runner };
  } catch {
    // Detection is best-effort: any read or parse failure yields the fallback.
    return fallback;
  }
}
802
+ function buildAgrYaml(doc, testCommandHint) {
608
803
  let yaml = stringify(doc);
609
804
  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
610
805
  yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
611
806
  fail_to_pass:`);
612
- if (projectKind === "python") {
613
- yaml = yaml.replace(
614
- /^test_command: (.+)$/m,
615
- "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
616
- );
807
+ const testCommandComments = {
808
+ python: "# Requires pytest-tap for TAP output (pip install pytest-tap).",
809
+ jest: "# jest does not output TAP by default; consider jest-tap-reporter",
810
+ "node-unknown": "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"
811
+ };
812
+ const comment = testCommandComments[testCommandHint];
813
+ if (comment) {
814
+ yaml = yaml.replace(/^test_command: (.+)$/m, `${comment}
815
+ $&`);
617
816
  }
618
817
  return yaml;
619
818
  }
@@ -830,25 +1029,51 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
830
1029
  process.exit(1);
831
1030
  }
832
1031
  });
833
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
1032
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
1033
+ "--configs-dir <dir>",
1034
+ "Directory of AgentConfig YAML files (all .yaml/.yml files in the folder)"
1035
+ ).option(
1036
+ "--manifest <manifest>",
1037
+ "Path to a bench manifest YAML (suite + agent paths/glob in one file)"
1038
+ ).option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
834
1039
  "--matrix <matrix>",
835
1040
  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
836
- ).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
1041
+ ).example("agr bench --manifest bench.yaml").example("agr bench --suite tasks --configs-dir ./agents").example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
837
1042
  if (!options.configs && options.config) {
838
1043
  options.configs = options.config;
839
1044
  }
840
- if (!options.suite || !options.configs && !options.matrix) {
1045
+ const agentSourceCount = [
1046
+ options.configs,
1047
+ options.configsDir,
1048
+ options.matrix,
1049
+ options.manifest
1050
+ ].filter(Boolean).length;
1051
+ if (options.manifest) {
1052
+ if (agentSourceCount > 1) {
1053
+ console.error(
1054
+ "Error: --manifest cannot be combined with --configs, --configs-dir, or --matrix."
1055
+ );
1056
+ process.exit(1);
1057
+ }
1058
+ } else if (!options.suite || agentSourceCount === 0) {
1059
+ console.error(
1060
+ "Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
1061
+ );
1062
+ process.exit(1);
1063
+ } else if (agentSourceCount > 1) {
841
1064
  console.error(
842
- "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
1065
+ "Error: use only one agent source: --configs, --configs-dir, or --matrix."
843
1066
  );
844
1067
  process.exit(1);
845
1068
  }
846
1069
  try {
847
1070
  await runBenchCommand({
848
1071
  configs: options.configs,
1072
+ configsDir: options.configsDir,
849
1073
  suite: options.suite,
850
1074
  concurrency: Number(options.concurrency),
851
- matrix: options.matrix
1075
+ matrix: options.matrix,
1076
+ manifest: options.manifest
852
1077
  });
853
1078
  } catch (err) {
854
1079
  console.error(`Error executing benchmark: ${err.message}`);
@@ -872,7 +1097,10 @@ cli.command(
872
1097
  cli.command(
873
1098
  "import-pr <repo> <prNumber>",
874
1099
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
875
- ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
1100
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option(
1101
+ "--clone-fixture",
1102
+ "Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
1103
+ ).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
876
1104
  try {
877
1105
  await importPrCommand(repo, prNumber, options);
878
1106
  } catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.6",
3
+ "version": "1.1.0",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -19,9 +19,9 @@
19
19
  "dev": "bun run src/index.ts"
20
20
  },
21
21
  "dependencies": {
22
- "@agentgrader/agent-openrouter": "^2.0.1",
22
+ "@agentgrader/agent-openrouter": "^2.0.3",
23
23
  "@agentgrader/core": "^1.1.3",
24
- "@agentgrader/optimizer": "^0.1.0",
24
+ "@agentgrader/optimizer": "^0.1.1",
25
25
  "@agentgrader/sandbox-docker": "^2.0.2",
26
26
  "@agentgrader/scorer-static": "^0.1.0",
27
27
  "@agentgrader/store": "^1.0.3",