agentgrader 1.0.2 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +246 -33
  2. package/package.json +9 -5
package/dist/index.js CHANGED
@@ -1,15 +1,20 @@
1
1
  #!/usr/bin/env node
2
+ import 'dotenv/config';
2
3
  import { cac } from 'cac';
4
+ import { randomUUID } from 'crypto';
3
5
  import { resolve, dirname, isAbsolute } from 'path';
4
6
  import { render, Box, Text } from 'ink';
5
- import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
7
+ import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
6
8
  import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
7
9
  import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
8
10
  import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
11
+ import { StaticQualityScorer } from '@agentgrader/scorer-static';
12
+ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
9
13
  import { jsx, jsxs } from 'react/jsx-runtime';
10
14
  import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
11
15
  import { stringify, parse } from 'yaml';
12
- import { randomUUID } from 'crypto';
16
+ import { ZodError } from 'zod';
17
+ import { execFileSync } from 'child_process';
13
18
 
14
19
  var Dashboard = ({ runs, testCases, configs, isFinished }) => {
15
20
  let totalCost = 0;
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
134
139
  ] })
135
140
  ] });
136
141
  };
142
+
143
+ // src/lib/format-zod-error.ts
144
/**
 * Render a Zod validation error as a readable multi-line message.
 *
 * @param {{ issues: Array<{ path: Array<string|number>, message: string }> }} err
 *   Error object whose `issues` are listed one per line.
 * @param {string} fileLabel - Description of what failed validation; it is
 *   interpolated directly after the word "Invalid".
 * @returns {string} A header line followed by one bullet line per issue.
 */
function formatZodError(err, fileLabel) {
  const bullets = [];
  for (const issue of err.issues) {
    // An empty path means the issue is attached to the document itself.
    const location = issue.path.join(".") || "(root)";
    bullets.push(` - ${location}: ${issue.message}`);
  }
  const header = `Invalid ${fileLabel}:`;
  return `${header}\n${bullets.join("\n")}`;
}
152
+
153
+ // src/lib/load-agent-config.ts
137
154
  function loadAgentConfig(yamlPath) {
138
155
  const path = resolve(yamlPath);
139
156
  const fileContent = readFileSync(path, "utf-8");
140
157
  const raw = parse(fileContent);
141
158
  const dir = dirname(path);
142
- const config = AgentConfigSchema.parse(raw);
159
+ let config;
160
+ try {
161
+ config = AgentConfigSchema.parse(raw);
162
+ } catch (err) {
163
+ if (err instanceof ZodError) {
164
+ throw new Error(formatZodError(err, `agent config "${path}"`));
165
+ }
166
+ throw err;
167
+ }
143
168
  config.id = config.id || config.name;
144
169
  if (config.toolkits) {
145
170
  config.toolkits = config.toolkits.map(
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
148
173
  }
149
174
  return config;
150
175
  }
176
/**
 * Load an optimizer matrix definition from a YAML file and validate it
 * against `MatrixSchema`.
 *
 * Schema violations are re-thrown as a plain `Error` whose message is built by
 * `formatZodError`, the same handling this file applies to agent configs and
 * test cases, so the CLI prints a readable issue list rather than a raw
 * ZodError dump. Any other failure (missing file, YAML syntax error, ...) is
 * propagated unchanged.
 *
 * @param {string} yamlPath - Path to the matrix YAML file; resolved against
 *   the current working directory.
 * @returns {*} The value returned by `MatrixSchema.parse`.
 * @throws {Error} When the file cannot be read or parsed, or fails validation.
 */
function loadMatrix(yamlPath) {
  const path = resolve(yamlPath);
  const fileContent = readFileSync(path, "utf-8");
  const raw = parse(fileContent);
  try {
    return MatrixSchema.parse(raw);
  } catch (err) {
    if (err instanceof ZodError) {
      // Name the offending file and list each issue on its own line.
      throw new Error(formatZodError(err, `matrix "${path}"`));
    }
    throw err;
  }
}
151
182
  function loadTestCase(yamlPath) {
152
183
  const path = resolve(yamlPath);
153
184
  const fileContent = readFileSync(path, "utf-8");
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
156
187
  if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
157
188
  raw.fixture = resolve(dir, raw.fixture);
158
189
  }
159
- const testCase = TestCaseSchema.parse(raw);
190
+ let testCase;
191
+ try {
192
+ testCase = TestCaseSchema.parse(raw);
193
+ } catch (err) {
194
+ if (err instanceof ZodError) {
195
+ throw new Error(formatZodError(err, `test case "${path}"`));
196
+ }
197
+ throw err;
198
+ }
160
199
  testCase.id = testCase.id || testCase.name;
161
200
  if (testCase.toolkits) {
162
201
  testCase.toolkits = testCase.toolkits.map(
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
228
267
  }
229
268
  async function runBenchCommand(opts) {
230
269
  const suiteDir = resolve(opts.suite);
231
- const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
232
270
  const concurrency = opts.concurrency || 2;
233
- const agentConfigs = configPaths.map((p) => loadAgentConfig(p));
271
+ let agentConfigs;
272
+ let matrixId;
273
+ if (opts.matrix) {
274
+ const matrix = loadMatrix(opts.matrix);
275
+ agentConfigs = expandMatrix(matrix);
276
+ matrixId = randomUUID();
277
+ console.log(
278
+ `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
279
+ );
280
+ } else if (opts.configs) {
281
+ const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
282
+ agentConfigs = configPaths.map((p) => loadAgentConfig(p));
283
+ } else {
284
+ throw new Error("Either --configs or --matrix must be provided.");
285
+ }
234
286
  const yamlFiles = findTestCaseYamlFiles(suiteDir);
235
287
  if (yamlFiles.length === 0) {
236
288
  console.error(`No test cases found in suite directory: ${opts.suite}`);
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
295
347
  sandboxProvider,
296
348
  db,
297
349
  concurrency,
298
- onRunUpdate
350
+ onRunUpdate,
351
+ extraScorers: [new StaticQualityScorer()],
352
+ matrixId
299
353
  });
300
354
  } catch (err) {
301
355
  console.error("Benchmark runner encountered an error:", err);
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
312
366
  )
313
367
  );
314
368
  printTagBreakdown(testCases, agentConfigs, runStates);
369
+ if (matrixId) {
370
+ await printMatrixSummary(db, matrixId, agentConfigs);
371
+ }
315
372
  process.exit(0);
316
373
  }
374
/**
 * Print a per-agent-config summary table for one matrix benchmark and mark
 * the configurations that `paretoFront` selects.
 *
 * Prints nothing when `aggregateResults` yields no rows.
 *
 * @param {*} db - Store handle passed through to `getRunsByMatrixId`.
 * @param {string} matrixId - Identifier shared by every run of the matrix.
 * @param {Array} agentConfigs - Agent configs handed to `aggregateResults`.
 * @returns {Promise<void>}
 */
async function printMatrixSummary(db, matrixId, agentConfigs) {
  const matrixRuns = await getRunsByMatrixId(db, matrixId);
  const rows = aggregateResults(matrixRuns, agentConfigs);
  if (rows.length === 0) {
    return;
  }

  const optimal = paretoFront(rows);
  const optimalIds = new Set(optimal.map((row) => row.agentConfigId));
  const hasLint = (row) => row.avgQuality?.linterViolations !== void 0;
  // The legend mentions lint only if some Pareto-optimal row carries it.
  const includesQuality = optimal.some((row) => hasLint(row));

  console.log("\n================ MATRIX SUMMARY ================");
  for (const row of rows) {
    const marker = optimalIds.has(row.agentConfigId) ? "*" : " ";
    const solveRatePct = (row.solveRate * 100).toFixed(0);
    let lintSuffix = "";
    if (hasLint(row)) {
      lintSuffix = ` lint:${row.avgQuality.linterViolations.toFixed(1)}`;
    }
    const line = `${marker} ${row.agentConfigName.padEnd(36)} solve:${solveRatePct.padStart(3)}% (${row.passedRuns}/${row.totalRuns}) cost:$${row.avgCostUsd.toFixed(4)}${lintSuffix}`;
    console.log(line);
  }

  const lintLegend = includesQuality ? ", lint violations" : "";
  console.log(`\n* = Pareto-optimal (solve rate, cost${lintLegend})`);
  console.log("=================================================\n");
}
317
396
  function printTagBreakdown(testCases, agentConfigs, runStates) {
318
397
  const tagStats = {};
319
398
  for (const tc of testCases) {
@@ -340,6 +419,26 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
340
419
  }
341
420
  console.log("=================================================\n");
342
421
  }
422
/**
 * Validate a single test case inside a Docker sandbox, print every check with
 * a pass/fail icon, and terminate the process.
 *
 * The process exits with code 0 when the report is ok and 1 otherwise, so
 * this function does not return to its caller in normal operation.
 *
 * @param {string} testCasePath - Path to the test case YAML file.
 * @returns {Promise<void>}
 */
async function validateCommand(testCasePath) {
  const testCase = loadTestCase(testCasePath);
  console.log(`Validating "${testCase.name}" (${testCasePath})...\n`);

  const sandboxProvider = new DockerSandboxProvider();
  const report = await validateTestCase({ testCase, sandboxProvider });

  // Prefix every line of a multi-line detail so it nests under its check.
  const indentLines = (text) =>
    text
      .split("\n")
      .map((line) => ` ${line}`)
      .join("\n");

  for (const check of report.checks) {
    const icon = check.passed ? "\u2705" : "\u274C";
    console.log(`${icon} ${check.name}`);
    // A missing detail or a bare "ok" adds no information; skip it.
    if (!check.detail || check.detail === "ok") {
      continue;
    }
    console.log(indentLines(check.detail));
  }

  console.log("");
  if (report.ok) {
    console.log("\u2705 Validation passed.");
  } else {
    console.log("\u274C Validation failed.");
  }
  process.exit(report.ok ? 0 : 1);
}
440
+
441
+ // src/commands/import-pr.ts
343
442
  var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
344
443
  async function importPrCommand(repo, prNumber, opts) {
345
444
  const [owner, repoName] = repo.split("/");
@@ -406,12 +505,33 @@ Imported PR #${pr.number}: "${pr.title}"`);
406
505
  console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
407
506
  if (testDiff.trim())
408
507
  console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
508
+ if (opts.cloneFixture) {
509
+ const fixtureDir = resolve(outDir, "fixture");
510
+ console.log(`
511
+ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
512
+ execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
513
+ stdio: "inherit"
514
+ });
515
+ console.log(`Checking out base commit ${pr.base.sha}...`);
516
+ execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
517
+ }
409
518
  console.log("\nNext steps:");
410
- console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
411
- console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
412
- console.log(
413
- ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
414
- );
519
+ if (!opts.cloneFixture) {
520
+ console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
521
+ console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
522
+ console.log(
523
+ ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
524
+ );
525
+ } else {
526
+ console.log(" 1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
527
+ console.log(
528
+ ` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
529
+ );
530
+ }
531
+ if (opts.validate) {
532
+ console.log("\nRunning validation...\n");
533
+ await validateCommand(resolve(outDir, "agr.yaml"));
534
+ }
415
535
  }
416
536
  function buildPrompt(pr) {
417
537
  const body = (pr.body || "").trim();
@@ -459,6 +579,14 @@ async function runSingleCommand(testCasePath, opts) {
459
579
  const adapter = new AiSdkAgentAdapter();
460
580
  const db = initDb();
461
581
  await saveTestCase(db, testCaseToDbRow(testCase));
582
+ await saveAgentConfig(db, {
583
+ id: agentConfig.id || agentConfig.name,
584
+ name: agentConfig.name,
585
+ model: agentConfig.model,
586
+ maxSteps: agentConfig.max_steps,
587
+ temperature: agentConfig.temperature,
588
+ createdAt: Math.floor(Date.now() / 1e3)
589
+ });
462
590
  const runId = randomUUID();
463
591
  try {
464
592
  const result = await runSingle({
@@ -493,23 +621,82 @@ async function runSingleCommand(testCasePath, opts) {
493
621
  }
494
622
  process.exit(0);
495
623
  }
496
- async function validateCommand(testCasePath) {
497
- const testCase = loadTestCase(testCasePath);
498
- console.log(`Validating "${testCase.name}" (${testCasePath})...
499
- `);
500
- const sandboxProvider = new DockerSandboxProvider();
501
- const report = await validateTestCase({ testCase, sandboxProvider });
502
- for (const check of report.checks) {
503
- const icon = check.passed ? "\u2705" : "\u274C";
504
- console.log(`${icon} ${check.name}`);
505
- if (check.detail && check.detail !== "ok") {
506
- const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
507
- console.log(indented);
624
+ async function traceCommand(runId, opts) {
625
+ const db = initDb();
626
+ const run = await getRun(db, runId);
627
+ if (!run) {
628
+ console.error(`Run not found: ${runId}`);
629
+ process.exit(1);
630
+ }
631
+ console.log(`Run ${run.id}`);
632
+ console.log(` test case: ${run.testCaseId}`);
633
+ console.log(` agent config: ${run.agentConfigId}`);
634
+ console.log(
635
+ ` status: ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
636
+ );
637
+ console.log(` cost: $${run.costUsd.toFixed(4)}`);
638
+ console.log(` duration: ${run.durationMs}ms`);
639
+ if (run.error) console.log(` error: ${run.error}`);
640
+ if (opts.quality) {
641
+ printQualityBreakdown(run.metrics);
642
+ return;
643
+ }
644
+ const steps = await getTraces(db, runId);
645
+ console.log(`
646
+ ${steps.length} step(s):`);
647
+ for (const step of steps) {
648
+ const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
649
+ console.log(
650
+ ` [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
651
+ );
652
+ if (step.content) {
653
+ const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
654
+ console.log(` ${preview.replace(/\n/g, "\n ")}`);
508
655
  }
509
656
  }
510
- console.log("");
511
- console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
512
- process.exit(report.ok ? 0 : 1);
657
+ }
658
/**
 * Print the quality-related metrics stored with a run.
 *
 * Reads four sections from the parsed metrics: `static-quality`, `llm-judge`,
 * `diff` and `localization`. Sections that are absent are skipped; if none is
 * present a placeholder line is printed instead.
 *
 * @param {string|undefined|null} metricsJson - JSON text of the run's metrics.
 *   Falsy or unparseable input is treated as "no metrics".
 * @returns {void}
 */
function printQualityBreakdown(metricsJson) {
  const metrics = metricsJson ? safeParseJson(metricsJson) : void 0;
  console.log("\n================ QUALITY BREAKDOWN ================");

  const staticQuality = metrics?.["static-quality"]?.quality;
  const llmJudge = metrics?.["llm-judge"]?.quality;
  const diff = metrics?.diff;
  const localization = metrics?.localization;

  if (staticQuality) {
    console.log("Static quality (static-quality):");
    // Label/key pairs, in the order they are printed.
    const staticFields = [
      [" diff lines: ", "diffLines"],
      [" files modified: ", "filesModified"],
      [" TODOs introduced: ", "todosIntroduced"],
      [" lint violations: ", "linterViolations"],
    ];
    for (const [label, key] of staticFields) {
      const value = staticQuality[key];
      if (value !== void 0) {
        console.log(`${label}${value}`);
      }
    }
  }

  if (llmJudge) {
    // Blank separator only when a previous section was printed.
    if (staticQuality) {
      console.log("");
    }
    console.log("LLM judge (llm-judge):");
    if (llmJudge.llmJudgeScore !== void 0) {
      console.log(` score: ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
    }
    if (llmJudge.llmJudgeDetail) {
      console.log(` rationale: ${llmJudge.llmJudgeDetail}`);
    }
  }

  if (diff) {
    if (staticQuality || llmJudge) {
      console.log("");
    }
    console.log(`Diff scope: ${diff.detail ?? JSON.stringify(diff)}`);
  }

  if (localization) {
    console.log(`Localization: ${localization.detail ?? JSON.stringify(localization)}`);
  }

  const anySection = [staticQuality, llmJudge, diff, localization].some(Boolean);
  if (!anySection) {
    console.log(" (no quality metrics recorded for this run)");
  }
  console.log("=====================================================\n");
}
694
/**
 * Parse JSON text without throwing.
 *
 * @param {string} value - Text to parse.
 * @returns {*} The parsed value, or `undefined` when `JSON.parse` throws.
 */
function safeParseJson(value) {
  let parsed;
  try {
    parsed = JSON.parse(value);
  } catch {
    // Malformed input is reported as "no value" rather than an error.
    parsed = void 0;
  }
  return parsed;
}
514
701
 
515
702
  // src/index.ts
@@ -522,16 +709,20 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
522
709
  process.exit(1);
523
710
  }
524
711
  });
525
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).action(async (options) => {
526
- if (!options.configs || !options.suite) {
527
- console.error("Error: --configs and --suite are required for benchmarking.");
712
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
713
+ "--matrix <matrix>",
714
+ "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
715
+ ).action(async (options) => {
716
+ if (!options.suite || !options.configs && !options.matrix) {
717
+ console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
528
718
  process.exit(1);
529
719
  }
530
720
  try {
531
721
  await runBenchCommand({
532
722
  configs: options.configs,
533
723
  suite: options.suite,
534
- concurrency: Number(options.concurrency)
724
+ concurrency: Number(options.concurrency),
725
+ matrix: options.matrix
535
726
  });
536
727
  } catch (err) {
537
728
  console.error(`Error executing benchmark: ${err.message}`);
@@ -552,7 +743,7 @@ cli.command(
552
743
  cli.command(
553
744
  "import-pr <repo> <prNumber>",
554
745
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
555
- ).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
746
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
556
747
  try {
557
748
  await importPrCommand(repo, prNumber, options);
558
749
  } catch (err) {
@@ -560,5 +751,27 @@ cli.command(
560
751
  process.exit(1);
561
752
  }
562
753
  });
754
// `trace <runId>`: print the stored summary for one run, followed either by
// its step trace or, with --quality, only the quality-metrics breakdown.
const traceCli = cli.command("trace <runId>", "Show the step trace and metrics for a single run");
traceCli.option(
  "--quality",
  "Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
);
traceCli.action(async (runId, options) => {
  try {
    await traceCommand(runId, options);
  } catch (err) {
    // Report the failure on stderr and signal it through the exit code.
    console.error(`Error executing trace: ${err.message}`);
    process.exit(1);
  }
});
563
765
  cli.help();
564
- cli.parse();
766
// Parse argv. Errors named "CACError" are shown with the help text and a
// non-zero exit code; anything else is re-thrown untouched.
try {
  cli.parse();
} catch (err) {
  if (err.name !== "CACError") {
    throw err;
  }
  console.error(`\n\u274C ${err.message}\n`);
  cli.outputHelp();
  process.exit(1);
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.2",
3
+ "version": "1.0.5",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -19,14 +19,18 @@
19
19
  "dev": "bun run src/index.ts"
20
20
  },
21
21
  "dependencies": {
22
- "@agentgrader/agent-openrouter": "^2.0.0",
23
- "@agentgrader/core": "^1.1.0",
24
- "@agentgrader/sandbox-docker": "^2.0.0",
22
+ "@agentgrader/agent-openrouter": "^2.0.1",
23
+ "@agentgrader/core": "^1.1.1",
24
+ "@agentgrader/optimizer": "^0.1.0",
25
+ "@agentgrader/sandbox-docker": "^2.0.2",
26
+ "@agentgrader/scorer-static": "^0.1.0",
25
27
  "@agentgrader/store": "^1.0.2",
26
28
  "cac": "^6.7.14",
29
+ "dotenv": "^17.4.2",
27
30
  "ink": "^4.4.1",
28
31
  "react": "^18.2.0",
29
- "yaml": "^2.5.1"
32
+ "yaml": "^2.5.1",
33
+ "zod": "^3.23.8"
30
34
  },
31
35
  "devDependencies": {
32
36
  "@types/react": "^18.2.0",