agentgrader 1.0.2 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/dist/index.js +390 -48
  2. package/package.json +12 -8
package/dist/index.js CHANGED
@@ -1,15 +1,20 @@
1
1
  #!/usr/bin/env node
2
+ import 'dotenv/config';
2
3
  import { cac } from 'cac';
4
+ import { randomUUID } from 'crypto';
3
5
  import { resolve, dirname, isAbsolute } from 'path';
4
6
  import { render, Box, Text } from 'ink';
5
- import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
7
+ import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
6
8
  import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
7
9
  import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
8
10
  import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
11
+ import { StaticQualityScorer } from '@agentgrader/scorer-static';
12
+ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
9
13
  import { jsx, jsxs } from 'react/jsx-runtime';
10
- import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
11
- import { stringify, parse } from 'yaml';
12
- import { randomUUID } from 'crypto';
14
+ import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
15
+ import { parse, stringify } from 'yaml';
16
+ import { ZodError } from 'zod';
17
+ import { execFileSync } from 'child_process';
13
18
 
14
19
  var Dashboard = ({ runs, testCases, configs, isFinished }) => {
15
20
  let totalCost = 0;
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
134
139
  ] })
135
140
  ] });
136
141
  };
142
+
143
+ // src/lib/format-zod-error.ts
144
+ function formatZodError(err, fileLabel) {
145
+ const lines = err.issues.map((issue) => {
146
+ const path = issue.path.join(".") || "(root)";
147
+ return ` - ${path}: ${issue.message}`;
148
+ });
149
+ return `Invalid ${fileLabel}:
150
+ ${lines.join("\n")}`;
151
+ }
152
+
153
+ // src/lib/load-agent-config.ts
137
154
  function loadAgentConfig(yamlPath) {
138
155
  const path = resolve(yamlPath);
139
156
  const fileContent = readFileSync(path, "utf-8");
140
157
  const raw = parse(fileContent);
141
158
  const dir = dirname(path);
142
- const config = AgentConfigSchema.parse(raw);
159
+ let config;
160
+ try {
161
+ config = AgentConfigSchema.parse(raw);
162
+ } catch (err) {
163
+ if (err instanceof ZodError) {
164
+ throw new Error(formatZodError(err, `agent config "${path}"`));
165
+ }
166
+ throw err;
167
+ }
143
168
  config.id = config.id || config.name;
144
169
  if (config.toolkits) {
145
170
  config.toolkits = config.toolkits.map(
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
148
173
  }
149
174
  return config;
150
175
  }
176
+ function loadMatrix(yamlPath) {
177
+ const path = resolve(yamlPath);
178
+ const fileContent = readFileSync(path, "utf-8");
179
+ const raw = parse(fileContent);
180
+ return MatrixSchema.parse(raw);
181
+ }
151
182
  function loadTestCase(yamlPath) {
152
183
  const path = resolve(yamlPath);
153
184
  const fileContent = readFileSync(path, "utf-8");
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
156
187
  if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
157
188
  raw.fixture = resolve(dir, raw.fixture);
158
189
  }
159
- const testCase = TestCaseSchema.parse(raw);
190
+ let testCase;
191
+ try {
192
+ testCase = TestCaseSchema.parse(raw);
193
+ } catch (err) {
194
+ if (err instanceof ZodError) {
195
+ throw new Error(formatZodError(err, `test case "${path}"`));
196
+ }
197
+ throw err;
198
+ }
160
199
  testCase.id = testCase.id || testCase.name;
161
200
  if (testCase.toolkits) {
162
201
  testCase.toolkits = testCase.toolkits.map(
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
228
267
  }
229
268
  async function runBenchCommand(opts) {
230
269
  const suiteDir = resolve(opts.suite);
231
- const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
232
270
  const concurrency = opts.concurrency || 2;
233
- const agentConfigs = configPaths.map((p) => loadAgentConfig(p));
271
+ let agentConfigs;
272
+ let matrixId;
273
+ if (opts.matrix) {
274
+ const matrix = loadMatrix(opts.matrix);
275
+ agentConfigs = expandMatrix(matrix);
276
+ matrixId = randomUUID();
277
+ console.log(
278
+ `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
279
+ );
280
+ } else if (opts.configs) {
281
+ const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
282
+ agentConfigs = configPaths.map((p) => loadAgentConfig(p));
283
+ } else {
284
+ throw new Error("Either --configs or --matrix must be provided.");
285
+ }
234
286
  const yamlFiles = findTestCaseYamlFiles(suiteDir);
235
287
  if (yamlFiles.length === 0) {
236
288
  console.error(`No test cases found in suite directory: ${opts.suite}`);
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
295
347
  sandboxProvider,
296
348
  db,
297
349
  concurrency,
298
- onRunUpdate
350
+ onRunUpdate,
351
+ extraScorers: [new StaticQualityScorer()],
352
+ matrixId
299
353
  });
300
354
  } catch (err) {
301
355
  console.error("Benchmark runner encountered an error:", err);
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
312
366
  )
313
367
  );
314
368
  printTagBreakdown(testCases, agentConfigs, runStates);
369
+ if (matrixId) {
370
+ await printMatrixSummary(db, matrixId, agentConfigs);
371
+ }
315
372
  process.exit(0);
316
373
  }
374
+ async function printMatrixSummary(db, matrixId, agentConfigs) {
375
+ const runs = await getRunsByMatrixId(db, matrixId);
376
+ const aggregates = aggregateResults(runs, agentConfigs);
377
+ if (aggregates.length === 0) return;
378
+ const front = paretoFront(aggregates);
379
+ const frontIds = new Set(front.map((a) => a.agentConfigId));
380
+ const includesQuality = front.some((a) => a.avgQuality?.linterViolations !== void 0);
381
+ console.log("\n================ MATRIX SUMMARY ================");
382
+ for (const agg of aggregates) {
383
+ const marker = frontIds.has(agg.agentConfigId) ? "*" : " ";
384
+ const solveRatePct = (agg.solveRate * 100).toFixed(0);
385
+ const lint = agg.avgQuality?.linterViolations !== void 0 ? ` lint:${agg.avgQuality.linterViolations.toFixed(1)}` : "";
386
+ console.log(
387
+ `${marker} ${agg.agentConfigName.padEnd(36)} solve:${solveRatePct.padStart(3)}% (${agg.passedRuns}/${agg.totalRuns}) cost:$${agg.avgCostUsd.toFixed(4)}${lint}`
388
+ );
389
+ }
390
+ console.log(
391
+ `
392
+ * = Pareto-optimal (solve rate, cost${includesQuality ? ", lint violations" : ""})`
393
+ );
394
+ console.log("=================================================\n");
395
+ }
317
396
  function printTagBreakdown(testCases, agentConfigs, runStates) {
318
397
  const tagStats = {};
319
398
  for (const tc of testCases) {
@@ -340,6 +419,55 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
340
419
  }
341
420
  console.log("=================================================\n");
342
421
  }
422
+ function isSkippedCheck(check) {
423
+ return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
424
+ }
425
+ function checkIcon(check) {
426
+ if (!check.passed) return "\u274C";
427
+ if (isSkippedCheck(check)) return "\u26A0\uFE0F";
428
+ return "\u2705";
429
+ }
430
+ async function validateCommand(testCasePath, opts) {
431
+ const testCase = loadTestCase(testCasePath);
432
+ if (opts?.strict) {
433
+ const missing = [];
434
+ if (!testCase.test_command) missing.push("test_command");
435
+ if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
436
+ if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
437
+ if (missing.length > 0) {
438
+ console.error(
439
+ `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
440
+ );
441
+ process.exit(1);
442
+ }
443
+ }
444
+ console.log(`Validating "${testCase.name}" (${testCasePath})...
445
+ `);
446
+ const sandboxProvider = new DockerSandboxProvider();
447
+ const report = await validateTestCase({ testCase, sandboxProvider });
448
+ const hadExecutionSkip = report.checks.some(
449
+ (c) => c.name.includes("execution-checks (skipped")
450
+ );
451
+ for (const check of report.checks) {
452
+ const icon = checkIcon(check);
453
+ console.log(`${icon} ${check.name}`);
454
+ if (check.detail && check.detail !== "ok") {
455
+ const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
456
+ console.log(indented);
457
+ }
458
+ }
459
+ if (hadExecutionSkip) {
460
+ console.log("");
461
+ console.log(
462
+ "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
463
+ );
464
+ }
465
+ console.log("");
466
+ console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
467
+ process.exit(report.ok ? 0 : 1);
468
+ }
469
+
470
+ // src/commands/import-pr.ts
343
471
  var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
344
472
  async function importPrCommand(repo, prNumber, opts) {
345
473
  const [owner, repoName] = repo.split("/");
@@ -378,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
378
506
  if (testDiff.trim()) {
379
507
  writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
380
508
  }
509
+ if (opts.cloneFixture) {
510
+ const fixtureDir = resolve(outDir, "fixture");
511
+ console.log(`
512
+ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
513
+ execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
514
+ stdio: "inherit"
515
+ });
516
+ console.log(`Checking out base commit ${pr.base.sha}...`);
517
+ execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
518
+ }
519
+ const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
520
+ const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
381
521
  const yamlDoc = {
382
522
  name: slug,
383
523
  description: pr.title,
384
524
  fixture: "./fixture",
385
525
  prompt: buildPrompt(pr),
386
- success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
526
+ success,
387
527
  timeout_seconds: 600,
388
528
  tags: ["imported", repoName],
389
529
  created_at: pr.created_at,
390
- // TODO: fill these in after setting up ./fixture (checked out at
391
- // base.sha below) and running the test suite to discover real test names.
392
- test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
393
- fail_to_pass: ["<TODO: fill in via `agr validate`>"],
394
- pass_to_pass: ["<TODO: fill in via `agr validate`>"]
530
+ test_command,
531
+ fail_to_pass: [],
532
+ pass_to_pass: []
395
533
  };
396
534
  if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
397
535
  if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
398
536
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
399
537
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
400
- writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
538
+ writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
401
539
  console.log(`
402
540
  Imported PR #${pr.number}: "${pr.title}"`);
403
541
  console.log(`Wrote scaffold to: ${outDir}`);
@@ -407,11 +545,77 @@ Imported PR #${pr.number}: "${pr.title}"`);
407
545
  if (testDiff.trim())
408
546
  console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
409
547
  console.log("\nNext steps:");
410
- console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
411
- console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
412
- console.log(
413
- ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
414
- );
548
+ if (!opts.cloneFixture) {
549
+ console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
550
+ console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
551
+ console.log(
552
+ ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
553
+ );
554
+ } else {
555
+ console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
556
+ console.log(
557
+ ` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
558
+ );
559
+ }
560
+ if (opts.validate) {
561
+ console.log("\nRunning validation...\n");
562
+ await validateCommand(resolve(outDir, "agr.yaml"));
563
+ }
564
+ }
565
+ function detectProjectKind(fixtureDir) {
566
+ if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
567
+ return "python";
568
+ }
569
+ if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
570
+ if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
571
+ return "unknown";
572
+ }
573
+ function projectTestDefaults(kind, cloned) {
574
+ if (!cloned) {
575
+ return {
576
+ success: [
577
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
578
+ ],
579
+ test_command: "<TODO: shell command that runs tests with TAP output>"
580
+ };
581
+ }
582
+ switch (kind) {
583
+ case "python":
584
+ return {
585
+ success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
586
+ test_command: "pytest --tap-stream"
587
+ };
588
+ case "node":
589
+ return {
590
+ success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
591
+ test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
592
+ };
593
+ case "go":
594
+ return {
595
+ success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
596
+ test_command: "<TODO: configure a TAP-producing test command for go>"
597
+ };
598
+ default:
599
+ return {
600
+ success: [
601
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
602
+ ],
603
+ test_command: "<TODO: shell command that runs tests with TAP output>"
604
+ };
605
+ }
606
+ }
607
+ function buildAgrYaml(doc, projectKind) {
608
+ let yaml = stringify(doc);
609
+ const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
610
+ yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
611
+ fail_to_pass:`);
612
+ if (projectKind === "python") {
613
+ yaml = yaml.replace(
614
+ /^test_command: (.+)$/m,
615
+ "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
616
+ );
617
+ }
618
+ return yaml;
415
619
  }
416
620
  function buildPrompt(pr) {
417
621
  const body = (pr.body || "").trim();
@@ -443,6 +647,35 @@ function splitDiff(diff) {
443
647
  forbidModified
444
648
  };
445
649
  }
650
+ var VERBOSE_CONTENT_MAX = 200;
651
+ function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
652
+ if (value.length <= max) return value;
653
+ return `${value.slice(0, max)}...`;
654
+ }
655
+ function formatVerboseStep(step) {
656
+ const prefix = `[step ${step.index}] ${step.kind}`;
657
+ if (step.kind === "tool_call" && step.tool) {
658
+ const args = step.content ? truncateForVerbose(step.content) : "";
659
+ return `${prefix}: ${step.tool}(${args})`;
660
+ }
661
+ if (step.kind === "tool_result" && step.tool) {
662
+ const result = step.content ? truncateForVerbose(step.content) : "";
663
+ return `${prefix}: ${step.tool} -> ${result}`;
664
+ }
665
+ if (step.kind === "message" && step.content) {
666
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
667
+ }
668
+ if (step.content) {
669
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
670
+ }
671
+ return prefix;
672
+ }
673
+ function formatMetricDetail(label, detail) {
674
+ if (/^No .+ configured; skipping/.test(detail)) {
675
+ return `\u26A0\uFE0F ${label}: ${detail}`;
676
+ }
677
+ return `${label}: ${detail}`;
678
+ }
446
679
  async function runSingleCommand(testCasePath, opts) {
447
680
  const testCase = loadTestCase(testCasePath);
448
681
  let agentConfig = {
@@ -459,6 +692,14 @@ async function runSingleCommand(testCasePath, opts) {
459
692
  const adapter = new AiSdkAgentAdapter();
460
693
  const db = initDb();
461
694
  await saveTestCase(db, testCaseToDbRow(testCase));
695
+ await saveAgentConfig(db, {
696
+ id: agentConfig.id || agentConfig.name,
697
+ name: agentConfig.name,
698
+ model: agentConfig.model,
699
+ maxSteps: agentConfig.max_steps,
700
+ temperature: agentConfig.temperature,
701
+ createdAt: Math.floor(Date.now() / 1e3)
702
+ });
462
703
  const runId = randomUUID();
463
704
  try {
464
705
  const result = await runSingle({
@@ -467,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
467
708
  adapter,
468
709
  sandboxProvider,
469
710
  db,
470
- runId
711
+ runId,
712
+ onStep: opts.verbose ? (step) => {
713
+ console.log(formatVerboseStep(step));
714
+ } : void 0
471
715
  });
472
716
  console.log("\n================ RUN SUMMARY ================");
473
717
  console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -478,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
478
722
  console.log(`Error: ${result.error}`);
479
723
  }
480
724
  if (result.metrics?.regression) {
481
- console.log(`Regression: ${result.metrics.regression.detail}`);
725
+ console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
482
726
  }
483
727
  if (result.metrics?.diff) {
484
728
  console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
485
729
  }
486
730
  if (result.metrics?.localization) {
487
- console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
731
+ console.log(
732
+ formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
733
+ );
488
734
  }
489
735
  console.log("=============================================\n");
490
736
  } catch (err) {
@@ -493,28 +739,90 @@ async function runSingleCommand(testCasePath, opts) {
493
739
  }
494
740
  process.exit(0);
495
741
  }
496
- async function validateCommand(testCasePath) {
497
- const testCase = loadTestCase(testCasePath);
498
- console.log(`Validating "${testCase.name}" (${testCasePath})...
499
- `);
500
- const sandboxProvider = new DockerSandboxProvider();
501
- const report = await validateTestCase({ testCase, sandboxProvider });
502
- for (const check of report.checks) {
503
- const icon = check.passed ? "\u2705" : "\u274C";
504
- console.log(`${icon} ${check.name}`);
505
- if (check.detail && check.detail !== "ok") {
506
- const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
507
- console.log(indented);
742
+ async function traceCommand(runId, opts) {
743
+ const db = initDb();
744
+ const run = await getRun(db, runId);
745
+ if (!run) {
746
+ console.error(`Run not found: ${runId}`);
747
+ process.exit(1);
748
+ }
749
+ console.log(`Run ${run.id}`);
750
+ console.log(` test case: ${run.testCaseId}`);
751
+ console.log(` agent config: ${run.agentConfigId}`);
752
+ console.log(
753
+ ` status: ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
754
+ );
755
+ console.log(` cost: $${run.costUsd.toFixed(4)}`);
756
+ console.log(` duration: ${run.durationMs}ms`);
757
+ if (run.error) console.log(` error: ${run.error}`);
758
+ if (opts.quality) {
759
+ printQualityBreakdown(run.metrics);
760
+ return;
761
+ }
762
+ const steps = await getTraces(db, runId);
763
+ console.log(`
764
+ ${steps.length} step(s):`);
765
+ for (const step of steps) {
766
+ const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
767
+ console.log(
768
+ ` [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
769
+ );
770
+ if (step.content) {
771
+ const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
772
+ console.log(` ${preview.replace(/\n/g, "\n ")}`);
508
773
  }
509
774
  }
510
- console.log("");
511
- console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
512
- process.exit(report.ok ? 0 : 1);
775
+ }
776
+ function printQualityBreakdown(metricsJson) {
777
+ const metrics = metricsJson ? safeParseJson(metricsJson) : void 0;
778
+ console.log("\n================ QUALITY BREAKDOWN ================");
779
+ const staticQuality = metrics?.["static-quality"]?.quality;
780
+ const llmJudge = metrics?.["llm-judge"]?.quality;
781
+ const diff = metrics?.diff;
782
+ const localization = metrics?.localization;
783
+ if (staticQuality) {
784
+ console.log("Static quality (static-quality):");
785
+ if (staticQuality.diffLines !== void 0) console.log(` diff lines: ${staticQuality.diffLines}`);
786
+ if (staticQuality.filesModified !== void 0)
787
+ console.log(` files modified: ${staticQuality.filesModified}`);
788
+ if (staticQuality.todosIntroduced !== void 0)
789
+ console.log(` TODOs introduced: ${staticQuality.todosIntroduced}`);
790
+ if (staticQuality.linterViolations !== void 0)
791
+ console.log(` lint violations: ${staticQuality.linterViolations}`);
792
+ }
793
+ if (llmJudge) {
794
+ if (staticQuality) console.log("");
795
+ console.log("LLM judge (llm-judge):");
796
+ if (llmJudge.llmJudgeScore !== void 0)
797
+ console.log(` score: ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
798
+ if (llmJudge.llmJudgeDetail) console.log(` rationale: ${llmJudge.llmJudgeDetail}`);
799
+ }
800
+ if (diff) {
801
+ if (staticQuality || llmJudge) console.log("");
802
+ console.log(`Diff scope: ${diff.detail ?? JSON.stringify(diff)}`);
803
+ }
804
+ if (localization) {
805
+ console.log(`Localization: ${localization.detail ?? JSON.stringify(localization)}`);
806
+ }
807
+ if (!staticQuality && !llmJudge && !diff && !localization) {
808
+ console.log(" (no quality metrics recorded for this run)");
809
+ }
810
+ console.log("=====================================================\n");
811
+ }
812
+ function safeParseJson(value) {
813
+ try {
814
+ return JSON.parse(value);
815
+ } catch {
816
+ return void 0;
817
+ }
513
818
  }
514
819
 
515
820
  // src/index.ts
516
821
  var cli = cac("agr");
517
- cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
822
+ cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
823
+ "--verbose",
824
+ "Stream agent steps live to the console as they happen"
825
+ ).action(async (testCase, options) => {
518
826
  try {
519
827
  await runSingleCommand(testCase, options);
520
828
  } catch (err) {
@@ -522,16 +830,25 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
522
830
  process.exit(1);
523
831
  }
524
832
  });
525
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).action(async (options) => {
526
- if (!options.configs || !options.suite) {
527
- console.error("Error: --configs and --suite are required for benchmarking.");
833
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
834
+ "--matrix <matrix>",
835
+ "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
836
+ ).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
837
+ if (!options.configs && options.config) {
838
+ options.configs = options.config;
839
+ }
840
+ if (!options.suite || !options.configs && !options.matrix) {
841
+ console.error(
842
+ "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
843
+ );
528
844
  process.exit(1);
529
845
  }
530
846
  try {
531
847
  await runBenchCommand({
532
848
  configs: options.configs,
533
849
  suite: options.suite,
534
- concurrency: Number(options.concurrency)
850
+ concurrency: Number(options.concurrency),
851
+ matrix: options.matrix
535
852
  });
536
853
  } catch (err) {
537
854
  console.error(`Error executing benchmark: ${err.message}`);
@@ -541,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
541
858
  cli.command(
542
859
  "validate <testCase>",
543
860
  "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
544
- ).action(async (testCase) => {
861
+ ).option(
862
+ "--strict",
863
+ "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
864
+ ).action(async (testCase, options) => {
545
865
  try {
546
- await validateCommand(testCase);
866
+ await validateCommand(testCase, options);
547
867
  } catch (err) {
548
868
  console.error(`Error executing validate: ${err.message}`);
549
869
  process.exit(1);
@@ -552,7 +872,7 @@ cli.command(
552
872
  cli.command(
553
873
  "import-pr <repo> <prNumber>",
554
874
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
555
- ).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
875
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
556
876
  try {
557
877
  await importPrCommand(repo, prNumber, options);
558
878
  } catch (err) {
@@ -560,5 +880,27 @@ cli.command(
560
880
  process.exit(1);
561
881
  }
562
882
  });
883
+ cli.command("trace <runId>", "Show the step trace and metrics for a single run").option(
884
+ "--quality",
885
+ "Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
886
+ ).action(async (runId, options) => {
887
+ try {
888
+ await traceCommand(runId, options);
889
+ } catch (err) {
890
+ console.error(`Error executing trace: ${err.message}`);
891
+ process.exit(1);
892
+ }
893
+ });
563
894
  cli.help();
564
- cli.parse();
895
+ try {
896
+ cli.parse();
897
+ } catch (err) {
898
+ if (err.name === "CACError") {
899
+ console.error(`
900
+ \u274C ${err.message}
901
+ `);
902
+ cli.outputHelp();
903
+ process.exit(1);
904
+ }
905
+ throw err;
906
+ }
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.2",
3
+ "version": "1.0.6",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
7
7
  "bin": {
8
- "agr": "./dist/index.js",
9
- "agentgrader": "./dist/index.js"
8
+ "agr": "dist/index.js",
9
+ "agentgrader": "dist/index.js"
10
10
  },
11
11
  "main": "./dist/index.js",
12
12
  "types": "./dist/index.d.ts",
@@ -19,14 +19,18 @@
19
19
  "dev": "bun run src/index.ts"
20
20
  },
21
21
  "dependencies": {
22
- "@agentgrader/agent-openrouter": "^2.0.0",
23
- "@agentgrader/core": "^1.1.0",
24
- "@agentgrader/sandbox-docker": "^2.0.0",
25
- "@agentgrader/store": "^1.0.2",
22
+ "@agentgrader/agent-openrouter": "^2.0.1",
23
+ "@agentgrader/core": "^1.1.3",
24
+ "@agentgrader/optimizer": "^0.1.0",
25
+ "@agentgrader/sandbox-docker": "^2.0.2",
26
+ "@agentgrader/scorer-static": "^0.1.0",
27
+ "@agentgrader/store": "^1.0.3",
26
28
  "cac": "^6.7.14",
29
+ "dotenv": "^17.4.2",
27
30
  "ink": "^4.4.1",
28
31
  "react": "^18.2.0",
29
- "yaml": "^2.5.1"
32
+ "yaml": "^2.5.1",
33
+ "zod": "^3.23.8"
30
34
  },
31
35
  "devDependencies": {
32
36
  "@types/react": "^18.2.0",