npm - agentgrader - Versions diffs - 1.0.2 → 1.0.6 - Mend

agentgrader 1.0.2 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/index.js +390 -48
package/package.json +12 -8

package/dist/index.js CHANGED Viewed

@@ -1,15 +1,20 @@
 #!/usr/bin/env node
+import 'dotenv/config';
 import { cac } from 'cac';
+import { randomUUID } from 'crypto';
 import { resolve, dirname, isAbsolute } from 'path';
 import { render, Box, Text } from 'ink';
-import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
+import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
 import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
 import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
 import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
+import { StaticQualityScorer } from '@agentgrader/scorer-static';
+import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
 import { jsx, jsxs } from 'react/jsx-runtime';
-import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
-import { stringify, parse } from 'yaml';
-import { randomUUID } from 'crypto';
+import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
+import { parse, stringify } from 'yaml';
+import { ZodError } from 'zod';
+import { execFileSync } from 'child_process';
 var Dashboard = ({ runs, testCases, configs, isFinished }) => {
   let totalCost = 0;
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
     ] })
   ] });
 };
+// src/lib/format-zod-error.ts
+function formatZodError(err, fileLabel) {
+  const lines = err.issues.map((issue) => {
+    const path = issue.path.join(".") || "(root)";
+    return `  - ${path}: ${issue.message}`;
+  });
+  return `Invalid ${fileLabel}:
+${lines.join("\n")}`;
+}
+// src/lib/load-agent-config.ts
 function loadAgentConfig(yamlPath) {
   const path = resolve(yamlPath);
   const fileContent = readFileSync(path, "utf-8");
   const raw = parse(fileContent);
   const dir = dirname(path);
-  const config = AgentConfigSchema.parse(raw);
+  let config;
+  try {
+    config = AgentConfigSchema.parse(raw);
+  } catch (err) {
+    if (err instanceof ZodError) {
+      throw new Error(formatZodError(err, `agent config "${path}"`));
+    }
+    throw err;
+  }
   config.id = config.id || config.name;
   if (config.toolkits) {
     config.toolkits = config.toolkits.map(
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
   }
   return config;
 }
+function loadMatrix(yamlPath) {
+  const path = resolve(yamlPath);
+  const fileContent = readFileSync(path, "utf-8");
+  const raw = parse(fileContent);
+  return MatrixSchema.parse(raw);
+}
 function loadTestCase(yamlPath) {
   const path = resolve(yamlPath);
   const fileContent = readFileSync(path, "utf-8");
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
   if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
     raw.fixture = resolve(dir, raw.fixture);
   }
-  const testCase = TestCaseSchema.parse(raw);
+  let testCase;
+  try {
+    testCase = TestCaseSchema.parse(raw);
+  } catch (err) {
+    if (err instanceof ZodError) {
+      throw new Error(formatZodError(err, `test case "${path}"`));
+    }
+    throw err;
+  }
   testCase.id = testCase.id || testCase.name;
   if (testCase.toolkits) {
     testCase.toolkits = testCase.toolkits.map(
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
 }
 async function runBenchCommand(opts) {
   const suiteDir = resolve(opts.suite);
-  const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
   const concurrency = opts.concurrency || 2;
-  const agentConfigs = configPaths.map((p) => loadAgentConfig(p));
+  let agentConfigs;
+  let matrixId;
+  if (opts.matrix) {
+    const matrix = loadMatrix(opts.matrix);
+    agentConfigs = expandMatrix(matrix);
+    matrixId = randomUUID();
+    console.log(
+      `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
+    );
+  } else if (opts.configs) {
+    const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
+    agentConfigs = configPaths.map((p) => loadAgentConfig(p));
+  } else {
+    throw new Error("Either --configs or --matrix must be provided.");
+  }
   const yamlFiles = findTestCaseYamlFiles(suiteDir);
   if (yamlFiles.length === 0) {
     console.error(`No test cases found in suite directory: ${opts.suite}`);
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
       sandboxProvider,
       db,
       concurrency,
-      onRunUpdate
+      onRunUpdate,
+      extraScorers: [new StaticQualityScorer()],
+      matrixId
     });
   } catch (err) {
     console.error("Benchmark runner encountered an error:", err);
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
     )
   );
   printTagBreakdown(testCases, agentConfigs, runStates);
+  if (matrixId) {
+    await printMatrixSummary(db, matrixId, agentConfigs);
+  }
   process.exit(0);
 }
+async function printMatrixSummary(db, matrixId, agentConfigs) {
+  const runs = await getRunsByMatrixId(db, matrixId);
+  const aggregates = aggregateResults(runs, agentConfigs);
+  if (aggregates.length === 0) return;
+  const front = paretoFront(aggregates);
+  const frontIds = new Set(front.map((a) => a.agentConfigId));
+  const includesQuality = front.some((a) => a.avgQuality?.linterViolations !== void 0);
+  console.log("\n================ MATRIX SUMMARY ================");
+  for (const agg of aggregates) {
+    const marker = frontIds.has(agg.agentConfigId) ? "*" : " ";
+    const solveRatePct = (agg.solveRate * 100).toFixed(0);
+    const lint = agg.avgQuality?.linterViolations !== void 0 ? ` lint:${agg.avgQuality.linterViolations.toFixed(1)}` : "";
+    console.log(
+      `${marker} ${agg.agentConfigName.padEnd(36)} solve:${solveRatePct.padStart(3)}% (${agg.passedRuns}/${agg.totalRuns}) cost:$${agg.avgCostUsd.toFixed(4)}${lint}`
+    );
+  }
+  console.log(
+    `
+* = Pareto-optimal (solve rate, cost${includesQuality ? ", lint violations" : ""})`
+  );
+  console.log("=================================================\n");
+}
 function printTagBreakdown(testCases, agentConfigs, runStates) {
   const tagStats = {};
   for (const tc of testCases) {
@@ -340,6 +419,55 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
   }
   console.log("=================================================\n");
 }
+function isSkippedCheck(check) {
+  return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
+}
+function checkIcon(check) {
+  if (!check.passed) return "\u274C";
+  if (isSkippedCheck(check)) return "\u26A0\uFE0F";
+  return "\u2705";
+}
+async function validateCommand(testCasePath, opts) {
+  const testCase = loadTestCase(testCasePath);
+  if (opts?.strict) {
+    const missing = [];
+    if (!testCase.test_command) missing.push("test_command");
+    if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
+    if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
+    if (missing.length > 0) {
+      console.error(
+        `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
+      );
+      process.exit(1);
+    }
+  }
+  console.log(`Validating "${testCase.name}" (${testCasePath})...
+`);
+  const sandboxProvider = new DockerSandboxProvider();
+  const report = await validateTestCase({ testCase, sandboxProvider });
+  const hadExecutionSkip = report.checks.some(
+    (c) => c.name.includes("execution-checks (skipped")
+  );
+  for (const check of report.checks) {
+    const icon = checkIcon(check);
+    console.log(`${icon} ${check.name}`);
+    if (check.detail && check.detail !== "ok") {
+      const indented = check.detail.split("\n").map((line) => `   ${line}`).join("\n");
+      console.log(indented);
+    }
+  }
+  if (hadExecutionSkip) {
+    console.log("");
+    console.log(
+      "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
+    );
+  }
+  console.log("");
+  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
+  process.exit(report.ok ? 0 : 1);
+}
+// src/commands/import-pr.ts
 var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
 async function importPrCommand(repo, prNumber, opts) {
   const [owner, repoName] = repo.split("/");
@@ -378,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
   if (testDiff.trim()) {
     writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
   }
+  if (opts.cloneFixture) {
+    const fixtureDir = resolve(outDir, "fixture");
+    console.log(`
+Cloning ${owner}/${repoName} into ${fixtureDir}...`);
+    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
+      stdio: "inherit"
+    });
+    console.log(`Checking out base commit ${pr.base.sha}...`);
+    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
+  }
+  const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
+  const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
   const yamlDoc = {
     name: slug,
     description: pr.title,
     fixture: "./fixture",
     prompt: buildPrompt(pr),
-    success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
+    success,
     timeout_seconds: 600,
     tags: ["imported", repoName],
     created_at: pr.created_at,
-    // TODO: fill these in after setting up ./fixture (checked out at
-    // base.sha below) and running the test suite to discover real test names.
-    test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
-    fail_to_pass: ["<TODO: fill in via `agr validate`>"],
-    pass_to_pass: ["<TODO: fill in via `agr validate`>"]
+    test_command,
+    fail_to_pass: [],
+    pass_to_pass: []
   };
   if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
   if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
   if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
   if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
-  writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
+  writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
   console.log(`
 Imported PR #${pr.number}: "${pr.title}"`);
   console.log(`Wrote scaffold to: ${outDir}`);
@@ -407,11 +545,77 @@ Imported PR #${pr.number}: "${pr.title}"`);
   if (testDiff.trim())
     console.log(`  - test_patch.patch (${forbidModified.length} test file(s) changed)`);
   console.log("\nNext steps:");
-  console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
-  console.log("  2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
-  console.log(
-    `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
-  );
+  if (!opts.cloneFixture) {
+    console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
+    console.log("  2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
+    console.log(
+      `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
+    );
+  } else {
+    console.log("  1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
+    console.log(
+      `  2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
+    );
+  }
+  if (opts.validate) {
+    console.log("\nRunning validation...\n");
+    await validateCommand(resolve(outDir, "agr.yaml"));
+  }
+}
+function detectProjectKind(fixtureDir) {
+  if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
+    return "python";
+  }
+  if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
+  if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
+  return "unknown";
+}
+function projectTestDefaults(kind, cloned) {
+  if (!cloned) {
+    return {
+      success: [
+        { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+      ],
+      test_command: "<TODO: shell command that runs tests with TAP output>"
+    };
+  }
+  switch (kind) {
+    case "python":
+      return {
+        success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
+        test_command: "pytest --tap-stream"
+      };
+    case "node":
+      return {
+        success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
+        test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
+      };
+    case "go":
+      return {
+        success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
+        test_command: "<TODO: configure a TAP-producing test command for go>"
+      };
+    default:
+      return {
+        success: [
+          { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+        ],
+        test_command: "<TODO: shell command that runs tests with TAP output>"
+      };
+  }
+}
+function buildAgrYaml(doc, projectKind) {
+  let yaml = stringify(doc);
+  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
+  yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
+fail_to_pass:`);
+  if (projectKind === "python") {
+    yaml = yaml.replace(
+      /^test_command: (.+)$/m,
+      "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
+    );
+  }
+  return yaml;
 }
 function buildPrompt(pr) {
   const body = (pr.body || "").trim();
@@ -443,6 +647,35 @@ function splitDiff(diff) {
     forbidModified
   };
 }
+var VERBOSE_CONTENT_MAX = 200;
+function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
+  if (value.length <= max) return value;
+  return `${value.slice(0, max)}...`;
+}
+function formatVerboseStep(step) {
+  const prefix = `[step ${step.index}] ${step.kind}`;
+  if (step.kind === "tool_call" && step.tool) {
+    const args = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool}(${args})`;
+  }
+  if (step.kind === "tool_result" && step.tool) {
+    const result = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool} -> ${result}`;
+  }
+  if (step.kind === "message" && step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  if (step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  return prefix;
+}
+function formatMetricDetail(label, detail) {
+  if (/^No .+ configured; skipping/.test(detail)) {
+    return `\u26A0\uFE0F ${label}: ${detail}`;
+  }
+  return `${label}: ${detail}`;
+}
 async function runSingleCommand(testCasePath, opts) {
   const testCase = loadTestCase(testCasePath);
   let agentConfig = {
@@ -459,6 +692,14 @@ async function runSingleCommand(testCasePath, opts) {
   const adapter = new AiSdkAgentAdapter();
   const db = initDb();
   await saveTestCase(db, testCaseToDbRow(testCase));
+  await saveAgentConfig(db, {
+    id: agentConfig.id || agentConfig.name,
+    name: agentConfig.name,
+    model: agentConfig.model,
+    maxSteps: agentConfig.max_steps,
+    temperature: agentConfig.temperature,
+    createdAt: Math.floor(Date.now() / 1e3)
+  });
   const runId = randomUUID();
   try {
     const result = await runSingle({
@@ -467,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
       adapter,
       sandboxProvider,
       db,
-      runId
+      runId,
+      onStep: opts.verbose ? (step) => {
+        console.log(formatVerboseStep(step));
+      } : void 0
     });
     console.log("\n================ RUN SUMMARY ================");
     console.log(`Status:    ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -478,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
       console.log(`Error:     ${result.error}`);
     }
     if (result.metrics?.regression) {
-      console.log(`Regression: ${result.metrics.regression.detail}`);
+      console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
     }
     if (result.metrics?.diff) {
       console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
     }
     if (result.metrics?.localization) {
-      console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
+      console.log(
+        formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
+      );
     }
     console.log("=============================================\n");
   } catch (err) {
@@ -493,28 +739,90 @@ async function runSingleCommand(testCasePath, opts) {
   }
   process.exit(0);
 }
-async function validateCommand(testCasePath) {
-  const testCase = loadTestCase(testCasePath);
-  console.log(`Validating "${testCase.name}" (${testCasePath})...
-`);
-  const sandboxProvider = new DockerSandboxProvider();
-  const report = await validateTestCase({ testCase, sandboxProvider });
-  for (const check of report.checks) {
-    const icon = check.passed ? "\u2705" : "\u274C";
-    console.log(`${icon} ${check.name}`);
-    if (check.detail && check.detail !== "ok") {
-      const indented = check.detail.split("\n").map((line) => `   ${line}`).join("\n");
-      console.log(indented);
+async function traceCommand(runId, opts) {
+  const db = initDb();
+  const run = await getRun(db, runId);
+  if (!run) {
+    console.error(`Run not found: ${runId}`);
+    process.exit(1);
+  }
+  console.log(`Run ${run.id}`);
+  console.log(`  test case:    ${run.testCaseId}`);
+  console.log(`  agent config: ${run.agentConfigId}`);
+  console.log(
+    `  status:       ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
+  );
+  console.log(`  cost:         $${run.costUsd.toFixed(4)}`);
+  console.log(`  duration:     ${run.durationMs}ms`);
+  if (run.error) console.log(`  error:        ${run.error}`);
+  if (opts.quality) {
+    printQualityBreakdown(run.metrics);
+    return;
+  }
+  const steps = await getTraces(db, runId);
+  console.log(`
+${steps.length} step(s):`);
+  for (const step of steps) {
+    const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
+    console.log(
+      `  [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
+    );
+    if (step.content) {
+      const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
+      console.log(`      ${preview.replace(/\n/g, "\n      ")}`);
     }
   }
-  console.log("");
-  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
-  process.exit(report.ok ? 0 : 1);
+}
+function printQualityBreakdown(metricsJson) {
+  const metrics = metricsJson ? safeParseJson(metricsJson) : void 0;
+  console.log("\n================ QUALITY BREAKDOWN ================");
+  const staticQuality = metrics?.["static-quality"]?.quality;
+  const llmJudge = metrics?.["llm-judge"]?.quality;
+  const diff = metrics?.diff;
+  const localization = metrics?.localization;
+  if (staticQuality) {
+    console.log("Static quality (static-quality):");
+    if (staticQuality.diffLines !== void 0) console.log(`  diff lines:       ${staticQuality.diffLines}`);
+    if (staticQuality.filesModified !== void 0)
+      console.log(`  files modified:   ${staticQuality.filesModified}`);
+    if (staticQuality.todosIntroduced !== void 0)
+      console.log(`  TODOs introduced: ${staticQuality.todosIntroduced}`);
+    if (staticQuality.linterViolations !== void 0)
+      console.log(`  lint violations:  ${staticQuality.linterViolations}`);
+  }
+  if (llmJudge) {
+    if (staticQuality) console.log("");
+    console.log("LLM judge (llm-judge):");
+    if (llmJudge.llmJudgeScore !== void 0)
+      console.log(`  score:     ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
+    if (llmJudge.llmJudgeDetail) console.log(`  rationale: ${llmJudge.llmJudgeDetail}`);
+  }
+  if (diff) {
+    if (staticQuality || llmJudge) console.log("");
+    console.log(`Diff scope: ${diff.detail ?? JSON.stringify(diff)}`);
+  }
+  if (localization) {
+    console.log(`Localization: ${localization.detail ?? JSON.stringify(localization)}`);
+  }
+  if (!staticQuality && !llmJudge && !diff && !localization) {
+    console.log("  (no quality metrics recorded for this run)");
+  }
+  console.log("=====================================================\n");
+}
+function safeParseJson(value) {
+  try {
+    return JSON.parse(value);
+  } catch {
+    return void 0;
+  }
 }
 // src/index.ts
 var cli = cac("agr");
-cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
+cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
+  "--verbose",
+  "Stream agent steps live to the console as they happen"
+).action(async (testCase, options) => {
   try {
     await runSingleCommand(testCase, options);
   } catch (err) {
@@ -522,16 +830,25 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
     process.exit(1);
   }
 });
-cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).action(async (options) => {
-  if (!options.configs || !options.suite) {
-    console.error("Error: --configs and --suite are required for benchmarking.");
+cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
+  "--matrix <matrix>",
+  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
+).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
+  if (!options.configs && options.config) {
+    options.configs = options.config;
+  }
+  if (!options.suite || !options.configs && !options.matrix) {
+    console.error(
+      "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
+    );
     process.exit(1);
   }
   try {
     await runBenchCommand({
       configs: options.configs,
       suite: options.suite,
-      concurrency: Number(options.concurrency)
+      concurrency: Number(options.concurrency),
+      matrix: options.matrix
     });
   } catch (err) {
     console.error(`Error executing benchmark: ${err.message}`);
@@ -541,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
 cli.command(
   "validate <testCase>",
   "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
-).action(async (testCase) => {
+).option(
+  "--strict",
+  "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
+).action(async (testCase, options) => {
   try {
-    await validateCommand(testCase);
+    await validateCommand(testCase, options);
   } catch (err) {
     console.error(`Error executing validate: ${err.message}`);
     process.exit(1);
@@ -552,7 +872,7 @@ cli.command(
 cli.command(
   "import-pr <repo> <prNumber>",
   "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
-).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
+).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
   try {
     await importPrCommand(repo, prNumber, options);
   } catch (err) {
@@ -560,5 +880,27 @@ cli.command(
     process.exit(1);
   }
 });
+cli.command("trace <runId>", "Show the step trace and metrics for a single run").option(
+  "--quality",
+  "Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
+).action(async (runId, options) => {
+  try {
+    await traceCommand(runId, options);
+  } catch (err) {
+    console.error(`Error executing trace: ${err.message}`);
+    process.exit(1);
+  }
+});
 cli.help();
-cli.parse();
+try {
+  cli.parse();
+} catch (err) {
+  if (err.name === "CACError") {
+    console.error(`
+\u274C ${err.message}
+`);
+    cli.outputHelp();
+    process.exit(1);
+  }
+  throw err;
+}

package/package.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "agentgrader",
-  "version": "1.0.2",
+  "version": "1.0.6",
   "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
   "license": "MIT",
   "type": "module",
   "bin": {
-    "agr": "./dist/index.js",
-    "agentgrader": "./dist/index.js"
+    "agr": "dist/index.js",
+    "agentgrader": "dist/index.js"
   },
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
@@ -19,14 +19,18 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/agent-openrouter": "^2.0.0",
-    "@agentgrader/core": "^1.1.0",
-    "@agentgrader/sandbox-docker": "^2.0.0",
-    "@agentgrader/store": "^1.0.2",
+    "@agentgrader/agent-openrouter": "^2.0.1",
+    "@agentgrader/core": "^1.1.3",
+    "@agentgrader/optimizer": "^0.1.0",
+    "@agentgrader/sandbox-docker": "^2.0.2",
+    "@agentgrader/scorer-static": "^0.1.0",
+    "@agentgrader/store": "^1.0.3",
     "cac": "^6.7.14",
+    "dotenv": "^17.4.2",
     "ink": "^4.4.1",
     "react": "^18.2.0",
-    "yaml": "^2.5.1"
+    "yaml": "^2.5.1",
+    "zod": "^3.23.8"
   },
   "devDependencies": {
     "@types/react": "^18.2.0",