npm - agentgrader - Versions diffs - 1.0.2 → 1.0.5 - Mend

agentgrader 1.0.2 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/index.js +246 -33
package/package.json +9 -5

package/dist/index.js CHANGED Viewed

@@ -1,15 +1,20 @@
 #!/usr/bin/env node
+import 'dotenv/config';
 import { cac } from 'cac';
+import { randomUUID } from 'crypto';
 import { resolve, dirname, isAbsolute } from 'path';
 import { render, Box, Text } from 'ink';
-import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
+import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
 import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
 import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
 import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
+import { StaticQualityScorer } from '@agentgrader/scorer-static';
+import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
 import { jsx, jsxs } from 'react/jsx-runtime';
 import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
 import { stringify, parse } from 'yaml';
-import { randomUUID } from 'crypto';
+import { ZodError } from 'zod';
+import { execFileSync } from 'child_process';
 var Dashboard = ({ runs, testCases, configs, isFinished }) => {
   let totalCost = 0;
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
     ] })
   ] });
 };
+// src/lib/format-zod-error.ts
+/**
+ * Turn a Zod validation failure into a readable, multi-line message.
+ *
+ * The first line is `Invalid <fileLabel>:`; every issue follows on its own
+ * line as `  - <dotted.path>: <message>`. Issues with an empty path are
+ * labelled `(root)`.
+ *
+ * @param {{ issues: Array<{ path: Array<string|number>, message: string }> }} err
+ *   Error object exposing an `issues` array.
+ * @param {string} fileLabel - Description of the file being validated.
+ * @returns {string} The formatted message.
+ */
+function formatZodError(err, fileLabel) {
+  const bullets = [];
+  for (const { path: segments, message } of err.issues) {
+    const dotted = segments.join(".");
+    const location = dotted === "" ? "(root)" : dotted;
+    bullets.push(`  - ${location}: ${message}`);
+  }
+  return `Invalid ${fileLabel}:\n${bullets.join("\n")}`;
+}
+// src/lib/load-agent-config.ts
 function loadAgentConfig(yamlPath) {
   const path = resolve(yamlPath);
   const fileContent = readFileSync(path, "utf-8");
   const raw = parse(fileContent);
   const dir = dirname(path);
-  const config = AgentConfigSchema.parse(raw);
+  let config;
+  try {
+    config = AgentConfigSchema.parse(raw);
+  } catch (err) {
+    if (err instanceof ZodError) {
+      throw new Error(formatZodError(err, `agent config "${path}"`));
+    }
+    throw err;
+  }
   config.id = config.id || config.name;
   if (config.toolkits) {
     config.toolkits = config.toolkits.map(
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
   }
   return config;
 }
+/**
+ * Load an optimizer matrix definition from a YAML file and validate it
+ * against MatrixSchema.
+ *
+ * Schema failures are reported the same way loadAgentConfig and loadTestCase
+ * report theirs: the ZodError is converted into a readable message via
+ * formatZodError instead of leaking Zod's raw issue dump to the CLI.
+ *
+ * @param {string} yamlPath - Path to the matrix YAML file (resolved with `resolve`).
+ * @returns The value returned by `MatrixSchema.parse`.
+ * @throws {Error} With a formatted issue list when schema validation fails
+ *   (the original ZodError is attached as `cause`). Any other error, such as
+ *   a file-read or YAML-parse failure, propagates unchanged.
+ */
+function loadMatrix(yamlPath) {
+  const path = resolve(yamlPath);
+  const fileContent = readFileSync(path, "utf-8");
+  const raw = parse(fileContent);
+  try {
+    return MatrixSchema.parse(raw);
+  } catch (err) {
+    if (err instanceof ZodError) {
+      throw new Error(formatZodError(err, `matrix "${path}"`), { cause: err });
+    }
+    throw err;
+  }
+}
 function loadTestCase(yamlPath) {
   const path = resolve(yamlPath);
   const fileContent = readFileSync(path, "utf-8");
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
   if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
     raw.fixture = resolve(dir, raw.fixture);
   }
-  const testCase = TestCaseSchema.parse(raw);
+  let testCase;
+  try {
+    testCase = TestCaseSchema.parse(raw);
+  } catch (err) {
+    if (err instanceof ZodError) {
+      throw new Error(formatZodError(err, `test case "${path}"`));
+    }
+    throw err;
+  }
   testCase.id = testCase.id || testCase.name;
   if (testCase.toolkits) {
     testCase.toolkits = testCase.toolkits.map(
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
 }
 async function runBenchCommand(opts) {
   const suiteDir = resolve(opts.suite);
-  const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
   const concurrency = opts.concurrency || 2;
-  const agentConfigs = configPaths.map((p) => loadAgentConfig(p));
+  let agentConfigs;
+  let matrixId;
+  if (opts.matrix) {
+    const matrix = loadMatrix(opts.matrix);
+    agentConfigs = expandMatrix(matrix);
+    matrixId = randomUUID();
+    console.log(
+      `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
+    );
+  } else if (opts.configs) {
+    const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
+    agentConfigs = configPaths.map((p) => loadAgentConfig(p));
+  } else {
+    throw new Error("Either --configs or --matrix must be provided.");
+  }
   const yamlFiles = findTestCaseYamlFiles(suiteDir);
   if (yamlFiles.length === 0) {
     console.error(`No test cases found in suite directory: ${opts.suite}`);
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
       sandboxProvider,
       db,
       concurrency,
-      onRunUpdate
+      onRunUpdate,
+      extraScorers: [new StaticQualityScorer()],
+      matrixId
     });
   } catch (err) {
     console.error("Benchmark runner encountered an error:", err);
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
     )
   );
   printTagBreakdown(testCases, agentConfigs, runStates);
+  if (matrixId) {
+    await printMatrixSummary(db, matrixId, agentConfigs);
+  }
   process.exit(0);
 }
+/**
+ * Print a per-config summary table for one matrix benchmark, marking the
+ * Pareto-optimal configs with "*".
+ *
+ * Runs are fetched with getRunsByMatrixId, reduced with aggregateResults and
+ * ranked with paretoFront. Nothing is printed when there are no aggregates.
+ *
+ * @param db - Store handle passed through to getRunsByMatrixId.
+ * @param {string} matrixId - Identifier the runs are looked up by.
+ * @param agentConfigs - Configs handed to aggregateResults together with the runs.
+ * @returns {Promise<void>}
+ */
+async function printMatrixSummary(db, matrixId, agentConfigs) {
+  const matrixRuns = await getRunsByMatrixId(db, matrixId);
+  const rows = aggregateResults(matrixRuns, agentConfigs);
+  if (rows.length === 0) {
+    return;
+  }
+  const optimal = paretoFront(rows);
+  const optimalIds = new Set(optimal.map((entry) => entry.agentConfigId));
+  const hasLint = (entry) => entry.avgQuality?.linterViolations !== undefined;
+  // The legend only mentions lint violations when a front member reports them.
+  const includesQuality = optimal.some((entry) => hasLint(entry));
+  console.log("\n================ MATRIX SUMMARY ================");
+  for (const row of rows) {
+    const marker = optimalIds.has(row.agentConfigId) ? "*" : " ";
+    const solvePct = (row.solveRate * 100).toFixed(0);
+    let lint = "";
+    if (hasLint(row)) {
+      lint = ` lint:${row.avgQuality.linterViolations.toFixed(1)}`;
+    }
+    const name = row.agentConfigName.padEnd(36);
+    const tally = `(${row.passedRuns}/${row.totalRuns})`;
+    const cost = row.avgCostUsd.toFixed(4);
+    console.log(`${marker} ${name} solve:${solvePct.padStart(3)}% ${tally} cost:$${cost}${lint}`);
+  }
+  const extraObjective = includesQuality ? ", lint violations" : "";
+  console.log(`\n* = Pareto-optimal (solve rate, cost${extraObjective})`);
+  console.log("=================================================\n");
+}
 function printTagBreakdown(testCases, agentConfigs, runStates) {
   const tagStats = {};
   for (const tc of testCases) {
@@ -340,6 +419,26 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
   }
   console.log("=================================================\n");
 }
+/**
+ * Validate a single test case inside a Docker sandbox and report each check.
+ *
+ * Prints one line per check (with its detail indented underneath unless the
+ * detail is empty or "ok"), then a final verdict. This function always ends
+ * the process: exit code 0 when the report is ok, 1 otherwise.
+ *
+ * @param {string} testCasePath - Path to the test case YAML file.
+ * @returns {Promise<void>}
+ */
+async function validateCommand(testCasePath) {
+  const testCase = loadTestCase(testCasePath);
+  console.log(`Validating "${testCase.name}" (${testCasePath})...\n`);
+  const report = await validateTestCase({
+    testCase,
+    sandboxProvider: new DockerSandboxProvider()
+  });
+  for (const { passed, name, detail } of report.checks) {
+    console.log(`${passed ? "\u2705" : "\u274C"} ${name}`);
+    if (!detail || detail === "ok") {
+      continue;
+    }
+    // Indent every line of the detail by three spaces.
+    console.log(detail.split("\n").map((row) => `   ${row}`).join("\n"));
+  }
+  console.log("");
+  if (report.ok) {
+    console.log("\u2705 Validation passed.");
+    process.exit(0);
+  } else {
+    console.log("\u274C Validation failed.");
+    process.exit(1);
+  }
+}
+// src/commands/import-pr.ts
 var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
 async function importPrCommand(repo, prNumber, opts) {
   const [owner, repoName] = repo.split("/");
@@ -406,12 +505,33 @@ Imported PR #${pr.number}: "${pr.title}"`);
     console.log(`  - solution.patch (${expectedFiles.length} file(s) changed)`);
   if (testDiff.trim())
     console.log(`  - test_patch.patch (${forbidModified.length} test file(s) changed)`);
+  if (opts.cloneFixture) {
+    const fixtureDir = resolve(outDir, "fixture");
+    console.log(`
+Cloning ${owner}/${repoName} into ${fixtureDir}...`);
+    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
+      stdio: "inherit"
+    });
+    console.log(`Checking out base commit ${pr.base.sha}...`);
+    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
+  }
   console.log("\nNext steps:");
-  console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
-  console.log("  2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
-  console.log(
-    `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
-  );
+  if (!opts.cloneFixture) {
+    console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
+    console.log("  2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
+    console.log(
+      `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
+    );
+  } else {
+    console.log("  1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
+    console.log(
+      `  2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
+    );
+  }
+  if (opts.validate) {
+    console.log("\nRunning validation...\n");
+    await validateCommand(resolve(outDir, "agr.yaml"));
+  }
 }
 function buildPrompt(pr) {
   const body = (pr.body || "").trim();
@@ -459,6 +579,14 @@ async function runSingleCommand(testCasePath, opts) {
   const adapter = new AiSdkAgentAdapter();
   const db = initDb();
   await saveTestCase(db, testCaseToDbRow(testCase));
+  await saveAgentConfig(db, {
+    id: agentConfig.id || agentConfig.name,
+    name: agentConfig.name,
+    model: agentConfig.model,
+    maxSteps: agentConfig.max_steps,
+    temperature: agentConfig.temperature,
+    createdAt: Math.floor(Date.now() / 1e3)
+  });
   const runId = randomUUID();
   try {
     const result = await runSingle({
@@ -493,23 +621,82 @@ async function runSingleCommand(testCasePath, opts) {
   }
   process.exit(0);
 }
-async function validateCommand(testCasePath) {
-  const testCase = loadTestCase(testCasePath);
-  console.log(`Validating "${testCase.name}" (${testCasePath})...
-`);
-  const sandboxProvider = new DockerSandboxProvider();
-  const report = await validateTestCase({ testCase, sandboxProvider });
-  for (const check of report.checks) {
-    const icon = check.passed ? "\u2705" : "\u274C";
-    console.log(`${icon} ${check.name}`);
-    if (check.detail && check.detail !== "ok") {
-      const indented = check.detail.split("\n").map((line) => `   ${line}`).join("\n");
-      console.log(indented);
+/**
+ * Show the stored summary and step trace for a single run.
+ *
+ * Looks the run up with getRun; when it does not exist, an error is printed
+ * and the process exits with code 1. Otherwise a header (test case, agent
+ * config, status, cost, duration and, if present, error) is printed.
+ *
+ * With `opts.quality` set, only the quality breakdown derived from
+ * `run.metrics` is printed and the step trace is skipped. Without it, every
+ * step returned by getTraces is listed with its token counts and cost.
+ *
+ * @param {string} runId - Identifier of the run to display.
+ * @param {{ quality?: boolean }} opts - CLI options.
+ * @returns {Promise<void>}
+ */
+async function traceCommand(runId, opts) {
+  const db = initDb();
+  const run = await getRun(db, runId);
+  if (!run) {
+    console.error(`Run not found: ${runId}`);
+    process.exit(1);
+  }
+  console.log(`Run ${run.id}`);
+  console.log(`  test case:    ${run.testCaseId}`);
+  console.log(`  agent config: ${run.agentConfigId}`);
+  // The "(passed)"/"(failed)" suffix appears only when run.passed is strictly true or false.
+  console.log(
+    `  status:       ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
+  );
+  console.log(`  cost:         $${run.costUsd.toFixed(4)}`);
+  console.log(`  duration:     ${run.durationMs}ms`);
+  if (run.error) console.log(`  error:        ${run.error}`);
+  // --quality replaces the step listing with the metrics breakdown.
+  if (opts.quality) {
+    printQualityBreakdown(run.metrics);
+    return;
+  }
+  const steps = await getTraces(db, runId);
+  console.log(`
+${steps.length} step(s):`);
+  for (const step of steps) {
+    const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
+    console.log(
+      `  [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
+    );
+    if (step.content) {
+      // Content longer than 200 characters is cut and suffixed with "...";
+      // continuation lines are indented to line up under the first one.
+      const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
+      console.log(`      ${preview.replace(/\n/g, "\n      ")}`);
     }
   }
-  console.log("");
-  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
-  process.exit(report.ok ? 0 : 1);
+}
+/**
+ * Print the quality-related metrics of a run as a framed report.
+ *
+ * Sections are emitted in a fixed order: static quality, LLM judge, diff
+ * scope, localization. Each appears only when its entry exists in the parsed
+ * metrics. A placeholder line is printed when none of them is present,
+ * including when `metricsJson` is empty or is not valid JSON.
+ *
+ * @param {string|undefined|null} metricsJson - JSON-encoded metrics of a run.
+ * @returns {void}
+ */
+function printQualityBreakdown(metricsJson) {
+  const metrics = metricsJson ? safeParseJson(metricsJson) : undefined;
+  console.log("\n================ QUALITY BREAKDOWN ================");
+  const staticQuality = metrics?.["static-quality"]?.quality;
+  const llmJudge = metrics?.["llm-judge"]?.quality;
+  const diff = metrics?.diff;
+  const localization = metrics?.localization;
+  if (staticQuality) {
+    console.log("Static quality (static-quality):");
+    // Label/field pairs, printed in this order when the field is defined.
+    const staticRows = [
+      ["  diff lines:       ", "diffLines"],
+      ["  files modified:   ", "filesModified"],
+      ["  TODOs introduced: ", "todosIntroduced"],
+      ["  lint violations:  ", "linterViolations"]
+    ];
+    for (const [label, field] of staticRows) {
+      const value = staticQuality[field];
+      if (value !== undefined) {
+        console.log(`${label}${value}`);
+      }
+    }
+  }
+  if (llmJudge) {
+    if (staticQuality) {
+      console.log("");
+    }
+    console.log("LLM judge (llm-judge):");
+    if (llmJudge.llmJudgeScore !== undefined) {
+      console.log(`  score:     ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
+    }
+    if (llmJudge.llmJudgeDetail) {
+      console.log(`  rationale: ${llmJudge.llmJudgeDetail}`);
+    }
+  }
+  if (diff) {
+    if (staticQuality || llmJudge) {
+      console.log("");
+    }
+    const scope = diff.detail ?? JSON.stringify(diff);
+    console.log(`Diff scope: ${scope}`);
+  }
+  if (localization) {
+    const where = localization.detail ?? JSON.stringify(localization);
+    console.log(`Localization: ${where}`);
+  }
+  if (!(staticQuality || llmJudge || diff || localization)) {
+    console.log("  (no quality metrics recorded for this run)");
+  }
+  console.log("=====================================================\n");
+}
+/**
+ * Parse a JSON string without throwing.
+ *
+ * @param {string} value - Text to parse.
+ * @returns {*} The parsed value, or `undefined` when `JSON.parse` throws.
+ */
+function safeParseJson(value) {
+  let parsed;
+  try {
+    parsed = JSON.parse(value);
+  } catch {
+    // Malformed input is reported as "no value" rather than as an error.
+    parsed = undefined;
+  }
+  return parsed;
 }
 // src/index.ts
@@ -522,16 +709,20 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
     process.exit(1);
   }
 });
-cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).action(async (options) => {
-  if (!options.configs || !options.suite) {
-    console.error("Error: --configs and --suite are required for benchmarking.");
+cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
+  "--matrix <matrix>",
+  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
+).action(async (options) => {
+  if (!options.suite || !options.configs && !options.matrix) {
+    console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
     process.exit(1);
   }
   try {
     await runBenchCommand({
       configs: options.configs,
       suite: options.suite,
-      concurrency: Number(options.concurrency)
+      concurrency: Number(options.concurrency),
+      matrix: options.matrix
     });
   } catch (err) {
     console.error(`Error executing benchmark: ${err.message}`);
@@ -552,7 +743,7 @@ cli.command(
 cli.command(
   "import-pr <repo> <prNumber>",
   "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
-).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
+).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
   try {
     await importPrCommand(repo, prNumber, options);
   } catch (err) {
@@ -560,5 +751,27 @@ cli.command(
     process.exit(1);
   }
 });
+cli.command("trace <runId>", "Show the step trace and metrics for a single run").option(
+  "--quality",
+  "Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
+).action(async (runId, options) => {
+  try {
+    await traceCommand(runId, options);
+  } catch (err) {
+    console.error(`Error executing trace: ${err.message}`);
+    process.exit(1);
+  }
+});
 cli.help();
-cli.parse();
+// Parse argv. Argument errors raised by cac (err.name === "CACError") are
+// reported with the help text and exit code 1; anything else is rethrown.
+try {
+  cli.parse();
+} catch (err) {
+  const isUsageError = err.name === "CACError";
+  if (!isUsageError) {
+    throw err;
+  }
+  console.error(`\n\u274C ${err.message}\n`);
+  cli.outputHelp();
+  process.exit(1);
+  // Only reached if process.exit returns (e.g. when stubbed).
+  throw err;
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agentgrader",
-  "version": "1.0.2",
+  "version": "1.0.5",
   "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
   "license": "MIT",
   "type": "module",
@@ -19,14 +19,18 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/agent-openrouter": "^2.0.0",
-    "@agentgrader/core": "^1.1.0",
-    "@agentgrader/sandbox-docker": "^2.0.0",
+    "@agentgrader/agent-openrouter": "^2.0.1",
+    "@agentgrader/core": "^1.1.1",
+    "@agentgrader/optimizer": "^0.1.0",
+    "@agentgrader/sandbox-docker": "^2.0.2",
+    "@agentgrader/scorer-static": "^0.1.0",
     "@agentgrader/store": "^1.0.2",
     "cac": "^6.7.14",
+    "dotenv": "^17.4.2",
     "ink": "^4.4.1",
     "react": "^18.2.0",
-    "yaml": "^2.5.1"
+    "yaml": "^2.5.1",
+    "zod": "^3.23.8"
   },
   "devDependencies": {
     "@types/react": "^18.2.0",