npm - agentgrader - Versions diffs - 1.0.5 → 1.0.7 - Mend

agentgrader 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/index.js +220 -39
package/package.json +7 -7

package/dist/index.js CHANGED Viewed

@@ -11,11 +11,17 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
 import { StaticQualityScorer } from '@agentgrader/scorer-static';
 import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
 import { jsx, jsxs } from 'react/jsx-runtime';
-import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
-import { stringify, parse } from 'yaml';
+import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
+import { parse, stringify } from 'yaml';
 import { ZodError } from 'zod';
 import { execFileSync } from 'child_process';
+var CONFIG_COL_WIDTH = 24;
+var CONFIG_LABEL_MAX = 20;
+function truncateLabel(name, max = CONFIG_LABEL_MAX) {
+  if (name.length <= max) return name;
+  return `${name.slice(0, max - 1)}\u2026`;
+}
 var Dashboard = ({ runs, testCases, configs, isFinished }) => {
   let totalCost = 0;
   let totalSteps = 0;
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
           r.testCaseId
         ] }),
         /* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
-        /* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
+        /* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
         /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
           " (Steps: ",
           r.stepsCount,
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
     /* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
       /* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
         /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
-        configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", children: cfg }) }, cfg))
+        configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
       ] }),
       testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
-        /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
+        /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
         configs.map((cfg) => {
           const key = `${tc}_${cfg}`;
           const run = runs[key];
           if (!run) {
-            return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
+            return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
           }
           if (run.status === "running") {
-            return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
+            return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
           }
           if (run.status === "failed" || !run.passed) {
             const seconds2 = (run.durationMs / 1e3).toFixed(1);
-            return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "red", children: [
+            return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
               "\u2717 ",
               seconds2,
               "s ($",
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
             ] }) }, cfg);
           }
           const seconds = (run.durationMs / 1e3).toFixed(1);
-          return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "green", children: [
+          return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
             "\u2713 ",
             seconds,
             "s ($",
@@ -419,20 +425,54 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
   }
   console.log("=================================================\n");
 }
-async function validateCommand(testCasePath) {
+function isSkippedCheck(check) {
+  return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
+}
+function checkIcon(check) {
+  if (!check.passed) return "\u274C";
+  if (isSkippedCheck(check)) return "\u26A0\uFE0F";
+  return "\u2705";
+}
+async function validateCommand(testCasePath, opts) {
   const testCase = loadTestCase(testCasePath);
+  if (opts?.strict) {
+    const missing = [];
+    if (!testCase.test_command) missing.push("test_command");
+    if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
+    if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
+    if (missing.length > 0) {
+      console.error(
+        `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
+      );
+      process.exit(1);
+    }
+  }
   console.log(`Validating "${testCase.name}" (${testCasePath})...
 `);
   const sandboxProvider = new DockerSandboxProvider();
   const report = await validateTestCase({ testCase, sandboxProvider });
+  const hadExecutionSkip = report.checks.some(
+    (c) => c.name.includes("execution-checks (skipped")
+  );
   for (const check of report.checks) {
-    const icon = check.passed ? "\u2705" : "\u274C";
+    const icon = checkIcon(check);
     console.log(`${icon} ${check.name}`);
     if (check.detail && check.detail !== "ok") {
       const indented = check.detail.split("\n").map((line) => `   ${line}`).join("\n");
       console.log(indented);
     }
   }
+  if (hadExecutionSkip) {
+    console.log("");
+    console.log(
+      "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
+    );
+    if (report.ok && !opts?.strict) {
+      console.log(
+        "Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
+      );
+    }
+  }
   console.log("");
   console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
   process.exit(report.ok ? 0 : 1);
@@ -477,26 +517,40 @@ async function importPrCommand(repo, prNumber, opts) {
   if (testDiff.trim()) {
     writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
   }
+  const fixtureDir = resolve(outDir, "fixture");
+  if (opts.cloneFixture) {
+    console.log(`
+Cloning ${owner}/${repoName} into ${fixtureDir}...`);
+    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
+      stdio: "inherit"
+    });
+    console.log(`Checking out base commit ${pr.base.sha}...`);
+    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
+  }
+  const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
+  const { success, test_command, testCommandHint } = projectTestDefaults(
+    projectKind,
+    opts.cloneFixture ?? false,
+    fixtureDir
+  );
   const yamlDoc = {
     name: slug,
     description: pr.title,
     fixture: "./fixture",
     prompt: buildPrompt(pr),
-    success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
+    success,
     timeout_seconds: 600,
     tags: ["imported", repoName],
     created_at: pr.created_at,
-    // TODO: fill these in after setting up ./fixture (checked out at
-    // base.sha below) and running the test suite to discover real test names.
-    test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
-    fail_to_pass: ["<TODO: fill in via `agr validate`>"],
-    pass_to_pass: ["<TODO: fill in via `agr validate`>"]
+    test_command,
+    fail_to_pass: [],
+    pass_to_pass: []
   };
   if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
   if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
   if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
   if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
-  writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
+  writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
   console.log(`
 Imported PR #${pr.number}: "${pr.title}"`);
   console.log(`Wrote scaffold to: ${outDir}`);
@@ -505,16 +559,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
     console.log(`  - solution.patch (${expectedFiles.length} file(s) changed)`);
   if (testDiff.trim())
     console.log(`  - test_patch.patch (${forbidModified.length} test file(s) changed)`);
-  if (opts.cloneFixture) {
-    const fixtureDir = resolve(outDir, "fixture");
-    console.log(`
-Cloning ${owner}/${repoName} into ${fixtureDir}...`);
-    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
-      stdio: "inherit"
-    });
-    console.log(`Checking out base commit ${pr.base.sha}...`);
-    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
-  }
   console.log("\nNext steps:");
   if (!opts.cloneFixture) {
     console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
@@ -522,8 +566,11 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
     console.log(
       `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
     );
+    console.log(
+      "\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
+    );
   } else {
-    console.log("  1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
+    console.log("  1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
     console.log(
       `  2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
     );
@@ -533,6 +580,92 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
     await validateCommand(resolve(outDir, "agr.yaml"));
   }
 }
+function detectProjectKind(fixtureDir) {
+  if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
+    return "python";
+  }
+  if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
+  if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
+  return "unknown";
+}
+function projectTestDefaults(kind, cloned, fixtureDir) {
+  if (!cloned) {
+    return {
+      success: [
+        { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+      ],
+      test_command: "<TODO: shell command that runs tests with TAP output>",
+      testCommandHint: "none"
+    };
+  }
+  switch (kind) {
+    case "python":
+      return {
+        success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
+        test_command: "pytest --tap-stream",
+        testCommandHint: "python"
+      };
+    case "node":
+      return detectNodeTestRunner(fixtureDir);
+    case "go":
+      return {
+        success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
+        test_command: "<TODO: configure a TAP-producing test command for go>",
+        testCommandHint: "go"
+      };
+    default:
+      return {
+        success: [
+          { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+        ],
+        test_command: "<TODO: shell command that runs tests with TAP output>",
+        testCommandHint: "none"
+      };
+  }
+}
+function detectNodeTestRunner(fixtureDir) {
+  const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
+  const fallback = {
+    success,
+    test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
+    testCommandHint: "node-unknown"
+  };
+  try {
+    const pkgPath = resolve(fixtureDir, "package.json");
+    if (!existsSync(pkgPath)) return fallback;
+    const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
+    const deps = { ...pkg.dependencies, ...pkg.devDependencies };
+    if (deps.ava) {
+      return { success, test_command: "npx ava --tap", testCommandHint: "ava" };
+    }
+    if (deps.vitest) {
+      return { success, test_command: "npx vitest run --reporter=tap", testCommandHint: "vitest" };
+    }
+    if (deps.jest) {
+      return { success, test_command: "npx jest --ci", testCommandHint: "jest" };
+    }
+    return fallback;
+  } catch {
+    return fallback;
+  }
+}
+function buildAgrYaml(doc, testCommandHint) {
+  let yaml = stringify(doc);
+  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
+  yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
+fail_to_pass:`);
+  const testCommandComments = {
+    python: "# Requires pytest-tap for TAP output (pip install pytest-tap).",
+    jest: "# jest does not output TAP by default; consider jest-tap-reporter",
+    "node-unknown": "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"
+  };
+  const comment = testCommandComments[testCommandHint];
+  if (comment) {
+    yaml = yaml.replace(/^test_command: (.+)$/m, `${comment}
+$&`);
+  }
+  return yaml;
+}
 function buildPrompt(pr) {
   const body = (pr.body || "").trim();
   return body ? `${pr.title}
@@ -563,6 +696,35 @@ function splitDiff(diff) {
     forbidModified
   };
 }
+var VERBOSE_CONTENT_MAX = 200;
+function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
+  if (value.length <= max) return value;
+  return `${value.slice(0, max)}...`;
+}
+function formatVerboseStep(step) {
+  const prefix = `[step ${step.index}] ${step.kind}`;
+  if (step.kind === "tool_call" && step.tool) {
+    const args = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool}(${args})`;
+  }
+  if (step.kind === "tool_result" && step.tool) {
+    const result = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool} -> ${result}`;
+  }
+  if (step.kind === "message" && step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  if (step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  return prefix;
+}
+function formatMetricDetail(label, detail) {
+  if (/^No .+ configured; skipping/.test(detail)) {
+    return `\u26A0\uFE0F ${label}: ${detail}`;
+  }
+  return `${label}: ${detail}`;
+}
 async function runSingleCommand(testCasePath, opts) {
   const testCase = loadTestCase(testCasePath);
   let agentConfig = {
@@ -595,7 +757,10 @@ async function runSingleCommand(testCasePath, opts) {
       adapter,
       sandboxProvider,
       db,
-      runId
+      runId,
+      onStep: opts.verbose ? (step) => {
+        console.log(formatVerboseStep(step));
+      } : void 0
     });
     console.log("\n================ RUN SUMMARY ================");
     console.log(`Status:    ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -606,13 +771,15 @@ async function runSingleCommand(testCasePath, opts) {
       console.log(`Error:     ${result.error}`);
     }
     if (result.metrics?.regression) {
-      console.log(`Regression: ${result.metrics.regression.detail}`);
+      console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
     }
     if (result.metrics?.diff) {
       console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
     }
     if (result.metrics?.localization) {
-      console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
+      console.log(
+        formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
+      );
     }
     console.log("=============================================\n");
   } catch (err) {
@@ -701,7 +868,10 @@ function safeParseJson(value) {
 // src/index.ts
 var cli = cac("agr");
-cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
+cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
+  "--verbose",
+  "Stream agent steps live to the console as they happen"
+).action(async (testCase, options) => {
   try {
     await runSingleCommand(testCase, options);
   } catch (err) {
@@ -709,12 +879,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
     process.exit(1);
   }
 });
-cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
+cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
   "--matrix <matrix>",
   "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
-).action(async (options) => {
+).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
+  if (!options.configs && options.config) {
+    options.configs = options.config;
+  }
   if (!options.suite || !options.configs && !options.matrix) {
-    console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
+    console.error(
+      "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
+    );
     process.exit(1);
   }
   try {
@@ -732,9 +907,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
 cli.command(
   "validate <testCase>",
   "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
-).action(async (testCase) => {
+).option(
+  "--strict",
+  "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
+).action(async (testCase, options) => {
   try {
-    await validateCommand(testCase);
+    await validateCommand(testCase, options);
   } catch (err) {
     console.error(`Error executing validate: ${err.message}`);
     process.exit(1);
@@ -743,7 +921,10 @@ cli.command(
 cli.command(
   "import-pr <repo> <prNumber>",
   "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
-).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
+).option("--out <dir>", "Output directory for the scaffolded test case").option(
+  "--clone-fixture",
+  "Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
+).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
   try {
     await importPrCommand(repo, prNumber, options);
   } catch (err) {

package/package.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "agentgrader",
-  "version": "1.0.5",
+  "version": "1.0.7",
   "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
   "license": "MIT",
   "type": "module",
   "bin": {
-    "agr": "./dist/index.js",
-    "agentgrader": "./dist/index.js"
+    "agr": "dist/index.js",
+    "agentgrader": "dist/index.js"
   },
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
@@ -19,12 +19,12 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/agent-openrouter": "^2.0.1",
-    "@agentgrader/core": "^1.1.1",
-    "@agentgrader/optimizer": "^0.1.0",
+    "@agentgrader/agent-openrouter": "^2.0.3",
+    "@agentgrader/core": "^1.1.3",
+    "@agentgrader/optimizer": "^0.1.1",
     "@agentgrader/sandbox-docker": "^2.0.2",
     "@agentgrader/scorer-static": "^0.1.0",
-    "@agentgrader/store": "^1.0.2",
+    "@agentgrader/store": "^1.0.3",
     "cac": "^6.7.14",
     "dotenv": "^17.4.2",
     "ink": "^4.4.1",