npm - agentgrader - Versions diffs - 1.0.5 → 1.0.6 - Mend

agentgrader 1.0.5 → 1.0.6

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/index.js +161 -32
package/package.json +5 -5

package/dist/index.js CHANGED

@@ -11,8 +11,8 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
 import { StaticQualityScorer } from '@agentgrader/scorer-static';
 import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
 import { jsx, jsxs } from 'react/jsx-runtime';
-import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
-import { stringify, parse } from 'yaml';
+import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
+import { parse, stringify } from 'yaml';
 import { ZodError } from 'zod';
 import { execFileSync } from 'child_process';
@@ -419,20 +419,49 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
   }
   console.log("=================================================\n");
 }
-async function validateCommand(testCasePath) {
+function isSkippedCheck(check) {
+  return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
+}
+function checkIcon(check) {
+  if (!check.passed) return "\u274C";
+  if (isSkippedCheck(check)) return "\u26A0\uFE0F";
+  return "\u2705";
+}
+async function validateCommand(testCasePath, opts) {
   const testCase = loadTestCase(testCasePath);
+  if (opts?.strict) {
+    const missing = [];
+    if (!testCase.test_command) missing.push("test_command");
+    if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
+    if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
+    if (missing.length > 0) {
+      console.error(
+        `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
+      );
+      process.exit(1);
+    }
+  }
   console.log(`Validating "${testCase.name}" (${testCasePath})...
 `);
   const sandboxProvider = new DockerSandboxProvider();
   const report = await validateTestCase({ testCase, sandboxProvider });
+  const hadExecutionSkip = report.checks.some(
+    (c) => c.name.includes("execution-checks (skipped")
+  );
   for (const check of report.checks) {
-    const icon = check.passed ? "\u2705" : "\u274C";
+    const icon = checkIcon(check);
     console.log(`${icon} ${check.name}`);
     if (check.detail && check.detail !== "ok") {
       const indented = check.detail.split("\n").map((line) => `   ${line}`).join("\n");
       console.log(indented);
     }
   }
+  if (hadExecutionSkip) {
+    console.log("");
+    console.log(
+      "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
+    );
+  }
   console.log("");
   console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
   process.exit(report.ok ? 0 : 1);
@@ -477,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
   if (testDiff.trim()) {
     writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
   }
+  if (opts.cloneFixture) {
+    const fixtureDir = resolve(outDir, "fixture");
+    console.log(`
+Cloning ${owner}/${repoName} into ${fixtureDir}...`);
+    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
+      stdio: "inherit"
+    });
+    console.log(`Checking out base commit ${pr.base.sha}...`);
+    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
+  }
+  const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
+  const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
   const yamlDoc = {
     name: slug,
     description: pr.title,
     fixture: "./fixture",
     prompt: buildPrompt(pr),
-    success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
+    success,
     timeout_seconds: 600,
     tags: ["imported", repoName],
     created_at: pr.created_at,
-    // TODO: fill these in after setting up ./fixture (checked out at
-    // base.sha below) and running the test suite to discover real test names.
-    test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
-    fail_to_pass: ["<TODO: fill in via `agr validate`>"],
-    pass_to_pass: ["<TODO: fill in via `agr validate`>"]
+    test_command,
+    fail_to_pass: [],
+    pass_to_pass: []
   };
   if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
   if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
   if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
   if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
-  writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
+  writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
   console.log(`
 Imported PR #${pr.number}: "${pr.title}"`);
   console.log(`Wrote scaffold to: ${outDir}`);
@@ -505,16 +544,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
     console.log(`  - solution.patch (${expectedFiles.length} file(s) changed)`);
   if (testDiff.trim())
     console.log(`  - test_patch.patch (${forbidModified.length} test file(s) changed)`);
-  if (opts.cloneFixture) {
-    const fixtureDir = resolve(outDir, "fixture");
-    console.log(`
-Cloning ${owner}/${repoName} into ${fixtureDir}...`);
-    execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
-      stdio: "inherit"
-    });
-    console.log(`Checking out base commit ${pr.base.sha}...`);
-    execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
-  }
   console.log("\nNext steps:");
   if (!opts.cloneFixture) {
     console.log(`  1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
@@ -523,7 +552,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
       `  3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
     );
   } else {
-    console.log("  1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
+    console.log("  1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
     console.log(
       `  2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
     );
@@ -533,6 +562,61 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
     await validateCommand(resolve(outDir, "agr.yaml"));
   }
 }
+function detectProjectKind(fixtureDir) {
+  if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
+    return "python";
+  }
+  if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
+  if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
+  return "unknown";
+}
+function projectTestDefaults(kind, cloned) {
+  if (!cloned) {
+    return {
+      success: [
+        { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+      ],
+      test_command: "<TODO: shell command that runs tests with TAP output>"
+    };
+  }
+  switch (kind) {
+    case "python":
+      return {
+        success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
+        test_command: "pytest --tap-stream"
+      };
+    case "node":
+      return {
+        success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
+        test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
+      };
+    case "go":
+      return {
+        success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
+        test_command: "<TODO: configure a TAP-producing test command for go>"
+      };
+    default:
+      return {
+        success: [
+          { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
+        ],
+        test_command: "<TODO: shell command that runs tests with TAP output>"
+      };
+  }
+}
+function buildAgrYaml(doc, projectKind) {
+  let yaml = stringify(doc);
+  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
+  yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
+fail_to_pass:`);
+  if (projectKind === "python") {
+    yaml = yaml.replace(
+      /^test_command: (.+)$/m,
+      "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
+    );
+  }
+  return yaml;
+}
 function buildPrompt(pr) {
   const body = (pr.body || "").trim();
   return body ? `${pr.title}
@@ -563,6 +647,35 @@ function splitDiff(diff) {
     forbidModified
   };
 }
+var VERBOSE_CONTENT_MAX = 200;
+function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
+  if (value.length <= max) return value;
+  return `${value.slice(0, max)}...`;
+}
+function formatVerboseStep(step) {
+  const prefix = `[step ${step.index}] ${step.kind}`;
+  if (step.kind === "tool_call" && step.tool) {
+    const args = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool}(${args})`;
+  }
+  if (step.kind === "tool_result" && step.tool) {
+    const result = step.content ? truncateForVerbose(step.content) : "";
+    return `${prefix}: ${step.tool} -> ${result}`;
+  }
+  if (step.kind === "message" && step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  if (step.content) {
+    return `${prefix}: ${truncateForVerbose(step.content)}`;
+  }
+  return prefix;
+}
+function formatMetricDetail(label, detail) {
+  if (/^No .+ configured; skipping/.test(detail)) {
+    return `\u26A0\uFE0F ${label}: ${detail}`;
+  }
+  return `${label}: ${detail}`;
+}
 async function runSingleCommand(testCasePath, opts) {
   const testCase = loadTestCase(testCasePath);
   let agentConfig = {
@@ -595,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
       adapter,
       sandboxProvider,
       db,
-      runId
+      runId,
+      onStep: opts.verbose ? (step) => {
+        console.log(formatVerboseStep(step));
+      } : void 0
     });
     console.log("\n================ RUN SUMMARY ================");
     console.log(`Status:    ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -606,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
       console.log(`Error:     ${result.error}`);
     }
     if (result.metrics?.regression) {
-      console.log(`Regression: ${result.metrics.regression.detail}`);
+      console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
     }
     if (result.metrics?.diff) {
       console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
     }
     if (result.metrics?.localization) {
-      console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
+      console.log(
+        formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
+      );
     }
     console.log("=============================================\n");
   } catch (err) {
@@ -701,7 +819,10 @@ function safeParseJson(value) {
 // src/index.ts
 var cli = cac("agr");
-cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
+cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
+  "--verbose",
+  "Stream agent steps live to the console as they happen"
+).action(async (testCase, options) => {
   try {
     await runSingleCommand(testCase, options);
   } catch (err) {
@@ -709,12 +830,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
     process.exit(1);
   }
 });
-cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
+cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
   "--matrix <matrix>",
   "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
-).action(async (options) => {
+).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
+  if (!options.configs && options.config) {
+    options.configs = options.config;
+  }
   if (!options.suite || !options.configs && !options.matrix) {
-    console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
+    console.error(
+      "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
+    );
     process.exit(1);
   }
   try {
@@ -732,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
 cli.command(
   "validate <testCase>",
   "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
-).action(async (testCase) => {
+).option(
+  "--strict",
+  "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
+).action(async (testCase, options) => {
   try {
-    await validateCommand(testCase);
+    await validateCommand(testCase, options);
   } catch (err) {
     console.error(`Error executing validate: ${err.message}`);
     process.exit(1);
@@ -743,7 +872,7 @@ cli.command(
 cli.command(
   "import-pr <repo> <prNumber>",
   "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
-).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
+).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
   try {
     await importPrCommand(repo, prNumber, options);
   } catch (err) {

package/package.json CHANGED

@@ -1,12 +1,12 @@
 {
   "name": "agentgrader",
-  "version": "1.0.5",
+  "version": "1.0.6",
   "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
   "license": "MIT",
   "type": "module",
   "bin": {
-    "agr": "./dist/index.js",
-    "agentgrader": "./dist/index.js"
+    "agr": "dist/index.js",
+    "agentgrader": "dist/index.js"
   },
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
@@ -20,11 +20,11 @@
   },
   "dependencies": {
     "@agentgrader/agent-openrouter": "^2.0.1",
-    "@agentgrader/core": "^1.1.1",
+    "@agentgrader/core": "^1.1.3",
     "@agentgrader/optimizer": "^0.1.0",
     "@agentgrader/sandbox-docker": "^2.0.2",
     "@agentgrader/scorer-static": "^0.1.0",
-    "@agentgrader/store": "^1.0.2",
+    "@agentgrader/store": "^1.0.3",
     "cac": "^6.7.14",
     "dotenv": "^17.4.2",
     "ink": "^4.4.1",