npm - agentgrader - Versions diffs - 1.1.0 → 1.3.0 - Mend

agentgrader 1.1.0 → 1.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/index.js +203 -8
package/package.json +6 -6

package/dist/index.js CHANGED Viewed

@@ -1,10 +1,10 @@
 #!/usr/bin/env node
 import 'dotenv/config';
 import { cac } from 'cac';
+import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
 import { randomUUID } from 'crypto';
 import { resolve, dirname, isAbsolute, basename } from 'path';
 import { render, Box, Text } from 'ink';
-import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
 import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
 import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
 import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
@@ -16,6 +16,155 @@ import { parse, stringify } from 'yaml';
 import { z, ZodError } from 'zod';
 import { execFileSync } from 'child_process';
+var CONTENT_PREVIEW_MAX = 200;
+var ANSI = {
+  reset: "\x1B[0m",
+  gray: "\x1B[90m",
+  yellow: "\x1B[33m",
+  cyan: "\x1B[36m",
+  blue: "\x1B[34m"};
+function paint(text, code) {
+  if (!process.stdout.isTTY) return text;
+  return `${code}${text}${ANSI.reset}`;
+}
+function truncateContent(content, full) {
+  if (full || content.length <= CONTENT_PREVIEW_MAX) return content;
+  return `${content.slice(0, CONTENT_PREVIEW_MAX)}...`;
+}
+function normalizeContent(content) {
+  return (content ?? "").trim();
+}
+function formatStepSummary(step, full) {
+  if (!step) return "(no step)";
+  const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
+  if (!step.content) return label;
+  const preview = truncateContent(step.content.replace(/\n/g, " "), full);
+  return `${label} ${preview}`;
+}
+function stepsByIndex(traces) {
+  const map = /* @__PURE__ */ new Map();
+  for (const step of traces) {
+    map.set(step.stepIndex, step);
+  }
+  return map;
+}
+function stepsDiverge(a, b) {
+  if (!a || !b) return true;
+  if (a.kind !== b.kind) return true;
+  if ((a.tool ?? "") !== (b.tool ?? "")) return true;
+  if (normalizeContent(a.content) !== normalizeContent(b.content)) return true;
+  return false;
+}
+function formatRunStatus(run) {
+  const passed = run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : "";
+  return `${run.status}${passed}`;
+}
+function printRunHeader(label, run) {
+  const tag = label === "A" ? paint(`Run A (${run.id})`, ANSI.cyan) : paint(`Run B (${run.id})`, ANSI.blue);
+  console.log(tag);
+  console.log(`  test case:    ${run.testCaseId}`);
+  console.log(`  agent config: ${run.agentConfigId}`);
+  console.log(`  status:       ${formatRunStatus(run)}`);
+  console.log(`  steps:        ${run.stepsCount}`);
+  console.log(`  cost:         $${run.costUsd.toFixed(4)}`);
+  console.log(`  duration:     ${run.durationMs}ms`);
+  if (run.error) console.log(`  error:        ${run.error}`);
+}
+async function compareCommand(runIdA, runIdB, opts) {
+  const db = initDb();
+  const [runA, runB, tracesA, tracesB] = await Promise.all([
+    getRun(db, runIdA),
+    getRun(db, runIdB),
+    getTraces(db, runIdA),
+    getTraces(db, runIdB)
+  ]);
+  if (!runA) {
+    console.error(`Run not found: ${runIdA}`);
+    process.exit(1);
+  }
+  if (!runB) {
+    console.error(`Run not found: ${runIdB}`);
+    process.exit(1);
+  }
+  console.log("");
+  printRunHeader("A", runA);
+  console.log("");
+  printRunHeader("B", runB);
+  console.log("");
+  if (runA.testCaseId !== runB.testCaseId) {
+    console.log(
+      paint(
+        "\u26A0\uFE0F  Comparing runs of different test cases - step alignment may not be meaningful.",
+        ANSI.yellow
+      )
+    );
+    console.log("");
+  }
+  const mapA = stepsByIndex(tracesA);
+  const mapB = stepsByIndex(tracesB);
+  const maxIndex = Math.max(
+    tracesA.length > 0 ? Math.max(...tracesA.map((s) => s.stepIndex)) : -1,
+    tracesB.length > 0 ? Math.max(...tracesB.map((s) => s.stepIndex)) : -1,
+    -1
+  );
+  if (maxIndex < 0) {
+    console.log("No steps recorded for either run.");
+    return;
+  }
+  const divergentIndices = /* @__PURE__ */ new Set();
+  for (let i = 0; i <= maxIndex; i++) {
+    if (stepsDiverge(mapA.get(i), mapB.get(i))) {
+      divergentIndices.add(i);
+    }
+  }
+  const visibleIndices = /* @__PURE__ */ new Set();
+  if (opts.onlyDiff) {
+    for (const idx of divergentIndices) {
+      visibleIndices.add(idx);
+      if (idx > 0) visibleIndices.add(idx - 1);
+      if (idx < maxIndex) visibleIndices.add(idx + 1);
+    }
+  } else {
+    for (let i = 0; i <= maxIndex; i++) visibleIndices.add(i);
+  }
+  const sortedVisible = [...visibleIndices].sort((a, b) => a - b);
+  if (sortedVisible.length === 0) {
+    console.log("No divergent steps (nothing to show with --only-diff).");
+  } else {
+    console.log("Step comparison:");
+    for (const i of sortedVisible) {
+      const stepA = mapA.get(i);
+      const stepB = mapB.get(i);
+      const divergent = stepsDiverge(stepA, stepB);
+      if (divergent) {
+        console.log(paint(`[step ${i}] DIVERGENT`, ANSI.yellow));
+        console.log(`  A: ${formatStepSummary(stepA, opts.full ?? false)}`);
+        console.log(`  B: ${formatStepSummary(stepB, opts.full ?? false)}`);
+      } else {
+        const line = formatStepSummary(stepA ?? stepB, opts.full ?? false);
+        console.log(paint(`[step ${i}] (same)`, ANSI.gray));
+        console.log(`  ${line}`);
+      }
+    }
+  }
+  const totalSteps = maxIndex + 1;
+  const diffCount = divergentIndices.size;
+  let firstDivergence = null;
+  for (let i = 0; i <= maxIndex; i++) {
+    if (divergentIndices.has(i)) {
+      firstDivergence = i;
+      break;
+    }
+  }
+  console.log("");
+  console.log(`${diffCount} of ${totalSteps} step(s) differ.`);
+  if (firstDivergence !== null) {
+    console.log(`First divergence at step ${firstDivergence}.`);
+  } else {
+    console.log("No divergence detected.");
+  }
+  console.log("");
+}
 var CONFIG_COL_WIDTH = 24;
 var CONFIG_LABEL_MAX = 20;
 function truncateLabel(name, max = CONFIG_LABEL_MAX) {
@@ -327,6 +476,9 @@ function loadTestCase(yamlPath) {
       (toolkit) => isAbsolute(toolkit) ? toolkit : resolve(dir, toolkit)
     );
   }
+  if (testCase.agent_config) {
+    testCase.agent_config = isAbsolute(testCase.agent_config) ? testCase.agent_config : resolve(dir, testCase.agent_config);
+  }
   if (testCase.solution && looksLikeFilePath(testCase.solution)) {
     testCase.solution = readPatchFile(dir, testCase.solution);
   }
@@ -335,6 +487,26 @@ function loadTestCase(yamlPath) {
   }
   return testCase;
 }
+function resolveSharedAgentConfigFromTestCases(testCases) {
+  if (testCases.length === 0) {
+    throw new Error("No test cases loaded.");
+  }
+  const paths = /* @__PURE__ */ new Set();
+  for (const tc of testCases) {
+    if (!tc.agent_config) {
+      throw new Error(
+        "Either --configs, --configs-dir, --matrix, or --manifest must be provided, or every test case in the suite must define the same agent_config in agr.yaml."
+      );
+    }
+    paths.add(tc.agent_config);
+  }
+  if (paths.size > 1) {
+    throw new Error(
+      `Multiple agent_config values found across test cases (${[...paths].join(", ")}). Use --configs, --configs-dir, or --matrix to specify agent configs explicitly.`
+    );
+  }
+  return [...paths][0];
+}
 function looksLikeFilePath(value) {
   const trimmed = value.trimStart();
   if (trimmed.startsWith("diff ") || trimmed.startsWith("---") || trimmed.startsWith("***")) {
@@ -393,7 +565,7 @@ function findTestCaseYamlFiles(dir) {
 async function runBenchCommand(opts) {
   let suiteDir;
   let concurrency = opts.concurrency ?? 2;
-  let agentConfigs;
+  let agentConfigs = [];
   let matrixId;
   if (opts.manifest) {
     const manifestPath = resolve(opts.manifest);
@@ -425,7 +597,7 @@ async function runBenchCommand(opts) {
       console.log(
         `Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
       );
-    } else {
+    } else if (opts.configs || opts.configsDir) {
       const configPaths = resolveAgentConfigPathList({
         commaSeparated: opts.configs,
         dir: opts.configsDir
@@ -436,9 +608,6 @@ async function runBenchCommand(opts) {
       }
     }
   }
-  if (agentConfigs.length === 0) {
-    throw new Error("No agent configs to benchmark.");
-  }
   const yamlFiles = findTestCaseYamlFiles(suiteDir);
   if (yamlFiles.length === 0) {
     console.error(`No test cases found in suite directory: ${suiteDir}`);
@@ -448,6 +617,16 @@ async function runBenchCommand(opts) {
   for (const f of yamlFiles) {
     testCases.push(loadTestCase(f));
   }
+  if (agentConfigs.length === 0) {
+    const sharedAgentConfig = resolveSharedAgentConfigFromTestCases(testCases);
+    const configPaths = resolveAgentConfigPathList({
+      explicitPaths: [sharedAgentConfig]
+    });
+    agentConfigs = loadAgentConfigsFromPaths(configPaths);
+    console.log(
+      `Using shared agent_config from agr.yaml: ${sharedAgentConfig} (${agentConfigs.length} config).`
+    );
+  }
   const db = initDb();
   for (const tc of testCases) {
     await saveTestCase(db, testCaseToDbRow(tc));
@@ -885,6 +1064,11 @@ async function runSingleCommand(testCasePath, opts) {
   };
   if (opts.config) {
     agentConfig = loadAgentConfig(opts.config);
+  } else if (testCase.agent_config) {
+    agentConfig = loadAgentConfig(testCase.agent_config);
+    console.log(
+      `Using agent config from agr.yaml: ${testCase.agent_config} (model: ${agentConfig.model})`
+    );
   }
   console.log(`Starting run for "${testCase.name}" using model "${agentConfig.model}"...`);
   const sandboxProvider = new DockerSandboxProvider();
@@ -1055,9 +1239,9 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
       );
       process.exit(1);
     }
-  } else if (!options.suite || agentSourceCount === 0) {
+  } else if (!options.suite) {
     console.error(
-      "Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
+      "Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, --matrix, or a shared agent_config in every agr.yaml."
     );
     process.exit(1);
   } else if (agentSourceCount > 1) {
@@ -1119,6 +1303,17 @@ cli.command("trace <runId>", "Show the step trace and metrics for a single run")
     process.exit(1);
   }
 });
+cli.command("compare <runIdA> <runIdB>", "Compare the step traces of two runs side by side").option("--full", "Print full step content without truncation").option(
+  "--only-diff",
+  "Show only divergent steps plus one step of context before and after each"
+).example("agr compare <runIdA> <runIdB> --only-diff").action(async (runIdA, runIdB, options) => {
+  try {
+    await compareCommand(runIdA, runIdB, options);
+  } catch (err) {
+    console.error(`Error executing compare: ${err.message}`);
+    process.exit(1);
+  }
+});
 cli.help();
 try {
   cli.parse();

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agentgrader",
-  "version": "1.1.0",
+  "version": "1.3.0",
   "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
   "license": "MIT",
   "type": "module",
@@ -19,11 +19,11 @@
     "dev": "bun run src/index.ts"
   },
   "dependencies": {
-    "@agentgrader/agent-openrouter": "^2.0.3",
-    "@agentgrader/core": "^1.1.3",
-    "@agentgrader/optimizer": "^0.1.1",
-    "@agentgrader/sandbox-docker": "^2.0.2",
-    "@agentgrader/scorer-static": "^0.1.0",
+    "@agentgrader/agent-openrouter": "^3.0.0",
+    "@agentgrader/core": "^1.2.0",
+    "@agentgrader/optimizer": "^1.0.0",
+    "@agentgrader/sandbox-docker": "^3.0.0",
+    "@agentgrader/scorer-static": "^1.0.0",
     "@agentgrader/store": "^1.0.3",
     "cac": "^6.7.14",
     "dotenv": "^17.4.2",