agentgrader 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +161 -32
  2. package/package.json +5 -5
package/dist/index.js CHANGED
@@ -11,8 +11,8 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
11
11
  import { StaticQualityScorer } from '@agentgrader/scorer-static';
12
12
  import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
13
13
  import { jsx, jsxs } from 'react/jsx-runtime';
14
- import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
15
- import { stringify, parse } from 'yaml';
14
+ import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
15
+ import { parse, stringify } from 'yaml';
16
16
  import { ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
@@ -419,20 +419,49 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
419
419
  }
420
420
  console.log("=================================================\n");
421
421
  }
422
- async function validateCommand(testCasePath) {
422
+ function isSkippedCheck(check) {
423
+ return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
424
+ }
425
+ function checkIcon(check) {
426
+ if (!check.passed) return "\u274C";
427
+ if (isSkippedCheck(check)) return "\u26A0\uFE0F";
428
+ return "\u2705";
429
+ }
430
+ async function validateCommand(testCasePath, opts) {
423
431
  const testCase = loadTestCase(testCasePath);
432
+ if (opts?.strict) {
433
+ const missing = [];
434
+ if (!testCase.test_command) missing.push("test_command");
435
+ if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
436
+ if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
437
+ if (missing.length > 0) {
438
+ console.error(
439
+ `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
440
+ );
441
+ process.exit(1);
442
+ }
443
+ }
424
444
  console.log(`Validating "${testCase.name}" (${testCasePath})...
425
445
  `);
426
446
  const sandboxProvider = new DockerSandboxProvider();
427
447
  const report = await validateTestCase({ testCase, sandboxProvider });
448
+ const hadExecutionSkip = report.checks.some(
449
+ (c) => c.name.includes("execution-checks (skipped")
450
+ );
428
451
  for (const check of report.checks) {
429
- const icon = check.passed ? "\u2705" : "\u274C";
452
+ const icon = checkIcon(check);
430
453
  console.log(`${icon} ${check.name}`);
431
454
  if (check.detail && check.detail !== "ok") {
432
455
  const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
433
456
  console.log(indented);
434
457
  }
435
458
  }
459
+ if (hadExecutionSkip) {
460
+ console.log("");
461
+ console.log(
462
+ "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
463
+ );
464
+ }
436
465
  console.log("");
437
466
  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
438
467
  process.exit(report.ok ? 0 : 1);
@@ -477,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
477
506
  if (testDiff.trim()) {
478
507
  writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
479
508
  }
509
+ if (opts.cloneFixture) {
510
+ const fixtureDir = resolve(outDir, "fixture");
511
+ console.log(`
512
+ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
513
+ execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
514
+ stdio: "inherit"
515
+ });
516
+ console.log(`Checking out base commit ${pr.base.sha}...`);
517
+ execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
518
+ }
519
+ const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
520
+ const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
480
521
  const yamlDoc = {
481
522
  name: slug,
482
523
  description: pr.title,
483
524
  fixture: "./fixture",
484
525
  prompt: buildPrompt(pr),
485
- success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
526
+ success,
486
527
  timeout_seconds: 600,
487
528
  tags: ["imported", repoName],
488
529
  created_at: pr.created_at,
489
- // TODO: fill these in after setting up ./fixture (checked out at
490
- // base.sha below) and running the test suite to discover real test names.
491
- test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
492
- fail_to_pass: ["<TODO: fill in via `agr validate`>"],
493
- pass_to_pass: ["<TODO: fill in via `agr validate`>"]
530
+ test_command,
531
+ fail_to_pass: [],
532
+ pass_to_pass: []
494
533
  };
495
534
  if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
496
535
  if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
497
536
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
498
537
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
499
- writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
538
+ writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
500
539
  console.log(`
501
540
  Imported PR #${pr.number}: "${pr.title}"`);
502
541
  console.log(`Wrote scaffold to: ${outDir}`);
@@ -505,16 +544,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
505
544
  console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
506
545
  if (testDiff.trim())
507
546
  console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
508
- if (opts.cloneFixture) {
509
- const fixtureDir = resolve(outDir, "fixture");
510
- console.log(`
511
- Cloning ${owner}/${repoName} into ${fixtureDir}...`);
512
- execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
513
- stdio: "inherit"
514
- });
515
- console.log(`Checking out base commit ${pr.base.sha}...`);
516
- execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
517
- }
518
547
  console.log("\nNext steps:");
519
548
  if (!opts.cloneFixture) {
520
549
  console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
@@ -523,7 +552,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
523
552
  ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
524
553
  );
525
554
  } else {
526
- console.log(" 1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
555
+ console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
527
556
  console.log(
528
557
  ` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
529
558
  );
@@ -533,6 +562,61 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
533
562
  await validateCommand(resolve(outDir, "agr.yaml"));
534
563
  }
535
564
  }
565
+ function detectProjectKind(fixtureDir) {
566
+ if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
567
+ return "python";
568
+ }
569
+ if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
570
+ if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
571
+ return "unknown";
572
+ }
573
+ function projectTestDefaults(kind, cloned) {
574
+ if (!cloned) {
575
+ return {
576
+ success: [
577
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
578
+ ],
579
+ test_command: "<TODO: shell command that runs tests with TAP output>"
580
+ };
581
+ }
582
+ switch (kind) {
583
+ case "python":
584
+ return {
585
+ success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
586
+ test_command: "pytest --tap-stream"
587
+ };
588
+ case "node":
589
+ return {
590
+ success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
591
+ test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
592
+ };
593
+ case "go":
594
+ return {
595
+ success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
596
+ test_command: "<TODO: configure a TAP-producing test command for go>"
597
+ };
598
+ default:
599
+ return {
600
+ success: [
601
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
602
+ ],
603
+ test_command: "<TODO: shell command that runs tests with TAP output>"
604
+ };
605
+ }
606
+ }
607
+ function buildAgrYaml(doc, projectKind) {
608
+ let yaml = stringify(doc);
609
+ const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
610
+ yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
611
+ fail_to_pass:`);
612
+ if (projectKind === "python") {
613
+ yaml = yaml.replace(
614
+ /^test_command: (.+)$/m,
615
+ "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
616
+ );
617
+ }
618
+ return yaml;
619
+ }
536
620
  function buildPrompt(pr) {
537
621
  const body = (pr.body || "").trim();
538
622
  return body ? `${pr.title}
@@ -563,6 +647,35 @@ function splitDiff(diff) {
563
647
  forbidModified
564
648
  };
565
649
  }
650
+ var VERBOSE_CONTENT_MAX = 200;
651
+ function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
652
+ if (value.length <= max) return value;
653
+ return `${value.slice(0, max)}...`;
654
+ }
655
+ function formatVerboseStep(step) {
656
+ const prefix = `[step ${step.index}] ${step.kind}`;
657
+ if (step.kind === "tool_call" && step.tool) {
658
+ const args = step.content ? truncateForVerbose(step.content) : "";
659
+ return `${prefix}: ${step.tool}(${args})`;
660
+ }
661
+ if (step.kind === "tool_result" && step.tool) {
662
+ const result = step.content ? truncateForVerbose(step.content) : "";
663
+ return `${prefix}: ${step.tool} -> ${result}`;
664
+ }
665
+ if (step.kind === "message" && step.content) {
666
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
667
+ }
668
+ if (step.content) {
669
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
670
+ }
671
+ return prefix;
672
+ }
673
+ function formatMetricDetail(label, detail) {
674
+ if (/^No .+ configured; skipping/.test(detail)) {
675
+ return `\u26A0\uFE0F ${label}: ${detail}`;
676
+ }
677
+ return `${label}: ${detail}`;
678
+ }
566
679
  async function runSingleCommand(testCasePath, opts) {
567
680
  const testCase = loadTestCase(testCasePath);
568
681
  let agentConfig = {
@@ -595,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
595
708
  adapter,
596
709
  sandboxProvider,
597
710
  db,
598
- runId
711
+ runId,
712
+ onStep: opts.verbose ? (step) => {
713
+ console.log(formatVerboseStep(step));
714
+ } : void 0
599
715
  });
600
716
  console.log("\n================ RUN SUMMARY ================");
601
717
  console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -606,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
606
722
  console.log(`Error: ${result.error}`);
607
723
  }
608
724
  if (result.metrics?.regression) {
609
- console.log(`Regression: ${result.metrics.regression.detail}`);
725
+ console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
610
726
  }
611
727
  if (result.metrics?.diff) {
612
728
  console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
613
729
  }
614
730
  if (result.metrics?.localization) {
615
- console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
731
+ console.log(
732
+ formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
733
+ );
616
734
  }
617
735
  console.log("=============================================\n");
618
736
  } catch (err) {
@@ -701,7 +819,10 @@ function safeParseJson(value) {
701
819
 
702
820
  // src/index.ts
703
821
  var cli = cac("agr");
704
- cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
822
+ cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
823
+ "--verbose",
824
+ "Stream agent steps live to the console as they happen"
825
+ ).action(async (testCase, options) => {
705
826
  try {
706
827
  await runSingleCommand(testCase, options);
707
828
  } catch (err) {
@@ -709,12 +830,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
709
830
  process.exit(1);
710
831
  }
711
832
  });
712
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
833
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
713
834
  "--matrix <matrix>",
714
835
  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
715
- ).action(async (options) => {
836
+ ).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
837
+ if (!options.configs && options.config) {
838
+ options.configs = options.config;
839
+ }
716
840
  if (!options.suite || !options.configs && !options.matrix) {
717
- console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
841
+ console.error(
842
+ "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
843
+ );
718
844
  process.exit(1);
719
845
  }
720
846
  try {
@@ -732,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
732
858
  cli.command(
733
859
  "validate <testCase>",
734
860
  "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
735
- ).action(async (testCase) => {
861
+ ).option(
862
+ "--strict",
863
+ "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
864
+ ).action(async (testCase, options) => {
736
865
  try {
737
- await validateCommand(testCase);
866
+ await validateCommand(testCase, options);
738
867
  } catch (err) {
739
868
  console.error(`Error executing validate: ${err.message}`);
740
869
  process.exit(1);
@@ -743,7 +872,7 @@ cli.command(
743
872
  cli.command(
744
873
  "import-pr <repo> <prNumber>",
745
874
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
746
- ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
875
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
747
876
  try {
748
877
  await importPrCommand(repo, prNumber, options);
749
878
  } catch (err) {
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.5",
3
+ "version": "1.0.6",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
7
7
  "bin": {
8
- "agr": "./dist/index.js",
9
- "agentgrader": "./dist/index.js"
8
+ "agr": "dist/index.js",
9
+ "agentgrader": "dist/index.js"
10
10
  },
11
11
  "main": "./dist/index.js",
12
12
  "types": "./dist/index.d.ts",
@@ -20,11 +20,11 @@
20
20
  },
21
21
  "dependencies": {
22
22
  "@agentgrader/agent-openrouter": "^2.0.1",
23
- "@agentgrader/core": "^1.1.1",
23
+ "@agentgrader/core": "^1.1.3",
24
24
  "@agentgrader/optimizer": "^0.1.0",
25
25
  "@agentgrader/sandbox-docker": "^2.0.2",
26
26
  "@agentgrader/scorer-static": "^0.1.0",
27
- "@agentgrader/store": "^1.0.2",
27
+ "@agentgrader/store": "^1.0.3",
28
28
  "cac": "^6.7.14",
29
29
  "dotenv": "^17.4.2",
30
30
  "ink": "^4.4.1",