agentgrader 1.0.6 → 1.0.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +79 -27
  2. package/package.json +3 -3
package/dist/index.js CHANGED
@@ -16,6 +16,12 @@ import { parse, stringify } from 'yaml';
16
16
  import { ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
19
+ var CONFIG_COL_WIDTH = 24;
20
+ var CONFIG_LABEL_MAX = 20;
21
+ function truncateLabel(name, max = CONFIG_LABEL_MAX) {
22
+ if (name.length <= max) return name;
23
+ return `${name.slice(0, max - 1)}\u2026`;
24
+ }
19
25
  var Dashboard = ({ runs, testCases, configs, isFinished }) => {
20
26
  let totalCost = 0;
21
27
  let totalSteps = 0;
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
67
73
  r.testCaseId
68
74
  ] }),
69
75
  /* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
70
- /* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
76
+ /* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
71
77
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
72
78
  " (Steps: ",
73
79
  r.stepsCount,
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
81
87
  /* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
82
88
  /* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
83
89
  /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
84
- configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", children: cfg }) }, cfg))
90
+ configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
85
91
  ] }),
86
92
  testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
87
- /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
93
+ /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
88
94
  configs.map((cfg) => {
89
95
  const key = `${tc}_${cfg}`;
90
96
  const run = runs[key];
91
97
  if (!run) {
92
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
98
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
93
99
  }
94
100
  if (run.status === "running") {
95
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
101
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
96
102
  }
97
103
  if (run.status === "failed" || !run.passed) {
98
104
  const seconds2 = (run.durationMs / 1e3).toFixed(1);
99
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "red", children: [
105
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
100
106
  "\u2717 ",
101
107
  seconds2,
102
108
  "s ($",
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
105
111
  ] }) }, cfg);
106
112
  }
107
113
  const seconds = (run.durationMs / 1e3).toFixed(1);
108
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "green", children: [
114
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
109
115
  "\u2713 ",
110
116
  seconds,
111
117
  "s ($",
@@ -461,6 +467,11 @@ async function validateCommand(testCasePath, opts) {
461
467
  console.log(
462
468
  "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
463
469
  );
470
+ if (report.ok && !opts?.strict) {
471
+ console.log(
472
+ "Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
473
+ );
474
+ }
464
475
  }
465
476
  console.log("");
466
477
  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
@@ -506,8 +517,8 @@ async function importPrCommand(repo, prNumber, opts) {
506
517
  if (testDiff.trim()) {
507
518
  writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
508
519
  }
520
+ const fixtureDir = resolve(outDir, "fixture");
509
521
  if (opts.cloneFixture) {
510
- const fixtureDir = resolve(outDir, "fixture");
511
522
  console.log(`
512
523
  Cloning ${owner}/${repoName} into ${fixtureDir}...`);
513
524
  execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
@@ -516,8 +527,12 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
516
527
  console.log(`Checking out base commit ${pr.base.sha}...`);
517
528
  execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
518
529
  }
519
- const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
520
- const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
530
+ const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
531
+ const { success, test_command, testCommandHint } = projectTestDefaults(
532
+ projectKind,
533
+ opts.cloneFixture ?? false,
534
+ fixtureDir
535
+ );
521
536
  const yamlDoc = {
522
537
  name: slug,
523
538
  description: pr.title,
@@ -535,7 +550,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
535
550
  if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
536
551
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
537
552
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
538
- writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
553
+ writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
539
554
  console.log(`
540
555
  Imported PR #${pr.number}: "${pr.title}"`);
541
556
  console.log(`Wrote scaffold to: ${outDir}`);
@@ -551,6 +566,9 @@ Imported PR #${pr.number}: "${pr.title}"`);
551
566
  console.log(
552
567
  ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
553
568
  );
569
+ console.log(
570
+ "\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
571
+ );
554
572
  } else {
555
573
  console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
556
574
  console.log(
@@ -570,50 +588,81 @@ function detectProjectKind(fixtureDir) {
570
588
  if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
571
589
  return "unknown";
572
590
  }
573
- function projectTestDefaults(kind, cloned) {
591
+ function projectTestDefaults(kind, cloned, fixtureDir) {
574
592
  if (!cloned) {
575
593
  return {
576
594
  success: [
577
595
  { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
578
596
  ],
579
- test_command: "<TODO: shell command that runs tests with TAP output>"
597
+ test_command: "<TODO: shell command that runs tests with TAP output>",
598
+ testCommandHint: "none"
580
599
  };
581
600
  }
582
601
  switch (kind) {
583
602
  case "python":
584
603
  return {
585
604
  success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
586
- test_command: "pytest --tap-stream"
605
+ test_command: "pytest --tap-stream",
606
+ testCommandHint: "python"
587
607
  };
588
608
  case "node":
589
- return {
590
- success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
591
- test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
592
- };
609
+ return detectNodeTestRunner(fixtureDir);
593
610
  case "go":
594
611
  return {
595
612
  success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
596
- test_command: "<TODO: configure a TAP-producing test command for go>"
613
+ test_command: "<TODO: configure a TAP-producing test command for go>",
614
+ testCommandHint: "go"
597
615
  };
598
616
  default:
599
617
  return {
600
618
  success: [
601
619
  { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
602
620
  ],
603
- test_command: "<TODO: shell command that runs tests with TAP output>"
621
+ test_command: "<TODO: shell command that runs tests with TAP output>",
622
+ testCommandHint: "none"
604
623
  };
605
624
  }
606
625
  }
607
- function buildAgrYaml(doc, projectKind) {
626
+ function detectNodeTestRunner(fixtureDir) {
627
+ const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
628
+ const fallback = {
629
+ success,
630
+ test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
631
+ testCommandHint: "node-unknown"
632
+ };
633
+ try {
634
+ const pkgPath = resolve(fixtureDir, "package.json");
635
+ if (!existsSync(pkgPath)) return fallback;
636
+ const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
637
+ const deps = { ...pkg.dependencies, ...pkg.devDependencies };
638
+ if (deps.ava) {
639
+ return { success, test_command: "npx ava --tap", testCommandHint: "ava" };
640
+ }
641
+ if (deps.vitest) {
642
+ return { success, test_command: "npx vitest run --reporter=tap", testCommandHint: "vitest" };
643
+ }
644
+ if (deps.jest) {
645
+ return { success, test_command: "npx jest --ci", testCommandHint: "jest" };
646
+ }
647
+ return fallback;
648
+ } catch {
649
+ return fallback;
650
+ }
651
+ }
652
+ function buildAgrYaml(doc, testCommandHint) {
608
653
  let yaml = stringify(doc);
609
654
  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
610
655
  yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
611
656
  fail_to_pass:`);
612
- if (projectKind === "python") {
613
- yaml = yaml.replace(
614
- /^test_command: (.+)$/m,
615
- "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
616
- );
657
+ const testCommandComments = {
658
+ python: "# Requires pytest-tap for TAP output (pip install pytest-tap).",
659
+ jest: "# jest does not output TAP by default; consider jest-tap-reporter",
660
+ "node-unknown": "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"
661
+ };
662
+ const comment = testCommandComments[testCommandHint];
663
+ if (comment) {
664
+ yaml = yaml.replace(/^test_command: (.+)$/m, `${comment}
665
+ $&`);
617
666
  }
618
667
  return yaml;
619
668
  }
@@ -872,7 +921,10 @@ cli.command(
872
921
  cli.command(
873
922
  "import-pr <repo> <prNumber>",
874
923
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
875
- ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
924
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option(
925
+ "--clone-fixture",
926
+ "Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
927
+ ).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
876
928
  try {
877
929
  await importPrCommand(repo, prNumber, options);
878
930
  } catch (err) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.6",
3
+ "version": "1.0.7",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -19,9 +19,9 @@
19
19
  "dev": "bun run src/index.ts"
20
20
  },
21
21
  "dependencies": {
22
- "@agentgrader/agent-openrouter": "^2.0.1",
22
+ "@agentgrader/agent-openrouter": "^2.0.3",
23
23
  "@agentgrader/core": "^1.1.3",
24
- "@agentgrader/optimizer": "^0.1.0",
24
+ "@agentgrader/optimizer": "^0.1.1",
25
25
  "@agentgrader/sandbox-docker": "^2.0.2",
26
26
  "@agentgrader/scorer-static": "^0.1.0",
27
27
  "@agentgrader/store": "^1.0.3",