agentgrader 1.0.6 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +79 -27
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -16,6 +16,12 @@ import { parse, stringify } from 'yaml';
|
|
|
16
16
|
import { ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
|
+
// Width given to each per-config column <Box> in the Dashboard table.
var CONFIG_COL_WIDTH = 24;
// Default maximum label length (in UTF-16 code units) used by truncateLabel.
var CONFIG_LABEL_MAX = 20;

/**
 * Shorten a label so it is at most `max` UTF-16 code units long, marking the
 * cut with a trailing ellipsis character (U+2026).
 *
 * @param {unknown} name - Label to shorten. `null`/`undefined` become an empty
 *   string; any other non-string value is converted with `String()`.
 * @param {number} [max=CONFIG_LABEL_MAX] - Maximum length of the result,
 *   ellipsis included. Values <= 0 yield an empty string for any label that
 *   does not already fit.
 * @returns {string} The label unchanged when it already fits, otherwise its
 *   leading part followed by the ellipsis.
 */
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
  const text = name == null ? "" : String(name);
  if (text.length <= max) return text;
  // No room for even the ellipsis; slice(0, max - 1) with a negative end
  // would otherwise keep almost the whole string.
  if (max <= 0) return "";
  let keep = max - 1;
  // Never end on a dangling high surrogate: drop the first half of a pair
  // instead of emitting an unpaired code unit.
  const lastUnit = text.charCodeAt(keep - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) keep -= 1;
  return `${text.slice(0, keep)}\u2026`;
}
|
|
19
25
|
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
20
26
|
let totalCost = 0;
|
|
21
27
|
let totalSteps = 0;
|
|
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
67
73
|
r.testCaseId
|
|
68
74
|
] }),
|
|
69
75
|
/* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
|
|
70
|
-
/* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
|
|
76
|
+
/* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
|
|
71
77
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
72
78
|
" (Steps: ",
|
|
73
79
|
r.stepsCount,
|
|
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
81
87
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
|
|
82
88
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
|
|
83
89
|
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
|
|
84
|
-
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width:
|
|
90
|
+
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
|
|
85
91
|
] }),
|
|
86
92
|
testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
|
|
87
|
-
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
|
|
93
|
+
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
|
|
88
94
|
configs.map((cfg) => {
|
|
89
95
|
const key = `${tc}_${cfg}`;
|
|
90
96
|
const run = runs[key];
|
|
91
97
|
if (!run) {
|
|
92
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
98
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
|
|
93
99
|
}
|
|
94
100
|
if (run.status === "running") {
|
|
95
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
101
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
|
|
96
102
|
}
|
|
97
103
|
if (run.status === "failed" || !run.passed) {
|
|
98
104
|
const seconds2 = (run.durationMs / 1e3).toFixed(1);
|
|
99
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
105
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
|
|
100
106
|
"\u2717 ",
|
|
101
107
|
seconds2,
|
|
102
108
|
"s ($",
|
|
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
105
111
|
] }) }, cfg);
|
|
106
112
|
}
|
|
107
113
|
const seconds = (run.durationMs / 1e3).toFixed(1);
|
|
108
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
114
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
|
|
109
115
|
"\u2713 ",
|
|
110
116
|
seconds,
|
|
111
117
|
"s ($",
|
|
@@ -461,6 +467,11 @@ async function validateCommand(testCasePath, opts) {
|
|
|
461
467
|
console.log(
|
|
462
468
|
"Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
|
|
463
469
|
);
|
|
470
|
+
if (report.ok && !opts?.strict) {
|
|
471
|
+
console.log(
|
|
472
|
+
"Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
|
|
473
|
+
);
|
|
474
|
+
}
|
|
464
475
|
}
|
|
465
476
|
console.log("");
|
|
466
477
|
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
@@ -506,8 +517,8 @@ async function importPrCommand(repo, prNumber, opts) {
|
|
|
506
517
|
if (testDiff.trim()) {
|
|
507
518
|
writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
|
|
508
519
|
}
|
|
520
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
509
521
|
if (opts.cloneFixture) {
|
|
510
|
-
const fixtureDir = resolve(outDir, "fixture");
|
|
511
522
|
console.log(`
|
|
512
523
|
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
513
524
|
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
@@ -516,8 +527,12 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
516
527
|
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
517
528
|
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
518
529
|
}
|
|
519
|
-
const projectKind = opts.cloneFixture ? detectProjectKind(
|
|
520
|
-
const { success, test_command } = projectTestDefaults(
|
|
530
|
+
const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
|
|
531
|
+
const { success, test_command, testCommandHint } = projectTestDefaults(
|
|
532
|
+
projectKind,
|
|
533
|
+
opts.cloneFixture ?? false,
|
|
534
|
+
fixtureDir
|
|
535
|
+
);
|
|
521
536
|
const yamlDoc = {
|
|
522
537
|
name: slug,
|
|
523
538
|
description: pr.title,
|
|
@@ -535,7 +550,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
535
550
|
if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
|
|
536
551
|
if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
|
|
537
552
|
if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
|
|
538
|
-
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc,
|
|
553
|
+
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
|
|
539
554
|
console.log(`
|
|
540
555
|
Imported PR #${pr.number}: "${pr.title}"`);
|
|
541
556
|
console.log(`Wrote scaffold to: ${outDir}`);
|
|
@@ -551,6 +566,9 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
551
566
|
console.log(
|
|
552
567
|
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
553
568
|
);
|
|
569
|
+
console.log(
|
|
570
|
+
"\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
|
|
571
|
+
);
|
|
554
572
|
} else {
|
|
555
573
|
console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
|
|
556
574
|
console.log(
|
|
@@ -570,50 +588,81 @@ function detectProjectKind(fixtureDir) {
|
|
|
570
588
|
if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
|
|
571
589
|
return "unknown";
|
|
572
590
|
}
|
|
573
|
-
function projectTestDefaults(kind, cloned) {
|
|
591
|
+
/**
 * Pick the default `success` steps, `test_command`, and comment hint for a
 * scaffolded test case.
 *
 * @param {string} kind - Detected project kind ("python", "node", "go", or
 *   anything else for an unrecognised project).
 * @param {boolean} cloned - Whether a fixture checkout exists; when falsy the
 *   TODO placeholders are returned regardless of `kind`.
 * @param {string} [fixtureDir] - Fixture directory, forwarded to
 *   detectNodeTestRunner for "node" projects.
 * @returns {{success: Array<object>, test_command: string, testCommandHint: string}}
 */
function projectTestDefaults(kind, cloned, fixtureDir) {
  // Built on demand so every caller receives its own object.
  const todoDefaults = () => ({
    success: [
      { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
    ],
    test_command: "<TODO: shell command that runs tests with TAP output>",
    testCommandHint: "none"
  });

  if (!cloned) return todoDefaults();

  if (kind === "python") {
    return {
      success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
      test_command: "pytest --tap-stream",
      testCommandHint: "python"
    };
  }
  if (kind === "node") {
    // Node projects need a look at package.json to choose a runner.
    return detectNodeTestRunner(fixtureDir);
  }
  if (kind === "go") {
    return {
      success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
      test_command: "<TODO: configure a TAP-producing test command for go>",
      testCommandHint: "go"
    };
  }
  return todoDefaults();
}
|
|
607
|
-
function
|
|
626
|
+
/**
 * Choose a test command for a Node project by inspecting the `dependencies`
 * and `devDependencies` of `<fixtureDir>/package.json`.
 *
 * Runners are checked in priority order: ava, vitest, jest. Detection is
 * best-effort: a missing, unreadable, or malformed package.json, or one that
 * lists none of the known runners, yields the "node-unknown" fallback instead
 * of an error.
 *
 * @param {string} fixtureDir - Directory expected to contain package.json.
 * @returns {{success: Array<object>, test_command: string, testCommandHint: string}}
 */
function detectNodeTestRunner(fixtureDir) {
  const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
  const fallback = {
    success,
    test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
    testCommandHint: "node-unknown"
  };
  // Order matters: the first runner found in the dependency map wins.
  const knownRunners = [
    { dep: "ava", test_command: "npx ava --tap", testCommandHint: "ava" },
    { dep: "vitest", test_command: "npx vitest run --reporter=tap", testCommandHint: "vitest" },
    { dep: "jest", test_command: "npx jest --ci", testCommandHint: "jest" }
  ];
  try {
    // Read directly rather than checking existence first: a missing file
    // simply throws and is handled by the catch below.
    const raw = readFileSync(resolve(fixtureDir, "package.json"), "utf-8");
    // JSON.parse rejects a leading byte-order mark, so strip it first.
    const pkg = JSON.parse(raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw);
    const deps = { ...pkg.dependencies, ...pkg.devDependencies };
    const runner = knownRunners.find(({ dep }) => deps[dep]);
    if (!runner) return fallback;
    return {
      success,
      test_command: runner.test_command,
      testCommandHint: runner.testCommandHint
    };
  } catch {
    // Deliberate best-effort: any read/parse problem means "could not detect".
    return fallback;
  }
}
|
|
652
|
+
/**
 * Serialise the scaffold document to YAML and annotate it with guidance
 * comments.
 *
 * A TODO comment is inserted above the first line starting with
 * `fail_to_pass:`. When `testCommandHint` has an associated note, that note is
 * inserted above the first `test_command: ...` line.
 *
 * @param {object} doc - Document passed to `stringify`.
 * @param {string} [testCommandHint] - Hint key ("python", "jest",
 *   "node-unknown"); any other value adds no test_command comment.
 * @returns {string} The annotated YAML text.
 */
function buildAgrYaml(doc, testCommandHint) {
  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
  // A Map (not a plain object) so hints such as "constructor" or "toString"
  // cannot resolve to inherited Object.prototype members.
  const testCommandComments = new Map([
    ["python", "# Requires pytest-tap for TAP output (pip install pytest-tap)."],
    ["jest", "# jest does not output TAP by default; consider jest-tap-reporter"],
    ["node-unknown", "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"]
  ]);

  // Replacer callbacks are used so the comment text is inserted verbatim and
  // never interpreted for `$`-style replacement patterns.
  let yaml = stringify(doc).replace(/^fail_to_pass:/m, (line) => `${testListComment}\n${line}`);

  const comment = testCommandComments.get(testCommandHint);
  if (comment) {
    yaml = yaml.replace(/^test_command: (.+)$/m, (line) => `${comment}\n${line}`);
  }
  return yaml;
}
|
|
@@ -872,7 +921,10 @@ cli.command(
|
|
|
872
921
|
cli.command(
|
|
873
922
|
"import-pr <repo> <prNumber>",
|
|
874
923
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
875
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
924
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
925
|
+
"--clone-fixture",
|
|
926
|
+
"Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
|
|
927
|
+
).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
|
|
876
928
|
try {
|
|
877
929
|
await importPrCommand(repo, prNumber, options);
|
|
878
930
|
} catch (err) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -19,9 +19,9 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^2.0.
|
|
22
|
+
"@agentgrader/agent-openrouter": "^2.0.3",
|
|
23
23
|
"@agentgrader/core": "^1.1.3",
|
|
24
|
-
"@agentgrader/optimizer": "^0.1.
|
|
24
|
+
"@agentgrader/optimizer": "^0.1.1",
|
|
25
25
|
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
26
|
"@agentgrader/scorer-static": "^0.1.0",
|
|
27
27
|
"@agentgrader/store": "^1.0.3",
|