agentgrader 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +161 -32
- package/package.json +5 -5
package/dist/index.js
CHANGED
|
@@ -11,8 +11,8 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
|
11
11
|
import { StaticQualityScorer } from '@agentgrader/scorer-static';
|
|
12
12
|
import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
|
|
13
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
14
|
-
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
|
|
15
|
-
import {
|
|
14
|
+
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
|
+
import { parse, stringify } from 'yaml';
|
|
16
16
|
import { ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
@@ -419,20 +419,49 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
|
419
419
|
}
|
|
420
420
|
console.log("=================================================\n");
|
|
421
421
|
}
|
|
422
|
-
|
|
422
|
+
/**
 * Report whether a validation check was skipped rather than actually run.
 *
 * A check counts as skipped when its name contains "(skipped" or its detail
 * contains "skipping" (both compared case-insensitively).
 *
 * @param {{ name?: string, detail?: string }} check - One entry of a validation report.
 * @returns {boolean} True when the check looks skipped.
 */
function isSkippedCheck(check) {
  // The report printer treats `detail` as optional (it guards on truthiness),
  // so a missing name/detail must not raise a TypeError here.
  const name = String(check.name ?? "").toLowerCase();
  const detail = String(check.detail ?? "").toLowerCase();
  return name.includes("(skipped") || detail.includes("skipping");
}
|
|
425
|
+
/**
 * Pick the status glyph printed in front of a validation check.
 *
 * @param {{ passed: boolean }} check - One entry of a validation report.
 * @returns {string} A cross mark for a failed check, a warning sign for a
 *   passed check that isSkippedCheck() flags, and a check mark otherwise.
 */
function checkIcon(check) {
  if (check.passed) {
    // A "passed" check may still be one that never ran; flag it distinctly.
    return isSkippedCheck(check) ? "\u26A0\uFE0F" : "\u2705";
  }
  return "\u274C";
}
|
|
430
|
+
async function validateCommand(testCasePath, opts) {
|
|
423
431
|
const testCase = loadTestCase(testCasePath);
|
|
432
|
+
if (opts?.strict) {
|
|
433
|
+
const missing = [];
|
|
434
|
+
if (!testCase.test_command) missing.push("test_command");
|
|
435
|
+
if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
|
|
436
|
+
if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
|
|
437
|
+
if (missing.length > 0) {
|
|
438
|
+
console.error(
|
|
439
|
+
`Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
|
|
440
|
+
);
|
|
441
|
+
process.exit(1);
|
|
442
|
+
}
|
|
443
|
+
}
|
|
424
444
|
console.log(`Validating "${testCase.name}" (${testCasePath})...
|
|
425
445
|
`);
|
|
426
446
|
const sandboxProvider = new DockerSandboxProvider();
|
|
427
447
|
const report = await validateTestCase({ testCase, sandboxProvider });
|
|
448
|
+
const hadExecutionSkip = report.checks.some(
|
|
449
|
+
(c) => c.name.includes("execution-checks (skipped")
|
|
450
|
+
);
|
|
428
451
|
for (const check of report.checks) {
|
|
429
|
-
const icon = check
|
|
452
|
+
const icon = checkIcon(check);
|
|
430
453
|
console.log(`${icon} ${check.name}`);
|
|
431
454
|
if (check.detail && check.detail !== "ok") {
|
|
432
455
|
const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
|
|
433
456
|
console.log(indented);
|
|
434
457
|
}
|
|
435
458
|
}
|
|
459
|
+
if (hadExecutionSkip) {
|
|
460
|
+
console.log("");
|
|
461
|
+
console.log(
|
|
462
|
+
"Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
|
|
463
|
+
);
|
|
464
|
+
}
|
|
436
465
|
console.log("");
|
|
437
466
|
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
438
467
|
process.exit(report.ok ? 0 : 1);
|
|
@@ -477,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
|
|
|
477
506
|
if (testDiff.trim()) {
|
|
478
507
|
writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
|
|
479
508
|
}
|
|
509
|
+
if (opts.cloneFixture) {
|
|
510
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
511
|
+
console.log(`
|
|
512
|
+
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
513
|
+
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
514
|
+
stdio: "inherit"
|
|
515
|
+
});
|
|
516
|
+
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
517
|
+
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
518
|
+
}
|
|
519
|
+
const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
|
|
520
|
+
const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
|
|
480
521
|
const yamlDoc = {
|
|
481
522
|
name: slug,
|
|
482
523
|
description: pr.title,
|
|
483
524
|
fixture: "./fixture",
|
|
484
525
|
prompt: buildPrompt(pr),
|
|
485
|
-
success
|
|
526
|
+
success,
|
|
486
527
|
timeout_seconds: 600,
|
|
487
528
|
tags: ["imported", repoName],
|
|
488
529
|
created_at: pr.created_at,
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
fail_to_pass: ["<TODO: fill in via `agr validate`>"],
|
|
493
|
-
pass_to_pass: ["<TODO: fill in via `agr validate`>"]
|
|
530
|
+
test_command,
|
|
531
|
+
fail_to_pass: [],
|
|
532
|
+
pass_to_pass: []
|
|
494
533
|
};
|
|
495
534
|
if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
|
|
496
535
|
if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
|
|
497
536
|
if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
|
|
498
537
|
if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
|
|
499
|
-
writeFileSync(resolve(outDir, "agr.yaml"),
|
|
538
|
+
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
|
|
500
539
|
console.log(`
|
|
501
540
|
Imported PR #${pr.number}: "${pr.title}"`);
|
|
502
541
|
console.log(`Wrote scaffold to: ${outDir}`);
|
|
@@ -505,16 +544,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
505
544
|
console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
|
|
506
545
|
if (testDiff.trim())
|
|
507
546
|
console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
|
|
508
|
-
if (opts.cloneFixture) {
|
|
509
|
-
const fixtureDir = resolve(outDir, "fixture");
|
|
510
|
-
console.log(`
|
|
511
|
-
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
512
|
-
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
513
|
-
stdio: "inherit"
|
|
514
|
-
});
|
|
515
|
-
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
516
|
-
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
517
|
-
}
|
|
518
547
|
console.log("\nNext steps:");
|
|
519
548
|
if (!opts.cloneFixture) {
|
|
520
549
|
console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
|
|
@@ -523,7 +552,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
523
552
|
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
524
553
|
);
|
|
525
554
|
} else {
|
|
526
|
-
console.log(" 1. Fill in
|
|
555
|
+
console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
|
|
527
556
|
console.log(
|
|
528
557
|
` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
529
558
|
);
|
|
@@ -533,6 +562,61 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
533
562
|
await validateCommand(resolve(outDir, "agr.yaml"));
|
|
534
563
|
}
|
|
535
564
|
}
|
|
565
|
+
/**
 * Guess the ecosystem of a checked-out fixture from well-known marker files.
 *
 * Python markers (pyproject.toml, setup.py, requirements*.txt) are checked
 * first, then package.json ("node"), then go.mod ("go").
 *
 * @param {string} fixtureDir - Directory to inspect.
 * @returns {"python" | "node" | "go" | "unknown"} The detected kind, or
 *   "unknown" when no marker is found or the directory does not exist.
 */
function detectProjectKind(fixtureDir) {
  // readdirSync throws on a missing directory while the existsSync probes do
  // not; treat "nothing there" uniformly as an unknown project.
  if (!existsSync(fixtureDir)) return "unknown";

  const hasFile = (name) => existsSync(resolve(fixtureDir, name));
  const hasRequirementsFile = () =>
    readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name));

  if (hasFile("pyproject.toml") || hasFile("setup.py") || hasRequirementsFile()) {
    return "python";
  }
  if (hasFile("package.json")) return "node";
  if (hasFile("go.mod")) return "go";
  return "unknown";
}
|
|
573
|
+
/**
 * Produce the default `success` steps and `test_command` for a scaffolded
 * test case.
 *
 * @param {string} kind - Project kind ("python", "node", "go", or anything else).
 * @param {boolean} cloned - Whether a fixture was cloned; when false the
 *   kind is ignored and TODO placeholders are returned.
 * @returns {{ success: Array<{ run: string, expect: { exit_code: number } }>, test_command: string }}
 *   A freshly built defaults object (never shared between calls).
 */
function projectTestDefaults(kind, cloned) {
  // Every default success step expects the command to exit with code 0.
  const exitsCleanly = (run) => [{ run, expect: { exit_code: 0 } }];
  const placeholder = () => ({
    success: exitsCleanly("<TODO: install dependencies and run the test suite>"),
    test_command: "<TODO: shell command that runs tests with TAP output>"
  });

  if (!cloned) return placeholder();

  if (kind === "python") {
    return {
      success: exitsCleanly("pip install -e . && pytest"),
      test_command: "pytest --tap-stream"
    };
  }
  if (kind === "node") {
    return {
      success: exitsCleanly("npm install && npm test"),
      test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
    };
  }
  if (kind === "go") {
    return {
      success: exitsCleanly("go test ./..."),
      test_command: "<TODO: configure a TAP-producing test command for go>"
    };
  }
  return placeholder();
}
|
|
607
|
+
/**
 * Serialize a scaffolded test-case document to YAML and inject guidance
 * comments that a plain serializer cannot emit.
 *
 * @param {object} doc - The test-case document to serialize.
 * @param {string} projectKind - Detected project kind; "python" adds a
 *   note about pytest-tap above the test_command line.
 * @returns {string} YAML text with a TODO comment above `fail_to_pass:`.
 */
function buildAgrYaml(doc, projectKind) {
  const todoNote = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
  // Only the first line that starts with the key is annotated (no `g` flag).
  const withTodo = stringify(doc).replace(/^fail_to_pass:/m, `${todoNote}\nfail_to_pass:`);
  if (projectKind !== "python") return withTodo;
  // `$&` re-inserts the matched test_command line below the note.
  return withTodo.replace(
    /^test_command: (.+)$/m,
    "# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
  );
}
|
|
536
620
|
function buildPrompt(pr) {
|
|
537
621
|
const body = (pr.body || "").trim();
|
|
538
622
|
return body ? `${pr.title}
|
|
@@ -563,6 +647,35 @@ function splitDiff(diff) {
|
|
|
563
647
|
forbidModified
|
|
564
648
|
};
|
|
565
649
|
}
|
|
650
|
+
// Maximum number of UTF-16 code units of step content echoed in verbose mode.
var VERBOSE_CONTENT_MAX = 200;

/**
 * Shorten a string for verbose console output.
 *
 * @param {string} value - Text to shorten.
 * @param {number} [max=VERBOSE_CONTENT_MAX] - Maximum number of UTF-16 code
 *   units kept before the "..." suffix.
 * @returns {string} `value` unchanged when it fits, otherwise its leading
 *   part followed by "...".
 */
function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
  if (value.length <= max) return value;
  let end = max;
  // Do not cut between the two halves of a surrogate pair: a dangling high
  // surrogate would print as a replacement character before the ellipsis.
  const lastUnit = value.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return `${value.slice(0, end)}...`;
}
|
|
655
|
+
/**
 * Render one agent step as a single console line for verbose mode.
 *
 * @param {{ index: number, kind: string, tool?: string, content?: string }} step
 *   The step to render.
 * @returns {string} "[step N] kind", followed by "tool(args)" for tool
 *   calls, "tool -> result" for tool results, or the truncated content for
 *   any other step that has content.
 */
function formatVerboseStep(step) {
  const { index, kind, tool, content } = step;
  const head = `[step ${index}] ${kind}`;
  const shortened = () => (content ? truncateForVerbose(content) : "");

  // Tool steps are only rendered in their dedicated form when the tool name
  // is known; otherwise they fall through to the generic content form.
  if (tool) {
    if (kind === "tool_call") return `${head}: ${tool}(${shortened()})`;
    if (kind === "tool_result") return `${head}: ${tool} -> ${shortened()}`;
  }
  return content ? `${head}: ${truncateForVerbose(content)}` : head;
}
|
|
673
|
+
/**
 * Format a metric line for the run summary.
 *
 * @param {string} label - Metric name shown before the colon.
 * @param {string} detail - Metric detail text.
 * @returns {string} "label: detail", prefixed with a warning sign when the
 *   detail starts with "No <something> configured; skipping".
 */
function formatMetricDetail(label, detail) {
  const line = `${label}: ${detail}`;
  const isSkipNotice = /^No .+ configured; skipping/.test(detail);
  return isSkipNotice ? `\u26A0\uFE0F ${line}` : line;
}
|
|
566
679
|
async function runSingleCommand(testCasePath, opts) {
|
|
567
680
|
const testCase = loadTestCase(testCasePath);
|
|
568
681
|
let agentConfig = {
|
|
@@ -595,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
595
708
|
adapter,
|
|
596
709
|
sandboxProvider,
|
|
597
710
|
db,
|
|
598
|
-
runId
|
|
711
|
+
runId,
|
|
712
|
+
onStep: opts.verbose ? (step) => {
|
|
713
|
+
console.log(formatVerboseStep(step));
|
|
714
|
+
} : void 0
|
|
599
715
|
});
|
|
600
716
|
console.log("\n================ RUN SUMMARY ================");
|
|
601
717
|
console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
|
|
@@ -606,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
606
722
|
console.log(`Error: ${result.error}`);
|
|
607
723
|
}
|
|
608
724
|
if (result.metrics?.regression) {
|
|
609
|
-
console.log(
|
|
725
|
+
console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
|
|
610
726
|
}
|
|
611
727
|
if (result.metrics?.diff) {
|
|
612
728
|
console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
|
|
613
729
|
}
|
|
614
730
|
if (result.metrics?.localization) {
|
|
615
|
-
console.log(
|
|
731
|
+
console.log(
|
|
732
|
+
formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
|
|
733
|
+
);
|
|
616
734
|
}
|
|
617
735
|
console.log("=============================================\n");
|
|
618
736
|
} catch (err) {
|
|
@@ -701,7 +819,10 @@ function safeParseJson(value) {
|
|
|
701
819
|
|
|
702
820
|
// src/index.ts
|
|
703
821
|
var cli = cac("agr");
|
|
704
|
-
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").
|
|
822
|
+
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
|
|
823
|
+
"--verbose",
|
|
824
|
+
"Stream agent steps live to the console as they happen"
|
|
825
|
+
).action(async (testCase, options) => {
|
|
705
826
|
try {
|
|
706
827
|
await runSingleCommand(testCase, options);
|
|
707
828
|
} catch (err) {
|
|
@@ -709,12 +830,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
709
830
|
process.exit(1);
|
|
710
831
|
}
|
|
711
832
|
});
|
|
712
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
833
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
713
834
|
"--matrix <matrix>",
|
|
714
835
|
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
715
|
-
).action(async (options) => {
|
|
836
|
+
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
837
|
+
if (!options.configs && options.config) {
|
|
838
|
+
options.configs = options.config;
|
|
839
|
+
}
|
|
716
840
|
if (!options.suite || !options.configs && !options.matrix) {
|
|
717
|
-
console.error(
|
|
841
|
+
console.error(
|
|
842
|
+
"Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
|
|
843
|
+
);
|
|
718
844
|
process.exit(1);
|
|
719
845
|
}
|
|
720
846
|
try {
|
|
@@ -732,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
|
|
|
732
858
|
cli.command(
|
|
733
859
|
"validate <testCase>",
|
|
734
860
|
"Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
|
|
735
|
-
).
|
|
861
|
+
).option(
|
|
862
|
+
"--strict",
|
|
863
|
+
"Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
|
|
864
|
+
).action(async (testCase, options) => {
|
|
736
865
|
try {
|
|
737
|
-
await validateCommand(testCase);
|
|
866
|
+
await validateCommand(testCase, options);
|
|
738
867
|
} catch (err) {
|
|
739
868
|
console.error(`Error executing validate: ${err.message}`);
|
|
740
869
|
process.exit(1);
|
|
@@ -743,7 +872,7 @@ cli.command(
|
|
|
743
872
|
cli.command(
|
|
744
873
|
"import-pr <repo> <prNumber>",
|
|
745
874
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
746
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
|
|
875
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
|
|
747
876
|
try {
|
|
748
877
|
await importPrCommand(repo, prNumber, options);
|
|
749
878
|
} catch (err) {
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"bin": {
|
|
8
|
-
"agr": "
|
|
9
|
-
"agentgrader": "
|
|
8
|
+
"agr": "dist/index.js",
|
|
9
|
+
"agentgrader": "dist/index.js"
|
|
10
10
|
},
|
|
11
11
|
"main": "./dist/index.js",
|
|
12
12
|
"types": "./dist/index.d.ts",
|
|
@@ -20,11 +20,11 @@
|
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
22
|
"@agentgrader/agent-openrouter": "^2.0.1",
|
|
23
|
-
"@agentgrader/core": "^1.1.
|
|
23
|
+
"@agentgrader/core": "^1.1.3",
|
|
24
24
|
"@agentgrader/optimizer": "^0.1.0",
|
|
25
25
|
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
26
|
"@agentgrader/scorer-static": "^0.1.0",
|
|
27
|
-
"@agentgrader/store": "^1.0.
|
|
27
|
+
"@agentgrader/store": "^1.0.3",
|
|
28
28
|
"cac": "^6.7.14",
|
|
29
29
|
"dotenv": "^17.4.2",
|
|
30
30
|
"ink": "^4.4.1",
|