agentgrader 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +220 -39
- package/package.json +7 -7
package/dist/index.js
CHANGED
|
@@ -11,11 +11,17 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
|
11
11
|
import { StaticQualityScorer } from '@agentgrader/scorer-static';
|
|
12
12
|
import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
|
|
13
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
14
|
-
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
|
|
15
|
-
import {
|
|
14
|
+
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
|
+
import { parse, stringify } from 'yaml';
|
|
16
16
|
import { ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
|
+
// Width passed to each per-config column <Box> in the dashboard grid.
var CONFIG_COL_WIDTH = 24;
// Default maximum number of characters kept for a config label.
var CONFIG_LABEL_MAX = 20;

/**
 * Shorten a label so that it is at most `max` characters long.
 *
 * Labels that already fit are returned unchanged. Longer labels are cut to
 * `max - 1` characters and terminated with a single ellipsis character (U+2026),
 * so the result is exactly `max` characters.
 *
 * @param {string} name - Label to shorten. Nullish values are treated as "".
 * @param {number} [max=CONFIG_LABEL_MAX] - Maximum length of the result.
 * @returns {string} The original label, or a truncated copy ending in "…".
 */
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
  // Coerce defensively: a missing id must not crash the renderer.
  const label = String(name ?? "");
  if (label.length <= max) return label;
  // With no room for even the ellipsis, slice(0, max - 1) would count from the
  // end of the string and return something longer than `max`.
  if (max < 1) return "";
  return `${label.slice(0, max - 1)}\u2026`;
}
|
|
19
25
|
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
20
26
|
let totalCost = 0;
|
|
21
27
|
let totalSteps = 0;
|
|
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
67
73
|
r.testCaseId
|
|
68
74
|
] }),
|
|
69
75
|
/* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
|
|
70
|
-
/* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
|
|
76
|
+
/* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
|
|
71
77
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
72
78
|
" (Steps: ",
|
|
73
79
|
r.stepsCount,
|
|
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
81
87
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
|
|
82
88
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
|
|
83
89
|
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
|
|
84
|
-
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width:
|
|
90
|
+
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
|
|
85
91
|
] }),
|
|
86
92
|
testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
|
|
87
|
-
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
|
|
93
|
+
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
|
|
88
94
|
configs.map((cfg) => {
|
|
89
95
|
const key = `${tc}_${cfg}`;
|
|
90
96
|
const run = runs[key];
|
|
91
97
|
if (!run) {
|
|
92
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
98
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
|
|
93
99
|
}
|
|
94
100
|
if (run.status === "running") {
|
|
95
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
101
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
|
|
96
102
|
}
|
|
97
103
|
if (run.status === "failed" || !run.passed) {
|
|
98
104
|
const seconds2 = (run.durationMs / 1e3).toFixed(1);
|
|
99
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
105
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
|
|
100
106
|
"\u2717 ",
|
|
101
107
|
seconds2,
|
|
102
108
|
"s ($",
|
|
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
105
111
|
] }) }, cfg);
|
|
106
112
|
}
|
|
107
113
|
const seconds = (run.durationMs / 1e3).toFixed(1);
|
|
108
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
114
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
|
|
109
115
|
"\u2713 ",
|
|
110
116
|
seconds,
|
|
111
117
|
"s ($",
|
|
@@ -419,20 +425,54 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
|
419
425
|
}
|
|
420
426
|
console.log("=================================================\n");
|
|
421
427
|
}
|
|
422
|
-
|
|
428
|
+
/**
 * Tell whether a validation check was skipped rather than actually executed.
 *
 * A check counts as skipped when its name contains "(skipped" or its detail
 * text contains "skipping" (both compared case-insensitively).
 *
 * @param {{ name?: string, detail?: string }} check - One entry of a validation report.
 * @returns {boolean} True when the check looks skipped.
 */
function isSkippedCheck(check) {
  // `detail` is optional (callers test `check.detail &&` before printing it),
  // so normalise missing fields to "" instead of calling methods on undefined.
  const name = String(check?.name ?? "").toLowerCase();
  const detail = String(check?.detail ?? "").toLowerCase();
  return name.includes("(skipped") || detail.includes("skipping");
}
|
|
431
|
+
/**
 * Pick the status glyph printed in front of a validation check.
 *
 * Failure takes precedence over "skipped": a check that did not pass always
 * gets the cross mark, even if it also looks skipped.
 *
 * @param {{ passed: boolean, name: string, detail?: string }} check - One entry of a validation report.
 * @returns {string} U+274C for a failed check, U+26A0 U+FE0F for a passed-but-skipped
 *   check, U+2705 for a passed check.
 */
function checkIcon(check) {
  if (!check.passed) return "\u274C";
  return isSkippedCheck(check) ? "\u26A0\uFE0F" : "\u2705";
}
|
|
436
|
+
async function validateCommand(testCasePath, opts) {
|
|
423
437
|
const testCase = loadTestCase(testCasePath);
|
|
438
|
+
if (opts?.strict) {
|
|
439
|
+
const missing = [];
|
|
440
|
+
if (!testCase.test_command) missing.push("test_command");
|
|
441
|
+
if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
|
|
442
|
+
if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
|
|
443
|
+
if (missing.length > 0) {
|
|
444
|
+
console.error(
|
|
445
|
+
`Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
|
|
446
|
+
);
|
|
447
|
+
process.exit(1);
|
|
448
|
+
}
|
|
449
|
+
}
|
|
424
450
|
console.log(`Validating "${testCase.name}" (${testCasePath})...
|
|
425
451
|
`);
|
|
426
452
|
const sandboxProvider = new DockerSandboxProvider();
|
|
427
453
|
const report = await validateTestCase({ testCase, sandboxProvider });
|
|
454
|
+
const hadExecutionSkip = report.checks.some(
|
|
455
|
+
(c) => c.name.includes("execution-checks (skipped")
|
|
456
|
+
);
|
|
428
457
|
for (const check of report.checks) {
|
|
429
|
-
const icon = check
|
|
458
|
+
const icon = checkIcon(check);
|
|
430
459
|
console.log(`${icon} ${check.name}`);
|
|
431
460
|
if (check.detail && check.detail !== "ok") {
|
|
432
461
|
const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
|
|
433
462
|
console.log(indented);
|
|
434
463
|
}
|
|
435
464
|
}
|
|
465
|
+
if (hadExecutionSkip) {
|
|
466
|
+
console.log("");
|
|
467
|
+
console.log(
|
|
468
|
+
"Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
|
|
469
|
+
);
|
|
470
|
+
if (report.ok && !opts?.strict) {
|
|
471
|
+
console.log(
|
|
472
|
+
"Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
|
|
473
|
+
);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
436
476
|
console.log("");
|
|
437
477
|
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
438
478
|
process.exit(report.ok ? 0 : 1);
|
|
@@ -477,26 +517,40 @@ async function importPrCommand(repo, prNumber, opts) {
|
|
|
477
517
|
if (testDiff.trim()) {
|
|
478
518
|
writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
|
|
479
519
|
}
|
|
520
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
521
|
+
if (opts.cloneFixture) {
|
|
522
|
+
console.log(`
|
|
523
|
+
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
524
|
+
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
525
|
+
stdio: "inherit"
|
|
526
|
+
});
|
|
527
|
+
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
528
|
+
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
529
|
+
}
|
|
530
|
+
const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
|
|
531
|
+
const { success, test_command, testCommandHint } = projectTestDefaults(
|
|
532
|
+
projectKind,
|
|
533
|
+
opts.cloneFixture ?? false,
|
|
534
|
+
fixtureDir
|
|
535
|
+
);
|
|
480
536
|
const yamlDoc = {
|
|
481
537
|
name: slug,
|
|
482
538
|
description: pr.title,
|
|
483
539
|
fixture: "./fixture",
|
|
484
540
|
prompt: buildPrompt(pr),
|
|
485
|
-
success
|
|
541
|
+
success,
|
|
486
542
|
timeout_seconds: 600,
|
|
487
543
|
tags: ["imported", repoName],
|
|
488
544
|
created_at: pr.created_at,
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
fail_to_pass: ["<TODO: fill in via `agr validate`>"],
|
|
493
|
-
pass_to_pass: ["<TODO: fill in via `agr validate`>"]
|
|
545
|
+
test_command,
|
|
546
|
+
fail_to_pass: [],
|
|
547
|
+
pass_to_pass: []
|
|
494
548
|
};
|
|
495
549
|
if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
|
|
496
550
|
if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
|
|
497
551
|
if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
|
|
498
552
|
if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
|
|
499
|
-
writeFileSync(resolve(outDir, "agr.yaml"),
|
|
553
|
+
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
|
|
500
554
|
console.log(`
|
|
501
555
|
Imported PR #${pr.number}: "${pr.title}"`);
|
|
502
556
|
console.log(`Wrote scaffold to: ${outDir}`);
|
|
@@ -505,16 +559,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
505
559
|
console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
|
|
506
560
|
if (testDiff.trim())
|
|
507
561
|
console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
|
|
508
|
-
if (opts.cloneFixture) {
|
|
509
|
-
const fixtureDir = resolve(outDir, "fixture");
|
|
510
|
-
console.log(`
|
|
511
|
-
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
512
|
-
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
513
|
-
stdio: "inherit"
|
|
514
|
-
});
|
|
515
|
-
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
516
|
-
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
517
|
-
}
|
|
518
562
|
console.log("\nNext steps:");
|
|
519
563
|
if (!opts.cloneFixture) {
|
|
520
564
|
console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
|
|
@@ -522,8 +566,11 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
522
566
|
console.log(
|
|
523
567
|
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
524
568
|
);
|
|
569
|
+
console.log(
|
|
570
|
+
"\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
|
|
571
|
+
);
|
|
525
572
|
} else {
|
|
526
|
-
console.log(" 1. Fill in
|
|
573
|
+
console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
|
|
527
574
|
console.log(
|
|
528
575
|
` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
529
576
|
);
|
|
@@ -533,6 +580,92 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
533
580
|
await validateCommand(resolve(outDir, "agr.yaml"));
|
|
534
581
|
}
|
|
535
582
|
}
|
|
583
|
+
/**
 * Guess a project's ecosystem from marker files in the root of a directory.
 *
 * Checks are ordered, and the first match wins:
 *   1. "python" - pyproject.toml, setup.py, or any `requirements*.txt` file
 *   2. "node"   - package.json
 *   3. "go"     - go.mod
 * Anything else (including a missing or unreadable directory) is "unknown".
 *
 * @param {string} fixtureDir - Directory to inspect.
 * @returns {"python" | "node" | "go" | "unknown"} Detected project kind.
 */
function detectProjectKind(fixtureDir) {
  const hasFile = (name) => existsSync(resolve(fixtureDir, name));

  const hasRequirementsTxt = () => {
    try {
      return readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name));
    } catch {
      // Detection is best-effort: a directory that cannot be listed simply
      // provides no evidence of a Python project.
      return false;
    }
  };

  if (hasFile("pyproject.toml") || hasFile("setup.py") || hasRequirementsTxt()) {
    return "python";
  }
  if (hasFile("package.json")) return "node";
  if (hasFile("go.mod")) return "go";
  return "unknown";
}
|
|
591
|
+
/**
 * Build the default `success` / `test_command` scaffold values for a project kind.
 *
 * When the fixture was not cloned there is nothing to inspect, so TODO
 * placeholders are returned regardless of `kind`. The same placeholders are
 * used for kinds that are not recognised.
 *
 * @param {string} kind - Project kind ("python", "node", "go", or anything else).
 * @param {boolean} cloned - Whether the fixture directory was actually cloned.
 * @param {string} fixtureDir - Fixture directory; only consulted for "node" projects.
 * @returns {{ success: Array<{ run: string, expect: { exit_code: number } }>,
 *            test_command: string, testCommandHint: string }}
 *   Defaults plus a hint key identifying which comment (if any) should
 *   accompany `test_command` in the generated YAML.
 */
function projectTestDefaults(kind, cloned, fixtureDir) {
  // Built fresh on every call so callers never share (and mutate) one object.
  const placeholders = () => ({
    success: [
      { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
    ],
    test_command: "<TODO: shell command that runs tests with TAP output>",
    testCommandHint: "none"
  });

  if (!cloned) return placeholders();

  switch (kind) {
    case "python":
      return {
        success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
        test_command: "pytest --tap-stream",
        testCommandHint: "python"
      };
    case "node":
      return detectNodeTestRunner(fixtureDir);
    case "go":
      return {
        success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
        // No TAP-producing default is provided for Go; the user must fill it in.
        test_command: "<TODO: configure a TAP-producing test command for go>",
        testCommandHint: "go"
      };
    default:
      return placeholders();
  }
}
|
|
626
|
+
/**
 * Choose scaffold defaults for a Node project by looking at the test runner
 * listed in its package.json.
 *
 * `dependencies` and `devDependencies` are merged and searched for, in order,
 * ava, vitest and jest; the first one present decides `test_command`. If
 * package.json is missing, unreadable, not valid JSON, or lists none of these
 * runners, a generic fallback tagged "node-unknown" is returned.
 *
 * @param {string} fixtureDir - Directory expected to contain package.json.
 * @returns {{ success: Array<{ run: string, expect: { exit_code: number } }>,
 *            test_command: string, testCommandHint: string }}
 */
function detectNodeTestRunner(fixtureDir) {
  const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
  const fallback = {
    success,
    test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
    testCommandHint: "node-unknown"
  };
  // Priority order matters: the first runner found wins.
  const knownRunners = [
    ["ava", "npx ava --tap"],
    ["vitest", "npx vitest run --reporter=tap"],
    ["jest", "npx jest --ci"]
  ];

  try {
    const pkgPath = resolve(fixtureDir, "package.json");
    if (!existsSync(pkgPath)) return fallback;
    const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
    const deps = { ...pkg.dependencies, ...pkg.devDependencies };
    const found = knownRunners.find(([runner]) => deps[runner]);
    if (!found) return fallback;
    const [testCommandHint, test_command] = found;
    return { success, test_command, testCommandHint };
  } catch {
    // Deliberately best-effort: an unreadable or malformed package.json must
    // not abort scaffolding, so fall back to the generic defaults.
    return fallback;
  }
}
|
|
652
|
+
/**
 * Serialise a scaffolded test-case document to YAML and annotate it with
 * guidance comments.
 *
 * Two kinds of comment are injected into the serialised text:
 *   - a TODO block directly above the top-level `fail_to_pass:` key, always;
 *   - a one-line note directly above the top-level `test_command:` line, only
 *     when `testCommandHint` is one of "python", "jest" or "node-unknown".
 *
 * @param {object} doc - Document passed to the YAML `stringify` function.
 * @param {string} testCommandHint - Hint key selecting the `test_command` note.
 * @returns {string} YAML text with the comments inserted.
 */
function buildAgrYaml(doc, testCommandHint) {
  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
  // A Map (rather than a plain object) so that hints such as "toString" or
  // "constructor" cannot resolve to inherited Object.prototype members.
  const testCommandComments = new Map([
    ["python", "# Requires pytest-tap for TAP output (pip install pytest-tap)."],
    ["jest", "# jest does not output TAP by default; consider jest-tap-reporter"],
    ["node-unknown", "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"]
  ]);

  // Replacer functions are used so the inserted text is taken literally and is
  // never scanned for `$&`-style substitution patterns.
  let yaml = stringify(doc).replace(/^fail_to_pass:/m, (key) => `${testListComment}\n${key}`);

  const comment = testCommandComments.get(testCommandHint);
  if (comment) {
    // Only matches when the value sits on the same line as the key.
    yaml = yaml.replace(/^test_command: (.+)$/m, (line) => `${comment}\n${line}`);
  }
  return yaml;
}
|
|
536
669
|
function buildPrompt(pr) {
|
|
537
670
|
const body = (pr.body || "").trim();
|
|
538
671
|
return body ? `${pr.title}
|
|
@@ -563,6 +696,35 @@ function splitDiff(diff) {
|
|
|
563
696
|
forbidModified
|
|
564
697
|
};
|
|
565
698
|
}
|
|
699
|
+
// Default maximum number of characters of step content echoed in verbose mode.
var VERBOSE_CONTENT_MAX = 200;

/**
 * Cap a piece of step content for console output.
 *
 * Values no longer than `max` are returned as-is; longer values are cut to
 * `max` characters and suffixed with "..." (so the result may be up to
 * `max + 3` characters long).
 *
 * @param {string} value - Content to cap. Non-strings are converted with
 *   String(); nullish values become "".
 * @param {number} [max=VERBOSE_CONTENT_MAX] - Number of characters to keep.
 * @returns {string} The capped text.
 */
function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
  // NOTE(review): content is presumably always a string; coercing keeps a
  // stray non-string payload from throwing in the middle of a run.
  const text = typeof value === "string" ? value : String(value ?? "");
  if (text.length <= max) return text;
  // Clamp so a negative limit keeps nothing instead of slicing from the end.
  return `${text.slice(0, Math.max(0, max))}...`;
}
|
|
704
|
+
/**
 * Render one agent step as a single console line for verbose mode.
 *
 * Output shapes, all prefixed with "[step <index>] <kind>":
 *   - tool_call with a tool name:   "<prefix>: <tool>(<content>)"
 *   - tool_result with a tool name: "<prefix>: <tool> -> <content>"
 *   - any other step with content:  "<prefix>: <content>"
 *   - a step without content:       "<prefix>"
 * Content is capped via truncateForVerbose; missing content renders as "".
 *
 * @param {{ index: number, kind: string, tool?: string, content?: string }} step - Step to render.
 * @returns {string} The formatted line.
 */
function formatVerboseStep(step) {
  const prefix = `[step ${step.index}] ${step.kind}`;
  const content = step.content ? truncateForVerbose(step.content) : "";

  if (step.tool && step.kind === "tool_call") {
    return `${prefix}: ${step.tool}(${content})`;
  }
  if (step.tool && step.kind === "tool_result") {
    return `${prefix}: ${step.tool} -> ${content}`;
  }
  // "message" steps need no branch of their own: they are formatted exactly
  // like any other step that carries content.
  return step.content ? `${prefix}: ${content}` : prefix;
}
|
|
722
|
+
/**
 * Format a "<label>: <detail>" summary line, flagging skipped metrics.
 *
 * When the detail text starts with "No <something> configured; skipping", the
 * line is prefixed with a warning sign (U+26A0 U+FE0F) so a skipped metric is
 * not mistaken for a measured one.
 *
 * @param {string} label - Metric name shown before the colon.
 * @param {string} detail - Metric detail text.
 * @returns {string} The formatted line.
 */
function formatMetricDetail(label, detail) {
  const line = `${label}: ${detail}`;
  const wasSkipped = /^No .+ configured; skipping/.test(detail);
  return wasSkipped ? `\u26A0\uFE0F ${line}` : line;
}
|
|
566
728
|
async function runSingleCommand(testCasePath, opts) {
|
|
567
729
|
const testCase = loadTestCase(testCasePath);
|
|
568
730
|
let agentConfig = {
|
|
@@ -595,7 +757,10 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
595
757
|
adapter,
|
|
596
758
|
sandboxProvider,
|
|
597
759
|
db,
|
|
598
|
-
runId
|
|
760
|
+
runId,
|
|
761
|
+
onStep: opts.verbose ? (step) => {
|
|
762
|
+
console.log(formatVerboseStep(step));
|
|
763
|
+
} : void 0
|
|
599
764
|
});
|
|
600
765
|
console.log("\n================ RUN SUMMARY ================");
|
|
601
766
|
console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
|
|
@@ -606,13 +771,15 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
606
771
|
console.log(`Error: ${result.error}`);
|
|
607
772
|
}
|
|
608
773
|
if (result.metrics?.regression) {
|
|
609
|
-
console.log(
|
|
774
|
+
console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
|
|
610
775
|
}
|
|
611
776
|
if (result.metrics?.diff) {
|
|
612
777
|
console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
|
|
613
778
|
}
|
|
614
779
|
if (result.metrics?.localization) {
|
|
615
|
-
console.log(
|
|
780
|
+
console.log(
|
|
781
|
+
formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
|
|
782
|
+
);
|
|
616
783
|
}
|
|
617
784
|
console.log("=============================================\n");
|
|
618
785
|
} catch (err) {
|
|
@@ -701,7 +868,10 @@ function safeParseJson(value) {
|
|
|
701
868
|
|
|
702
869
|
// src/index.ts
|
|
703
870
|
var cli = cac("agr");
|
|
704
|
-
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").
|
|
871
|
+
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
|
|
872
|
+
"--verbose",
|
|
873
|
+
"Stream agent steps live to the console as they happen"
|
|
874
|
+
).action(async (testCase, options) => {
|
|
705
875
|
try {
|
|
706
876
|
await runSingleCommand(testCase, options);
|
|
707
877
|
} catch (err) {
|
|
@@ -709,12 +879,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
709
879
|
process.exit(1);
|
|
710
880
|
}
|
|
711
881
|
});
|
|
712
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
882
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
713
883
|
"--matrix <matrix>",
|
|
714
884
|
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
715
|
-
).action(async (options) => {
|
|
885
|
+
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
886
|
+
if (!options.configs && options.config) {
|
|
887
|
+
options.configs = options.config;
|
|
888
|
+
}
|
|
716
889
|
if (!options.suite || !options.configs && !options.matrix) {
|
|
717
|
-
console.error(
|
|
890
|
+
console.error(
|
|
891
|
+
"Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
|
|
892
|
+
);
|
|
718
893
|
process.exit(1);
|
|
719
894
|
}
|
|
720
895
|
try {
|
|
@@ -732,9 +907,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
|
|
|
732
907
|
cli.command(
|
|
733
908
|
"validate <testCase>",
|
|
734
909
|
"Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
|
|
735
|
-
).
|
|
910
|
+
).option(
|
|
911
|
+
"--strict",
|
|
912
|
+
"Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
|
|
913
|
+
).action(async (testCase, options) => {
|
|
736
914
|
try {
|
|
737
|
-
await validateCommand(testCase);
|
|
915
|
+
await validateCommand(testCase, options);
|
|
738
916
|
} catch (err) {
|
|
739
917
|
console.error(`Error executing validate: ${err.message}`);
|
|
740
918
|
process.exit(1);
|
|
@@ -743,7 +921,10 @@ cli.command(
|
|
|
743
921
|
cli.command(
|
|
744
922
|
"import-pr <repo> <prNumber>",
|
|
745
923
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
746
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
924
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
925
|
+
"--clone-fixture",
|
|
926
|
+
"Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
|
|
927
|
+
).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
|
|
747
928
|
try {
|
|
748
929
|
await importPrCommand(repo, prNumber, options);
|
|
749
930
|
} catch (err) {
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"bin": {
|
|
8
|
-
"agr": "
|
|
9
|
-
"agentgrader": "
|
|
8
|
+
"agr": "dist/index.js",
|
|
9
|
+
"agentgrader": "dist/index.js"
|
|
10
10
|
},
|
|
11
11
|
"main": "./dist/index.js",
|
|
12
12
|
"types": "./dist/index.d.ts",
|
|
@@ -19,12 +19,12 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^2.0.
|
|
23
|
-
"@agentgrader/core": "^1.1.
|
|
24
|
-
"@agentgrader/optimizer": "^0.1.
|
|
22
|
+
"@agentgrader/agent-openrouter": "^2.0.3",
|
|
23
|
+
"@agentgrader/core": "^1.1.3",
|
|
24
|
+
"@agentgrader/optimizer": "^0.1.1",
|
|
25
25
|
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
26
|
"@agentgrader/scorer-static": "^0.1.0",
|
|
27
|
-
"@agentgrader/store": "^1.0.
|
|
27
|
+
"@agentgrader/store": "^1.0.3",
|
|
28
28
|
"cac": "^6.7.14",
|
|
29
29
|
"dotenv": "^17.4.2",
|
|
30
30
|
"ink": "^4.4.1",
|