agentgrader 1.0.6 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +274 -46
- package/package.json +3 -3
package/dist/index.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { cac } from 'cac';
|
|
4
4
|
import { randomUUID } from 'crypto';
|
|
5
|
-
import { resolve, dirname, isAbsolute } from 'path';
|
|
5
|
+
import { resolve, dirname, isAbsolute, basename } from 'path';
|
|
6
6
|
import { render, Box, Text } from 'ink';
|
|
7
7
|
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
8
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
@@ -13,9 +13,15 @@ import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agen
|
|
|
13
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
14
14
|
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
15
|
import { parse, stringify } from 'yaml';
|
|
16
|
-
import { ZodError } from 'zod';
|
|
16
|
+
import { z, ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
|
+
/** Width (in terminal columns) of each agent-config column in the dashboard grid. */
const CONFIG_COL_WIDTH = 24;
/** Default maximum number of characters shown for an agent-config label. */
const CONFIG_LABEL_MAX = 20;

/**
 * Shorten a label so it occupies at most `max` characters, replacing the
 * dropped tail with a single ellipsis character (U+2026).
 *
 * Length is measured in Unicode code points rather than UTF-16 units so a
 * surrogate pair (e.g. an emoji) is never cut in half.
 *
 * @param {string} name - Label to shorten.
 * @param {number} [max=CONFIG_LABEL_MAX] - Maximum number of characters to keep,
 *   including the ellipsis.
 * @returns {string} `name` unchanged when it already fits; otherwise the first
 *   `max - 1` characters followed by an ellipsis. An empty string when `max < 1`.
 */
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
  const chars = Array.from(name);
  if (chars.length <= max) return name;
  // Without this guard, slice(0, max - 1) with a negative end would keep
  // almost the entire label and return something longer than the input.
  if (max < 1) return "";
  return `${chars.slice(0, max - 1).join("")}\u2026`;
}
|
|
19
25
|
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
20
26
|
let totalCost = 0;
|
|
21
27
|
let totalSteps = 0;
|
|
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
67
73
|
r.testCaseId
|
|
68
74
|
] }),
|
|
69
75
|
/* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
|
|
70
|
-
/* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
|
|
76
|
+
/* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
|
|
71
77
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
72
78
|
" (Steps: ",
|
|
73
79
|
r.stepsCount,
|
|
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
81
87
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
|
|
82
88
|
/* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
|
|
83
89
|
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
|
|
84
|
-
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width:
|
|
90
|
+
configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
|
|
85
91
|
] }),
|
|
86
92
|
testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
|
|
87
|
-
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
|
|
93
|
+
/* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
|
|
88
94
|
configs.map((cfg) => {
|
|
89
95
|
const key = `${tc}_${cfg}`;
|
|
90
96
|
const run = runs[key];
|
|
91
97
|
if (!run) {
|
|
92
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
98
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
|
|
93
99
|
}
|
|
94
100
|
if (run.status === "running") {
|
|
95
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
101
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
|
|
96
102
|
}
|
|
97
103
|
if (run.status === "failed" || !run.passed) {
|
|
98
104
|
const seconds2 = (run.durationMs / 1e3).toFixed(1);
|
|
99
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
105
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
|
|
100
106
|
"\u2717 ",
|
|
101
107
|
seconds2,
|
|
102
108
|
"s ($",
|
|
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
105
111
|
] }) }, cfg);
|
|
106
112
|
}
|
|
107
113
|
const seconds = (run.durationMs / 1e3).toFixed(1);
|
|
108
|
-
return /* @__PURE__ */ jsx(Box, { width:
|
|
114
|
+
return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
|
|
109
115
|
"\u2713 ",
|
|
110
116
|
seconds,
|
|
111
117
|
"s ($",
|
|
@@ -173,6 +179,125 @@ function loadAgentConfig(yamlPath) {
|
|
|
173
179
|
}
|
|
174
180
|
return config;
|
|
175
181
|
}
|
|
182
|
+
|
|
183
|
+
// src/lib/resolve-agent-config-paths.ts
|
|
184
|
+
/**
 * Convert a simple file-name glob into an anchored regular expression.
 *
 * Supported wildcards:
 *   - `*` matches any run of characters (including none)
 *   - `?` matches exactly one character
 * Every other regex metacharacter is escaped and matched literally, so
 * bracket expressions such as `[abc]` are NOT treated as character classes.
 *
 * @param {string} glob - Glob pattern, e.g. `agent-*.yaml`.
 * @returns {RegExp} Regex that matches the whole input string against the glob.
 */
function globToRegex(glob) {
  const source = glob
    // Escape regex metacharacters first; `*` and `?` are deliberately left
    // out of this class so they can be translated below.
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*/g, ".*")
    // Left untranslated, `?` would act as a regex quantifier on the previous
    // character ("a?.yaml" would match ".yaml" but not "ab.yaml").
    .replace(/\?/g, ".");
  return new RegExp(`^${source}$`);
}
|
|
188
|
+
/**
 * Walk `dir` depth-first and collect every `.yaml` / `.yml` file below it.
 *
 * Entries whose name starts with a dot (hidden files and directories) are
 * skipped. Paths are returned in directory-listing order; callers that need a
 * stable order must sort the result themselves.
 *
 * NOTE(review): directories reached through symlinks are followed (statSync
 * resolves links) and cycles are not guarded against.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Absolute paths of the YAML files found.
 * @throws {Error} If `dir` (or a nested directory) cannot be read.
 */
function collectYamlFilesRecursive(dir) {
  const files = [];
  for (const entry of readdirSync(dir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(dir, entry);
    // A dangling symlink is listed by readdirSync but has no target to stat;
    // skip it instead of aborting the whole scan with ENOENT.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat) continue;
    if (stat.isDirectory()) {
      files.push(...collectYamlFilesRecursive(fullPath));
    } else if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files;
}
|
|
202
|
+
/**
 * List the `.yaml` / `.yml` files that sit directly inside `dir`
 * (no recursion into sub-directories).
 *
 * Hidden entries (names starting with a dot) and anything that is not a
 * regular file are ignored.
 *
 * @param {string} dir - Directory to scan; resolved against the current
 *   working directory when relative.
 * @returns {string[]} Sorted absolute paths of the YAML files found.
 * @throws {Error} If the directory cannot be read.
 */
function findAgentConfigYamlFilesInDir(dir) {
  const resolvedDir = resolve(dir);
  const files = [];
  for (const entry of readdirSync(resolvedDir)) {
    if (entry.startsWith(".")) continue;
    const fullPath = resolve(resolvedDir, entry);
    // throwIfNoEntry: false makes a dangling symlink yield `undefined`
    // instead of throwing ENOENT and aborting the listing.
    const stat = statSync(fullPath, { throwIfNoEntry: false });
    if (!stat || !stat.isFile()) continue;
    if (entry.endsWith(".yaml") || entry.endsWith(".yml")) {
      files.push(fullPath);
    }
  }
  return files.sort();
}
|
|
215
|
+
/**
 * Expand a glob pattern into the list of matching agent-config files.
 *
 * Two pattern shapes are handled:
 *   - Recursive (`prefix/**` + optional `/suffix`): every YAML file below
 *     `prefix` is collected and filtered by `suffix` (default `*.yaml`).
 *     A single-segment suffix is matched against the file name; a suffix that
 *     itself contains `/` (e.g. `**` + `/configs/*.yaml`) is matched against
 *     the same number of trailing path segments below the search root.
 *   - Flat (`dir/file-glob`): only the file-name part may contain wildcards;
 *     the directory part is used literally.
 *
 * NOTE(review): only the text before the first `**` and the text between the
 * first and second `**` are used; patterns with several `**` are not fully
 * supported.
 *
 * @param {string} globPattern - Pattern, optionally starting with `./`.
 * @param {string} baseDir - Directory that relative patterns are resolved against.
 * @returns {string[]} Sorted absolute paths of the matching files.
 * @throws {Error} If a directory that must be listed cannot be read.
 */
function expandAgentConfigGlob(globPattern, baseDir) {
  const base = resolve(baseDir);
  const normalized = globPattern.replace(/^\.\//, "");

  if (normalized.includes("**")) {
    const [prefix, suffixPart] = normalized.split("**");
    const rootPart = prefix.replace(/\/$/, "");
    const searchRoot = rootPart ? resolve(base, rootPart) : base;
    const suffix = (suffixPart ?? "").replace(/^\//, "") || "*.yaml";
    const suffixRegex = globToRegex(suffix);
    const suffixDepth = suffix.split("/").length;

    return collectYamlFilesRecursive(searchRoot).filter((filePath) => {
      if (suffixDepth === 1) return suffixRegex.test(basename(filePath));
      // A suffix with directory parts can never match a bare file name, so
      // compare it with the last `suffixDepth` segments below the search root.
      // Collected paths are built from searchRoot, so slicing it off is safe.
      const segments = filePath.slice(searchRoot.length).split(/[\\/]/).filter(Boolean);
      if (segments.length < suffixDepth) return false;
      return suffixRegex.test(segments.slice(-suffixDepth).join("/"));
    }).sort();
  }

  const slashIdx = normalized.lastIndexOf("/");
  const cwd = slashIdx === -1 ? base : resolve(base, normalized.slice(0, slashIdx));
  const fileGlob = slashIdx === -1 ? normalized : normalized.slice(slashIdx + 1);
  const regex = globToRegex(fileGlob);

  return readdirSync(cwd).filter((entry) => {
    if (entry.startsWith(".")) return false;
    if (!regex.test(entry)) return false;
    // throwIfNoEntry: false keeps a dangling symlink from aborting the listing.
    const stat = statSync(resolve(cwd, entry), { throwIfNoEntry: false });
    return stat !== undefined && stat.isFile();
  }).map((entry) => resolve(cwd, entry)).sort();
}
|
|
235
|
+
/**
 * Merge every supported way of naming agent-config files into one
 * de-duplicated, sorted list of absolute paths.
 *
 * @param {object} input
 * @param {string} [input.commaSeparated] - Comma-separated paths, resolved
 *   against the current working directory; blank items are ignored.
 * @param {string} [input.dir] - Directory whose top-level YAML files are added.
 * @param {Iterable<string>} [input.explicitPaths] - Paths resolved against
 *   `relativeTo` (or the current working directory when it is not given).
 * @param {Iterable<string>} [input.globs] - Glob patterns expanded relative to
 *   the same base directory as `explicitPaths`.
 * @param {string} [input.relativeTo] - Base directory for `explicitPaths` and `globs`.
 * @returns {string[]} Sorted, unique absolute paths.
 * @throws {Error} When no path was produced by any of the sources.
 */
function resolveAgentConfigPathList(input) {
  const unique = new Set();

  if (input.commaSeparated) {
    const items = input.commaSeparated.split(",").map((item) => item.trim());
    for (const item of items) {
      if (item) unique.add(resolve(item));
    }
  }

  if (input.dir) {
    for (const found of findAgentConfigYamlFilesInDir(input.dir)) {
      unique.add(found);
    }
  }

  const baseDir = input.relativeTo ? resolve(input.relativeTo) : process.cwd();

  if (input.explicitPaths) {
    for (const explicit of input.explicitPaths) {
      unique.add(resolve(baseDir, explicit));
    }
  }

  if (input.globs) {
    for (const pattern of input.globs) {
      for (const match of expandAgentConfigGlob(pattern, baseDir)) {
        unique.add(match);
      }
    }
  }

  if (unique.size === 0) {
    throw new Error("No agent config YAML files found.");
  }
  return Array.from(unique).sort();
}
|
|
267
|
+
/**
 * Load one agent config per path, preserving the order of `paths`.
 *
 * @param {string[]} paths - Paths to agent-config YAML files.
 * @returns {Array} The loaded configs, in the same order as `paths`.
 */
function loadAgentConfigsFromPaths(paths) {
  const loadOne = (configPath) => loadAgentConfig(configPath);
  return paths.map(loadOne);
}
|
|
270
|
+
|
|
271
|
+
// src/lib/load-bench-manifest.ts
|
|
272
|
+
/**
 * Schema for the `agents` section of a bench manifest.
 *
 * - `paths`: explicit agent-config file paths.
 * - `glob`: one glob pattern or a list of patterns.
 *
 * At least one of the two must actually name something: an empty `paths`
 * list, an empty `glob` string and an empty `glob` list all count as
 * "not specified".
 */
const AgentsSchema = z.object({
  paths: z.array(z.string()).optional(),
  glob: z.union([z.string(), z.array(z.string())]).optional()
}).refine((data) => {
  const hasPaths = (data.paths?.length ?? 0) > 0;
  // `glob !== undefined` alone would let `glob: []` / `glob: ""` through even
  // though they contribute no pattern at all.
  const hasGlob = Array.isArray(data.glob) ? data.glob.length > 0 : Boolean(data.glob);
  return hasPaths || hasGlob;
}, {
  message: "agents must specify at least one of paths or glob"
});
|
|
278
|
+
/**
 * Schema for a bench manifest YAML file.
 *
 * - `name`: optional display name.
 * - `suite`: path to the test-suite directory.
 * - `agents`: where to find the agent configs (see AgentsSchema).
 * - `concurrency`: optional parallelism; must be a positive whole number,
 *   because zero, negative or fractional values are not meaningful worker counts.
 */
const BenchManifestSchema = z.object({
  name: z.string().optional(),
  suite: z.string(),
  agents: AgentsSchema,
  concurrency: z.number().int().positive().optional()
});
|
|
284
|
+
/**
 * Read a bench manifest from disk and validate it.
 *
 * @param {string} yamlPath - Path to the manifest YAML file; resolved against
 *   the current working directory when relative.
 * @returns {object} The manifest as parsed by BenchManifestSchema.
 * @throws {Error} If the file cannot be read or does not satisfy the schema.
 */
function loadBenchManifest(yamlPath) {
  const absolutePath = resolve(yamlPath);
  const yamlText = readFileSync(absolutePath, "utf-8");
  const document = parse(yamlText);
  return BenchManifestSchema.parse(document);
}
|
|
289
|
+
/**
 * Turn the `agents` section of a manifest into a list of agent-config paths.
 * Explicit paths and glob patterns are both interpreted relative to the
 * directory that contains the manifest file.
 *
 * @param {object} manifest - Parsed bench manifest.
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string[]} Sorted, unique absolute paths of the agent configs.
 * @throws {Error} When no agent config file is found.
 */
function resolveManifestAgentConfigPaths(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  const { glob, paths } = manifest.agents;

  // Normalise "single pattern" and "list of patterns" to a list; a missing
  // or empty-string glob stays undefined.
  let globs;
  if (glob) {
    globs = Array.isArray(glob) ? glob : [glob];
  }

  return resolveAgentConfigPathList({
    explicitPaths: paths,
    globs,
    relativeTo: manifestDir
  });
}
|
|
298
|
+
/**
 * Resolve the manifest's `suite` entry to an absolute directory path,
 * interpreting it relative to the directory that contains the manifest file.
 *
 * @param {object} manifest - Parsed bench manifest (only `suite` is read).
 * @param {string} manifestPath - Path of the manifest file itself.
 * @returns {string} Absolute path of the suite directory.
 */
function resolveManifestSuiteDir(manifest, manifestPath) {
  const manifestDir = dirname(resolve(manifestPath));
  return resolve(manifestDir, manifest.suite);
}
|
|
176
301
|
function loadMatrix(yamlPath) {
|
|
177
302
|
const path = resolve(yamlPath);
|
|
178
303
|
const fileContent = readFileSync(path, "utf-8");
|
|
@@ -266,26 +391,57 @@ function findTestCaseYamlFiles(dir) {
|
|
|
266
391
|
return files;
|
|
267
392
|
}
|
|
268
393
|
async function runBenchCommand(opts) {
|
|
269
|
-
|
|
270
|
-
|
|
394
|
+
let suiteDir;
|
|
395
|
+
let concurrency = opts.concurrency ?? 2;
|
|
271
396
|
let agentConfigs;
|
|
272
397
|
let matrixId;
|
|
273
|
-
if (opts.
|
|
274
|
-
const
|
|
275
|
-
|
|
276
|
-
|
|
398
|
+
if (opts.manifest) {
|
|
399
|
+
const manifestPath = resolve(opts.manifest);
|
|
400
|
+
const manifest = loadBenchManifest(manifestPath);
|
|
401
|
+
suiteDir = resolveManifestSuiteDir(manifest, manifestPath);
|
|
402
|
+
if (manifest.concurrency !== void 0 && opts.concurrency === void 0) {
|
|
403
|
+
concurrency = manifest.concurrency;
|
|
404
|
+
}
|
|
405
|
+
if (opts.matrix) {
|
|
406
|
+
throw new Error("Use either --manifest or --matrix, not both.");
|
|
407
|
+
}
|
|
408
|
+
const configPaths = resolveManifestAgentConfigPaths(manifest, manifestPath);
|
|
409
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
277
410
|
console.log(
|
|
278
|
-
`
|
|
411
|
+
`Bench manifest "${manifest.name ?? manifestPath}" loaded ${agentConfigs.length} agent config(s) from ${configPaths.length} file(s).`
|
|
279
412
|
);
|
|
280
|
-
} else if (opts.configs) {
|
|
281
|
-
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
282
|
-
agentConfigs = configPaths.map((p) => loadAgentConfig(p));
|
|
283
413
|
} else {
|
|
284
|
-
|
|
414
|
+
if (!opts.suite) {
|
|
415
|
+
throw new Error("--suite is required unless --manifest is provided.");
|
|
416
|
+
}
|
|
417
|
+
suiteDir = resolve(opts.suite);
|
|
418
|
+
if (opts.matrix) {
|
|
419
|
+
if (opts.configs || opts.configsDir) {
|
|
420
|
+
throw new Error("Use either --matrix or --configs/--configs-dir, not both.");
|
|
421
|
+
}
|
|
422
|
+
const matrix = loadMatrix(opts.matrix);
|
|
423
|
+
agentConfigs = expandMatrix(matrix);
|
|
424
|
+
matrixId = randomUUID();
|
|
425
|
+
console.log(
|
|
426
|
+
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
427
|
+
);
|
|
428
|
+
} else {
|
|
429
|
+
const configPaths = resolveAgentConfigPathList({
|
|
430
|
+
commaSeparated: opts.configs,
|
|
431
|
+
dir: opts.configsDir
|
|
432
|
+
});
|
|
433
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
434
|
+
if (opts.configsDir) {
|
|
435
|
+
console.log(`Loaded ${agentConfigs.length} agent config(s) from ${opts.configsDir}.`);
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
if (agentConfigs.length === 0) {
|
|
440
|
+
throw new Error("No agent configs to benchmark.");
|
|
285
441
|
}
|
|
286
442
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
287
443
|
if (yamlFiles.length === 0) {
|
|
288
|
-
console.error(`No test cases found in suite directory: ${
|
|
444
|
+
console.error(`No test cases found in suite directory: ${suiteDir}`);
|
|
289
445
|
process.exit(1);
|
|
290
446
|
}
|
|
291
447
|
const testCases = [];
|
|
@@ -461,6 +617,11 @@ async function validateCommand(testCasePath, opts) {
|
|
|
461
617
|
console.log(
|
|
462
618
|
"Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
|
|
463
619
|
);
|
|
620
|
+
if (report.ok && !opts?.strict) {
|
|
621
|
+
console.log(
|
|
622
|
+
"Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
|
|
623
|
+
);
|
|
624
|
+
}
|
|
464
625
|
}
|
|
465
626
|
console.log("");
|
|
466
627
|
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
@@ -506,8 +667,8 @@ async function importPrCommand(repo, prNumber, opts) {
|
|
|
506
667
|
if (testDiff.trim()) {
|
|
507
668
|
writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
|
|
508
669
|
}
|
|
670
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
509
671
|
if (opts.cloneFixture) {
|
|
510
|
-
const fixtureDir = resolve(outDir, "fixture");
|
|
511
672
|
console.log(`
|
|
512
673
|
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
513
674
|
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
@@ -516,8 +677,12 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
516
677
|
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
517
678
|
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
518
679
|
}
|
|
519
|
-
const projectKind = opts.cloneFixture ? detectProjectKind(
|
|
520
|
-
const { success, test_command } = projectTestDefaults(
|
|
680
|
+
const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
|
|
681
|
+
const { success, test_command, testCommandHint } = projectTestDefaults(
|
|
682
|
+
projectKind,
|
|
683
|
+
opts.cloneFixture ?? false,
|
|
684
|
+
fixtureDir
|
|
685
|
+
);
|
|
521
686
|
const yamlDoc = {
|
|
522
687
|
name: slug,
|
|
523
688
|
description: pr.title,
|
|
@@ -535,7 +700,7 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
|
535
700
|
if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
|
|
536
701
|
if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
|
|
537
702
|
if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
|
|
538
|
-
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc,
|
|
703
|
+
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
|
|
539
704
|
console.log(`
|
|
540
705
|
Imported PR #${pr.number}: "${pr.title}"`);
|
|
541
706
|
console.log(`Wrote scaffold to: ${outDir}`);
|
|
@@ -551,6 +716,9 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
551
716
|
console.log(
|
|
552
717
|
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
553
718
|
);
|
|
719
|
+
console.log(
|
|
720
|
+
"\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
|
|
721
|
+
);
|
|
554
722
|
} else {
|
|
555
723
|
console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
|
|
556
724
|
console.log(
|
|
@@ -570,50 +738,81 @@ function detectProjectKind(fixtureDir) {
|
|
|
570
738
|
if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
|
|
571
739
|
return "unknown";
|
|
572
740
|
}
|
|
573
|
-
function projectTestDefaults(kind, cloned) {
|
|
741
|
+
/**
 * Pick default `success` / `test_command` values for a scaffolded test case.
 *
 * @param {string} kind - Detected project kind: "python", "node", "go", or
 *   anything else for "unknown".
 * @param {boolean} cloned - Whether the fixture repository was cloned. When
 *   false, `kind` is ignored and TODO placeholders are returned.
 * @param {string} fixtureDir - Fixture directory, used for Node test-runner
 *   detection.
 * @returns {{success: Array, test_command: string, testCommandHint: string}}
 *   Defaults plus a hint naming how `test_command` was chosen.
 */
function projectTestDefaults(kind, cloned, fixtureDir) {
  // Placeholder used whenever nothing language-specific is known.
  const placeholderDefaults = () => ({
    success: [
      { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
    ],
    test_command: "<TODO: shell command that runs tests with TAP output>",
    testCommandHint: "none"
  });

  if (!cloned) {
    return placeholderDefaults();
  }
  if (kind === "python") {
    return {
      success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
      test_command: "pytest --tap-stream",
      testCommandHint: "python"
    };
  }
  if (kind === "node") {
    return detectNodeTestRunner(fixtureDir);
  }
  if (kind === "go") {
    return {
      success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
      test_command: "<TODO: configure a TAP-producing test command for go>",
      testCommandHint: "go"
    };
  }
  return placeholderDefaults();
}
|
|
607
|
-
function
|
|
776
|
+
/**
 * Guess the test command for a Node project from the dependencies listed in
 * its `package.json`.
 *
 * Runners are checked in a fixed priority order (ava, vitest, jest). When the
 * file is missing, unreadable, not valid JSON, or lists none of them, a
 * generic fallback is returned with the hint "node-unknown".
 *
 * @param {string} fixtureDir - Directory expected to contain `package.json`.
 * @returns {{success: Array, test_command: string, testCommandHint: string}}
 */
function detectNodeTestRunner(fixtureDir) {
  const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
  const fallback = {
    success,
    test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
    testCommandHint: "node-unknown"
  };
  // [dependency name / hint, test command], highest priority first.
  const knownRunners = [
    ["ava", "npx ava --tap"],
    ["vitest", "npx vitest run --reporter=tap"],
    ["jest", "npx jest --ci"]
  ];

  try {
    const pkgPath = resolve(fixtureDir, "package.json");
    if (!existsSync(pkgPath)) return fallback;

    const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
    const deps = { ...pkg.dependencies, ...pkg.devDependencies };
    const detected = knownRunners.find(([dependency]) => deps[dependency]);
    if (!detected) return fallback;

    const [hint, command] = detected;
    return { success, test_command: command, testCommandHint: hint };
  } catch {
    // Detection is best-effort: any read/parse problem falls back to the default.
    return fallback;
  }
}
|
|
802
|
+
/**
 * Serialise a scaffolded test-case document to YAML and insert guidance
 * comments for the person completing it.
 *
 * - A TODO comment is placed above the top-level `fail_to_pass:` key.
 * - When `testCommandHint` has an associated note, that note is placed above
 *   the top-level `test_command:` line.
 *
 * @param {object} doc - Test-case document to serialise.
 * @param {string} testCommandHint - Hint describing how `test_command` was
 *   chosen (e.g. "python", "jest", "node-unknown").
 * @returns {string} YAML text including the inserted comments.
 */
function buildAgrYaml(doc, testCommandHint) {
  const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
  const testCommandComments = {
    python: "# Requires pytest-tap for TAP output (pip install pytest-tap).",
    jest: "# jest does not output TAP by default; consider jest-tap-reporter",
    "node-unknown": "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"
  };

  const withListComment = stringify(doc).replace(
    /^fail_to_pass:/m,
    `${testListComment}\nfail_to_pass:`
  );

  const hintComment = testCommandComments[testCommandHint];
  if (!hintComment) {
    return withListComment;
  }
  // `$&` re-inserts the matched test_command line beneath the comment.
  return withListComment.replace(/^test_command: (.+)$/m, `${hintComment}\n$&`);
}
|
|
@@ -830,25 +1029,51 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
830
1029
|
process.exit(1);
|
|
831
1030
|
}
|
|
832
1031
|
});
|
|
833
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1032
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option(
|
|
1033
|
+
"--configs-dir <dir>",
|
|
1034
|
+
"Directory of AgentConfig YAML files (all .yaml/.yml files in the folder)"
|
|
1035
|
+
).option(
|
|
1036
|
+
"--manifest <manifest>",
|
|
1037
|
+
"Path to a bench manifest YAML (suite + agent paths/glob in one file)"
|
|
1038
|
+
).option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
834
1039
|
"--matrix <matrix>",
|
|
835
1040
|
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
836
|
-
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
1041
|
+
).example("agr bench --manifest bench.yaml").example("agr bench --suite tasks --configs-dir ./agents").example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
837
1042
|
if (!options.configs && options.config) {
|
|
838
1043
|
options.configs = options.config;
|
|
839
1044
|
}
|
|
840
|
-
|
|
1045
|
+
const agentSourceCount = [
|
|
1046
|
+
options.configs,
|
|
1047
|
+
options.configsDir,
|
|
1048
|
+
options.matrix,
|
|
1049
|
+
options.manifest
|
|
1050
|
+
].filter(Boolean).length;
|
|
1051
|
+
if (options.manifest) {
|
|
1052
|
+
if (agentSourceCount > 1) {
|
|
1053
|
+
console.error(
|
|
1054
|
+
"Error: --manifest cannot be combined with --configs, --configs-dir, or --matrix."
|
|
1055
|
+
);
|
|
1056
|
+
process.exit(1);
|
|
1057
|
+
}
|
|
1058
|
+
} else if (!options.suite || agentSourceCount === 0) {
|
|
1059
|
+
console.error(
|
|
1060
|
+
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or --matrix."
|
|
1061
|
+
);
|
|
1062
|
+
process.exit(1);
|
|
1063
|
+
} else if (agentSourceCount > 1) {
|
|
841
1064
|
console.error(
|
|
842
|
-
"Error:
|
|
1065
|
+
"Error: use only one agent source: --configs, --configs-dir, or --matrix."
|
|
843
1066
|
);
|
|
844
1067
|
process.exit(1);
|
|
845
1068
|
}
|
|
846
1069
|
try {
|
|
847
1070
|
await runBenchCommand({
|
|
848
1071
|
configs: options.configs,
|
|
1072
|
+
configsDir: options.configsDir,
|
|
849
1073
|
suite: options.suite,
|
|
850
1074
|
concurrency: Number(options.concurrency),
|
|
851
|
-
matrix: options.matrix
|
|
1075
|
+
matrix: options.matrix,
|
|
1076
|
+
manifest: options.manifest
|
|
852
1077
|
});
|
|
853
1078
|
} catch (err) {
|
|
854
1079
|
console.error(`Error executing benchmark: ${err.message}`);
|
|
@@ -872,7 +1097,10 @@ cli.command(
|
|
|
872
1097
|
cli.command(
|
|
873
1098
|
"import-pr <repo> <prNumber>",
|
|
874
1099
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
875
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
1100
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option(
|
|
1101
|
+
"--clone-fixture",
|
|
1102
|
+
"Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
|
|
1103
|
+
).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
|
|
876
1104
|
try {
|
|
877
1105
|
await importPrCommand(repo, prNumber, options);
|
|
878
1106
|
} catch (err) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.6",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -19,9 +19,9 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^2.0.
|
|
22
|
+
"@agentgrader/agent-openrouter": "^2.0.3",
|
|
23
23
|
"@agentgrader/core": "^1.1.3",
|
|
24
|
-
"@agentgrader/optimizer": "^0.1.
|
|
24
|
+
"@agentgrader/optimizer": "^0.1.1",
|
|
25
25
|
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
26
|
"@agentgrader/scorer-static": "^0.1.0",
|
|
27
27
|
"@agentgrader/store": "^1.0.3",
|