agentgrader 1.0.2 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +390 -48
- package/package.json +12 -8
package/dist/index.js
CHANGED
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import 'dotenv/config';
|
|
2
3
|
import { cac } from 'cac';
|
|
4
|
+
import { randomUUID } from 'crypto';
|
|
3
5
|
import { resolve, dirname, isAbsolute } from 'path';
|
|
4
6
|
import { render, Box, Text } from 'ink';
|
|
5
|
-
import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
|
|
7
|
+
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
6
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
7
9
|
import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
|
|
8
10
|
import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
11
|
+
import { StaticQualityScorer } from '@agentgrader/scorer-static';
|
|
12
|
+
import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
|
|
9
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
10
|
-
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
14
|
+
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
|
|
15
|
+
import { parse, stringify } from 'yaml';
|
|
16
|
+
import { ZodError } from 'zod';
|
|
17
|
+
import { execFileSync } from 'child_process';
|
|
13
18
|
|
|
14
19
|
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
15
20
|
let totalCost = 0;
|
|
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
134
139
|
] })
|
|
135
140
|
] });
|
|
136
141
|
};
|
|
142
|
+
|
|
143
|
+
// src/lib/format-zod-error.ts
|
|
144
|
+
function formatZodError(err, fileLabel) {
|
|
145
|
+
const lines = err.issues.map((issue) => {
|
|
146
|
+
const path = issue.path.join(".") || "(root)";
|
|
147
|
+
return ` - ${path}: ${issue.message}`;
|
|
148
|
+
});
|
|
149
|
+
return `Invalid ${fileLabel}:
|
|
150
|
+
${lines.join("\n")}`;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// src/lib/load-agent-config.ts
|
|
137
154
|
function loadAgentConfig(yamlPath) {
|
|
138
155
|
const path = resolve(yamlPath);
|
|
139
156
|
const fileContent = readFileSync(path, "utf-8");
|
|
140
157
|
const raw = parse(fileContent);
|
|
141
158
|
const dir = dirname(path);
|
|
142
|
-
|
|
159
|
+
let config;
|
|
160
|
+
try {
|
|
161
|
+
config = AgentConfigSchema.parse(raw);
|
|
162
|
+
} catch (err) {
|
|
163
|
+
if (err instanceof ZodError) {
|
|
164
|
+
throw new Error(formatZodError(err, `agent config "${path}"`));
|
|
165
|
+
}
|
|
166
|
+
throw err;
|
|
167
|
+
}
|
|
143
168
|
config.id = config.id || config.name;
|
|
144
169
|
if (config.toolkits) {
|
|
145
170
|
config.toolkits = config.toolkits.map(
|
|
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
|
|
|
148
173
|
}
|
|
149
174
|
return config;
|
|
150
175
|
}
|
|
176
|
+
function loadMatrix(yamlPath) {
|
|
177
|
+
const path = resolve(yamlPath);
|
|
178
|
+
const fileContent = readFileSync(path, "utf-8");
|
|
179
|
+
const raw = parse(fileContent);
|
|
180
|
+
return MatrixSchema.parse(raw);
|
|
181
|
+
}
|
|
151
182
|
function loadTestCase(yamlPath) {
|
|
152
183
|
const path = resolve(yamlPath);
|
|
153
184
|
const fileContent = readFileSync(path, "utf-8");
|
|
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
|
|
|
156
187
|
if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
|
|
157
188
|
raw.fixture = resolve(dir, raw.fixture);
|
|
158
189
|
}
|
|
159
|
-
|
|
190
|
+
let testCase;
|
|
191
|
+
try {
|
|
192
|
+
testCase = TestCaseSchema.parse(raw);
|
|
193
|
+
} catch (err) {
|
|
194
|
+
if (err instanceof ZodError) {
|
|
195
|
+
throw new Error(formatZodError(err, `test case "${path}"`));
|
|
196
|
+
}
|
|
197
|
+
throw err;
|
|
198
|
+
}
|
|
160
199
|
testCase.id = testCase.id || testCase.name;
|
|
161
200
|
if (testCase.toolkits) {
|
|
162
201
|
testCase.toolkits = testCase.toolkits.map(
|
|
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
|
|
|
228
267
|
}
|
|
229
268
|
async function runBenchCommand(opts) {
|
|
230
269
|
const suiteDir = resolve(opts.suite);
|
|
231
|
-
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
232
270
|
const concurrency = opts.concurrency || 2;
|
|
233
|
-
|
|
271
|
+
let agentConfigs;
|
|
272
|
+
let matrixId;
|
|
273
|
+
if (opts.matrix) {
|
|
274
|
+
const matrix = loadMatrix(opts.matrix);
|
|
275
|
+
agentConfigs = expandMatrix(matrix);
|
|
276
|
+
matrixId = randomUUID();
|
|
277
|
+
console.log(
|
|
278
|
+
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
279
|
+
);
|
|
280
|
+
} else if (opts.configs) {
|
|
281
|
+
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
282
|
+
agentConfigs = configPaths.map((p) => loadAgentConfig(p));
|
|
283
|
+
} else {
|
|
284
|
+
throw new Error("Either --configs or --matrix must be provided.");
|
|
285
|
+
}
|
|
234
286
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
235
287
|
if (yamlFiles.length === 0) {
|
|
236
288
|
console.error(`No test cases found in suite directory: ${opts.suite}`);
|
|
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
|
|
|
295
347
|
sandboxProvider,
|
|
296
348
|
db,
|
|
297
349
|
concurrency,
|
|
298
|
-
onRunUpdate
|
|
350
|
+
onRunUpdate,
|
|
351
|
+
extraScorers: [new StaticQualityScorer()],
|
|
352
|
+
matrixId
|
|
299
353
|
});
|
|
300
354
|
} catch (err) {
|
|
301
355
|
console.error("Benchmark runner encountered an error:", err);
|
|
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
|
|
|
312
366
|
)
|
|
313
367
|
);
|
|
314
368
|
printTagBreakdown(testCases, agentConfigs, runStates);
|
|
369
|
+
if (matrixId) {
|
|
370
|
+
await printMatrixSummary(db, matrixId, agentConfigs);
|
|
371
|
+
}
|
|
315
372
|
process.exit(0);
|
|
316
373
|
}
|
|
374
|
+
async function printMatrixSummary(db, matrixId, agentConfigs) {
|
|
375
|
+
const runs = await getRunsByMatrixId(db, matrixId);
|
|
376
|
+
const aggregates = aggregateResults(runs, agentConfigs);
|
|
377
|
+
if (aggregates.length === 0) return;
|
|
378
|
+
const front = paretoFront(aggregates);
|
|
379
|
+
const frontIds = new Set(front.map((a) => a.agentConfigId));
|
|
380
|
+
const includesQuality = front.some((a) => a.avgQuality?.linterViolations !== void 0);
|
|
381
|
+
console.log("\n================ MATRIX SUMMARY ================");
|
|
382
|
+
for (const agg of aggregates) {
|
|
383
|
+
const marker = frontIds.has(agg.agentConfigId) ? "*" : " ";
|
|
384
|
+
const solveRatePct = (agg.solveRate * 100).toFixed(0);
|
|
385
|
+
const lint = agg.avgQuality?.linterViolations !== void 0 ? ` lint:${agg.avgQuality.linterViolations.toFixed(1)}` : "";
|
|
386
|
+
console.log(
|
|
387
|
+
`${marker} ${agg.agentConfigName.padEnd(36)} solve:${solveRatePct.padStart(3)}% (${agg.passedRuns}/${agg.totalRuns}) cost:$${agg.avgCostUsd.toFixed(4)}${lint}`
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
console.log(
|
|
391
|
+
`
|
|
392
|
+
* = Pareto-optimal (solve rate, cost${includesQuality ? ", lint violations" : ""})`
|
|
393
|
+
);
|
|
394
|
+
console.log("=================================================\n");
|
|
395
|
+
}
|
|
317
396
|
function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
318
397
|
const tagStats = {};
|
|
319
398
|
for (const tc of testCases) {
|
|
@@ -340,6 +419,55 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
|
340
419
|
}
|
|
341
420
|
console.log("=================================================\n");
|
|
342
421
|
}
|
|
422
|
+
function isSkippedCheck(check) {
|
|
423
|
+
return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
|
|
424
|
+
}
|
|
425
|
+
function checkIcon(check) {
|
|
426
|
+
if (!check.passed) return "\u274C";
|
|
427
|
+
if (isSkippedCheck(check)) return "\u26A0\uFE0F";
|
|
428
|
+
return "\u2705";
|
|
429
|
+
}
|
|
430
|
+
async function validateCommand(testCasePath, opts) {
|
|
431
|
+
const testCase = loadTestCase(testCasePath);
|
|
432
|
+
if (opts?.strict) {
|
|
433
|
+
const missing = [];
|
|
434
|
+
if (!testCase.test_command) missing.push("test_command");
|
|
435
|
+
if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
|
|
436
|
+
if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
|
|
437
|
+
if (missing.length > 0) {
|
|
438
|
+
console.error(
|
|
439
|
+
`Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
|
|
440
|
+
);
|
|
441
|
+
process.exit(1);
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
console.log(`Validating "${testCase.name}" (${testCasePath})...
|
|
445
|
+
`);
|
|
446
|
+
const sandboxProvider = new DockerSandboxProvider();
|
|
447
|
+
const report = await validateTestCase({ testCase, sandboxProvider });
|
|
448
|
+
const hadExecutionSkip = report.checks.some(
|
|
449
|
+
(c) => c.name.includes("execution-checks (skipped")
|
|
450
|
+
);
|
|
451
|
+
for (const check of report.checks) {
|
|
452
|
+
const icon = checkIcon(check);
|
|
453
|
+
console.log(`${icon} ${check.name}`);
|
|
454
|
+
if (check.detail && check.detail !== "ok") {
|
|
455
|
+
const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
|
|
456
|
+
console.log(indented);
|
|
457
|
+
}
|
|
458
|
+
}
|
|
459
|
+
if (hadExecutionSkip) {
|
|
460
|
+
console.log("");
|
|
461
|
+
console.log(
|
|
462
|
+
"Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
|
|
463
|
+
);
|
|
464
|
+
}
|
|
465
|
+
console.log("");
|
|
466
|
+
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
467
|
+
process.exit(report.ok ? 0 : 1);
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
// src/commands/import-pr.ts
|
|
343
471
|
var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
|
|
344
472
|
async function importPrCommand(repo, prNumber, opts) {
|
|
345
473
|
const [owner, repoName] = repo.split("/");
|
|
@@ -378,26 +506,36 @@ async function importPrCommand(repo, prNumber, opts) {
|
|
|
378
506
|
if (testDiff.trim()) {
|
|
379
507
|
writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
|
|
380
508
|
}
|
|
509
|
+
if (opts.cloneFixture) {
|
|
510
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
511
|
+
console.log(`
|
|
512
|
+
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
513
|
+
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
514
|
+
stdio: "inherit"
|
|
515
|
+
});
|
|
516
|
+
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
517
|
+
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
518
|
+
}
|
|
519
|
+
const projectKind = opts.cloneFixture ? detectProjectKind(resolve(outDir, "fixture")) : "unknown";
|
|
520
|
+
const { success, test_command } = projectTestDefaults(projectKind, opts.cloneFixture ?? false);
|
|
381
521
|
const yamlDoc = {
|
|
382
522
|
name: slug,
|
|
383
523
|
description: pr.title,
|
|
384
524
|
fixture: "./fixture",
|
|
385
525
|
prompt: buildPrompt(pr),
|
|
386
|
-
success
|
|
526
|
+
success,
|
|
387
527
|
timeout_seconds: 600,
|
|
388
528
|
tags: ["imported", repoName],
|
|
389
529
|
created_at: pr.created_at,
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
fail_to_pass: ["<TODO: fill in via `agr validate`>"],
|
|
394
|
-
pass_to_pass: ["<TODO: fill in via `agr validate`>"]
|
|
530
|
+
test_command,
|
|
531
|
+
fail_to_pass: [],
|
|
532
|
+
pass_to_pass: []
|
|
395
533
|
};
|
|
396
534
|
if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
|
|
397
535
|
if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
|
|
398
536
|
if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
|
|
399
537
|
if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
|
|
400
|
-
writeFileSync(resolve(outDir, "agr.yaml"),
|
|
538
|
+
writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, projectKind));
|
|
401
539
|
console.log(`
|
|
402
540
|
Imported PR #${pr.number}: "${pr.title}"`);
|
|
403
541
|
console.log(`Wrote scaffold to: ${outDir}`);
|
|
@@ -407,11 +545,77 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
407
545
|
if (testDiff.trim())
|
|
408
546
|
console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
|
|
409
547
|
console.log("\nNext steps:");
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
)
|
|
548
|
+
if (!opts.cloneFixture) {
|
|
549
|
+
console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
|
|
550
|
+
console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
|
|
551
|
+
console.log(
|
|
552
|
+
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
553
|
+
);
|
|
554
|
+
} else {
|
|
555
|
+
console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
|
|
556
|
+
console.log(
|
|
557
|
+
` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
558
|
+
);
|
|
559
|
+
}
|
|
560
|
+
if (opts.validate) {
|
|
561
|
+
console.log("\nRunning validation...\n");
|
|
562
|
+
await validateCommand(resolve(outDir, "agr.yaml"));
|
|
563
|
+
}
|
|
564
|
+
}
|
|
565
|
+
function detectProjectKind(fixtureDir) {
|
|
566
|
+
if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
|
|
567
|
+
return "python";
|
|
568
|
+
}
|
|
569
|
+
if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
|
|
570
|
+
if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
|
|
571
|
+
return "unknown";
|
|
572
|
+
}
|
|
573
|
+
function projectTestDefaults(kind, cloned) {
|
|
574
|
+
if (!cloned) {
|
|
575
|
+
return {
|
|
576
|
+
success: [
|
|
577
|
+
{ run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
|
|
578
|
+
],
|
|
579
|
+
test_command: "<TODO: shell command that runs tests with TAP output>"
|
|
580
|
+
};
|
|
581
|
+
}
|
|
582
|
+
switch (kind) {
|
|
583
|
+
case "python":
|
|
584
|
+
return {
|
|
585
|
+
success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
|
|
586
|
+
test_command: "pytest --tap-stream"
|
|
587
|
+
};
|
|
588
|
+
case "node":
|
|
589
|
+
return {
|
|
590
|
+
success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
|
|
591
|
+
test_command: "tsx --test --test-reporter=tap src/**/*.test.ts"
|
|
592
|
+
};
|
|
593
|
+
case "go":
|
|
594
|
+
return {
|
|
595
|
+
success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
|
|
596
|
+
test_command: "<TODO: configure a TAP-producing test command for go>"
|
|
597
|
+
};
|
|
598
|
+
default:
|
|
599
|
+
return {
|
|
600
|
+
success: [
|
|
601
|
+
{ run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
|
|
602
|
+
],
|
|
603
|
+
test_command: "<TODO: shell command that runs tests with TAP output>"
|
|
604
|
+
};
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
function buildAgrYaml(doc, projectKind) {
|
|
608
|
+
let yaml = stringify(doc);
|
|
609
|
+
const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
|
|
610
|
+
yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
|
|
611
|
+
fail_to_pass:`);
|
|
612
|
+
if (projectKind === "python") {
|
|
613
|
+
yaml = yaml.replace(
|
|
614
|
+
/^test_command: (.+)$/m,
|
|
615
|
+
"# Requires pytest-tap for TAP output (pip install pytest-tap).\n$&"
|
|
616
|
+
);
|
|
617
|
+
}
|
|
618
|
+
return yaml;
|
|
415
619
|
}
|
|
416
620
|
function buildPrompt(pr) {
|
|
417
621
|
const body = (pr.body || "").trim();
|
|
@@ -443,6 +647,35 @@ function splitDiff(diff) {
|
|
|
443
647
|
forbidModified
|
|
444
648
|
};
|
|
445
649
|
}
|
|
650
|
+
var VERBOSE_CONTENT_MAX = 200;
|
|
651
|
+
function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
|
|
652
|
+
if (value.length <= max) return value;
|
|
653
|
+
return `${value.slice(0, max)}...`;
|
|
654
|
+
}
|
|
655
|
+
function formatVerboseStep(step) {
|
|
656
|
+
const prefix = `[step ${step.index}] ${step.kind}`;
|
|
657
|
+
if (step.kind === "tool_call" && step.tool) {
|
|
658
|
+
const args = step.content ? truncateForVerbose(step.content) : "";
|
|
659
|
+
return `${prefix}: ${step.tool}(${args})`;
|
|
660
|
+
}
|
|
661
|
+
if (step.kind === "tool_result" && step.tool) {
|
|
662
|
+
const result = step.content ? truncateForVerbose(step.content) : "";
|
|
663
|
+
return `${prefix}: ${step.tool} -> ${result}`;
|
|
664
|
+
}
|
|
665
|
+
if (step.kind === "message" && step.content) {
|
|
666
|
+
return `${prefix}: ${truncateForVerbose(step.content)}`;
|
|
667
|
+
}
|
|
668
|
+
if (step.content) {
|
|
669
|
+
return `${prefix}: ${truncateForVerbose(step.content)}`;
|
|
670
|
+
}
|
|
671
|
+
return prefix;
|
|
672
|
+
}
|
|
673
|
+
function formatMetricDetail(label, detail) {
|
|
674
|
+
if (/^No .+ configured; skipping/.test(detail)) {
|
|
675
|
+
return `\u26A0\uFE0F ${label}: ${detail}`;
|
|
676
|
+
}
|
|
677
|
+
return `${label}: ${detail}`;
|
|
678
|
+
}
|
|
446
679
|
async function runSingleCommand(testCasePath, opts) {
|
|
447
680
|
const testCase = loadTestCase(testCasePath);
|
|
448
681
|
let agentConfig = {
|
|
@@ -459,6 +692,14 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
459
692
|
const adapter = new AiSdkAgentAdapter();
|
|
460
693
|
const db = initDb();
|
|
461
694
|
await saveTestCase(db, testCaseToDbRow(testCase));
|
|
695
|
+
await saveAgentConfig(db, {
|
|
696
|
+
id: agentConfig.id || agentConfig.name,
|
|
697
|
+
name: agentConfig.name,
|
|
698
|
+
model: agentConfig.model,
|
|
699
|
+
maxSteps: agentConfig.max_steps,
|
|
700
|
+
temperature: agentConfig.temperature,
|
|
701
|
+
createdAt: Math.floor(Date.now() / 1e3)
|
|
702
|
+
});
|
|
462
703
|
const runId = randomUUID();
|
|
463
704
|
try {
|
|
464
705
|
const result = await runSingle({
|
|
@@ -467,7 +708,10 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
467
708
|
adapter,
|
|
468
709
|
sandboxProvider,
|
|
469
710
|
db,
|
|
470
|
-
runId
|
|
711
|
+
runId,
|
|
712
|
+
onStep: opts.verbose ? (step) => {
|
|
713
|
+
console.log(formatVerboseStep(step));
|
|
714
|
+
} : void 0
|
|
471
715
|
});
|
|
472
716
|
console.log("\n================ RUN SUMMARY ================");
|
|
473
717
|
console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
|
|
@@ -478,13 +722,15 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
478
722
|
console.log(`Error: ${result.error}`);
|
|
479
723
|
}
|
|
480
724
|
if (result.metrics?.regression) {
|
|
481
|
-
console.log(
|
|
725
|
+
console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
|
|
482
726
|
}
|
|
483
727
|
if (result.metrics?.diff) {
|
|
484
728
|
console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
|
|
485
729
|
}
|
|
486
730
|
if (result.metrics?.localization) {
|
|
487
|
-
console.log(
|
|
731
|
+
console.log(
|
|
732
|
+
formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
|
|
733
|
+
);
|
|
488
734
|
}
|
|
489
735
|
console.log("=============================================\n");
|
|
490
736
|
} catch (err) {
|
|
@@ -493,28 +739,90 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
493
739
|
}
|
|
494
740
|
process.exit(0);
|
|
495
741
|
}
|
|
496
|
-
async function
|
|
497
|
-
const
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
742
|
+
async function traceCommand(runId, opts) {
|
|
743
|
+
const db = initDb();
|
|
744
|
+
const run = await getRun(db, runId);
|
|
745
|
+
if (!run) {
|
|
746
|
+
console.error(`Run not found: ${runId}`);
|
|
747
|
+
process.exit(1);
|
|
748
|
+
}
|
|
749
|
+
console.log(`Run ${run.id}`);
|
|
750
|
+
console.log(` test case: ${run.testCaseId}`);
|
|
751
|
+
console.log(` agent config: ${run.agentConfigId}`);
|
|
752
|
+
console.log(
|
|
753
|
+
` status: ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
|
|
754
|
+
);
|
|
755
|
+
console.log(` cost: $${run.costUsd.toFixed(4)}`);
|
|
756
|
+
console.log(` duration: ${run.durationMs}ms`);
|
|
757
|
+
if (run.error) console.log(` error: ${run.error}`);
|
|
758
|
+
if (opts.quality) {
|
|
759
|
+
printQualityBreakdown(run.metrics);
|
|
760
|
+
return;
|
|
761
|
+
}
|
|
762
|
+
const steps = await getTraces(db, runId);
|
|
763
|
+
console.log(`
|
|
764
|
+
${steps.length} step(s):`);
|
|
765
|
+
for (const step of steps) {
|
|
766
|
+
const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
|
|
767
|
+
console.log(
|
|
768
|
+
` [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
|
|
769
|
+
);
|
|
770
|
+
if (step.content) {
|
|
771
|
+
const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
|
|
772
|
+
console.log(` ${preview.replace(/\n/g, "\n ")}`);
|
|
508
773
|
}
|
|
509
774
|
}
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
775
|
+
}
|
|
776
|
+
function printQualityBreakdown(metricsJson) {
|
|
777
|
+
const metrics = metricsJson ? safeParseJson(metricsJson) : void 0;
|
|
778
|
+
console.log("\n================ QUALITY BREAKDOWN ================");
|
|
779
|
+
const staticQuality = metrics?.["static-quality"]?.quality;
|
|
780
|
+
const llmJudge = metrics?.["llm-judge"]?.quality;
|
|
781
|
+
const diff = metrics?.diff;
|
|
782
|
+
const localization = metrics?.localization;
|
|
783
|
+
if (staticQuality) {
|
|
784
|
+
console.log("Static quality (static-quality):");
|
|
785
|
+
if (staticQuality.diffLines !== void 0) console.log(` diff lines: ${staticQuality.diffLines}`);
|
|
786
|
+
if (staticQuality.filesModified !== void 0)
|
|
787
|
+
console.log(` files modified: ${staticQuality.filesModified}`);
|
|
788
|
+
if (staticQuality.todosIntroduced !== void 0)
|
|
789
|
+
console.log(` TODOs introduced: ${staticQuality.todosIntroduced}`);
|
|
790
|
+
if (staticQuality.linterViolations !== void 0)
|
|
791
|
+
console.log(` lint violations: ${staticQuality.linterViolations}`);
|
|
792
|
+
}
|
|
793
|
+
if (llmJudge) {
|
|
794
|
+
if (staticQuality) console.log("");
|
|
795
|
+
console.log("LLM judge (llm-judge):");
|
|
796
|
+
if (llmJudge.llmJudgeScore !== void 0)
|
|
797
|
+
console.log(` score: ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
|
|
798
|
+
if (llmJudge.llmJudgeDetail) console.log(` rationale: ${llmJudge.llmJudgeDetail}`);
|
|
799
|
+
}
|
|
800
|
+
if (diff) {
|
|
801
|
+
if (staticQuality || llmJudge) console.log("");
|
|
802
|
+
console.log(`Diff scope: ${diff.detail ?? JSON.stringify(diff)}`);
|
|
803
|
+
}
|
|
804
|
+
if (localization) {
|
|
805
|
+
console.log(`Localization: ${localization.detail ?? JSON.stringify(localization)}`);
|
|
806
|
+
}
|
|
807
|
+
if (!staticQuality && !llmJudge && !diff && !localization) {
|
|
808
|
+
console.log(" (no quality metrics recorded for this run)");
|
|
809
|
+
}
|
|
810
|
+
console.log("=====================================================\n");
|
|
811
|
+
}
|
|
812
|
+
function safeParseJson(value) {
|
|
813
|
+
try {
|
|
814
|
+
return JSON.parse(value);
|
|
815
|
+
} catch {
|
|
816
|
+
return void 0;
|
|
817
|
+
}
|
|
513
818
|
}
|
|
514
819
|
|
|
515
820
|
// src/index.ts
|
|
516
821
|
var cli = cac("agr");
|
|
517
|
-
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").
|
|
822
|
+
cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
|
|
823
|
+
"--verbose",
|
|
824
|
+
"Stream agent steps live to the console as they happen"
|
|
825
|
+
).action(async (testCase, options) => {
|
|
518
826
|
try {
|
|
519
827
|
await runSingleCommand(testCase, options);
|
|
520
828
|
} catch (err) {
|
|
@@ -522,16 +830,25 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
522
830
|
process.exit(1);
|
|
523
831
|
}
|
|
524
832
|
});
|
|
525
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).
|
|
526
|
-
|
|
527
|
-
|
|
833
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
834
|
+
"--matrix <matrix>",
|
|
835
|
+
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
836
|
+
).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
|
|
837
|
+
if (!options.configs && options.config) {
|
|
838
|
+
options.configs = options.config;
|
|
839
|
+
}
|
|
840
|
+
if (!options.suite || !options.configs && !options.matrix) {
|
|
841
|
+
console.error(
|
|
842
|
+
"Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
|
|
843
|
+
);
|
|
528
844
|
process.exit(1);
|
|
529
845
|
}
|
|
530
846
|
try {
|
|
531
847
|
await runBenchCommand({
|
|
532
848
|
configs: options.configs,
|
|
533
849
|
suite: options.suite,
|
|
534
|
-
concurrency: Number(options.concurrency)
|
|
850
|
+
concurrency: Number(options.concurrency),
|
|
851
|
+
matrix: options.matrix
|
|
535
852
|
});
|
|
536
853
|
} catch (err) {
|
|
537
854
|
console.error(`Error executing benchmark: ${err.message}`);
|
|
@@ -541,9 +858,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
|
|
|
541
858
|
cli.command(
|
|
542
859
|
"validate <testCase>",
|
|
543
860
|
"Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
|
|
544
|
-
).
|
|
861
|
+
).option(
|
|
862
|
+
"--strict",
|
|
863
|
+
"Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
|
|
864
|
+
).action(async (testCase, options) => {
|
|
545
865
|
try {
|
|
546
|
-
await validateCommand(testCase);
|
|
866
|
+
await validateCommand(testCase, options);
|
|
547
867
|
} catch (err) {
|
|
548
868
|
console.error(`Error executing validate: ${err.message}`);
|
|
549
869
|
process.exit(1);
|
|
@@ -552,7 +872,7 @@ cli.command(
|
|
|
552
872
|
cli.command(
|
|
553
873
|
"import-pr <repo> <prNumber>",
|
|
554
874
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
555
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
|
|
875
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
|
|
556
876
|
try {
|
|
557
877
|
await importPrCommand(repo, prNumber, options);
|
|
558
878
|
} catch (err) {
|
|
@@ -560,5 +880,27 @@ cli.command(
|
|
|
560
880
|
process.exit(1);
|
|
561
881
|
}
|
|
562
882
|
});
|
|
883
|
+
cli.command("trace <runId>", "Show the step trace and metrics for a single run").option(
|
|
884
|
+
"--quality",
|
|
885
|
+
"Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
|
|
886
|
+
).action(async (runId, options) => {
|
|
887
|
+
try {
|
|
888
|
+
await traceCommand(runId, options);
|
|
889
|
+
} catch (err) {
|
|
890
|
+
console.error(`Error executing trace: ${err.message}`);
|
|
891
|
+
process.exit(1);
|
|
892
|
+
}
|
|
893
|
+
});
|
|
563
894
|
cli.help();
|
|
564
|
-
|
|
895
|
+
try {
|
|
896
|
+
cli.parse();
|
|
897
|
+
} catch (err) {
|
|
898
|
+
if (err.name === "CACError") {
|
|
899
|
+
console.error(`
|
|
900
|
+
\u274C ${err.message}
|
|
901
|
+
`);
|
|
902
|
+
cli.outputHelp();
|
|
903
|
+
process.exit(1);
|
|
904
|
+
}
|
|
905
|
+
throw err;
|
|
906
|
+
}
|
package/package.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.6",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"bin": {
|
|
8
|
-
"agr": "
|
|
9
|
-
"agentgrader": "
|
|
8
|
+
"agr": "dist/index.js",
|
|
9
|
+
"agentgrader": "dist/index.js"
|
|
10
10
|
},
|
|
11
11
|
"main": "./dist/index.js",
|
|
12
12
|
"types": "./dist/index.d.ts",
|
|
@@ -19,14 +19,18 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^2.0.
|
|
23
|
-
"@agentgrader/core": "^1.1.
|
|
24
|
-
"@agentgrader/
|
|
25
|
-
"@agentgrader/
|
|
22
|
+
"@agentgrader/agent-openrouter": "^2.0.1",
|
|
23
|
+
"@agentgrader/core": "^1.1.3",
|
|
24
|
+
"@agentgrader/optimizer": "^0.1.0",
|
|
25
|
+
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
|
+
"@agentgrader/scorer-static": "^0.1.0",
|
|
27
|
+
"@agentgrader/store": "^1.0.3",
|
|
26
28
|
"cac": "^6.7.14",
|
|
29
|
+
"dotenv": "^17.4.2",
|
|
27
30
|
"ink": "^4.4.1",
|
|
28
31
|
"react": "^18.2.0",
|
|
29
|
-
"yaml": "^2.5.1"
|
|
32
|
+
"yaml": "^2.5.1",
|
|
33
|
+
"zod": "^3.23.8"
|
|
30
34
|
},
|
|
31
35
|
"devDependencies": {
|
|
32
36
|
"@types/react": "^18.2.0",
|