agentgrader 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +203 -8
- package/package.json +6 -6
package/dist/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import 'dotenv/config';
|
|
3
3
|
import { cac } from 'cac';
|
|
4
|
+
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
4
5
|
import { randomUUID } from 'crypto';
|
|
5
6
|
import { resolve, dirname, isAbsolute, basename } from 'path';
|
|
6
7
|
import { render, Box, Text } from 'ink';
|
|
7
|
-
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
8
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
9
9
|
import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
|
|
10
10
|
import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
@@ -16,6 +16,155 @@ import { parse, stringify } from 'yaml';
|
|
|
16
16
|
import { z, ZodError } from 'zod';
|
|
17
17
|
import { execFileSync } from 'child_process';
|
|
18
18
|
|
|
19
|
+
// Maximum number of characters of step content kept before a preview is cut.
var CONTENT_PREVIEW_MAX = 200;

// ANSI escape sequences used to colorize terminal output; `reset` ends a color.
var ANSI = {
  reset: "\x1B[0m",
  gray: "\x1B[90m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  blue: "\x1B[34m",
};
|
|
26
|
+
/**
 * Colorize `text` for terminal output.
 *
 * @param {string} text - Text to print.
 * @param {string} code - ANSI escape sequence to prefix the text with.
 * @returns {string} `text` unchanged when stdout is not a TTY; otherwise
 *   `code + text + ANSI.reset`.
 */
function paint(text, code) {
  const isTerminal = Boolean(process.stdout.isTTY);
  return isTerminal ? `${code}${text}${ANSI.reset}` : text;
}
|
|
30
|
+
/**
 * Shorten `content` to a preview unless the full text was requested.
 *
 * @param {string} content - Text to preview.
 * @param {boolean} full - When truthy, `content` is returned untouched.
 * @returns {string} `content` itself when `full` is set or it is at most
 *   CONTENT_PREVIEW_MAX characters long; otherwise its first
 *   CONTENT_PREVIEW_MAX characters followed by "...".
 */
function truncateContent(content, full) {
  if (full) {
    return content;
  }
  const withinLimit = content.length <= CONTENT_PREVIEW_MAX;
  return withinLimit ? content : `${content.slice(0, CONTENT_PREVIEW_MAX)}...`;
}
|
|
34
|
+
/**
 * Normalize step content for comparison.
 *
 * @param {string|null|undefined} content - Raw step content.
 * @returns {string} The content with surrounding whitespace trimmed; null or
 *   undefined content becomes the empty string.
 */
function normalizeContent(content) {
  const text = content ?? "";
  return text.trim();
}
|
|
37
|
+
/**
 * Build a one-line summary of a trace step.
 *
 * @param {{kind: string, tool?: string, content?: string}|null|undefined} step
 *   The step to describe; a missing step yields "(no step)".
 * @param {boolean} full - Passed to truncateContent; truthy disables truncation.
 * @returns {string} "kind" or "kind:tool", followed by a space and the
 *   flattened content preview when the step has content.
 */
function formatStepSummary(step, full) {
  if (!step) return "(no step)";
  const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
  if (!step.content) return label;
  // Flatten every line-break style (\r\n, \r, \n). A stray carriage return
  // would otherwise move the cursor to column 0 and overwrite the summary line.
  const singleLine = step.content.replace(/\r\n|\r|\n/g, " ");
  return `${label} ${truncateContent(singleLine, full)}`;
}
|
|
44
|
+
/**
 * Index trace steps by their `stepIndex`.
 *
 * @param {Iterable<{stepIndex: number}>} traces - Steps of a single run.
 * @returns {Map<number, object>} Map from stepIndex to step; when several
 *   steps share an index, the last one wins.
 */
function stepsByIndex(traces) {
  const entries = Array.from(traces, (entry) => [entry.stepIndex, entry]);
  return new Map(entries);
}
|
|
51
|
+
/**
 * Decide whether two steps at the same index differ.
 *
 * @param {{kind: string, tool?: string, content?: string}|undefined} a - Step from run A.
 * @param {{kind: string, tool?: string, content?: string}|undefined} b - Step from run B.
 * @returns {boolean} true when exactly one step is missing, or when kind,
 *   tool, or normalized content differ; false otherwise.
 */
function stepsDiverge(a, b) {
  // A gap present in BOTH runs (sparse or 1-based step indices) is not a
  // difference between them, so it must not be reported as a divergence.
  if (!a && !b) return false;
  if (!a || !b) return true;
  if (a.kind !== b.kind) return true;
  // A missing tool and an empty tool name are treated as the same thing.
  if ((a.tool ?? "") !== (b.tool ?? "")) return true;
  return normalizeContent(a.content) !== normalizeContent(b.content);
}
|
|
58
|
+
/**
 * Render a run's status together with its pass/fail verdict.
 *
 * @param {{status: string, passed?: boolean|null}} run - Run record.
 * @returns {string} The status, suffixed with " (passed)" or " (failed)" only
 *   when `run.passed` is strictly true or false.
 */
function formatRunStatus(run) {
  let verdict = "";
  if (run.passed === true) {
    verdict = " (passed)";
  } else if (run.passed === false) {
    verdict = " (failed)";
  }
  return `${run.status}${verdict}`;
}
|
|
62
|
+
/**
 * Print the metadata header for one side of a run comparison.
 *
 * @param {string} label - "A" selects the cyan "Run A" title; any other value
 *   selects the blue "Run B" title.
 * @param {object} run - Run record; reads id, testCaseId, agentConfigId,
 *   status/passed (via formatRunStatus), stepsCount, costUsd, durationMs, error.
 * @returns {void}
 */
function printRunHeader(label, run) {
  const isRunA = label === "A";
  const title = isRunA ? `Run A (${run.id})` : `Run B (${run.id})`;
  console.log(paint(title, isRunA ? ANSI.cyan : ANSI.blue));
  // NOTE(review): assumes costUsd can be absent (e.g. an unfinished or failed
  // run). Calling toFixed on a non-number would throw and abort the whole
  // comparison, so fall back to a placeholder instead.
  const cost = typeof run.costUsd === "number" ? `$${run.costUsd.toFixed(4)}` : "n/a";
  console.log(` test case: ${run.testCaseId}`);
  console.log(` agent config: ${run.agentConfigId}`);
  console.log(` status: ${formatRunStatus(run)}`);
  console.log(` steps: ${run.stepsCount}`);
  console.log(` cost: ${cost}`);
  console.log(` duration: ${run.durationMs}ms`);
  if (run.error) console.log(` error: ${run.error}`);
}
|
|
73
|
+
/**
 * Implementation of `compare <runIdA> <runIdB>`: print both run headers, a
 * step-by-step comparison of their traces, and a divergence summary.
 *
 * Exits the process with code 1 when either run cannot be found.
 *
 * @param {string} runIdA - Id of the first run.
 * @param {string} runIdB - Id of the second run.
 * @param {{full?: boolean, onlyDiff?: boolean}} opts - `full` disables content
 *   truncation; `onlyDiff` restricts output to divergent steps plus one step
 *   of context on each side.
 * @returns {Promise<void>}
 */
async function compareCommand(runIdA, runIdB, opts) {
  const db = initDb();
  const [runA, runB, tracesA, tracesB] = await Promise.all([
    getRun(db, runIdA),
    getRun(db, runIdB),
    getTraces(db, runIdA),
    getTraces(db, runIdB)
  ]);
  if (!runA) {
    console.error(`Run not found: ${runIdA}`);
    process.exit(1);
  }
  if (!runB) {
    console.error(`Run not found: ${runIdB}`);
    process.exit(1);
  }

  console.log("");
  printRunHeader("A", runA);
  console.log("");
  printRunHeader("B", runB);
  console.log("");
  if (runA.testCaseId !== runB.testCaseId) {
    console.log(
      paint(
        "\u26A0\uFE0F Comparing runs of different test cases - step alignment may not be meaningful.",
        ANSI.yellow
      )
    );
    console.log("");
  }

  const mapA = stepsByIndex(tracesA);
  const mapB = stepsByIndex(tracesB);
  // Fold instead of Math.max(...array): spreading a very long trace into call
  // arguments can exceed the engine's argument limit and throw RangeError.
  const highestIndex = (traces) =>
    traces.reduce((highest, step) => Math.max(highest, step.stepIndex), -1);
  const maxIndex = Math.max(highestIndex(tracesA), highestIndex(tracesB));
  if (maxIndex < 0) {
    console.log("No steps recorded for either run.");
    return;
  }

  // Filled in ascending order, so the Set's first element is the first divergence.
  const divergentIndices = /* @__PURE__ */ new Set();
  for (let i = 0; i <= maxIndex; i++) {
    if (stepsDiverge(mapA.get(i), mapB.get(i))) {
      divergentIndices.add(i);
    }
  }

  const visibleIndices = /* @__PURE__ */ new Set();
  if (opts.onlyDiff) {
    // Each divergent step is shown with one neighbor on each side for context.
    for (const idx of divergentIndices) {
      visibleIndices.add(idx);
      if (idx > 0) visibleIndices.add(idx - 1);
      if (idx < maxIndex) visibleIndices.add(idx + 1);
    }
  } else {
    for (let i = 0; i <= maxIndex; i++) visibleIndices.add(i);
  }
  const sortedVisible = [...visibleIndices].sort((a, b) => a - b);

  const full = opts.full ?? false;
  if (sortedVisible.length === 0) {
    console.log("No divergent steps (nothing to show with --only-diff).");
  } else {
    console.log("Step comparison:");
    for (const i of sortedVisible) {
      const stepA = mapA.get(i);
      const stepB = mapB.get(i);
      // Reuse the verdict computed above rather than comparing the steps again.
      if (divergentIndices.has(i)) {
        console.log(paint(`[step ${i}] DIVERGENT`, ANSI.yellow));
        console.log(` A: ${formatStepSummary(stepA, full)}`);
        console.log(` B: ${formatStepSummary(stepB, full)}`);
      } else {
        console.log(paint(`[step ${i}] (same)`, ANSI.gray));
        console.log(` ${formatStepSummary(stepA ?? stepB, full)}`);
      }
    }
  }

  const totalSteps = maxIndex + 1;
  const diffCount = divergentIndices.size;
  const firstDivergence = diffCount > 0 ? divergentIndices.values().next().value : null;
  console.log("");
  console.log(`${diffCount} of ${totalSteps} step(s) differ.`);
  if (firstDivergence !== null) {
    console.log(`First divergence at step ${firstDivergence}.`);
  } else {
    console.log("No divergence detected.");
  }
  console.log("");
}
|
|
19
168
|
var CONFIG_COL_WIDTH = 24;
|
|
20
169
|
var CONFIG_LABEL_MAX = 20;
|
|
21
170
|
function truncateLabel(name, max = CONFIG_LABEL_MAX) {
|
|
@@ -327,6 +476,9 @@ function loadTestCase(yamlPath) {
|
|
|
327
476
|
(toolkit) => isAbsolute(toolkit) ? toolkit : resolve(dir, toolkit)
|
|
328
477
|
);
|
|
329
478
|
}
|
|
479
|
+
if (testCase.agent_config) {
|
|
480
|
+
testCase.agent_config = isAbsolute(testCase.agent_config) ? testCase.agent_config : resolve(dir, testCase.agent_config);
|
|
481
|
+
}
|
|
330
482
|
if (testCase.solution && looksLikeFilePath(testCase.solution)) {
|
|
331
483
|
testCase.solution = readPatchFile(dir, testCase.solution);
|
|
332
484
|
}
|
|
@@ -335,6 +487,26 @@ function loadTestCase(yamlPath) {
|
|
|
335
487
|
}
|
|
336
488
|
return testCase;
|
|
337
489
|
}
|
|
490
|
+
/**
 * Determine the single agent_config shared by every test case of a suite.
 *
 * @param {Array<{agent_config?: string}>} testCases - Loaded test cases.
 * @returns {string} The agent_config value common to all test cases.
 * @throws {Error} When no test cases are given, when any test case lacks an
 *   agent_config, or when the test cases name more than one distinct value.
 */
function resolveSharedAgentConfigFromTestCases(testCases) {
  if (testCases.length === 0) {
    throw new Error("No test cases loaded.");
  }

  const configs = [];
  testCases.forEach((testCase) => {
    const configPath = testCase.agent_config;
    if (!configPath) {
      throw new Error(
        "Either --configs, --configs-dir, --matrix, or --manifest must be provided, or every test case in the suite must define the same agent_config in agr.yaml."
      );
    }
    if (!configs.includes(configPath)) {
      configs.push(configPath);
    }
  });

  if (configs.length > 1) {
    throw new Error(
      `Multiple agent_config values found across test cases (${configs.join(", ")}). Use --configs, --configs-dir, or --matrix to specify agent configs explicitly.`
    );
  }
  return configs[0];
}
|
|
338
510
|
function looksLikeFilePath(value) {
|
|
339
511
|
const trimmed = value.trimStart();
|
|
340
512
|
if (trimmed.startsWith("diff ") || trimmed.startsWith("---") || trimmed.startsWith("***")) {
|
|
@@ -393,7 +565,7 @@ function findTestCaseYamlFiles(dir) {
|
|
|
393
565
|
async function runBenchCommand(opts) {
|
|
394
566
|
let suiteDir;
|
|
395
567
|
let concurrency = opts.concurrency ?? 2;
|
|
396
|
-
let agentConfigs;
|
|
568
|
+
let agentConfigs = [];
|
|
397
569
|
let matrixId;
|
|
398
570
|
if (opts.manifest) {
|
|
399
571
|
const manifestPath = resolve(opts.manifest);
|
|
@@ -425,7 +597,7 @@ async function runBenchCommand(opts) {
|
|
|
425
597
|
console.log(
|
|
426
598
|
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
427
599
|
);
|
|
428
|
-
} else {
|
|
600
|
+
} else if (opts.configs || opts.configsDir) {
|
|
429
601
|
const configPaths = resolveAgentConfigPathList({
|
|
430
602
|
commaSeparated: opts.configs,
|
|
431
603
|
dir: opts.configsDir
|
|
@@ -436,9 +608,6 @@ async function runBenchCommand(opts) {
|
|
|
436
608
|
}
|
|
437
609
|
}
|
|
438
610
|
}
|
|
439
|
-
if (agentConfigs.length === 0) {
|
|
440
|
-
throw new Error("No agent configs to benchmark.");
|
|
441
|
-
}
|
|
442
611
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
443
612
|
if (yamlFiles.length === 0) {
|
|
444
613
|
console.error(`No test cases found in suite directory: ${suiteDir}`);
|
|
@@ -448,6 +617,16 @@ async function runBenchCommand(opts) {
|
|
|
448
617
|
for (const f of yamlFiles) {
|
|
449
618
|
testCases.push(loadTestCase(f));
|
|
450
619
|
}
|
|
620
|
+
if (agentConfigs.length === 0) {
|
|
621
|
+
const sharedAgentConfig = resolveSharedAgentConfigFromTestCases(testCases);
|
|
622
|
+
const configPaths = resolveAgentConfigPathList({
|
|
623
|
+
explicitPaths: [sharedAgentConfig]
|
|
624
|
+
});
|
|
625
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
626
|
+
console.log(
|
|
627
|
+
`Using shared agent_config from agr.yaml: ${sharedAgentConfig} (${agentConfigs.length} config).`
|
|
628
|
+
);
|
|
629
|
+
}
|
|
451
630
|
const db = initDb();
|
|
452
631
|
for (const tc of testCases) {
|
|
453
632
|
await saveTestCase(db, testCaseToDbRow(tc));
|
|
@@ -885,6 +1064,11 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
885
1064
|
};
|
|
886
1065
|
if (opts.config) {
|
|
887
1066
|
agentConfig = loadAgentConfig(opts.config);
|
|
1067
|
+
} else if (testCase.agent_config) {
|
|
1068
|
+
agentConfig = loadAgentConfig(testCase.agent_config);
|
|
1069
|
+
console.log(
|
|
1070
|
+
`Using agent config from agr.yaml: ${testCase.agent_config} (model: ${agentConfig.model})`
|
|
1071
|
+
);
|
|
888
1072
|
}
|
|
889
1073
|
console.log(`Starting run for "${testCase.name}" using model "${agentConfig.model}"...`);
|
|
890
1074
|
const sandboxProvider = new DockerSandboxProvider();
|
|
@@ -1055,9 +1239,9 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
|
|
|
1055
1239
|
);
|
|
1056
1240
|
process.exit(1);
|
|
1057
1241
|
}
|
|
1058
|
-
} else if (!options.suite
|
|
1242
|
+
} else if (!options.suite) {
|
|
1059
1243
|
console.error(
|
|
1060
|
-
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or
|
|
1244
|
+
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, --matrix, or a shared agent_config in every agr.yaml."
|
|
1061
1245
|
);
|
|
1062
1246
|
process.exit(1);
|
|
1063
1247
|
} else if (agentSourceCount > 1) {
|
|
@@ -1119,6 +1303,17 @@ cli.command("trace <runId>", "Show the step trace and metrics for a single run")
|
|
|
1119
1303
|
process.exit(1);
|
|
1120
1304
|
}
|
|
1121
1305
|
});
|
|
1306
|
+
// Register `compare <runIdA> <runIdB>`: delegates to compareCommand and turns
// any thrown error into a one-line message plus exit code 1.
cli
  .command("compare <runIdA> <runIdB>", "Compare the step traces of two runs side by side")
  .option("--full", "Print full step content without truncation")
  .option(
    "--only-diff",
    "Show only divergent steps plus one step of context before and after each"
  )
  .example("agr compare <runIdA> <runIdB> --only-diff")
  .action(async (runIdA, runIdB, flags) => {
    try {
      await compareCommand(runIdA, runIdB, flags);
    } catch (error) {
      console.error(`Error executing compare: ${error.message}`);
      process.exit(1);
    }
  });
|
|
1122
1317
|
cli.help();
|
|
1123
1318
|
try {
|
|
1124
1319
|
cli.parse();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -19,11 +19,11 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^
|
|
23
|
-
"@agentgrader/core": "^1.
|
|
24
|
-
"@agentgrader/optimizer": "^0.
|
|
25
|
-
"@agentgrader/sandbox-docker": "^
|
|
26
|
-
"@agentgrader/scorer-static": "^
|
|
22
|
+
"@agentgrader/agent-openrouter": "^3.0.0",
|
|
23
|
+
"@agentgrader/core": "^1.2.0",
|
|
24
|
+
"@agentgrader/optimizer": "^1.0.0",
|
|
25
|
+
"@agentgrader/sandbox-docker": "^3.0.0",
|
|
26
|
+
"@agentgrader/scorer-static": "^1.0.0",
|
|
27
27
|
"@agentgrader/store": "^1.0.3",
|
|
28
28
|
"cac": "^6.7.14",
|
|
29
29
|
"dotenv": "^17.4.2",
|