agentgrader 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+
2
+ export { }
package/dist/index.js ADDED
@@ -0,0 +1,564 @@
1
+ #!/usr/bin/env node
2
+ import { cac } from 'cac';
3
+ import { resolve, dirname, isAbsolute } from 'path';
4
+ import { render, Box, Text } from 'ink';
5
+ import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
6
+ import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
7
+ import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
8
+ import { OpenRouterAgentAdapter } from '@agentgrader/agent-openrouter';
9
+ import { jsx, jsxs } from 'react/jsx-runtime';
10
+ import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
11
+ import { stringify, parse } from 'yaml';
12
+ import { randomUUID } from 'crypto';
13
+
14
/**
 * Ink component rendering the benchmark dashboard: a banner, aggregate
 * progress/cost counters, the list of currently running runs, a
 * test-case x config result matrix, and (once finished) a final summary.
 *
 * @param {object} props
 * @param {object} props.runs - Run records; each matrix cell is looked up as
 *   `runs[`${testCase}_${config}`]`. Records are read for `status`, `passed`,
 *   `costUsd`, `stepsCount`, `durationMs`, `testCaseId`, `agentConfigId`, `runId`.
 * @param {string[]} props.testCases - Row labels (also the first half of the run key).
 * @param {string[]} props.configs - Column labels (also the second half of the run key).
 * @param {boolean} props.isFinished - When true, hides "Active Runs" and shows the summary box.
 * @returns The Ink element tree for the dashboard.
 */
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
  // Run records can be rendered before their numeric fields are populated;
  // the totals below already tolerated that, so every numeric read goes
  // through the same guard instead of calling `.toFixed` on `undefined`.
  const finiteOrZero = (value) => (Number.isFinite(value) ? value : 0);

  const allRuns = Object.values(runs);
  let totalCost = 0;
  let totalSteps = 0;
  let passedCount = 0;
  let completedCount = 0;
  for (const run of allRuns) {
    totalCost += finiteOrZero(run.costUsd);
    totalSteps += finiteOrZero(run.stepsCount);
    if (run.status === "completed" || run.status === "failed") {
      completedCount++;
      if (run.passed) passedCount++;
    }
  }

  const totalRuns = testCases.length * configs.length;
  const activeRuns = allRuns.filter((run) => run.status === "running");
  // An empty matrix would otherwise print "NaN%" / "NaN" in the summary.
  const passRate = totalRuns > 0 ? (passedCount / totalRuns * 100).toFixed(0) : "0";
  const averageSteps = totalRuns > 0 ? (totalSteps / totalRuns).toFixed(1) : "0.0";

  // Renders one cell of the result matrix for the given row/column labels.
  const renderCell = (tc, cfg) => {
    const run = runs[`${tc}_${cfg}`];
    if (!run) {
      return jsx(Box, { width: 20, children: jsx(Text, { color: "gray", children: "queued" }) }, cfg);
    }
    if (run.status === "running") {
      return jsx(Box, { width: 20, children: jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
    }
    const failed = run.status === "failed" || !run.passed;
    const seconds = (finiteOrZero(run.durationMs) / 1e3).toFixed(1);
    return jsx(Box, { width: 20, children: jsxs(Text, { color: failed ? "red" : "green", children: [
      failed ? "\u2717 " : "\u2713 ",
      seconds,
      "s ($",
      finiteOrZero(run.costUsd).toFixed(3),
      ")"
    ] }) }, cfg);
  };

  return jsxs(Box, { flexDirection: "column", padding: 1, children: [
    // Banner
    jsxs(Box, { borderStyle: "round", borderColor: "cyan", paddingX: 2, marginBottom: 1, flexDirection: "column", children: [
      jsx(Text, { color: "cyan", bold: true, children: "\u{1F525} AGENTGRADER BENCHMARK \u{1F525}" }),
      jsx(Text, { color: "gray", children: "Docker Sandboxes \u2022 Parallel Execution \u2022 Mastra Orchestration" })
    ] }),
    // Aggregate counters
    jsxs(Box, { marginBottom: 1, children: [
      jsx(Text, { bold: true, children: "Progress: " }),
      jsxs(Text, { color: "yellow", children: [
        completedCount,
        " / ",
        totalRuns,
        " completed"
      ] }),
      jsx(Text, { children: " | " }),
      jsx(Text, { bold: true, children: "Passed: " }),
      jsxs(Text, { color: "green", children: [
        passedCount,
        " / ",
        completedCount,
        " solves"
      ] }),
      jsx(Text, { children: " | " }),
      jsx(Text, { bold: true, children: "Total Cost: " }),
      jsxs(Text, { color: "magenta", children: [
        "$",
        totalCost.toFixed(4)
      ] })
    ] }),
    // Currently running runs (hidden once the benchmark is finished)
    !isFinished && jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      jsx(Text, { bold: true, underline: true, color: "yellow", children: "Active Runs:" }),
      activeRuns.map((r) => jsxs(Box, { marginLeft: 2, children: [
        jsx(Text, { color: "yellow", children: "\u25CF" }),
        jsxs(Text, { bold: true, children: [
          " ",
          r.testCaseId
        ] }),
        jsx(Text, { color: "gray", children: " with " }),
        jsx(Text, { color: "blue", children: r.agentConfigId }),
        jsxs(Text, { color: "gray", children: [
          " (Steps: ",
          finiteOrZero(r.stepsCount),
          ", Cost: $",
          finiteOrZero(r.costUsd).toFixed(4),
          ")"
        ] })
      ] }, r.runId)),
      activeRuns.length === 0 && jsx(Box, { marginLeft: 2, children: jsx(Text, { color: "gray", children: "No active runs (waiting or queuing)..." }) })
    ] }),
    // Result matrix: one row per test case, one column per config
    jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
      jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
        jsx(Box, { width: 25, children: jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
        configs.map((cfg) => jsx(Box, { width: 20, children: jsx(Text, { bold: true, color: "blue", children: cfg }) }, cfg))
      ] }),
      testCases.map((tc) => jsxs(Box, { flexDirection: "row", children: [
        jsx(Box, { width: 25, children: jsx(Text, { children: tc }) }),
        configs.map((cfg) => renderCell(tc, cfg))
      ] }, tc))
    ] }),
    // Final summary
    isFinished && jsxs(Box, { marginTop: 1, flexDirection: "column", borderStyle: "double", borderColor: "green", padding: 1, children: [
      jsx(Text, { color: "green", bold: true, children: "Benchmark finished successfully!" }),
      jsxs(Text, { children: [
        "Total runs executed: ",
        totalRuns
      ] }),
      jsxs(Text, { children: [
        "Successful solves: ",
        passedCount,
        " (",
        passRate,
        "%)"
      ] }),
      jsxs(Text, { children: [
        "Total model API cost: $",
        totalCost.toFixed(4)
      ] }),
      jsxs(Text, { children: [
        "Average steps per run: ",
        averageSteps
      ] })
    ] })
  ] });
};
137
/**
 * Loads an agent configuration from a YAML file.
 *
 * The file is read as UTF-8, parsed as YAML and validated through
 * `AgentConfigSchema.parse`. Two normalizations are then applied in place:
 * a falsy `id` falls back to `name`, and every relative `toolkits` entry is
 * resolved against the directory containing the YAML file.
 *
 * @param {string} yamlPath - Path to the YAML file (resolved against the cwd).
 * @returns The validated and normalized agent config object.
 * @throws Whatever `readFileSync`, the YAML parser or the schema throw.
 */
function loadAgentConfig(yamlPath) {
  const absolutePath = resolve(yamlPath);
  const baseDir = dirname(absolutePath);
  const yamlText = readFileSync(absolutePath, "utf-8");
  const agentConfig = AgentConfigSchema.parse(parse(yamlText));

  if (!agentConfig.id) {
    agentConfig.id = agentConfig.name;
  }

  if (agentConfig.toolkits) {
    const toAbsolute = (entry) => {
      if (isAbsolute(entry)) {
        return entry;
      }
      return resolve(baseDir, entry);
    };
    agentConfig.toolkits = agentConfig.toolkits.map(toAbsolute);
  }

  return agentConfig;
}
151
/**
 * Loads a test case definition from a YAML file.
 *
 * Steps, in order:
 *  1. Read and YAML-parse the file.
 *  2. If the raw document has a string `fixture` that is neither an absolute
 *     path nor starts with "http", resolve it against the YAML file's
 *     directory. This happens before validation, so the schema sees the
 *     resolved path.
 *  3. Validate with `TestCaseSchema.parse`.
 *  4. Default a falsy `id` to `name`, resolve relative `toolkits` entries, and
 *     replace `solution` / `test_patch` with file contents when their value
 *     looks like a file path rather than inline patch text.
 *
 * @param {string} yamlPath - Path to the YAML file (resolved against the cwd).
 * @returns The validated and normalized test case object.
 * @throws Whatever `readFileSync`, the YAML parser, the schema, or
 *   `readPatchFile` throw.
 */
function loadTestCase(yamlPath) {
  const path = resolve(yamlPath);
  const dir = dirname(path);
  const raw = parse(readFileSync(path, "utf-8"));

  // An empty YAML document parses to null; skip the pre-processing in that
  // case (and for non-string fixtures) so the schema reports the problem
  // instead of this function crashing with a TypeError.
  if (raw !== null && typeof raw === "object" && typeof raw.fixture === "string") {
    // `isAbsolute` (rather than a leading "/") matches how toolkits are
    // handled below and also recognises drive-letter absolute paths.
    const isRemote = raw.fixture.startsWith("http");
    if (raw.fixture && !isAbsolute(raw.fixture) && !isRemote) {
      raw.fixture = resolve(dir, raw.fixture);
    }
  }

  const testCase = TestCaseSchema.parse(raw);
  testCase.id = testCase.id || testCase.name;

  if (testCase.toolkits) {
    testCase.toolkits = testCase.toolkits.map(
      (toolkit) => (isAbsolute(toolkit) ? toolkit : resolve(dir, toolkit))
    );
  }

  // `solution` and `test_patch` may hold either inline patch text or a path
  // to a patch file; paths are replaced by the file's contents.
  if (testCase.solution && looksLikeFilePath(testCase.solution)) {
    testCase.solution = readPatchFile(dir, testCase.solution);
  }
  if (testCase.test_patch && looksLikeFilePath(testCase.test_patch)) {
    testCase.test_patch = readPatchFile(dir, testCase.test_patch);
  }

  return testCase;
}
174
/**
 * Heuristic that decides whether a `solution` / `test_patch` value is a path
 * to a patch file rather than inline patch text.
 *
 * A value is treated as inline patch text when, after leading whitespace is
 * skipped, it starts with "diff ", "---" or "***", or when it contains a
 * newline anywhere. Everything else counts as a file path.
 *
 * @param {string} value - The raw field value.
 * @returns {boolean} True when the value should be read from disk.
 */
function looksLikeFilePath(value) {
  const patchMarkers = ["diff ", "---", "***"];
  const head = value.trimStart();
  const startsLikePatch = patchMarkers.some((marker) => head.startsWith(marker));
  const isSingleLine = !value.includes("\n");
  return !startsLikePatch && isSingleLine;
}
181
/**
 * Reads a patch file as UTF-8 text.
 *
 * @param {string} dir - Base directory used when `relPath` is relative.
 * @param {string} relPath - Absolute path, or path relative to `dir`.
 * @returns {string} The file contents.
 * @throws {Error} With a message naming both the given and the resolved path;
 *   the underlying filesystem error is attached as `cause` so its `code`
 *   (e.g. ENOENT) and stack are not lost.
 */
function readPatchFile(dir, relPath) {
  const full = isAbsolute(relPath) ? relPath : resolve(dir, relPath);
  try {
    return readFileSync(full, "utf-8");
  } catch (err) {
    throw new Error(
      `Failed to read patch file "${relPath}" (resolved to ${full}): ${err.message}`,
      { cause: err }
    );
  }
}
189
/**
 * Converts a loaded test case into the flat row object passed to
 * `saveTestCase`.
 *
 * Snake_case fields are mapped to camelCase columns. `success` is always
 * JSON-encoded; the optional list fields are JSON-encoded when truthy and
 * stored as null otherwise; optional scalar fields fall back to null when
 * nullish. `createdAt` is the current time in whole seconds.
 *
 * @param {object} testCase - The test case to convert.
 * @returns {object} The row object.
 */
function testCaseToDbRow(testCase) {
  const jsonOrNull = (value) => {
    if (value) {
      return JSON.stringify(value);
    }
    return null;
  };
  const nowSeconds = Math.floor(Date.now() / 1e3);
  const rowId = testCase.id || testCase.name;

  return {
    id: rowId,
    name: testCase.name,
    description: testCase.description,
    fixture: testCase.fixture,
    prompt: testCase.prompt,
    success: JSON.stringify(testCase.success),
    timeoutSeconds: testCase.timeout_seconds,
    createdAt: nowSeconds,
    tags: jsonOrNull(testCase.tags),
    testCommand: testCase.test_command ?? null,
    failToPass: jsonOrNull(testCase.fail_to_pass),
    passToPass: jsonOrNull(testCase.pass_to_pass),
    forbidModified: jsonOrNull(testCase.forbid_modified),
    expectedFiles: jsonOrNull(testCase.expected_files),
    solution: testCase.solution ?? null,
    testPatch: testCase.test_patch ?? null,
    sourceCreatedAt: testCase.created_at ?? null
  };
}
210
/**
 * Recursively collects test-case YAML files under `dir`.
 *
 * A file is collected when it is named "agr.yaml", or ends with ".yaml" and
 * its name does not contain "config". Directories named "fixture" or
 * "node_modules", and directories whose name starts with ".", are not
 * descended into.
 *
 * Discovery is best-effort: an unreadable directory or entry is reported with
 * a warning and skipped, without discarding the remaining entries. Entries
 * are visited in sorted name order so the result does not depend on the
 * filesystem's listing order.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Absolute paths of the matching files.
 */
function findTestCaseYamlFiles(dir) {
  let entries;
  try {
    entries = readdirSync(dir).sort();
  } catch (err) {
    console.warn(`Skipping unreadable directory "${dir}": ${err.message}`);
    return [];
  }

  const files = [];
  for (const entry of entries) {
    const fullPath = resolve(dir, entry);

    // Stat each entry on its own: one failure (e.g. a dangling symlink) must
    // not abort the scan of the entries that follow it.
    let isDirectory;
    try {
      isDirectory = statSync(fullPath).isDirectory();
    } catch (err) {
      console.warn(`Skipping unreadable entry "${fullPath}": ${err.message}`);
      continue;
    }

    if (isDirectory) {
      const isExcluded = entry === "fixture" || entry === "node_modules" || entry.startsWith(".");
      if (!isExcluded) {
        for (const nested of findTestCaseYamlFiles(fullPath)) {
          files.push(nested);
        }
      }
    } else if (entry === "agr.yaml" || (entry.endsWith(".yaml") && !entry.includes("config"))) {
      files.push(fullPath);
    }
  }
  return files;
}
229
/**
 * Runs the benchmark matrix (every test case in a suite x every agent config)
 * while rendering a live Ink dashboard, then prints the per-tag breakdown and
 * exits the process.
 *
 * @param {object} opts
 * @param {string} opts.suite - Directory scanned for test-case YAML files.
 * @param {string} opts.configs - Comma-separated paths to agent-config YAML files.
 * @param {number} [opts.concurrency] - Parallelism passed to `runBenchmark`;
 *   anything that is not a positive integer falls back to 2.
 * @returns {Promise<void>} Does not normally return: the process exits with
 *   code 0 when `runBenchmark` resolved, 1 when it threw or when no test
 *   cases were found.
 * @throws {Error} When `--configs` contains no paths, or when loading a
 *   config / test case or saving it to the store fails.
 */
async function runBenchCommand(opts) {
  const suiteDir = resolve(opts.suite);

  // Drop empty segments ("a.yaml," or "a.yaml,,b.yaml"); an empty string
  // would otherwise resolve to the current directory and fail as a file read.
  const configPaths = opts.configs
    .split(",")
    .map((c) => c.trim())
    .filter((c) => c.length > 0)
    .map((c) => resolve(c));
  if (configPaths.length === 0) {
    throw new Error("No agent config paths were provided in --configs.");
  }

  const concurrency = Number.isInteger(opts.concurrency) && opts.concurrency > 0 ? opts.concurrency : 2;
  const agentConfigs = configPaths.map((p) => loadAgentConfig(p));

  const yamlFiles = findTestCaseYamlFiles(suiteDir);
  if (yamlFiles.length === 0) {
    console.error(`No test cases found in suite directory: ${opts.suite}`);
    process.exit(1);
  }
  const testCases = yamlFiles.map((f) => loadTestCase(f));

  // Persist the inputs before any run starts.
  const db = initDb();
  for (const tc of testCases) {
    await saveTestCase(db, testCaseToDbRow(tc));
  }
  for (const ac of agentConfigs) {
    await saveAgentConfig(db, {
      id: ac.id || ac.name,
      name: ac.name,
      model: ac.model,
      maxSteps: ac.max_steps,
      temperature: ac.temperature,
      systemPrompt: ac.system_prompt,
      tools: JSON.stringify(ac.tools),
      createdAt: Math.floor(Date.now() / 1e3)
    });
  }

  const sandboxProvider = new DockerSandboxProvider();
  const adapter = new OpenRouterAgentAdapter();

  // Latest known state of each run, keyed the same way Dashboard looks it up.
  const runStates = {};
  const testCaseIds = testCases.map((tc) => tc.id || tc.name);
  const configIds = agentConfigs.map((ac) => ac.id || ac.name);

  // Builds the dashboard element for the current run states.
  const dashboard = (isFinished) => jsx(Dashboard, {
    runs: runStates,
    testCases: testCaseIds,
    configs: configIds,
    isFinished
  });

  const { rerender, unmount, waitUntilExit } = render(dashboard(false));

  const onRunUpdate = (run) => {
    runStates[`${run.testCaseId}_${run.agentConfigId}`] = run;
    rerender(dashboard(false));
  };

  let benchmarkFailed = false;
  try {
    await runBenchmark({
      testCases,
      agentConfigs,
      adapter,
      sandboxProvider,
      db,
      concurrency,
      onRunUpdate
    });
  } catch (err) {
    benchmarkFailed = true;
    console.error("Benchmark runner encountered an error:", err);
  }

  // Render the final frame, then unmount and wait for Ink to finish before
  // printing plain console output and exiting.
  rerender(dashboard(true));
  unmount();
  await waitUntilExit();

  printTagBreakdown(testCases, agentConfigs, runStates);
  // A runner error must be visible to callers (CI, scripts) via the exit code.
  process.exit(benchmarkFailed ? 1 : 0);
}
317
/**
 * Prints a pass-rate table grouped by test-case tag.
 *
 * Every (test case, agent config) pair whose run exists in `runStates` and is
 * not still "running" counts once toward each of the test case's tags.
 * Nothing is printed when no tag collected any run.
 *
 * @param {object[]} testCases - Test cases; read for `id`, `name`, `tags`.
 * @param {object[]} agentConfigs - Agent configs; read for `id`, `name`.
 * @param {object} runStates - Run records keyed by
 *   `${testCaseId}_${agentConfigId}`; read for `status` and `passed`.
 * @returns {void}
 */
function printTagBreakdown(testCases, agentConfigs, runStates) {
  // A Map, not a plain object: tags are arbitrary user strings, and a tag
  // such as "constructor" or "toString" would otherwise hit an inherited
  // Object.prototype property and never be recorded.
  const tagStats = new Map();

  for (const tc of testCases) {
    const caseTags = tc.tags ?? [];
    if (caseTags.length === 0) continue;

    for (const ac of agentConfigs) {
      const run = runStates[`${tc.id || tc.name}_${ac.id || ac.name}`];
      if (!run || run.status === "running") continue;

      for (const tag of caseTags) {
        const tagName = String(tag);
        let stats = tagStats.get(tagName);
        if (!stats) {
          stats = { passed: 0, total: 0 };
          tagStats.set(tagName, stats);
        }
        stats.total++;
        if (run.passed) stats.passed++;
      }
    }
  }

  const tags = [...tagStats.keys()].sort();
  if (tags.length === 0) return;

  console.log("\n================ TAG BREAKDOWN ================");
  for (const tag of tags) {
    const { passed, total } = tagStats.get(tag);
    const pct = total > 0 ? (passed / total * 100).toFixed(0) : "0";
    console.log(`${tag.padEnd(24)} ${passed}/${total} (${pct}%)`);
  }
  console.log("=================================================\n");
}
343
// Case-insensitive matcher for paths that splitDiff classifies as test files:
// any path containing a segment named test, tests, spec, specs or __tests__,
// or any path ending in .test.* / .spec.* with a js, jsx, ts or tsx extension.
+ var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
344
/**
 * Scaffolds a test case from a GitHub pull request.
 *
 * Fetches the PR metadata and its diff from the GitHub REST API, splits the
 * diff into a solution patch and a test patch via `splitDiff`, and writes
 * `agr.yaml` (plus `solution.patch` / `test_patch.patch` when non-empty) into
 * the output directory. A `GITHUB_TOKEN` environment variable, when set, is
 * sent as a Bearer token.
 *
 * @param {string} repo - Repository in "owner/repo" form.
 * @param {string|number} prNumber - Pull request number (positive integer).
 * @param {object} opts
 * @param {string} [opts.out] - Output directory; defaults to
 *   `./imported/<repo>-pr-<number>`.
 * @returns {Promise<void>}
 * @throws {Error} On an invalid repo or PR number, or when either GitHub
 *   request returns a non-OK status.
 */
async function importPrCommand(repo, prNumber, opts) {
  // Require exactly "owner/repo": extra segments used to be silently ignored.
  const repoParts = String(repo).split("/");
  const [owner, repoName] = repoParts;
  if (repoParts.length !== 2 || !owner || !repoName) {
    throw new Error(`Invalid repo "${repo}" - expected format "owner/repo".`);
  }

  // The PR number is interpolated into the API URL, so reject anything that
  // is not a positive integer up front.
  const prId = Number(prNumber);
  if (!Number.isInteger(prId) || prId <= 0) {
    throw new Error(`Invalid PR number "${prNumber}" - expected a positive integer.`);
  }

  const headers = {
    Accept: "application/vnd.github+json",
    "User-Agent": "agentgrader-import-pr"
  };
  if (process.env.GITHUB_TOKEN) {
    headers.Authorization = `Bearer ${process.env.GITHUB_TOKEN}`;
  }

  const apiUrl = `https://api.github.com/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repoName)}/pulls/${prId}`;

  console.log(`Fetching PR #${prId} from ${owner}/${repoName}...`);
  const prRes = await fetch(apiUrl, { headers });
  if (!prRes.ok) {
    throw new Error(`Failed to fetch PR metadata: ${prRes.status} ${prRes.statusText}`);
  }
  const pr = await prRes.json();

  // Same endpoint, different Accept header: returns the PR as a unified diff.
  console.log("Fetching PR diff...");
  const diffRes = await fetch(apiUrl, {
    headers: { ...headers, Accept: "application/vnd.github.v3.diff" }
  });
  if (!diffRes.ok) {
    throw new Error(`Failed to fetch PR diff: ${diffRes.status} ${diffRes.statusText}`);
  }
  const fullDiff = await diffRes.text();

  const { solutionDiff, testDiff, expectedFiles, forbidModified } = splitDiff(fullDiff);
  const hasSolution = solutionDiff.trim().length > 0;
  const hasTests = testDiff.trim().length > 0;

  const slug = `${repoName}-pr-${pr.number}`;
  const outDir = resolve(opts.out || `./imported/${slug}`);
  mkdirSync(outDir, { recursive: true });
  if (hasSolution) {
    writeFileSync(resolve(outDir, "solution.patch"), solutionDiff);
  }
  if (hasTests) {
    writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
  }

  const yamlDoc = {
    name: slug,
    description: pr.title,
    fixture: "./fixture",
    prompt: buildPrompt(pr),
    success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
    timeout_seconds: 600,
    tags: ["imported", repoName],
    created_at: pr.created_at,
    // TODO: fill these in after setting up ./fixture (checked out at
    // base.sha below) and running the test suite to discover real test names.
    test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
    fail_to_pass: ["<TODO: fill in via `agr validate`>"],
    pass_to_pass: ["<TODO: fill in via `agr validate`>"]
  };
  if (hasSolution) yamlDoc.solution = "./solution.patch";
  if (hasTests) yamlDoc.test_patch = "./test_patch.patch";
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
  writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));

  console.log(`\nImported PR #${pr.number}: "${pr.title}"`);
  console.log(`Wrote scaffold to: ${outDir}`);
  console.log(" - agr.yaml");
  if (hasSolution) {
    console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
  }
  if (hasTests) {
    console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
  }
  console.log("\nNext steps:");
  console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
  console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
  console.log(
    ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
  );
}
416
/**
 * Builds the agent prompt for an imported pull request.
 *
 * @param {object} pr - Pull request object; `title` and `body` are read.
 * @returns {string} The title alone when the body is missing or blank,
 *   otherwise the title, a blank line, and the trimmed body.
 */
function buildPrompt(pr) {
  const description = (pr.body || "").trim();
  if (!description) {
    return pr.title;
  }
  return `${pr.title}\n\n${description}`;
}
422
/**
 * Splits a multi-file unified diff into a solution part and a test part.
 *
 * The diff is cut at every line that starts with "diff --git ". For each
 * piece, the path after "b/" in that header line is taken as the file path.
 * Pieces whose path matches TEST_FILE_PATTERN go to the test side; all other
 * pieces (including any without a recognisable header) go to the solution
 * side.
 *
 * @param {string} diff - The full diff text.
 * @returns {{solutionDiff: string, testDiff: string, expectedFiles: string[], forbidModified: string[]}}
 *   Concatenated solution/test diffs, plus the non-test paths
 *   (`expectedFiles`) and the test paths (`forbidModified`).
 */
function splitDiff(diff) {
  const solutionChunks = [];
  const testChunks = [];
  const solutionPaths = [];
  const testPaths = [];

  const chunks = diff
    .split(/(?=^diff --git )/m)
    .filter((chunk) => chunk.trim().length > 0);

  chunks.forEach((chunk) => {
    const header = chunk.match(/^diff --git a\/(.+?) b\/(.+)$/m);
    const filePath = header?.[2];
    const isTestFile = Boolean(filePath) && TEST_FILE_PATTERN.test(filePath);

    if (isTestFile) {
      testChunks.push(chunk);
      testPaths.push(filePath);
      return;
    }

    solutionChunks.push(chunk);
    if (filePath) {
      solutionPaths.push(filePath);
    }
  });

  return {
    solutionDiff: solutionChunks.join(""),
    testDiff: testChunks.join(""),
    expectedFiles: solutionPaths,
    forbidModified: testPaths
  };
}
446
/**
 * Runs one test case with one agent configuration, prints a summary, and
 * exits the process.
 *
 * The agent config comes from `opts.config` when given; otherwise a built-in
 * "baseline" config is used. The test case is saved to the store before the
 * run starts.
 *
 * @param {string} testCasePath - Path to the test-case YAML file.
 * @param {object} opts
 * @param {string} [opts.config] - Path to an agent-config YAML file.
 * @returns {Promise<void>} Does not normally return: exits with code 0 after
 *   the summary is printed, or 1 when the run throws.
 */
async function runSingleCommand(testCasePath, opts) {
  const testCase = loadTestCase(testCasePath);

  const baselineConfig = {
    id: "baseline",
    name: "Baseline Agent",
    model: "gpt-4o-mini",
    max_steps: 20
  };
  const agentConfig = opts.config ? loadAgentConfig(opts.config) : baselineConfig;

  console.log(`Starting run for "${testCase.name}" using model "${agentConfig.model}"...`);

  const sandboxProvider = new DockerSandboxProvider();
  const adapter = new OpenRouterAgentAdapter();
  const db = initDb();
  await saveTestCase(db, testCaseToDbRow(testCase));
  const runId = randomUUID();

  // Only the first line of a multi-line metric detail is shown in the summary.
  const firstLine = (text) => text.split("\n")[0];

  try {
    const result = await runSingle({ testCase, agentConfig, adapter, sandboxProvider, db, runId });
    const verdict = result.passed ? "\u2705 PASSED" : "\u274C FAILED";
    const durationSeconds = (durationMs) => (durationMs / 1e3).toFixed(1);

    console.log("\n================ RUN SUMMARY ================");
    console.log(`Status: ${verdict}`);
    console.log(`Steps: ${result.stepsCount}`);
    console.log(`Cost: $${result.costUsd.toFixed(4)}`);
    console.log(`Duration: ${durationSeconds(result.durationMs)}s`);
    if (result.error) {
      console.log(`Error: ${result.error}`);
    }
    if (result.metrics?.regression) {
      console.log(`Regression: ${result.metrics.regression.detail}`);
    }
    if (result.metrics?.diff) {
      console.log(`Diff scope: ${firstLine(result.metrics.diff.detail)}`);
    }
    if (result.metrics?.localization) {
      console.log(`Localization: ${firstLine(result.metrics.localization.detail)}`);
    }
    console.log("=============================================\n");
  } catch (err) {
    console.error(`Run failed with error: ${err.message}`);
    process.exit(1);
  }

  process.exit(0);
}
496
/**
 * Validates a test case definition and prints one line per check.
 *
 * Each check is printed with a pass/fail icon; a check's detail text is
 * printed underneath (each line prefixed with a space) unless it is empty or
 * exactly "ok".
 *
 * @param {string} testCasePath - Path to the test-case YAML file.
 * @returns {Promise<void>} Does not normally return: exits with code 0 when
 *   `report.ok` is truthy and 1 otherwise.
 */
async function validateCommand(testCasePath) {
  const testCase = loadTestCase(testCasePath);
  console.log(`Validating "${testCase.name}" (${testCasePath})...\n`);

  const sandboxProvider = new DockerSandboxProvider();
  const report = await validateTestCase({ testCase, sandboxProvider });

  const indent = (line) => ` ${line}`;
  for (const { passed, name, detail } of report.checks) {
    const icon = passed ? "\u2705" : "\u274C";
    console.log(`${icon} ${name}`);
    if (!detail || detail === "ok") {
      continue;
    }
    console.log(detail.split("\n").map(indent).join("\n"));
  }

  console.log("");
  if (report.ok) {
    console.log("\u2705 Validation passed.");
  } else {
    console.log("\u274C Validation failed.");
  }
  process.exit(report.ok ? 0 : 1);
}
514
+
515
+ // src/index.ts
516
var cli = cac("agr");

/**
 * Reports a failed command on stderr and terminates with exit code 1.
 *
 * @param {string} label - Command label used in the message ("run", "benchmark", ...).
 * @param {Error} err - The error thrown by the command.
 */
function reportCommandFailure(label, err) {
  console.error(`Error executing ${label}: ${err.message}`);
  process.exit(1);
}

// agr run <testCase> [--config <config>]
const runCommand = cli.command("run <testCase>", "Run a single agent test case");
runCommand.option("--config <config>", "Path to an AgentConfig YAML file");
runCommand.action(async (testCase, options) => {
  try {
    await runSingleCommand(testCase, options);
  } catch (err) {
    reportCommandFailure("run", err);
  }
});

// agr bench --configs <configs> --suite <suite> [--concurrency <n>]
const benchCommand = cli.command("bench", "Run a benchmark matrix of multiple test cases and configs");
benchCommand.option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files");
benchCommand.option("--suite <suite>", "Path to test suite directory containing test cases");
benchCommand.option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 });
benchCommand.action(async (options) => {
  const hasRequiredOptions = Boolean(options.configs) && Boolean(options.suite);
  if (!hasRequiredOptions) {
    console.error("Error: --configs and --suite are required for benchmarking.");
    process.exit(1);
  }
  try {
    const { configs, suite } = options;
    await runBenchCommand({ configs, suite, concurrency: Number(options.concurrency) });
  } catch (err) {
    reportCommandFailure("benchmark", err);
  }
});

// agr validate <testCase>
const validateCliCommand = cli.command(
  "validate <testCase>",
  "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
);
validateCliCommand.action(async (testCase) => {
  try {
    await validateCommand(testCase);
  } catch (err) {
    reportCommandFailure("validate", err);
  }
});

// agr import-pr <repo> <prNumber> [--out <dir>]
const importPrCliCommand = cli.command(
  "import-pr <repo> <prNumber>",
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
);
importPrCliCommand.option("--out <dir>", "Output directory for the scaffolded test case");
importPrCliCommand.action(async (repo, prNumber, options) => {
  try {
    await importPrCommand(repo, prNumber, options);
  } catch (err) {
    reportCommandFailure("import-pr", err);
  }
});

cli.help();
cli.parse();
package/package.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "agentgrader",
3
+ "version": "1.0.0",
4
+ "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
+ "license": "MIT",
6
+ "type": "module",
7
+ "bin": {
8
+ "agr": "./dist/index.js",
9
+ "agentgrader": "./dist/index.js"
10
+ },
11
+ "main": "./dist/index.js",
12
+ "types": "./dist/index.d.ts",
13
+ "files": [
14
+ "dist"
15
+ ],
16
+ "scripts": {
17
+ "build": "tsup",
18
+ "build:watch": "tsup --watch",
19
+ "dev": "bun run src/index.ts"
20
+ },
21
+ "dependencies": {
22
+ "@agentgrader/agent-openrouter": "workspace:*",
23
+ "@agentgrader/core": "workspace:*",
24
+ "@agentgrader/sandbox-docker": "workspace:*",
25
+ "@agentgrader/store": "workspace:*",
26
+ "cac": "^6.7.14",
27
+ "ink": "^4.4.1",
28
+ "react": "^18.2.0",
29
+ "yaml": "^2.5.1"
30
+ },
31
+ "devDependencies": {
32
+ "@types/react": "^18.2.0",
33
+ "tsup": "^8.5.1"
34
+ }
35
+ }