agentgrader 1.0.2 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +246 -33
- package/package.json +9 -5
package/dist/index.js
CHANGED
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import 'dotenv/config';
|
|
2
3
|
import { cac } from 'cac';
|
|
4
|
+
import { randomUUID } from 'crypto';
|
|
3
5
|
import { resolve, dirname, isAbsolute } from 'path';
|
|
4
6
|
import { render, Box, Text } from 'ink';
|
|
5
|
-
import { initDb, saveTestCase, saveAgentConfig } from '@agentgrader/store';
|
|
7
|
+
import { initDb, saveTestCase, saveAgentConfig, getRun, getTraces, getRunsByMatrixId } from '@agentgrader/store';
|
|
6
8
|
import { runSingle, runBenchmark, validateTestCase, TestCaseSchema, AgentConfigSchema } from '@agentgrader/core';
|
|
7
9
|
import { DockerSandboxProvider } from '@agentgrader/sandbox-docker';
|
|
8
10
|
import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
|
|
11
|
+
import { StaticQualityScorer } from '@agentgrader/scorer-static';
|
|
12
|
+
import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
|
|
9
13
|
import { jsx, jsxs } from 'react/jsx-runtime';
|
|
10
14
|
import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
|
|
11
15
|
import { stringify, parse } from 'yaml';
|
|
12
|
-
import {
|
|
16
|
+
import { ZodError } from 'zod';
|
|
17
|
+
import { execFileSync } from 'child_process';
|
|
13
18
|
|
|
14
19
|
var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
15
20
|
let totalCost = 0;
|
|
@@ -134,12 +139,32 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
|
|
|
134
139
|
] })
|
|
135
140
|
] });
|
|
136
141
|
};
|
|
142
|
+
|
|
143
|
+
// src/lib/format-zod-error.ts
|
|
144
|
+
function formatZodError(err, fileLabel) {
|
|
145
|
+
const lines = err.issues.map((issue) => {
|
|
146
|
+
const path = issue.path.join(".") || "(root)";
|
|
147
|
+
return ` - ${path}: ${issue.message}`;
|
|
148
|
+
});
|
|
149
|
+
return `Invalid ${fileLabel}:
|
|
150
|
+
${lines.join("\n")}`;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// src/lib/load-agent-config.ts
|
|
137
154
|
function loadAgentConfig(yamlPath) {
|
|
138
155
|
const path = resolve(yamlPath);
|
|
139
156
|
const fileContent = readFileSync(path, "utf-8");
|
|
140
157
|
const raw = parse(fileContent);
|
|
141
158
|
const dir = dirname(path);
|
|
142
|
-
|
|
159
|
+
let config;
|
|
160
|
+
try {
|
|
161
|
+
config = AgentConfigSchema.parse(raw);
|
|
162
|
+
} catch (err) {
|
|
163
|
+
if (err instanceof ZodError) {
|
|
164
|
+
throw new Error(formatZodError(err, `agent config "${path}"`));
|
|
165
|
+
}
|
|
166
|
+
throw err;
|
|
167
|
+
}
|
|
143
168
|
config.id = config.id || config.name;
|
|
144
169
|
if (config.toolkits) {
|
|
145
170
|
config.toolkits = config.toolkits.map(
|
|
@@ -148,6 +173,12 @@ function loadAgentConfig(yamlPath) {
|
|
|
148
173
|
}
|
|
149
174
|
return config;
|
|
150
175
|
}
|
|
176
|
+
function loadMatrix(yamlPath) {
|
|
177
|
+
const path = resolve(yamlPath);
|
|
178
|
+
const fileContent = readFileSync(path, "utf-8");
|
|
179
|
+
const raw = parse(fileContent);
|
|
180
|
+
return MatrixSchema.parse(raw);
|
|
181
|
+
}
|
|
151
182
|
function loadTestCase(yamlPath) {
|
|
152
183
|
const path = resolve(yamlPath);
|
|
153
184
|
const fileContent = readFileSync(path, "utf-8");
|
|
@@ -156,7 +187,15 @@ function loadTestCase(yamlPath) {
|
|
|
156
187
|
if (raw.fixture && !String(raw.fixture).startsWith("/") && !String(raw.fixture).startsWith("http")) {
|
|
157
188
|
raw.fixture = resolve(dir, raw.fixture);
|
|
158
189
|
}
|
|
159
|
-
|
|
190
|
+
let testCase;
|
|
191
|
+
try {
|
|
192
|
+
testCase = TestCaseSchema.parse(raw);
|
|
193
|
+
} catch (err) {
|
|
194
|
+
if (err instanceof ZodError) {
|
|
195
|
+
throw new Error(formatZodError(err, `test case "${path}"`));
|
|
196
|
+
}
|
|
197
|
+
throw err;
|
|
198
|
+
}
|
|
160
199
|
testCase.id = testCase.id || testCase.name;
|
|
161
200
|
if (testCase.toolkits) {
|
|
162
201
|
testCase.toolkits = testCase.toolkits.map(
|
|
@@ -228,9 +267,22 @@ function findTestCaseYamlFiles(dir) {
|
|
|
228
267
|
}
|
|
229
268
|
async function runBenchCommand(opts) {
|
|
230
269
|
const suiteDir = resolve(opts.suite);
|
|
231
|
-
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
232
270
|
const concurrency = opts.concurrency || 2;
|
|
233
|
-
|
|
271
|
+
let agentConfigs;
|
|
272
|
+
let matrixId;
|
|
273
|
+
if (opts.matrix) {
|
|
274
|
+
const matrix = loadMatrix(opts.matrix);
|
|
275
|
+
agentConfigs = expandMatrix(matrix);
|
|
276
|
+
matrixId = randomUUID();
|
|
277
|
+
console.log(
|
|
278
|
+
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
279
|
+
);
|
|
280
|
+
} else if (opts.configs) {
|
|
281
|
+
const configPaths = opts.configs.split(",").map((c) => resolve(c.trim()));
|
|
282
|
+
agentConfigs = configPaths.map((p) => loadAgentConfig(p));
|
|
283
|
+
} else {
|
|
284
|
+
throw new Error("Either --configs or --matrix must be provided.");
|
|
285
|
+
}
|
|
234
286
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
235
287
|
if (yamlFiles.length === 0) {
|
|
236
288
|
console.error(`No test cases found in suite directory: ${opts.suite}`);
|
|
@@ -295,7 +347,9 @@ async function runBenchCommand(opts) {
|
|
|
295
347
|
sandboxProvider,
|
|
296
348
|
db,
|
|
297
349
|
concurrency,
|
|
298
|
-
onRunUpdate
|
|
350
|
+
onRunUpdate,
|
|
351
|
+
extraScorers: [new StaticQualityScorer()],
|
|
352
|
+
matrixId
|
|
299
353
|
});
|
|
300
354
|
} catch (err) {
|
|
301
355
|
console.error("Benchmark runner encountered an error:", err);
|
|
@@ -312,8 +366,33 @@ async function runBenchCommand(opts) {
|
|
|
312
366
|
)
|
|
313
367
|
);
|
|
314
368
|
printTagBreakdown(testCases, agentConfigs, runStates);
|
|
369
|
+
if (matrixId) {
|
|
370
|
+
await printMatrixSummary(db, matrixId, agentConfigs);
|
|
371
|
+
}
|
|
315
372
|
process.exit(0);
|
|
316
373
|
}
|
|
374
|
+
async function printMatrixSummary(db, matrixId, agentConfigs) {
|
|
375
|
+
const runs = await getRunsByMatrixId(db, matrixId);
|
|
376
|
+
const aggregates = aggregateResults(runs, agentConfigs);
|
|
377
|
+
if (aggregates.length === 0) return;
|
|
378
|
+
const front = paretoFront(aggregates);
|
|
379
|
+
const frontIds = new Set(front.map((a) => a.agentConfigId));
|
|
380
|
+
const includesQuality = front.some((a) => a.avgQuality?.linterViolations !== void 0);
|
|
381
|
+
console.log("\n================ MATRIX SUMMARY ================");
|
|
382
|
+
for (const agg of aggregates) {
|
|
383
|
+
const marker = frontIds.has(agg.agentConfigId) ? "*" : " ";
|
|
384
|
+
const solveRatePct = (agg.solveRate * 100).toFixed(0);
|
|
385
|
+
const lint = agg.avgQuality?.linterViolations !== void 0 ? ` lint:${agg.avgQuality.linterViolations.toFixed(1)}` : "";
|
|
386
|
+
console.log(
|
|
387
|
+
`${marker} ${agg.agentConfigName.padEnd(36)} solve:${solveRatePct.padStart(3)}% (${agg.passedRuns}/${agg.totalRuns}) cost:$${agg.avgCostUsd.toFixed(4)}${lint}`
|
|
388
|
+
);
|
|
389
|
+
}
|
|
390
|
+
console.log(
|
|
391
|
+
`
|
|
392
|
+
* = Pareto-optimal (solve rate, cost${includesQuality ? ", lint violations" : ""})`
|
|
393
|
+
);
|
|
394
|
+
console.log("=================================================\n");
|
|
395
|
+
}
|
|
317
396
|
function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
318
397
|
const tagStats = {};
|
|
319
398
|
for (const tc of testCases) {
|
|
@@ -340,6 +419,26 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
|
|
|
340
419
|
}
|
|
341
420
|
console.log("=================================================\n");
|
|
342
421
|
}
|
|
422
|
+
async function validateCommand(testCasePath) {
|
|
423
|
+
const testCase = loadTestCase(testCasePath);
|
|
424
|
+
console.log(`Validating "${testCase.name}" (${testCasePath})...
|
|
425
|
+
`);
|
|
426
|
+
const sandboxProvider = new DockerSandboxProvider();
|
|
427
|
+
const report = await validateTestCase({ testCase, sandboxProvider });
|
|
428
|
+
for (const check of report.checks) {
|
|
429
|
+
const icon = check.passed ? "\u2705" : "\u274C";
|
|
430
|
+
console.log(`${icon} ${check.name}`);
|
|
431
|
+
if (check.detail && check.detail !== "ok") {
|
|
432
|
+
const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
|
|
433
|
+
console.log(indented);
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
console.log("");
|
|
437
|
+
console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
|
|
438
|
+
process.exit(report.ok ? 0 : 1);
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
// src/commands/import-pr.ts
|
|
343
442
|
var TEST_FILE_PATTERN = /(^|\/)(tests?|specs?|__tests__)(\/|$)|\.(test|spec)\.[jt]sx?$/i;
|
|
344
443
|
async function importPrCommand(repo, prNumber, opts) {
|
|
345
444
|
const [owner, repoName] = repo.split("/");
|
|
@@ -406,12 +505,33 @@ Imported PR #${pr.number}: "${pr.title}"`);
|
|
|
406
505
|
console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
|
|
407
506
|
if (testDiff.trim())
|
|
408
507
|
console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
|
|
508
|
+
if (opts.cloneFixture) {
|
|
509
|
+
const fixtureDir = resolve(outDir, "fixture");
|
|
510
|
+
console.log(`
|
|
511
|
+
Cloning ${owner}/${repoName} into ${fixtureDir}...`);
|
|
512
|
+
execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
|
|
513
|
+
stdio: "inherit"
|
|
514
|
+
});
|
|
515
|
+
console.log(`Checking out base commit ${pr.base.sha}...`);
|
|
516
|
+
execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
|
|
517
|
+
}
|
|
409
518
|
console.log("\nNext steps:");
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
)
|
|
519
|
+
if (!opts.cloneFixture) {
|
|
520
|
+
console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
|
|
521
|
+
console.log(" 2. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
|
|
522
|
+
console.log(
|
|
523
|
+
` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
524
|
+
);
|
|
525
|
+
} else {
|
|
526
|
+
console.log(" 1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
|
|
527
|
+
console.log(
|
|
528
|
+
` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
|
|
529
|
+
);
|
|
530
|
+
}
|
|
531
|
+
if (opts.validate) {
|
|
532
|
+
console.log("\nRunning validation...\n");
|
|
533
|
+
await validateCommand(resolve(outDir, "agr.yaml"));
|
|
534
|
+
}
|
|
415
535
|
}
|
|
416
536
|
function buildPrompt(pr) {
|
|
417
537
|
const body = (pr.body || "").trim();
|
|
@@ -459,6 +579,14 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
459
579
|
const adapter = new AiSdkAgentAdapter();
|
|
460
580
|
const db = initDb();
|
|
461
581
|
await saveTestCase(db, testCaseToDbRow(testCase));
|
|
582
|
+
await saveAgentConfig(db, {
|
|
583
|
+
id: agentConfig.id || agentConfig.name,
|
|
584
|
+
name: agentConfig.name,
|
|
585
|
+
model: agentConfig.model,
|
|
586
|
+
maxSteps: agentConfig.max_steps,
|
|
587
|
+
temperature: agentConfig.temperature,
|
|
588
|
+
createdAt: Math.floor(Date.now() / 1e3)
|
|
589
|
+
});
|
|
462
590
|
const runId = randomUUID();
|
|
463
591
|
try {
|
|
464
592
|
const result = await runSingle({
|
|
@@ -493,23 +621,82 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
493
621
|
}
|
|
494
622
|
process.exit(0);
|
|
495
623
|
}
|
|
496
|
-
async function
|
|
497
|
-
const
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
624
|
+
async function traceCommand(runId, opts) {
|
|
625
|
+
const db = initDb();
|
|
626
|
+
const run = await getRun(db, runId);
|
|
627
|
+
if (!run) {
|
|
628
|
+
console.error(`Run not found: ${runId}`);
|
|
629
|
+
process.exit(1);
|
|
630
|
+
}
|
|
631
|
+
console.log(`Run ${run.id}`);
|
|
632
|
+
console.log(` test case: ${run.testCaseId}`);
|
|
633
|
+
console.log(` agent config: ${run.agentConfigId}`);
|
|
634
|
+
console.log(
|
|
635
|
+
` status: ${run.status}${run.passed === true ? " (passed)" : run.passed === false ? " (failed)" : ""}`
|
|
636
|
+
);
|
|
637
|
+
console.log(` cost: $${run.costUsd.toFixed(4)}`);
|
|
638
|
+
console.log(` duration: ${run.durationMs}ms`);
|
|
639
|
+
if (run.error) console.log(` error: ${run.error}`);
|
|
640
|
+
if (opts.quality) {
|
|
641
|
+
printQualityBreakdown(run.metrics);
|
|
642
|
+
return;
|
|
643
|
+
}
|
|
644
|
+
const steps = await getTraces(db, runId);
|
|
645
|
+
console.log(`
|
|
646
|
+
${steps.length} step(s):`);
|
|
647
|
+
for (const step of steps) {
|
|
648
|
+
const label = step.tool ? `${step.kind}:${step.tool}` : step.kind;
|
|
649
|
+
console.log(
|
|
650
|
+
` [${step.stepIndex}] ${label} (in:${step.tokensIn} out:${step.tokensOut} $${step.costUsd.toFixed(4)})`
|
|
651
|
+
);
|
|
652
|
+
if (step.content) {
|
|
653
|
+
const preview = step.content.length > 200 ? `${step.content.slice(0, 200)}...` : step.content;
|
|
654
|
+
console.log(` ${preview.replace(/\n/g, "\n ")}`);
|
|
508
655
|
}
|
|
509
656
|
}
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
657
|
+
}
|
|
658
|
+
function printQualityBreakdown(metricsJson) {
|
|
659
|
+
const metrics = metricsJson ? safeParseJson(metricsJson) : void 0;
|
|
660
|
+
console.log("\n================ QUALITY BREAKDOWN ================");
|
|
661
|
+
const staticQuality = metrics?.["static-quality"]?.quality;
|
|
662
|
+
const llmJudge = metrics?.["llm-judge"]?.quality;
|
|
663
|
+
const diff = metrics?.diff;
|
|
664
|
+
const localization = metrics?.localization;
|
|
665
|
+
if (staticQuality) {
|
|
666
|
+
console.log("Static quality (static-quality):");
|
|
667
|
+
if (staticQuality.diffLines !== void 0) console.log(` diff lines: ${staticQuality.diffLines}`);
|
|
668
|
+
if (staticQuality.filesModified !== void 0)
|
|
669
|
+
console.log(` files modified: ${staticQuality.filesModified}`);
|
|
670
|
+
if (staticQuality.todosIntroduced !== void 0)
|
|
671
|
+
console.log(` TODOs introduced: ${staticQuality.todosIntroduced}`);
|
|
672
|
+
if (staticQuality.linterViolations !== void 0)
|
|
673
|
+
console.log(` lint violations: ${staticQuality.linterViolations}`);
|
|
674
|
+
}
|
|
675
|
+
if (llmJudge) {
|
|
676
|
+
if (staticQuality) console.log("");
|
|
677
|
+
console.log("LLM judge (llm-judge):");
|
|
678
|
+
if (llmJudge.llmJudgeScore !== void 0)
|
|
679
|
+
console.log(` score: ${llmJudge.llmJudgeScore.toFixed(2)} / 1.00`);
|
|
680
|
+
if (llmJudge.llmJudgeDetail) console.log(` rationale: ${llmJudge.llmJudgeDetail}`);
|
|
681
|
+
}
|
|
682
|
+
if (diff) {
|
|
683
|
+
if (staticQuality || llmJudge) console.log("");
|
|
684
|
+
console.log(`Diff scope: ${diff.detail ?? JSON.stringify(diff)}`);
|
|
685
|
+
}
|
|
686
|
+
if (localization) {
|
|
687
|
+
console.log(`Localization: ${localization.detail ?? JSON.stringify(localization)}`);
|
|
688
|
+
}
|
|
689
|
+
if (!staticQuality && !llmJudge && !diff && !localization) {
|
|
690
|
+
console.log(" (no quality metrics recorded for this run)");
|
|
691
|
+
}
|
|
692
|
+
console.log("=====================================================\n");
|
|
693
|
+
}
|
|
694
|
+
function safeParseJson(value) {
|
|
695
|
+
try {
|
|
696
|
+
return JSON.parse(value);
|
|
697
|
+
} catch {
|
|
698
|
+
return void 0;
|
|
699
|
+
}
|
|
513
700
|
}
|
|
514
701
|
|
|
515
702
|
// src/index.ts
|
|
@@ -522,16 +709,20 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
|
|
|
522
709
|
process.exit(1);
|
|
523
710
|
}
|
|
524
711
|
});
|
|
525
|
-
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).
|
|
526
|
-
|
|
527
|
-
|
|
712
|
+
cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
|
|
713
|
+
"--matrix <matrix>",
|
|
714
|
+
"Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
|
|
715
|
+
).action(async (options) => {
|
|
716
|
+
if (!options.suite || !options.configs && !options.matrix) {
|
|
717
|
+
console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
|
|
528
718
|
process.exit(1);
|
|
529
719
|
}
|
|
530
720
|
try {
|
|
531
721
|
await runBenchCommand({
|
|
532
722
|
configs: options.configs,
|
|
533
723
|
suite: options.suite,
|
|
534
|
-
concurrency: Number(options.concurrency)
|
|
724
|
+
concurrency: Number(options.concurrency),
|
|
725
|
+
matrix: options.matrix
|
|
535
726
|
});
|
|
536
727
|
} catch (err) {
|
|
537
728
|
console.error(`Error executing benchmark: ${err.message}`);
|
|
@@ -552,7 +743,7 @@ cli.command(
|
|
|
552
743
|
cli.command(
|
|
553
744
|
"import-pr <repo> <prNumber>",
|
|
554
745
|
"Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
|
|
555
|
-
).option("--out <dir>", "Output directory for the scaffolded test case").action(async (repo, prNumber, options) => {
|
|
746
|
+
).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
|
|
556
747
|
try {
|
|
557
748
|
await importPrCommand(repo, prNumber, options);
|
|
558
749
|
} catch (err) {
|
|
@@ -560,5 +751,27 @@ cli.command(
|
|
|
560
751
|
process.exit(1);
|
|
561
752
|
}
|
|
562
753
|
});
|
|
754
|
+
cli.command("trace <runId>", "Show the step trace and metrics for a single run").option(
|
|
755
|
+
"--quality",
|
|
756
|
+
"Show only the quality-metrics breakdown (static-quality, llm-judge, diff, localization)"
|
|
757
|
+
).action(async (runId, options) => {
|
|
758
|
+
try {
|
|
759
|
+
await traceCommand(runId, options);
|
|
760
|
+
} catch (err) {
|
|
761
|
+
console.error(`Error executing trace: ${err.message}`);
|
|
762
|
+
process.exit(1);
|
|
763
|
+
}
|
|
764
|
+
});
|
|
563
765
|
cli.help();
|
|
564
|
-
|
|
766
|
+
try {
|
|
767
|
+
cli.parse();
|
|
768
|
+
} catch (err) {
|
|
769
|
+
if (err.name === "CACError") {
|
|
770
|
+
console.error(`
|
|
771
|
+
\u274C ${err.message}
|
|
772
|
+
`);
|
|
773
|
+
cli.outputHelp();
|
|
774
|
+
process.exit(1);
|
|
775
|
+
}
|
|
776
|
+
throw err;
|
|
777
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.5",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -19,14 +19,18 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^2.0.
|
|
23
|
-
"@agentgrader/core": "^1.1.
|
|
24
|
-
"@agentgrader/
|
|
22
|
+
"@agentgrader/agent-openrouter": "^2.0.1",
|
|
23
|
+
"@agentgrader/core": "^1.1.1",
|
|
24
|
+
"@agentgrader/optimizer": "^0.1.0",
|
|
25
|
+
"@agentgrader/sandbox-docker": "^2.0.2",
|
|
26
|
+
"@agentgrader/scorer-static": "^0.1.0",
|
|
25
27
|
"@agentgrader/store": "^1.0.2",
|
|
26
28
|
"cac": "^6.7.14",
|
|
29
|
+
"dotenv": "^17.4.2",
|
|
27
30
|
"ink": "^4.4.1",
|
|
28
31
|
"react": "^18.2.0",
|
|
29
|
-
"yaml": "^2.5.1"
|
|
32
|
+
"yaml": "^2.5.1",
|
|
33
|
+
"zod": "^3.23.8"
|
|
30
34
|
},
|
|
31
35
|
"devDependencies": {
|
|
32
36
|
"@types/react": "^18.2.0",
|