agentgrader 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +42 -7
- package/package.json +6 -6
package/dist/index.js
CHANGED
|
@@ -476,6 +476,9 @@ function loadTestCase(yamlPath) {
|
|
|
476
476
|
(toolkit) => isAbsolute(toolkit) ? toolkit : resolve(dir, toolkit)
|
|
477
477
|
);
|
|
478
478
|
}
|
|
479
|
+
if (testCase.agent_config) {
|
|
480
|
+
testCase.agent_config = isAbsolute(testCase.agent_config) ? testCase.agent_config : resolve(dir, testCase.agent_config);
|
|
481
|
+
}
|
|
479
482
|
if (testCase.solution && looksLikeFilePath(testCase.solution)) {
|
|
480
483
|
testCase.solution = readPatchFile(dir, testCase.solution);
|
|
481
484
|
}
|
|
@@ -484,6 +487,26 @@ function loadTestCase(yamlPath) {
|
|
|
484
487
|
}
|
|
485
488
|
return testCase;
|
|
486
489
|
}
|
|
490
|
+
function resolveSharedAgentConfigFromTestCases(testCases) {
|
|
491
|
+
if (testCases.length === 0) {
|
|
492
|
+
throw new Error("No test cases loaded.");
|
|
493
|
+
}
|
|
494
|
+
const paths = /* @__PURE__ */ new Set();
|
|
495
|
+
for (const tc of testCases) {
|
|
496
|
+
if (!tc.agent_config) {
|
|
497
|
+
throw new Error(
|
|
498
|
+
"Either --configs, --configs-dir, --matrix, or --manifest must be provided, or every test case in the suite must define the same agent_config in agr.yaml."
|
|
499
|
+
);
|
|
500
|
+
}
|
|
501
|
+
paths.add(tc.agent_config);
|
|
502
|
+
}
|
|
503
|
+
if (paths.size > 1) {
|
|
504
|
+
throw new Error(
|
|
505
|
+
`Multiple agent_config values found across test cases (${[...paths].join(", ")}). Use --configs, --configs-dir, or --matrix to specify agent configs explicitly.`
|
|
506
|
+
);
|
|
507
|
+
}
|
|
508
|
+
return [...paths][0];
|
|
509
|
+
}
|
|
487
510
|
function looksLikeFilePath(value) {
|
|
488
511
|
const trimmed = value.trimStart();
|
|
489
512
|
if (trimmed.startsWith("diff ") || trimmed.startsWith("---") || trimmed.startsWith("***")) {
|
|
@@ -542,7 +565,7 @@ function findTestCaseYamlFiles(dir) {
|
|
|
542
565
|
async function runBenchCommand(opts) {
|
|
543
566
|
let suiteDir;
|
|
544
567
|
let concurrency = opts.concurrency ?? 2;
|
|
545
|
-
let agentConfigs;
|
|
568
|
+
let agentConfigs = [];
|
|
546
569
|
let matrixId;
|
|
547
570
|
if (opts.manifest) {
|
|
548
571
|
const manifestPath = resolve(opts.manifest);
|
|
@@ -574,7 +597,7 @@ async function runBenchCommand(opts) {
|
|
|
574
597
|
console.log(
|
|
575
598
|
`Matrix "${matrix.name}" expanded to ${agentConfigs.length} agent config(s) (matrixId: ${matrixId})`
|
|
576
599
|
);
|
|
577
|
-
} else {
|
|
600
|
+
} else if (opts.configs || opts.configsDir) {
|
|
578
601
|
const configPaths = resolveAgentConfigPathList({
|
|
579
602
|
commaSeparated: opts.configs,
|
|
580
603
|
dir: opts.configsDir
|
|
@@ -585,9 +608,6 @@ async function runBenchCommand(opts) {
|
|
|
585
608
|
}
|
|
586
609
|
}
|
|
587
610
|
}
|
|
588
|
-
if (agentConfigs.length === 0) {
|
|
589
|
-
throw new Error("No agent configs to benchmark.");
|
|
590
|
-
}
|
|
591
611
|
const yamlFiles = findTestCaseYamlFiles(suiteDir);
|
|
592
612
|
if (yamlFiles.length === 0) {
|
|
593
613
|
console.error(`No test cases found in suite directory: ${suiteDir}`);
|
|
@@ -597,6 +617,16 @@ async function runBenchCommand(opts) {
|
|
|
597
617
|
for (const f of yamlFiles) {
|
|
598
618
|
testCases.push(loadTestCase(f));
|
|
599
619
|
}
|
|
620
|
+
if (agentConfigs.length === 0) {
|
|
621
|
+
const sharedAgentConfig = resolveSharedAgentConfigFromTestCases(testCases);
|
|
622
|
+
const configPaths = resolveAgentConfigPathList({
|
|
623
|
+
explicitPaths: [sharedAgentConfig]
|
|
624
|
+
});
|
|
625
|
+
agentConfigs = loadAgentConfigsFromPaths(configPaths);
|
|
626
|
+
console.log(
|
|
627
|
+
`Using shared agent_config from agr.yaml: ${sharedAgentConfig} (${agentConfigs.length} config).`
|
|
628
|
+
);
|
|
629
|
+
}
|
|
600
630
|
const db = initDb();
|
|
601
631
|
for (const tc of testCases) {
|
|
602
632
|
await saveTestCase(db, testCaseToDbRow(tc));
|
|
@@ -1034,6 +1064,11 @@ async function runSingleCommand(testCasePath, opts) {
|
|
|
1034
1064
|
};
|
|
1035
1065
|
if (opts.config) {
|
|
1036
1066
|
agentConfig = loadAgentConfig(opts.config);
|
|
1067
|
+
} else if (testCase.agent_config) {
|
|
1068
|
+
agentConfig = loadAgentConfig(testCase.agent_config);
|
|
1069
|
+
console.log(
|
|
1070
|
+
`Using agent config from agr.yaml: ${testCase.agent_config} (model: ${agentConfig.model})`
|
|
1071
|
+
);
|
|
1037
1072
|
}
|
|
1038
1073
|
console.log(`Starting run for "${testCase.name}" using model "${agentConfig.model}"...`);
|
|
1039
1074
|
const sandboxProvider = new DockerSandboxProvider();
|
|
@@ -1204,9 +1239,9 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
|
|
|
1204
1239
|
);
|
|
1205
1240
|
process.exit(1);
|
|
1206
1241
|
}
|
|
1207
|
-
} else if (!options.suite
|
|
1242
|
+
} else if (!options.suite) {
|
|
1208
1243
|
console.error(
|
|
1209
|
-
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, or
|
|
1244
|
+
"Error: provide --manifest, or --suite with one of --configs, --config, --configs-dir, --matrix, or a shared agent_config in every agr.yaml."
|
|
1210
1245
|
);
|
|
1211
1246
|
process.exit(1);
|
|
1212
1247
|
} else if (agentSourceCount > 1) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agentgrader",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
@@ -19,11 +19,11 @@
|
|
|
19
19
|
"dev": "bun run src/index.ts"
|
|
20
20
|
},
|
|
21
21
|
"dependencies": {
|
|
22
|
-
"@agentgrader/agent-openrouter": "^
|
|
23
|
-
"@agentgrader/core": "^1.
|
|
24
|
-
"@agentgrader/optimizer": "^0.
|
|
25
|
-
"@agentgrader/sandbox-docker": "^
|
|
26
|
-
"@agentgrader/scorer-static": "^
|
|
22
|
+
"@agentgrader/agent-openrouter": "^3.0.0",
|
|
23
|
+
"@agentgrader/core": "^1.2.0",
|
|
24
|
+
"@agentgrader/optimizer": "^1.0.0",
|
|
25
|
+
"@agentgrader/sandbox-docker": "^3.0.0",
|
|
26
|
+
"@agentgrader/scorer-static": "^1.0.0",
|
|
27
27
|
"@agentgrader/store": "^1.0.3",
|
|
28
28
|
"cac": "^6.7.14",
|
|
29
29
|
"dotenv": "^17.4.2",
|