agentv 3.14.4 → 3.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-3NLBBQX6.js → chunk-CQRWNXVG.js} +42 -20
- package/dist/chunk-CQRWNXVG.js.map +1 -0
- package/dist/{chunk-SAPEYQ5U.js → chunk-Y25VL7PX.js} +3 -3
- package/dist/chunk-Y25VL7PX.js.map +1 -0
- package/dist/cli.js +2 -2
- package/dist/index.js +2 -2
- package/dist/{interactive-PGZ55VHT.js → interactive-5ESM5DWV.js} +2 -2
- package/dist/templates/.env.example +0 -3
- package/package.json +1 -1
- package/dist/chunk-3NLBBQX6.js.map +0 -1
- package/dist/chunk-SAPEYQ5U.js.map +0 -1
- package/dist/{interactive-PGZ55VHT.js.map → interactive-5ESM5DWV.js.map} +0 -0
|
@@ -23,7 +23,7 @@ import {
|
|
|
23
23
|
validateFileReferences,
|
|
24
24
|
validateTargetsFile,
|
|
25
25
|
writeArtifactsFromResults
|
|
26
|
-
} from "./chunk-SAPEYQ5U.js";
|
|
26
|
+
} from "./chunk-Y25VL7PX.js";
|
|
27
27
|
import {
|
|
28
28
|
createBuiltinRegistry,
|
|
29
29
|
executeScript,
|
|
@@ -4186,7 +4186,7 @@ var evalRunCommand = command({
|
|
|
4186
4186
|
},
|
|
4187
4187
|
handler: async (args) => {
|
|
4188
4188
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4189
|
-
const { launchInteractiveWizard } = await import("./interactive-PGZ55VHT.js");
|
|
4189
|
+
const { launchInteractiveWizard } = await import("./interactive-5ESM5DWV.js");
|
|
4190
4190
|
await launchInteractiveWizard();
|
|
4191
4191
|
return;
|
|
4192
4192
|
}
|
|
@@ -4421,6 +4421,8 @@ var evalBenchCommand = command({
|
|
|
4421
4421
|
const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
|
|
4422
4422
|
const testIds = manifest.test_ids;
|
|
4423
4423
|
const targetName = manifest.target?.name ?? "unknown";
|
|
4424
|
+
const evalSet = manifest.eval_set ?? "";
|
|
4425
|
+
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4424
4426
|
let stdinData;
|
|
4425
4427
|
if (llmScoresPath) {
|
|
4426
4428
|
stdinData = await readFile(llmScoresPath, "utf8");
|
|
@@ -4431,7 +4433,9 @@ var evalBenchCommand = command({
|
|
|
4431
4433
|
const indexLines = [];
|
|
4432
4434
|
const allPassRates = [];
|
|
4433
4435
|
for (const testId of testIds) {
|
|
4434
|
-
const
|
|
4436
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
4437
|
+
const testDir = join(exportDir, ...subpath);
|
|
4438
|
+
const artifactSubdir = subpath.join("/");
|
|
4435
4439
|
const evaluators = [];
|
|
4436
4440
|
const allAssertions = [];
|
|
4437
4441
|
const codeResultsDir = join(testDir, "code_grader_results");
|
|
@@ -4527,13 +4531,14 @@ var evalBenchCommand = command({
|
|
|
4527
4531
|
JSON.stringify({
|
|
4528
4532
|
timestamp: manifest.timestamp,
|
|
4529
4533
|
test_id: testId,
|
|
4534
|
+
eval_set: evalSet || void 0,
|
|
4530
4535
|
score: Math.round(weightedScore * 1e3) / 1e3,
|
|
4531
4536
|
target: targetName,
|
|
4532
4537
|
scores,
|
|
4533
4538
|
execution_status: executionStatus,
|
|
4534
|
-
grading_path: `${
|
|
4535
|
-
timing_path: `${
|
|
4536
|
-
response_path: hasResponse ? `${
|
|
4539
|
+
grading_path: `${artifactSubdir}/grading.json`,
|
|
4540
|
+
timing_path: `${artifactSubdir}/timing.json`,
|
|
4541
|
+
response_path: hasResponse ? `${artifactSubdir}/response.md` : void 0
|
|
4537
4542
|
})
|
|
4538
4543
|
);
|
|
4539
4544
|
}
|
|
@@ -4603,10 +4608,13 @@ var evalGradeCommand = command({
|
|
|
4603
4608
|
const manifestPath = join2(exportDir, "manifest.json");
|
|
4604
4609
|
const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
|
|
4605
4610
|
const testIds = manifest.test_ids;
|
|
4611
|
+
const evalSet = manifest.eval_set ?? "";
|
|
4612
|
+
const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4606
4613
|
let totalGraders = 0;
|
|
4607
4614
|
let totalPassed = 0;
|
|
4608
4615
|
for (const testId of testIds) {
|
|
4609
|
-
const
|
|
4616
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
4617
|
+
const testDir = join2(exportDir, ...subpath);
|
|
4610
4618
|
const codeGradersDir = join2(testDir, "code_graders");
|
|
4611
4619
|
const resultsDir = join2(testDir, "code_grader_results");
|
|
4612
4620
|
let graderFiles;
|
|
@@ -4701,7 +4709,7 @@ import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
|
|
|
4701
4709
|
import { dirname, join as join3, resolve } from "node:path";
|
|
4702
4710
|
var evalInputCommand = command({
|
|
4703
4711
|
name: "input",
|
|
4704
|
-
description: "Extract eval inputs, target commands, and grader prompts for
|
|
4712
|
+
description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
|
|
4705
4713
|
args: {
|
|
4706
4714
|
evalPath: positional({
|
|
4707
4715
|
type: string,
|
|
@@ -4711,7 +4719,7 @@ var evalInputCommand = command({
|
|
|
4711
4719
|
out: option({
|
|
4712
4720
|
type: optional(string),
|
|
4713
4721
|
long: "out",
|
|
4714
|
-
description: "Output directory for extracted inputs (default: .agentv/results/runs
|
|
4722
|
+
description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
|
|
4715
4723
|
})
|
|
4716
4724
|
},
|
|
4717
4725
|
handler: async ({ evalPath, out }) => {
|
|
@@ -4752,9 +4760,12 @@ var evalInputCommand = command({
|
|
|
4752
4760
|
}
|
|
4753
4761
|
} catch {
|
|
4754
4762
|
}
|
|
4763
|
+
const evalSetName = suite.metadata?.name?.trim() ?? "";
|
|
4764
|
+
const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4755
4765
|
const testIds = [];
|
|
4756
4766
|
for (const test of tests) {
|
|
4757
|
-
const
|
|
4767
|
+
const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
|
|
4768
|
+
const testDir = join3(outDir, ...subpath);
|
|
4758
4769
|
await mkdir3(testDir, { recursive: true });
|
|
4759
4770
|
testIds.push(test.id);
|
|
4760
4771
|
const inputText = test.question;
|
|
@@ -4793,6 +4804,7 @@ var evalInputCommand = command({
|
|
|
4793
4804
|
}
|
|
4794
4805
|
await writeJson(join3(outDir, "manifest.json"), {
|
|
4795
4806
|
eval_file: resolvedEvalPath,
|
|
4807
|
+
eval_set: evalSetName || void 0,
|
|
4796
4808
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4797
4809
|
target: {
|
|
4798
4810
|
name: targetName,
|
|
@@ -4892,7 +4904,7 @@ var evalRunCommand2 = command({
|
|
|
4892
4904
|
out: option({
|
|
4893
4905
|
type: optional(string),
|
|
4894
4906
|
long: "out",
|
|
4895
|
-
description: "Output directory for results (default: .agentv/results/runs
|
|
4907
|
+
description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
|
|
4896
4908
|
}),
|
|
4897
4909
|
workers: option({
|
|
4898
4910
|
type: optional(number),
|
|
@@ -4938,9 +4950,12 @@ var evalRunCommand2 = command({
|
|
|
4938
4950
|
}
|
|
4939
4951
|
} catch {
|
|
4940
4952
|
}
|
|
4953
|
+
const evalSetName = suite.metadata?.name?.trim() ?? "";
|
|
4954
|
+
const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
|
|
4941
4955
|
const testIds = [];
|
|
4942
4956
|
for (const test of tests) {
|
|
4943
|
-
const
|
|
4957
|
+
const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
|
|
4958
|
+
const testDir = join4(outDir, ...subpath);
|
|
4944
4959
|
await mkdir4(testDir, { recursive: true });
|
|
4945
4960
|
testIds.push(test.id);
|
|
4946
4961
|
const inputText = test.question;
|
|
@@ -4979,6 +4994,7 @@ var evalRunCommand2 = command({
|
|
|
4979
4994
|
}
|
|
4980
4995
|
await writeJson2(join4(outDir, "manifest.json"), {
|
|
4981
4996
|
eval_file: resolvedEvalPath,
|
|
4997
|
+
eval_set: evalSetName || void 0,
|
|
4982
4998
|
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
4983
4999
|
target: { name: targetName, kind: targetKind },
|
|
4984
5000
|
test_ids: testIds
|
|
@@ -4993,7 +5009,8 @@ var evalRunCommand2 = command({
|
|
|
4993
5009
|
const maxWorkers = workers ?? testIds.length;
|
|
4994
5010
|
console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
|
|
4995
5011
|
const invokeTarget = async (testId) => {
|
|
4996
|
-
const
|
|
5012
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5013
|
+
const testDir = join4(outDir, ...subpath);
|
|
4997
5014
|
const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
|
|
4998
5015
|
if (invoke.kind !== "cli") return;
|
|
4999
5016
|
const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
|
|
@@ -5061,12 +5078,13 @@ var evalRunCommand2 = command({
|
|
|
5061
5078
|
}
|
|
5062
5079
|
await Promise.all(pending);
|
|
5063
5080
|
} else {
|
|
5064
|
-
console.log("
|
|
5081
|
+
console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
|
|
5065
5082
|
}
|
|
5066
5083
|
let totalGraders = 0;
|
|
5067
5084
|
let totalPassed = 0;
|
|
5068
5085
|
for (const testId of testIds) {
|
|
5069
|
-
const
|
|
5086
|
+
const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
|
|
5087
|
+
const testDir = join4(outDir, ...subpath);
|
|
5070
5088
|
const codeGradersDir = join4(testDir, "code_graders");
|
|
5071
5089
|
const resultsDir = join4(testDir, "code_grader_results");
|
|
5072
5090
|
let graderFiles;
|
|
@@ -5684,9 +5702,11 @@ function patchTestIds(results) {
|
|
|
5684
5702
|
// src/commands/results/export.ts
|
|
5685
5703
|
function deriveOutputDir(cwd, sourceFile) {
|
|
5686
5704
|
const parentDir = path7.basename(path7.dirname(sourceFile));
|
|
5705
|
+
if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) {
|
|
5706
|
+
return path7.join(cwd, ".agentv", "results", "export", parentDir);
|
|
5707
|
+
}
|
|
5687
5708
|
if (parentDir.startsWith("eval_")) {
|
|
5688
|
-
|
|
5689
|
-
return path7.join(cwd, ".agentv", "results", "export", dirName2);
|
|
5709
|
+
return path7.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
|
|
5690
5710
|
}
|
|
5691
5711
|
const basename = path7.basename(sourceFile, ".jsonl");
|
|
5692
5712
|
const dirName = basename.startsWith("eval_") ? basename.slice(5) : basename;
|
|
@@ -5939,10 +5959,12 @@ function checkDirectoryNaming(runDir) {
|
|
|
5939
5959
|
message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
|
|
5940
5960
|
});
|
|
5941
5961
|
}
|
|
5942
|
-
|
|
5962
|
+
const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
|
|
5963
|
+
const isLegacyFormat = /^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
|
|
5964
|
+
if (!isNewFormat && !isLegacyFormat) {
|
|
5943
5965
|
diagnostics.push({
|
|
5944
5966
|
severity: "warning",
|
|
5945
|
-
message: `Directory name '${dirName}' does not match the expected pattern '
|
|
5967
|
+
message: `Directory name '${dirName}' does not match the expected pattern '<ISO-timestamp>'. Example: 2026-03-27T12-42-24-429Z`
|
|
5946
5968
|
});
|
|
5947
5969
|
}
|
|
5948
5970
|
return diagnostics;
|
|
@@ -8525,4 +8547,4 @@ export {
|
|
|
8525
8547
|
preprocessArgv,
|
|
8526
8548
|
runCli
|
|
8527
8549
|
};
|
|
8528
|
-
//# sourceMappingURL=chunk-3NLBBQX6.js.map
|
|
8550
|
+
//# sourceMappingURL=chunk-CQRWNXVG.js.map
|