agentv 3.14.5 → 3.14.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -23,7 +23,7 @@ import {
23
23
  validateFileReferences,
24
24
  validateTargetsFile,
25
25
  writeArtifactsFromResults
26
- } from "./chunk-GUXXTOYK.js";
26
+ } from "./chunk-Y25VL7PX.js";
27
27
  import {
28
28
  createBuiltinRegistry,
29
29
  executeScript,
@@ -4186,7 +4186,7 @@ var evalRunCommand = command({
4186
4186
  },
4187
4187
  handler: async (args) => {
4188
4188
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4189
- const { launchInteractiveWizard } = await import("./interactive-WUIEXGWM.js");
4189
+ const { launchInteractiveWizard } = await import("./interactive-5ESM5DWV.js");
4190
4190
  await launchInteractiveWizard();
4191
4191
  return;
4192
4192
  }
@@ -4421,6 +4421,8 @@ var evalBenchCommand = command({
4421
4421
  const manifest = JSON.parse(await readFile(join(exportDir, "manifest.json"), "utf8"));
4422
4422
  const testIds = manifest.test_ids;
4423
4423
  const targetName = manifest.target?.name ?? "unknown";
4424
+ const evalSet = manifest.eval_set ?? "";
4425
+ const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4424
4426
  let stdinData;
4425
4427
  if (llmScoresPath) {
4426
4428
  stdinData = await readFile(llmScoresPath, "utf8");
@@ -4431,7 +4433,9 @@ var evalBenchCommand = command({
4431
4433
  const indexLines = [];
4432
4434
  const allPassRates = [];
4433
4435
  for (const testId of testIds) {
4434
- const testDir = join(exportDir, testId);
4436
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
4437
+ const testDir = join(exportDir, ...subpath);
4438
+ const artifactSubdir = subpath.join("/");
4435
4439
  const evaluators = [];
4436
4440
  const allAssertions = [];
4437
4441
  const codeResultsDir = join(testDir, "code_grader_results");
@@ -4527,13 +4531,14 @@ var evalBenchCommand = command({
4527
4531
  JSON.stringify({
4528
4532
  timestamp: manifest.timestamp,
4529
4533
  test_id: testId,
4534
+ eval_set: evalSet || void 0,
4530
4535
  score: Math.round(weightedScore * 1e3) / 1e3,
4531
4536
  target: targetName,
4532
4537
  scores,
4533
4538
  execution_status: executionStatus,
4534
- grading_path: `${testId}/grading.json`,
4535
- timing_path: `${testId}/timing.json`,
4536
- response_path: hasResponse ? `${testId}/response.md` : null
4539
+ grading_path: `${artifactSubdir}/grading.json`,
4540
+ timing_path: `${artifactSubdir}/timing.json`,
4541
+ response_path: hasResponse ? `${artifactSubdir}/response.md` : void 0
4537
4542
  })
4538
4543
  );
4539
4544
  }
@@ -4603,10 +4608,13 @@ var evalGradeCommand = command({
4603
4608
  const manifestPath = join2(exportDir, "manifest.json");
4604
4609
  const manifest = JSON.parse(await readFile2(manifestPath, "utf8"));
4605
4610
  const testIds = manifest.test_ids;
4611
+ const evalSet = manifest.eval_set ?? "";
4612
+ const safeEvalSet = evalSet ? evalSet.replace(/[\/\\:*?"<>|]/g, "_") : "";
4606
4613
  let totalGraders = 0;
4607
4614
  let totalPassed = 0;
4608
4615
  for (const testId of testIds) {
4609
- const testDir = join2(exportDir, testId);
4616
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
4617
+ const testDir = join2(exportDir, ...subpath);
4610
4618
  const codeGradersDir = join2(testDir, "code_graders");
4611
4619
  const resultsDir = join2(testDir, "code_grader_results");
4612
4620
  let graderFiles;
@@ -4701,7 +4709,7 @@ import { mkdir as mkdir3, writeFile as writeFile4 } from "node:fs/promises";
4701
4709
  import { dirname, join as join3, resolve } from "node:path";
4702
4710
  var evalInputCommand = command({
4703
4711
  name: "input",
4704
- description: "Extract eval inputs, target commands, and grader prompts for agent-mode runs",
4712
+ description: "Extract eval inputs, target commands, and grader prompts for subagent-mode runs",
4705
4713
  args: {
4706
4714
  evalPath: positional({
4707
4715
  type: string,
@@ -4711,7 +4719,7 @@ var evalInputCommand = command({
4711
4719
  out: option({
4712
4720
  type: optional(string),
4713
4721
  long: "out",
4714
- description: "Output directory for extracted inputs (default: .agentv/results/runs/eval_<timestamp>)"
4722
+ description: "Output directory for extracted inputs (default: .agentv/results/runs/<timestamp>)"
4715
4723
  })
4716
4724
  },
4717
4725
  handler: async ({ evalPath, out }) => {
@@ -4752,9 +4760,12 @@ var evalInputCommand = command({
4752
4760
  }
4753
4761
  } catch {
4754
4762
  }
4763
+ const evalSetName = suite.metadata?.name?.trim() ?? "";
4764
+ const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
4755
4765
  const testIds = [];
4756
4766
  for (const test of tests) {
4757
- const testDir = join3(outDir, test.id);
4767
+ const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
4768
+ const testDir = join3(outDir, ...subpath);
4758
4769
  await mkdir3(testDir, { recursive: true });
4759
4770
  testIds.push(test.id);
4760
4771
  const inputText = test.question;
@@ -4793,6 +4804,7 @@ var evalInputCommand = command({
4793
4804
  }
4794
4805
  await writeJson(join3(outDir, "manifest.json"), {
4795
4806
  eval_file: resolvedEvalPath,
4807
+ eval_set: evalSetName || void 0,
4796
4808
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4797
4809
  target: {
4798
4810
  name: targetName,
@@ -4892,7 +4904,7 @@ var evalRunCommand2 = command({
4892
4904
  out: option({
4893
4905
  type: optional(string),
4894
4906
  long: "out",
4895
- description: "Output directory for results (default: .agentv/results/runs/eval_<timestamp>)"
4907
+ description: "Output directory for results (default: .agentv/results/runs/<timestamp>)"
4896
4908
  }),
4897
4909
  workers: option({
4898
4910
  type: optional(number),
@@ -4938,9 +4950,12 @@ var evalRunCommand2 = command({
4938
4950
  }
4939
4951
  } catch {
4940
4952
  }
4953
+ const evalSetName = suite.metadata?.name?.trim() ?? "";
4954
+ const safeEvalSet = evalSetName ? evalSetName.replace(/[\/\\:*?"<>|]/g, "_") : "";
4941
4955
  const testIds = [];
4942
4956
  for (const test of tests) {
4943
- const testDir = join4(outDir, test.id);
4957
+ const subpath = safeEvalSet ? [safeEvalSet, test.id] : [test.id];
4958
+ const testDir = join4(outDir, ...subpath);
4944
4959
  await mkdir4(testDir, { recursive: true });
4945
4960
  testIds.push(test.id);
4946
4961
  const inputText = test.question;
@@ -4979,6 +4994,7 @@ var evalRunCommand2 = command({
4979
4994
  }
4980
4995
  await writeJson2(join4(outDir, "manifest.json"), {
4981
4996
  eval_file: resolvedEvalPath,
4997
+ eval_set: evalSetName || void 0,
4982
4998
  timestamp: (/* @__PURE__ */ new Date()).toISOString(),
4983
4999
  target: { name: targetName, kind: targetKind },
4984
5000
  test_ids: testIds
@@ -4993,7 +5009,8 @@ var evalRunCommand2 = command({
4993
5009
  const maxWorkers = workers ?? testIds.length;
4994
5010
  console.log(`Invoking ${testIds.length} CLI target(s) (${maxWorkers} workers)...`);
4995
5011
  const invokeTarget = async (testId) => {
4996
- const testDir = join4(outDir, testId);
5012
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
5013
+ const testDir = join4(outDir, ...subpath);
4997
5014
  const invoke = JSON.parse(await readFile4(join4(testDir, "invoke.json"), "utf8"));
4998
5015
  if (invoke.kind !== "cli") return;
4999
5016
  const inputData = JSON.parse(await readFile4(join4(testDir, "input.json"), "utf8"));
@@ -5061,12 +5078,13 @@ var evalRunCommand2 = command({
5061
5078
  }
5062
5079
  await Promise.all(pending);
5063
5080
  } else {
5064
- console.log("Agent-as-target mode \u2014 skipping CLI invocation.");
5081
+ console.log("Subagent-as-target mode \u2014 skipping CLI invocation.");
5065
5082
  }
5066
5083
  let totalGraders = 0;
5067
5084
  let totalPassed = 0;
5068
5085
  for (const testId of testIds) {
5069
- const testDir = join4(outDir, testId);
5086
+ const subpath = safeEvalSet ? [safeEvalSet, testId] : [testId];
5087
+ const testDir = join4(outDir, ...subpath);
5070
5088
  const codeGradersDir = join4(testDir, "code_graders");
5071
5089
  const resultsDir = join4(testDir, "code_grader_results");
5072
5090
  let graderFiles;
@@ -5684,9 +5702,11 @@ function patchTestIds(results) {
5684
5702
  // src/commands/results/export.ts
5685
5703
  function deriveOutputDir(cwd, sourceFile) {
5686
5704
  const parentDir = path7.basename(path7.dirname(sourceFile));
5705
+ if (/^\d{4}-\d{2}-\d{2}T/.test(parentDir)) {
5706
+ return path7.join(cwd, ".agentv", "results", "export", parentDir);
5707
+ }
5687
5708
  if (parentDir.startsWith("eval_")) {
5688
- const dirName2 = parentDir.slice(5);
5689
- return path7.join(cwd, ".agentv", "results", "export", dirName2);
5709
+ return path7.join(cwd, ".agentv", "results", "export", parentDir.slice(5));
5690
5710
  }
5691
5711
  const basename = path7.basename(sourceFile, ".jsonl");
5692
5712
  const dirName = basename.startsWith("eval_") ? basename.slice(5) : basename;
@@ -5939,10 +5959,12 @@ function checkDirectoryNaming(runDir) {
5939
5959
  message: `Directory is not under a 'runs/' parent (found '${parentName}/'). Expected: .agentv/results/runs/<run-dir>`
5940
5960
  });
5941
5961
  }
5942
- if (!/^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName)) {
5962
+ const isNewFormat = /^\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
5963
+ const isLegacyFormat = /^eval_\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}-\d{3}Z$/.test(dirName);
5964
+ if (!isNewFormat && !isLegacyFormat) {
5943
5965
  diagnostics.push({
5944
5966
  severity: "warning",
5945
- message: `Directory name '${dirName}' does not match the expected pattern 'eval_<ISO-timestamp>'. Example: eval_2026-03-27T12-42-24-429Z`
5967
+ message: `Directory name '${dirName}' does not match the expected pattern '<ISO-timestamp>'. Example: 2026-03-27T12-42-24-429Z`
5946
5968
  });
5947
5969
  }
5948
5970
  return diagnostics;
@@ -8525,4 +8547,4 @@ export {
8525
8547
  preprocessArgv,
8526
8548
  runCli
8527
8549
  };
8528
- //# sourceMappingURL=chunk-UBLKP2F4.js.map
8550
+ //# sourceMappingURL=chunk-CQRWNXVG.js.map