agentv 3.7.0 → 3.9.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
@@ -16,7 +16,7 @@ import {
16
16
  validateEvalFile,
17
17
  validateFileReferences,
18
18
  validateTargetsFile
19
- } from "./chunk-7YS6YNJZ.js";
19
+ } from "./chunk-GC5P5HHZ.js";
20
20
  import {
21
21
  createBuiltinRegistry,
22
22
  createProvider,
@@ -34,7 +34,7 @@ import {
34
34
  toSnakeCaseDeep as toSnakeCaseDeep2,
35
35
  transpileEvalYamlFile,
36
36
  trimBaselineResult
37
- } from "./chunk-XGG64VIY.js";
37
+ } from "./chunk-TXDPYXHY.js";
38
38
  import {
39
39
  __commonJS,
40
40
  __esm,
@@ -3714,7 +3714,6 @@ async function getPromptEvalInput(evalPath, testId) {
3714
3714
  return {
3715
3715
  test_id: evalCase.id,
3716
3716
  input: resolveMessages(evalCase.input, fileMap),
3717
- guideline_paths: evalCase.guideline_paths,
3718
3717
  criteria: evalCase.criteria
3719
3718
  };
3720
3719
  }
@@ -3739,9 +3738,8 @@ async function getPromptEvalGradingBrief(evalPath, testId) {
3739
3738
  if (inputText) {
3740
3739
  lines.push(`Input: "${inputText}"`);
3741
3740
  }
3742
- const filePaths = evalCase.file_paths.filter((p) => !evalCase.guideline_paths.includes(p));
3743
- if (filePaths.length > 0) {
3744
- lines.push(`Files: ${filePaths.join(", ")}`);
3741
+ if (evalCase.file_paths.length > 0) {
3742
+ lines.push(`Files: ${evalCase.file_paths.join(", ")}`);
3745
3743
  }
3746
3744
  if (evalCase.reference_answer) {
3747
3745
  lines.push(`Expected: "${evalCase.reference_answer}"`);
@@ -3973,7 +3971,6 @@ var evalAssertCommand = command({
3973
3971
  criteria: "",
3974
3972
  expected_output: [],
3975
3973
  reference_answer: "",
3976
- guideline_files: [],
3977
3974
  input_files: [],
3978
3975
  trace: null,
3979
3976
  token_usage: null,
@@ -4180,11 +4177,16 @@ var evalRunCommand = command({
4180
4177
  type: optional(string),
4181
4178
  long: "model",
4182
4179
  description: 'Override model for the grader target (e.g., "openai:gpt-5-mini")'
4180
+ }),
4181
+ outputMessages: option({
4182
+ type: optional(string),
4183
+ long: "output-messages",
4184
+ description: 'Number of trailing messages to include in results output (default: 1, or "all")'
4183
4185
  })
4184
4186
  },
4185
4187
  handler: async (args) => {
4186
4188
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4187
- const { launchInteractiveWizard } = await import("./interactive-F6XECJ33.js");
4189
+ const { launchInteractiveWizard } = await import("./interactive-3VTDK5NX.js");
4188
4190
  await launchInteractiveWizard();
4189
4191
  return;
4190
4192
  }
@@ -4220,7 +4222,8 @@ var evalRunCommand = command({
4220
4222
  benchmarkJson: args.benchmarkJson,
4221
4223
  artifacts: args.artifacts,
4222
4224
  graderTarget: args.graderTarget,
4223
- model: args.model
4225
+ model: args.model,
4226
+ outputMessages: args.outputMessages
4224
4227
  };
4225
4228
  await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
4226
4229
  }
@@ -4706,10 +4709,9 @@ function exportResults(sourceFile, content, outputDir) {
4706
4709
  const outputsDir = path8.join(outputDir, "outputs");
4707
4710
  mkdirSync2(outputsDir, { recursive: true });
4708
4711
  for (const result of patched) {
4709
- const outputText = result.outputText;
4710
- if (outputText) {
4712
+ if (result.output && result.output.length > 0) {
4711
4713
  const id = safeTestId(result);
4712
- writeFileSync3(path8.join(outputsDir, `${id}.txt`), outputText);
4714
+ writeFileSync3(path8.join(outputsDir, `${id}.txt`), JSON.stringify(result.output, null, 2));
4713
4715
  }
4714
4716
  }
4715
4717
  }
@@ -5021,7 +5023,6 @@ function toTraceSummary(raw) {
5021
5023
  return toCamelCaseDeep(raw.trace);
5022
5024
  }
5023
5025
  function extractCandidate(raw) {
5024
- if (raw.output_text !== void 0) return raw.output_text;
5025
5026
  if (raw.output !== void 0)
5026
5027
  return typeof raw.output === "string" ? raw.output : JSON.stringify(raw.output);
5027
5028
  return "";
@@ -5033,7 +5034,6 @@ function buildEvalTest(raw) {
5033
5034
  input: [],
5034
5035
  input_segments: [],
5035
5036
  expected_output: [],
5036
- guideline_paths: [],
5037
5037
  file_paths: [],
5038
5038
  criteria: ""
5039
5039
  };
@@ -5071,7 +5071,7 @@ async function runScore(results, evaluatorConfig, testIdFilter) {
5071
5071
  target: { kind: "custom", name: raw.target ?? "unknown", config: {} },
5072
5072
  provider: stubProvider,
5073
5073
  attempt: 1,
5074
- promptInputs: { question: "", guidelines: "" },
5074
+ promptInputs: { question: "" },
5075
5075
  now: /* @__PURE__ */ new Date(),
5076
5076
  output: Array.isArray(output) ? output : void 0,
5077
5077
  trace,
@@ -5325,7 +5325,7 @@ function formatResultDetail(result, index, tree) {
5325
5325
  }
5326
5326
  const scoreColor = result.score >= 0.9 ? c2.green : result.score >= 0.5 ? c2.yellow : c2.red;
5327
5327
  lines.push(
5328
- `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.dataset ? ` ${c2.dim}dataset: ${result.dataset}${c2.reset}` : ""}`
5328
+ `${c2.bold}${testId}${c2.reset} ${scoreColor}${formatScore(result.score)}${c2.reset}${result.target ? ` ${c2.dim}target: ${result.target}${c2.reset}` : ""}${result.eval_set ? ` ${c2.dim}eval-set: ${result.eval_set}${c2.reset}` : ""}`
5329
5329
  );
5330
5330
  if (result.error) {
5331
5331
  lines.push(` ${c2.red}Error: ${result.error}${c2.reset}`);
@@ -5499,8 +5499,8 @@ function groupResults(results, groupBy2) {
5499
5499
  case "target":
5500
5500
  key = result.target ?? "unknown";
5501
5501
  break;
5502
- case "dataset":
5503
- key = result.dataset ?? "unknown";
5502
+ case "eval-set":
5503
+ key = result.eval_set ?? "unknown";
5504
5504
  break;
5505
5505
  case "test-id":
5506
5506
  key = result.test_id ?? result.eval_id ?? "unknown";
@@ -5582,10 +5582,10 @@ var traceStatsCommand = command({
5582
5582
  description: "Path to JSONL result file"
5583
5583
  }),
5584
5584
  groupBy: option({
5585
- type: optional(oneOf(["target", "dataset", "test-id"])),
5585
+ type: optional(oneOf(["target", "eval-set", "test-id"])),
5586
5586
  long: "group-by",
5587
5587
  short: "g",
5588
- description: "Group statistics by: target, dataset, or test-id"
5588
+ description: "Group statistics by: target, eval-set, or test-id"
5589
5589
  }),
5590
5590
  format: option({
5591
5591
  type: optional(oneOf(["table", "json"])),
@@ -6276,4 +6276,4 @@ export {
6276
6276
  preprocessArgv,
6277
6277
  runCli
6278
6278
  };
6279
- //# sourceMappingURL=chunk-TR6H437M.js.map
6279
+ //# sourceMappingURL=chunk-Q2YWV4QM.js.map