agentv 2.18.2 → 2.18.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ import {
11
11
  validateEvalFile,
12
12
  validateFileReferences,
13
13
  validateTargetsFile
14
- } from "./chunk-ZCUOH72A.js";
14
+ } from "./chunk-KSUL3F3R.js";
15
15
  import {
16
16
  assembleLlmJudgePrompt,
17
17
  buildPromptInputs,
@@ -27,7 +27,7 @@ import {
27
27
  toCamelCaseDeep,
28
28
  toSnakeCaseDeep as toSnakeCaseDeep2,
29
29
  trimBaselineResult
30
- } from "./chunk-ID5SDIYE.js";
30
+ } from "./chunk-YTHTGLMT.js";
31
31
  import {
32
32
  __commonJS,
33
33
  __esm,
@@ -3771,6 +3771,129 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
3771
3771
  }
3772
3772
 
3773
3773
  // src/commands/eval/commands/prompt/overview.ts
3774
/**
 * Resolve the prompt-eval orchestration mode from the environment.
 *
 * Reads `AGENTV_PROMPT_EVAL_MODE`. The value is trimmed and lower-cased before
 * validation, and an unset or blank variable falls back to `"agent"`.
 *
 * @returns {"agent" | "cli"} The selected mode.
 * @throws {Error} If the variable holds anything other than `agent` or `cli`.
 */
function getEvalMode() {
  const raw = process.env.AGENTV_PROMPT_EVAL_MODE;
  // `||` (not `??`) on purpose: an exported-but-empty variable should behave
  // like an unset one instead of being rejected as the invalid value "".
  const mode = raw?.trim().toLowerCase() || "agent";
  if (mode !== "agent" && mode !== "cli") {
    // Report the value exactly as the user supplied it.
    throw new Error(`Invalid AGENTV_PROMPT_EVAL_MODE="${raw}". Valid values: agent, cli`);
  }
  return mode;
}
3781
/**
 * Build the orchestration prompt for the given eval paths.
 *
 * Resolves the eval paths against the current working directory, loads the
 * tests of every resolved file (in order), and renders the prompt for the
 * mode returned by `getEvalMode()`.
 *
 * @param {string[]} evalPaths - Eval paths as passed on the command line.
 * @returns {Promise<string>} The rendered prompt text.
 * @throws Rejects with whatever `getEvalMode`, `resolveEvalPaths`,
 *   `findRepoRoot` or `loadTests` throw.
 */
async function generateOverviewPrompt(evalPaths) {
  // Validate the mode before touching the filesystem so a bad environment
  // setting is reported immediately rather than after every file was loaded.
  const mode = getEvalMode();

  const cwd = process.cwd();
  const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
  const repoRoot = await findRepoRoot(cwd);

  // Files are loaded one after another, keeping the resolved order.
  const fileEntries = [];
  for (const evalPath of resolvedPaths) {
    const tests = await loadTests(evalPath, repoRoot);
    fileEntries.push({ path: evalPath, tests });
  }

  const totalCases = fileEntries.reduce((sum, entry) => sum + entry.tests.length, 0);
  return mode === "cli"
    ? generateCliModePrompt(fileEntries, totalCases)
    : generateAgentModePrompt(fileEntries, totalCases);
}
3797
/**
 * Render the "agent" mode orchestration prompt.
 *
 * The prompt has a generic header describing the two-agent workflow
 * (`eval-candidate` followed by `eval-judge`), then one section per eval file
 * listing every test case with the concrete parameters for both agents.
 *
 * @param {{ path: string, tests: { id: string, criteria: string }[] }[]} fileEntries
 *   Eval files with their loaded test cases.
 * @param {number} totalCases - Total number of test cases across all files.
 * @returns {string} The prompt as newline-joined Markdown.
 */
function generateAgentModePrompt(fileEntries, totalCases) {
  // ISO timestamp made file-name friendly: ":" and "." become "-", and the
  // trailing "Z" is dropped.
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-").slice(0, -1);
  // Single source of truth for the results path used throughout the prompt.
  const resultsFile = `.agentv/results/eval_${timestamp}.jsonl`;
  // Test IDs are free-form; path separators, whitespace, backticks and other
  // characters that are invalid in file names are replaced so the answer file
  // always stays a single, well-formed entry directly inside `.agentv/tmp/`.
  // IDs without such characters map to exactly the same path as before.
  const answerFileFor = (testId) =>
    `.agentv/tmp/eval_${String(testId).replace(/[\\/:*?"<>|`\s\u0000-\u001f]/g, "_")}.txt`;

  const lines = [
    "# AgentV Eval Orchestration",
    "",
    "**Mode: agent** \u2014 You orchestrate the evaluation using agents. No API keys needed.",
    "",
    `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
    "",
    "## Setup",
    "",
    `- **Results file:** \`${resultsFile}\``,
    "- **Temp answers:** `.agentv/tmp/`",
    "",
    "Ensure both directories exist before starting.",
    "",
    "## For each test case",
    "",
    "Run these two agents **sequentially**:",
    "",
    "### 1. Dispatch `eval-candidate` agent",
    "",
    "Parameters:",
    "- `eval-path`: Path to the eval YAML file",
    "- `test-id`: The test case ID",
    "- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
    "",
    "The agent retrieves the task input, acts as the candidate LLM, and saves its response.",
    "",
    "### 2. Dispatch `eval-judge` agent (after candidate completes)",
    "",
    "Parameters:",
    "- `eval-path`: Path to the eval YAML file",
    "- `test-id`: The test case ID",
    "- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
    `- \`results-file\`: \`${resultsFile}\``,
    "",
    "The agent runs evaluators, scores the response, and appends results to the JSONL file.",
    ""
  ];

  for (const { path: evalPath, tests } of fileEntries) {
    lines.push(`## ${evalPath}`, "");
    for (const evalCase of tests) {
      const evaluatorSummary = describeEvaluators(evalCase);
      const answerFile = answerFileFor(evalCase.id);

      lines.push(`### ${evalCase.id}`, `Criteria: ${evalCase.criteria}`);
      if (evaluatorSummary) {
        lines.push(`Evaluators: ${evaluatorSummary}`);
      }
      lines.push(
        "",
        "**1. Dispatch `eval-candidate` agent:**",
        `- eval-path: \`${evalPath}\``,
        `- test-id: \`${evalCase.id}\``,
        `- answer-file: \`${answerFile}\``,
        "",
        "**2. Dispatch `eval-judge` agent** (after candidate completes):",
        `- eval-path: \`${evalPath}\``,
        `- test-id: \`${evalCase.id}\``,
        `- answer-file: \`${answerFile}\``,
        `- results-file: \`${resultsFile}\``,
        ""
      );
    }
  }
  return lines.join("\n");
}
3863
/**
 * Render the "cli" mode orchestration prompt.
 *
 * The prompt contains a single `agentv eval …` command covering every eval
 * file, followed by one section per file listing its test cases for
 * reference.
 *
 * @param {{ path: string, tests: { id: string, criteria: string }[] }[]} fileEntries
 *   Eval files with their loaded test cases.
 * @param {number} totalCases - Total number of test cases across all files.
 * @returns {string} The prompt as newline-joined Markdown.
 */
function generateCliModePrompt(fileEntries, totalCases) {
  // The command is presented inside a ```bash fence, so each path must be a
  // single shell word. Paths made only of safe characters are emitted as-is;
  // anything else (spaces, quotes, backslashes, globs, …) is single-quoted,
  // with embedded single quotes written as '\''.
  const quoteForShell = (arg) =>
    /^[A-Za-z0-9_@%+=:,.\/-]+$/.test(arg) ? arg : `'${arg.replace(/'/g, "'\\''")}'`;
  const evalPathArgs = fileEntries.map((entry) => quoteForShell(String(entry.path))).join(" ");

  const lines = [
    "# AgentV Eval Orchestration",
    "",
    "**Mode: cli** \u2014 Run the evaluation end-to-end using the CLI.",
    "",
    `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
    "",
    "## Run the evaluation",
    "",
    "```bash",
    `agentv eval ${evalPathArgs}`,
    "```",
    "",
    "Results are written to `.agentv/results/`. The output path is printed in the CLI output.",
    "Parse the JSONL file for per-test scores, hits, and misses.",
    ""
  ];

  for (const { path: evalPath, tests } of fileEntries) {
    lines.push(`## ${evalPath}`, "");
    for (const evalCase of tests) {
      const evaluatorSummary = describeEvaluators(evalCase);
      lines.push(`### ${evalCase.id}`, `Criteria: ${evalCase.criteria}`);
      if (evaluatorSummary) {
        lines.push(`Evaluators: ${evaluatorSummary}`);
      }
      lines.push("");
    }
  }
  return lines.join("\n");
}
3774
3897
  var evalPromptOverviewCommand = command({
3775
3898
  name: "overview",
3776
3899
  description: "Output orchestration prompt for host agent to run evals",
@@ -3782,65 +3905,8 @@ var evalPromptOverviewCommand = command({
3782
3905
  })
3783
3906
  },
3784
3907
  handler: async (args) => {
3785
- const cwd = process.cwd();
3786
- const resolvedPaths = await resolveEvalPaths(args.evalPaths, cwd);
3787
- const repoRoot = await findRepoRoot(cwd);
3788
- const fileEntries = [];
3789
- for (const evalPath of resolvedPaths) {
3790
- const tests = await loadTests(evalPath, repoRoot);
3791
- fileEntries.push({ path: evalPath, tests });
3792
- }
3793
- const totalCases = fileEntries.reduce((sum, e) => sum + e.tests.length, 0);
3794
- const lines = [
3795
- "# AgentV Eval Orchestration",
3796
- "",
3797
- `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}. For each case: get the task input, execute it, then judge the result.`,
3798
- "",
3799
- "## Step 1: Get Task Input",
3800
- "",
3801
- "Run `agentv prompt eval input <path> --test-id <id>` to get the task as JSON.",
3802
- "",
3803
- "The output contains:",
3804
- '- `input` \u2014 `[{role, content}]` array. Content segments are either `{type: "text", value: "..."}` or `{type: "file", path: "/absolute/path"}`. Read file segments from the filesystem.',
3805
- "- `guideline_paths` \u2014 files containing additional instructions to prepend to the system message (may be empty). Read these from the filesystem.",
3806
- "- `criteria` \u2014 what a good answer should accomplish (for your reference, do not leak to the agent being tested)",
3807
- "",
3808
- "## Step 2: Execute the Task",
3809
- "",
3810
- "Send the prompt to the agent/LLM being evaluated. Save the complete response text to a file.",
3811
- "",
3812
- "## Step 3: Judge the Result",
3813
- "",
3814
- "Run `agentv prompt eval judge <path> --test-id <id> --answer-file <response-file>`.",
3815
- "",
3816
- "The output contains an `evaluators` array. Each evaluator has a `status`:",
3817
- "",
3818
- '- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
3819
- '- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
3820
- " `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
3821
- ""
3822
- ];
3823
- for (const { path: evalPath, tests } of fileEntries) {
3824
- lines.push(`## ${evalPath}`);
3825
- lines.push("");
3826
- for (const evalCase of tests) {
3827
- const evaluatorSummary = describeEvaluators(evalCase);
3828
- lines.push(`### ${evalCase.id}`);
3829
- lines.push(`Criteria: ${evalCase.criteria}`);
3830
- if (evaluatorSummary) {
3831
- lines.push(`Evaluators: ${evaluatorSummary}`);
3832
- }
3833
- lines.push("");
3834
- lines.push("```bash");
3835
- lines.push(`agentv prompt eval input ${evalPath} --test-id ${evalCase.id}`);
3836
- lines.push(
3837
- `agentv prompt eval judge ${evalPath} --test-id ${evalCase.id} --answer-file <response-file>`
3838
- );
3839
- lines.push("```");
3840
- lines.push("");
3841
- }
3842
- }
3843
- process.stdout.write(lines.join("\n"));
3908
+ const output = await generateOverviewPrompt(args.evalPaths);
3909
+ process.stdout.write(output);
3844
3910
  }
3845
3911
  });
3846
3912
  function describeEvaluators(evalCase) {
@@ -3938,7 +4004,7 @@ var evalRunCommand = command({
3938
4004
  agentTimeout: option({
3939
4005
  type: optional(number),
3940
4006
  long: "agent-timeout",
3941
- description: "Timeout in seconds for provider responses (default: 120)"
4007
+ description: "Optional top-level evaluation timeout in seconds. Unset by default."
3942
4008
  }),
3943
4009
  maxRetries: option({
3944
4010
  type: optional(number),
@@ -4006,7 +4072,7 @@ var evalRunCommand = command({
4006
4072
  },
4007
4073
  handler: async (args) => {
4008
4074
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4009
- const { launchInteractiveWizard } = await import("./interactive-RJTBLMVF.js");
4075
+ const { launchInteractiveWizard } = await import("./interactive-J4IBXJF7.js");
4010
4076
  await launchInteractiveWizard();
4011
4077
  return;
4012
4078
  }
@@ -5885,4 +5951,4 @@ export {
5885
5951
  preprocessArgv,
5886
5952
  runCli
5887
5953
  };
5888
- //# sourceMappingURL=chunk-PJBBVLLB.js.map
5954
+ //# sourceMappingURL=chunk-RMUVJ44Z.js.map