npm - agentv - Versions diffs - 2.18.2 → 2.18.4 - Mend

agentv 2.18.2 → 2.18.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +11 -2
package/dist/{chunk-ZCUOH72A.js → chunk-KSUL3F3R.js} +6 -6
package/dist/chunk-KSUL3F3R.js.map +1 -0
package/dist/{chunk-PJBBVLLB.js → chunk-RMUVJ44Z.js} +130 -64
package/dist/chunk-RMUVJ44Z.js.map +1 -0
package/dist/{chunk-ID5SDIYE.js → chunk-YTHTGLMT.js} +20 -2
package/dist/chunk-YTHTGLMT.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-CRELSNY6.js → dist-EDQZMZH2.js} +2 -2
package/dist/index.js +3 -3
package/dist/{interactive-RJTBLMVF.js → interactive-J4IBXJF7.js} +3 -3
package/dist/templates/.agentv/config.yaml +5 -0
package/dist/templates/.agentv/targets.yaml +7 -7
package/package.json +1 -1
package/dist/chunk-ID5SDIYE.js.map +0 -1
package/dist/chunk-PJBBVLLB.js.map +0 -1
package/dist/chunk-ZCUOH72A.js.map +0 -1
package/dist/{dist-CRELSNY6.js.map → dist-EDQZMZH2.js.map} +0 -0
package/dist/{interactive-RJTBLMVF.js.map → interactive-J4IBXJF7.js.map} +0 -0

package/dist/{chunk-PJBBVLLB.js → chunk-RMUVJ44Z.js} RENAMED

@@ -11,7 +11,7 @@ import {
   validateEvalFile,
   validateFileReferences,
   validateTargetsFile
-} from "./chunk-ZCUOH72A.js";
+} from "./chunk-KSUL3F3R.js";
 import {
   assembleLlmJudgePrompt,
   buildPromptInputs,
@@ -27,7 +27,7 @@ import {
   toCamelCaseDeep,
   toSnakeCaseDeep as toSnakeCaseDeep2,
   trimBaselineResult
-} from "./chunk-ID5SDIYE.js";
+} from "./chunk-YTHTGLMT.js";
 import {
   __commonJS,
   __esm,
@@ -3771,6 +3771,129 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
 }
 // src/commands/eval/commands/prompt/overview.ts
+function getEvalMode() {
+  const mode = process.env.AGENTV_PROMPT_EVAL_MODE ?? "agent";
+  if (mode !== "agent" && mode !== "cli") {
+    throw new Error(`Invalid AGENTV_PROMPT_EVAL_MODE="${mode}". Valid values: agent, cli`);
+  }
+  return mode;
+}
+async function generateOverviewPrompt(evalPaths) {
+  const cwd = process.cwd();
+  const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
+  const repoRoot = await findRepoRoot(cwd);
+  const mode = getEvalMode();
+  const fileEntries = [];
+  for (const evalPath of resolvedPaths) {
+    const tests = await loadTests(evalPath, repoRoot);
+    fileEntries.push({ path: evalPath, tests });
+  }
+  const totalCases = fileEntries.reduce((sum, e) => sum + e.tests.length, 0);
+  if (mode === "cli") {
+    return generateCliModePrompt(fileEntries, totalCases);
+  }
+  return generateAgentModePrompt(fileEntries, totalCases);
+}
+function generateAgentModePrompt(fileEntries, totalCases) {
+  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-").slice(0, -1);
+  const lines = [
+    "# AgentV Eval Orchestration",
+    "",
+    "**Mode: agent** \u2014 You orchestrate the evaluation using agents. No API keys needed.",
+    "",
+    `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
+    "",
+    "## Setup",
+    "",
+    `- **Results file:** \`.agentv/results/eval_${timestamp}.jsonl\``,
+    "- **Temp answers:** `.agentv/tmp/`",
+    "",
+    "Ensure both directories exist before starting.",
+    "",
+    "## For each test case",
+    "",
+    "Run these two agents **sequentially**:",
+    "",
+    "### 1. Dispatch `eval-candidate` agent",
+    "",
+    "Parameters:",
+    "- `eval-path`: Path to the eval YAML file",
+    "- `test-id`: The test case ID",
+    "- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
+    "",
+    "The agent retrieves the task input, acts as the candidate LLM, and saves its response.",
+    "",
+    "### 2. Dispatch `eval-judge` agent (after candidate completes)",
+    "",
+    "Parameters:",
+    "- `eval-path`: Path to the eval YAML file",
+    "- `test-id`: The test case ID",
+    "- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
+    `- \`results-file\`: \`.agentv/results/eval_${timestamp}.jsonl\``,
+    "",
+    "The agent runs evaluators, scores the response, and appends results to the JSONL file.",
+    ""
+  ];
+  for (const { path: evalPath, tests } of fileEntries) {
+    lines.push(`## ${evalPath}`);
+    lines.push("");
+    for (const evalCase of tests) {
+      const evaluatorSummary = describeEvaluators(evalCase);
+      lines.push(`### ${evalCase.id}`);
+      lines.push(`Criteria: ${evalCase.criteria}`);
+      if (evaluatorSummary) {
+        lines.push(`Evaluators: ${evaluatorSummary}`);
+      }
+      lines.push("");
+      lines.push("**1. Dispatch `eval-candidate` agent:**");
+      lines.push(`- eval-path: \`${evalPath}\``);
+      lines.push(`- test-id: \`${evalCase.id}\``);
+      lines.push(`- answer-file: \`.agentv/tmp/eval_${evalCase.id}.txt\``);
+      lines.push("");
+      lines.push("**2. Dispatch `eval-judge` agent** (after candidate completes):");
+      lines.push(`- eval-path: \`${evalPath}\``);
+      lines.push(`- test-id: \`${evalCase.id}\``);
+      lines.push(`- answer-file: \`.agentv/tmp/eval_${evalCase.id}.txt\``);
+      lines.push(`- results-file: \`.agentv/results/eval_${timestamp}.jsonl\``);
+      lines.push("");
+    }
+  }
+  return lines.join("\n");
+}
+function generateCliModePrompt(fileEntries, totalCases) {
+  const evalPathArgs = fileEntries.map((e) => e.path).join(" ");
+  const lines = [
+    "# AgentV Eval Orchestration",
+    "",
+    "**Mode: cli** \u2014 Run the evaluation end-to-end using the CLI.",
+    "",
+    `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
+    "",
+    "## Run the evaluation",
+    "",
+    "```bash",
+    `agentv eval ${evalPathArgs}`,
+    "```",
+    "",
+    "Results are written to `.agentv/results/`. The output path is printed in the CLI output.",
+    "Parse the JSONL file for per-test scores, hits, and misses.",
+    ""
+  ];
+  for (const { path: evalPath, tests } of fileEntries) {
+    lines.push(`## ${evalPath}`);
+    lines.push("");
+    for (const evalCase of tests) {
+      const evaluatorSummary = describeEvaluators(evalCase);
+      lines.push(`### ${evalCase.id}`);
+      lines.push(`Criteria: ${evalCase.criteria}`);
+      if (evaluatorSummary) {
+        lines.push(`Evaluators: ${evaluatorSummary}`);
+      }
+      lines.push("");
+    }
+  }
+  return lines.join("\n");
+}
 var evalPromptOverviewCommand = command({
   name: "overview",
   description: "Output orchestration prompt for host agent to run evals",
@@ -3782,65 +3905,8 @@ var evalPromptOverviewCommand = command({
     })
   },
   handler: async (args) => {
-    const cwd = process.cwd();
-    const resolvedPaths = await resolveEvalPaths(args.evalPaths, cwd);
-    const repoRoot = await findRepoRoot(cwd);
-    const fileEntries = [];
-    for (const evalPath of resolvedPaths) {
-      const tests = await loadTests(evalPath, repoRoot);
-      fileEntries.push({ path: evalPath, tests });
-    }
-    const totalCases = fileEntries.reduce((sum, e) => sum + e.tests.length, 0);
-    const lines = [
-      "# AgentV Eval Orchestration",
-      "",
-      `You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}. For each case: get the task input, execute it, then judge the result.`,
-      "",
-      "## Step 1: Get Task Input",
-      "",
-      "Run `agentv prompt eval input <path> --test-id <id>` to get the task as JSON.",
-      "",
-      "The output contains:",
-      '- `input` \u2014 `[{role, content}]` array. Content segments are either `{type: "text", value: "..."}` or `{type: "file", path: "/absolute/path"}`. Read file segments from the filesystem.',
-      "- `guideline_paths` \u2014 files containing additional instructions to prepend to the system message (may be empty). Read these from the filesystem.",
-      "- `criteria` \u2014 what a good answer should accomplish (for your reference, do not leak to the agent being tested)",
-      "",
-      "## Step 2: Execute the Task",
-      "",
-      "Send the prompt to the agent/LLM being evaluated. Save the complete response text to a file.",
-      "",
-      "## Step 3: Judge the Result",
-      "",
-      "Run `agentv prompt eval judge <path> --test-id <id> --answer-file <response-file>`.",
-      "",
-      "The output contains an `evaluators` array. Each evaluator has a `status`:",
-      "",
-      '- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
-      '- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
-      "  `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
-      ""
-    ];
-    for (const { path: evalPath, tests } of fileEntries) {
-      lines.push(`## ${evalPath}`);
-      lines.push("");
-      for (const evalCase of tests) {
-        const evaluatorSummary = describeEvaluators(evalCase);
-        lines.push(`### ${evalCase.id}`);
-        lines.push(`Criteria: ${evalCase.criteria}`);
-        if (evaluatorSummary) {
-          lines.push(`Evaluators: ${evaluatorSummary}`);
-        }
-        lines.push("");
-        lines.push("```bash");
-        lines.push(`agentv prompt eval input ${evalPath} --test-id ${evalCase.id}`);
-        lines.push(
-          `agentv prompt eval judge ${evalPath} --test-id ${evalCase.id} --answer-file <response-file>`
-        );
-        lines.push("```");
-        lines.push("");
-      }
-    }
-    process.stdout.write(lines.join("\n"));
+    const output = await generateOverviewPrompt(args.evalPaths);
+    process.stdout.write(output);
   }
 });
 function describeEvaluators(evalCase) {
@@ -3938,7 +4004,7 @@ var evalRunCommand = command({
     agentTimeout: option({
       type: optional(number),
       long: "agent-timeout",
-      description: "Timeout in seconds for provider responses (default: 120)"
+      description: "Optional top-level evaluation timeout in seconds. Unset by default."
     }),
     maxRetries: option({
       type: optional(number),
@@ -4006,7 +4072,7 @@ var evalRunCommand = command({
   },
   handler: async (args) => {
     if (args.evalPaths.length === 0 && process.stdin.isTTY) {
-      const { launchInteractiveWizard } = await import("./interactive-RJTBLMVF.js");
+      const { launchInteractiveWizard } = await import("./interactive-J4IBXJF7.js");
       await launchInteractiveWizard();
       return;
     }
@@ -5885,4 +5951,4 @@ export {
   preprocessArgv,
   runCli
 };
-//# sourceMappingURL=chunk-PJBBVLLB.js.map
+//# sourceMappingURL=chunk-RMUVJ44Z.js.map