agentv 2.18.3 → 2.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/{chunk-3WNB7XKU.js → chunk-4MSAOMCC.js} +4 -4
- package/dist/{chunk-3WNB7XKU.js.map → chunk-4MSAOMCC.js.map} +1 -1
- package/dist/{chunk-XKIJ4ATV.js → chunk-GC6T3RD4.js} +129 -63
- package/dist/chunk-GC6T3RD4.js.map +1 -0
- package/dist/{chunk-BM77B57R.js → chunk-XTYMR4I5.js} +53 -10
- package/dist/chunk-XTYMR4I5.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-4VCI5NDA.js → dist-MQBGD6LP.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-NEHIYZ2F.js → interactive-3TDBCSDW.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-BM77B57R.js.map +0 -1
- package/dist/chunk-XKIJ4ATV.js.map +0 -1
- /package/dist/{dist-4VCI5NDA.js.map → dist-MQBGD6LP.js.map} +0 -0
- /package/dist/{interactive-NEHIYZ2F.js.map → interactive-3TDBCSDW.js.map} +0 -0
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
validateEvalFile,
|
|
12
12
|
validateFileReferences,
|
|
13
13
|
validateTargetsFile
|
|
14
|
-
} from "./chunk-3WNB7XKU.js";
|
|
14
|
+
} from "./chunk-4MSAOMCC.js";
|
|
15
15
|
import {
|
|
16
16
|
assembleLlmJudgePrompt,
|
|
17
17
|
buildPromptInputs,
|
|
@@ -27,7 +27,7 @@ import {
|
|
|
27
27
|
toCamelCaseDeep,
|
|
28
28
|
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
29
29
|
trimBaselineResult
|
|
30
|
-
} from "./chunk-BM77B57R.js";
|
|
30
|
+
} from "./chunk-XTYMR4I5.js";
|
|
31
31
|
import {
|
|
32
32
|
__commonJS,
|
|
33
33
|
__esm,
|
|
@@ -3771,6 +3771,129 @@ async function processEvaluator(config, evalCase, candidate, promptInputs) {
|
|
|
3771
3771
|
}
|
|
3772
3772
|
|
|
3773
3773
|
// src/commands/eval/commands/prompt/overview.ts
|
|
3774
|
+
function getEvalMode() {
|
|
3775
|
+
const mode = process.env.AGENTV_PROMPT_EVAL_MODE ?? "agent";
|
|
3776
|
+
if (mode !== "agent" && mode !== "cli") {
|
|
3777
|
+
throw new Error(`Invalid AGENTV_PROMPT_EVAL_MODE="${mode}". Valid values: agent, cli`);
|
|
3778
|
+
}
|
|
3779
|
+
return mode;
|
|
3780
|
+
}
|
|
3781
|
+
async function generateOverviewPrompt(evalPaths) {
|
|
3782
|
+
const cwd = process.cwd();
|
|
3783
|
+
const resolvedPaths = await resolveEvalPaths(evalPaths, cwd);
|
|
3784
|
+
const repoRoot = await findRepoRoot(cwd);
|
|
3785
|
+
const mode = getEvalMode();
|
|
3786
|
+
const fileEntries = [];
|
|
3787
|
+
for (const evalPath of resolvedPaths) {
|
|
3788
|
+
const tests = await loadTests(evalPath, repoRoot);
|
|
3789
|
+
fileEntries.push({ path: evalPath, tests });
|
|
3790
|
+
}
|
|
3791
|
+
const totalCases = fileEntries.reduce((sum, e) => sum + e.tests.length, 0);
|
|
3792
|
+
if (mode === "cli") {
|
|
3793
|
+
return generateCliModePrompt(fileEntries, totalCases);
|
|
3794
|
+
}
|
|
3795
|
+
return generateAgentModePrompt(fileEntries, totalCases);
|
|
3796
|
+
}
|
|
3797
|
+
function generateAgentModePrompt(fileEntries, totalCases) {
|
|
3798
|
+
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-").slice(0, -1);
|
|
3799
|
+
const lines = [
|
|
3800
|
+
"# AgentV Eval Orchestration",
|
|
3801
|
+
"",
|
|
3802
|
+
"**Mode: agent** \u2014 You orchestrate the evaluation using agents. No API keys needed.",
|
|
3803
|
+
"",
|
|
3804
|
+
`You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
|
|
3805
|
+
"",
|
|
3806
|
+
"## Setup",
|
|
3807
|
+
"",
|
|
3808
|
+
`- **Results file:** \`.agentv/results/eval_${timestamp}.jsonl\``,
|
|
3809
|
+
"- **Temp answers:** `.agentv/tmp/`",
|
|
3810
|
+
"",
|
|
3811
|
+
"Ensure both directories exist before starting.",
|
|
3812
|
+
"",
|
|
3813
|
+
"## For each test case",
|
|
3814
|
+
"",
|
|
3815
|
+
"Run these two agents **sequentially**:",
|
|
3816
|
+
"",
|
|
3817
|
+
"### 1. Dispatch `eval-candidate` agent",
|
|
3818
|
+
"",
|
|
3819
|
+
"Parameters:",
|
|
3820
|
+
"- `eval-path`: Path to the eval YAML file",
|
|
3821
|
+
"- `test-id`: The test case ID",
|
|
3822
|
+
"- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
|
|
3823
|
+
"",
|
|
3824
|
+
"The agent retrieves the task input, acts as the candidate LLM, and saves its response.",
|
|
3825
|
+
"",
|
|
3826
|
+
"### 2. Dispatch `eval-judge` agent (after candidate completes)",
|
|
3827
|
+
"",
|
|
3828
|
+
"Parameters:",
|
|
3829
|
+
"- `eval-path`: Path to the eval YAML file",
|
|
3830
|
+
"- `test-id`: The test case ID",
|
|
3831
|
+
"- `answer-file`: `.agentv/tmp/eval_<test-id>.txt`",
|
|
3832
|
+
`- \`results-file\`: \`.agentv/results/eval_${timestamp}.jsonl\``,
|
|
3833
|
+
"",
|
|
3834
|
+
"The agent runs evaluators, scores the response, and appends results to the JSONL file.",
|
|
3835
|
+
""
|
|
3836
|
+
];
|
|
3837
|
+
for (const { path: evalPath, tests } of fileEntries) {
|
|
3838
|
+
lines.push(`## ${evalPath}`);
|
|
3839
|
+
lines.push("");
|
|
3840
|
+
for (const evalCase of tests) {
|
|
3841
|
+
const evaluatorSummary = describeEvaluators(evalCase);
|
|
3842
|
+
lines.push(`### ${evalCase.id}`);
|
|
3843
|
+
lines.push(`Criteria: ${evalCase.criteria}`);
|
|
3844
|
+
if (evaluatorSummary) {
|
|
3845
|
+
lines.push(`Evaluators: ${evaluatorSummary}`);
|
|
3846
|
+
}
|
|
3847
|
+
lines.push("");
|
|
3848
|
+
lines.push("**1. Dispatch `eval-candidate` agent:**");
|
|
3849
|
+
lines.push(`- eval-path: \`${evalPath}\``);
|
|
3850
|
+
lines.push(`- test-id: \`${evalCase.id}\``);
|
|
3851
|
+
lines.push(`- answer-file: \`.agentv/tmp/eval_${evalCase.id}.txt\``);
|
|
3852
|
+
lines.push("");
|
|
3853
|
+
lines.push("**2. Dispatch `eval-judge` agent** (after candidate completes):");
|
|
3854
|
+
lines.push(`- eval-path: \`${evalPath}\``);
|
|
3855
|
+
lines.push(`- test-id: \`${evalCase.id}\``);
|
|
3856
|
+
lines.push(`- answer-file: \`.agentv/tmp/eval_${evalCase.id}.txt\``);
|
|
3857
|
+
lines.push(`- results-file: \`.agentv/results/eval_${timestamp}.jsonl\``);
|
|
3858
|
+
lines.push("");
|
|
3859
|
+
}
|
|
3860
|
+
}
|
|
3861
|
+
return lines.join("\n");
|
|
3862
|
+
}
|
|
3863
|
+
function generateCliModePrompt(fileEntries, totalCases) {
|
|
3864
|
+
const evalPathArgs = fileEntries.map((e) => e.path).join(" ");
|
|
3865
|
+
const lines = [
|
|
3866
|
+
"# AgentV Eval Orchestration",
|
|
3867
|
+
"",
|
|
3868
|
+
"**Mode: cli** \u2014 Run the evaluation end-to-end using the CLI.",
|
|
3869
|
+
"",
|
|
3870
|
+
`You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}.`,
|
|
3871
|
+
"",
|
|
3872
|
+
"## Run the evaluation",
|
|
3873
|
+
"",
|
|
3874
|
+
"```bash",
|
|
3875
|
+
`agentv eval ${evalPathArgs}`,
|
|
3876
|
+
"```",
|
|
3877
|
+
"",
|
|
3878
|
+
"Results are written to `.agentv/results/`. The output path is printed in the CLI output.",
|
|
3879
|
+
"Parse the JSONL file for per-test scores, hits, and misses.",
|
|
3880
|
+
""
|
|
3881
|
+
];
|
|
3882
|
+
for (const { path: evalPath, tests } of fileEntries) {
|
|
3883
|
+
lines.push(`## ${evalPath}`);
|
|
3884
|
+
lines.push("");
|
|
3885
|
+
for (const evalCase of tests) {
|
|
3886
|
+
const evaluatorSummary = describeEvaluators(evalCase);
|
|
3887
|
+
lines.push(`### ${evalCase.id}`);
|
|
3888
|
+
lines.push(`Criteria: ${evalCase.criteria}`);
|
|
3889
|
+
if (evaluatorSummary) {
|
|
3890
|
+
lines.push(`Evaluators: ${evaluatorSummary}`);
|
|
3891
|
+
}
|
|
3892
|
+
lines.push("");
|
|
3893
|
+
}
|
|
3894
|
+
}
|
|
3895
|
+
return lines.join("\n");
|
|
3896
|
+
}
|
|
3774
3897
|
var evalPromptOverviewCommand = command({
|
|
3775
3898
|
name: "overview",
|
|
3776
3899
|
description: "Output orchestration prompt for host agent to run evals",
|
|
@@ -3782,65 +3905,8 @@ var evalPromptOverviewCommand = command({
|
|
|
3782
3905
|
})
|
|
3783
3906
|
},
|
|
3784
3907
|
handler: async (args) => {
|
|
3785
|
-
const cwd = process.cwd();
|
|
3786
|
-
const resolvedPaths = await resolveEvalPaths(args.evalPaths, cwd);
|
|
3787
|
-
const repoRoot = await findRepoRoot(cwd);
|
|
3788
|
-
const fileEntries = [];
|
|
3789
|
-
for (const evalPath of resolvedPaths) {
|
|
3790
|
-
const tests = await loadTests(evalPath, repoRoot);
|
|
3791
|
-
fileEntries.push({ path: evalPath, tests });
|
|
3792
|
-
}
|
|
3793
|
-
const totalCases = fileEntries.reduce((sum, e) => sum + e.tests.length, 0);
|
|
3794
|
-
const lines = [
|
|
3795
|
-
"# AgentV Eval Orchestration",
|
|
3796
|
-
"",
|
|
3797
|
-
`You are orchestrating ${totalCases} evaluation case${totalCases === 1 ? "" : "s"}. For each case: get the task input, execute it, then judge the result.`,
|
|
3798
|
-
"",
|
|
3799
|
-
"## Step 1: Get Task Input",
|
|
3800
|
-
"",
|
|
3801
|
-
"Run `agentv prompt eval input <path> --test-id <id>` to get the task as JSON.",
|
|
3802
|
-
"",
|
|
3803
|
-
"The output contains:",
|
|
3804
|
-
'- `input` \u2014 `[{role, content}]` array. Content segments are either `{type: "text", value: "..."}` or `{type: "file", path: "/absolute/path"}`. Read file segments from the filesystem.',
|
|
3805
|
-
"- `guideline_paths` \u2014 files containing additional instructions to prepend to the system message (may be empty). Read these from the filesystem.",
|
|
3806
|
-
"- `criteria` \u2014 what a good answer should accomplish (for your reference, do not leak to the agent being tested)",
|
|
3807
|
-
"",
|
|
3808
|
-
"## Step 2: Execute the Task",
|
|
3809
|
-
"",
|
|
3810
|
-
"Send the prompt to the agent/LLM being evaluated. Save the complete response text to a file.",
|
|
3811
|
-
"",
|
|
3812
|
-
"## Step 3: Judge the Result",
|
|
3813
|
-
"",
|
|
3814
|
-
"Run `agentv prompt eval judge <path> --test-id <id> --answer-file <response-file>`.",
|
|
3815
|
-
"",
|
|
3816
|
-
"The output contains an `evaluators` array. Each evaluator has a `status`:",
|
|
3817
|
-
"",
|
|
3818
|
-
'- **`"completed"`** \u2014 Score is final (code-judge ran deterministically). Read `result.score` (0.0\u20131.0).',
|
|
3819
|
-
'- **`"prompt_ready"`** \u2014 LLM grading required. Send `prompt.system_prompt` as system and',
|
|
3820
|
-
" `prompt.user_prompt` as user to your LLM. Parse the JSON response to get `score`, `hits`, `misses`.",
|
|
3821
|
-
""
|
|
3822
|
-
];
|
|
3823
|
-
for (const { path: evalPath, tests } of fileEntries) {
|
|
3824
|
-
lines.push(`## ${evalPath}`);
|
|
3825
|
-
lines.push("");
|
|
3826
|
-
for (const evalCase of tests) {
|
|
3827
|
-
const evaluatorSummary = describeEvaluators(evalCase);
|
|
3828
|
-
lines.push(`### ${evalCase.id}`);
|
|
3829
|
-
lines.push(`Criteria: ${evalCase.criteria}`);
|
|
3830
|
-
if (evaluatorSummary) {
|
|
3831
|
-
lines.push(`Evaluators: ${evaluatorSummary}`);
|
|
3832
|
-
}
|
|
3833
|
-
lines.push("");
|
|
3834
|
-
lines.push("```bash");
|
|
3835
|
-
lines.push(`agentv prompt eval input ${evalPath} --test-id ${evalCase.id}`);
|
|
3836
|
-
lines.push(
|
|
3837
|
-
`agentv prompt eval judge ${evalPath} --test-id ${evalCase.id} --answer-file <response-file>`
|
|
3838
|
-
);
|
|
3839
|
-
lines.push("```");
|
|
3840
|
-
lines.push("");
|
|
3841
|
-
}
|
|
3842
|
-
}
|
|
3843
|
-
process.stdout.write(lines.join("\n"));
|
|
3908
|
+
const output = await generateOverviewPrompt(args.evalPaths);
|
|
3909
|
+
process.stdout.write(output);
|
|
3844
3910
|
}
|
|
3845
3911
|
});
|
|
3846
3912
|
function describeEvaluators(evalCase) {
|
|
@@ -4006,7 +4072,7 @@ var evalRunCommand = command({
|
|
|
4006
4072
|
},
|
|
4007
4073
|
handler: async (args) => {
|
|
4008
4074
|
if (args.evalPaths.length === 0 && process.stdin.isTTY) {
|
|
4009
|
-
const { launchInteractiveWizard } = await import("./interactive-NEHIYZ2F.js");
|
|
4075
|
+
const { launchInteractiveWizard } = await import("./interactive-3TDBCSDW.js");
|
|
4010
4076
|
await launchInteractiveWizard();
|
|
4011
4077
|
return;
|
|
4012
4078
|
}
|
|
@@ -5885,4 +5951,4 @@ export {
|
|
|
5885
5951
|
preprocessArgv,
|
|
5886
5952
|
runCli
|
|
5887
5953
|
};
|
|
5888
|
-
//# sourceMappingURL=chunk-XKIJ4ATV.js.map
|
|
5954
|
+
//# sourceMappingURL=chunk-GC6T3RD4.js.map
|