agentv 2.18.4 → 3.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/README.md +62 -36
  2. package/dist/agentv-provider-5CJVBBGG-2XVZBW7L.js +9 -0
  3. package/dist/{chunk-RMUVJ44Z.js → chunk-5WIB7A27.js} +598 -403
  4. package/dist/chunk-5WIB7A27.js.map +1 -0
  5. package/dist/chunk-6GSYTMXD.js +31520 -0
  6. package/dist/chunk-6GSYTMXD.js.map +1 -0
  7. package/dist/{chunk-KSUL3F3R.js → chunk-DY4ZDTTO.js} +1018 -140
  8. package/dist/chunk-DY4ZDTTO.js.map +1 -0
  9. package/dist/chunk-HF4X7ALN.js +24299 -0
  10. package/dist/chunk-HF4X7ALN.js.map +1 -0
  11. package/dist/{chunk-FV32QHPB.js → chunk-XOSNETAV.js} +1 -1
  12. package/dist/cli.js +5 -4
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{dist-EDQZMZH2.js → dist-WN2QIOQR.js} +27 -11
  15. package/dist/{esm-DX3WQKEN.js → esm-CZAWIY6F.js} +2 -2
  16. package/dist/esm-CZAWIY6F.js.map +1 -0
  17. package/dist/index.js +5 -4
  18. package/dist/{interactive-J4IBXJF7.js → interactive-B432TCRZ.js} +5 -4
  19. package/dist/{interactive-J4IBXJF7.js.map → interactive-B432TCRZ.js.map} +1 -1
  20. package/dist/{src-2N5EJ2N6.js → src-ML4D2MC2.js} +2 -2
  21. package/dist/templates/.agentv/config.yaml +0 -5
  22. package/dist/templates/.agentv/targets.yaml +8 -11
  23. package/package.json +2 -2
  24. package/dist/chunk-KSUL3F3R.js.map +0 -1
  25. package/dist/chunk-RMUVJ44Z.js.map +0 -1
  26. package/dist/chunk-YTHTGLMT.js +0 -49786
  27. package/dist/chunk-YTHTGLMT.js.map +0 -1
  28. /package/dist/{dist-EDQZMZH2.js.map → agentv-provider-5CJVBBGG-2XVZBW7L.js.map} +0 -0
  29. /package/dist/{chunk-FV32QHPB.js.map → chunk-XOSNETAV.js.map} +0 -0
  30. /package/dist/{esm-DX3WQKEN.js.map → dist-WN2QIOQR.js.map} +0 -0
  31. /package/dist/{src-2N5EJ2N6.js.map → src-ML4D2MC2.js.map} +0 -0
@@ -1,5 +1,6 @@
1
1
  import { createRequire } from 'node:module'; const require = createRequire(import.meta.url);
2
2
  import {
3
+ HtmlWriter,
3
4
  detectFileType,
4
5
  findRepoRoot,
5
6
  package_default,
@@ -11,23 +12,25 @@ import {
11
12
  validateEvalFile,
12
13
  validateFileReferences,
13
14
  validateTargetsFile
14
- } from "./chunk-KSUL3F3R.js";
15
+ } from "./chunk-DY4ZDTTO.js";
15
16
  import {
16
- assembleLlmJudgePrompt,
17
- buildPromptInputs,
18
17
  createBuiltinRegistry,
19
18
  createProvider,
20
19
  executeScript,
21
20
  generateRubrics,
22
21
  getAgentvHome,
22
+ getOutputFilenames,
23
23
  getWorkspacePoolRoot,
24
+ isAgentSkillsFormat,
24
25
  loadTestById,
25
26
  loadTests,
26
27
  normalizeLineEndings,
28
+ parseAgentSkillsEvals,
27
29
  toCamelCaseDeep,
28
30
  toSnakeCaseDeep as toSnakeCaseDeep2,
31
+ transpileEvalYamlFile,
29
32
  trimBaselineResult
30
- } from "./chunk-YTHTGLMT.js";
33
+ } from "./chunk-HF4X7ALN.js";
31
34
  import {
32
35
  __commonJS,
33
36
  __esm,
@@ -3296,6 +3299,16 @@ var compareCommand = command({
3296
3299
  import { readFileSync as readFileSync2, writeFileSync } from "node:fs";
3297
3300
  import path from "node:path";
3298
3301
  import { stringify as stringifyYaml } from "yaml";
3302
/**
 * Render a JSONL results file as an HTML report.
 *
 * Reads `inputPath` synchronously, drops blank lines, parses every remaining
 * line as JSON and appends each record to an `HtmlWriter` opened on
 * `outputPath`.
 *
 * @param {string} inputPath - Path of the JSONL file to read.
 * @param {string} outputPath - Path handed to `HtmlWriter.open`.
 * @returns {Promise<number>} Number of records appended to the report.
 * @throws {Error} If the input cannot be read, if a line is not valid JSON
 *   (the message names the failing record; the parse error is the `cause`),
 *   or if the writer fails.
 */
async function convertJsonlToHtml(inputPath, outputPath) {
  const content = readFileSync2(inputPath, "utf8");
  // Blank lines (trailing newline, stray CR-only lines) carry no record.
  const lines = content.split("\n").filter((line) => line.trim());
  // Parse everything before the writer is opened so malformed input never
  // leaves a half-written report behind.
  const records = lines.map((line, index) => {
    try {
      return JSON.parse(line);
    } catch (error) {
      throw new Error(
        `Invalid JSON in record ${index + 1} of ${inputPath}: ${error.message}`,
        { cause: error }
      );
    }
  });
  const writer = await HtmlWriter.open(outputPath);
  try {
    for (const record of records) {
      await writer.append(record);
    }
  } catch (error) {
    // Release the writer, but surface the append failure rather than any
    // secondary error raised while closing.
    try {
      await writer.close();
    } catch {
      // Intentionally ignored: the append error is the useful one.
    }
    throw error;
  }
  await writer.close();
  return records.length;
}
3299
3312
  function convertJsonlToYaml(inputPath, outputPath) {
3300
3313
  const content = readFileSync2(inputPath, "utf8");
3301
3314
  const lines = content.trim().split("\n").filter((line) => line.trim());
@@ -3315,35 +3328,157 @@ function convertJsonlToYaml(inputPath, outputPath) {
3315
3328
  writeFileSync(outputPath, yamlOutput);
3316
3329
  return lines.length;
3317
3330
  }
3331
/**
 * Convert an Agent Skills `evals.json` file into AgentV EVAL.yaml text.
 *
 * The file is read and parsed as JSON, checked with `isAgentSkillsFormat`,
 * turned into test cases by `parseAgentSkillsEvals`, and rendered as YAML with
 * a header comment, an optional `description`, and one `tests:` entry per case
 * (id, criteria, input, expected_output, assertions, and a TODO comment
 * listing referenced files).
 *
 * Scalars are emitted so that arbitrary text survives a YAML round trip:
 * - single-line strings use JSON string syntax, which is valid YAML
 *   double-quoted syntax (quotes, backslashes, control characters escaped);
 * - multi-line strings use a `|-` block scalar when that is lossless, and fall
 *   back to a quoted string otherwise (leading/trailing whitespace, control
 *   characters);
 * - non-string values (e.g. structured message content) are emitted as JSON,
 *   which YAML accepts as flow syntax.
 *
 * NOTE(review): nesting is 2 spaces per level (list items at 2, fields at 4,
 * messages at 6, message fields at 8). The text this was derived from showed
 * every emitted line with a single leading space, which does not form valid
 * nested YAML — confirm against the intended layout.
 *
 * @param {string} inputPath - Path to the evals.json file.
 * @returns {string} YAML document text, terminated by a newline.
 * @throws {Error} If the file is unreadable, is not valid JSON, or is not in
 *   Agent Skills format.
 */
function convertEvalsJsonToYaml(inputPath) {
  const content = readFileSync2(inputPath, "utf8");
  const parsed = JSON.parse(content);
  if (!isAgentSkillsFormat(parsed)) {
    throw new Error(`Not a valid Agent Skills evals.json: missing 'evals' array`);
  }
  const tests = parseAgentSkillsEvals(parsed, inputPath, path.dirname(path.resolve(inputPath)));
  const lines = [];

  // JSON string syntax is accepted by YAML as a double-quoted scalar.
  const quote = (value) => JSON.stringify(String(value));
  // Words a YAML parser may read as booleans/null instead of strings.
  const reservedPlain = /^(?:true|false|null|yes|no|on|off)$/i;
  // Keep simple identifiers unquoted for readability; quote everything else.
  const plainOrQuoted = (value) => {
    const text = String(value);
    return /^[A-Za-z_][A-Za-z0-9_-]*$/.test(text) && !reservedPlain.test(text) ? text : quote(text);
  };
  // A `|-` block scalar cannot represent these losslessly.
  const blockUnsafe = /^\s|\s$|[\u0000-\u0008\u000b-\u001f\u007f]/;
  // Emit `key: value` at `indent`, choosing block, quoted, or JSON-flow form.
  const pushValue = (indent, key, value, preferBlock = false) => {
    const pad = " ".repeat(indent);
    if (typeof value !== "string") {
      lines.push(`${pad}${key}: ${JSON.stringify(value ?? null) ?? "null"}`);
      return;
    }
    const useBlock = (preferBlock || value.includes("\n")) && value !== "" && !blockUnsafe.test(value);
    if (!useBlock) {
      lines.push(`${pad}${key}: ${quote(value)}`);
      return;
    }
    lines.push(`${pad}${key}: |-`);
    for (const line of value.split("\n")) {
      lines.push(line === "" ? "" : `${pad}  ${line}`);
    }
  };
  // Emit a `key:` list of { role, content } messages followed by a blank line.
  const pushMessages = (key, messages) => {
    lines.push(`    ${key}:`);
    for (const msg of messages) {
      lines.push(`      - role: ${plainOrQuoted(msg.role)}`);
      pushValue(8, "content", msg.content);
    }
    lines.push("");
  };

  lines.push("# Converted from Agent Skills evals.json");
  lines.push("# See: https://agentskills.io/skill-creation/evaluating-skills");
  lines.push("#");
  lines.push("# AgentV features you can add:");
  lines.push("# - type: is_json, contains, regex for deterministic evaluators");
  lines.push("# - type: code-grader for custom scoring scripts");
  lines.push("# - Multi-turn conversations via input message arrays");
  lines.push("# - Composite evaluators with weighted scoring");
  lines.push("# - Workspace isolation with repos and hooks");
  lines.push("");
  if (parsed.skill_name) {
    lines.push(`description: ${quote(`Evals for ${parsed.skill_name} skill`)}`);
    lines.push("");
  }
  lines.push("tests:");
  for (const test of tests) {
    lines.push(`  - id: ${quote(test.id)}`);
    lines.push("");
    if (test.criteria) {
      pushValue(4, "criteria", test.criteria, true);
      lines.push("");
    }
    pushMessages("input", test.input);
    if (test.expected_output && test.expected_output.length > 0) {
      pushMessages("expected_output", test.expected_output);
    }
    if (test.assertions && test.assertions.length > 0) {
      lines.push("    # Promoted from evals.json assertions[]");
      lines.push("    # Replace with type: is_json, contains, or regex for deterministic checks");
      lines.push("    assertions:");
      for (const assertion of test.assertions) {
        lines.push(`      - name: ${plainOrQuoted(assertion.name)}`);
        lines.push(`        type: ${plainOrQuoted(assertion.type)}`);
        if ((assertion.type === "llm-grader" || assertion.type === "llm-judge") && "prompt" in assertion) {
          pushValue(8, "prompt", assertion.prompt);
        }
      }
      lines.push("");
    }
    if (test.file_paths && test.file_paths.length > 0) {
      lines.push("    # TODO: Configure workspace.repos or file references for these files:");
      const agentSkillsFiles = test.metadata?.agent_skills_files;
      if (agentSkillsFiles) {
        for (const file of agentSkillsFiles) {
          // Keep each entry on one line so it stays inside the YAML comment.
          lines.push(`    #   - ${String(file).replace(/\r?\n/g, " ")}`);
        }
      }
      lines.push("");
    }
  }
  return `${lines.join("\n")}\n`;
}
3318
3424
  var convertCommand = command({
3319
3425
  name: "convert",
3320
- description: "Convert evaluation results from JSONL to YAML format",
3426
+ description: "Convert between evaluation formats (JSONL\u2192YAML, JSONL\u2192HTML, evals.json\u2192EVAL.yaml)",
3321
3427
  args: {
3322
3428
  input: positional({
3323
3429
  type: string,
3324
3430
  displayName: "input",
3325
- description: "Path to input JSONL file"
3431
+ description: "Path to input file (.jsonl or .json)"
3326
3432
  }),
3327
3433
  out: option({
3328
3434
  type: optional(string),
3329
3435
  long: "out",
3330
3436
  short: "o",
3331
- description: "Output file path (defaults to input path with .yaml extension)"
3437
+ description: "Output file path (defaults to stdout for evals.json, .yaml or .html for JSONL)"
3332
3438
  })
3333
3439
  },
3334
3440
  handler: async ({ input, out }) => {
3335
- if (!input.endsWith(".jsonl")) {
3336
- console.error("Error: Input file must be a .jsonl file");
3337
- process.exit(1);
3441
+ const ext = path.extname(input).toLowerCase();
3442
+ if (ext === ".json") {
3443
+ try {
3444
+ const yaml = convertEvalsJsonToYaml(input);
3445
+ if (out) {
3446
+ writeFileSync(out, yaml);
3447
+ console.log(`Converted to ${path.resolve(out)}`);
3448
+ } else {
3449
+ process.stdout.write(yaml);
3450
+ }
3451
+ } catch (error) {
3452
+ console.error(`Error: ${error.message}`);
3453
+ process.exit(1);
3454
+ }
3455
+ return;
3338
3456
  }
3339
- const outputPath = out ?? input.replace(/\.jsonl$/, ".yaml");
3340
- try {
3341
- const count = convertJsonlToYaml(input, outputPath);
3342
- console.log(`Converted ${count} records to ${path.resolve(outputPath)}`);
3343
- } catch (error) {
3344
- console.error(`Error: ${error.message}`);
3345
- process.exit(1);
3457
+ if (ext === ".jsonl") {
3458
+ const outExt = out ? path.extname(out).toLowerCase() : ".yaml";
3459
+ if (outExt === ".html" || outExt === ".htm") {
3460
+ const outputPath2 = out ?? input.replace(/\.jsonl$/, ".html");
3461
+ try {
3462
+ const count = await convertJsonlToHtml(input, outputPath2);
3463
+ console.log(`Converted ${count} records to ${path.resolve(outputPath2)}`);
3464
+ } catch (error) {
3465
+ console.error(`Error: ${error.message}`);
3466
+ process.exit(1);
3467
+ }
3468
+ return;
3469
+ }
3470
+ const outputPath = out ?? input.replace(/\.jsonl$/, ".yaml");
3471
+ try {
3472
+ const count = convertJsonlToYaml(input, outputPath);
3473
+ console.log(`Converted ${count} records to ${path.resolve(outputPath)}`);
3474
+ } catch (error) {
3475
+ console.error(`Error: ${error.message}`);
3476
+ process.exit(1);
3477
+ }
3478
+ return;
3346
3479
  }
3480
+ console.error(`Error: Unsupported input format '${ext}'. Supported: .json, .jsonl`);
3481
+ process.exit(1);
3347
3482
  }
3348
3483
  });
3349
3484
 
@@ -3387,7 +3522,7 @@ tests:
3387
3522
  criteria: Agent responds correctly
3388
3523
  input: "Hello, how are you?"
3389
3524
  expected_output: "I'm doing well"
3390
- assert:
3525
+ assertions:
3391
3526
  - type: contains
3392
3527
  value: "well"
3393
3528
  `,
@@ -3400,8 +3535,8 @@ tests:
3400
3535
  criteria: Agent responds correctly and completely
3401
3536
  input: "Hello, how are you?"
3402
3537
  expected_output: "I'm doing well, thank you for asking!"
3403
- assert:
3404
- - type: llm-judge
3538
+ assertions:
3539
+ - type: llm-grader
3405
3540
  rubric:
3406
3541
  accuracy:
3407
3542
  weight: 0.6
@@ -3470,7 +3605,7 @@ var createAssertionCommand = command({
3470
3605
  console.log(`Created ${path2.relative(process.cwd(), filePath)} (template: ${templateName})`);
3471
3606
  console.log(`
3472
3607
  Use in EVAL.yaml:
3473
- assert:
3608
+ assertions:
3474
3609
  - type: ${name}`);
3475
3610
  }
3476
3611
  });
@@ -3559,38 +3694,104 @@ var createCommand = subcommands({
3559
3694
  }
3560
3695
  });
3561
3696
 
3562
- // src/commands/eval/commands/prompt/input.ts
3563
- var evalPromptInputCommand = command({
3564
- name: "input",
3565
- description: "Output task input JSON for a single test",
3566
- args: {
3567
- evalPath: positional({
3568
- type: string,
3569
- displayName: "eval-path",
3570
- description: "Path to evaluation .yaml file"
3571
- }),
3572
- testId: option({
3573
- type: string,
3574
- long: "test-id",
3575
- description: "Test ID"
3576
- })
3577
- },
3578
- handler: async (args) => {
3579
- const cwd = process.cwd();
3580
- const repoRoot = await findRepoRoot(cwd);
3581
- const evalCase = await loadTestById(args.evalPath, repoRoot, args.testId);
3582
- const fileMap = buildFileMap(evalCase.input_segments, evalCase.file_paths);
3583
- const resolvedMessages = resolveMessages(evalCase.input, fileMap);
3584
- const output = {
3585
- test_id: evalCase.id,
3586
- input: resolvedMessages,
3587
- guideline_paths: evalCase.guideline_paths,
3588
- criteria: evalCase.criteria
3589
- };
3590
- process.stdout.write(JSON.stringify(output, null, 2));
3591
- process.stdout.write("\n");
3697
+ // src/commands/eval/commands/prompt/accessors.ts
3698
/**
 * List the test IDs defined in an eval file.
 *
 * Resolves the repo root from the current working directory, loads the tests
 * via `loadTests`, and returns their ids sorted with the default string sort.
 *
 * @param {string} evalPath - Path to the evaluation file.
 * @returns {Promise<{eval_path: string, test_ids: Array}>} The given path and
 *   the sorted ids.
 */
async function listPromptEvalTestIds(evalPath) {
  const root = await findRepoRoot(process.cwd());
  const loaded = await loadTests(evalPath, root);
  const ids = [];
  for (const entry of loaded) {
    ids.push(entry.id);
  }
  ids.sort();
  return { eval_path: evalPath, test_ids: ids };
}
3706
/**
 * Build the input payload for a single test.
 *
 * Loads the test with `loadTestById` (repo root resolved from the current
 * working directory), builds a file map from its input segments and file
 * paths, and resolves the input messages against that map.
 *
 * @param {string} evalPath - Path to the evaluation file.
 * @param {string} testId - ID of the test to load.
 * @returns {Promise<object>} `{ test_id, input, guideline_paths, criteria }`.
 */
async function getPromptEvalInput(evalPath, testId) {
  const root = await findRepoRoot(process.cwd());
  const found = await loadTestById(evalPath, root, testId);
  const files = buildFileMap(found.input_segments, found.file_paths);
  const resolved = resolveMessages(found.input, files);
  return {
    test_id: found.id,
    input: resolved,
    guideline_paths: found.guideline_paths,
    criteria: found.criteria
  };
}
3717
/**
 * Build the expected-output / grading context for a single test.
 *
 * Loads the test with `loadTestById` (repo root resolved from the current
 * working directory) and copies its grading-related fields. A missing
 * (`null`/`undefined`) `assertions` field is reported as an empty array.
 *
 * @param {string} evalPath - Path to the evaluation file.
 * @param {string} testId - ID of the test to load.
 * @returns {Promise<object>} `{ test_id, criteria, expected_output,
 *   reference_answer, assertions }`.
 */
async function getPromptEvalExpectedOutput(evalPath, testId) {
  const root = await findRepoRoot(process.cwd());
  const found = await loadTestById(evalPath, root, testId);
  let assertions = found.assertions;
  if (assertions === null || assertions === void 0) {
    assertions = [];
  }
  return {
    test_id: found.id,
    criteria: found.criteria,
    expected_output: found.expected_output,
    reference_answer: found.reference_answer,
    assertions
  };
}
3728
/**
 * Produce a human-readable grading brief for a single test.
 *
 * The brief is a newline-joined list of lines: the first user input text, the
 * test's file paths that are not guideline paths, the reference answer, and a
 * "Criteria:" section with one bullet per criterion. Criteria come from the
 * test's own `criteria` plus one description per assertion, formatted by
 * assertion type (contains, rubrics, llm-grader/judge, code-grader/judge,
 * skill-trigger, or a generic `[type] value` fallback). Sections with nothing
 * to show are omitted.
 *
 * @param {string} evalPath - Path to the evaluation file.
 * @param {string} testId - ID of the test to load.
 * @returns {Promise<string>} The brief text (no trailing newline).
 */
async function getPromptEvalGradingBrief(evalPath, testId) {
  const root = await findRepoRoot(process.cwd());
  const evalCase = await loadTestById(evalPath, root, testId);
  const resolved = resolveMessages(
    evalCase.input,
    buildFileMap(evalCase.input_segments, evalCase.file_paths)
  );
  const brief = [];

  const userText = extractTextFromMessages(resolved);
  if (userText) {
    brief.push(`Input: "${userText}"`);
  }

  const nonGuidelineFiles = evalCase.file_paths.filter(
    (candidate) => !evalCase.guideline_paths.includes(candidate)
  );
  if (nonGuidelineFiles.length > 0) {
    brief.push(`Files: ${nonGuidelineFiles.join(", ")}`);
  }

  if (evalCase.reference_answer) {
    brief.push(`Expected: "${evalCase.reference_answer}"`);
  }

  const bullets = evalCase.criteria ? [evalCase.criteria] : [];
  for (const entry of evalCase.assertions ?? []) {
    const kind = entry.type;
    // Some assertion fields may live under `config` instead of the entry itself.
    const settings = entry.config ?? {};
    switch (kind) {
      case "contains":
        bullets.push(`Output contains '${entry.value}'`);
        break;
      case "rubrics": {
        const rubricItems = entry.criteria ?? settings.criteria;
        if (Array.isArray(rubricItems)) {
          for (const rubric of rubricItems) {
            if (rubric.outcome) bullets.push(rubric.outcome);
          }
        }
        break;
      }
      case "llm-grader":
      case "llm_grader":
      case "llm-judge":
      case "llm_judge": {
        const graderPrompt = entry.prompt ?? settings.prompt ?? settings.criteria;
        bullets.push(`[llm-grader] ${typeof graderPrompt === "string" ? graderPrompt : ""}`);
        break;
      }
      case "code-grader":
      case "code_grader":
      case "code-judge":
      case "code_judge": {
        const label = entry.name ?? kind;
        const detail = settings.description ?? entry.description;
        bullets.push(`[code-grader] ${label}${detail ? `: ${detail}` : ""}`);
        break;
      }
      case "skill-trigger": {
        // Anything other than an explicit `false` counts as "should trigger".
        const expectsTrigger = entry.should_trigger !== false;
        bullets.push(`[skill-trigger] should_trigger: ${expectsTrigger} for ${entry.skill}`);
        break;
      }
      default:
        if (kind) {
          bullets.push(`[${kind}] ${entry.value ?? settings.criteria ?? settings.prompt ?? ""}`);
        }
    }
  }

  if (bullets.length > 0) {
    brief.push("Criteria:");
    for (const bullet of bullets) {
      brief.push(` - ${bullet}`);
    }
  }
  return brief.join("\n");
}
3784
/**
 * Return the text of the first user message that has any.
 *
 * Messages whose role is not "user" are skipped. String content is returned
 * as-is; for array content, the `value` of every block with `type: "text"` is
 * joined with a single space. A user message with neither is skipped and the
 * search continues.
 *
 * @param {Iterable<{role: string, content: unknown}>} messages
 * @returns {string} The extracted text, or "" when no user message yields any.
 */
function extractTextFromMessages(messages) {
  for (const message of messages) {
    if (message.role === "user") {
      const body = message.content;
      if (typeof body === "string") {
        return body;
      }
      if (Array.isArray(body)) {
        const textual = body.filter((block) => block.type === "text");
        if (textual.length > 0) {
          return Array.from(textual, (block) => block.value).join(" ");
        }
      }
    }
  }
  return "";
}
3594
3795
  function buildFileMap(inputSegments, allFilePaths) {
3595
3796
  const map = /* @__PURE__ */ new Map();
3596
3797
  for (const segment of inputSegments) {
@@ -3602,7 +3803,7 @@ function buildFileMap(inputSegments, allFilePaths) {
3602
3803
  get(key) {
3603
3804
  const direct = map.get(key);
3604
3805
  if (direct) return direct;
3605
- return allFilePaths.find((p) => p.endsWith(`/${key}`) || p === key);
3806
+ return allFilePaths.find((filePath) => filePath.endsWith(`/${key}`) || filePath === key);
3606
3807
  },
3607
3808
  has(key) {
3608
3809
  return this.get(key) !== void 0;
@@ -3638,291 +3839,61 @@ function resolveMessages(messages, fileMap) {
3638
3839
  });
3639
3840
  }
3640
3841
 
3641
- // src/commands/eval/commands/prompt/judge.ts
3642
- import { readFile } from "node:fs/promises";
3643
- var evalPromptJudgeCommand = command({
3644
- name: "judge",
3645
- description: "Run code judges and output LLM judge prompts for a single test",
3842
+ // src/commands/eval/commands/prompt/index.ts
3843
/**
 * `prompt eval` CLI command: extract eval prompt data for agents.
 *
 * Exactly one mode flag must be given:
 * - `--list`            prints `{ eval_path, test_ids }` as JSON;
 * - `--input`           prints the test input payload as JSON;
 * - `--expected-output` prints expected output and grading context as JSON;
 * - `--grading-brief`   prints a plain-text grading brief.
 * Every mode except `--list` requires `--test-id`. Validation failures are
 * raised as `Error`s; output goes to stdout followed by a newline.
 */
var evalPromptEvalSubcommand = command({
  name: "eval",
  description: "Extract eval prompt data for agents",
  args: {
    list: flag({
      long: "list",
      description: "List available test IDs"
    }),
    input: flag({
      long: "input",
      description: "Extract the test input payload for a single test"
    }),
    expectedOutput: flag({
      long: "expected-output",
      description: "Extract expected output and grading context for a single test"
    }),
    gradingBrief: flag({
      long: "grading-brief",
      description: "Output human-readable grading brief with typed criteria"
    }),
    testId: option({
      type: optional(string),
      long: "test-id",
      description: "Test ID (required for --input, --expected-output, and --grading-brief)"
    }),
    evalPath: positional({
      type: string,
      displayName: "eval-path",
      description: "Path to evaluation .yaml, .json, or .jsonl file"
    })
  },
  handler: async ({ evalPath, expectedOutput, gradingBrief, input, list, testId }) => {
    const selectedModes = [list, input, expectedOutput, gradingBrief].filter(Boolean).length;
    if (selectedModes !== 1) {
      throw new Error(
        "Specify exactly one of --list, --input, --expected-output, or --grading-brief."
      );
    }
    const writeJson = (value) => {
      process.stdout.write(JSON.stringify(value, null, 2));
      process.stdout.write("\n");
    };
    if (list) {
      writeJson(await listPromptEvalTestIds(evalPath));
      return;
    }
    // Every remaining mode operates on a single test.
    if (!testId) {
      throw new Error(
        gradingBrief ? "--test-id is required with --grading-brief." : "--test-id is required with --input and --expected-output."
      );
    }
    if (gradingBrief) {
      const brief = await getPromptEvalGradingBrief(evalPath, testId);
      process.stdout.write(brief);
      process.stdout.write("\n");
      return;
    }
    writeJson(
      input ? await getPromptEvalInput(evalPath, testId) : await getPromptEvalExpectedOutput(evalPath, testId)
    );
  }
});
3928
3899
  var evalPromptCommand = subcommands({
@@ -3933,6 +3904,120 @@ var evalPromptCommand = subcommands({
3933
3904
  }
3934
3905
  });
3935
3906
 
3907
+ // src/commands/eval/commands/assert.ts
3908
+ import { readFileSync as readFileSync3 } from "node:fs";
3909
+ import path3 from "node:path";
3910
+ import fg from "fast-glob";
3911
+ var evalAssertCommand = command({
3912
+ name: "assert",
3913
+ description: "Run a single code-grader assertion from .agentv/graders/ and print the score",
3914
+ args: {
3915
+ graderName: positional({
3916
+ type: string,
3917
+ displayName: "name",
3918
+ description: "Assertion name (matches filename without extension in .agentv/graders/)"
3919
+ }),
3920
+ agentOutput: option({
3921
+ type: optional(string),
3922
+ long: "agent-output",
3923
+ description: "The agent's full response text"
3924
+ }),
3925
+ agentInput: option({
3926
+ type: optional(string),
3927
+ long: "agent-input",
3928
+ description: "The original user prompt"
3929
+ }),
3930
+ file: option({
3931
+ type: optional(string),
3932
+ long: "file",
3933
+ description: "Path to JSON file with { output, input } fields"
3934
+ })
3935
+ },
3936
+ handler: async ({ graderName, agentOutput: output, agentInput: input, file }) => {
3937
+ let resolvedOutput;
3938
+ let resolvedInput;
3939
+ if (file) {
3940
+ const content = JSON.parse(readFileSync3(path3.resolve(file), "utf8"));
3941
+ resolvedOutput = content.output ?? "";
3942
+ resolvedInput = content.input ?? "";
3943
+ } else {
3944
+ if (output === void 0) {
3945
+ console.error("Error: --agent-output is required (or use --file)");
3946
+ process.exit(1);
3947
+ }
3948
+ resolvedOutput = output;
3949
+ resolvedInput = input ?? "";
3950
+ }
3951
+ if (!/^[a-zA-Z0-9_-]+$/.test(graderName)) {
3952
+ console.error(
3953
+ `Error: Invalid grader name '${graderName}' \u2014 only letters, digits, hyphens, and underscores allowed`
3954
+ );
3955
+ process.exit(1);
3956
+ }
3957
+ const scriptPath = await findGraderScript(graderName, process.cwd());
3958
+ if (!scriptPath) {
3959
+ console.error(
3960
+ `Error: Grader '${graderName}' not found in .agentv/graders/ (or .agentv/judges/)`
3961
+ );
3962
+ process.exit(1);
3963
+ }
3964
+ const payload = JSON.stringify(
3965
+ {
3966
+ answer: resolvedOutput,
3967
+ output: [{ role: "assistant", content: resolvedOutput }],
3968
+ input: [{ role: "user", content: resolvedInput }],
3969
+ question: resolvedInput,
3970
+ criteria: "",
3971
+ expected_output: [],
3972
+ reference_answer: "",
3973
+ guideline_files: [],
3974
+ input_files: [],
3975
+ trace: null,
3976
+ token_usage: null,
3977
+ cost_usd: null,
3978
+ duration_ms: null,
3979
+ start_time: null,
3980
+ end_time: null,
3981
+ file_changes: null,
3982
+ workspace_path: null,
3983
+ config: null,
3984
+ metadata: {}
3985
+ },
3986
+ null,
3987
+ 2
3988
+ );
3989
+ try {
3990
+ const stdout = await executeScript(["bun", "run", scriptPath], payload);
3991
+ const parsed = JSON.parse(stdout);
3992
+ const score = typeof parsed.score === "number" ? parsed.score : 0;
3993
+ process.stdout.write(JSON.stringify(parsed, null, 2));
3994
+ process.stdout.write("\n");
3995
+ process.exit(score >= 0.5 ? 0 : 1);
3996
+ } catch (error) {
3997
+ const message = error instanceof Error ? error.message : String(error);
3998
+ console.error(`Error: ${message}`);
3999
+ process.exit(1);
4000
+ }
4001
+ }
4002
+ });
4003
+ async function findGraderScript(graderName, startDir) {
4004
+ let dir = path3.resolve(startDir);
4005
+ const root = path3.parse(dir).root;
4006
+ while (dir !== root) {
4007
+ for (const subdir of ["graders", "judges"]) {
4008
+ const gradersDir = path3.join(dir, ".agentv", subdir);
4009
+ const found = await fg([`${graderName}.{ts,js,mts,mjs}`], {
4010
+ cwd: gradersDir,
4011
+ absolute: true,
4012
+ onlyFiles: true
4013
+ });
4014
+ if (found.length > 0) return found[0];
4015
+ }
4016
+ dir = path3.dirname(dir);
4017
+ }
4018
+ return null;
4019
+ }
4020
+
3936
4021
  // src/commands/eval/commands/run.ts
3937
4022
  var evalRunCommand = command({
3938
4023
  name: "eval",
@@ -3972,12 +4057,12 @@ var evalRunCommand = command({
3972
4057
  type: array(string),
3973
4058
  long: "output",
3974
4059
  short: "o",
3975
- description: "Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml"
4060
+ description: "Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html"
3976
4061
  }),
3977
4062
  outputFormat: option({
3978
4063
  type: optional(string),
3979
4064
  long: "output-format",
3980
- description: "Output format: 'jsonl' or 'yaml' (default: jsonl)"
4065
+ description: "Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)"
3981
4066
  }),
3982
4067
  dryRun: flag({
3983
4068
  long: "dry-run",
@@ -4068,11 +4153,31 @@ var evalRunCommand = command({
4068
4153
  strict: flag({
4069
4154
  long: "strict",
4070
4155
  description: "Exit with error on version mismatch (instead of warning)"
4156
+ }),
4157
+ benchmarkJson: option({
4158
+ type: optional(string),
4159
+ long: "benchmark-json",
4160
+ description: "Write Agent Skills benchmark.json to the specified path"
4161
+ }),
4162
+ artifacts: option({
4163
+ type: optional(string),
4164
+ long: "artifacts",
4165
+ description: "Write companion artifacts (grading/<test>.json, timing.json, benchmark.json) to the specified directory"
4166
+ }),
4167
+ graderTarget: option({
4168
+ type: optional(string),
4169
+ long: "grader-target",
4170
+ description: 'Override grader target for all evaluators (e.g., "agentv", or a target name from targets.yaml)'
4171
+ }),
4172
+ model: option({
4173
+ type: optional(string),
4174
+ long: "model",
4175
+ description: 'Override model for the grader target (e.g., "openai:gpt-5-mini")'
4071
4176
  })
4072
4177
  },
4073
4178
  handler: async (args) => {
4074
4179
  if (args.evalPaths.length === 0 && process.stdin.isTTY) {
4075
- const { launchInteractiveWizard } = await import("./interactive-J4IBXJF7.js");
4180
+ const { launchInteractiveWizard } = await import("./interactive-B432TCRZ.js");
4076
4181
  await launchInteractiveWizard();
4077
4182
  return;
4078
4183
  }
@@ -4104,15 +4209,30 @@ var evalRunCommand = command({
4104
4209
  otelCaptureContent: args.otelCaptureContent,
4105
4210
  otelGroupTurns: args.otelGroupTurns,
4106
4211
  retryErrors: args.retryErrors,
4107
- strict: args.strict
4212
+ strict: args.strict,
4213
+ benchmarkJson: args.benchmarkJson,
4214
+ artifacts: args.artifacts,
4215
+ graderTarget: args.graderTarget,
4216
+ model: args.model
4108
4217
  };
4109
4218
  await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
4110
4219
  }
4111
4220
  });
4112
4221
 
4222
+ // src/commands/eval/index.ts
4223
+ var evalCommand = subcommands({
4224
+ name: "eval",
4225
+ description: "Evaluation commands",
4226
+ cmds: {
4227
+ run: evalRunCommand,
4228
+ prompt: evalPromptCommand,
4229
+ assert: evalAssertCommand
4230
+ }
4231
+ });
4232
+
4113
4233
  // src/commands/generate/rubrics.ts
4114
- import { readFile as readFile2, writeFile as writeFile2 } from "node:fs/promises";
4115
- import path3 from "node:path";
4234
+ import { readFile, writeFile as writeFile2 } from "node:fs/promises";
4235
+ import path4 from "node:path";
4116
4236
  import { pathToFileURL } from "node:url";
4117
4237
  import { isMap, isSeq, parseDocument } from "yaml";
4118
4238
  function isJsonObject(value) {
@@ -4124,7 +4244,7 @@ function asString(value) {
4124
4244
  async function loadRubricGenerator() {
4125
4245
  const customGenerator = process.env.AGENTEVO_CLI_RUBRIC_GENERATOR;
4126
4246
  if (customGenerator) {
4127
- const generatorPath = path3.resolve(customGenerator);
4247
+ const generatorPath = path4.resolve(customGenerator);
4128
4248
  const generatorUrl = pathToFileURL(generatorPath).href;
4129
4249
  const module = await import(generatorUrl);
4130
4250
  return module.generateRubrics;
@@ -4134,8 +4254,8 @@ async function loadRubricGenerator() {
4134
4254
  async function generateRubricsCommand(options) {
4135
4255
  const { file, target: targetOverride, verbose } = options;
4136
4256
  console.log(`Generating rubrics for: ${file}`);
4137
- const absolutePath = path3.resolve(file);
4138
- const content = await readFile2(absolutePath, "utf8");
4257
+ const absolutePath = path4.resolve(file);
4258
+ const content = await readFile(absolutePath, "utf8");
4139
4259
  const doc = parseDocument(content);
4140
4260
  const parsed = doc.toJSON();
4141
4261
  if (!isJsonObject(parsed)) {
@@ -4293,23 +4413,23 @@ var generateCommand = subcommands({
4293
4413
 
4294
4414
  // src/commands/init/index.ts
4295
4415
  import { existsSync, mkdirSync, writeFileSync as writeFileSync2 } from "node:fs";
4296
- import path5 from "node:path";
4416
+ import path6 from "node:path";
4297
4417
  import * as readline from "node:readline/promises";
4298
4418
 
4299
4419
  // src/templates/index.ts
4300
- import { readFileSync as readFileSync3, readdirSync, statSync } from "node:fs";
4301
- import path4 from "node:path";
4420
+ import { readFileSync as readFileSync4, readdirSync, statSync } from "node:fs";
4421
+ import path5 from "node:path";
4302
4422
  import { fileURLToPath } from "node:url";
4303
4423
  function getAgentvTemplates() {
4304
4424
  return getTemplatesFromDir(".agentv");
4305
4425
  }
4306
4426
  function getTemplatesFromDir(subdir) {
4307
- const currentDir = path4.dirname(fileURLToPath(import.meta.url));
4427
+ const currentDir = path5.dirname(fileURLToPath(import.meta.url));
4308
4428
  let templatesDir;
4309
- if (currentDir.includes(`${path4.sep}dist`)) {
4310
- templatesDir = path4.join(currentDir, "templates", subdir);
4429
+ if (currentDir.includes(`${path5.sep}dist`)) {
4430
+ templatesDir = path5.join(currentDir, "templates", subdir);
4311
4431
  } else {
4312
- templatesDir = path4.join(currentDir, subdir);
4432
+ templatesDir = path5.join(currentDir, subdir);
4313
4433
  }
4314
4434
  return readTemplatesRecursively(templatesDir, "");
4315
4435
  }
@@ -4317,15 +4437,15 @@ function readTemplatesRecursively(dir, relativePath) {
4317
4437
  const templates = [];
4318
4438
  const entries2 = readdirSync(dir);
4319
4439
  for (const entry of entries2) {
4320
- const fullPath = path4.join(dir, entry);
4440
+ const fullPath = path5.join(dir, entry);
4321
4441
  const stat3 = statSync(fullPath);
4322
- const entryRelativePath = relativePath ? path4.join(relativePath, entry) : entry;
4442
+ const entryRelativePath = relativePath ? path5.join(relativePath, entry) : entry;
4323
4443
  if (stat3.isDirectory()) {
4324
4444
  templates.push(...readTemplatesRecursively(fullPath, entryRelativePath));
4325
4445
  } else {
4326
- const content = readFileSync3(fullPath, "utf-8");
4446
+ const content = readFileSync4(fullPath, "utf-8");
4327
4447
  templates.push({
4328
- path: entryRelativePath.split(path4.sep).join("/"),
4448
+ path: entryRelativePath.split(path5.sep).join("/"),
4329
4449
  // Normalize to forward slashes
4330
4450
  content
4331
4451
  });
@@ -4354,23 +4474,23 @@ async function promptYesNo(message) {
4354
4474
  }
4355
4475
  }
4356
4476
  async function initCommand(options = {}) {
4357
- const targetPath = path5.resolve(options.targetPath ?? ".");
4358
- const agentvDir = path5.join(targetPath, ".agentv");
4477
+ const targetPath = path6.resolve(options.targetPath ?? ".");
4478
+ const agentvDir = path6.join(targetPath, ".agentv");
4359
4479
  const agentvTemplates = getAgentvTemplates();
4360
4480
  const envTemplate = agentvTemplates.find((t) => t.path === ".env.example");
4361
4481
  const otherAgentvTemplates = agentvTemplates.filter((t) => t.path !== ".env.example");
4362
4482
  const existingFiles = [];
4363
4483
  if (envTemplate) {
4364
- const envFilePath = path5.join(targetPath, ".env.example");
4484
+ const envFilePath = path6.join(targetPath, ".env.example");
4365
4485
  if (existsSync(envFilePath)) {
4366
4486
  existingFiles.push(".env.example");
4367
4487
  }
4368
4488
  }
4369
4489
  if (existsSync(agentvDir)) {
4370
4490
  for (const template of otherAgentvTemplates) {
4371
- const targetFilePath = path5.join(agentvDir, template.path);
4491
+ const targetFilePath = path6.join(agentvDir, template.path);
4372
4492
  if (existsSync(targetFilePath)) {
4373
- existingFiles.push(path5.relative(targetPath, targetFilePath));
4493
+ existingFiles.push(path6.relative(targetPath, targetFilePath));
4374
4494
  }
4375
4495
  }
4376
4496
  }
@@ -4392,18 +4512,18 @@ async function initCommand(options = {}) {
4392
4512
  mkdirSync(agentvDir, { recursive: true });
4393
4513
  }
4394
4514
  if (envTemplate) {
4395
- const envFilePath = path5.join(targetPath, ".env.example");
4515
+ const envFilePath = path6.join(targetPath, ".env.example");
4396
4516
  writeFileSync2(envFilePath, envTemplate.content, "utf-8");
4397
4517
  console.log("Created .env.example");
4398
4518
  }
4399
4519
  for (const template of otherAgentvTemplates) {
4400
- const targetFilePath = path5.join(agentvDir, template.path);
4401
- const targetDirPath = path5.dirname(targetFilePath);
4520
+ const targetFilePath = path6.join(agentvDir, template.path);
4521
+ const targetDirPath = path6.dirname(targetFilePath);
4402
4522
  if (!existsSync(targetDirPath)) {
4403
4523
  mkdirSync(targetDirPath, { recursive: true });
4404
4524
  }
4405
4525
  writeFileSync2(targetFilePath, template.content, "utf-8");
4406
- console.log(`Created ${path5.relative(targetPath, targetFilePath)}`);
4526
+ console.log(`Created ${path6.relative(targetPath, targetFilePath)}`);
4407
4527
  }
4408
4528
  console.log("\nAgentV initialized successfully!");
4409
4529
  console.log("\nFiles installed to root:");
@@ -4411,7 +4531,7 @@ async function initCommand(options = {}) {
4411
4531
  console.log(" - .env.example");
4412
4532
  }
4413
4533
  console.log(`
4414
- Files installed to ${path5.relative(targetPath, agentvDir)}:`);
4534
+ Files installed to ${path6.relative(targetPath, agentvDir)}:`);
4415
4535
  for (const t of otherAgentvTemplates) {
4416
4536
  console.log(` - ${t.path}`);
4417
4537
  }
@@ -4530,8 +4650,8 @@ var selfCommand = subcommands({
4530
4650
  });
4531
4651
 
4532
4652
  // src/commands/trace/utils.ts
4533
- import { readFileSync as readFileSync4, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
4534
- import path6 from "node:path";
4653
+ import { readFileSync as readFileSync5, readdirSync as readdirSync2, statSync as statSync2 } from "node:fs";
4654
+ import path7 from "node:path";
4535
4655
  var colors2 = {
4536
4656
  reset: "\x1B[0m",
4537
4657
  bold: "\x1B[1m",
@@ -4557,7 +4677,7 @@ function padLeft2(str, len) {
4557
4677
  return " ".repeat(Math.max(0, len - plainLen)) + str;
4558
4678
  }
4559
4679
  function loadResultFile(filePath) {
4560
- const content = readFileSync4(filePath, "utf8");
4680
+ const content = readFileSync5(filePath, "utf8");
4561
4681
  const lines = content.trim().split("\n").filter((line) => line.trim());
4562
4682
  return lines.map((line, i) => {
4563
4683
  const record = JSON.parse(line);
@@ -4568,7 +4688,7 @@ function loadResultFile(filePath) {
4568
4688
  });
4569
4689
  }
4570
4690
  function listResultFiles(cwd, limit) {
4571
- const resultsDir = path6.join(cwd, ".agentv", "results");
4691
+ const resultsDir = path7.join(cwd, ".agentv", "results");
4572
4692
  let files;
4573
4693
  try {
4574
4694
  files = readdirSync2(resultsDir).filter((f) => f.endsWith(".jsonl"));
@@ -4581,7 +4701,7 @@ function listResultFiles(cwd, limit) {
4581
4701
  }
4582
4702
  const metas = [];
4583
4703
  for (const filename of files) {
4584
- const filePath = path6.join(resultsDir, filename);
4704
+ const filePath = path7.join(resultsDir, filename);
4585
4705
  try {
4586
4706
  const stat3 = statSync2(filePath);
4587
4707
  const results = loadResultFile(filePath);
@@ -4807,8 +4927,8 @@ var stubProvider = {
4807
4927
  throw new Error("trace score does not support LLM-based evaluators");
4808
4928
  }
4809
4929
  };
4810
- var stubLlmJudge = {
4811
- kind: "llm-judge",
4930
+ var stubLlmGrader = {
4931
+ kind: "llm-grader",
4812
4932
  evaluate() {
4813
4933
  throw new Error("trace score does not support LLM-based evaluators");
4814
4934
  }
@@ -4816,7 +4936,7 @@ var stubLlmJudge = {
4816
4936
  async function runScore(results, evaluatorConfig, testIdFilter) {
4817
4937
  const registry = createBuiltinRegistry();
4818
4938
  const dispatchContext = {
4819
- llmJudge: stubLlmJudge,
4939
+ llmGrader: stubLlmGrader,
4820
4940
  registry
4821
4941
  };
4822
4942
  const evaluator = await registry.create(evaluatorConfig, dispatchContext);
@@ -5380,8 +5500,70 @@ var traceCommand = subcommands({
5380
5500
  }
5381
5501
  });
5382
5502
 
5503
+ // src/commands/transpile/index.ts
5504
+ import { writeFileSync as writeFileSync3 } from "node:fs";
5505
+ import path8 from "node:path";
5506
+ var transpileCommand = command({
5507
+ name: "transpile",
5508
+ description: "Convert an EVAL.yaml file to Agent Skills evals.json format",
5509
+ args: {
5510
+ input: positional({
5511
+ type: string,
5512
+ displayName: "input",
5513
+ description: "Path to EVAL.yaml file"
5514
+ }),
5515
+ outDir: option({
5516
+ type: optional(string),
5517
+ long: "out-dir",
5518
+ short: "d",
5519
+ description: "Output directory (defaults to directory of input file)"
5520
+ }),
5521
+ stdout: flag({
5522
+ long: "stdout",
5523
+ description: "Write to stdout instead of file(s) (only valid for single-skill output)"
5524
+ })
5525
+ },
5526
+ handler: async ({ input, outDir, stdout }) => {
5527
+ let result;
5528
+ try {
5529
+ result = transpileEvalYamlFile(path8.resolve(input));
5530
+ } catch (error) {
5531
+ console.error(`Error: ${error.message}`);
5532
+ process.exit(1);
5533
+ }
5534
+ for (const warning of result.warnings) {
5535
+ console.warn(`Warning: ${warning}`);
5536
+ }
5537
+ if (result.files.size === 0) {
5538
+ console.error("Error: No output produced (no tests found)");
5539
+ process.exit(1);
5540
+ }
5541
+ if (stdout) {
5542
+ if (result.files.size > 1) {
5543
+ console.error(
5544
+ "Error: --stdout is only valid when input produces a single evals.json (multi-skill input produces multiple files)"
5545
+ );
5546
+ process.exit(1);
5547
+ }
5548
+ const [file] = result.files.values();
5549
+ process.stdout.write(JSON.stringify(file, null, 2));
5550
+ process.stdout.write("\n");
5551
+ return;
5552
+ }
5553
+ const outputDir = outDir ? path8.resolve(outDir) : path8.dirname(path8.resolve(input));
5554
+ const fileNames = getOutputFilenames(result);
5555
+ for (const [skill, evalsJson] of result.files) {
5556
+ const fileName = fileNames.get(skill) ?? "evals.json";
5557
+ const outputPath = path8.join(outputDir, fileName);
5558
+ writeFileSync3(outputPath, `${JSON.stringify(evalsJson, null, 2)}
5559
+ `);
5560
+ console.log(`Transpiled to ${outputPath}`);
5561
+ }
5562
+ }
5563
+ });
5564
+
5383
5565
  // src/commands/trim/index.ts
5384
- import { readFileSync as readFileSync5, writeFileSync as writeFileSync3 } from "node:fs";
5566
+ import { readFileSync as readFileSync6, writeFileSync as writeFileSync4 } from "node:fs";
5385
5567
  var trimCommand = command({
5386
5568
  name: "trim",
5387
5569
  description: "Trim evaluation results for baseline storage (strips debug/audit fields)",
@@ -5400,7 +5582,7 @@ var trimCommand = command({
5400
5582
  },
5401
5583
  handler: async ({ input, out }) => {
5402
5584
  try {
5403
- const content = readFileSync5(input, "utf8");
5585
+ const content = readFileSync6(input, "utf8");
5404
5586
  const lines = content.trim().split("\n").filter((line) => line.trim());
5405
5587
  const trimmedLines = lines.map((line) => {
5406
5588
  const record = JSON.parse(line);
@@ -5412,7 +5594,7 @@ var trimCommand = command({
5412
5594
  const output = `${trimmedLines.join("\n")}
5413
5595
  `;
5414
5596
  if (out) {
5415
- writeFileSync3(out, output, "utf8");
5597
+ writeFileSync4(out, output, "utf8");
5416
5598
  console.error(`Trimmed ${lines.length} record(s) \u2192 ${out}`);
5417
5599
  } else {
5418
5600
  process.stdout.write(output);
@@ -5507,7 +5689,7 @@ function isTTY() {
5507
5689
  // src/commands/validate/validate-files.ts
5508
5690
  import { constants } from "node:fs";
5509
5691
  import { access, readdir, stat } from "node:fs/promises";
5510
- import path7 from "node:path";
5692
+ import path9 from "node:path";
5511
5693
  async function validateFiles(paths) {
5512
5694
  const filePaths = await expandPaths(paths);
5513
5695
  const results = [];
@@ -5525,7 +5707,7 @@ async function validateFiles(paths) {
5525
5707
  };
5526
5708
  }
5527
5709
  async function validateSingleFile(filePath) {
5528
- const absolutePath = path7.resolve(filePath);
5710
+ const absolutePath = path9.resolve(filePath);
5529
5711
  const fileType = await detectFileType(absolutePath);
5530
5712
  let result;
5531
5713
  if (fileType === "eval") {
@@ -5550,7 +5732,7 @@ async function validateSingleFile(filePath) {
5550
5732
  async function expandPaths(paths) {
5551
5733
  const expanded = [];
5552
5734
  for (const inputPath of paths) {
5553
- const absolutePath = path7.resolve(inputPath);
5735
+ const absolutePath = path9.resolve(inputPath);
5554
5736
  try {
5555
5737
  await access(absolutePath, constants.F_OK);
5556
5738
  } catch {
@@ -5574,7 +5756,7 @@ async function findYamlFiles(dirPath) {
5574
5756
  try {
5575
5757
  const entries2 = await readdir(dirPath, { withFileTypes: true });
5576
5758
  for (const entry of entries2) {
5577
- const fullPath = path7.join(dirPath, entry.name);
5759
+ const fullPath = path9.join(dirPath, entry.name);
5578
5760
  if (entry.isDirectory()) {
5579
5761
  if (entry.name === "node_modules" || entry.name.startsWith(".")) {
5580
5762
  continue;
@@ -5591,7 +5773,7 @@ async function findYamlFiles(dirPath) {
5591
5773
  return results;
5592
5774
  }
5593
5775
  function isYamlFile(filePath) {
5594
- const ext = path7.extname(filePath).toLowerCase();
5776
+ const ext = path9.extname(filePath).toLowerCase();
5595
5777
  return ext === ".yaml" || ext === ".yml";
5596
5778
  }
5597
5779
 
@@ -5630,8 +5812,8 @@ var validateCommand = command({
5630
5812
 
5631
5813
  // src/commands/workspace/clean.ts
5632
5814
  import { existsSync as existsSync2 } from "node:fs";
5633
- import { readFile as readFile3, readdir as readdir2, rm } from "node:fs/promises";
5634
- import path8 from "node:path";
5815
+ import { readFile as readFile2, readdir as readdir2, rm } from "node:fs/promises";
5816
+ import path10 from "node:path";
5635
5817
  async function confirm(message) {
5636
5818
  const readline2 = await import("node:readline");
5637
5819
  const rl = readline2.createInterface({ input: process.stdin, output: process.stdout });
@@ -5667,10 +5849,10 @@ var cleanCommand = command({
5667
5849
  const poolDirs = entries2.filter((e) => e.isDirectory());
5668
5850
  const matchingDirs = [];
5669
5851
  for (const dir of poolDirs) {
5670
- const poolDir = path8.join(poolRoot, dir.name);
5671
- const metadataPath = path8.join(poolDir, "metadata.json");
5852
+ const poolDir = path10.join(poolRoot, dir.name);
5853
+ const metadataPath = path10.join(poolDir, "metadata.json");
5672
5854
  try {
5673
- const raw = await readFile3(metadataPath, "utf-8");
5855
+ const raw = await readFile2(metadataPath, "utf-8");
5674
5856
  const metadata = JSON.parse(raw);
5675
5857
  const hasRepo = metadata.repos?.some((r) => {
5676
5858
  if (r.source.type === "git" && r.source.url) {
@@ -5699,7 +5881,7 @@ var cleanCommand = command({
5699
5881
  }
5700
5882
  for (const dir of matchingDirs) {
5701
5883
  await rm(dir, { recursive: true, force: true });
5702
- console.log(`Removed: ${path8.basename(dir).slice(0, 12)}...`);
5884
+ console.log(`Removed: ${path10.basename(dir).slice(0, 12)}...`);
5703
5885
  }
5704
5886
  console.log("Done.");
5705
5887
  } else {
@@ -5718,14 +5900,14 @@ var cleanCommand = command({
5718
5900
 
5719
5901
  // src/commands/workspace/list.ts
5720
5902
  import { existsSync as existsSync3 } from "node:fs";
5721
- import { readFile as readFile4, readdir as readdir3, stat as stat2 } from "node:fs/promises";
5722
- import path9 from "node:path";
5903
+ import { readFile as readFile3, readdir as readdir3, stat as stat2 } from "node:fs/promises";
5904
+ import path11 from "node:path";
5723
5905
  async function getDirectorySize(dirPath) {
5724
5906
  let totalSize = 0;
5725
5907
  try {
5726
5908
  const entries2 = await readdir3(dirPath, { withFileTypes: true });
5727
5909
  for (const entry of entries2) {
5728
- const fullPath = path9.join(dirPath, entry.name);
5910
+ const fullPath = path11.join(dirPath, entry.name);
5729
5911
  if (entry.isDirectory()) {
5730
5912
  totalSize += await getDirectorySize(fullPath);
5731
5913
  } else {
@@ -5760,14 +5942,14 @@ var listCommand = command({
5760
5942
  return;
5761
5943
  }
5762
5944
  for (const dir of poolDirs) {
5763
- const poolDir = path9.join(poolRoot, dir.name);
5945
+ const poolDir = path11.join(poolRoot, dir.name);
5764
5946
  const fingerprint = dir.name;
5765
5947
  const poolEntries = await readdir3(poolDir, { withFileTypes: true });
5766
5948
  const slots = poolEntries.filter((e) => e.isDirectory() && e.name.startsWith("slot-"));
5767
- const metadataPath = path9.join(poolDir, "metadata.json");
5949
+ const metadataPath = path11.join(poolDir, "metadata.json");
5768
5950
  let metadata = null;
5769
5951
  try {
5770
- const raw = await readFile4(metadataPath, "utf-8");
5952
+ const raw = await readFile3(metadataPath, "utf-8");
5771
5953
  metadata = JSON.parse(raw);
5772
5954
  } catch {
5773
5955
  }
@@ -5804,16 +5986,16 @@ var workspaceCommand = subcommands({
5804
5986
 
5805
5987
  // src/update-check.ts
5806
5988
  import { spawn as spawn2 } from "node:child_process";
5807
- import { readFile as readFile5 } from "node:fs/promises";
5989
+ import { readFile as readFile4 } from "node:fs/promises";
5808
5990
  import { join } from "node:path";
5809
5991
  var CHECK_INTERVAL_MS = 24 * 60 * 60 * 1e3;
5810
5992
  var AGENTV_DIR = getAgentvHome();
5811
5993
  var CACHE_FILE = "version-check.json";
5812
5994
  var NPM_REGISTRY_URL = "https://registry.npmjs.org/agentv/latest";
5813
- async function getCachedUpdateInfo(path10) {
5814
- const filePath = path10 ?? join(AGENTV_DIR, CACHE_FILE);
5995
+ async function getCachedUpdateInfo(path12) {
5996
+ const filePath = path12 ?? join(AGENTV_DIR, CACHE_FILE);
5815
5997
  try {
5816
- const raw = await readFile5(filePath, "utf-8");
5998
+ const raw = await readFile4(filePath, "utf-8");
5817
5999
  const data = JSON.parse(raw);
5818
6000
  if (typeof data.latestVersion === "string" && typeof data.lastCheckedAt === "string") {
5819
6001
  return data;
@@ -5894,7 +6076,7 @@ var app = subcommands({
5894
6076
  description: "AgentV CLI",
5895
6077
  version: package_default.version,
5896
6078
  cmds: {
5897
- eval: evalRunCommand,
6079
+ eval: evalCommand,
5898
6080
  prompt: evalPromptCommand,
5899
6081
  compare: compareCommand,
5900
6082
  convert: convertCommand,
@@ -5903,26 +6085,29 @@ var app = subcommands({
5903
6085
  init: initCmdTsCommand,
5904
6086
  self: selfCommand,
5905
6087
  trace: traceCommand,
6088
+ transpile: transpileCommand,
5906
6089
  trim: trimCommand,
5907
6090
  validate: validateCommand,
5908
6091
  workspace: workspaceCommand
5909
6092
  }
5910
6093
  });
5911
- var PROMPT_EVAL_SUBCOMMANDS = /* @__PURE__ */ new Set(["overview", "input", "judge"]);
6094
+ var EVAL_SUBCOMMANDS = /* @__PURE__ */ new Set(["run", "prompt", "assert"]);
6095
+ var TOP_LEVEL_COMMANDS = /* @__PURE__ */ new Set([
6096
+ "prompt",
6097
+ "compare",
6098
+ "convert",
6099
+ "create",
6100
+ "generate",
6101
+ "init",
6102
+ "self",
6103
+ "trace",
6104
+ "transpile",
6105
+ "trim",
6106
+ "validate",
6107
+ "workspace"
6108
+ ]);
5912
6109
  function preprocessArgv(argv) {
5913
6110
  const result = [...argv];
5914
- const promptIndex = result.indexOf("prompt");
5915
- if (promptIndex !== -1) {
5916
- const nextArg = result[promptIndex + 1];
5917
- if (nextArg !== "eval") {
5918
- result.splice(promptIndex + 1, 0, "eval");
5919
- }
5920
- const evalIdx = promptIndex + 1;
5921
- const subSubArg = result[evalIdx + 1];
5922
- if (subSubArg === void 0 || !PROMPT_EVAL_SUBCOMMANDS.has(subSubArg)) {
5923
- result.splice(evalIdx + 1, 0, "overview");
5924
- }
5925
- }
5926
6111
  for (let i = 0; i < result.length; i++) {
5927
6112
  if (result[i] === "--eval-id") {
5928
6113
  result[i] = "--test-id";
@@ -5930,6 +6115,16 @@ function preprocessArgv(argv) {
5930
6115
  result[i] = `--test-id=${result[i].slice("--eval-id=".length)}`;
5931
6116
  }
5932
6117
  }
6118
+ const evalIdx = result.indexOf("eval");
6119
+ if (evalIdx !== -1) {
6120
+ const isTopLevel = !result.slice(0, evalIdx).some((arg) => TOP_LEVEL_COMMANDS.has(arg));
6121
+ if (isTopLevel) {
6122
+ const nextArg = result[evalIdx + 1];
6123
+ if (nextArg !== void 0 && !EVAL_SUBCOMMANDS.has(nextArg) && nextArg !== "--help" && nextArg !== "-h") {
6124
+ result.splice(evalIdx + 1, 0, "run");
6125
+ }
6126
+ }
6127
+ }
5933
6128
  return result;
5934
6129
  }
5935
6130
  async function runCli(argv = process.argv) {
@@ -5951,4 +6146,4 @@ export {
5951
6146
  preprocessArgv,
5952
6147
  runCli
5953
6148
  };
5954
- //# sourceMappingURL=chunk-RMUVJ44Z.js.map
6149
+ //# sourceMappingURL=chunk-5WIB7A27.js.map