agentv 3.10.2 → 3.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
  2. package/dist/chunk-ETMDLQ72.js.map +1 -0
  3. package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
  4. package/dist/chunk-EZGWZVVK.js.map +1 -0
  5. package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
  6. package/dist/chunk-JEW3FEO7.js.map +1 -0
  7. package/dist/cli.js +3 -3
  8. package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
  9. package/dist/index.js +3 -3
  10. package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
  11. package/package.json +3 -1
  12. package/dist/chunk-6UE665XI.js.map +0 -1
  13. package/dist/chunk-F7LAJMTO.js.map +0 -1
  14. package/dist/chunk-KGK5NUFG.js.map +0 -1
  15. package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
  16. package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
  17. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
  18. package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
  19. package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
  20. package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
  21. package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
  22. package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
  23. package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
  24. package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
  25. package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
  26. package/dist/templates/.agentv/.env.example +0 -25
  27. package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
  28. package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
  29. package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
  30. package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
  31. package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
  32. package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
  33. package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
  34. package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
  35. package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
  36. package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
  37. package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
  38. package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
  39. package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
  40. package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
  41. /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
  42. /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
@@ -27,12 +27,12 @@ import {
27
27
  subscribeToCopilotCliLogEntries,
28
28
  subscribeToCopilotSdkLogEntries,
29
29
  subscribeToPiLogEntries
30
- } from "./chunk-KGK5NUFG.js";
30
+ } from "./chunk-EZGWZVVK.js";
31
31
 
32
32
  // package.json
33
33
  var package_default = {
34
34
  name: "agentv",
35
- version: "3.10.2",
35
+ version: "3.11.0",
36
36
  description: "CLI entry point for AgentV",
37
37
  type: "module",
38
38
  repository: {
@@ -63,6 +63,7 @@ var package_default = {
63
63
  "@ai-sdk/openai": "^3.0.0",
64
64
  "@anthropic-ai/claude-agent-sdk": "^0.2.49",
65
65
  "@github/copilot-sdk": "^0.1.25",
66
+ "@hono/node-server": "^1.19.11",
66
67
  "@inquirer/prompts": "^8.2.1",
67
68
  "@mariozechner/pi-agent-core": "^0.54.2",
68
69
  "@mariozechner/pi-ai": "^0.54.2",
@@ -70,6 +71,7 @@ var package_default = {
70
71
  "cmd-ts": "^0.14.3",
71
72
  dotenv: "^16.4.5",
72
73
  "fast-glob": "^3.3.3",
74
+ hono: "^4.12.9",
73
75
  json5: "^2.2.3",
74
76
  micromatch: "^4.0.8",
75
77
  semver: "^7.7.4",
@@ -202,7 +204,7 @@ async function discoverTargetsFile(options) {
202
204
  }
203
205
 
204
206
  // src/commands/eval/run-eval.ts
205
- import { constants as constants4 } from "node:fs";
207
+ import { constants as constants4, mkdirSync } from "node:fs";
206
208
  import { access as access4 } from "node:fs/promises";
207
209
  import path13 from "node:path";
208
210
  import { pathToFileURL } from "node:url";
@@ -478,6 +480,33 @@ function buildBenchmarkArtifact(results, evalFile = "") {
478
480
  notes
479
481
  };
480
482
  }
483
+ function buildAggregateGradingArtifact(results) {
484
+ const assertions = [];
485
+ for (const result of results) {
486
+ if (!result.assertions) continue;
487
+ const testId = result.testId ?? "unknown";
488
+ for (const a of result.assertions) {
489
+ assertions.push({
490
+ test_id: testId,
491
+ text: a.text,
492
+ passed: a.passed,
493
+ evidence: a.evidence ?? ""
494
+ });
495
+ }
496
+ }
497
+ const passed = assertions.filter((a) => a.passed).length;
498
+ const failed = assertions.filter((a) => !a.passed).length;
499
+ const total = assertions.length;
500
+ return {
501
+ assertions,
502
+ summary: {
503
+ passed,
504
+ failed,
505
+ total,
506
+ pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
507
+ }
508
+ };
509
+ }
481
510
  function toCamelCase(str) {
482
511
  return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
483
512
  }
@@ -518,6 +547,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
518
547
  const gradingDir = path3.join(outputDir, "grading");
519
548
  const timingPath = path3.join(outputDir, "timing.json");
520
549
  const benchmarkPath = path3.join(outputDir, "benchmark.json");
550
+ const aggregateGradingPath = path3.join(outputDir, "grading.json");
521
551
  await mkdir(gradingDir, { recursive: true });
522
552
  for (const result of results) {
523
553
  const grading = buildGradingArtifact(result);
@@ -532,7 +562,10 @@ async function writeArtifactsFromResults(results, outputDir, options) {
532
562
  const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
533
563
  await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
534
564
  `, "utf8");
535
- return { gradingDir, timingPath, benchmarkPath };
565
+ const aggregateGrading = buildAggregateGradingArtifact(results);
566
+ await writeFile(aggregateGradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
567
+ `, "utf8");
568
+ return { gradingDir, timingPath, benchmarkPath, aggregateGradingPath };
536
569
  }
537
570
 
538
571
  // src/commands/eval/benchmark-writer.ts
@@ -1646,20 +1679,6 @@ async function createOutputWriter(filePath, format) {
1646
1679
  }
1647
1680
  }
1648
1681
  }
1649
- function getDefaultExtension(format) {
1650
- switch (format) {
1651
- case "jsonl":
1652
- return ".jsonl";
1653
- case "yaml":
1654
- return ".yaml";
1655
- case "html":
1656
- return ".html";
1657
- default: {
1658
- const exhaustiveCheck = format;
1659
- throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
1660
- }
1661
- }
1662
- }
1663
1682
  var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
1664
1683
  function createWriterFromPath(filePath) {
1665
1684
  const ext = path10.extname(filePath).toLowerCase();
@@ -1866,6 +1885,12 @@ async function loadNonErrorResults(jsonlPath) {
1866
1885
  import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
1867
1886
  import path11 from "node:path";
1868
1887
  var CACHE_FILENAME = "cache.json";
1888
+ function resolveRunCacheFile(cache) {
1889
+ if (cache.lastRunDir) {
1890
+ return path11.join(cache.lastRunDir, "results.jsonl");
1891
+ }
1892
+ return cache.lastResultFile ?? "";
1893
+ }
1869
1894
  function cachePath(cwd) {
1870
1895
  return path11.join(cwd, ".agentv", CACHE_FILENAME);
1871
1896
  }
@@ -1877,11 +1902,11 @@ async function loadRunCache(cwd) {
1877
1902
  return void 0;
1878
1903
  }
1879
1904
  }
1880
- async function saveRunCache(cwd, resultFile) {
1905
+ async function saveRunCache(cwd, runDir) {
1881
1906
  const dir = path11.join(cwd, ".agentv");
1882
1907
  await mkdir7(dir, { recursive: true });
1883
1908
  const cache = {
1884
- lastResultFile: resultFile,
1909
+ lastRunDir: runDir,
1885
1910
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
1886
1911
  };
1887
1912
  await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
@@ -3787,10 +3812,10 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
3787
3812
  // Precedence: CLI > YAML config > TS config
3788
3813
  otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ?? (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
3789
3814
  traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ?? (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
3790
- exportOtel: normalizeBoolean(rawOptions.exportOtel),
3791
- otelBackend: normalizeString(rawOptions.otelBackend),
3792
- otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
3793
- otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
3815
+ exportOtel: normalizeBoolean(rawOptions.exportOtel) || yamlExecution?.export_otel === true,
3816
+ otelBackend: normalizeString(rawOptions.otelBackend) ?? yamlExecution?.otel_backend,
3817
+ otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
3818
+ otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
3794
3819
  retryErrors: normalizeString(rawOptions.retryErrors),
3795
3820
  workspaceMode,
3796
3821
  workspacePath,
@@ -3808,11 +3833,12 @@ async function ensureFileExists(filePath, description) {
3808
3833
  throw new Error(`${description} not found: ${filePath}`);
3809
3834
  }
3810
3835
  }
3811
- function buildDefaultOutputPath(cwd, format) {
3836
+ function buildDefaultOutputPath(cwd) {
3812
3837
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3813
- const baseName = "eval";
3814
- const extension = getDefaultExtension(format);
3815
- return path13.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3838
+ const dirName = `eval_${timestamp}`;
3839
+ const runDir = path13.join(cwd, ".agentv", "results", "raw", dirName);
3840
+ mkdirSync(runDir, { recursive: true });
3841
+ return path13.join(runDir, "results.jsonl");
3816
3842
  }
3817
3843
  function createProgressReporter(maxWorkers, options) {
3818
3844
  const display = new ProgressDisplay(maxWorkers, options);
@@ -4155,7 +4181,7 @@ async function runEvalCommand(input) {
4155
4181
  const useFileExport = !!(options.otelFile || options.traceFile);
4156
4182
  if (options.exportOtel || useFileExport) {
4157
4183
  try {
4158
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-3QUJEJUT.js");
4184
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QERRYDSC.js");
4159
4185
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4160
4186
  let headers = {};
4161
4187
  if (options.otelBackend) {
@@ -4196,7 +4222,7 @@ async function runEvalCommand(input) {
4196
4222
  otelExporter = null;
4197
4223
  }
4198
4224
  }
4199
- const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
4225
+ const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd);
4200
4226
  const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
4201
4227
  const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4202
4228
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
@@ -4430,7 +4456,15 @@ Results written to: ${outputPath}`);
4430
4456
  console.log(` ${p}`);
4431
4457
  }
4432
4458
  }
4433
- await saveRunCache(cwd, outputPath).catch(() => void 0);
4459
+ const runDir = path13.dirname(outputPath);
4460
+ await saveRunCache(cwd, runDir).catch(() => void 0);
4461
+ if (outputPath.endsWith(".jsonl")) {
4462
+ const { writeFile: writeFile7 } = await import("node:fs/promises");
4463
+ const gradingPath = path13.join(path13.dirname(outputPath), "grading.json");
4464
+ const aggregateGrading = buildAggregateGradingArtifact(allResults);
4465
+ await writeFile7(gradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
4466
+ `, "utf8");
4467
+ }
4434
4468
  }
4435
4469
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
4436
4470
  const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
@@ -4488,7 +4522,9 @@ export {
4488
4522
  buildGradingArtifact,
4489
4523
  buildTimingArtifact,
4490
4524
  buildBenchmarkArtifact,
4525
+ buildAggregateGradingArtifact,
4491
4526
  parseJsonlResults,
4527
+ resolveRunCacheFile,
4492
4528
  loadRunCache,
4493
4529
  detectFileType,
4494
4530
  validateEvalFile,
@@ -4500,4 +4536,4 @@ export {
4500
4536
  selectTarget,
4501
4537
  runEvalCommand
4502
4538
  };
4503
- //# sourceMappingURL=chunk-F7LAJMTO.js.map
4539
+ //# sourceMappingURL=chunk-JEW3FEO7.js.map