agentv 3.10.2 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-6UE665XI.js → chunk-ETMDLQ72.js} +1141 -60
- package/dist/chunk-ETMDLQ72.js.map +1 -0
- package/dist/{chunk-KGK5NUFG.js → chunk-EZGWZVVK.js} +377 -163
- package/dist/chunk-EZGWZVVK.js.map +1 -0
- package/dist/{chunk-F7LAJMTO.js → chunk-JEW3FEO7.js} +68 -32
- package/dist/chunk-JEW3FEO7.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-3QUJEJUT.js → dist-QERRYDSC.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-EO6AR2R3.js → interactive-AD4PRYDN.js} +3 -3
- package/package.json +3 -1
- package/dist/chunk-6UE665XI.js.map +0 -1
- package/dist/chunk-F7LAJMTO.js.map +0 -1
- package/dist/chunk-KGK5NUFG.js.map +0 -1
- package/dist/templates/.agents/skills/agentv-chat-to-eval/README.md +0 -84
- package/dist/templates/.agents/skills/agentv-chat-to-eval/SKILL.md +0 -144
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-json.md +0 -67
- package/dist/templates/.agents/skills/agentv-chat-to-eval/examples/transcript-markdown.md +0 -101
- package/dist/templates/.agents/skills/agentv-eval-builder/SKILL.md +0 -458
- package/dist/templates/.agents/skills/agentv-eval-builder/references/config-schema.json +0 -36
- package/dist/templates/.agents/skills/agentv-eval-builder/references/custom-evaluators.md +0 -118
- package/dist/templates/.agents/skills/agentv-eval-builder/references/eval-schema.json +0 -12753
- package/dist/templates/.agents/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -77
- package/dist/templates/.agents/skills/agentv-eval-orchestrator/SKILL.md +0 -50
- package/dist/templates/.agents/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.agentv/.env.example +0 -25
- package/dist/templates/.claude/skills/agentv-eval-builder/SKILL.md +0 -177
- package/dist/templates/.claude/skills/agentv-eval-builder/references/batch-cli-evaluator.md +0 -316
- package/dist/templates/.claude/skills/agentv-eval-builder/references/compare-command.md +0 -137
- package/dist/templates/.claude/skills/agentv-eval-builder/references/composite-evaluator.md +0 -215
- package/dist/templates/.claude/skills/agentv-eval-builder/references/config-schema.json +0 -27
- package/dist/templates/.claude/skills/agentv-eval-builder/references/custom-evaluators.md +0 -115
- package/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json +0 -278
- package/dist/templates/.claude/skills/agentv-eval-builder/references/example-evals.md +0 -333
- package/dist/templates/.claude/skills/agentv-eval-builder/references/rubric-evaluator.md +0 -79
- package/dist/templates/.claude/skills/agentv-eval-builder/references/structured-data-evaluators.md +0 -121
- package/dist/templates/.claude/skills/agentv-eval-builder/references/tool-trajectory-evaluator.md +0 -298
- package/dist/templates/.claude/skills/agentv-prompt-optimizer/SKILL.md +0 -78
- package/dist/templates/.github/prompts/agentv-eval-build.prompt.md +0 -5
- package/dist/templates/.github/prompts/agentv-optimize.prompt.md +0 -4
- /package/dist/{dist-3QUJEJUT.js.map → dist-QERRYDSC.js.map} +0 -0
- /package/dist/{interactive-EO6AR2R3.js.map → interactive-AD4PRYDN.js.map} +0 -0
|
@@ -27,12 +27,12 @@ import {
|
|
|
27
27
|
subscribeToCopilotCliLogEntries,
|
|
28
28
|
subscribeToCopilotSdkLogEntries,
|
|
29
29
|
subscribeToPiLogEntries
|
|
30
|
-
} from "./chunk-KGK5NUFG.js";
|
|
30
|
+
} from "./chunk-EZGWZVVK.js";
|
|
31
31
|
|
|
32
32
|
// package.json
|
|
33
33
|
var package_default = {
|
|
34
34
|
name: "agentv",
|
|
35
|
-
version: "3.10.2",
|
|
35
|
+
version: "3.11.0",
|
|
36
36
|
description: "CLI entry point for AgentV",
|
|
37
37
|
type: "module",
|
|
38
38
|
repository: {
|
|
@@ -63,6 +63,7 @@ var package_default = {
|
|
|
63
63
|
"@ai-sdk/openai": "^3.0.0",
|
|
64
64
|
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
|
|
65
65
|
"@github/copilot-sdk": "^0.1.25",
|
|
66
|
+
"@hono/node-server": "^1.19.11",
|
|
66
67
|
"@inquirer/prompts": "^8.2.1",
|
|
67
68
|
"@mariozechner/pi-agent-core": "^0.54.2",
|
|
68
69
|
"@mariozechner/pi-ai": "^0.54.2",
|
|
@@ -70,6 +71,7 @@ var package_default = {
|
|
|
70
71
|
"cmd-ts": "^0.14.3",
|
|
71
72
|
dotenv: "^16.4.5",
|
|
72
73
|
"fast-glob": "^3.3.3",
|
|
74
|
+
hono: "^4.12.9",
|
|
73
75
|
json5: "^2.2.3",
|
|
74
76
|
micromatch: "^4.0.8",
|
|
75
77
|
semver: "^7.7.4",
|
|
@@ -202,7 +204,7 @@ async function discoverTargetsFile(options) {
|
|
|
202
204
|
}
|
|
203
205
|
|
|
204
206
|
// src/commands/eval/run-eval.ts
|
|
205
|
-
import { constants as constants4 } from "node:fs";
|
|
207
|
+
import { constants as constants4, mkdirSync } from "node:fs";
|
|
206
208
|
import { access as access4 } from "node:fs/promises";
|
|
207
209
|
import path13 from "node:path";
|
|
208
210
|
import { pathToFileURL } from "node:url";
|
|
@@ -478,6 +480,33 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
478
480
|
notes
|
|
479
481
|
};
|
|
480
482
|
}
|
|
483
|
+
function buildAggregateGradingArtifact(results) {
|
|
484
|
+
const assertions = [];
|
|
485
|
+
for (const result of results) {
|
|
486
|
+
if (!result.assertions) continue;
|
|
487
|
+
const testId = result.testId ?? "unknown";
|
|
488
|
+
for (const a of result.assertions) {
|
|
489
|
+
assertions.push({
|
|
490
|
+
test_id: testId,
|
|
491
|
+
text: a.text,
|
|
492
|
+
passed: a.passed,
|
|
493
|
+
evidence: a.evidence ?? ""
|
|
494
|
+
});
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
const passed = assertions.filter((a) => a.passed).length;
|
|
498
|
+
const failed = assertions.filter((a) => !a.passed).length;
|
|
499
|
+
const total = assertions.length;
|
|
500
|
+
return {
|
|
501
|
+
assertions,
|
|
502
|
+
summary: {
|
|
503
|
+
passed,
|
|
504
|
+
failed,
|
|
505
|
+
total,
|
|
506
|
+
pass_rate: total > 0 ? Math.round(passed / total * 1e3) / 1e3 : 0
|
|
507
|
+
}
|
|
508
|
+
};
|
|
509
|
+
}
|
|
481
510
|
function toCamelCase(str) {
|
|
482
511
|
return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase());
|
|
483
512
|
}
|
|
@@ -518,6 +547,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
518
547
|
const gradingDir = path3.join(outputDir, "grading");
|
|
519
548
|
const timingPath = path3.join(outputDir, "timing.json");
|
|
520
549
|
const benchmarkPath = path3.join(outputDir, "benchmark.json");
|
|
550
|
+
const aggregateGradingPath = path3.join(outputDir, "grading.json");
|
|
521
551
|
await mkdir(gradingDir, { recursive: true });
|
|
522
552
|
for (const result of results) {
|
|
523
553
|
const grading = buildGradingArtifact(result);
|
|
@@ -532,7 +562,10 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
532
562
|
const benchmark = buildBenchmarkArtifact(results, options?.evalFile);
|
|
533
563
|
await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}
|
|
534
564
|
`, "utf8");
|
|
535
|
-
|
|
565
|
+
const aggregateGrading = buildAggregateGradingArtifact(results);
|
|
566
|
+
await writeFile(aggregateGradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
|
|
567
|
+
`, "utf8");
|
|
568
|
+
return { gradingDir, timingPath, benchmarkPath, aggregateGradingPath };
|
|
536
569
|
}
|
|
537
570
|
|
|
538
571
|
// src/commands/eval/benchmark-writer.ts
|
|
@@ -1646,20 +1679,6 @@ async function createOutputWriter(filePath, format) {
|
|
|
1646
1679
|
}
|
|
1647
1680
|
}
|
|
1648
1681
|
}
|
|
1649
|
-
function getDefaultExtension(format) {
|
|
1650
|
-
switch (format) {
|
|
1651
|
-
case "jsonl":
|
|
1652
|
-
return ".jsonl";
|
|
1653
|
-
case "yaml":
|
|
1654
|
-
return ".yaml";
|
|
1655
|
-
case "html":
|
|
1656
|
-
return ".html";
|
|
1657
|
-
default: {
|
|
1658
|
-
const exhaustiveCheck = format;
|
|
1659
|
-
throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
|
|
1660
|
-
}
|
|
1661
|
-
}
|
|
1662
|
-
}
|
|
1663
1682
|
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".jsonl", ".json", ".xml", ".yaml", ".yml", ".html", ".htm"]);
|
|
1664
1683
|
function createWriterFromPath(filePath) {
|
|
1665
1684
|
const ext = path10.extname(filePath).toLowerCase();
|
|
@@ -1866,6 +1885,12 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
1866
1885
|
import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
|
|
1867
1886
|
import path11 from "node:path";
|
|
1868
1887
|
var CACHE_FILENAME = "cache.json";
|
|
1888
|
+
function resolveRunCacheFile(cache) {
|
|
1889
|
+
if (cache.lastRunDir) {
|
|
1890
|
+
return path11.join(cache.lastRunDir, "results.jsonl");
|
|
1891
|
+
}
|
|
1892
|
+
return cache.lastResultFile ?? "";
|
|
1893
|
+
}
|
|
1869
1894
|
function cachePath(cwd) {
|
|
1870
1895
|
return path11.join(cwd, ".agentv", CACHE_FILENAME);
|
|
1871
1896
|
}
|
|
@@ -1877,11 +1902,11 @@ async function loadRunCache(cwd) {
|
|
|
1877
1902
|
return void 0;
|
|
1878
1903
|
}
|
|
1879
1904
|
}
|
|
1880
|
-
async function saveRunCache(cwd,
|
|
1905
|
+
async function saveRunCache(cwd, runDir) {
|
|
1881
1906
|
const dir = path11.join(cwd, ".agentv");
|
|
1882
1907
|
await mkdir7(dir, { recursive: true });
|
|
1883
1908
|
const cache = {
|
|
1884
|
-
|
|
1909
|
+
lastRunDir: runDir,
|
|
1885
1910
|
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
1886
1911
|
};
|
|
1887
1912
|
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
@@ -3787,10 +3812,10 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
3787
3812
|
// Precedence: CLI > YAML config > TS config
|
|
3788
3813
|
otelFile: normalizeString(rawOptions.otelFile) ?? (yamlExecution?.otel_file ? resolveTimestampPlaceholder(yamlExecution.otel_file) : void 0) ?? (config?.execution?.otelFile ? resolveTimestampPlaceholder(config.execution.otelFile) : void 0),
|
|
3789
3814
|
traceFile: normalizeString(rawOptions.traceFile) ?? (yamlExecution?.trace_file ? resolveTimestampPlaceholder(yamlExecution.trace_file) : void 0) ?? (config?.execution?.traceFile ? resolveTimestampPlaceholder(config.execution.traceFile) : void 0),
|
|
3790
|
-
exportOtel: normalizeBoolean(rawOptions.exportOtel),
|
|
3791
|
-
otelBackend: normalizeString(rawOptions.otelBackend),
|
|
3792
|
-
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
|
|
3793
|
-
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
|
|
3815
|
+
exportOtel: normalizeBoolean(rawOptions.exportOtel) || yamlExecution?.export_otel === true,
|
|
3816
|
+
otelBackend: normalizeString(rawOptions.otelBackend) ?? yamlExecution?.otel_backend,
|
|
3817
|
+
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent) || yamlExecution?.otel_capture_content === true,
|
|
3818
|
+
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns) || yamlExecution?.otel_group_turns === true,
|
|
3794
3819
|
retryErrors: normalizeString(rawOptions.retryErrors),
|
|
3795
3820
|
workspaceMode,
|
|
3796
3821
|
workspacePath,
|
|
@@ -3808,11 +3833,12 @@ async function ensureFileExists(filePath, description) {
|
|
|
3808
3833
|
throw new Error(`${description} not found: ${filePath}`);
|
|
3809
3834
|
}
|
|
3810
3835
|
}
|
|
3811
|
-
function buildDefaultOutputPath(cwd
|
|
3836
|
+
function buildDefaultOutputPath(cwd) {
|
|
3812
3837
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3813
|
-
const
|
|
3814
|
-
const
|
|
3815
|
-
|
|
3838
|
+
const dirName = `eval_${timestamp}`;
|
|
3839
|
+
const runDir = path13.join(cwd, ".agentv", "results", "raw", dirName);
|
|
3840
|
+
mkdirSync(runDir, { recursive: true });
|
|
3841
|
+
return path13.join(runDir, "results.jsonl");
|
|
3816
3842
|
}
|
|
3817
3843
|
function createProgressReporter(maxWorkers, options) {
|
|
3818
3844
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -4155,7 +4181,7 @@ async function runEvalCommand(input) {
|
|
|
4155
4181
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
4156
4182
|
if (options.exportOtel || useFileExport) {
|
|
4157
4183
|
try {
|
|
4158
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-3QUJEJUT.js");
|
|
4184
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-QERRYDSC.js");
|
|
4159
4185
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4160
4186
|
let headers = {};
|
|
4161
4187
|
if (options.otelBackend) {
|
|
@@ -4196,7 +4222,7 @@ async function runEvalCommand(input) {
|
|
|
4196
4222
|
otelExporter = null;
|
|
4197
4223
|
}
|
|
4198
4224
|
}
|
|
4199
|
-
const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd
|
|
4225
|
+
const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd);
|
|
4200
4226
|
const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
|
|
4201
4227
|
const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
4202
4228
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
@@ -4430,7 +4456,15 @@ Results written to: ${outputPath}`);
|
|
|
4430
4456
|
console.log(` ${p}`);
|
|
4431
4457
|
}
|
|
4432
4458
|
}
|
|
4433
|
-
|
|
4459
|
+
const runDir = path13.dirname(outputPath);
|
|
4460
|
+
await saveRunCache(cwd, runDir).catch(() => void 0);
|
|
4461
|
+
if (outputPath.endsWith(".jsonl")) {
|
|
4462
|
+
const { writeFile: writeFile7 } = await import("node:fs/promises");
|
|
4463
|
+
const gradingPath = path13.join(path13.dirname(outputPath), "grading.json");
|
|
4464
|
+
const aggregateGrading = buildAggregateGradingArtifact(allResults);
|
|
4465
|
+
await writeFile7(gradingPath, `${JSON.stringify(aggregateGrading, null, 2)}
|
|
4466
|
+
`, "utf8");
|
|
4467
|
+
}
|
|
4434
4468
|
}
|
|
4435
4469
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4436
4470
|
const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
|
|
@@ -4488,7 +4522,9 @@ export {
|
|
|
4488
4522
|
buildGradingArtifact,
|
|
4489
4523
|
buildTimingArtifact,
|
|
4490
4524
|
buildBenchmarkArtifact,
|
|
4525
|
+
buildAggregateGradingArtifact,
|
|
4491
4526
|
parseJsonlResults,
|
|
4527
|
+
resolveRunCacheFile,
|
|
4492
4528
|
loadRunCache,
|
|
4493
4529
|
detectFileType,
|
|
4494
4530
|
validateEvalFile,
|
|
@@ -4500,4 +4536,4 @@ export {
|
|
|
4500
4536
|
selectTarget,
|
|
4501
4537
|
runEvalCommand
|
|
4502
4538
|
};
|
|
4503
|
-
//# sourceMappingURL=chunk-F7LAJMTO.js.map
|
|
4539
|
+
//# sourceMappingURL=chunk-JEW3FEO7.js.map
|