agentv 3.9.1 → 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-FRA6PDLZ.js → chunk-DDMAQT5P.js} +73 -42
- package/dist/chunk-DDMAQT5P.js.map +1 -0
- package/dist/{chunk-FNIEABNM.js → chunk-DJ6FJ6J4.js} +29 -18
- package/dist/chunk-DJ6FJ6J4.js.map +1 -0
- package/dist/{chunk-X24J6HCV.js → chunk-X3KJVUAB.js} +93 -49
- package/dist/chunk-X3KJVUAB.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/{dist-LPIGPS52.js → dist-DJFWBJNJ.js} +2 -2
- package/dist/index.js +3 -3
- package/dist/{interactive-O7HENH55.js → interactive-ST4ZSRK4.js} +3 -3
- package/dist/templates/.agentv/config.yaml +4 -13
- package/dist/templates/.agentv/targets.yaml +0 -16
- package/dist/templates/{.agentv/.env.example → .env.example} +11 -9
- package/package.json +1 -1
- package/dist/chunk-FNIEABNM.js.map +0 -1
- package/dist/chunk-FRA6PDLZ.js.map +0 -1
- package/dist/chunk-X24J6HCV.js.map +0 -1
- package/dist/{dist-LPIGPS52.js.map → dist-DJFWBJNJ.js.map} +0 -0
- package/dist/{interactive-O7HENH55.js.map → interactive-ST4ZSRK4.js.map} +0 -0
|
@@ -27,12 +27,12 @@ import {
|
|
|
27
27
|
subscribeToCopilotCliLogEntries,
|
|
28
28
|
subscribeToCopilotSdkLogEntries,
|
|
29
29
|
subscribeToPiLogEntries
|
|
30
|
-
} from "./chunk-X24J6HCV.js";
|
|
30
|
+
} from "./chunk-X3KJVUAB.js";
|
|
31
31
|
|
|
32
32
|
// package.json
|
|
33
33
|
var package_default = {
|
|
34
34
|
name: "agentv",
|
|
35
|
-
version: "3.9.1",
|
|
35
|
+
version: "3.10.0",
|
|
36
36
|
description: "CLI entry point for AgentV",
|
|
37
37
|
type: "module",
|
|
38
38
|
repository: {
|
|
@@ -204,7 +204,7 @@ async function discoverTargetsFile(options) {
|
|
|
204
204
|
// src/commands/eval/run-eval.ts
|
|
205
205
|
import { constants as constants4 } from "node:fs";
|
|
206
206
|
import { access as access4 } from "node:fs/promises";
|
|
207
|
-
import
|
|
207
|
+
import path13 from "node:path";
|
|
208
208
|
import { pathToFileURL } from "node:url";
|
|
209
209
|
|
|
210
210
|
// src/version-check.ts
|
|
@@ -282,7 +282,7 @@ function computePassRate(result) {
|
|
|
282
282
|
const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
|
|
283
283
|
return passed / scores.length;
|
|
284
284
|
}
|
|
285
|
-
return result.score >= PASS_THRESHOLD ? 1 : 0;
|
|
285
|
+
return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
|
|
286
286
|
}
|
|
287
287
|
function countToolCalls(result) {
|
|
288
288
|
const toolCalls = {};
|
|
@@ -323,6 +323,7 @@ function parseWorkspaceChanges(fileChanges) {
|
|
|
323
323
|
};
|
|
324
324
|
}
|
|
325
325
|
function buildAssertions(result) {
|
|
326
|
+
if (!result.assertions) return [];
|
|
326
327
|
return result.assertions.map((a) => ({
|
|
327
328
|
text: a.text,
|
|
328
329
|
passed: a.passed,
|
|
@@ -403,8 +404,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
403
404
|
const targetSet = /* @__PURE__ */ new Set();
|
|
404
405
|
const testIdSet = /* @__PURE__ */ new Set();
|
|
405
406
|
for (const result of results) {
|
|
406
|
-
targetSet.add(result.target);
|
|
407
|
-
testIdSet.add(result.testId);
|
|
407
|
+
targetSet.add(result.target ?? "unknown");
|
|
408
|
+
testIdSet.add(result.testId ?? "unknown");
|
|
408
409
|
}
|
|
409
410
|
const targets = [...targetSet].sort();
|
|
410
411
|
const testIds = [...testIdSet].sort();
|
|
@@ -452,7 +453,9 @@ function buildBenchmarkArtifact(results, evalFile = "") {
|
|
|
452
453
|
perEvaluatorSummary[key] = computeStats(scores);
|
|
453
454
|
}
|
|
454
455
|
}
|
|
455
|
-
const errorCount = results.filter(
|
|
456
|
+
const errorCount = results.filter(
|
|
457
|
+
(r) => r.executionStatus != null && r.executionStatus === "execution_error"
|
|
458
|
+
).length;
|
|
456
459
|
if (errorCount > 0) {
|
|
457
460
|
notes.push(
|
|
458
461
|
`${errorCount} test(s) had execution errors and are included in pass_rate as failures`
|
|
@@ -518,7 +521,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
|
|
|
518
521
|
await mkdir(gradingDir, { recursive: true });
|
|
519
522
|
for (const result of results) {
|
|
520
523
|
const grading = buildGradingArtifact(result);
|
|
521
|
-
const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, "_");
|
|
524
|
+
const safeTestId = (result.testId ?? "unknown").replace(/[/\\:*?"<>|]/g, "_");
|
|
522
525
|
const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
|
|
523
526
|
await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
|
|
524
527
|
`, "utf8");
|
|
@@ -1766,12 +1769,12 @@ var ProgressDisplay = class {
|
|
|
1766
1769
|
}
|
|
1767
1770
|
addLogPaths(paths, provider) {
|
|
1768
1771
|
const newPaths = [];
|
|
1769
|
-
for (const
|
|
1770
|
-
if (this.logPathSet.has(
|
|
1772
|
+
for (const path14 of paths) {
|
|
1773
|
+
if (this.logPathSet.has(path14)) {
|
|
1771
1774
|
continue;
|
|
1772
1775
|
}
|
|
1773
|
-
this.logPathSet.add(
|
|
1774
|
-
newPaths.push(
|
|
1776
|
+
this.logPathSet.add(path14);
|
|
1777
|
+
newPaths.push(path14);
|
|
1775
1778
|
}
|
|
1776
1779
|
if (newPaths.length === 0) {
|
|
1777
1780
|
return;
|
|
@@ -1784,8 +1787,8 @@ var ProgressDisplay = class {
|
|
|
1784
1787
|
this.hasPrintedLogHeader = true;
|
|
1785
1788
|
}
|
|
1786
1789
|
const startIndex = this.logPaths.length - newPaths.length;
|
|
1787
|
-
newPaths.forEach((
|
|
1788
|
-
console.log(`${startIndex + offset + 1}. ${
|
|
1790
|
+
newPaths.forEach((path14, offset) => {
|
|
1791
|
+
console.log(`${startIndex + offset + 1}. ${path14}`);
|
|
1789
1792
|
});
|
|
1790
1793
|
}
|
|
1791
1794
|
finish() {
|
|
@@ -1859,6 +1862,32 @@ async function loadNonErrorResults(jsonlPath) {
|
|
|
1859
1862
|
return results;
|
|
1860
1863
|
}
|
|
1861
1864
|
|
|
1865
|
+
// src/commands/eval/run-cache.ts
|
|
1866
|
+
import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
|
|
1867
|
+
import path11 from "node:path";
|
|
1868
|
+
var CACHE_FILENAME = "cache.json";
|
|
1869
|
+
function cachePath(cwd) {
|
|
1870
|
+
return path11.join(cwd, ".agentv", CACHE_FILENAME);
|
|
1871
|
+
}
|
|
1872
|
+
async function loadRunCache(cwd) {
|
|
1873
|
+
try {
|
|
1874
|
+
const content = await readFile2(cachePath(cwd), "utf-8");
|
|
1875
|
+
return JSON.parse(content);
|
|
1876
|
+
} catch {
|
|
1877
|
+
return void 0;
|
|
1878
|
+
}
|
|
1879
|
+
}
|
|
1880
|
+
async function saveRunCache(cwd, resultFile) {
|
|
1881
|
+
const dir = path11.join(cwd, ".agentv");
|
|
1882
|
+
await mkdir7(dir, { recursive: true });
|
|
1883
|
+
const cache = {
|
|
1884
|
+
lastResultFile: resultFile,
|
|
1885
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
1886
|
+
};
|
|
1887
|
+
await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
|
|
1888
|
+
`, "utf-8");
|
|
1889
|
+
}
|
|
1890
|
+
|
|
1862
1891
|
// src/commands/eval/statistics.ts
|
|
1863
1892
|
var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
|
|
1864
1893
|
function computeMean(values) {
|
|
@@ -2107,13 +2136,13 @@ function formatMatrixSummary(results) {
|
|
|
2107
2136
|
}
|
|
2108
2137
|
|
|
2109
2138
|
// ../../packages/core/dist/evaluation/validation/index.js
|
|
2110
|
-
import { readFile as
|
|
2111
|
-
import
|
|
2139
|
+
import { readFile as readFile3 } from "node:fs/promises";
|
|
2140
|
+
import path12 from "node:path";
|
|
2112
2141
|
import { parse } from "yaml";
|
|
2113
2142
|
import { readFile as readFile22 } from "node:fs/promises";
|
|
2114
2143
|
import path22 from "node:path";
|
|
2115
2144
|
import { parse as parse2 } from "yaml";
|
|
2116
|
-
import { readFile as
|
|
2145
|
+
import { readFile as readFile32 } from "node:fs/promises";
|
|
2117
2146
|
import path32 from "node:path";
|
|
2118
2147
|
import { parse as parse3 } from "yaml";
|
|
2119
2148
|
import { readFile as readFile4 } from "node:fs/promises";
|
|
@@ -2126,7 +2155,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
|
|
|
2126
2155
|
var SCHEMA_CONFIG_V2 = "agentv-config-v2";
|
|
2127
2156
|
async function detectFileType(filePath) {
|
|
2128
2157
|
try {
|
|
2129
|
-
const content = await
|
|
2158
|
+
const content = await readFile3(filePath, "utf8");
|
|
2130
2159
|
const parsed = parse(content);
|
|
2131
2160
|
if (typeof parsed !== "object" || parsed === null) {
|
|
2132
2161
|
return inferFileTypeFromPath(filePath);
|
|
@@ -2151,8 +2180,8 @@ async function detectFileType(filePath) {
|
|
|
2151
2180
|
}
|
|
2152
2181
|
}
|
|
2153
2182
|
function inferFileTypeFromPath(filePath) {
|
|
2154
|
-
const normalized =
|
|
2155
|
-
const basename =
|
|
2183
|
+
const normalized = path12.normalize(filePath).replace(/\\/g, "/");
|
|
2184
|
+
const basename = path12.basename(filePath);
|
|
2156
2185
|
if (normalized.includes("/.agentv/")) {
|
|
2157
2186
|
if (basename === "config.yaml" || basename === "config.yml") {
|
|
2158
2187
|
return "config";
|
|
@@ -3008,8 +3037,8 @@ async function validateTargetsFile(filePath) {
|
|
|
3008
3037
|
const absolutePath = path32.resolve(filePath);
|
|
3009
3038
|
let parsed;
|
|
3010
3039
|
try {
|
|
3011
|
-
const content = await
|
|
3012
|
-
parsed = parse3(content);
|
|
3040
|
+
const content = await readFile32(absolutePath, "utf8");
|
|
3041
|
+
parsed = interpolateEnv(parse3(content), process.env);
|
|
3013
3042
|
} catch (error) {
|
|
3014
3043
|
errors.push({
|
|
3015
3044
|
severity: "error",
|
|
@@ -3784,7 +3813,7 @@ function buildDefaultOutputPath(cwd, format) {
|
|
|
3784
3813
|
const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
3785
3814
|
const baseName = "eval";
|
|
3786
3815
|
const extension = getDefaultExtension(format);
|
|
3787
|
-
return
|
|
3816
|
+
return path13.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
|
|
3788
3817
|
}
|
|
3789
3818
|
function createProgressReporter(maxWorkers, options) {
|
|
3790
3819
|
const display = new ProgressDisplay(maxWorkers, options);
|
|
@@ -3798,7 +3827,7 @@ function createProgressReporter(maxWorkers, options) {
|
|
|
3798
3827
|
};
|
|
3799
3828
|
}
|
|
3800
3829
|
function makeEvalKey(testFilePath, evalId) {
|
|
3801
|
-
return `${
|
|
3830
|
+
return `${path13.resolve(testFilePath)}::${evalId}`;
|
|
3802
3831
|
}
|
|
3803
3832
|
function createDisplayIdTracker() {
|
|
3804
3833
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -4080,7 +4109,7 @@ async function runEvalCommand(input) {
|
|
|
4080
4109
|
);
|
|
4081
4110
|
}
|
|
4082
4111
|
const repoRoot = await findRepoRoot(cwd);
|
|
4083
|
-
const yamlConfig = await loadConfig(
|
|
4112
|
+
const yamlConfig = await loadConfig(path13.join(cwd, "_"), repoRoot);
|
|
4084
4113
|
if (yamlConfig?.required_version) {
|
|
4085
4114
|
await enforceRequiredVersion(yamlConfig.required_version, {
|
|
4086
4115
|
strict: normalizeBoolean(input.rawOptions.strict)
|
|
@@ -4092,7 +4121,7 @@ async function runEvalCommand(input) {
|
|
|
4092
4121
|
}
|
|
4093
4122
|
let retryNonErrorResults;
|
|
4094
4123
|
if (options.retryErrors) {
|
|
4095
|
-
const retryPath =
|
|
4124
|
+
const retryPath = path13.resolve(options.retryErrors);
|
|
4096
4125
|
await ensureFileExists(retryPath, "Retry-errors JSONL file");
|
|
4097
4126
|
const errorIds = await loadErrorTestIds(retryPath);
|
|
4098
4127
|
if (errorIds.length === 0) {
|
|
@@ -4105,7 +4134,7 @@ async function runEvalCommand(input) {
|
|
|
4105
4134
|
retryNonErrorResults = await loadNonErrorResults(retryPath);
|
|
4106
4135
|
}
|
|
4107
4136
|
if (options.workspacePath) {
|
|
4108
|
-
const resolvedWorkspace =
|
|
4137
|
+
const resolvedWorkspace = path13.resolve(options.workspacePath);
|
|
4109
4138
|
try {
|
|
4110
4139
|
const { stat: stat2 } = await import("node:fs/promises");
|
|
4111
4140
|
const stats = await stat2(resolvedWorkspace);
|
|
@@ -4127,7 +4156,7 @@ async function runEvalCommand(input) {
|
|
|
4127
4156
|
const useFileExport = !!(options.otelFile || options.traceFile);
|
|
4128
4157
|
if (options.exportOtel || useFileExport) {
|
|
4129
4158
|
try {
|
|
4130
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-LPIGPS52.js");
|
|
4159
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-DJFWBJNJ.js");
|
|
4131
4160
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
4132
4161
|
let headers = {};
|
|
4133
4162
|
if (options.otelBackend) {
|
|
@@ -4151,8 +4180,8 @@ async function runEvalCommand(input) {
|
|
|
4151
4180
|
headers,
|
|
4152
4181
|
captureContent,
|
|
4153
4182
|
groupTurns: options.otelGroupTurns,
|
|
4154
|
-
otlpFilePath: options.otelFile ?
|
|
4155
|
-
traceFilePath: options.traceFile ?
|
|
4183
|
+
otlpFilePath: options.otelFile ? path13.resolve(options.otelFile) : void 0,
|
|
4184
|
+
traceFilePath: options.traceFile ? path13.resolve(options.traceFile) : void 0
|
|
4156
4185
|
});
|
|
4157
4186
|
const initialized = await otelExporter.init();
|
|
4158
4187
|
if (!initialized) {
|
|
@@ -4168,8 +4197,8 @@ async function runEvalCommand(input) {
|
|
|
4168
4197
|
otelExporter = null;
|
|
4169
4198
|
}
|
|
4170
4199
|
}
|
|
4171
|
-
const outputPath = options.outPath ?
|
|
4172
|
-
const extraOutputPaths = options.outputPaths.map((p) =>
|
|
4200
|
+
const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
|
|
4201
|
+
const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
|
|
4173
4202
|
const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
|
|
4174
4203
|
const uniqueOutputPaths = [...new Set(allOutputPaths)];
|
|
4175
4204
|
let outputWriter;
|
|
@@ -4183,12 +4212,12 @@ async function runEvalCommand(input) {
|
|
|
4183
4212
|
console.log(` ${p}`);
|
|
4184
4213
|
}
|
|
4185
4214
|
}
|
|
4186
|
-
const resolvedTestFiles = input.testFiles.map((file) =>
|
|
4215
|
+
const resolvedTestFiles = input.testFiles.map((file) => path13.resolve(file));
|
|
4187
4216
|
if (options.otelFile) {
|
|
4188
|
-
console.log(`OTLP JSON file: ${
|
|
4217
|
+
console.log(`OTLP JSON file: ${path13.resolve(options.otelFile)}`);
|
|
4189
4218
|
}
|
|
4190
4219
|
if (options.traceFile) {
|
|
4191
|
-
console.log(`Trace file: ${
|
|
4220
|
+
console.log(`Trace file: ${path13.resolve(options.traceFile)}`);
|
|
4192
4221
|
}
|
|
4193
4222
|
const evaluationRunner = await resolveEvaluationRunner();
|
|
4194
4223
|
const allResults = [];
|
|
@@ -4234,7 +4263,7 @@ async function runEvalCommand(input) {
|
|
|
4234
4263
|
cliNoCache: options.noCache,
|
|
4235
4264
|
yamlCache: yamlCacheEnabled
|
|
4236
4265
|
});
|
|
4237
|
-
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ?
|
|
4266
|
+
const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path13.resolve(yamlCachePath) : void 0) : void 0;
|
|
4238
4267
|
const useCache = cacheEnabled;
|
|
4239
4268
|
if (cacheEnabled) {
|
|
4240
4269
|
console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
|
|
@@ -4366,12 +4395,12 @@ async function runEvalCommand(input) {
|
|
|
4366
4395
|
console.log(formatMatrixSummary(allResults));
|
|
4367
4396
|
}
|
|
4368
4397
|
if (options.benchmarkJson && allResults.length > 0) {
|
|
4369
|
-
const benchmarkPath =
|
|
4398
|
+
const benchmarkPath = path13.resolve(options.benchmarkJson);
|
|
4370
4399
|
await writeBenchmarkJson(benchmarkPath, allResults);
|
|
4371
4400
|
console.log(`Benchmark written to: ${benchmarkPath}`);
|
|
4372
4401
|
}
|
|
4373
4402
|
if (options.artifacts && allResults.length > 0) {
|
|
4374
|
-
const artifactsDir =
|
|
4403
|
+
const artifactsDir = path13.resolve(options.artifacts);
|
|
4375
4404
|
const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
|
|
4376
4405
|
const {
|
|
4377
4406
|
gradingDir,
|
|
@@ -4402,11 +4431,12 @@ Results written to: ${outputPath}`);
|
|
|
4402
4431
|
console.log(` ${p}`);
|
|
4403
4432
|
}
|
|
4404
4433
|
}
|
|
4434
|
+
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
4405
4435
|
}
|
|
4406
4436
|
if (summary.executionErrorCount > 0 && !options.retryErrors) {
|
|
4407
|
-
const evalFileArgs = resolvedTestFiles.map((f) =>
|
|
4437
|
+
const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
|
|
4408
4438
|
const targetFlag = options.target ? ` --target ${options.target}` : "";
|
|
4409
|
-
const relativeOutputPath =
|
|
4439
|
+
const relativeOutputPath = path13.relative(cwd, outputPath);
|
|
4410
4440
|
console.log(
|
|
4411
4441
|
`
|
|
4412
4442
|
Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
|
|
@@ -4438,7 +4468,7 @@ async function resolveEvaluationRunner() {
|
|
|
4438
4468
|
if (!overridePath) {
|
|
4439
4469
|
return runEvaluation;
|
|
4440
4470
|
}
|
|
4441
|
-
const resolved =
|
|
4471
|
+
const resolved = path13.isAbsolute(overridePath) ? overridePath : path13.resolve(process.cwd(), overridePath);
|
|
4442
4472
|
const moduleUrl = pathToFileURL(resolved).href;
|
|
4443
4473
|
const mod = await import(moduleUrl);
|
|
4444
4474
|
const candidate = mod.runEvaluation;
|
|
@@ -4460,6 +4490,7 @@ export {
|
|
|
4460
4490
|
buildTimingArtifact,
|
|
4461
4491
|
buildBenchmarkArtifact,
|
|
4462
4492
|
parseJsonlResults,
|
|
4493
|
+
loadRunCache,
|
|
4463
4494
|
detectFileType,
|
|
4464
4495
|
validateEvalFile,
|
|
4465
4496
|
validateTargetsFile,
|
|
@@ -4470,4 +4501,4 @@ export {
|
|
|
4470
4501
|
selectTarget,
|
|
4471
4502
|
runEvalCommand
|
|
4472
4503
|
};
|
|
4473
|
-
//# sourceMappingURL=chunk-FRA6PDLZ.js.map
|
|
4504
|
+
//# sourceMappingURL=chunk-DDMAQT5P.js.map
|