agentv 3.9.1 → 3.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,12 +27,12 @@ import {
27
27
  subscribeToCopilotCliLogEntries,
28
28
  subscribeToCopilotSdkLogEntries,
29
29
  subscribeToPiLogEntries
30
- } from "./chunk-X24J6HCV.js";
30
+ } from "./chunk-X3KJVUAB.js";
31
31
 
32
32
  // package.json
33
33
  var package_default = {
34
34
  name: "agentv",
35
- version: "3.9.1",
35
+ version: "3.10.0",
36
36
  description: "CLI entry point for AgentV",
37
37
  type: "module",
38
38
  repository: {
@@ -204,7 +204,7 @@ async function discoverTargetsFile(options) {
204
204
  // src/commands/eval/run-eval.ts
205
205
  import { constants as constants4 } from "node:fs";
206
206
  import { access as access4 } from "node:fs/promises";
207
- import path12 from "node:path";
207
+ import path13 from "node:path";
208
208
  import { pathToFileURL } from "node:url";
209
209
 
210
210
  // src/version-check.ts
@@ -282,7 +282,7 @@ function computePassRate(result) {
282
282
  const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
283
283
  return passed / scores.length;
284
284
  }
285
- return result.score >= PASS_THRESHOLD ? 1 : 0;
285
+ return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
286
286
  }
287
287
  function countToolCalls(result) {
288
288
  const toolCalls = {};
@@ -323,6 +323,7 @@ function parseWorkspaceChanges(fileChanges) {
323
323
  };
324
324
  }
325
325
  function buildAssertions(result) {
326
+ if (!result.assertions) return [];
326
327
  return result.assertions.map((a) => ({
327
328
  text: a.text,
328
329
  passed: a.passed,
@@ -403,8 +404,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
403
404
  const targetSet = /* @__PURE__ */ new Set();
404
405
  const testIdSet = /* @__PURE__ */ new Set();
405
406
  for (const result of results) {
406
- targetSet.add(result.target);
407
- testIdSet.add(result.testId);
407
+ targetSet.add(result.target ?? "unknown");
408
+ testIdSet.add(result.testId ?? "unknown");
408
409
  }
409
410
  const targets = [...targetSet].sort();
410
411
  const testIds = [...testIdSet].sort();
@@ -452,7 +453,9 @@ function buildBenchmarkArtifact(results, evalFile = "") {
452
453
  perEvaluatorSummary[key] = computeStats(scores);
453
454
  }
454
455
  }
455
- const errorCount = results.filter((r) => r.executionStatus === "execution_error").length;
456
+ const errorCount = results.filter(
457
+ (r) => r.executionStatus != null && r.executionStatus === "execution_error"
458
+ ).length;
456
459
  if (errorCount > 0) {
457
460
  notes.push(
458
461
  `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
@@ -518,7 +521,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
518
521
  await mkdir(gradingDir, { recursive: true });
519
522
  for (const result of results) {
520
523
  const grading = buildGradingArtifact(result);
521
- const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, "_");
524
+ const safeTestId = (result.testId ?? "unknown").replace(/[/\\:*?"<>|]/g, "_");
522
525
  const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
523
526
  await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
524
527
  `, "utf8");
@@ -1766,12 +1769,12 @@ var ProgressDisplay = class {
1766
1769
  }
1767
1770
  addLogPaths(paths, provider) {
1768
1771
  const newPaths = [];
1769
- for (const path13 of paths) {
1770
- if (this.logPathSet.has(path13)) {
1772
+ for (const path14 of paths) {
1773
+ if (this.logPathSet.has(path14)) {
1771
1774
  continue;
1772
1775
  }
1773
- this.logPathSet.add(path13);
1774
- newPaths.push(path13);
1776
+ this.logPathSet.add(path14);
1777
+ newPaths.push(path14);
1775
1778
  }
1776
1779
  if (newPaths.length === 0) {
1777
1780
  return;
@@ -1784,8 +1787,8 @@ var ProgressDisplay = class {
1784
1787
  this.hasPrintedLogHeader = true;
1785
1788
  }
1786
1789
  const startIndex = this.logPaths.length - newPaths.length;
1787
- newPaths.forEach((path13, offset) => {
1788
- console.log(`${startIndex + offset + 1}. ${path13}`);
1790
+ newPaths.forEach((path14, offset) => {
1791
+ console.log(`${startIndex + offset + 1}. ${path14}`);
1789
1792
  });
1790
1793
  }
1791
1794
  finish() {
@@ -1859,6 +1862,32 @@ async function loadNonErrorResults(jsonlPath) {
1859
1862
  return results;
1860
1863
  }
1861
1864
 
1865
+ // src/commands/eval/run-cache.ts
1866
+ import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
1867
+ import path11 from "node:path";
1868
+ var CACHE_FILENAME = "cache.json";
1869
+ function cachePath(cwd) {
1870
+ return path11.join(cwd, ".agentv", CACHE_FILENAME);
1871
+ }
1872
+ async function loadRunCache(cwd) {
1873
+ try {
1874
+ const content = await readFile2(cachePath(cwd), "utf-8");
1875
+ return JSON.parse(content);
1876
+ } catch {
1877
+ return void 0;
1878
+ }
1879
+ }
1880
+ async function saveRunCache(cwd, resultFile) {
1881
+ const dir = path11.join(cwd, ".agentv");
1882
+ await mkdir7(dir, { recursive: true });
1883
+ const cache = {
1884
+ lastResultFile: resultFile,
1885
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
1886
+ };
1887
+ await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
1888
+ `, "utf-8");
1889
+ }
1890
+
1862
1891
  // src/commands/eval/statistics.ts
1863
1892
  var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
1864
1893
  function computeMean(values) {
@@ -2107,13 +2136,13 @@ function formatMatrixSummary(results) {
2107
2136
  }
2108
2137
 
2109
2138
  // ../../packages/core/dist/evaluation/validation/index.js
2110
- import { readFile as readFile2 } from "node:fs/promises";
2111
- import path11 from "node:path";
2139
+ import { readFile as readFile3 } from "node:fs/promises";
2140
+ import path12 from "node:path";
2112
2141
  import { parse } from "yaml";
2113
2142
  import { readFile as readFile22 } from "node:fs/promises";
2114
2143
  import path22 from "node:path";
2115
2144
  import { parse as parse2 } from "yaml";
2116
- import { readFile as readFile3 } from "node:fs/promises";
2145
+ import { readFile as readFile32 } from "node:fs/promises";
2117
2146
  import path32 from "node:path";
2118
2147
  import { parse as parse3 } from "yaml";
2119
2148
  import { readFile as readFile4 } from "node:fs/promises";
@@ -2126,7 +2155,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
2126
2155
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
2127
2156
  async function detectFileType(filePath) {
2128
2157
  try {
2129
- const content = await readFile2(filePath, "utf8");
2158
+ const content = await readFile3(filePath, "utf8");
2130
2159
  const parsed = parse(content);
2131
2160
  if (typeof parsed !== "object" || parsed === null) {
2132
2161
  return inferFileTypeFromPath(filePath);
@@ -2151,8 +2180,8 @@ async function detectFileType(filePath) {
2151
2180
  }
2152
2181
  }
2153
2182
  function inferFileTypeFromPath(filePath) {
2154
- const normalized = path11.normalize(filePath).replace(/\\/g, "/");
2155
- const basename = path11.basename(filePath);
2183
+ const normalized = path12.normalize(filePath).replace(/\\/g, "/");
2184
+ const basename = path12.basename(filePath);
2156
2185
  if (normalized.includes("/.agentv/")) {
2157
2186
  if (basename === "config.yaml" || basename === "config.yml") {
2158
2187
  return "config";
@@ -3008,8 +3037,8 @@ async function validateTargetsFile(filePath) {
3008
3037
  const absolutePath = path32.resolve(filePath);
3009
3038
  let parsed;
3010
3039
  try {
3011
- const content = await readFile3(absolutePath, "utf8");
3012
- parsed = parse3(content);
3040
+ const content = await readFile32(absolutePath, "utf8");
3041
+ parsed = interpolateEnv(parse3(content), process.env);
3013
3042
  } catch (error) {
3014
3043
  errors.push({
3015
3044
  severity: "error",
@@ -3784,7 +3813,7 @@ function buildDefaultOutputPath(cwd, format) {
3784
3813
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3785
3814
  const baseName = "eval";
3786
3815
  const extension = getDefaultExtension(format);
3787
- return path12.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3816
+ return path13.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3788
3817
  }
3789
3818
  function createProgressReporter(maxWorkers, options) {
3790
3819
  const display = new ProgressDisplay(maxWorkers, options);
@@ -3798,7 +3827,7 @@ function createProgressReporter(maxWorkers, options) {
3798
3827
  };
3799
3828
  }
3800
3829
  function makeEvalKey(testFilePath, evalId) {
3801
- return `${path12.resolve(testFilePath)}::${evalId}`;
3830
+ return `${path13.resolve(testFilePath)}::${evalId}`;
3802
3831
  }
3803
3832
  function createDisplayIdTracker() {
3804
3833
  const map = /* @__PURE__ */ new Map();
@@ -4080,7 +4109,7 @@ async function runEvalCommand(input) {
4080
4109
  );
4081
4110
  }
4082
4111
  const repoRoot = await findRepoRoot(cwd);
4083
- const yamlConfig = await loadConfig(path12.join(cwd, "_"), repoRoot);
4112
+ const yamlConfig = await loadConfig(path13.join(cwd, "_"), repoRoot);
4084
4113
  if (yamlConfig?.required_version) {
4085
4114
  await enforceRequiredVersion(yamlConfig.required_version, {
4086
4115
  strict: normalizeBoolean(input.rawOptions.strict)
@@ -4092,7 +4121,7 @@ async function runEvalCommand(input) {
4092
4121
  }
4093
4122
  let retryNonErrorResults;
4094
4123
  if (options.retryErrors) {
4095
- const retryPath = path12.resolve(options.retryErrors);
4124
+ const retryPath = path13.resolve(options.retryErrors);
4096
4125
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
4097
4126
  const errorIds = await loadErrorTestIds(retryPath);
4098
4127
  if (errorIds.length === 0) {
@@ -4105,7 +4134,7 @@ async function runEvalCommand(input) {
4105
4134
  retryNonErrorResults = await loadNonErrorResults(retryPath);
4106
4135
  }
4107
4136
  if (options.workspacePath) {
4108
- const resolvedWorkspace = path12.resolve(options.workspacePath);
4137
+ const resolvedWorkspace = path13.resolve(options.workspacePath);
4109
4138
  try {
4110
4139
  const { stat: stat2 } = await import("node:fs/promises");
4111
4140
  const stats = await stat2(resolvedWorkspace);
@@ -4127,7 +4156,7 @@ async function runEvalCommand(input) {
4127
4156
  const useFileExport = !!(options.otelFile || options.traceFile);
4128
4157
  if (options.exportOtel || useFileExport) {
4129
4158
  try {
4130
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-LPIGPS52.js");
4159
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-DJFWBJNJ.js");
4131
4160
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4132
4161
  let headers = {};
4133
4162
  if (options.otelBackend) {
@@ -4151,8 +4180,8 @@ async function runEvalCommand(input) {
4151
4180
  headers,
4152
4181
  captureContent,
4153
4182
  groupTurns: options.otelGroupTurns,
4154
- otlpFilePath: options.otelFile ? path12.resolve(options.otelFile) : void 0,
4155
- traceFilePath: options.traceFile ? path12.resolve(options.traceFile) : void 0
4183
+ otlpFilePath: options.otelFile ? path13.resolve(options.otelFile) : void 0,
4184
+ traceFilePath: options.traceFile ? path13.resolve(options.traceFile) : void 0
4156
4185
  });
4157
4186
  const initialized = await otelExporter.init();
4158
4187
  if (!initialized) {
@@ -4168,8 +4197,8 @@ async function runEvalCommand(input) {
4168
4197
  otelExporter = null;
4169
4198
  }
4170
4199
  }
4171
- const outputPath = options.outPath ? path12.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
4172
- const extraOutputPaths = options.outputPaths.map((p) => path12.resolve(p));
4200
+ const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
4201
+ const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
4173
4202
  const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4174
4203
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
4175
4204
  let outputWriter;
@@ -4183,12 +4212,12 @@ async function runEvalCommand(input) {
4183
4212
  console.log(` ${p}`);
4184
4213
  }
4185
4214
  }
4186
- const resolvedTestFiles = input.testFiles.map((file) => path12.resolve(file));
4215
+ const resolvedTestFiles = input.testFiles.map((file) => path13.resolve(file));
4187
4216
  if (options.otelFile) {
4188
- console.log(`OTLP JSON file: ${path12.resolve(options.otelFile)}`);
4217
+ console.log(`OTLP JSON file: ${path13.resolve(options.otelFile)}`);
4189
4218
  }
4190
4219
  if (options.traceFile) {
4191
- console.log(`Trace file: ${path12.resolve(options.traceFile)}`);
4220
+ console.log(`Trace file: ${path13.resolve(options.traceFile)}`);
4192
4221
  }
4193
4222
  const evaluationRunner = await resolveEvaluationRunner();
4194
4223
  const allResults = [];
@@ -4234,7 +4263,7 @@ async function runEvalCommand(input) {
4234
4263
  cliNoCache: options.noCache,
4235
4264
  yamlCache: yamlCacheEnabled
4236
4265
  });
4237
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path12.resolve(yamlCachePath) : void 0) : void 0;
4266
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path13.resolve(yamlCachePath) : void 0) : void 0;
4238
4267
  const useCache = cacheEnabled;
4239
4268
  if (cacheEnabled) {
4240
4269
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
@@ -4366,12 +4395,12 @@ async function runEvalCommand(input) {
4366
4395
  console.log(formatMatrixSummary(allResults));
4367
4396
  }
4368
4397
  if (options.benchmarkJson && allResults.length > 0) {
4369
- const benchmarkPath = path12.resolve(options.benchmarkJson);
4398
+ const benchmarkPath = path13.resolve(options.benchmarkJson);
4370
4399
  await writeBenchmarkJson(benchmarkPath, allResults);
4371
4400
  console.log(`Benchmark written to: ${benchmarkPath}`);
4372
4401
  }
4373
4402
  if (options.artifacts && allResults.length > 0) {
4374
- const artifactsDir = path12.resolve(options.artifacts);
4403
+ const artifactsDir = path13.resolve(options.artifacts);
4375
4404
  const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
4376
4405
  const {
4377
4406
  gradingDir,
@@ -4402,11 +4431,12 @@ Results written to: ${outputPath}`);
4402
4431
  console.log(` ${p}`);
4403
4432
  }
4404
4433
  }
4434
+ await saveRunCache(cwd, outputPath).catch(() => void 0);
4405
4435
  }
4406
4436
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
4407
- const evalFileArgs = resolvedTestFiles.map((f) => path12.relative(cwd, f)).join(" ");
4437
+ const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
4408
4438
  const targetFlag = options.target ? ` --target ${options.target}` : "";
4409
- const relativeOutputPath = path12.relative(cwd, outputPath);
4439
+ const relativeOutputPath = path13.relative(cwd, outputPath);
4410
4440
  console.log(
4411
4441
  `
4412
4442
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
@@ -4438,7 +4468,7 @@ async function resolveEvaluationRunner() {
4438
4468
  if (!overridePath) {
4439
4469
  return runEvaluation;
4440
4470
  }
4441
- const resolved = path12.isAbsolute(overridePath) ? overridePath : path12.resolve(process.cwd(), overridePath);
4471
+ const resolved = path13.isAbsolute(overridePath) ? overridePath : path13.resolve(process.cwd(), overridePath);
4442
4472
  const moduleUrl = pathToFileURL(resolved).href;
4443
4473
  const mod = await import(moduleUrl);
4444
4474
  const candidate = mod.runEvaluation;
@@ -4460,6 +4490,7 @@ export {
4460
4490
  buildTimingArtifact,
4461
4491
  buildBenchmarkArtifact,
4462
4492
  parseJsonlResults,
4493
+ loadRunCache,
4463
4494
  detectFileType,
4464
4495
  validateEvalFile,
4465
4496
  validateTargetsFile,
@@ -4470,4 +4501,4 @@ export {
4470
4501
  selectTarget,
4471
4502
  runEvalCommand
4472
4503
  };
4473
- //# sourceMappingURL=chunk-FRA6PDLZ.js.map
4504
+ //# sourceMappingURL=chunk-DDMAQT5P.js.map