agentv 3.9.0 → 3.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,12 +27,12 @@ import {
27
27
  subscribeToCopilotCliLogEntries,
28
28
  subscribeToCopilotSdkLogEntries,
29
29
  subscribeToPiLogEntries
30
- } from "./chunk-TXDPYXHY.js";
30
+ } from "./chunk-OIVGGWJ3.js";
31
31
 
32
32
  // package.json
33
33
  var package_default = {
34
34
  name: "agentv",
35
- version: "3.9.0",
35
+ version: "3.9.2",
36
36
  description: "CLI entry point for AgentV",
37
37
  type: "module",
38
38
  repository: {
@@ -204,7 +204,7 @@ async function discoverTargetsFile(options) {
204
204
  // src/commands/eval/run-eval.ts
205
205
  import { constants as constants4 } from "node:fs";
206
206
  import { access as access4 } from "node:fs/promises";
207
- import path12 from "node:path";
207
+ import path13 from "node:path";
208
208
  import { pathToFileURL } from "node:url";
209
209
 
210
210
  // src/version-check.ts
@@ -282,7 +282,7 @@ function computePassRate(result) {
282
282
  const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length;
283
283
  return passed / scores.length;
284
284
  }
285
- return result.score >= PASS_THRESHOLD ? 1 : 0;
285
+ return (result.score ?? 0) >= PASS_THRESHOLD ? 1 : 0;
286
286
  }
287
287
  function countToolCalls(result) {
288
288
  const toolCalls = {};
@@ -323,6 +323,7 @@ function parseWorkspaceChanges(fileChanges) {
323
323
  };
324
324
  }
325
325
  function buildAssertions(result) {
326
+ if (!result.assertions) return [];
326
327
  return result.assertions.map((a) => ({
327
328
  text: a.text,
328
329
  passed: a.passed,
@@ -403,8 +404,8 @@ function buildBenchmarkArtifact(results, evalFile = "") {
403
404
  const targetSet = /* @__PURE__ */ new Set();
404
405
  const testIdSet = /* @__PURE__ */ new Set();
405
406
  for (const result of results) {
406
- targetSet.add(result.target);
407
- testIdSet.add(result.testId);
407
+ targetSet.add(result.target ?? "unknown");
408
+ testIdSet.add(result.testId ?? "unknown");
408
409
  }
409
410
  const targets = [...targetSet].sort();
410
411
  const testIds = [...testIdSet].sort();
@@ -452,7 +453,9 @@ function buildBenchmarkArtifact(results, evalFile = "") {
452
453
  perEvaluatorSummary[key] = computeStats(scores);
453
454
  }
454
455
  }
455
- const errorCount = results.filter((r) => r.executionStatus === "execution_error").length;
456
+ const errorCount = results.filter(
457
+ (r) => r.executionStatus != null && r.executionStatus === "execution_error"
458
+ ).length;
456
459
  if (errorCount > 0) {
457
460
  notes.push(
458
461
  `${errorCount} test(s) had execution errors and are included in pass_rate as failures`
@@ -518,7 +521,7 @@ async function writeArtifactsFromResults(results, outputDir, options) {
518
521
  await mkdir(gradingDir, { recursive: true });
519
522
  for (const result of results) {
520
523
  const grading = buildGradingArtifact(result);
521
- const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, "_");
524
+ const safeTestId = (result.testId ?? "unknown").replace(/[/\\:*?"<>|]/g, "_");
522
525
  const gradingPath = path3.join(gradingDir, `${safeTestId}.json`);
523
526
  await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}
524
527
  `, "utf8");
@@ -1766,12 +1769,12 @@ var ProgressDisplay = class {
1766
1769
  }
1767
1770
  addLogPaths(paths, provider) {
1768
1771
  const newPaths = [];
1769
- for (const path13 of paths) {
1770
- if (this.logPathSet.has(path13)) {
1772
+ for (const path14 of paths) {
1773
+ if (this.logPathSet.has(path14)) {
1771
1774
  continue;
1772
1775
  }
1773
- this.logPathSet.add(path13);
1774
- newPaths.push(path13);
1776
+ this.logPathSet.add(path14);
1777
+ newPaths.push(path14);
1775
1778
  }
1776
1779
  if (newPaths.length === 0) {
1777
1780
  return;
@@ -1784,8 +1787,8 @@ var ProgressDisplay = class {
1784
1787
  this.hasPrintedLogHeader = true;
1785
1788
  }
1786
1789
  const startIndex = this.logPaths.length - newPaths.length;
1787
- newPaths.forEach((path13, offset) => {
1788
- console.log(`${startIndex + offset + 1}. ${path13}`);
1790
+ newPaths.forEach((path14, offset) => {
1791
+ console.log(`${startIndex + offset + 1}. ${path14}`);
1789
1792
  });
1790
1793
  }
1791
1794
  finish() {
@@ -1859,6 +1862,32 @@ async function loadNonErrorResults(jsonlPath) {
1859
1862
  return results;
1860
1863
  }
1861
1864
 
1865
+ // src/commands/eval/run-cache.ts
1866
+ import { mkdir as mkdir7, readFile as readFile2, writeFile as writeFile6 } from "node:fs/promises";
1867
+ import path11 from "node:path";
1868
+ var CACHE_FILENAME = "cache.json";
1869
+ function cachePath(cwd) {
1870
+ return path11.join(cwd, ".agentv", CACHE_FILENAME);
1871
+ }
1872
+ async function loadRunCache(cwd) {
1873
+ try {
1874
+ const content = await readFile2(cachePath(cwd), "utf-8");
1875
+ return JSON.parse(content);
1876
+ } catch {
1877
+ return void 0;
1878
+ }
1879
+ }
1880
+ async function saveRunCache(cwd, resultFile) {
1881
+ const dir = path11.join(cwd, ".agentv");
1882
+ await mkdir7(dir, { recursive: true });
1883
+ const cache = {
1884
+ lastResultFile: resultFile,
1885
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
1886
+ };
1887
+ await writeFile6(cachePath(cwd), `${JSON.stringify(cache, null, 2)}
1888
+ `, "utf-8");
1889
+ }
1890
+
1862
1891
  // src/commands/eval/statistics.ts
1863
1892
  var HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1];
1864
1893
  function computeMean(values) {
@@ -2107,13 +2136,13 @@ function formatMatrixSummary(results) {
2107
2136
  }
2108
2137
 
2109
2138
  // ../../packages/core/dist/evaluation/validation/index.js
2110
- import { readFile as readFile2 } from "node:fs/promises";
2111
- import path11 from "node:path";
2139
+ import { readFile as readFile3 } from "node:fs/promises";
2140
+ import path12 from "node:path";
2112
2141
  import { parse } from "yaml";
2113
2142
  import { readFile as readFile22 } from "node:fs/promises";
2114
2143
  import path22 from "node:path";
2115
2144
  import { parse as parse2 } from "yaml";
2116
- import { readFile as readFile3 } from "node:fs/promises";
2145
+ import { readFile as readFile32 } from "node:fs/promises";
2117
2146
  import path32 from "node:path";
2118
2147
  import { parse as parse3 } from "yaml";
2119
2148
  import { readFile as readFile4 } from "node:fs/promises";
@@ -2126,7 +2155,7 @@ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
2126
2155
  var SCHEMA_CONFIG_V2 = "agentv-config-v2";
2127
2156
  async function detectFileType(filePath) {
2128
2157
  try {
2129
- const content = await readFile2(filePath, "utf8");
2158
+ const content = await readFile3(filePath, "utf8");
2130
2159
  const parsed = parse(content);
2131
2160
  if (typeof parsed !== "object" || parsed === null) {
2132
2161
  return inferFileTypeFromPath(filePath);
@@ -2151,8 +2180,8 @@ async function detectFileType(filePath) {
2151
2180
  }
2152
2181
  }
2153
2182
  function inferFileTypeFromPath(filePath) {
2154
- const normalized = path11.normalize(filePath).replace(/\\/g, "/");
2155
- const basename = path11.basename(filePath);
2183
+ const normalized = path12.normalize(filePath).replace(/\\/g, "/");
2184
+ const basename = path12.basename(filePath);
2156
2185
  if (normalized.includes("/.agentv/")) {
2157
2186
  if (basename === "config.yaml" || basename === "config.yml") {
2158
2187
  return "config";
@@ -3008,7 +3037,7 @@ async function validateTargetsFile(filePath) {
3008
3037
  const absolutePath = path32.resolve(filePath);
3009
3038
  let parsed;
3010
3039
  try {
3011
- const content = await readFile3(absolutePath, "utf8");
3040
+ const content = await readFile32(absolutePath, "utf8");
3012
3041
  parsed = parse3(content);
3013
3042
  } catch (error) {
3014
3043
  errors.push({
@@ -3784,7 +3813,7 @@ function buildDefaultOutputPath(cwd, format) {
3784
3813
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
3785
3814
  const baseName = "eval";
3786
3815
  const extension = getDefaultExtension(format);
3787
- return path12.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3816
+ return path13.join(cwd, ".agentv", "results", `${baseName}_${timestamp}${extension}`);
3788
3817
  }
3789
3818
  function createProgressReporter(maxWorkers, options) {
3790
3819
  const display = new ProgressDisplay(maxWorkers, options);
@@ -3798,7 +3827,7 @@ function createProgressReporter(maxWorkers, options) {
3798
3827
  };
3799
3828
  }
3800
3829
  function makeEvalKey(testFilePath, evalId) {
3801
- return `${path12.resolve(testFilePath)}::${evalId}`;
3830
+ return `${path13.resolve(testFilePath)}::${evalId}`;
3802
3831
  }
3803
3832
  function createDisplayIdTracker() {
3804
3833
  const map = /* @__PURE__ */ new Map();
@@ -3913,6 +3942,7 @@ async function prepareFileMetadata(params) {
3913
3942
  selections,
3914
3943
  trialsConfig: suite.trials,
3915
3944
  suiteTargets,
3945
+ yamlWorkers: suite.workers,
3916
3946
  yamlCache: suite.cacheConfig?.enabled,
3917
3947
  yamlCachePath: suite.cacheConfig?.cachePath,
3918
3948
  totalBudgetUsd: suite.totalBudgetUsd,
@@ -3942,6 +3972,7 @@ async function runSingleEvalFile(params) {
3942
3972
  cache,
3943
3973
  evaluationRunner,
3944
3974
  workersOverride,
3975
+ yamlWorkers,
3945
3976
  progressReporter,
3946
3977
  seenEvalCases,
3947
3978
  displayIdTracker,
@@ -3963,7 +3994,7 @@ async function runSingleEvalFile(params) {
3963
3994
  }
3964
3995
  const agentTimeoutMs = options.agentTimeoutSeconds != null ? Math.max(0, options.agentTimeoutSeconds) * 1e3 : void 0;
3965
3996
  const workerPreference = workersOverride ?? options.workers;
3966
- let resolvedWorkers = workerPreference ?? resolvedTargetSelection.resolvedTarget.workers ?? DEFAULT_WORKERS;
3997
+ let resolvedWorkers = workerPreference ?? yamlWorkers ?? resolvedTargetSelection.resolvedTarget.workers ?? DEFAULT_WORKERS;
3967
3998
  if (resolvedWorkers < 1 || resolvedWorkers > 50) {
3968
3999
  throw new Error(`Workers must be between 1 and 50, got: ${resolvedWorkers}`);
3969
4000
  }
@@ -4078,7 +4109,7 @@ async function runEvalCommand(input) {
4078
4109
  );
4079
4110
  }
4080
4111
  const repoRoot = await findRepoRoot(cwd);
4081
- const yamlConfig = await loadConfig(path12.join(cwd, "_"), repoRoot);
4112
+ const yamlConfig = await loadConfig(path13.join(cwd, "_"), repoRoot);
4082
4113
  if (yamlConfig?.required_version) {
4083
4114
  await enforceRequiredVersion(yamlConfig.required_version, {
4084
4115
  strict: normalizeBoolean(input.rawOptions.strict)
@@ -4090,7 +4121,7 @@ async function runEvalCommand(input) {
4090
4121
  }
4091
4122
  let retryNonErrorResults;
4092
4123
  if (options.retryErrors) {
4093
- const retryPath = path12.resolve(options.retryErrors);
4124
+ const retryPath = path13.resolve(options.retryErrors);
4094
4125
  await ensureFileExists(retryPath, "Retry-errors JSONL file");
4095
4126
  const errorIds = await loadErrorTestIds(retryPath);
4096
4127
  if (errorIds.length === 0) {
@@ -4103,7 +4134,7 @@ async function runEvalCommand(input) {
4103
4134
  retryNonErrorResults = await loadNonErrorResults(retryPath);
4104
4135
  }
4105
4136
  if (options.workspacePath) {
4106
- const resolvedWorkspace = path12.resolve(options.workspacePath);
4137
+ const resolvedWorkspace = path13.resolve(options.workspacePath);
4107
4138
  try {
4108
4139
  const { stat: stat2 } = await import("node:fs/promises");
4109
4140
  const stats = await stat2(resolvedWorkspace);
@@ -4125,7 +4156,7 @@ async function runEvalCommand(input) {
4125
4156
  const useFileExport = !!(options.otelFile || options.traceFile);
4126
4157
  if (options.exportOtel || useFileExport) {
4127
4158
  try {
4128
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-PIOSPBKX.js");
4159
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-PUPHGVKL.js");
4129
4160
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
4130
4161
  let headers = {};
4131
4162
  if (options.otelBackend) {
@@ -4149,8 +4180,8 @@ async function runEvalCommand(input) {
4149
4180
  headers,
4150
4181
  captureContent,
4151
4182
  groupTurns: options.otelGroupTurns,
4152
- otlpFilePath: options.otelFile ? path12.resolve(options.otelFile) : void 0,
4153
- traceFilePath: options.traceFile ? path12.resolve(options.traceFile) : void 0
4183
+ otlpFilePath: options.otelFile ? path13.resolve(options.otelFile) : void 0,
4184
+ traceFilePath: options.traceFile ? path13.resolve(options.traceFile) : void 0
4154
4185
  });
4155
4186
  const initialized = await otelExporter.init();
4156
4187
  if (!initialized) {
@@ -4166,8 +4197,8 @@ async function runEvalCommand(input) {
4166
4197
  otelExporter = null;
4167
4198
  }
4168
4199
  }
4169
- const outputPath = options.outPath ? path12.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
4170
- const extraOutputPaths = options.outputPaths.map((p) => path12.resolve(p));
4200
+ const outputPath = options.outPath ? path13.resolve(options.outPath) : buildDefaultOutputPath(cwd, options.format);
4201
+ const extraOutputPaths = options.outputPaths.map((p) => path13.resolve(p));
4171
4202
  const allOutputPaths = extraOutputPaths.length > 0 ? [outputPath, ...extraOutputPaths] : [outputPath];
4172
4203
  const uniqueOutputPaths = [...new Set(allOutputPaths)];
4173
4204
  let outputWriter;
@@ -4181,12 +4212,12 @@ async function runEvalCommand(input) {
4181
4212
  console.log(` ${p}`);
4182
4213
  }
4183
4214
  }
4184
- const resolvedTestFiles = input.testFiles.map((file) => path12.resolve(file));
4215
+ const resolvedTestFiles = input.testFiles.map((file) => path13.resolve(file));
4185
4216
  if (options.otelFile) {
4186
- console.log(`OTLP JSON file: ${path12.resolve(options.otelFile)}`);
4217
+ console.log(`OTLP JSON file: ${path13.resolve(options.otelFile)}`);
4187
4218
  }
4188
4219
  if (options.traceFile) {
4189
- console.log(`Trace file: ${path12.resolve(options.traceFile)}`);
4220
+ console.log(`Trace file: ${path13.resolve(options.traceFile)}`);
4190
4221
  }
4191
4222
  const evaluationRunner = await resolveEvaluationRunner();
4192
4223
  const allResults = [];
@@ -4232,7 +4263,7 @@ async function runEvalCommand(input) {
4232
4263
  cliNoCache: options.noCache,
4233
4264
  yamlCache: yamlCacheEnabled
4234
4265
  });
4235
- const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path12.resolve(yamlCachePath) : void 0) : void 0;
4266
+ const cache = cacheEnabled ? new ResponseCache(yamlCachePath ? path13.resolve(yamlCachePath) : void 0) : void 0;
4236
4267
  const useCache = cacheEnabled;
4237
4268
  if (cacheEnabled) {
4238
4269
  console.log(`Response cache: enabled${yamlCachePath ? ` (${yamlCachePath})` : ""}`);
@@ -4329,6 +4360,7 @@ async function runEvalCommand(input) {
4329
4360
  cache,
4330
4361
  evaluationRunner,
4331
4362
  workersOverride: perFileWorkers,
4363
+ yamlWorkers: targetPrep.yamlWorkers,
4332
4364
  progressReporter,
4333
4365
  seenEvalCases,
4334
4366
  displayIdTracker,
@@ -4363,12 +4395,12 @@ async function runEvalCommand(input) {
4363
4395
  console.log(formatMatrixSummary(allResults));
4364
4396
  }
4365
4397
  if (options.benchmarkJson && allResults.length > 0) {
4366
- const benchmarkPath = path12.resolve(options.benchmarkJson);
4398
+ const benchmarkPath = path13.resolve(options.benchmarkJson);
4367
4399
  await writeBenchmarkJson(benchmarkPath, allResults);
4368
4400
  console.log(`Benchmark written to: ${benchmarkPath}`);
4369
4401
  }
4370
4402
  if (options.artifacts && allResults.length > 0) {
4371
- const artifactsDir = path12.resolve(options.artifacts);
4403
+ const artifactsDir = path13.resolve(options.artifacts);
4372
4404
  const evalFile = resolvedTestFiles.length === 1 ? resolvedTestFiles[0] : "";
4373
4405
  const {
4374
4406
  gradingDir,
@@ -4399,11 +4431,12 @@ Results written to: ${outputPath}`);
4399
4431
  console.log(` ${p}`);
4400
4432
  }
4401
4433
  }
4434
+ await saveRunCache(cwd, outputPath).catch(() => void 0);
4402
4435
  }
4403
4436
  if (summary.executionErrorCount > 0 && !options.retryErrors) {
4404
- const evalFileArgs = resolvedTestFiles.map((f) => path12.relative(cwd, f)).join(" ");
4437
+ const evalFileArgs = resolvedTestFiles.map((f) => path13.relative(cwd, f)).join(" ");
4405
4438
  const targetFlag = options.target ? ` --target ${options.target}` : "";
4406
- const relativeOutputPath = path12.relative(cwd, outputPath);
4439
+ const relativeOutputPath = path13.relative(cwd, outputPath);
4407
4440
  console.log(
4408
4441
  `
4409
4442
  Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed tests with:
@@ -4435,7 +4468,7 @@ async function resolveEvaluationRunner() {
4435
4468
  if (!overridePath) {
4436
4469
  return runEvaluation;
4437
4470
  }
4438
- const resolved = path12.isAbsolute(overridePath) ? overridePath : path12.resolve(process.cwd(), overridePath);
4471
+ const resolved = path13.isAbsolute(overridePath) ? overridePath : path13.resolve(process.cwd(), overridePath);
4439
4472
  const moduleUrl = pathToFileURL(resolved).href;
4440
4473
  const mod = await import(moduleUrl);
4441
4474
  const candidate = mod.runEvaluation;
@@ -4457,6 +4490,7 @@ export {
4457
4490
  buildTimingArtifact,
4458
4491
  buildBenchmarkArtifact,
4459
4492
  parseJsonlResults,
4493
+ loadRunCache,
4460
4494
  detectFileType,
4461
4495
  validateEvalFile,
4462
4496
  validateTargetsFile,
@@ -4467,4 +4501,4 @@ export {
4467
4501
  selectTarget,
4468
4502
  runEvalCommand
4469
4503
  };
4470
- //# sourceMappingURL=chunk-GC5P5HHZ.js.map
4504
+ //# sourceMappingURL=chunk-JGMJL2LV.js.map