agentv 4.32.0-next.1 → 4.34.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +4 -5
  2. package/dist/{artifact-writer-VDF7KRWL.js → artifact-writer-UWZX5JKX.js} +4 -4
  3. package/dist/{chunk-TAZBCVEZ.js → chunk-6QEIZ33V.js} +1340 -279
  4. package/dist/chunk-6QEIZ33V.js.map +1 -0
  5. package/dist/{chunk-IGTRNQAM.js → chunk-FK5FLLME.js} +2383 -674
  6. package/dist/chunk-FK5FLLME.js.map +1 -0
  7. package/dist/chunk-GPRZ7XSC.js +1234 -0
  8. package/dist/chunk-GPRZ7XSC.js.map +1 -0
  9. package/dist/{chunk-5JMFFG36.js → chunk-KMO527KH.js} +784 -1081
  10. package/dist/chunk-KMO527KH.js.map +1 -0
  11. package/dist/{chunk-LX5AK3P7.js → chunk-KP4SPQ2M.js} +585 -191
  12. package/dist/chunk-KP4SPQ2M.js.map +1 -0
  13. package/dist/cli.js +5 -5
  14. package/dist/dashboard/assets/{index-BdoQWnyM.js → index-79OddHgT.js} +1 -1
  15. package/dist/dashboard/assets/index-BPMAZqjE.css +1 -0
  16. package/dist/dashboard/assets/index-BycNIWwy.js +118 -0
  17. package/dist/dashboard/index.html +3 -3
  18. package/dist/{dist-GICSKMNP.js → dist-Z5VWSDOO.js} +58 -6
  19. package/dist/index.js +5 -5
  20. package/dist/{interactive-GIDBBDYZ.js → interactive-NTT2QLPR.js} +5 -5
  21. package/dist/skills/agentv-eval-writer/SKILL.md +2 -1
  22. package/dist/skills/agentv-eval-writer/references/eval-schema.json +104 -0
  23. package/dist/skills/agentv-eval-writer/references/rubric-evaluator.md +20 -0
  24. package/dist/{ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js → ts-eval-loader-EQJX3OLT-THE7D3GR.js} +2 -2
  25. package/package.json +2 -2
  26. package/dist/chunk-2ZEY3WBH.js +0 -729
  27. package/dist/chunk-2ZEY3WBH.js.map +0 -1
  28. package/dist/chunk-5JMFFG36.js.map +0 -1
  29. package/dist/chunk-IGTRNQAM.js.map +0 -1
  30. package/dist/chunk-LX5AK3P7.js.map +0 -1
  31. package/dist/chunk-TAZBCVEZ.js.map +0 -1
  32. package/dist/dashboard/assets/index-DcPH8PyS.css +0 -1
  33. package/dist/dashboard/assets/index-EXkiwqam.js +0 -116
  34. /package/dist/{artifact-writer-VDF7KRWL.js.map → artifact-writer-UWZX5JKX.js.map} +0 -0
  35. /package/dist/{dist-GICSKMNP.js.map → dist-Z5VWSDOO.js.map} +0 -0
  36. /package/dist/{interactive-GIDBBDYZ.js.map → interactive-NTT2QLPR.js.map} +0 -0
  37. /package/dist/{ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js.map → ts-eval-loader-EQJX3OLT-THE7D3GR.js.map} +0 -0
@@ -4056,16 +4056,18 @@ var coerce = {
4056
4056
  };
4057
4057
  var NEVER = INVALID;
4058
4058
 
4059
- // ../../packages/core/dist/chunk-5RQMJZDJ.js
4059
+ // ../../packages/core/dist/chunk-EW5X2RGJ.js
4060
4060
  import { parse } from "yaml";
4061
+ import os from "node:os";
4062
+ import path from "node:path";
4061
4063
  import { constants } from "node:fs";
4062
4064
  import { access, readFile } from "node:fs/promises";
4063
- import path from "node:path";
4065
+ import path2 from "node:path";
4064
4066
  import { existsSync, readFileSync } from "node:fs";
4065
4067
  import { homedir } from "node:os";
4066
- import path2 from "node:path";
4067
- import { readFile as readFile2, readdir, stat } from "node:fs/promises";
4068
4068
  import path3 from "node:path";
4069
+ import { readFile as readFile2, readdir, stat } from "node:fs/promises";
4070
+ import path4 from "node:path";
4069
4071
  import fg from "fast-glob";
4070
4072
  var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
4071
4073
  function isContent(value) {
@@ -4164,10 +4166,37 @@ var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
4164
4166
  function isGraderKind(value) {
4165
4167
  return typeof value === "string" && GRADER_KIND_SET.has(value);
4166
4168
  }
4169
+ var RUBRIC_OPERATOR_VALUES = ["correctness", "contradiction"];
4167
4170
  var PARSE_OPTIONS = { merge: true };
4168
4171
  function parseYamlValue(content) {
4169
4172
  return parse(content, PARSE_OPTIONS);
4170
4173
  }
4174
+ function readEnvPath(name) {
4175
+ const value = process.env[name];
4176
+ if (!value || value === "undefined") return void 0;
4177
+ return value;
4178
+ }
4179
+ function getAgentvConfigDir() {
4180
+ return readEnvPath("AGENTV_HOME") ?? path.join(os.homedir(), ".agentv");
4181
+ }
4182
+ function getAgentvHome() {
4183
+ return getAgentvConfigDir();
4184
+ }
4185
+ function getAgentvDataDir() {
4186
+ return readEnvPath("AGENTV_DATA_DIR") ?? getAgentvConfigDir();
4187
+ }
4188
+ function getWorkspacesRoot() {
4189
+ return path.join(getAgentvDataDir(), "workspaces");
4190
+ }
4191
+ function getSubagentsRoot() {
4192
+ return path.join(getAgentvDataDir(), "subagents");
4193
+ }
4194
+ function getTraceStateRoot() {
4195
+ return path.join(getAgentvDataDir(), "trace-state");
4196
+ }
4197
+ function getWorkspacePoolRoot() {
4198
+ return path.join(getAgentvDataDir(), "workspace-pool");
4199
+ }
4171
4200
  async function fileExists(filePath) {
4172
4201
  try {
4173
4202
  await access(filePath, constants.F_OK);
@@ -4188,14 +4217,14 @@ async function readJsonFile(filePath) {
4188
4217
  return JSON.parse(content);
4189
4218
  }
4190
4219
  async function findGitRoot(startPath) {
4191
- let currentDir = path.dirname(path.resolve(startPath));
4192
- const root = path.parse(currentDir).root;
4220
+ let currentDir = path2.dirname(path2.resolve(startPath));
4221
+ const root = path2.parse(currentDir).root;
4193
4222
  while (currentDir !== root) {
4194
- const gitPath = path.join(currentDir, ".git");
4223
+ const gitPath = path2.join(currentDir, ".git");
4195
4224
  if (await fileExists(gitPath)) {
4196
4225
  return currentDir;
4197
4226
  }
4198
- const parentDir = path.dirname(currentDir);
4227
+ const parentDir = path2.dirname(currentDir);
4199
4228
  if (parentDir === currentDir) {
4200
4229
  break;
4201
4230
  }
@@ -4206,8 +4235,8 @@ async function findGitRoot(startPath) {
4206
4235
  function buildDirectoryChain(filePath, repoRoot) {
4207
4236
  const directories = [];
4208
4237
  const seen = /* @__PURE__ */ new Set();
4209
- const boundary = path.resolve(repoRoot);
4210
- let current = path.resolve(path.dirname(filePath));
4238
+ const boundary = path2.resolve(repoRoot);
4239
+ let current = path2.resolve(path2.dirname(filePath));
4211
4240
  while (current !== void 0) {
4212
4241
  if (!seen.has(current)) {
4213
4242
  directories.push(current);
@@ -4216,7 +4245,7 @@ function buildDirectoryChain(filePath, repoRoot) {
4216
4245
  if (current === boundary) {
4217
4246
  break;
4218
4247
  }
4219
- const parent = path.dirname(current);
4248
+ const parent = path2.dirname(current);
4220
4249
  if (parent === current) {
4221
4250
  break;
4222
4251
  }
@@ -4230,16 +4259,16 @@ function buildDirectoryChain(filePath, repoRoot) {
4230
4259
  function buildSearchRoots(evalPath, repoRoot) {
4231
4260
  const uniqueRoots = [];
4232
4261
  const addRoot = (root) => {
4233
- const normalized = path.resolve(root);
4262
+ const normalized = path2.resolve(root);
4234
4263
  if (!uniqueRoots.includes(normalized)) {
4235
4264
  uniqueRoots.push(normalized);
4236
4265
  }
4237
4266
  };
4238
- let currentDir = path.dirname(evalPath);
4267
+ let currentDir = path2.dirname(evalPath);
4239
4268
  let reachedBoundary = false;
4240
4269
  while (!reachedBoundary) {
4241
4270
  addRoot(currentDir);
4242
- const parentDir = path.dirname(currentDir);
4271
+ const parentDir = path2.dirname(currentDir);
4243
4272
  if (currentDir === repoRoot || parentDir === currentDir) {
4244
4273
  reachedBoundary = true;
4245
4274
  } else {
@@ -4257,16 +4286,16 @@ function trimLeadingSeparators(value) {
4257
4286
  async function resolveFileReference(rawValue, searchRoots) {
4258
4287
  const displayPath = trimLeadingSeparators(rawValue);
4259
4288
  const potentialPaths = [];
4260
- if (path.isAbsolute(rawValue)) {
4261
- potentialPaths.push(path.normalize(rawValue));
4289
+ if (path2.isAbsolute(rawValue)) {
4290
+ potentialPaths.push(path2.normalize(rawValue));
4262
4291
  }
4263
4292
  for (const base of searchRoots) {
4264
- potentialPaths.push(path.resolve(base, displayPath));
4293
+ potentialPaths.push(path2.resolve(base, displayPath));
4265
4294
  }
4266
4295
  const attempted = [];
4267
4296
  const seen = /* @__PURE__ */ new Set();
4268
4297
  for (const candidate of potentialPaths) {
4269
- const absoluteCandidate = path.resolve(candidate);
4298
+ const absoluteCandidate = path2.resolve(candidate);
4270
4299
  if (seen.has(absoluteCandidate)) {
4271
4300
  continue;
4272
4301
  }
@@ -4448,11 +4477,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
4448
4477
  allowLiteral: true,
4449
4478
  optionalEnv: true
4450
4479
  });
4451
- if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
4452
- cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
4480
+ if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
4481
+ cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
4453
4482
  }
4454
4483
  if (!cwd && evalFilePath) {
4455
- cwd = path2.dirname(path2.resolve(evalFilePath));
4484
+ cwd = path3.dirname(path3.resolve(evalFilePath));
4456
4485
  }
4457
4486
  return {
4458
4487
  command,
@@ -4469,11 +4498,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
4469
4498
  allowLiteral: true,
4470
4499
  optionalEnv: true
4471
4500
  });
4472
- if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
4473
- cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
4501
+ if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
4502
+ cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
4474
4503
  }
4475
4504
  if (!cwd && evalFilePath) {
4476
- cwd = path2.dirname(path2.resolve(evalFilePath));
4505
+ cwd = path3.dirname(path3.resolve(evalFilePath));
4477
4506
  }
4478
4507
  const timeoutSeconds = input.timeout_seconds;
4479
4508
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -4531,7 +4560,15 @@ var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
4531
4560
  ["retryInitialDelayMs", "retry_initial_delay_ms"],
4532
4561
  ["retryMaxDelayMs", "retry_max_delay_ms"],
4533
4562
  ["retryBackoffFactor", "retry_backoff_factor"],
4534
- ["retryStatusCodes", "retry_status_codes"]
4563
+ ["retryStatusCodes", "retry_status_codes"],
4564
+ ["modelReasoningEffort", "model_reasoning_effort"]
4565
+ ]);
4566
+ var CODEX_MODEL_REASONING_EFFORT_VALUES = /* @__PURE__ */ new Set([
4567
+ "minimal",
4568
+ "low",
4569
+ "medium",
4570
+ "high",
4571
+ "xhigh"
4535
4572
  ]);
4536
4573
  var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
4537
4574
  ["timeoutSeconds", "timeout_seconds"]
@@ -4869,6 +4906,9 @@ function normalizeOpenAIBaseUrl(value) {
4869
4906
  if (trimmed.length === 0) {
4870
4907
  return DEFAULT_OPENAI_BASE_URL;
4871
4908
  }
4909
+ if (/\.openai\.azure\.com\/openai\/deployments\/[^/]+$/i.test(trimmed)) {
4910
+ return trimmed;
4911
+ }
4872
4912
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
4873
4913
  }
4874
4914
  function resolveAzureConfig(target, env) {
@@ -4997,22 +5037,34 @@ function resolveGeminiConfig(target, env) {
4997
5037
  }
4998
5038
  function resolveCodexConfig(target, env, _evalFilePath) {
4999
5039
  const modelSource = target.model;
5040
+ const modelReasoningEffortSource = target.model_reasoning_effort;
5000
5041
  const executableSource = target.executable ?? target.command ?? target.binary;
5001
5042
  const argsSource = target.args ?? target.arguments;
5002
5043
  const cwdSource = target.cwd;
5003
5044
  const timeoutSource = target.timeout_seconds;
5004
5045
  const logDirSource = target.log_dir ?? target.log_directory;
5005
- const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
5006
5046
  const systemPromptSource = target.system_prompt;
5007
- const streamLogResult = resolveStreamLog(target, env.AGENTV_CODEX_LOG_FORMAT);
5008
- if (streamLogResult.deprecationWarning) {
5009
- process.stderr.write(`[agentv] \u26A0 ${streamLogResult.deprecationWarning}
5010
- `);
5047
+ if (target.log_format !== void 0 || target.log_output_format !== void 0) {
5048
+ throw new Error(
5049
+ `${target.name}: log_format is no longer supported for codex targets. Use stream_log instead.`
5050
+ );
5011
5051
  }
5052
+ const streamLogResult = resolveStreamLog({ name: target.name, stream_log: target.stream_log });
5012
5053
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
5013
5054
  allowLiteral: true,
5014
5055
  optionalEnv: true
5015
5056
  });
5057
+ const modelReasoningEffort = normalizeCodexModelReasoningEffort(
5058
+ resolveOptionalString(
5059
+ modelReasoningEffortSource,
5060
+ env,
5061
+ `${target.name} codex model reasoning effort`,
5062
+ {
5063
+ allowLiteral: true,
5064
+ optionalEnv: true
5065
+ }
5066
+ )
5067
+ );
5016
5068
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
5017
5069
  allowLiteral: true,
5018
5070
  optionalEnv: true
@@ -5027,32 +5079,30 @@ function resolveCodexConfig(target, env, _evalFilePath) {
5027
5079
  allowLiteral: true,
5028
5080
  optionalEnv: true
5029
5081
  });
5030
- const logFormat = normalizeCodexLogFormat(logFormatSource);
5031
5082
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
5032
5083
  return {
5033
5084
  model,
5085
+ modelReasoningEffort,
5034
5086
  executable,
5035
5087
  args,
5036
5088
  cwd,
5037
5089
  timeoutMs,
5038
5090
  logDir,
5039
- logFormat,
5040
5091
  streamLog: streamLogResult.streamLog,
5041
5092
  systemPrompt
5042
5093
  };
5043
5094
  }
5044
- function normalizeCodexLogFormat(value) {
5045
- if (value === void 0 || value === null) {
5095
+ function normalizeCodexModelReasoningEffort(value) {
5096
+ if (value === void 0) {
5046
5097
  return void 0;
5047
5098
  }
5048
- if (typeof value !== "string") {
5049
- throw new Error("codex log format must be 'summary' or 'json'");
5050
- }
5051
5099
  const normalized = value.trim().toLowerCase();
5052
- if (normalized === "json" || normalized === "summary") {
5100
+ if (CODEX_MODEL_REASONING_EFFORT_VALUES.has(normalized)) {
5053
5101
  return normalized;
5054
5102
  }
5055
- throw new Error("codex log format must be 'summary' or 'json'");
5103
+ throw new Error(
5104
+ `codex model_reasoning_effort must be one of: ${[...CODEX_MODEL_REASONING_EFFORT_VALUES].join(", ")}`
5105
+ );
5056
5106
  }
5057
5107
  function resolveStreamLog(target, envFallback) {
5058
5108
  if (target.stream_log !== void 0 && target.stream_log !== null) {
@@ -5461,7 +5511,7 @@ function resolveClaudeConfig(target, env, _evalFilePath) {
5461
5511
  };
5462
5512
  }
5463
5513
  function resolveCcMirrorBinaryPath(variant) {
5464
- const variantJsonPath = path2.join(homedir(), ".cc-mirror", variant, "variant.json");
5514
+ const variantJsonPath = path3.join(homedir(), ".cc-mirror", variant, "variant.json");
5465
5515
  if (!existsSync(variantJsonPath)) {
5466
5516
  throw new Error(
5467
5517
  `cc-mirror variant "${variant}": ${variantJsonPath} not found. Install the variant or set "executable" explicitly.`
@@ -5538,8 +5588,8 @@ function resolveCliConfig(target, env, evalFilePath) {
5538
5588
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
5539
5589
  if (!parseResult.success) {
5540
5590
  const firstError = parseResult.error.errors[0];
5541
- const path47 = firstError?.path.join(".") || "";
5542
- const prefix = path47 ? `${target.name} ${path47}: ` : `${target.name}: `;
5591
+ const path53 = firstError?.path.join(".") || "";
5592
+ const prefix = path53 ? `${target.name} ${path53}: ` : `${target.name}: `;
5543
5593
  throw new Error(`${prefix}${firstError?.message}`);
5544
5594
  }
5545
5595
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -5560,11 +5610,11 @@ function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath
5560
5610
  allowLiteral: true,
5561
5611
  optionalEnv: true
5562
5612
  });
5563
- if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
5564
- cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
5613
+ if (cwd && evalFilePath && !path3.isAbsolute(cwd)) {
5614
+ cwd = path3.resolve(path3.dirname(path3.resolve(evalFilePath)), cwd);
5565
5615
  }
5566
5616
  if (!cwd && evalFilePath) {
5567
- cwd = path2.dirname(path2.resolve(evalFilePath));
5617
+ cwd = path3.dirname(path3.resolve(evalFilePath));
5568
5618
  }
5569
5619
  return {
5570
5620
  command,
@@ -5918,7 +5968,7 @@ function parseJsonlCases(content, filePath) {
5918
5968
  return results;
5919
5969
  }
5920
5970
  async function loadCasesFromFile(filePath) {
5921
- const ext = path3.extname(filePath).toLowerCase();
5971
+ const ext = path4.extname(filePath).toLowerCase();
5922
5972
  let content;
5923
5973
  try {
5924
5974
  content = await readFile2(filePath, "utf8");
@@ -5945,7 +5995,7 @@ async function loadCasesFromFile(filePath) {
5945
5995
  }
5946
5996
  async function resolveFileReference2(ref, evalFileDir) {
5947
5997
  const rawPath = extractFilePath(ref);
5948
- const absolutePattern = path3.resolve(evalFileDir, rawPath);
5998
+ const absolutePattern = path4.resolve(evalFileDir, rawPath);
5949
5999
  if (isGlobPattern(rawPath)) {
5950
6000
  const matches = await fg(absolutePattern.replaceAll("\\", "/"), {
5951
6001
  onlyFiles: true,
@@ -5972,10 +6022,10 @@ async function loadCasesFromDirectory(dirPath) {
5972
6022
  const subdirs = entries.filter((e) => e.isDirectory()).sort((a, b) => a.name < b.name ? -1 : a.name > b.name ? 1 : 0);
5973
6023
  const results = [];
5974
6024
  for (const subdir of subdirs) {
5975
- const subdirPath = path3.join(dirPath, subdir.name);
6025
+ const subdirPath = path4.join(dirPath, subdir.name);
5976
6026
  let caseFilePath;
5977
6027
  for (const filename of ["case.yaml", "case.yml"]) {
5978
- const candidate = path3.join(subdirPath, filename);
6028
+ const candidate = path4.join(subdirPath, filename);
5979
6029
  try {
5980
6030
  const s = await stat(candidate);
5981
6031
  if (s.isFile()) {
@@ -6011,7 +6061,7 @@ async function loadCasesFromDirectory(dirPath) {
6011
6061
  caseObj.id = subdir.name;
6012
6062
  }
6013
6063
  if (!caseObj.workspace) {
6014
- const workspaceDirPath = path3.join(subdirPath, "workspace");
6064
+ const workspaceDirPath = path4.join(subdirPath, "workspace");
6015
6065
  try {
6016
6066
  const s = await stat(workspaceDirPath);
6017
6067
  if (s.isDirectory()) {
@@ -6037,40 +6087,40 @@ async function expandFileReferences(tests, evalFileDir) {
6037
6087
  return expanded;
6038
6088
  }
6039
6089
 
6040
- // ../../packages/core/dist/chunk-N5EU446L.js
6090
+ // ../../packages/core/dist/chunk-7QB53OPK.js
6041
6091
  import path46 from "node:path";
6042
6092
  import { pathToFileURL as pathToFileURL2 } from "node:url";
6043
6093
  import { existsSync as existsSync6 } from "node:fs";
6044
6094
  import path45 from "node:path";
6045
6095
  import micromatch4 from "micromatch";
6096
+ import { mkdir, readFile as readFile3, writeFile } from "node:fs/promises";
6097
+ import path5 from "node:path";
6046
6098
  import { execFile as execFile3 } from "node:child_process";
6047
6099
  import { createHash as createHash2, randomUUID as randomUUID9 } from "node:crypto";
6048
6100
  import { existsSync as existsSync5 } from "node:fs";
6049
- import { copyFile as copyFile2, mkdir as mkdir14, readdir as readdir8, stat as stat9 } from "node:fs/promises";
6101
+ import { copyFile as copyFile2, mkdir as mkdir15, readdir as readdir8, stat as stat9 } from "node:fs/promises";
6050
6102
  import path44 from "node:path";
6051
6103
  import { promisify as promisify7 } from "node:util";
6052
6104
  import micromatch3 from "micromatch";
6053
- import os from "node:os";
6054
- import path4 from "node:path";
6055
- import { mkdtemp, rm, writeFile } from "node:fs/promises";
6105
+ import { mkdtemp, rm, writeFile as writeFile2 } from "node:fs/promises";
6056
6106
  import { tmpdir } from "node:os";
6057
6107
  import { dirname, join } from "node:path";
6058
6108
  import { randomBytes } from "node:crypto";
6059
6109
  import { createServer } from "node:http";
6060
6110
  import fs from "node:fs/promises";
6061
6111
  import path32 from "node:path";
6062
- import { readFile as readFile3 } from "node:fs/promises";
6112
+ import { readFile as readFile22 } from "node:fs/promises";
6063
6113
  import path22 from "node:path";
6064
6114
  import { fileURLToPath } from "node:url";
6065
6115
  import { spawn } from "node:child_process";
6066
6116
  import { randomUUID } from "node:crypto";
6067
6117
  import { createWriteStream } from "node:fs";
6068
- import { mkdir } from "node:fs/promises";
6069
- import path5 from "node:path";
6118
+ import { mkdir as mkdir2 } from "node:fs/promises";
6119
+ import path52 from "node:path";
6070
6120
  import path42 from "node:path";
6071
6121
  import { randomUUID as randomUUID2 } from "node:crypto";
6072
6122
  import { createWriteStream as createWriteStream2 } from "node:fs";
6073
- import { mkdir as mkdir2 } from "node:fs/promises";
6123
+ import { mkdir as mkdir3 } from "node:fs/promises";
6074
6124
  import path6 from "node:path";
6075
6125
  import { exec as execWithCallback } from "node:child_process";
6076
6126
  import fs2 from "node:fs/promises";
@@ -6079,10 +6129,10 @@ import path7 from "node:path";
6079
6129
  import { promisify } from "node:util";
6080
6130
  import { randomUUID as randomUUID3 } from "node:crypto";
6081
6131
  import { createWriteStream as createWriteStream3 } from "node:fs";
6082
- import { mkdir as mkdir3 } from "node:fs/promises";
6132
+ import { mkdir as mkdir4 } from "node:fs/promises";
6083
6133
  import path8 from "node:path";
6084
6134
  import { randomUUID as randomUUID5 } from "node:crypto";
6085
- import { mkdir as mkdir4 } from "node:fs/promises";
6135
+ import { mkdir as mkdir5 } from "node:fs/promises";
6086
6136
  import { homedir as homedir2 } from "node:os";
6087
6137
  import path11 from "node:path";
6088
6138
  import { Readable, Writable } from "node:stream";
@@ -18704,10 +18754,10 @@ var RequestError = class _RequestError extends Error {
18704
18754
  }
18705
18755
  };
18706
18756
 
18707
- // ../../packages/core/dist/chunk-N5EU446L.js
18757
+ // ../../packages/core/dist/chunk-7QB53OPK.js
18708
18758
  import { exec as execCallback } from "node:child_process";
18709
18759
  import { readdirSync, statSync } from "node:fs";
18710
- import { readFile as readFile22, readdir as readdir2, stat as stat2 } from "node:fs/promises";
18760
+ import { readFile as readFile32, readdir as readdir2, stat as stat2 } from "node:fs/promises";
18711
18761
  import path9 from "node:path";
18712
18762
  import { promisify as promisify2 } from "node:util";
18713
18763
  import { randomUUID as randomUUID4 } from "node:crypto";
@@ -18715,26 +18765,26 @@ import { createWriteStream as createWriteStream4, existsSync as existsSync2, rea
18715
18765
  import { arch, homedir as homedir3, platform } from "node:os";
18716
18766
  import path10 from "node:path";
18717
18767
  import { fileURLToPath as fileURLToPath2 } from "node:url";
18718
- import { readFile as readFile4 } from "node:fs/promises";
18768
+ import { readFile as readFile5 } from "node:fs/promises";
18719
18769
  import { homedir as homedir4 } from "node:os";
18720
18770
  import path13 from "node:path";
18721
- import { readFile as readFile32, readdir as readdir22, stat as stat22 } from "node:fs/promises";
18771
+ import { readFile as readFile4, readdir as readdir22, stat as stat22 } from "node:fs/promises";
18722
18772
  import { homedir as homedir32 } from "node:os";
18723
18773
  import path12 from "node:path";
18724
18774
  import { randomUUID as randomUUID6 } from "node:crypto";
18725
18775
  import { existsSync as existsSync22 } from "node:fs";
18726
- import { mkdir as mkdir5 } from "node:fs/promises";
18776
+ import { mkdir as mkdir6 } from "node:fs/promises";
18727
18777
  import path14 from "node:path";
18728
18778
  import { execSync, spawn as spawn3 } from "node:child_process";
18729
18779
  import { randomUUID as randomUUID7 } from "node:crypto";
18730
18780
  import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
18731
- import { mkdir as mkdir6, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
18781
+ import { mkdir as mkdir7, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile3 } from "node:fs/promises";
18732
18782
  import { tmpdir as tmpdir2 } from "node:os";
18733
18783
  import path15 from "node:path";
18734
18784
  import { execSync as execSync2 } from "node:child_process";
18735
18785
  import { randomUUID as randomUUID8 } from "node:crypto";
18736
18786
  import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
18737
- import { mkdir as mkdir7 } from "node:fs/promises";
18787
+ import { mkdir as mkdir8 } from "node:fs/promises";
18738
18788
  import path16 from "node:path";
18739
18789
  import { createInterface } from "node:readline";
18740
18790
  import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
@@ -18742,28 +18792,28 @@ import { exec as exec2 } from "node:child_process";
18742
18792
  import { constants as constants2, access as access2 } from "node:fs/promises";
18743
18793
  import path27 from "node:path";
18744
18794
  import { promisify as promisify4 } from "node:util";
18745
- import { stat as stat5, writeFile as writeFile5 } from "node:fs/promises";
18795
+ import { stat as stat5, writeFile as writeFile6 } from "node:fs/promises";
18746
18796
  import path25 from "node:path";
18747
18797
  import { constants as constants3 } from "node:fs";
18748
- import { access as access3, mkdir as mkdir8, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
18798
+ import { access as access3, mkdir as mkdir9, readdir as readdir3, rm as rm3, stat as stat3 } from "node:fs/promises";
18749
18799
  import path17 from "node:path";
18750
18800
  import path18 from "node:path";
18751
18801
  import path19 from "node:path";
18752
- import { readFile as readFile5 } from "node:fs/promises";
18802
+ import { readFile as readFile6 } from "node:fs/promises";
18753
18803
  import path20 from "node:path";
18754
18804
  import { exec, spawn as spawn4 } from "node:child_process";
18755
- import { mkdir as mkdir9, writeFile as writeFile3 } from "node:fs/promises";
18805
+ import { mkdir as mkdir10, writeFile as writeFile4 } from "node:fs/promises";
18756
18806
  import path222 from "node:path";
18757
18807
  import { promisify as promisify3 } from "node:util";
18758
18808
  import path21 from "node:path";
18759
- import { copyFile, mkdir as mkdir10, readFile as readFile6, readdir as readdir4, stat as stat4, writeFile as writeFile4 } from "node:fs/promises";
18809
+ import { copyFile, mkdir as mkdir11, readFile as readFile7, readdir as readdir4, stat as stat4, writeFile as writeFile5 } from "node:fs/promises";
18760
18810
  import path24 from "node:path";
18761
18811
  import path23 from "node:path";
18762
18812
  import JSON5 from "json5";
18763
- import { writeFile as writeFile6 } from "node:fs/promises";
18813
+ import { writeFile as writeFile7 } from "node:fs/promises";
18764
18814
  import path26 from "node:path";
18765
18815
  import { constants as constants32 } from "node:fs";
18766
- import { access as access32, readFile as readFile7 } from "node:fs/promises";
18816
+ import { access as access32, readFile as readFile8 } from "node:fs/promises";
18767
18817
  import path28 from "node:path";
18768
18818
  import path29 from "node:path";
18769
18819
  import fg2 from "fast-glob";
@@ -18772,12 +18822,12 @@ import path31 from "node:path";
18772
18822
  import fg22 from "fast-glob";
18773
18823
  import path322 from "node:path";
18774
18824
  import fg3 from "fast-glob";
18775
- import { cp, mkdir as mkdir12, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
18825
+ import { cp, mkdir as mkdir13, readdir as readdir5, rm as rm4, stat as stat6 } from "node:fs/promises";
18776
18826
  import path33 from "node:path";
18777
18827
  import { execFile } from "node:child_process";
18778
18828
  import { createHash } from "node:crypto";
18779
18829
  import { existsSync as existsSync3 } from "node:fs";
18780
- import { cp as cp2, mkdir as mkdir13, readFile as readFile8, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
18830
+ import { cp as cp2, mkdir as mkdir14, readFile as readFile9, readdir as readdir6, rm as rm5, unlink, writeFile as writeFile8 } from "node:fs/promises";
18781
18831
  import path34 from "node:path";
18782
18832
  import { promisify as promisify5 } from "node:util";
18783
18833
  import { execFile as execFile2 } from "node:child_process";
@@ -18786,25 +18836,65 @@ import path35 from "node:path";
18786
18836
  import { promisify as promisify6 } from "node:util";
18787
18837
  import { readdir as readdir7, stat as stat7 } from "node:fs/promises";
18788
18838
  import path36 from "node:path";
18789
- import { readFile as readFile15, stat as stat8 } from "node:fs/promises";
18839
+ import { readFile as readFile16, stat as stat8 } from "node:fs/promises";
18790
18840
  import path43 from "node:path";
18791
18841
  import micromatch2 from "micromatch";
18792
- import { readFile as readFile9 } from "node:fs/promises";
18793
- import path37 from "node:path";
18842
+ import { stringify as stringifyYaml } from "yaml";
18794
18843
  import { readFile as readFile10 } from "node:fs/promises";
18844
+ import path37 from "node:path";
18845
+ import { readFile as readFile11 } from "node:fs/promises";
18795
18846
  import path39 from "node:path";
18796
18847
  import { constants as constants4 } from "node:fs";
18797
18848
  import { access as access4 } from "node:fs/promises";
18798
18849
  import path38 from "node:path";
18799
18850
  import { fileURLToPath as fileURLToPath4 } from "node:url";
18800
- import { readFile as readFile12 } from "node:fs/promises";
18851
+ import { readFile as readFile13 } from "node:fs/promises";
18801
18852
  import path40 from "node:path";
18802
- import { readFile as readFile11 } from "node:fs/promises";
18803
- import { readFile as readFile14 } from "node:fs/promises";
18853
+ import { readFile as readFile12 } from "node:fs/promises";
18854
+ import { readFile as readFile15 } from "node:fs/promises";
18804
18855
  import path422 from "node:path";
18805
18856
  import micromatch from "micromatch";
18806
- import { readFile as readFile13 } from "node:fs/promises";
18857
+ import { readFile as readFile14 } from "node:fs/promises";
18807
18858
  import path41 from "node:path";
18859
+ var DEFAULT_CACHE_PATH = ".agentv/cache";
18860
+ var ResponseCache = class {
18861
+ cachePath;
18862
+ constructor(cachePath) {
18863
+ this.cachePath = cachePath ?? DEFAULT_CACHE_PATH;
18864
+ }
18865
+ async get(key) {
18866
+ const filePath = this.keyToPath(key);
18867
+ try {
18868
+ const data = await readFile3(filePath, "utf8");
18869
+ return JSON.parse(data);
18870
+ } catch {
18871
+ return void 0;
18872
+ }
18873
+ }
18874
+ async set(key, value) {
18875
+ const filePath = this.keyToPath(key);
18876
+ const dir = path5.dirname(filePath);
18877
+ await mkdir(dir, { recursive: true });
18878
+ await writeFile(filePath, JSON.stringify(value, null, 2), "utf8");
18879
+ }
18880
+ keyToPath(key) {
18881
+ const prefix = key.slice(0, 2);
18882
+ return path5.join(this.cachePath, prefix, `${key}.json`);
18883
+ }
18884
+ };
18885
+ function shouldEnableCache(params) {
18886
+ if (params.cliNoCache) return false;
18887
+ if (params.cliCache) return true;
18888
+ if (params.yamlCache !== void 0) return params.yamlCache;
18889
+ return params.tsConfigCache === true;
18890
+ }
18891
+ function shouldSkipCacheForTemperature(targetConfig) {
18892
+ const temp = targetConfig.temperature;
18893
+ if (typeof temp === "number" && temp > 0) {
18894
+ return true;
18895
+ }
18896
+ return false;
18897
+ }
18808
18898
  var DEFAULT_THRESHOLD = 0.8;
18809
18899
  var PASS_THRESHOLD = DEFAULT_THRESHOLD;
18810
18900
  function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
@@ -19026,32 +19116,6 @@ function validateConcurrency(concurrency) {
19026
19116
  throw new TypeError("Expected `concurrency` to be a number from 1 and up");
19027
19117
  }
19028
19118
  }
19029
- function readEnvPath(name) {
19030
- const value = process.env[name];
19031
- if (!value || value === "undefined") return void 0;
19032
- return value;
19033
- }
19034
- function getAgentvConfigDir() {
19035
- return readEnvPath("AGENTV_HOME") ?? path4.join(os.homedir(), ".agentv");
19036
- }
19037
- function getAgentvHome() {
19038
- return getAgentvConfigDir();
19039
- }
19040
- function getAgentvDataDir() {
19041
- return readEnvPath("AGENTV_DATA_DIR") ?? getAgentvConfigDir();
19042
- }
19043
- function getWorkspacesRoot() {
19044
- return path4.join(getAgentvDataDir(), "workspaces");
19045
- }
19046
- function getSubagentsRoot() {
19047
- return path4.join(getAgentvDataDir(), "subagents");
19048
- }
19049
- function getTraceStateRoot() {
19050
- return path4.join(getAgentvDataDir(), "trace-state");
19051
- }
19052
- function getWorkspacePoolRoot() {
19053
- return path4.join(getAgentvDataDir(), "workspace-pool");
19054
- }
19055
19119
  var DEFAULT_MAX_CALLS = 50;
19056
19120
  async function createTargetProxy(options) {
19057
19121
  const { defaultProvider, targetResolver, availableTargets, maxCalls } = options;
@@ -19373,7 +19437,7 @@ async function materializeContentForGrader(messages, getWorkDir) {
19373
19437
  const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
19374
19438
  const dir = await getWorkDir();
19375
19439
  const filePath = join(dir, `img-${counter++}.${ext}`);
19376
- await writeFile(filePath, Buffer.from(base64Data, "base64"));
19440
+ await writeFile2(filePath, Buffer.from(base64Data, "base64"));
19377
19441
  blocks.push({ type: "image", media_type: img.media_type, path: filePath });
19378
19442
  } else {
19379
19443
  blocks.push({ type: "image", media_type: img.media_type, path: img.source });
@@ -19416,7 +19480,7 @@ var CodeGrader = class {
19416
19480
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
19417
19481
  const tmpDir = await mkdtemp(join(tmpdir(), "agentv-grader-"));
19418
19482
  outputPath = join(tmpDir, "output.json");
19419
- await writeFile(outputPath, serialized);
19483
+ await writeFile2(outputPath, serialized);
19420
19484
  outputForPayload = null;
19421
19485
  }
19422
19486
  }
@@ -19433,6 +19497,7 @@ var CodeGrader = class {
19433
19497
  context.evalCase.input,
19434
19498
  getImageDir
19435
19499
  ),
19500
+ metadata: context.evalCase.metadata ?? null,
19436
19501
  trace: context.trace ?? null,
19437
19502
  tokenUsage: context.tokenUsage ?? null,
19438
19503
  costUsd: context.costUsd ?? null,
@@ -19664,7 +19729,7 @@ async function preprocessContentFile(block, preprocessors, basePath) {
19664
19729
  return runContentPreprocessor(block, resolvedPath, preprocessor);
19665
19730
  }
19666
19731
  try {
19667
- const buffer = await readFile3(resolvedPath);
19732
+ const buffer = await readFile22(resolvedPath);
19668
19733
  const text = buffer.toString("utf8").replace(/\r\n/g, "\n");
19669
19734
  if (buffer.includes(0) || text.includes(REPLACEMENT_CHAR)) {
19670
19735
  return {
@@ -19758,6 +19823,10 @@ ${text}`;
19758
19823
  var TEMPLATE_VARIABLES = {
19759
19824
  EXPECTED_OUTPUT: "expected_output",
19760
19825
  CRITERIA: "criteria",
19826
+ METADATA: "metadata",
19827
+ METADATA_JSON: "metadata_json",
19828
+ RUBRICS: "rubrics",
19829
+ RUBRICS_JSON: "rubrics_json",
19761
19830
  INPUT: "input",
19762
19831
  OUTPUT: "output",
19763
19832
  FILE_CHANGES: "file_changes",
@@ -19779,6 +19848,25 @@ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
19779
19848
  [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
19780
19849
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
19781
19850
  ]);
19851
+ var OPERATOR_GUIDANCE = {
19852
+ correctness: "Correctness: mark satisfied only when the answer positively supports or fulfills the outcome. Omission or contradiction should not satisfy it.",
19853
+ contradiction: "Contradiction guard: mark satisfied when the answer does not make a claim that contradicts the outcome. Do not require the answer to mention the outcome; mark unsatisfied only for incompatible claims."
19854
+ };
19855
+ function formatRubricOperatorLabel(operator) {
19856
+ return operator ? ` (operator: ${operator})` : "";
19857
+ }
19858
+ function formatRubricOperatorGuidance(rubrics) {
19859
+ const operators = /* @__PURE__ */ new Set();
19860
+ for (const rubric of rubrics) {
19861
+ if (rubric.operator) {
19862
+ operators.add(rubric.operator);
19863
+ }
19864
+ }
19865
+ if (operators.size === 0) {
19866
+ return [];
19867
+ }
19868
+ return [...operators].map((operator) => OPERATOR_GUIDANCE[operator]);
19869
+ }
19782
19870
  var DEFAULT_MAX_STEPS = 10;
19783
19871
  var MAX_STEPS_LIMIT = 50;
19784
19872
  var MAX_FILE_SIZE = 50 * 1024;
@@ -19860,6 +19948,32 @@ var scoreRangeEvaluationSchema = external_exports.object({
19860
19948
  checks: external_exports.array(scoreRangeCheckResultSchema).describe("Scores for each rubric criterion"),
19861
19949
  overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)").optional()
19862
19950
  });
19951
+ function stringifyPretty(value) {
19952
+ return value === void 0 ? "" : JSON.stringify(value, null, 2);
19953
+ }
19954
+ function stringifyCompact(value) {
19955
+ return value === void 0 ? "" : JSON.stringify(value);
19956
+ }
19957
+ function buildTemplateVariables(context) {
19958
+ const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
19959
+ const rubrics = context.evaluator?.type === "llm-grader" ? context.evaluator.rubrics : void 0;
19960
+ return {
19961
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
19962
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
19963
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
19964
+ [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
19965
+ [TEMPLATE_VARIABLES.METADATA]: stringifyPretty(context.evalCase.metadata),
19966
+ [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact(context.evalCase.metadata),
19967
+ [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty(rubrics),
19968
+ [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact(rubrics),
19969
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
19970
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
19971
+ // Deprecated aliases — same values as the primary variables above
19972
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
19973
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
19974
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
19975
+ };
19976
+ }
19863
19977
  function resolveContentBasePath(context) {
19864
19978
  if (context.workspacePath) {
19865
19979
  return context.workspacePath;
@@ -19931,19 +20045,7 @@ var LlmGrader = class {
19931
20045
  // LLM mode (existing)
19932
20046
  // ---------------------------------------------------------------------------
19933
20047
  async evaluateFreeform(context, graderProvider) {
19934
- const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
19935
- const variables = {
19936
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
19937
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
19938
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
19939
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
19940
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
19941
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
19942
- // Deprecated aliases — same values as the primary variables above
19943
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
19944
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
19945
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
19946
- };
20048
+ const variables = buildTemplateVariables(context);
19947
20049
  const systemPrompt = buildOutputSchema();
19948
20050
  const graderTemplate = context.graderTemplateOverride ?? this.graderTemplate ?? DEFAULT_GRADER_TEMPLATE;
19949
20051
  warnDeprecatedTemplateVars(graderTemplate);
@@ -20010,7 +20112,7 @@ ${context.toolCalls}`;
20010
20112
  if (hasScoreRanges) {
20011
20113
  return this.evaluateWithScoreRanges(context, graderProvider, rubrics);
20012
20114
  }
20013
- const prompt = this.buildRubricPrompt(context, rubrics);
20115
+ const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildRubricPrompt(context, rubrics);
20014
20116
  const systemPrompt = buildRubricOutputSchema();
20015
20117
  const graderRawRequest = {
20016
20118
  userPrompt: prompt,
@@ -20055,7 +20157,7 @@ ${context.toolCalls}`;
20055
20157
  * Each criterion is scored 0-10 and normalized to 0-1.
20056
20158
  */
20057
20159
  async evaluateWithScoreRanges(context, graderProvider, rubrics) {
20058
- const prompt = this.buildScoreRangePrompt(context, rubrics);
20160
+ const prompt = context.graderTemplateOverride || this.graderTemplate ? this.buildCustomPrompt(context) : this.buildScoreRangePrompt(context, rubrics);
20059
20161
  const systemPrompt = buildScoreRangeOutputSchema();
20060
20162
  const graderRawRequest = {
20061
20163
  userPrompt: prompt,
@@ -20274,21 +20376,11 @@ ${context.toolCalls}`;
20274
20376
  */
20275
20377
  buildAgentUserPrompt(context) {
20276
20378
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
20277
- const variables = {
20278
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
20279
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
20280
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
20281
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
20282
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
20283
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
20284
- // Deprecated aliases
20285
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
20286
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
20287
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
20288
- };
20289
- if (this.graderTemplate) {
20290
- warnDeprecatedTemplateVars(this.graderTemplate);
20291
- return substituteVariables(this.graderTemplate, variables);
20379
+ const variables = buildTemplateVariables(context);
20380
+ const template = context.graderTemplateOverride ?? this.graderTemplate;
20381
+ if (template) {
20382
+ warnDeprecatedTemplateVars(template);
20383
+ return substituteVariables(template, variables);
20292
20384
  }
20293
20385
  const config2 = context.evaluator;
20294
20386
  const rubrics = config2?.type === "llm-grader" ? config2.rubrics : void 0;
@@ -20338,21 +20430,11 @@ ${context.toolCalls}`;
20338
20430
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
20339
20431
  const config2 = context.evaluator;
20340
20432
  const rubrics = config2?.type === "llm-grader" ? config2.rubrics : void 0;
20341
- if (this.graderTemplate) {
20342
- const variables = {
20343
- [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
20344
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
20345
- [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
20346
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
20347
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
20348
- [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? "",
20349
- // Deprecated aliases
20350
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
20351
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
20352
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
20353
- };
20354
- warnDeprecatedTemplateVars(this.graderTemplate);
20355
- const customPrompt = substituteVariables(this.graderTemplate, variables);
20433
+ const template = context.graderTemplateOverride ?? this.graderTemplate;
20434
+ if (template) {
20435
+ const variables = buildTemplateVariables(context);
20436
+ warnDeprecatedTemplateVars(template);
20437
+ const customPrompt = substituteVariables(template, variables);
20356
20438
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
20357
20439
  return `${customPrompt}
20358
20440
 
@@ -20478,6 +20560,9 @@ ${outputSchema}`;
20478
20560
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
20479
20561
  const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
20480
20562
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
20563
+ if (rubric.operator) {
20564
+ parts.push(`Operator: ${rubric.operator}`);
20565
+ }
20481
20566
  if (rubric.outcome) {
20482
20567
  parts.push(`Description: ${rubric.outcome}`);
20483
20568
  }
@@ -20490,12 +20575,21 @@ ${outputSchema}`;
20490
20575
  }
20491
20576
  }
20492
20577
  }
20578
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
20579
+ if (operatorGuidance.length > 0) {
20580
+ parts.push("", ...operatorGuidance);
20581
+ }
20493
20582
  parts.push(
20494
20583
  "",
20495
20584
  "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
20496
20585
  );
20497
20586
  return parts.join("\n");
20498
20587
  }
20588
+ buildCustomPrompt(context) {
20589
+ const template = context.graderTemplateOverride ?? this.graderTemplate ?? "";
20590
+ warnDeprecatedTemplateVars(template);
20591
+ return substituteVariables(template, buildTemplateVariables(context));
20592
+ }
20499
20593
  buildRubricPrompt(context, rubrics) {
20500
20594
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
20501
20595
  const parts = [
@@ -20519,10 +20613,21 @@ ${outputSchema}`;
20519
20613
  parts.push("[[ ## tool_calls ## ]]", context.toolCalls, "");
20520
20614
  }
20521
20615
  parts.push("[[ ## rubrics ## ]]");
20616
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
20617
+ if (operatorGuidance.length > 0) {
20618
+ parts.push("", "Operator guidance:");
20619
+ for (const guidance of operatorGuidance) {
20620
+ parts.push(`- ${guidance}`);
20621
+ }
20622
+ parts.push("");
20623
+ }
20522
20624
  for (const rubric of rubrics) {
20523
20625
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
20524
20626
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
20525
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`);
20627
+ const operatorLabel = formatRubricOperatorLabel(rubric.operator);
20628
+ parts.push(
20629
+ `- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`
20630
+ );
20526
20631
  }
20527
20632
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
20528
20633
  return parts.join("\n");
@@ -21248,6 +21353,384 @@ var CostGrader = class {
21248
21353
  };
21249
21354
  }
21250
21355
  };
21356
+ var NORMALIZED_TRAJECTORY_SCHEMA_VERSION = "agentv.trace.v1";
21357
+ var NORMALIZED_TRACE_SOURCE_KINDS = [
21358
+ "agentv_run",
21359
+ "otlp",
21360
+ "phoenix",
21361
+ "langfuse",
21362
+ "pi_session",
21363
+ "imported_transcript",
21364
+ "compact_transcript"
21365
+ ];
21366
+ var NORMALIZED_TRACE_EVENT_TYPES = [
21367
+ "message",
21368
+ "model_turn",
21369
+ "tool_call",
21370
+ "tool_result"
21371
+ ];
21372
+ var NORMALIZED_TOOL_STATUSES = ["ok", "error", "timeout", "cancelled", "unknown"];
21373
+ var NORMALIZED_REDACTION_LEVELS = ["none", "partial", "full"];
21374
+ function omitUndefinedProperties(value) {
21375
+ return Object.fromEntries(
21376
+ Object.entries(value).filter(([, property]) => property !== void 0)
21377
+ );
21378
+ }
21379
+ var MetadataWireSchema = external_exports.record(external_exports.string(), external_exports.unknown());
21380
+ var TokenUsageWireSchema = external_exports.object({
21381
+ input: external_exports.number(),
21382
+ output: external_exports.number(),
21383
+ cached: external_exports.number().optional(),
21384
+ reasoning: external_exports.number().optional()
21385
+ });
21386
+ var NormalizedRedactionStateWireSchema = external_exports.object({
21387
+ level: external_exports.enum(NORMALIZED_REDACTION_LEVELS),
21388
+ fields: external_exports.array(external_exports.string()).optional(),
21389
+ reason: external_exports.string().optional()
21390
+ });
21391
+ var NormalizedTraceErrorWireSchema = external_exports.object({
21392
+ message: external_exports.string(),
21393
+ name: external_exports.string().optional(),
21394
+ code: external_exports.string().optional(),
21395
+ stack: external_exports.string().optional(),
21396
+ metadata: MetadataWireSchema.optional()
21397
+ });
21398
+ var NormalizedTraceSourceWireSchema = external_exports.object({
21399
+ kind: external_exports.enum(NORMALIZED_TRACE_SOURCE_KINDS),
21400
+ path: external_exports.string().optional(),
21401
+ url: external_exports.string().optional(),
21402
+ provider: external_exports.string().optional(),
21403
+ format: external_exports.string().optional(),
21404
+ version: external_exports.string().optional(),
21405
+ metadata: MetadataWireSchema.optional()
21406
+ });
21407
+ var NormalizedTraceSessionWireSchema = external_exports.object({
21408
+ session_id: external_exports.string().optional(),
21409
+ conversation_id: external_exports.string().optional(),
21410
+ cwd: external_exports.string().optional(),
21411
+ started_at: external_exports.string().optional(),
21412
+ ended_at: external_exports.string().optional(),
21413
+ metadata: MetadataWireSchema.optional()
21414
+ });
21415
+ var NormalizedTraceBranchWireSchema = external_exports.object({
21416
+ selected_leaf_id: external_exports.string().optional(),
21417
+ selected_path_ids: external_exports.array(external_exports.string()).optional(),
21418
+ included_event_ids: external_exports.array(external_exports.string()).optional(),
21419
+ omitted_event_ids: external_exports.array(external_exports.string()).optional(),
21420
+ selection_reason: external_exports.string().optional()
21421
+ });
21422
+ var NormalizedTraceSourceRefWireSchema = external_exports.object({
21423
+ event_id: external_exports.string().optional(),
21424
+ message_id: external_exports.string().optional(),
21425
+ span_id: external_exports.string().optional(),
21426
+ trace_id: external_exports.string().optional(),
21427
+ raw_kind: external_exports.string().optional(),
21428
+ path: external_exports.string().optional(),
21429
+ line: external_exports.number().int().nonnegative().optional(),
21430
+ metadata: MetadataWireSchema.optional()
21431
+ });
21432
+ var NormalizedRawEvidenceWireSchema = external_exports.object({
21433
+ kind: external_exports.string(),
21434
+ ref: external_exports.string().optional(),
21435
+ media_type: external_exports.string().optional(),
21436
+ content: external_exports.unknown().optional(),
21437
+ redacted: external_exports.boolean().optional(),
21438
+ metadata: MetadataWireSchema.optional()
21439
+ });
21440
+ var NormalizedTraceMessageWireSchema = external_exports.object({
21441
+ role: external_exports.string(),
21442
+ name: external_exports.string().optional(),
21443
+ content: external_exports.unknown().optional(),
21444
+ redaction: NormalizedRedactionStateWireSchema.optional(),
21445
+ token_usage: TokenUsageWireSchema.optional(),
21446
+ metadata: MetadataWireSchema.optional()
21447
+ });
21448
+ var NormalizedTraceModelWireSchema = external_exports.object({
21449
+ provider: external_exports.string().optional(),
21450
+ name: external_exports.string().optional(),
21451
+ invocation_id: external_exports.string().optional(),
21452
+ token_usage: TokenUsageWireSchema.optional(),
21453
+ metadata: MetadataWireSchema.optional()
21454
+ });
21455
+ var NormalizedTraceToolWireSchema = external_exports.object({
21456
+ name: external_exports.string(),
21457
+ call_id: external_exports.string().optional(),
21458
+ input: external_exports.unknown().optional(),
21459
+ output: external_exports.unknown().optional(),
21460
+ status: external_exports.enum(NORMALIZED_TOOL_STATUSES).optional(),
21461
+ error: NormalizedTraceErrorWireSchema.optional(),
21462
+ redaction: NormalizedRedactionStateWireSchema.optional(),
21463
+ metadata: MetadataWireSchema.optional()
21464
+ });
21465
+ var NormalizedTraceEventWireSchema = external_exports.object({
21466
+ event_id: external_exports.string(),
21467
+ parent_event_id: external_exports.string().optional(),
21468
+ ordinal: external_exports.number().int().nonnegative(),
21469
+ type: external_exports.enum(NORMALIZED_TRACE_EVENT_TYPES),
21470
+ timestamp: external_exports.string().optional(),
21471
+ duration_ms: external_exports.number().nonnegative().optional(),
21472
+ duration_inferred: external_exports.boolean().optional(),
21473
+ turn_index: external_exports.number().int().nonnegative().optional(),
21474
+ message: NormalizedTraceMessageWireSchema.optional(),
21475
+ model: NormalizedTraceModelWireSchema.optional(),
21476
+ tool: NormalizedTraceToolWireSchema.optional(),
21477
+ source_ref: NormalizedTraceSourceRefWireSchema.optional(),
21478
+ raw_evidence: external_exports.array(NormalizedRawEvidenceWireSchema).optional(),
21479
+ redaction: NormalizedRedactionStateWireSchema.optional(),
21480
+ metadata: MetadataWireSchema.optional()
21481
+ });
21482
+ var NormalizedTrajectoryWireSchema = external_exports.object({
21483
+ schema_version: external_exports.literal(NORMALIZED_TRAJECTORY_SCHEMA_VERSION),
21484
+ source: NormalizedTraceSourceWireSchema,
21485
+ session: NormalizedTraceSessionWireSchema,
21486
+ branch: NormalizedTraceBranchWireSchema.optional(),
21487
+ events: external_exports.array(NormalizedTraceEventWireSchema),
21488
+ token_usage: TokenUsageWireSchema.optional(),
21489
+ cost_usd: external_exports.number().optional(),
21490
+ duration_ms: external_exports.number().optional(),
21491
+ started_at: external_exports.string().optional(),
21492
+ ended_at: external_exports.string().optional(),
21493
+ metadata: MetadataWireSchema.optional()
21494
+ });
21495
+ function toNormalizedTrajectoryWire(trajectory) {
21496
+ return NormalizedTrajectoryWireSchema.parse(
21497
+ omitUndefinedProperties({
21498
+ schema_version: trajectory.schemaVersion,
21499
+ source: toNormalizedTraceSourceWire(trajectory.source),
21500
+ session: toNormalizedTraceSessionWire(trajectory.session),
21501
+ branch: trajectory.branch ? toNormalizedTraceBranchWire(trajectory.branch) : void 0,
21502
+ events: trajectory.events.map(toNormalizedTraceEventWire),
21503
+ token_usage: trajectory.tokenUsage,
21504
+ cost_usd: trajectory.costUsd,
21505
+ duration_ms: trajectory.durationMs,
21506
+ started_at: trajectory.startedAt,
21507
+ ended_at: trajectory.endedAt,
21508
+ metadata: trajectory.metadata
21509
+ })
21510
+ );
21511
+ }
21512
+ function fromNormalizedTrajectoryWire(input) {
21513
+ const wire = NormalizedTrajectoryWireSchema.parse(input);
21514
+ return {
21515
+ schemaVersion: wire.schema_version,
21516
+ source: fromNormalizedTraceSourceWire(wire.source),
21517
+ session: fromNormalizedTraceSessionWire(wire.session),
21518
+ branch: wire.branch ? fromNormalizedTraceBranchWire(wire.branch) : void 0,
21519
+ events: wire.events.map(fromNormalizedTraceEventWire),
21520
+ tokenUsage: wire.token_usage,
21521
+ costUsd: wire.cost_usd,
21522
+ durationMs: wire.duration_ms,
21523
+ startedAt: wire.started_at,
21524
+ endedAt: wire.ended_at,
21525
+ metadata: wire.metadata
21526
+ };
21527
+ }
21528
+ function toNormalizedTraceSourceWire(source) {
21529
+ return omitUndefinedProperties({
21530
+ kind: source.kind,
21531
+ path: source.path,
21532
+ url: source.url,
21533
+ provider: source.provider,
21534
+ format: source.format,
21535
+ version: source.version,
21536
+ metadata: source.metadata
21537
+ });
21538
+ }
21539
+ function fromNormalizedTraceSourceWire(source) {
21540
+ return {
21541
+ kind: source.kind,
21542
+ path: source.path,
21543
+ url: source.url,
21544
+ provider: source.provider,
21545
+ format: source.format,
21546
+ version: source.version,
21547
+ metadata: source.metadata
21548
+ };
21549
+ }
21550
+ function toNormalizedTraceSessionWire(session) {
21551
+ return omitUndefinedProperties({
21552
+ session_id: session.sessionId,
21553
+ conversation_id: session.conversationId,
21554
+ cwd: session.cwd,
21555
+ started_at: session.startedAt,
21556
+ ended_at: session.endedAt,
21557
+ metadata: session.metadata
21558
+ });
21559
+ }
21560
+ function fromNormalizedTraceSessionWire(session) {
21561
+ return {
21562
+ sessionId: session.session_id,
21563
+ conversationId: session.conversation_id,
21564
+ cwd: session.cwd,
21565
+ startedAt: session.started_at,
21566
+ endedAt: session.ended_at,
21567
+ metadata: session.metadata
21568
+ };
21569
+ }
21570
+ function toNormalizedTraceBranchWire(branch) {
21571
+ return omitUndefinedProperties({
21572
+ selected_leaf_id: branch.selectedLeafId,
21573
+ selected_path_ids: branch.selectedPathIds,
21574
+ included_event_ids: branch.includedEventIds,
21575
+ omitted_event_ids: branch.omittedEventIds,
21576
+ selection_reason: branch.selectionReason
21577
+ });
21578
+ }
21579
+ function fromNormalizedTraceBranchWire(branch) {
21580
+ return {
21581
+ selectedLeafId: branch.selected_leaf_id,
21582
+ selectedPathIds: branch.selected_path_ids,
21583
+ includedEventIds: branch.included_event_ids,
21584
+ omittedEventIds: branch.omitted_event_ids,
21585
+ selectionReason: branch.selection_reason
21586
+ };
21587
+ }
21588
+ function toNormalizedTraceEventWire(event) {
21589
+ return NormalizedTraceEventWireSchema.parse(
21590
+ omitUndefinedProperties({
21591
+ event_id: event.eventId,
21592
+ parent_event_id: event.parentEventId,
21593
+ ordinal: event.ordinal,
21594
+ type: event.type,
21595
+ timestamp: event.timestamp,
21596
+ duration_ms: event.durationMs,
21597
+ duration_inferred: event.durationInferred,
21598
+ turn_index: event.turnIndex,
21599
+ message: event.message ? toNormalizedTraceMessageWire(event.message) : void 0,
21600
+ model: event.model ? toNormalizedTraceModelWire(event.model) : void 0,
21601
+ tool: event.tool ? toNormalizedTraceToolWire(event.tool) : void 0,
21602
+ source_ref: event.sourceRef ? toNormalizedTraceSourceRefWire(event.sourceRef) : void 0,
21603
+ raw_evidence: event.rawEvidence?.map(toNormalizedRawEvidenceWire),
21604
+ redaction: event.redaction,
21605
+ metadata: event.metadata
21606
+ })
21607
+ );
21608
+ }
21609
+ function fromNormalizedTraceEventWire(event) {
21610
+ return {
21611
+ eventId: event.event_id,
21612
+ parentEventId: event.parent_event_id,
21613
+ ordinal: event.ordinal,
21614
+ type: event.type,
21615
+ timestamp: event.timestamp,
21616
+ durationMs: event.duration_ms,
21617
+ durationInferred: event.duration_inferred,
21618
+ turnIndex: event.turn_index,
21619
+ message: event.message ? fromNormalizedTraceMessageWire(event.message) : void 0,
21620
+ model: event.model ? fromNormalizedTraceModelWire(event.model) : void 0,
21621
+ tool: event.tool ? fromNormalizedTraceToolWire(event.tool) : void 0,
21622
+ sourceRef: event.source_ref ? fromNormalizedTraceSourceRefWire(event.source_ref) : void 0,
21623
+ rawEvidence: event.raw_evidence?.map(fromNormalizedRawEvidenceWire),
21624
+ redaction: event.redaction,
21625
+ metadata: event.metadata
21626
+ };
21627
+ }
21628
+ function toNormalizedTraceMessageWire(message) {
21629
+ return omitUndefinedProperties({
21630
+ role: message.role,
21631
+ name: message.name,
21632
+ content: message.content,
21633
+ redaction: message.redaction,
21634
+ token_usage: message.tokenUsage,
21635
+ metadata: message.metadata
21636
+ });
21637
+ }
21638
+ function fromNormalizedTraceMessageWire(message) {
21639
+ return {
21640
+ role: message.role,
21641
+ name: message.name,
21642
+ content: message.content,
21643
+ redaction: message.redaction,
21644
+ tokenUsage: message.token_usage,
21645
+ metadata: message.metadata
21646
+ };
21647
+ }
21648
+ function toNormalizedTraceModelWire(model) {
21649
+ return omitUndefinedProperties({
21650
+ provider: model.provider,
21651
+ name: model.name,
21652
+ invocation_id: model.invocationId,
21653
+ token_usage: model.tokenUsage,
21654
+ metadata: model.metadata
21655
+ });
21656
+ }
21657
+ function fromNormalizedTraceModelWire(model) {
21658
+ return {
21659
+ provider: model.provider,
21660
+ name: model.name,
21661
+ invocationId: model.invocation_id,
21662
+ tokenUsage: model.token_usage,
21663
+ metadata: model.metadata
21664
+ };
21665
+ }
21666
+ function toNormalizedTraceToolWire(tool) {
21667
+ return omitUndefinedProperties({
21668
+ name: tool.name,
21669
+ call_id: tool.callId,
21670
+ input: tool.input,
21671
+ output: tool.output,
21672
+ status: tool.status,
21673
+ error: tool.error,
21674
+ redaction: tool.redaction,
21675
+ metadata: tool.metadata
21676
+ });
21677
+ }
21678
+ function fromNormalizedTraceToolWire(tool) {
21679
+ return {
21680
+ name: tool.name,
21681
+ callId: tool.call_id,
21682
+ input: tool.input,
21683
+ output: tool.output,
21684
+ status: tool.status,
21685
+ error: tool.error,
21686
+ redaction: tool.redaction,
21687
+ metadata: tool.metadata
21688
+ };
21689
+ }
21690
+ function toNormalizedTraceSourceRefWire(sourceRef) {
21691
+ return omitUndefinedProperties({
21692
+ event_id: sourceRef.eventId,
21693
+ message_id: sourceRef.messageId,
21694
+ span_id: sourceRef.spanId,
21695
+ trace_id: sourceRef.traceId,
21696
+ raw_kind: sourceRef.rawKind,
21697
+ path: sourceRef.path,
21698
+ line: sourceRef.line,
21699
+ metadata: sourceRef.metadata
21700
+ });
21701
+ }
21702
+ function fromNormalizedTraceSourceRefWire(sourceRef) {
21703
+ return {
21704
+ eventId: sourceRef.event_id,
21705
+ messageId: sourceRef.message_id,
21706
+ spanId: sourceRef.span_id,
21707
+ traceId: sourceRef.trace_id,
21708
+ rawKind: sourceRef.raw_kind,
21709
+ path: sourceRef.path,
21710
+ line: sourceRef.line,
21711
+ metadata: sourceRef.metadata
21712
+ };
21713
+ }
21714
+ function toNormalizedRawEvidenceWire(evidence) {
21715
+ return omitUndefinedProperties({
21716
+ kind: evidence.kind,
21717
+ ref: evidence.ref,
21718
+ media_type: evidence.mediaType,
21719
+ content: evidence.content,
21720
+ redacted: evidence.redacted,
21721
+ metadata: evidence.metadata
21722
+ });
21723
+ }
21724
+ function fromNormalizedRawEvidenceWire(evidence) {
21725
+ return {
21726
+ kind: evidence.kind,
21727
+ ref: evidence.ref,
21728
+ mediaType: evidence.media_type,
21729
+ content: evidence.content,
21730
+ redacted: evidence.redacted,
21731
+ metadata: evidence.metadata
21732
+ };
21733
+ }
21251
21734
  function computeTraceSummary(messages) {
21252
21735
  const toolCallCounts = {};
21253
21736
  const toolDurations = {};
@@ -21315,6 +21798,82 @@ function computeTraceSummary(messages) {
21315
21798
  endTime: latestEnd?.toISOString()
21316
21799
  };
21317
21800
  }
21801
+ function getSelectedTrajectoryEvents(trajectory) {
21802
+ if (!trajectory.branch?.includedEventIds || trajectory.branch.includedEventIds.length === 0) {
21803
+ return trajectory.events;
21804
+ }
21805
+ const includedIds = new Set(trajectory.branch.includedEventIds);
21806
+ return trajectory.events.filter((event) => includedIds.has(event.eventId));
21807
+ }
21808
+ function computeTraceSummaryFromTrajectory(trajectory) {
21809
+ const selectedEvents = getSelectedTrajectoryEvents(trajectory);
21810
+ const hasModelTurnEvents = selectedEvents.some((event) => event.type === "model_turn");
21811
+ const toolCallCounts = {};
21812
+ const toolDurations = {};
21813
+ let totalToolCalls = 0;
21814
+ let errorCount = 0;
21815
+ let llmCallCount = 0;
21816
+ let earliestStart;
21817
+ let latestEnd;
21818
+ let hasAnyDuration = false;
21819
+ for (const event of selectedEvents) {
21820
+ if (event.type === "model_turn" || !hasModelTurnEvents && event.type === "message" && event.message?.role === "assistant") {
21821
+ llmCallCount++;
21822
+ }
21823
+ const eventStart = parseTimestamp(event.timestamp);
21824
+ if (eventStart && (!earliestStart || eventStart < earliestStart)) {
21825
+ earliestStart = eventStart;
21826
+ }
21827
+ const eventEnd = deriveEventEnd(eventStart, event.durationMs);
21828
+ if (eventEnd && (!latestEnd || eventEnd > latestEnd)) {
21829
+ latestEnd = eventEnd;
21830
+ }
21831
+ if (event.type !== "tool_call" || !event.tool) {
21832
+ continue;
21833
+ }
21834
+ toolCallCounts[event.tool.name] = (toolCallCounts[event.tool.name] ?? 0) + 1;
21835
+ totalToolCalls++;
21836
+ if (isErrorToolEvent(event)) {
21837
+ errorCount++;
21838
+ }
21839
+ if (event.durationMs !== void 0) {
21840
+ hasAnyDuration = true;
21841
+ if (!toolDurations[event.tool.name]) {
21842
+ toolDurations[event.tool.name] = [];
21843
+ }
21844
+ toolDurations[event.tool.name].push(event.durationMs);
21845
+ }
21846
+ }
21847
+ return {
21848
+ trace: {
21849
+ eventCount: totalToolCalls,
21850
+ toolCalls: toolCallCounts,
21851
+ errorCount,
21852
+ llmCallCount,
21853
+ ...hasAnyDuration ? { toolDurations } : {}
21854
+ },
21855
+ tokenUsage: trajectory.tokenUsage,
21856
+ costUsd: trajectory.costUsd,
21857
+ durationMs: trajectory.durationMs,
21858
+ startTime: trajectory.startedAt ?? earliestStart?.toISOString(),
21859
+ endTime: trajectory.endedAt ?? latestEnd?.toISOString()
21860
+ };
21861
+ }
21862
+ function parseTimestamp(timestamp) {
21863
+ if (!timestamp) return void 0;
21864
+ const value = new Date(timestamp);
21865
+ return Number.isNaN(value.getTime()) ? void 0 : value;
21866
+ }
21867
+ function deriveEventEnd(start, durationMs) {
21868
+ if (!start) return void 0;
21869
+ if (durationMs === void 0) return start;
21870
+ return new Date(start.getTime() + durationMs);
21871
+ }
21872
+ function isErrorToolEvent(event) {
21873
+ return Boolean(
21874
+ event.tool?.error || event.tool?.status === "error" || event.tool?.status === "timeout" || event.tool?.status === "cancelled"
21875
+ );
21876
+ }
21318
21877
  var DEFAULT_EXPLORATION_TOOLS = [
21319
21878
  "read",
21320
21879
  "grep",
@@ -22099,6 +22658,30 @@ var SkillTriggerGrader = class {
22099
22658
  };
22100
22659
  }
22101
22660
  };
22661
+ function stringifyPretty2(value) {
22662
+ return value === void 0 ? "" : JSON.stringify(value, null, 2);
22663
+ }
22664
+ function stringifyCompact2(value) {
22665
+ return value === void 0 ? "" : JSON.stringify(value);
22666
+ }
22667
+ function buildTemplateVariables2(input) {
22668
+ const formattedQuestion = input.promptInputs.question && input.promptInputs.question.trim().length > 0 ? input.promptInputs.question : input.evalCase.question;
22669
+ return {
22670
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
22671
+ [TEMPLATE_VARIABLES.OUTPUT]: input.candidate.trim(),
22672
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (input.evalCase.reference_answer ?? "").trim(),
22673
+ [TEMPLATE_VARIABLES.CRITERIA]: input.evalCase.criteria.trim(),
22674
+ [TEMPLATE_VARIABLES.METADATA]: stringifyPretty2(input.evalCase.metadata),
22675
+ [TEMPLATE_VARIABLES.METADATA_JSON]: stringifyCompact2(input.evalCase.metadata),
22676
+ [TEMPLATE_VARIABLES.RUBRICS]: stringifyPretty2(input.rubrics),
22677
+ [TEMPLATE_VARIABLES.RUBRICS_JSON]: stringifyCompact2(input.rubrics),
22678
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: input.fileChanges ?? "",
22679
+ [TEMPLATE_VARIABLES.TOOL_CALLS]: input.toolCalls ?? "",
22680
+ [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
22681
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT]: input.candidate.trim(),
22682
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (input.evalCase.reference_answer ?? "").trim()
22683
+ };
22684
+ }
22102
22685
  function assembleLlmGraderPrompt(input) {
22103
22686
  const {
22104
22687
  evalCase,
@@ -22111,6 +22694,17 @@ function assembleLlmGraderPrompt(input) {
22111
22694
  } = input;
22112
22695
  const rubrics = evaluatorConfig?.rubrics;
22113
22696
  if (rubrics && rubrics.length > 0) {
22697
+ if (graderTemplateOverride) {
22698
+ return assembleCustom(
22699
+ evalCase,
22700
+ candidate,
22701
+ promptInputs,
22702
+ rubrics,
22703
+ fileChanges,
22704
+ toolCalls,
22705
+ graderTemplateOverride
22706
+ );
22707
+ }
22114
22708
  const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
22115
22709
  if (hasScoreRanges) {
22116
22710
  return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls);
@@ -22127,19 +22721,13 @@ function assembleLlmGraderPrompt(input) {
22127
22721
  );
22128
22722
  }
22129
22723
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, toolCalls, graderTemplateOverride) {
22130
- const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
22131
- const variables = {
22132
- [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
22133
- [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
22134
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
22135
- [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
22136
- [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
22137
- [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? "",
22138
- // Deprecated aliases
22139
- [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
22140
- [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
22141
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
22142
- };
22724
+ const variables = buildTemplateVariables2({
22725
+ evalCase,
22726
+ candidate,
22727
+ promptInputs,
22728
+ fileChanges,
22729
+ toolCalls
22730
+ });
22143
22731
  const systemPrompt = buildOutputSchema();
22144
22732
  const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE;
22145
22733
  let userPrompt = substituteVariables(template, variables);
@@ -22162,6 +22750,27 @@ ${toolCalls}`;
22162
22750
  mode: "freeform"
22163
22751
  };
22164
22752
  }
22753
+ function assembleCustom(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls, graderTemplateOverride) {
22754
+ const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0);
22755
+ const systemPrompt = hasScoreRanges ? buildScoreRangeOutputSchema() : buildRubricOutputSchema();
22756
+ const userPrompt = substituteVariables(
22757
+ graderTemplateOverride,
22758
+ buildTemplateVariables2({
22759
+ evalCase,
22760
+ candidate,
22761
+ promptInputs,
22762
+ rubrics,
22763
+ fileChanges,
22764
+ toolCalls
22765
+ })
22766
+ );
22767
+ return {
22768
+ systemPrompt,
22769
+ userPrompt,
22770
+ responseSchema: systemPrompt,
22771
+ mode: hasScoreRanges ? "score_range" : "checklist"
22772
+ };
22773
+ }
22165
22774
  function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls) {
22166
22775
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
22167
22776
  const parts = [
@@ -22185,10 +22794,19 @@ function assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChang
22185
22794
  parts.push("[[ ## tool_calls ## ]]", toolCalls, "");
22186
22795
  }
22187
22796
  parts.push("[[ ## rubrics ## ]]");
22797
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
22798
+ if (operatorGuidance.length > 0) {
22799
+ parts.push("", "Operator guidance:");
22800
+ for (const guidance of operatorGuidance) {
22801
+ parts.push(`- ${guidance}`);
22802
+ }
22803
+ parts.push("");
22804
+ }
22188
22805
  for (const rubric of rubrics) {
22189
22806
  const requiredLabel = rubric.required ? " (REQUIRED)" : "";
22190
22807
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
22191
- parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`);
22808
+ const operatorLabel = formatRubricOperatorLabel(rubric.operator);
22809
+ parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}${operatorLabel}: ${rubric.outcome}`);
22192
22810
  }
22193
22811
  parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
22194
22812
  const systemPrompt = buildRubricOutputSchema();
@@ -22228,6 +22846,9 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
22228
22846
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
22229
22847
  const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
22230
22848
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
22849
+ if (rubric.operator) {
22850
+ parts.push(`Operator: ${rubric.operator}`);
22851
+ }
22231
22852
  if (rubric.outcome) {
22232
22853
  parts.push(`Description: ${rubric.outcome}`);
22233
22854
  }
@@ -22240,6 +22861,10 @@ function assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChan
22240
22861
  }
22241
22862
  }
22242
22863
  }
22864
+ const operatorGuidance = formatRubricOperatorGuidance(rubrics);
22865
+ if (operatorGuidance.length > 0) {
22866
+ parts.push("", ...operatorGuidance);
22867
+ }
22243
22868
  parts.push(
22244
22869
  "",
22245
22870
  "For each criterion, provide an integer score 0-10 that matches one of its defined score ranges."
@@ -23409,10 +24034,10 @@ var ClaudeCliProvider = class {
23409
24034
  }
23410
24035
  resolveCwd(cwdOverride) {
23411
24036
  if (cwdOverride) {
23412
- return path5.resolve(cwdOverride);
24037
+ return path52.resolve(cwdOverride);
23413
24038
  }
23414
24039
  if (this.config.cwd) {
23415
- return path5.resolve(this.config.cwd);
24040
+ return path52.resolve(this.config.cwd);
23416
24041
  }
23417
24042
  return void 0;
23418
24043
  }
@@ -23422,9 +24047,9 @@ var ClaudeCliProvider = class {
23422
24047
  return void 0;
23423
24048
  }
23424
24049
  if (this.config.logDir) {
23425
- return path5.resolve(this.config.logDir);
24050
+ return path52.resolve(this.config.logDir);
23426
24051
  }
23427
- return path5.join(process.cwd(), ".agentv", "logs", "claude-cli");
24052
+ return path52.join(process.cwd(), ".agentv", "logs", "claude-cli");
23428
24053
  }
23429
24054
  async createStreamLogger(request) {
23430
24055
  const logDir = this.resolveLogDirectory();
@@ -23432,13 +24057,13 @@ var ClaudeCliProvider = class {
23432
24057
  return void 0;
23433
24058
  }
23434
24059
  try {
23435
- await mkdir(logDir, { recursive: true });
24060
+ await mkdir2(logDir, { recursive: true });
23436
24061
  } catch (error40) {
23437
24062
  const message = error40 instanceof Error ? error40.message : String(error40);
23438
24063
  console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`);
23439
24064
  return void 0;
23440
24065
  }
23441
- const filePath = path5.join(logDir, buildLogFilename(request, this.targetName));
24066
+ const filePath = path52.join(logDir, buildLogFilename(request, this.targetName));
23442
24067
  try {
23443
24068
  const logger = await ClaudeCliStreamLogger.create({
23444
24069
  filePath,
@@ -23921,7 +24546,7 @@ var ClaudeSdkProvider = class {
23921
24546
  return void 0;
23922
24547
  }
23923
24548
  try {
23924
- await mkdir2(logDir, { recursive: true });
24549
+ await mkdir3(logDir, { recursive: true });
23925
24550
  } catch (error40) {
23926
24551
  const message = error40 instanceof Error ? error40.message : String(error40);
23927
24552
  console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`);
@@ -24746,6 +25371,9 @@ var CodexProvider = class {
24746
25371
  const startMs = Date.now();
24747
25372
  const logger = await this.createStreamLogger(request).catch(() => void 0);
24748
25373
  const codexOptions = {};
25374
+ if (this.config.executable) {
25375
+ codexOptions.codexPathOverride = this.config.executable;
25376
+ }
24749
25377
  if (this.config.model) {
24750
25378
  codexOptions.config = { model: this.config.model };
24751
25379
  }
@@ -24757,6 +25385,9 @@ var CodexProvider = class {
24757
25385
  if (cwd) {
24758
25386
  threadOptions.workingDirectory = cwd;
24759
25387
  }
25388
+ if (this.config.modelReasoningEffort) {
25389
+ threadOptions.modelReasoningEffort = this.config.modelReasoningEffort;
25390
+ }
24760
25391
  const thread = codex.startThread(threadOptions);
24761
25392
  const inputFiles = normalizeInputFiles(request.inputFiles);
24762
25393
  const basePrompt = buildPromptDocument(request, inputFiles);
@@ -24904,7 +25535,7 @@ ${basePrompt}` : basePrompt;
24904
25535
  }
24905
25536
  resolveLogDirectory() {
24906
25537
  const disabled = isCodexLogStreamingDisabled();
24907
- if (disabled) {
25538
+ if (disabled || this.config.streamLog === false) {
24908
25539
  return void 0;
24909
25540
  }
24910
25541
  if (this.config.logDir) {
@@ -24918,7 +25549,7 @@ ${basePrompt}` : basePrompt;
24918
25549
  return void 0;
24919
25550
  }
24920
25551
  try {
24921
- await mkdir3(logDir, { recursive: true });
25552
+ await mkdir4(logDir, { recursive: true });
24922
25553
  } catch (error40) {
24923
25554
  const message = error40 instanceof Error ? error40.message : String(error40);
24924
25555
  console.warn(`Skipping Codex SDK stream logging (could not create ${logDir}): ${message}`);
@@ -24931,7 +25562,7 @@ ${basePrompt}` : basePrompt;
24931
25562
  targetName: this.targetName,
24932
25563
  evalCaseId: request.evalCaseId,
24933
25564
  attempt: request.attempt,
24934
- format: this.config.logFormat ?? "summary"
25565
+ format: this.config.streamLog === "raw" ? "json" : "summary"
24935
25566
  });
24936
25567
  recordCodexLogEntry({
24937
25568
  filePath,
@@ -25136,7 +25767,7 @@ async function walkDir(rootDir, currentDir, snapshot) {
25136
25767
  if (fileStat.size > SNAPSHOT_MAX_FILE_BYTES) continue;
25137
25768
  let content;
25138
25769
  try {
25139
- content = await readFile22(fullPath, "utf8");
25770
+ content = await readFile32(fullPath, "utf8");
25140
25771
  if (content.includes("\0")) continue;
25141
25772
  } catch {
25142
25773
  continue;
@@ -25220,7 +25851,7 @@ function subscribeToCopilotCliLogEntries(listener) {
25220
25851
  };
25221
25852
  }
25222
25853
  function resolvePlatformCliPath() {
25223
- const os3 = platform();
25854
+ const os22 = platform();
25224
25855
  const cpu = arch();
25225
25856
  const platformMap = {
25226
25857
  linux: "linux",
@@ -25231,13 +25862,13 @@ function resolvePlatformCliPath() {
25231
25862
  x64: "x64",
25232
25863
  arm64: "arm64"
25233
25864
  };
25234
- const osPart = platformMap[os3];
25865
+ const osPart = platformMap[os22];
25235
25866
  const archPart = archMap[cpu];
25236
25867
  if (!osPart || !archPart) {
25237
25868
  return void 0;
25238
25869
  }
25239
25870
  const packageName = `@github/copilot-${osPart}-${archPart}`;
25240
- const binaryName = os3 === "win32" ? "copilot.exe" : "copilot";
25871
+ const binaryName = os22 === "win32" ? "copilot.exe" : "copilot";
25241
25872
  try {
25242
25873
  const resolved = import.meta.resolve(`${packageName}/package.json`);
25243
25874
  const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
@@ -25305,9 +25936,9 @@ function resolvePlatformCliPath() {
25305
25936
  }
25306
25937
  function globalNpmRoots() {
25307
25938
  const roots = [];
25308
- const os3 = platform();
25939
+ const os22 = platform();
25309
25940
  const home = homedir3();
25310
- if (os3 === "win32") {
25941
+ if (os22 === "win32") {
25311
25942
  if (process.env.APPDATA) {
25312
25943
  roots.push(path10.join(process.env.APPDATA, "npm", "node_modules"));
25313
25944
  }
@@ -25322,7 +25953,7 @@ function globalNpmRoots() {
25322
25953
  if (process.env.npm_config_prefix) {
25323
25954
  const prefix = process.env.npm_config_prefix;
25324
25955
  roots.push(
25325
- os3 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
25956
+ os22 === "win32" ? path10.join(prefix, "node_modules") : path10.join(prefix, "lib", "node_modules")
25326
25957
  );
25327
25958
  }
25328
25959
  return Array.from(new Set(roots));
@@ -25741,7 +26372,7 @@ var CopilotCliProvider = class {
25741
26372
  return void 0;
25742
26373
  }
25743
26374
  try {
25744
- await mkdir4(logDir, { recursive: true });
26375
+ await mkdir5(logDir, { recursive: true });
25745
26376
  } catch (error40) {
25746
26377
  const message = error40 instanceof Error ? error40.message : String(error40);
25747
26378
  console.warn(`Skipping Copilot CLI stream logging (could not create ${logDir}): ${message}`);
@@ -25992,7 +26623,7 @@ async function discoverCopilotSessions(opts) {
25992
26623
  const workspacePath = path12.join(sessionDir, "workspace.yaml");
25993
26624
  const eventsPath = path12.join(sessionDir, "events.jsonl");
25994
26625
  try {
25995
- const workspaceContent = await readFile32(workspacePath, "utf8");
26626
+ const workspaceContent = await readFile4(workspacePath, "utf8");
25996
26627
  const workspace = parseYamlValue(workspaceContent) ?? {};
25997
26628
  const cwd = String(workspace.cwd ?? "");
25998
26629
  let updatedAt;
@@ -26052,7 +26683,7 @@ var CopilotLogProvider = class {
26052
26683
  const eventsPath = path13.join(sessionDir, "events.jsonl");
26053
26684
  let eventsContent;
26054
26685
  try {
26055
- eventsContent = await readFile4(eventsPath, "utf8");
26686
+ eventsContent = await readFile5(eventsPath, "utf8");
26056
26687
  } catch (err) {
26057
26688
  throw new Error(
26058
26689
  `Failed to read Copilot session transcript at ${eventsPath}: ${err instanceof Error ? err.message : String(err)}`
@@ -26429,7 +27060,7 @@ var CopilotSdkProvider = class {
26429
27060
  return void 0;
26430
27061
  }
26431
27062
  try {
26432
- await mkdir5(logDir, { recursive: true });
27063
+ await mkdir6(logDir, { recursive: true });
26433
27064
  } catch (error40) {
26434
27065
  const message = error40 instanceof Error ? error40.message : String(error40);
26435
27066
  console.warn(`Skipping Copilot SDK stream logging (could not create ${logDir}): ${message}`);
@@ -26746,7 +27377,7 @@ var PiCliProvider = class {
26746
27377
  const logger = await this.createStreamLogger(request).catch(() => void 0);
26747
27378
  try {
26748
27379
  const promptFile = path15.join(cwd, PROMPT_FILENAME);
26749
- await writeFile2(promptFile, request.question, "utf8");
27380
+ await writeFile3(promptFile, request.question, "utf8");
26750
27381
  const args = this.buildPiArgs(request.question, inputFiles);
26751
27382
  const result = await this.executePi(args, cwd, request.signal, logger);
26752
27383
  if (result.timedOut) {
@@ -26937,7 +27568,7 @@ ${prompt}` : prompt;
26937
27568
  return void 0;
26938
27569
  }
26939
27570
  try {
26940
- await mkdir6(logDir, { recursive: true });
27571
+ await mkdir7(logDir, { recursive: true });
26941
27572
  } catch (error40) {
26942
27573
  const message = error40 instanceof Error ? error40.message : String(error40);
26943
27574
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -27928,7 +28559,7 @@ ${fileList}`;
27928
28559
  return void 0;
27929
28560
  }
27930
28561
  try {
27931
- await mkdir7(logDir, { recursive: true });
28562
+ await mkdir8(logDir, { recursive: true });
27932
28563
  } catch (error40) {
27933
28564
  const message = error40 instanceof Error ? error40.message : String(error40);
27934
28565
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
@@ -28152,7 +28783,7 @@ async function pathExists(target) {
28152
28783
  }
28153
28784
  }
28154
28785
  async function ensureDir(target) {
28155
- await mkdir8(target, { recursive: true });
28786
+ await mkdir9(target, { recursive: true });
28156
28787
  }
28157
28788
  async function readDirEntries(target) {
28158
28789
  const entries = await readdir3(target, { withFileTypes: true });
@@ -28304,7 +28935,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
28304
28935
  const maxAttempts = 10;
28305
28936
  while (attempts < maxAttempts) {
28306
28937
  try {
28307
- const content = await readFile5(responseFileFinal, { encoding: "utf8" });
28938
+ const content = await readFile6(responseFileFinal, { encoding: "utf8" });
28308
28939
  if (!silent) {
28309
28940
  process.stdout.write(`${content}
28310
28941
  `);
@@ -28361,7 +28992,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
28361
28992
  const maxAttempts = 10;
28362
28993
  while (attempts < maxAttempts) {
28363
28994
  try {
28364
- const content = await readFile5(file2, { encoding: "utf8" });
28995
+ const content = await readFile6(file2, { encoding: "utf8" });
28365
28996
  if (!silent) {
28366
28997
  process.stdout.write(`${content}
28367
28998
  `);
@@ -28454,9 +29085,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
28454
29085
  const aliveFile = path222.join(subagentDir, DEFAULT_ALIVE_FILENAME);
28455
29086
  await removeIfExists(aliveFile);
28456
29087
  const githubAgentsDir = path222.join(subagentDir, ".github", "agents");
28457
- await mkdir9(githubAgentsDir, { recursive: true });
29088
+ await mkdir10(githubAgentsDir, { recursive: true });
28458
29089
  const wakeupDst = path222.join(githubAgentsDir, "wakeup.md");
28459
- await writeFile3(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
29090
+ await writeFile4(wakeupDst, DEFAULT_WAKEUP_CONTENT, "utf8");
28460
29091
  const workspaceChild = spawnVsCode(vscodeCmd, [workspacePath], {
28461
29092
  label: "open-workspace"
28462
29093
  });
@@ -28485,9 +29116,9 @@ async function ensureWorkspaceFocused(workspacePath, workspaceName, subagentDir,
28485
29116
  async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, requestInstructions, timestamp, vscodeCmd) {
28486
29117
  const workspacePath = path222.join(subagentDir, `${path222.basename(subagentDir)}.code-workspace`);
28487
29118
  const messagesDir = path222.join(subagentDir, "messages");
28488
- await mkdir9(messagesDir, { recursive: true });
29119
+ await mkdir10(messagesDir, { recursive: true });
28489
29120
  const reqFile = path222.join(messagesDir, `${timestamp}_req.md`);
28490
- await writeFile3(reqFile, requestInstructions, { encoding: "utf8" });
29121
+ await writeFile4(reqFile, requestInstructions, { encoding: "utf8" });
28491
29122
  const reqUri = pathToFileUri2(reqFile);
28492
29123
  const chatArgs = ["-r", "chat", "-m", chatId];
28493
29124
  for (const attachment of attachmentPaths) {
@@ -28513,7 +29144,7 @@ async function launchVsCodeWithChat(subagentDir, chatId, attachmentPaths, reques
28513
29144
  async function launchVsCodeWithBatchChat(subagentDir, chatId, attachmentPaths, chatInstruction, vscodeCmd) {
28514
29145
  const workspacePath = path222.join(subagentDir, `${path222.basename(subagentDir)}.code-workspace`);
28515
29146
  const messagesDir = path222.join(subagentDir, "messages");
28516
- await mkdir9(messagesDir, { recursive: true });
29147
+ await mkdir10(messagesDir, { recursive: true });
28517
29148
  const chatArgs = ["-r", "chat", "-m", chatId];
28518
29149
  for (const attachment of attachmentPaths) {
28519
29150
  chatArgs.push("-a", attachment);
@@ -28643,7 +29274,7 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
28643
29274
  if (!stats.isFile()) {
28644
29275
  throw new Error(`workspace template must be a file, not a directory: ${workspaceSrc}`);
28645
29276
  }
28646
- const templateText = await readFile6(workspaceSrc, "utf8");
29277
+ const templateText = await readFile7(workspaceSrc, "utf8");
28647
29278
  workspaceContent = JSON.parse(templateText);
28648
29279
  } else {
28649
29280
  workspaceContent = DEFAULT_WORKSPACE_TEMPLATE;
@@ -28662,9 +29293,9 @@ async function copyAgentConfig(subagentDir, workspaceTemplate, cwd) {
28662
29293
  transformedContent = JSON.stringify(parsed, null, 2);
28663
29294
  }
28664
29295
  }
28665
- await writeFile4(workspaceDst, transformedContent, "utf8");
29296
+ await writeFile5(workspaceDst, transformedContent, "utf8");
28666
29297
  const messagesDir = path24.join(subagentDir, "messages");
28667
- await mkdir10(messagesDir, { recursive: true });
29298
+ await mkdir11(messagesDir, { recursive: true });
28668
29299
  return { workspace: workspaceDst, messagesDir };
28669
29300
  }
28670
29301
  async function createSubagentLock(subagentDir) {
@@ -28687,7 +29318,7 @@ async function createSubagentLock(subagentDir) {
28687
29318
  );
28688
29319
  }
28689
29320
  const lockFile = path24.join(subagentDir, DEFAULT_LOCK_NAME);
28690
- await writeFile4(lockFile, "", { encoding: "utf8" });
29321
+ await writeFile5(lockFile, "", { encoding: "utf8" });
28691
29322
  return lockFile;
28692
29323
  }
28693
29324
  async function removeSubagentLock(subagentDir) {
@@ -28712,7 +29343,7 @@ async function prepareSubagentDirectory(subagentDir, promptFile, chatId, workspa
28712
29343
  }
28713
29344
  if (promptFile) {
28714
29345
  const githubAgentsDir = path24.join(subagentDir, ".github", "agents");
28715
- await mkdir10(githubAgentsDir, { recursive: true });
29346
+ await mkdir11(githubAgentsDir, { recursive: true });
28716
29347
  const agentFile = path24.join(githubAgentsDir, `${chatId}.md`);
28717
29348
  try {
28718
29349
  await copyFile(promptFile, agentFile);
@@ -28971,7 +29602,7 @@ async function dispatchBatchAgent(options) {
28971
29602
  const reqFile = requestFiles[index];
28972
29603
  const tmpFile = responseTmpFiles[index];
28973
29604
  const finalFile = responseFilesFinal[index];
28974
- return writeFile5(
29605
+ return writeFile6(
28975
29606
  reqFile,
28976
29607
  createBatchRequestPrompt(query, tmpFile, finalFile, batchRequestTemplateContent),
28977
29608
  { encoding: "utf8" }
@@ -28983,7 +29614,7 @@ async function dispatchBatchAgent(options) {
28983
29614
  responseFilesFinal,
28984
29615
  orchestratorTemplateContent
28985
29616
  );
28986
- await writeFile5(orchestratorFile, orchestratorContent, { encoding: "utf8" });
29617
+ await writeFile6(orchestratorFile, orchestratorContent, { encoding: "utf8" });
28987
29618
  }
28988
29619
  const chatAttachments = [orchestratorFile, ...attachments];
28989
29620
  const orchestratorUri = pathToFileUri2(orchestratorFile);
@@ -29126,8 +29757,8 @@ async function provisionSubagents(options) {
29126
29757
  if (!dryRun) {
29127
29758
  await removeIfExists(lockFile);
29128
29759
  await ensureDir(githubAgentsDir);
29129
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29130
- await writeFile6(wakeupDst, wakeupContent, "utf8");
29760
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29761
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
29131
29762
  }
29132
29763
  created.push(subagentDir);
29133
29764
  lockedSubagents.delete(subagentDir);
@@ -29137,8 +29768,8 @@ async function provisionSubagents(options) {
29137
29768
  if (!isLocked && force) {
29138
29769
  if (!dryRun) {
29139
29770
  await ensureDir(githubAgentsDir);
29140
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29141
- await writeFile6(wakeupDst, wakeupContent, "utf8");
29771
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29772
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
29142
29773
  }
29143
29774
  created.push(subagentDir);
29144
29775
  subagentsProvisioned += 1;
@@ -29146,8 +29777,8 @@ async function provisionSubagents(options) {
29146
29777
  }
29147
29778
  if (!dryRun && !await pathExists(workspaceDst)) {
29148
29779
  await ensureDir(githubAgentsDir);
29149
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29150
- await writeFile6(wakeupDst, wakeupContent, "utf8");
29780
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29781
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
29151
29782
  }
29152
29783
  skippedExisting.push(subagentDir);
29153
29784
  subagentsProvisioned += 1;
@@ -29162,8 +29793,8 @@ async function provisionSubagents(options) {
29162
29793
  if (!dryRun) {
29163
29794
  await ensureDir(subagentDir);
29164
29795
  await ensureDir(githubAgentsDir);
29165
- await writeFile6(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29166
- await writeFile6(wakeupDst, wakeupContent, "utf8");
29796
+ await writeFile7(workspaceDst, JSON.stringify(workspaceTemplate, null, 2), "utf8");
29797
+ await writeFile7(wakeupDst, wakeupContent, "utf8");
29167
29798
  }
29168
29799
  created.push(subagentDir);
29169
29800
  subagentsProvisioned += 1;
@@ -29523,7 +30154,7 @@ async function readTargetDefinitions(filePath) {
29523
30154
  if (!await fileExists2(absolutePath)) {
29524
30155
  throw new Error(`targets.yaml not found at ${absolutePath}`);
29525
30156
  }
29526
- const raw = await readFile7(absolutePath, "utf8");
30157
+ const raw = await readFile8(absolutePath, "utf8");
29527
30158
  const parsed = parseYamlValue(raw);
29528
30159
  if (!isRecord(parsed)) {
29529
30160
  throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -29701,6 +30332,7 @@ async function executePromptTemplate(script, context, config2, timeoutMs) {
29701
30332
  output: context.output ?? null,
29702
30333
  inputFiles: context.evalCase.file_paths,
29703
30334
  input: context.evalCase.input,
30335
+ metadata: context.evalCase.metadata ?? null,
29704
30336
  trace: context.trace ?? null,
29705
30337
  fileChanges: context.fileChanges ?? null,
29706
30338
  workspacePath: context.workspacePath ?? null,
@@ -30236,7 +30868,7 @@ function getWorkspacePath(evalRunId, caseId, workspaceRoot) {
30236
30868
  return path33.join(root, evalRunId, caseId);
30237
30869
  }
30238
30870
  async function copyDirectoryRecursive(src, dest) {
30239
- await mkdir12(dest, { recursive: true });
30871
+ await mkdir13(dest, { recursive: true });
30240
30872
  const entries = await readdir5(src, { withFileTypes: true });
30241
30873
  for (const entry of entries) {
30242
30874
  const srcPath = path33.join(src, entry.name);
@@ -30357,7 +30989,7 @@ function computeWorkspaceFingerprint(repos) {
30357
30989
  return createHash("sha256").update(JSON.stringify(canonical)).digest("hex");
30358
30990
  }
30359
30991
  async function copyDirectoryRecursive2(src, dest, skipDirs) {
30360
- await mkdir13(dest, { recursive: true });
30992
+ await mkdir14(dest, { recursive: true });
30361
30993
  const entries = await readdir6(src, { withFileTypes: true });
30362
30994
  for (const entry of entries) {
30363
30995
  const srcPath = path34.join(src, entry.name);
@@ -30395,7 +31027,7 @@ var WorkspacePoolManager = class {
30395
31027
  const { templatePath, repos, maxSlots, repoManager, poolReset } = options;
30396
31028
  const fingerprint = computeWorkspaceFingerprint(repos);
30397
31029
  const poolDir = path34.join(this.poolRoot, fingerprint);
30398
- await mkdir13(poolDir, { recursive: true });
31030
+ await mkdir14(poolDir, { recursive: true });
30399
31031
  const drifted = await this.checkDrift(poolDir, fingerprint);
30400
31032
  if (drifted) {
30401
31033
  console.warn(
@@ -30422,7 +31054,7 @@ var WorkspacePoolManager = class {
30422
31054
  poolDir
30423
31055
  };
30424
31056
  }
30425
- await mkdir13(slotPath, { recursive: true });
31057
+ await mkdir14(slotPath, { recursive: true });
30426
31058
  if (templatePath) {
30427
31059
  await copyDirectoryRecursive2(templatePath, slotPath);
30428
31060
  }
@@ -30459,14 +31091,14 @@ var WorkspacePoolManager = class {
30459
31091
  async tryLock(lockPath) {
30460
31092
  for (let attempt = 0; attempt < 3; attempt++) {
30461
31093
  try {
30462
- await writeFile7(lockPath, String(process.pid), { flag: "wx" });
31094
+ await writeFile8(lockPath, String(process.pid), { flag: "wx" });
30463
31095
  return true;
30464
31096
  } catch (err) {
30465
31097
  if (err.code !== "EEXIST") {
30466
31098
  throw err;
30467
31099
  }
30468
31100
  try {
30469
- const pidStr = await readFile8(lockPath, "utf-8");
31101
+ const pidStr = await readFile9(lockPath, "utf-8");
30470
31102
  const pid = Number.parseInt(pidStr.trim(), 10);
30471
31103
  if (!Number.isNaN(pid)) {
30472
31104
  try {
@@ -30493,7 +31125,7 @@ var WorkspacePoolManager = class {
30493
31125
  async checkDrift(poolDir, fingerprint) {
30494
31126
  const metadataPath = path34.join(poolDir, "metadata.json");
30495
31127
  try {
30496
- const raw = await readFile8(metadataPath, "utf-8");
31128
+ const raw = await readFile9(metadataPath, "utf-8");
30497
31129
  const metadata = JSON.parse(raw);
30498
31130
  return metadata.fingerprint !== fingerprint;
30499
31131
  } catch {
@@ -30508,7 +31140,7 @@ var WorkspacePoolManager = class {
30508
31140
  repos,
30509
31141
  createdAt: (/* @__PURE__ */ new Date()).toISOString()
30510
31142
  };
30511
- await writeFile7(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
31143
+ await writeFile8(path34.join(poolDir, "metadata.json"), JSON.stringify(metadata, null, 2));
30512
31144
  }
30513
31145
  /** Remove all slot directories and their lock files from a pool directory. */
30514
31146
  async removeAllSlots(poolDir) {
@@ -30518,7 +31150,7 @@ var WorkspacePoolManager = class {
30518
31150
  const lockPath = path34.join(poolDir, `${entry}.lock`);
30519
31151
  if (existsSync3(lockPath)) {
30520
31152
  try {
30521
- const pidStr = await readFile8(lockPath, "utf-8");
31153
+ const pidStr = await readFile9(lockPath, "utf-8");
30522
31154
  const pid = Number.parseInt(pidStr.trim(), 10);
30523
31155
  if (!Number.isNaN(pid)) {
30524
31156
  try {
@@ -30936,7 +31568,7 @@ function isAgentSkillsFormat(parsed) {
30936
31568
  return Array.isArray(obj.evals);
30937
31569
  }
30938
31570
  async function loadTestsFromAgentSkills(filePath) {
30939
- const raw = await readFile9(filePath, "utf8");
31571
+ const raw = await readFile10(filePath, "utf8");
30940
31572
  let parsed;
30941
31573
  try {
30942
31574
  parsed = JSON.parse(raw);
@@ -31105,20 +31737,22 @@ var DEFAULT_EVAL_PATTERNS = [
31105
31737
  ];
31106
31738
  async function loadConfig(evalFilePath, repoRoot) {
31107
31739
  const directories = buildDirectoryChain2(evalFilePath, repoRoot);
31740
+ const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
31108
31741
  for (const directory of directories) {
31109
31742
  const configPath = path39.join(directory, ".agentv", "config.yaml");
31110
31743
  if (!await fileExists3(configPath)) {
31111
31744
  continue;
31112
31745
  }
31113
31746
  const config2 = await readConfigFile(configPath);
31114
- if (config2) return config2;
31747
+ if (config2) {
31748
+ return config2;
31749
+ }
31115
31750
  }
31116
- const globalConfigPath = path39.join(getAgentvConfigDir(), "config.yaml");
31117
31751
  return await fileExists3(globalConfigPath) ? readConfigFile(globalConfigPath) : null;
31118
31752
  }
31119
31753
  async function readConfigFile(configPath) {
31120
31754
  try {
31121
- const rawConfig = await readFile10(configPath, "utf8");
31755
+ const rawConfig = await readFile11(configPath, "utf8");
31122
31756
  const parsed = interpolateEnv(parseYamlValue(rawConfig), process.env);
31123
31757
  if (!isJsonObject(parsed)) {
31124
31758
  logWarning(`Invalid config.yaml format at ${configPath}`);
@@ -31331,7 +31965,10 @@ function extractCacheConfig(suite) {
31331
31965
  logWarning(`Invalid execution.cache: ${cache}. Must be a boolean. Ignoring.`);
31332
31966
  return void 0;
31333
31967
  }
31334
- const cachePath = executionObj.cache_path ?? executionObj.cachePath;
31968
+ if (executionObj.cachePath !== void 0) {
31969
+ logWarning("Invalid execution.cachePath: use snake_case execution.cache_path in YAML.");
31970
+ }
31971
+ const cachePath = executionObj.cache_path;
31335
31972
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
31336
31973
  return { enabled: cache, cachePath: resolvedCachePath };
31337
31974
  }
@@ -31500,6 +32137,12 @@ function parseResultsConfig(raw, configPath) {
31500
32137
  ...branchPrefix && { branch_prefix: branchPrefix }
31501
32138
  };
31502
32139
  }
32140
+ function resolveResultsConfigForProject(config2, _projectId) {
32141
+ if (!config2) {
32142
+ return void 0;
32143
+ }
32144
+ return config2.results;
32145
+ }
31503
32146
  function parseHooksConfig(raw, configPath) {
31504
32147
  if (raw === void 0 || raw === null) {
31505
32148
  return void 0;
@@ -31525,7 +32168,7 @@ function logWarning(message) {
31525
32168
  var ANSI_YELLOW3 = "\x1B[33m";
31526
32169
  var ANSI_RESET4 = "\x1B[0m";
31527
32170
  async function validateCustomPromptContent(promptPath) {
31528
- const content = await readFile11(promptPath, "utf8");
32171
+ const content = await readFile12(promptPath, "utf8");
31529
32172
  validateTemplateVariables(content, promptPath);
31530
32173
  }
31531
32174
  function validateTemplateVariables(content, source) {
@@ -31655,7 +32298,7 @@ ${resolved.attempted.map((attempt) => ` Tried: ${attempt}`).join("\n")}` : "";
31655
32298
  const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
31656
32299
  throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
31657
32300
  }
31658
- const content = await readFile12(resolved.resolvedPath, "utf8");
32301
+ const content = await readFile13(resolved.resolvedPath, "utf8");
31659
32302
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
31660
32303
  if (!isJsonObject2(parsed)) {
31661
32304
  throw new Error(
@@ -31702,6 +32345,103 @@ async function expandGraderEntries(candidateEvaluators, searchRoots, evalId, inc
31702
32345
  }
31703
32346
  return expanded;
31704
32347
  }
32348
+ async function collectAssertionTemplateSourceReferences(rawEvalCase, globalExecution, searchRoots, evalId) {
32349
+ const execution = rawEvalCase.execution;
32350
+ const executionObject = isJsonObject2(execution) ? execution : void 0;
32351
+ const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? (executionObject ? executionObject.evaluators : void 0) ?? rawEvalCase.evaluators;
32352
+ const skipDefaults = executionObject?.skip_defaults === true;
32353
+ const rootEvaluators = skipDefaults ? void 0 : globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators;
32354
+ return [
32355
+ ...await collectAssertionTemplateReferencesFromValue(caseEvaluators, searchRoots, evalId),
32356
+ ...await collectAssertionTemplateReferencesFromValue(rootEvaluators, searchRoots, evalId)
32357
+ ];
32358
+ }
32359
+ async function collectAssertionTemplateReferencesFromValue(value, searchRoots, evalId, includeContext = { depth: 0, chain: [] }) {
32360
+ if (value === void 0) {
32361
+ return [];
32362
+ }
32363
+ const references = [];
32364
+ if (Array.isArray(value)) {
32365
+ for (const item of value) {
32366
+ if (isIncludeEntry(item)) {
32367
+ const nextDepth = includeContext.depth + 1;
32368
+ if (nextDepth > MAX_ASSERTION_INCLUDE_DEPTH) {
32369
+ const chain = [...includeContext.chain, item.include].join(" -> ");
32370
+ throw new Error(
32371
+ `Assertion template include depth exceeded ${MAX_ASSERTION_INCLUDE_DEPTH} in '${evalId}'. Include chain: ${chain}`
32372
+ );
32373
+ }
32374
+ const resolved = await resolveAssertionTemplateReference(item.include, searchRoots);
32375
+ references.push({
32376
+ kind: "assertion_template",
32377
+ displayPath: resolved.displayPath,
32378
+ ...resolved.resolvedPath ? { resolvedPath: path40.resolve(resolved.resolvedPath) } : {}
32379
+ });
32380
+ if (resolved.resolvedPath) {
32381
+ if (includeContext.chain.includes(resolved.resolvedPath)) {
32382
+ const cycle = [...includeContext.chain, resolved.resolvedPath].join(" -> ");
32383
+ throw new Error(`Assertion template cycle detected in '${evalId}': ${cycle}`);
32384
+ }
32385
+ const content = await readFile13(resolved.resolvedPath, "utf8");
32386
+ const parsed = interpolateEnv(parseYamlValue(content), process.env);
32387
+ if (isJsonObject2(parsed) && Array.isArray(parsed.assertions)) {
32388
+ const templateDir = path40.dirname(resolved.resolvedPath);
32389
+ const nestedSearchRoots = [
32390
+ templateDir,
32391
+ ...searchRoots.filter((root) => path40.resolve(root) !== templateDir)
32392
+ ];
32393
+ references.push(
32394
+ ...await collectAssertionTemplateReferencesFromValue(
32395
+ parsed.assertions,
32396
+ nestedSearchRoots,
32397
+ evalId,
32398
+ {
32399
+ depth: nextDepth,
32400
+ chain: [...includeContext.chain, resolved.resolvedPath]
32401
+ }
32402
+ )
32403
+ );
32404
+ }
32405
+ }
32406
+ continue;
32407
+ }
32408
+ if (isJsonObject2(item)) {
32409
+ references.push(
32410
+ ...await collectAssertionTemplateReferencesFromObject(
32411
+ item,
32412
+ searchRoots,
32413
+ evalId,
32414
+ includeContext
32415
+ )
32416
+ );
32417
+ }
32418
+ }
32419
+ } else if (isJsonObject2(value)) {
32420
+ references.push(
32421
+ ...await collectAssertionTemplateReferencesFromObject(
32422
+ value,
32423
+ searchRoots,
32424
+ evalId,
32425
+ includeContext
32426
+ )
32427
+ );
32428
+ }
32429
+ return references;
32430
+ }
32431
+ async function collectAssertionTemplateReferencesFromObject(value, searchRoots, evalId, includeContext) {
32432
+ const references = [];
32433
+ for (const key of ["assertions", "assert", "evaluators"]) {
32434
+ references.push(
32435
+ ...await collectAssertionTemplateReferencesFromValue(
32436
+ value[key],
32437
+ searchRoots,
32438
+ evalId,
32439
+ includeContext
32440
+ )
32441
+ );
32442
+ }
32443
+ return references;
32444
+ }
31705
32445
  async function parseGraderList(candidateEvaluators, searchRoots, evalId, defaultPreprocessors) {
31706
32446
  const expandedEvaluators = await expandGraderEntries(candidateEvaluators, searchRoots, evalId);
31707
32447
  if (!expandedEvaluators) {
@@ -31828,6 +32568,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
31828
32568
  continue;
31829
32569
  }
31830
32570
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
32571
+ const resolvedScriptPath = await resolveOptionalCommandSource(command, searchRoots);
31831
32572
  const cwd = asString(rawEvaluator.cwd);
31832
32573
  let resolvedCwd;
31833
32574
  if (cwd) {
@@ -31893,6 +32634,7 @@ async function parseGraderList(candidateEvaluators, searchRoots, evalId, default
31893
32634
  name,
31894
32635
  type: "code-grader",
31895
32636
  command,
32637
+ ...resolvedScriptPath ? { resolvedScriptPath } : {},
31896
32638
  cwd,
31897
32639
  resolvedCwd,
31898
32640
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -32960,6 +33702,17 @@ function asStringArray(value, description) {
32960
33702
  }
32961
33703
  return result;
32962
33704
  }
33705
+ async function resolveOptionalCommandSource(command, searchRoots) {
33706
+ const candidate = command.at(-1);
33707
+ if (!candidate || !looksLikeFilePath(candidate)) {
33708
+ return void 0;
33709
+ }
33710
+ const resolved = await resolveFileReference3(candidate, searchRoots);
33711
+ return resolved.resolvedPath ? path40.resolve(resolved.resolvedPath) : void 0;
33712
+ }
33713
+ function looksLikeFilePath(value) {
33714
+ return path40.isAbsolute(value) || value.startsWith(".") || value.includes("/") || value.includes("\\") || /\.[cm]?[jt]sx?$|\.py$|\.sh$|\.bash$|\.rb$|\.go$|\.rs$/i.test(value);
33715
+ }
32963
33716
  function parseCommandToArgv(command) {
32964
33717
  if (process.platform === "win32") {
32965
33718
  return ["cmd.exe", "/c", command];
@@ -33028,6 +33781,19 @@ var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average",
33028
33781
  function isValidFieldAggregationType(value) {
33029
33782
  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
33030
33783
  }
33784
+ var VALID_RUBRIC_OPERATORS = new Set(RUBRIC_OPERATOR_VALUES);
33785
+ function parseRubricOperator(value, rubricId, evaluatorName, evalId) {
33786
+ if (value === void 0) {
33787
+ return void 0;
33788
+ }
33789
+ if (typeof value === "string" && VALID_RUBRIC_OPERATORS.has(value)) {
33790
+ return value;
33791
+ }
33792
+ logWarning2(
33793
+ `Ignoring invalid operator for rubric '${rubricId}' in evaluator '${evaluatorName}' in '${evalId}': must be one of ${RUBRIC_OPERATOR_VALUES.join(", ")}`
33794
+ );
33795
+ return void 0;
33796
+ }
33031
33797
  function parseRubricItems(rawRubrics, evaluatorName, evalId) {
33032
33798
  const items = [];
33033
33799
  for (const [index, rawRubric] of rawRubrics.entries()) {
@@ -33038,7 +33804,8 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
33038
33804
  continue;
33039
33805
  }
33040
33806
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
33041
- const expectedOutcome = asString(rawRubric.outcome) ?? "";
33807
+ const expectedOutcome = asString(rawRubric.outcome) ?? asString(rawRubric.criteria) ?? "";
33808
+ const operator = parseRubricOperator(rawRubric.operator, id, evaluatorName, evalId);
33042
33809
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
33043
33810
  let minScore;
33044
33811
  let requiredMinScore;
@@ -33082,6 +33849,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
33082
33849
  id,
33083
33850
  weight,
33084
33851
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
33852
+ ...operator !== void 0 ? { operator } : {},
33085
33853
  ...required2 !== void 0 ? { required: required2 } : {},
33086
33854
  ...minScore !== void 0 ? { min_score: minScore } : {},
33087
33855
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
@@ -33097,6 +33865,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
33097
33865
  items.push({
33098
33866
  id,
33099
33867
  outcome: expectedOutcome,
33868
+ ...operator !== void 0 ? { operator } : {},
33100
33869
  weight,
33101
33870
  // Default to required: true if not specified (backward compatibility)
33102
33871
  required: required2 ?? true,
@@ -33219,6 +33988,8 @@ function parseInlineRubrics(rawRubrics) {
33219
33988
  };
33220
33989
  }
33221
33990
  const expectedOutcome = asString(rubric.outcome) ?? "";
33991
+ const id = asString(rubric.id) ?? `rubric-${index + 1}`;
33992
+ const operator = parseRubricOperator(rubric.operator, id, "rubrics", "<inline>");
33222
33993
  const rawScoreRanges = rubric.score_ranges;
33223
33994
  const normalizedScoreRanges = rawScoreRanges !== void 0 ? normalizeScoreRangesShorthand(rawScoreRanges) : void 0;
33224
33995
  const scoreRanges = Array.isArray(normalizedScoreRanges) && normalizedScoreRanges.length > 0 ? normalizedScoreRanges.filter((r) => isJsonObject2(r)).map((range) => ({
@@ -33226,7 +33997,8 @@ function parseInlineRubrics(rawRubrics) {
33226
33997
  outcome: asString(range.outcome) ?? ""
33227
33998
  })).filter((r) => r.outcome.length > 0) : void 0;
33228
33999
  const baseRubric = {
33229
- id: asString(rubric.id) ?? `rubric-${index + 1}`,
34000
+ id,
34001
+ ...operator !== void 0 ? { operator } : {},
33230
34002
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
33231
34003
  };
33232
34004
  let inlineMinScore;
@@ -33386,7 +34158,7 @@ async function processMessages(options) {
33386
34158
  continue;
33387
34159
  }
33388
34160
  try {
33389
- const fileContent = (await readFile13(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
34161
+ const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
33390
34162
  processedContent.push({
33391
34163
  ...cloneJsonObject(rawSegment),
33392
34164
  path: displayPath,
@@ -33427,7 +34199,7 @@ async function processMessages(options) {
33427
34199
  continue;
33428
34200
  }
33429
34201
  try {
33430
- const imageBuffer = await readFile13(resolvedPath);
34202
+ const imageBuffer = await readFile14(resolvedPath);
33431
34203
  const base643 = imageBuffer.toString("base64");
33432
34204
  processedContent.push({
33433
34205
  type: "image",
@@ -33510,7 +34282,7 @@ async function processExpectedMessages(options) {
33510
34282
  continue;
33511
34283
  }
33512
34284
  try {
33513
- const fileContent = (await readFile13(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
34285
+ const fileContent = (await readFile14(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
33514
34286
  processedContent.push({
33515
34287
  type: "file",
33516
34288
  path: displayPath,
@@ -33550,7 +34322,7 @@ async function processExpectedMessages(options) {
33550
34322
  continue;
33551
34323
  }
33552
34324
  try {
33553
- const imageBuffer = await readFile13(resolvedPath);
34325
+ const imageBuffer = await readFile14(resolvedPath);
33554
34326
  const base643 = imageBuffer.toString("base64");
33555
34327
  processedContent.push({
33556
34328
  type: "image",
@@ -33590,6 +34362,12 @@ function expandInputShorthand(value) {
33590
34362
  if (typeof value === "string") {
33591
34363
  return [{ role: "user", content: value }];
33592
34364
  }
34365
+ if (isJsonObject(value)) {
34366
+ if ("role" in value) {
34367
+ return isTestMessage(value) ? [value] : void 0;
34368
+ }
34369
+ return [{ role: "user", content: value }];
34370
+ }
33593
34371
  if (Array.isArray(value)) {
33594
34372
  const messages = value.filter((msg) => isTestMessage(msg));
33595
34373
  return messages.length > 0 ? messages : void 0;
@@ -33675,7 +34453,7 @@ async function loadSidecarMetadata(jsonlPath, verbose) {
33675
34453
  return {};
33676
34454
  }
33677
34455
  try {
33678
- const content = await readFile14(sidecarPath, "utf8");
34456
+ const content = await readFile15(sidecarPath, "utf8");
33679
34457
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
33680
34458
  if (!isJsonObject(parsed)) {
33681
34459
  logWarning4(`Invalid sidecar metadata format in ${sidecarPath}`);
@@ -33720,7 +34498,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
33720
34498
  const repoRootPath = resolveToAbsolutePath(repoRoot);
33721
34499
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
33722
34500
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
33723
- const rawFile = await readFile14(absoluteTestPath, "utf8");
34501
+ const rawFile = await readFile15(absoluteTestPath, "utf8");
33724
34502
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
33725
34503
  const fallbackSuiteName = path422.basename(absoluteTestPath, ".jsonl") || "eval";
33726
34504
  const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
@@ -34129,7 +34907,7 @@ function interpolateRawEvalCase(raw, vars) {
34129
34907
  async function readTestSuiteMetadata(testFilePath) {
34130
34908
  try {
34131
34909
  const absolutePath = path43.resolve(testFilePath);
34132
- const content = await readFile15(absolutePath, "utf8");
34910
+ const content = await readFile16(absolutePath, "utf8");
34133
34911
  const parsed = interpolateEnv(parseYamlValue(content), process.env);
34134
34912
  if (!isJsonObject(parsed)) {
34135
34913
  return {};
@@ -34153,7 +34931,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
34153
34931
  return { tests: await loadTestsFromAgentSkills(evalFilePath) };
34154
34932
  }
34155
34933
  if (format === "typescript") {
34156
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js");
34934
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT-THE7D3GR.js");
34157
34935
  return loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
34158
34936
  }
34159
34937
  const { tests, parsed, suiteWorkspacePath } = await loadTestsFromYaml(
@@ -34188,7 +34966,7 @@ async function loadTests(evalFilePath, repoRoot, options) {
34188
34966
  return loadTestsFromAgentSkills(evalFilePath);
34189
34967
  }
34190
34968
  if (format === "typescript") {
34191
- const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-Z6IUSDNA-YBOE4JIQ.js");
34969
+ const { loadTsEvalSuite: loadTsEvalSuite2 } = await import("./ts-eval-loader-EQJX3OLT-THE7D3GR.js");
34192
34970
  const suite = await loadTsEvalSuite2(evalFilePath, resolveToAbsolutePath(repoRoot), options);
34193
34971
  return suite.tests;
34194
34972
  }
@@ -34203,8 +34981,10 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
34203
34981
  const repoRootPath = resolveToAbsolutePath(repoRoot);
34204
34982
  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
34205
34983
  const config2 = await loadConfig(absoluteTestPath, repoRootPath);
34206
- const rawFile = await readFile15(absoluteTestPath, "utf8");
34207
- const interpolated = interpolateEnv(parseYamlValue(rawFile), process.env);
34984
+ const rawFile = await readFile16(absoluteTestPath, "utf8");
34985
+ const rawParsed = parseYamlValue(rawFile);
34986
+ const rawCaseSnapshots = buildRawInlineTestSnapshots(rawParsed);
34987
+ const interpolated = interpolateEnv(rawParsed, process.env);
34208
34988
  if (!isJsonObject(interpolated)) {
34209
34989
  throw new Error(`Invalid test file format: ${evalFilePath}`);
34210
34990
  }
@@ -34241,7 +35021,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
34241
35021
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
34242
35022
  }
34243
35023
  const suiteWorkspace = await resolveWorkspaceConfig(suite.workspace, evalFileDir);
34244
- const suiteGovernance = extractSuiteGovernance(suite);
35024
+ const suiteMetadataPayload = extractSuiteMetadataPayload(suite);
34245
35025
  const rawSuiteInput = suite.input;
34246
35026
  const rawSuiteInputFiles = suite.input_files;
34247
35027
  const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
@@ -34343,6 +35123,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
34343
35123
  logError3(`Skipping test '${id}': ${message}`);
34344
35124
  continue;
34345
35125
  }
35126
+ const assertionTemplateReferences = await collectAssertionTemplateSourceReferences(
35127
+ renderedCase,
35128
+ globalExecution,
35129
+ searchRoots,
35130
+ id ?? "unknown"
35131
+ );
34346
35132
  const inlineRubrics = renderedCase.rubrics;
34347
35133
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
34348
35134
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
@@ -34355,8 +35141,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
34355
35141
  const caseWorkspace = await resolveWorkspaceConfig(renderedCase.workspace, evalFileDir);
34356
35142
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
34357
35143
  const rawCaseMetadata = isJsonObject(renderedCase.metadata) ? renderedCase.metadata : void 0;
34358
- const suitePayload = suiteGovernance !== void 0 ? { governance: suiteGovernance } : void 0;
34359
- const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suitePayload);
35144
+ const metadata = mergeSuiteMetadataPayload(rawCaseMetadata, suiteMetadataPayload);
34360
35145
  const caseTargets = extractTargetsFromTestCase(renderedCase);
34361
35146
  const dependsOn = Array.isArray(renderedCase.depends_on) ? renderedCase.depends_on.filter(
34362
35147
  (v) => typeof v === "string"
@@ -34395,12 +35180,245 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
34395
35180
  ...onTurnFailure ? { on_turn_failure: onTurnFailure } : {},
34396
35181
  ...windowSize !== void 0 ? { window_size: windowSize } : {},
34397
35182
  ...dependsOn && dependsOn.length > 0 ? { depends_on: dependsOn } : {},
34398
- ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {}
35183
+ ...onDependencyFailure ? { on_dependency_failure: onDependencyFailure } : {},
35184
+ source: buildEvalTestSource({
35185
+ evalFilePath,
35186
+ absoluteTestPath,
35187
+ repoRootPath,
35188
+ id,
35189
+ renderedCase,
35190
+ rawCaseSnapshots,
35191
+ inputMessages,
35192
+ evaluators,
35193
+ assertionTemplateReferences
35194
+ })
34399
35195
  };
34400
35196
  results.push(testCase);
34401
35197
  }
34402
35198
  return { tests: results, parsed: suite, suiteWorkspacePath: suiteWorkspace?.path };
34403
35199
  }
35200
+ var SOURCE_SECRET_KEY_PATTERN = /(api[_-]?key|authorization|bearer|credential|password|private[_-]?key|secret|token)/i;
35201
+ var REDACTED_SOURCE_VALUE = "[redacted]";
35202
+ function buildRawInlineTestSnapshots(rawParsed) {
35203
+ const snapshots = /* @__PURE__ */ new Map();
35204
+ if (!isJsonObject(rawParsed)) {
35205
+ return snapshots;
35206
+ }
35207
+ const rawTests = rawParsed.tests ?? rawParsed.eval_cases ?? rawParsed.evalcases;
35208
+ if (!Array.isArray(rawTests)) {
35209
+ return snapshots;
35210
+ }
35211
+ for (const rawTest of rawTests) {
35212
+ if (!isJsonObject(rawTest) || typeof rawTest.id !== "string") {
35213
+ continue;
35214
+ }
35215
+ snapshots.set(rawTest.id, stringifySourceYaml(rawTest));
35216
+ }
35217
+ return snapshots;
35218
+ }
35219
+ function buildEvalTestSource(params) {
35220
+ const evalFileRepoPath = toPortableRelativePath(params.repoRootPath, params.absoluteTestPath);
35221
+ const testSnapshotYaml = params.rawCaseSnapshots.get(params.id) ?? stringifySourceYaml(params.renderedCase);
35222
+ const evaluatorReferences = collectGraderSourceReferences(params.evaluators);
35223
+ const inputReferences = collectInputSourceReferences(params.inputMessages);
35224
+ const references = dedupeSourceReferences([
35225
+ ...inputReferences,
35226
+ ...evaluatorReferences,
35227
+ ...params.assertionTemplateReferences
35228
+ ]);
35229
+ return {
35230
+ evalFilePath: params.evalFilePath,
35231
+ evalFileAbsolutePath: params.absoluteTestPath,
35232
+ ...evalFileRepoPath ? { evalFileRepoPath } : {},
35233
+ testId: params.id,
35234
+ testSnapshotYaml,
35235
+ graderDefinitions: buildGraderSourceDefinitions(params.evaluators),
35236
+ references
35237
+ };
35238
+ }
35239
+ function stringifySourceYaml(value) {
35240
+ return stringifyYaml(sanitizeSourceValue(value), { lineWidth: 0 }).trimEnd();
35241
+ }
35242
+ function sanitizeSourceValue(value, keyHint) {
35243
+ if (keyHint && SOURCE_SECRET_KEY_PATTERN.test(keyHint)) {
35244
+ return REDACTED_SOURCE_VALUE;
35245
+ }
35246
+ if (value === null || typeof value === "string" || typeof value === "number") {
35247
+ return value;
35248
+ }
35249
+ if (typeof value === "boolean") {
35250
+ return value;
35251
+ }
35252
+ if (Array.isArray(value)) {
35253
+ return value.map((item) => sanitizeSourceValue(item));
35254
+ }
35255
+ if (typeof value === "object" && value !== null) {
35256
+ const entries = Object.entries(value).map(([key, entryValue]) => [
35257
+ key,
35258
+ sanitizeSourceValue(entryValue, key)
35259
+ ]);
35260
+ return Object.fromEntries(entries);
35261
+ }
35262
+ return String(value);
35263
+ }
35264
+ function buildGraderSourceDefinitions(evaluators) {
35265
+ return (evaluators ?? []).map((evaluator) => ({
35266
+ name: evaluator.name,
35267
+ type: evaluator.type,
35268
+ ...evaluator.weight !== void 0 ? { weight: evaluator.weight } : {},
35269
+ ...evaluator.required !== void 0 ? { required: evaluator.required } : {},
35270
+ ..."min_score" in evaluator && evaluator.min_score !== void 0 ? { minScore: evaluator.min_score } : {},
35271
+ definition: sanitizeGraderDefinition(evaluator)
35272
+ }));
35273
+ }
35274
+ function sanitizeGraderDefinition(evaluator) {
35275
+ const copy = sanitizeSourceValue(evaluator);
35276
+ return stripRuntimeResolutionFields(copy);
35277
+ }
35278
+ function stripRuntimeResolutionFields(value) {
35279
+ const stripped = {};
35280
+ for (const [key, entryValue] of Object.entries(value)) {
35281
+ if (key === "resolvedPromptPath" || key === "promptPath" || key === "resolvedPromptScript" || key === "resolvedScriptPath" || key === "resolvedCwd" || key === "resolvedCommand") {
35282
+ continue;
35283
+ }
35284
+ if (Array.isArray(entryValue)) {
35285
+ stripped[key] = entryValue.map(
35286
+ (item) => isJsonObject(item) ? stripRuntimeResolutionFields(item) : item
35287
+ );
35288
+ } else if (isJsonObject(entryValue)) {
35289
+ stripped[key] = stripRuntimeResolutionFields(entryValue);
35290
+ } else {
35291
+ stripped[key] = entryValue;
35292
+ }
35293
+ }
35294
+ return stripped;
35295
+ }
35296
+ function collectInputSourceReferences(inputMessages) {
35297
+ const references = [];
35298
+ for (const message of inputMessages) {
35299
+ if (!Array.isArray(message.content)) {
35300
+ continue;
35301
+ }
35302
+ for (const segment of message.content) {
35303
+ if (!isJsonObject(segment) || segment.type !== "file") {
35304
+ continue;
35305
+ }
35306
+ const displayPath = typeof segment.path === "string" ? segment.path : typeof segment.value === "string" ? segment.value : "input file";
35307
+ references.push({
35308
+ kind: "input_file",
35309
+ displayPath,
35310
+ ...typeof segment.resolvedPath === "string" ? { resolvedPath: path43.resolve(segment.resolvedPath) } : {}
35311
+ });
35312
+ }
35313
+ }
35314
+ return references;
35315
+ }
35316
+ function collectGraderSourceReferences(evaluators) {
35317
+ const references = [];
35318
+ for (const evaluator of evaluators ?? []) {
35319
+ references.push(...collectSingleGraderSourceReferences(evaluator));
35320
+ }
35321
+ return references;
35322
+ }
35323
+ function collectSingleGraderSourceReferences(evaluator) {
35324
+ const references = [];
35325
+ if (evaluator.type === "code-grader") {
35326
+ const command = evaluator.command ?? evaluator.script ?? [];
35327
+ references.push({
35328
+ kind: "code_grader_command",
35329
+ displayPath: evaluator.resolvedScriptPath ?? command.join(" "),
35330
+ ...evaluator.resolvedScriptPath ? { resolvedPath: evaluator.resolvedScriptPath } : {},
35331
+ graderName: evaluator.name,
35332
+ command
35333
+ });
35334
+ if (evaluator.resolvedCwd) {
35335
+ references.push({
35336
+ kind: "code_grader_cwd",
35337
+ displayPath: evaluator.cwd ?? evaluator.resolvedCwd,
35338
+ resolvedPath: evaluator.resolvedCwd,
35339
+ graderName: evaluator.name
35340
+ });
35341
+ }
35342
+ }
35343
+ if (evaluator.type === "llm-grader") {
35344
+ const promptPath = evaluator.resolvedPromptPath ?? evaluator.promptPath;
35345
+ if (promptPath) {
35346
+ references.push({
35347
+ kind: "llm_grader_prompt",
35348
+ displayPath: typeof evaluator.prompt === "string" ? evaluator.prompt : promptPath,
35349
+ resolvedPath: promptPath,
35350
+ graderName: evaluator.name
35351
+ });
35352
+ }
35353
+ if (evaluator.resolvedPromptScript && evaluator.resolvedPromptScript.length > 0) {
35354
+ references.push({
35355
+ kind: "prompt_script",
35356
+ displayPath: evaluator.resolvedPromptScript.at(-1) ?? evaluator.name,
35357
+ resolvedPath: evaluator.resolvedPromptScript.at(-1),
35358
+ graderName: evaluator.name,
35359
+ command: evaluator.resolvedPromptScript
35360
+ });
35361
+ }
35362
+ }
35363
+ const preprocessors = "preprocessors" in evaluator ? evaluator.preprocessors : void 0;
35364
+ for (const preprocessor of preprocessors ?? []) {
35365
+ if (preprocessor.resolvedCommand && preprocessor.resolvedCommand.length > 0) {
35366
+ references.push({
35367
+ kind: "preprocessor_command",
35368
+ displayPath: preprocessor.resolvedCommand.at(-1) ?? preprocessor.type,
35369
+ resolvedPath: preprocessor.resolvedCommand.at(-1),
35370
+ graderName: evaluator.name,
35371
+ command: preprocessor.resolvedCommand
35372
+ });
35373
+ }
35374
+ }
35375
+ if (evaluator.type === "composite") {
35376
+ for (const member of evaluator.assertions) {
35377
+ references.push(...collectSingleGraderSourceReferences(member));
35378
+ }
35379
+ if (evaluator.aggregator.type === "code-grader") {
35380
+ references.push({
35381
+ kind: "code_grader_command",
35382
+ displayPath: evaluator.aggregator.path,
35383
+ resolvedPath: path43.resolve(evaluator.aggregator.cwd ?? "", evaluator.aggregator.path),
35384
+ graderName: evaluator.name
35385
+ });
35386
+ } else if (evaluator.aggregator.type === "llm-grader" && evaluator.aggregator.promptPath) {
35387
+ references.push({
35388
+ kind: "llm_grader_prompt",
35389
+ displayPath: evaluator.aggregator.prompt ?? evaluator.aggregator.promptPath,
35390
+ resolvedPath: evaluator.aggregator.promptPath,
35391
+ graderName: evaluator.name
35392
+ });
35393
+ }
35394
+ }
35395
+ return references;
35396
+ }
35397
+ function dedupeSourceReferences(references) {
35398
+ const seen = /* @__PURE__ */ new Set();
35399
+ const deduped = [];
35400
+ for (const reference of references) {
35401
+ const key = JSON.stringify([
35402
+ reference.kind,
35403
+ reference.resolvedPath ?? reference.displayPath,
35404
+ reference.graderName ?? "",
35405
+ reference.command?.join("\0") ?? ""
35406
+ ]);
35407
+ if (seen.has(key)) {
35408
+ continue;
35409
+ }
35410
+ seen.add(key);
35411
+ deduped.push(reference);
35412
+ }
35413
+ return deduped;
35414
+ }
35415
+ function toPortableRelativePath(root, candidate) {
35416
+ const relative = path43.relative(root, candidate);
35417
+ if (relative && !relative.startsWith("..") && !path43.isAbsolute(relative)) {
35418
+ return relative.split(path43.sep).join("/");
35419
+ }
35420
+ return void 0;
35421
+ }
34404
35422
  async function loadTestById(evalFilePath, repoRoot, evalId) {
34405
35423
  const tests = await loadTests(evalFilePath, repoRoot);
34406
35424
  const match = tests.find((c) => c.id === evalId);
@@ -34493,7 +35511,7 @@ async function resolveWorkspaceConfig(raw, evalFileDir) {
34493
35511
  const workspaceFilePath = path43.resolve(evalFileDir, raw);
34494
35512
  let content;
34495
35513
  try {
34496
- content = await readFile15(workspaceFilePath, "utf8");
35514
+ content = await readFile16(workspaceFilePath, "utf8");
34497
35515
  } catch {
34498
35516
  throw new Error(`Workspace file not found: ${raw} (resolved to ${workspaceFilePath})`);
34499
35517
  }
@@ -34617,19 +35635,18 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
34617
35635
  function asString5(value) {
34618
35636
  return typeof value === "string" ? value : void 0;
34619
35637
  }
34620
- function extractSuiteGovernance(suite) {
35638
+ function extractSuiteMetadataPayload(suite) {
35639
+ const payload = isJsonObject(suite.metadata) ? { ...suite.metadata } : {};
34621
35640
  const top = suite.governance;
34622
35641
  if (isJsonObject(top)) {
34623
- return top;
34624
- }
34625
- const wrapper = suite.metadata;
34626
- if (isJsonObject(wrapper)) {
34627
- const nested = wrapper.governance;
35642
+ payload.governance = top;
35643
+ } else {
35644
+ const nested = payload.governance;
34628
35645
  if (isJsonObject(nested)) {
34629
- return nested;
35646
+ payload.governance = nested;
34630
35647
  }
34631
35648
  }
34632
- return void 0;
35649
+ return Object.keys(payload).length > 0 ? payload : void 0;
34633
35650
  }
34634
35651
  function mergeSuiteMetadataPayload(caseMetadata, suitePayload) {
34635
35652
  if (!suitePayload) return caseMetadata;
@@ -35118,7 +36135,7 @@ async function runEvaluation(options) {
35118
36135
  const isEmpty = dirExists ? (await readdir8(configuredStaticPath)).length === 0 : false;
35119
36136
  if (isYamlConfiguredPath && (!dirExists || isEmpty)) {
35120
36137
  if (!dirExists) {
35121
- await mkdir14(configuredStaticPath, { recursive: true });
36138
+ await mkdir15(configuredStaticPath, { recursive: true });
35122
36139
  }
35123
36140
  if (workspaceTemplate) {
35124
36141
  await copyDirectoryRecursive(workspaceTemplate, configuredStaticPath);
@@ -35163,7 +36180,7 @@ async function runEvaluation(options) {
35163
36180
  }
35164
36181
  } else if (!isPerTestIsolation && (suiteWorkspace?.hooks || suiteWorkspace?.repos?.length)) {
35165
36182
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
35166
- await mkdir14(sharedWorkspacePath, { recursive: true });
36183
+ await mkdir15(sharedWorkspacePath, { recursive: true });
35167
36184
  setupLog(`created empty shared workspace at: ${sharedWorkspacePath}`);
35168
36185
  }
35169
36186
  try {
@@ -36013,7 +37030,7 @@ async function runEvalCase(options) {
36013
37030
  }
36014
37031
  if (!workspacePath && (evalCase.workspace?.hooks || evalCase.workspace?.repos?.length) && evalRunId) {
36015
37032
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
36016
- await mkdir14(workspacePath, { recursive: true });
37033
+ await mkdir15(workspacePath, { recursive: true });
36017
37034
  }
36018
37035
  if (evalCase.workspace?.repos?.length && workspacePath) {
36019
37036
  const localPathErrors = RepoManager.validateLocalPaths(evalCase.workspace.repos);
@@ -36068,7 +37085,7 @@ async function runEvalCase(options) {
36068
37085
  const srcPath = path44.resolve(baseDir, relPath);
36069
37086
  const destPath = path44.resolve(workspacePath, relPath);
36070
37087
  try {
36071
- await mkdir14(path44.dirname(destPath), { recursive: true });
37088
+ await mkdir15(path44.dirname(destPath), { recursive: true });
36072
37089
  await copyFile2(srcPath, destPath);
36073
37090
  } catch (error40) {
36074
37091
  const message = error40 instanceof Error ? error40.message : String(error40);
@@ -37632,6 +38649,12 @@ async function evaluate(config2) {
37632
38649
  resolvedTarget = resolveTargetDefinition(targetDef);
37633
38650
  }
37634
38651
  const collectedResults = [];
38652
+ const cacheEnabled = shouldEnableCache({
38653
+ cliCache: config2.cache === true,
38654
+ cliNoCache: false,
38655
+ yamlCache: config2.cache === void 0 ? materialized.cache : void 0
38656
+ });
38657
+ const cache = cacheEnabled ? new ResponseCache(materialized.cachePath ? path45.resolve(materialized.cachePath) : void 0) : void 0;
37635
38658
  const results = await runEvaluation({
37636
38659
  testFilePath,
37637
38660
  repoRoot,
@@ -37644,6 +38667,8 @@ async function evaluate(config2) {
37644
38667
  filter: config2.filter,
37645
38668
  threshold: config2.threshold,
37646
38669
  evalCases: materialized.tests,
38670
+ cache,
38671
+ useCache: !!cache && !shouldSkipCacheForTemperature(resolvedTarget.config),
37647
38672
  ...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
37648
38673
  onResult: async (result) => {
37649
38674
  collectedResults.push(result);
@@ -37674,6 +38699,7 @@ async function materializeEvalConfig(config2, options) {
37674
38699
  tests: tests2,
37675
38700
  workers: config2.workers ?? suite.workers,
37676
38701
  cache: config2.cache ?? suite.cacheConfig?.enabled,
38702
+ cachePath: config2.cachePath ?? suite.cacheConfig?.cachePath,
37677
38703
  budgetUsd: config2.budgetUsd ?? suite.budgetUsd,
37678
38704
  threshold: config2.threshold ?? suite.threshold,
37679
38705
  metadata: config2.metadata ?? suite.metadata,
@@ -37692,6 +38718,7 @@ async function materializeEvalConfig(config2, options) {
37692
38718
  tests,
37693
38719
  workers: config2.workers,
37694
38720
  cache: config2.cache,
38721
+ cachePath: config2.cachePath,
37695
38722
  budgetUsd: config2.budgetUsd,
37696
38723
  threshold: config2.threshold,
37697
38724
  metadata: config2.metadata,
@@ -37809,9 +38836,11 @@ function mapAssertionType(type) {
37809
38836
  }
37810
38837
  function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
37811
38838
  const total = results.length;
38839
+ const qualityResults = results.filter((r) => r.executionStatus !== "execution_error");
38840
+ const executionErrors = total - qualityResults.length;
37812
38841
  let passed = 0;
37813
38842
  let scoreSum = 0;
37814
- for (const r of results) {
38843
+ for (const r of qualityResults) {
37815
38844
  scoreSum += r.score;
37816
38845
  if (r.score >= threshold) {
37817
38846
  passed++;
@@ -37820,9 +38849,10 @@ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
37820
38849
  return {
37821
38850
  total,
37822
38851
  passed,
37823
- failed: total - passed,
38852
+ failed: qualityResults.length - passed,
38853
+ executionErrors,
37824
38854
  durationMs,
37825
- meanScore: total > 0 ? scoreSum / total : 0
38855
+ meanScore: qualityResults.length > 0 ? scoreSum / qualityResults.length : 0
37826
38856
  };
37827
38857
  }
37828
38858
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
@@ -37903,7 +38933,12 @@ async function loadTsEvalSuite(filePath, repoRoot, options) {
37903
38933
  return {
37904
38934
  tests: materialized.tests,
37905
38935
  ...materialized.workers !== void 0 && { workers: materialized.workers },
37906
- ...materialized.cache !== void 0 && { cacheConfig: { enabled: materialized.cache } },
38936
+ ...materialized.cache !== void 0 && {
38937
+ cacheConfig: {
38938
+ enabled: materialized.cache,
38939
+ ...materialized.cachePath !== void 0 && { cachePath: materialized.cachePath }
38940
+ }
38941
+ },
37907
38942
  ...materialized.budgetUsd !== void 0 && { budgetUsd: materialized.budgetUsd },
37908
38943
  ...materialized.threshold !== void 0 && { threshold: materialized.threshold },
37909
38944
  ...materialized.metadata !== void 0 && { metadata: materialized.metadata },
@@ -37936,7 +38971,15 @@ export {
37936
38971
  isJsonValue,
37937
38972
  isTestMessage,
37938
38973
  isGraderKind,
38974
+ RUBRIC_OPERATOR_VALUES,
37939
38975
  parseYamlValue,
38976
+ getAgentvConfigDir,
38977
+ getAgentvHome,
38978
+ getAgentvDataDir,
38979
+ getWorkspacesRoot,
38980
+ getSubagentsRoot,
38981
+ getTraceStateRoot,
38982
+ getWorkspacePoolRoot,
37940
38983
  fileExists,
37941
38984
  normalizeLineEndings,
37942
38985
  readTextFile,
@@ -37956,6 +38999,9 @@ export {
37956
38999
  interpolateEnv,
37957
39000
  loadCasesFromFile,
37958
39001
  loadCasesFromDirectory,
39002
+ ResponseCache,
39003
+ shouldEnableCache,
39004
+ shouldSkipCacheForTemperature,
37959
39005
  DEFAULT_THRESHOLD,
37960
39006
  PASS_THRESHOLD,
37961
39007
  scoreToVerdict,
@@ -37966,13 +39012,6 @@ export {
37966
39012
  parseJsonSafe,
37967
39013
  deepEqual,
37968
39014
  negateScore,
37969
- getAgentvConfigDir,
37970
- getAgentvHome,
37971
- getAgentvDataDir,
37972
- getWorkspacesRoot,
37973
- getSubagentsRoot,
37974
- getTraceStateRoot,
37975
- getWorkspacePoolRoot,
37976
39015
  toSnakeCaseDeep,
37977
39016
  toCamelCaseDeep,
37978
39017
  CodeGrader,
@@ -37990,7 +39029,28 @@ export {
37990
39029
  extractImageBlocks,
37991
39030
  CompositeGrader,
37992
39031
  CostGrader,
39032
+ NORMALIZED_TRAJECTORY_SCHEMA_VERSION,
39033
+ NORMALIZED_TRACE_SOURCE_KINDS,
39034
+ NORMALIZED_TRACE_EVENT_TYPES,
39035
+ NORMALIZED_TOOL_STATUSES,
39036
+ NORMALIZED_REDACTION_LEVELS,
39037
+ NormalizedRedactionStateWireSchema,
39038
+ NormalizedTraceErrorWireSchema,
39039
+ NormalizedTraceSourceWireSchema,
39040
+ NormalizedTraceSessionWireSchema,
39041
+ NormalizedTraceBranchWireSchema,
39042
+ NormalizedTraceSourceRefWireSchema,
39043
+ NormalizedRawEvidenceWireSchema,
39044
+ NormalizedTraceMessageWireSchema,
39045
+ NormalizedTraceModelWireSchema,
39046
+ NormalizedTraceToolWireSchema,
39047
+ NormalizedTraceEventWireSchema,
39048
+ NormalizedTrajectoryWireSchema,
39049
+ toNormalizedTrajectoryWire,
39050
+ fromNormalizedTrajectoryWire,
37993
39051
  computeTraceSummary,
39052
+ getSelectedTrajectoryEvents,
39053
+ computeTraceSummaryFromTrajectory,
37994
39054
  DEFAULT_EXPLORATION_TOOLS,
37995
39055
  explorationRatio,
37996
39056
  tokensPerTool,
@@ -38071,6 +39131,7 @@ export {
38071
39131
  extractCacheConfig,
38072
39132
  extractFailOnError,
38073
39133
  extractThreshold,
39134
+ resolveResultsConfigForProject,
38074
39135
  detectFormat,
38075
39136
  parseRepoSource,
38076
39137
  parseRepoCheckout,
@@ -38089,4 +39150,4 @@ export {
38089
39150
  loadTsEvalFile,
38090
39151
  loadTsEvalSuite
38091
39152
  };
38092
- //# sourceMappingURL=chunk-TAZBCVEZ.js.map
39153
+ //# sourceMappingURL=chunk-6QEIZ33V.js.map