@agentv/core 2.10.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-7Q4PH265.js";
20
+ } from "./chunk-REN5PS7B.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -1285,18 +1285,96 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1285
1285
  });
1286
1286
  continue;
1287
1287
  }
1288
+ if (typeValue === "contains_any" || typeValue === "contains_all") {
1289
+ const value = asStringArrayStrict(rawEvaluator.value);
1290
+ if (!value || value.length === 0) {
1291
+ logWarning2(
1292
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
1293
+ );
1294
+ continue;
1295
+ }
1296
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1297
+ const required2 = parseRequired(rawEvaluator.required);
1298
+ evaluators.push({
1299
+ name,
1300
+ type: typeValue,
1301
+ value,
1302
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1303
+ ...required2 !== void 0 ? { required: required2 } : {},
1304
+ ...negate !== void 0 ? { negate } : {}
1305
+ });
1306
+ continue;
1307
+ }
1308
+ if (typeValue === "icontains") {
1309
+ const value = asString(rawEvaluator.value);
1310
+ if (!value) {
1311
+ logWarning2(`Skipping icontains evaluator '${name}' in '${evalId}': missing value`);
1312
+ continue;
1313
+ }
1314
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1315
+ const required2 = parseRequired(rawEvaluator.required);
1316
+ evaluators.push({
1317
+ name,
1318
+ type: "icontains",
1319
+ value,
1320
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1321
+ ...required2 !== void 0 ? { required: required2 } : {},
1322
+ ...negate !== void 0 ? { negate } : {}
1323
+ });
1324
+ continue;
1325
+ }
1326
+ if (typeValue === "icontains_any" || typeValue === "icontains_all") {
1327
+ const value = asStringArrayStrict(rawEvaluator.value);
1328
+ if (!value || value.length === 0) {
1329
+ logWarning2(
1330
+ `Skipping ${typeValue} evaluator '${name}' in '${evalId}': value must be a non-empty string array`
1331
+ );
1332
+ continue;
1333
+ }
1334
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1335
+ const required2 = parseRequired(rawEvaluator.required);
1336
+ evaluators.push({
1337
+ name,
1338
+ type: typeValue,
1339
+ value,
1340
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1341
+ ...required2 !== void 0 ? { required: required2 } : {},
1342
+ ...negate !== void 0 ? { negate } : {}
1343
+ });
1344
+ continue;
1345
+ }
1346
+ if (typeValue === "starts_with" || typeValue === "ends_with") {
1347
+ const value = asString(rawEvaluator.value);
1348
+ if (!value) {
1349
+ logWarning2(`Skipping ${typeValue} evaluator '${name}' in '${evalId}': missing value`);
1350
+ continue;
1351
+ }
1352
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1353
+ const required2 = parseRequired(rawEvaluator.required);
1354
+ evaluators.push({
1355
+ name,
1356
+ type: typeValue,
1357
+ value,
1358
+ ...weight2 !== void 0 ? { weight: weight2 } : {},
1359
+ ...required2 !== void 0 ? { required: required2 } : {},
1360
+ ...negate !== void 0 ? { negate } : {}
1361
+ });
1362
+ continue;
1363
+ }
1288
1364
  if (typeValue === "regex") {
1289
1365
  const value = asString(rawEvaluator.value);
1290
1366
  if (!value) {
1291
1367
  logWarning2(`Skipping regex evaluator '${name}' in '${evalId}': missing value`);
1292
1368
  continue;
1293
1369
  }
1370
+ const flags = asString(rawEvaluator.flags);
1294
1371
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
1295
1372
  const required2 = parseRequired(rawEvaluator.required);
1296
1373
  evaluators.push({
1297
1374
  name,
1298
1375
  type: "regex",
1299
1376
  value,
1377
+ ...flags !== void 0 ? { flags } : {},
1300
1378
  ...weight2 !== void 0 ? { weight: weight2 } : {},
1301
1379
  ...required2 !== void 0 ? { required: required2 } : {},
1302
1380
  ...negate !== void 0 ? { negate } : {}
@@ -1469,15 +1547,43 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1469
1547
  }
1470
1548
  return evaluators.length > 0 ? evaluators : void 0;
1471
1549
  }
1472
- var ASSERTION_TYPES = /* @__PURE__ */ new Set(["contains", "regex", "is_json", "equals", "rubrics"]);
1550
+ var ASSERTION_TYPES = /* @__PURE__ */ new Set([
1551
+ "contains",
1552
+ "contains_any",
1553
+ "contains_all",
1554
+ "icontains",
1555
+ "icontains_any",
1556
+ "icontains_all",
1557
+ "starts_with",
1558
+ "ends_with",
1559
+ "regex",
1560
+ "is_json",
1561
+ "equals",
1562
+ "rubrics"
1563
+ ]);
1473
1564
  function generateAssertionName(typeValue, rawEvaluator) {
1474
1565
  if (!ASSERTION_TYPES.has(typeValue)) {
1475
1566
  return void 0;
1476
1567
  }
1477
1568
  const value = asString(rawEvaluator.value);
1569
+ const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : void 0;
1478
1570
  switch (typeValue) {
1479
1571
  case "contains":
1480
1572
  return value ? `contains-${value}` : "contains";
1573
+ case "contains_any":
1574
+ return arrayValue ? `contains_any-${arrayValue.length}` : "contains_any";
1575
+ case "contains_all":
1576
+ return arrayValue ? `contains_all-${arrayValue.length}` : "contains_all";
1577
+ case "icontains":
1578
+ return value ? `icontains-${value}` : "icontains";
1579
+ case "icontains_any":
1580
+ return arrayValue ? `icontains_any-${arrayValue.length}` : "icontains_any";
1581
+ case "icontains_all":
1582
+ return arrayValue ? `icontains_all-${arrayValue.length}` : "icontains_all";
1583
+ case "starts_with":
1584
+ return value ? `starts_with-${value}` : "starts_with";
1585
+ case "ends_with":
1586
+ return value ? `ends_with-${value}` : "ends_with";
1481
1587
  case "regex":
1482
1588
  return value ? `regex-${value.length > 30 ? value.slice(0, 30) : value}` : "regex";
1483
1589
  case "is_json":
@@ -1503,6 +1609,13 @@ function coerceEvaluator(candidate, contextId) {
1503
1609
  function asString(value) {
1504
1610
  return typeof value === "string" ? value : void 0;
1505
1611
  }
1612
+ function asStringArrayStrict(value) {
1613
+ if (!Array.isArray(value)) {
1614
+ return void 0;
1615
+ }
1616
+ const result = value.filter((v) => typeof v === "string");
1617
+ return result.length > 0 ? result : void 0;
1618
+ }
1506
1619
  function asStringArray(value, description) {
1507
1620
  if (value === void 0) {
1508
1621
  return void 0;
@@ -2820,6 +2933,69 @@ function parseWorkspaceScriptConfig(raw, evalFileDir) {
2820
2933
  }
2821
2934
  return cwd ? { ...config, cwd } : config;
2822
2935
  }
2936
+ function parseRepoSource(raw) {
2937
+ if (!isJsonObject(raw)) return void 0;
2938
+ const obj = raw;
2939
+ if (obj.type === "git" && typeof obj.url === "string") {
2940
+ return { type: "git", url: obj.url };
2941
+ }
2942
+ if (obj.type === "local" && typeof obj.path === "string") {
2943
+ return { type: "local", path: obj.path };
2944
+ }
2945
+ return void 0;
2946
+ }
2947
+ function parseRepoCheckout(raw) {
2948
+ if (!isJsonObject(raw)) return void 0;
2949
+ const obj = raw;
2950
+ const ref = typeof obj.ref === "string" ? obj.ref : void 0;
2951
+ const resolve = obj.resolve === "remote" || obj.resolve === "local" ? obj.resolve : void 0;
2952
+ const ancestor = typeof obj.ancestor === "number" ? obj.ancestor : void 0;
2953
+ if (!ref && !resolve && ancestor === void 0) return void 0;
2954
+ return {
2955
+ ...ref !== void 0 && { ref },
2956
+ ...resolve !== void 0 && { resolve },
2957
+ ...ancestor !== void 0 && { ancestor }
2958
+ };
2959
+ }
2960
+ function parseRepoClone(raw) {
2961
+ if (!isJsonObject(raw)) return void 0;
2962
+ const obj = raw;
2963
+ const depth = typeof obj.depth === "number" ? obj.depth : void 0;
2964
+ const filter = typeof obj.filter === "string" ? obj.filter : void 0;
2965
+ const sparse = Array.isArray(obj.sparse) ? obj.sparse.filter((s) => typeof s === "string") : void 0;
2966
+ if (depth === void 0 && !filter && !sparse) return void 0;
2967
+ return {
2968
+ ...depth !== void 0 && { depth },
2969
+ ...filter !== void 0 && { filter },
2970
+ ...sparse !== void 0 && { sparse }
2971
+ };
2972
+ }
2973
+ function parseRepoConfig(raw) {
2974
+ if (!isJsonObject(raw)) return void 0;
2975
+ const obj = raw;
2976
+ const repoPath = typeof obj.path === "string" ? obj.path : void 0;
2977
+ const source = parseRepoSource(obj.source);
2978
+ if (!repoPath || !source) return void 0;
2979
+ const checkout = parseRepoCheckout(obj.checkout);
2980
+ const clone = parseRepoClone(obj.clone);
2981
+ return {
2982
+ path: repoPath,
2983
+ source,
2984
+ ...checkout !== void 0 && { checkout },
2985
+ ...clone !== void 0 && { clone }
2986
+ };
2987
+ }
2988
+ function parseResetConfig(raw) {
2989
+ if (!isJsonObject(raw)) return void 0;
2990
+ const obj = raw;
2991
+ const strategy = obj.strategy === "none" || obj.strategy === "hard" || obj.strategy === "recreate" ? obj.strategy : void 0;
2992
+ const afterEach = typeof obj.after_each === "boolean" ? obj.after_each : void 0;
2993
+ if (!strategy && afterEach === void 0) return void 0;
2994
+ return {
2995
+ ...strategy !== void 0 && { strategy },
2996
+ ...afterEach !== void 0 && { after_each: afterEach }
2997
+ };
2998
+ }
2823
2999
  function parseWorkspaceConfig(raw, evalFileDir) {
2824
3000
  if (!isJsonObject(raw)) return void 0;
2825
3001
  const obj = raw;
@@ -2827,13 +3003,20 @@ function parseWorkspaceConfig(raw, evalFileDir) {
2827
3003
  if (template && !path8.isAbsolute(template)) {
2828
3004
  template = path8.resolve(evalFileDir, template);
2829
3005
  }
3006
+ const isolation = obj.isolation === "shared" || obj.isolation === "per_test" ? obj.isolation : void 0;
3007
+ const repos = Array.isArray(obj.repos) ? obj.repos.map(parseRepoConfig).filter(Boolean) : void 0;
3008
+ const reset = parseResetConfig(obj.reset);
2830
3009
  const beforeAll = parseWorkspaceScriptConfig(obj.before_all, evalFileDir);
2831
3010
  const afterAll = parseWorkspaceScriptConfig(obj.after_all, evalFileDir);
2832
3011
  const beforeEach = parseWorkspaceScriptConfig(obj.before_each, evalFileDir);
2833
3012
  const afterEach = parseWorkspaceScriptConfig(obj.after_each, evalFileDir);
2834
- if (!template && !beforeAll && !afterAll && !beforeEach && !afterEach) return void 0;
3013
+ if (!template && !isolation && !repos && !reset && !beforeAll && !afterAll && !beforeEach && !afterEach)
3014
+ return void 0;
2835
3015
  return {
2836
3016
  ...template !== void 0 && { template },
3017
+ ...isolation !== void 0 && { isolation },
3018
+ ...repos !== void 0 && { repos },
3019
+ ...reset !== void 0 && { reset },
2837
3020
  ...beforeAll !== void 0 && { before_all: beforeAll },
2838
3021
  ...afterAll !== void 0 && { after_all: afterAll },
2839
3022
  ...beforeEach !== void 0 && { before_each: beforeEach },
@@ -2846,6 +3029,9 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
2846
3029
  if (!caseLevel) return suiteLevel;
2847
3030
  return {
2848
3031
  template: caseLevel.template ?? suiteLevel.template,
3032
+ isolation: caseLevel.isolation ?? suiteLevel.isolation,
3033
+ repos: caseLevel.repos ?? suiteLevel.repos,
3034
+ reset: caseLevel.reset ?? suiteLevel.reset,
2849
3035
  before_all: caseLevel.before_all ?? suiteLevel.before_all,
2850
3036
  after_all: caseLevel.after_all ?? suiteLevel.after_all,
2851
3037
  before_each: caseLevel.before_each ?? suiteLevel.before_each,
@@ -3385,11 +3571,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
3385
3571
  }
3386
3572
  return claudeSdkModule;
3387
3573
  }
3388
- var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
3389
- - Do NOT create any additional output files in the workspace.
3390
- - All intended file outputs/changes MUST be written in your response.
3391
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
3392
- This is required for evaluation scoring.`;
3393
3574
  var ClaudeProvider = class {
3394
3575
  id;
3395
3576
  kind = "claude";
@@ -3411,7 +3592,7 @@ var ClaudeProvider = class {
3411
3592
  const logger = await this.createStreamLogger(request).catch(() => void 0);
3412
3593
  const inputFiles = normalizeInputFiles(request.inputFiles);
3413
3594
  const prompt = buildPromptDocument(request, inputFiles);
3414
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT2);
3595
+ const systemPrompt = this.config.systemPrompt;
3415
3596
  const queryOptions = {
3416
3597
  permissionMode: "bypassPermissions",
3417
3598
  allowDangerouslySkipPermissions: true,
@@ -4392,11 +4573,6 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
4392
4573
  }
4393
4574
  return codexSdkModule;
4394
4575
  }
4395
- var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
4396
- - Do NOT create any additional output files in the workspace.
4397
- - All intended file outputs/changes MUST be written in your response.
4398
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
4399
- This is required for evaluation scoring.`;
4400
4576
  var CodexProvider = class {
4401
4577
  id;
4402
4578
  kind = "codex";
@@ -4431,7 +4607,7 @@ var CodexProvider = class {
4431
4607
  const thread = codex.startThread(threadOptions);
4432
4608
  const inputFiles = normalizeInputFiles(request.inputFiles);
4433
4609
  const basePrompt = buildPromptDocument(request, inputFiles);
4434
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT3);
4610
+ const systemPrompt = this.config.systemPrompt;
4435
4611
  const prompt = systemPrompt ? `${systemPrompt}
4436
4612
 
4437
4613
  ${basePrompt}` : basePrompt;
@@ -4797,7 +4973,7 @@ import { arch, platform } from "node:os";
4797
4973
  import path13 from "node:path";
4798
4974
  import { fileURLToPath as fileURLToPath2 } from "node:url";
4799
4975
  function resolvePlatformCliPath() {
4800
- const os4 = platform();
4976
+ const os5 = platform();
4801
4977
  const cpu = arch();
4802
4978
  const platformMap = {
4803
4979
  linux: "linux",
@@ -4808,13 +4984,13 @@ function resolvePlatformCliPath() {
4808
4984
  x64: "x64",
4809
4985
  arm64: "arm64"
4810
4986
  };
4811
- const osPart = platformMap[os4];
4987
+ const osPart = platformMap[os5];
4812
4988
  const archPart = archMap[cpu];
4813
4989
  if (!osPart || !archPart) {
4814
4990
  return void 0;
4815
4991
  }
4816
4992
  const packageName = `@github/copilot-${osPart}-${archPart}`;
4817
- const binaryName = os4 === "win32" ? "copilot.exe" : "copilot";
4993
+ const binaryName = os5 === "win32" ? "copilot.exe" : "copilot";
4818
4994
  try {
4819
4995
  const resolved = import.meta.resolve(`${packageName}/package.json`);
4820
4996
  const packageJsonPath = resolved.startsWith("file:") ? fileURLToPath2(resolved) : resolved;
@@ -4956,11 +5132,6 @@ function isLogStreamingDisabled(envKey) {
4956
5132
  }
4957
5133
 
4958
5134
  // src/evaluation/providers/copilot-cli.ts
4959
- var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
4960
- - Do NOT create any additional output files in the workspace.
4961
- - All intended file outputs/changes MUST be written in your response.
4962
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
4963
- This is required for evaluation scoring.`;
4964
5135
  var CopilotCliProvider = class {
4965
5136
  id;
4966
5137
  kind = "copilot-cli";
@@ -5163,8 +5334,8 @@ var CopilotCliProvider = class {
5163
5334
  }
5164
5335
  return args;
5165
5336
  }
5166
- resolveSystemPrompt(request) {
5167
- return this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT4);
5337
+ resolveSystemPrompt(_request) {
5338
+ return this.config.systemPrompt;
5168
5339
  }
5169
5340
  async raceWithTimeout(sendPromise, agentProcess) {
5170
5341
  const timeoutMs = this.config.timeoutMs;
@@ -5352,21 +5523,16 @@ Original error: ${error instanceof Error ? error.message : String(error)}`
5352
5523
  }
5353
5524
  return copilotSdkModule;
5354
5525
  }
5355
- var DEFAULT_SYSTEM_PROMPT5 = `**IMPORTANT**: Follow these instructions for your response:
5356
- - Do NOT create any additional output files in the workspace.
5357
- - All intended file outputs/changes MUST be written in your response.
5358
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5359
- This is required for evaluation scoring.`;
5360
5526
  var CopilotSdkProvider = class {
5361
5527
  id;
5362
- kind = "copilot";
5528
+ kind = "copilot-sdk";
5363
5529
  targetName;
5364
5530
  supportsBatch = false;
5365
5531
  config;
5366
5532
  // biome-ignore lint/suspicious/noExplicitAny: SDK client type is dynamically loaded
5367
5533
  client = null;
5368
5534
  constructor(targetName, config) {
5369
- this.id = `copilot:${targetName}`;
5535
+ this.id = `copilot-sdk:${targetName}`;
5370
5536
  this.targetName = targetName;
5371
5537
  this.config = config;
5372
5538
  }
@@ -5389,7 +5555,7 @@ var CopilotSdkProvider = class {
5389
5555
  if (cwd) {
5390
5556
  sessionOptions.workingDirectory = cwd;
5391
5557
  }
5392
- const systemPrompt = this.config.systemPrompt ?? (request.captureFileChanges ? void 0 : DEFAULT_SYSTEM_PROMPT5);
5558
+ const systemPrompt = this.config.systemPrompt;
5393
5559
  if (systemPrompt) {
5394
5560
  sessionOptions.systemMessage = {
5395
5561
  mode: "append",
@@ -5905,11 +6071,6 @@ function subscribeToPiLogEntries(listener) {
5905
6071
  // src/evaluation/providers/pi-coding-agent.ts
5906
6072
  var WORKSPACE_PREFIX = "agentv-pi-";
5907
6073
  var PROMPT_FILENAME = "prompt.md";
5908
- var DEFAULT_SYSTEM_PROMPT6 = `**IMPORTANT**: Follow these instructions for your response:
5909
- - Do NOT create any additional output files in the workspace.
5910
- - All intended file outputs/changes MUST be written in your response.
5911
- - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
5912
- This is required for evaluation scoring.`;
5913
6074
  var PiCodingAgentProvider = class {
5914
6075
  id;
5915
6076
  kind = "pi-coding-agent";
@@ -5986,7 +6147,7 @@ var PiCodingAgentProvider = class {
5986
6147
  }
5987
6148
  return path16.resolve(this.config.cwd);
5988
6149
  }
5989
- buildPiArgs(prompt, inputFiles, captureFileChanges2) {
6150
+ buildPiArgs(prompt, inputFiles, _captureFileChanges) {
5990
6151
  const args = [];
5991
6152
  if (this.config.provider) {
5992
6153
  args.push("--provider", this.config.provider);
@@ -6014,7 +6175,7 @@ var PiCodingAgentProvider = class {
6014
6175
  args.push(`@${file}`);
6015
6176
  }
6016
6177
  }
6017
- const systemPrompt = this.config.systemPrompt ?? (captureFileChanges2 ? void 0 : DEFAULT_SYSTEM_PROMPT6);
6178
+ const systemPrompt = this.config.systemPrompt;
6018
6179
  const fullPrompt = systemPrompt ? `${systemPrompt}
6019
6180
 
6020
6181
  ${prompt}` : prompt;
@@ -7708,7 +7869,6 @@ var AGENTV_REQUEST_TEMPLATE = `[[ ## task ## ]]
7708
7869
 
7709
7870
  **IMPORTANT**: Follow these exact steps:
7710
7871
  1. Create and write your complete response to: {{responseFileTmp}}
7711
- - Do NOT create any additional output files in the workspace.
7712
7872
  - All intended file outputs/changes MUST be written in your response file.
7713
7873
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
7714
7874
  2. When completely finished, run these PowerShell commands to signal completion:
@@ -7727,7 +7887,6 @@ var AGENTV_BATCH_REQUEST_TEMPLATE = `[[ ## task ## ]]
7727
7887
 
7728
7888
  **IMPORTANT**: Follow these exact steps:
7729
7889
  1. Create and write your complete response to: {{responseFileTmp}}
7730
- - Do NOT create any additional output files in the workspace.
7731
7890
  - All intended file outputs/changes MUST be written in your response file.
7732
7891
  - For each intended file, include the repo name, relative path and unified git diff following the convention \`diff --git ...\`.
7733
7892
  2. When completely finished and the response is stable, rename it to: {{responseFileFinal}}
@@ -8153,7 +8312,7 @@ async function discoverProviders(registry, baseDir) {
8153
8312
  // src/evaluation/providers/index.ts
8154
8313
  function createBuiltinProviderRegistry() {
8155
8314
  const registry = new ProviderRegistry();
8156
- registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
8315
+ registry.register("azure", (t) => new AzureProvider(t.name, t.config)).register("anthropic", (t) => new AnthropicProvider(t.name, t.config)).register("gemini", (t) => new GeminiProvider(t.name, t.config)).register("cli", (t) => new CliProvider(t.name, t.config)).register("codex", (t) => new CodexProvider(t.name, t.config)).register("copilot-sdk", (t) => new CopilotSdkProvider(t.name, t.config)).register("copilot-cli", (t) => new CopilotCliProvider(t.name, t.config)).register("pi-coding-agent", (t) => new PiCodingAgentProvider(t.name, t.config)).register("pi-agent-sdk", (t) => new PiAgentSdkProvider(t.name, t.config)).register("claude", (t) => new ClaudeProvider(t.name, t.config)).register("mock", (t) => new MockProvider(t.name, t.config)).register("vscode", (t) => new VSCodeProvider(t.name, t.config, "vscode")).register(
8157
8316
  "vscode-insiders",
8158
8317
  (t) => new VSCodeProvider(t.name, t.config, "vscode-insiders")
8159
8318
  );
@@ -8342,16 +8501,16 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
8342
8501
  });
8343
8502
  }
8344
8503
  async function execShellWithStdin(command, stdinPayload, options = {}) {
8345
- const { mkdir: mkdir13, readFile: readFile12, rm: rm5, writeFile: writeFile8 } = await import("node:fs/promises");
8504
+ const { mkdir: mkdir14, readFile: readFile12, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
8346
8505
  const { tmpdir: tmpdir3 } = await import("node:os");
8347
- const path39 = await import("node:path");
8506
+ const path40 = await import("node:path");
8348
8507
  const { randomUUID: randomUUID8 } = await import("node:crypto");
8349
- const dir = path39.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8350
- await mkdir13(dir, { recursive: true });
8351
- const stdinPath = path39.join(dir, "stdin.txt");
8352
- const stdoutPath = path39.join(dir, "stdout.txt");
8353
- const stderrPath = path39.join(dir, "stderr.txt");
8354
- await writeFile8(stdinPath, stdinPayload, "utf8");
8508
+ const dir = path40.join(tmpdir3(), `agentv-exec-${randomUUID8()}`);
8509
+ await mkdir14(dir, { recursive: true });
8510
+ const stdinPath = path40.join(dir, "stdin.txt");
8511
+ const stdoutPath = path40.join(dir, "stdout.txt");
8512
+ const stderrPath = path40.join(dir, "stderr.txt");
8513
+ await writeFile9(stdinPath, stdinPayload, "utf8");
8355
8514
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
8356
8515
  const { spawn: spawn4 } = await import("node:child_process");
8357
8516
  try {
@@ -8384,7 +8543,7 @@ async function execShellWithStdin(command, stdinPayload, options = {}) {
8384
8543
  const stderr = (await readFile12(stderrPath, "utf8")).replace(/\r\n/g, "\n");
8385
8544
  return { stdout, stderr, exitCode };
8386
8545
  } finally {
8387
- await rm5(dir, { recursive: true, force: true });
8546
+ await rm6(dir, { recursive: true, force: true });
8388
8547
  }
8389
8548
  }
8390
8549
 
@@ -8702,7 +8861,7 @@ var CodeEvaluator = class {
8702
8861
  outputPath,
8703
8862
  guidelineFiles: context.evalCase.guideline_paths,
8704
8863
  inputFiles: context.evalCase.file_paths.filter(
8705
- (path39) => !context.evalCase.guideline_paths.includes(path39)
8864
+ (path40) => !context.evalCase.guideline_paths.includes(path40)
8706
8865
  ),
8707
8866
  input: context.evalCase.input,
8708
8867
  trace: context.trace ?? null,
@@ -8950,13 +9109,15 @@ ${context.fileChanges}`;
8950
9109
  evaluatorRawRequest,
8951
9110
  tokenUsage
8952
9111
  };
8953
- } catch {
9112
+ } catch (e) {
9113
+ const message = e instanceof Error ? e.message : String(e);
8954
9114
  return {
8955
9115
  score: 0,
8956
- verdict: "fail",
9116
+ verdict: "skip",
8957
9117
  hits: [],
8958
- misses: [],
9118
+ misses: [`Judge parse failure after 3 attempts: ${message}`],
8959
9119
  expectedAspectCount: 1,
9120
+ reasoning: `Judge parse failure after 3 attempts: ${message}`,
8960
9121
  evaluatorRawRequest
8961
9122
  };
8962
9123
  }
@@ -9898,115 +10059,115 @@ var FieldAccuracyEvaluator = class {
9898
10059
  * Evaluate a single field against the expected value.
9899
10060
  */
9900
10061
  evaluateField(fieldConfig, candidateData, expectedData) {
9901
- const { path: path39, match, required = true, weight = 1 } = fieldConfig;
9902
- const candidateValue = resolvePath(candidateData, path39);
9903
- const expectedValue = resolvePath(expectedData, path39);
10062
+ const { path: path40, match, required = true, weight = 1 } = fieldConfig;
10063
+ const candidateValue = resolvePath(candidateData, path40);
10064
+ const expectedValue = resolvePath(expectedData, path40);
9904
10065
  if (expectedValue === void 0) {
9905
10066
  return {
9906
- path: path39,
10067
+ path: path40,
9907
10068
  score: 1,
9908
10069
  // No expected value means no comparison needed
9909
10070
  weight,
9910
10071
  hit: true,
9911
- message: `${path39}: no expected value`
10072
+ message: `${path40}: no expected value`
9912
10073
  };
9913
10074
  }
9914
10075
  if (candidateValue === void 0) {
9915
10076
  if (required) {
9916
10077
  return {
9917
- path: path39,
10078
+ path: path40,
9918
10079
  score: 0,
9919
10080
  weight,
9920
10081
  hit: false,
9921
- message: `${path39} (required, missing)`
10082
+ message: `${path40} (required, missing)`
9922
10083
  };
9923
10084
  }
9924
10085
  return {
9925
- path: path39,
10086
+ path: path40,
9926
10087
  score: 1,
9927
10088
  // Don't penalize missing optional fields
9928
10089
  weight: 0,
9929
10090
  // Zero weight means it won't affect the score
9930
10091
  hit: true,
9931
- message: `${path39}: optional field missing`
10092
+ message: `${path40}: optional field missing`
9932
10093
  };
9933
10094
  }
9934
10095
  switch (match) {
9935
10096
  case "exact":
9936
- return this.compareExact(path39, candidateValue, expectedValue, weight);
10097
+ return this.compareExact(path40, candidateValue, expectedValue, weight);
9937
10098
  case "numeric_tolerance":
9938
10099
  return this.compareNumericTolerance(
9939
- path39,
10100
+ path40,
9940
10101
  candidateValue,
9941
10102
  expectedValue,
9942
10103
  fieldConfig,
9943
10104
  weight
9944
10105
  );
9945
10106
  case "date":
9946
- return this.compareDate(path39, candidateValue, expectedValue, fieldConfig, weight);
10107
+ return this.compareDate(path40, candidateValue, expectedValue, fieldConfig, weight);
9947
10108
  default:
9948
10109
  return {
9949
- path: path39,
10110
+ path: path40,
9950
10111
  score: 0,
9951
10112
  weight,
9952
10113
  hit: false,
9953
- message: `${path39}: unknown match type "${match}"`
10114
+ message: `${path40}: unknown match type "${match}"`
9954
10115
  };
9955
10116
  }
9956
10117
  }
9957
10118
  /**
9958
10119
  * Exact equality comparison.
9959
10120
  */
9960
- compareExact(path39, candidateValue, expectedValue, weight) {
10121
+ compareExact(path40, candidateValue, expectedValue, weight) {
9961
10122
  if (deepEqual(candidateValue, expectedValue)) {
9962
10123
  return {
9963
- path: path39,
10124
+ path: path40,
9964
10125
  score: 1,
9965
10126
  weight,
9966
10127
  hit: true,
9967
- message: path39
10128
+ message: path40
9968
10129
  };
9969
10130
  }
9970
10131
  if (typeof candidateValue !== typeof expectedValue) {
9971
10132
  return {
9972
- path: path39,
10133
+ path: path40,
9973
10134
  score: 0,
9974
10135
  weight,
9975
10136
  hit: false,
9976
- message: `${path39} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
10137
+ message: `${path40} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
9977
10138
  };
9978
10139
  }
9979
10140
  return {
9980
- path: path39,
10141
+ path: path40,
9981
10142
  score: 0,
9982
10143
  weight,
9983
10144
  hit: false,
9984
- message: `${path39} (value mismatch)`
10145
+ message: `${path40} (value mismatch)`
9985
10146
  };
9986
10147
  }
9987
10148
  /**
9988
10149
  * Numeric comparison with absolute or relative tolerance.
9989
10150
  */
9990
- compareNumericTolerance(path39, candidateValue, expectedValue, fieldConfig, weight) {
10151
+ compareNumericTolerance(path40, candidateValue, expectedValue, fieldConfig, weight) {
9991
10152
  const { tolerance = 0, relative = false } = fieldConfig;
9992
10153
  const candidateNum = toNumber2(candidateValue);
9993
10154
  const expectedNum = toNumber2(expectedValue);
9994
10155
  if (candidateNum === null || expectedNum === null) {
9995
10156
  return {
9996
- path: path39,
10157
+ path: path40,
9997
10158
  score: 0,
9998
10159
  weight,
9999
10160
  hit: false,
10000
- message: `${path39} (non-numeric value)`
10161
+ message: `${path40} (non-numeric value)`
10001
10162
  };
10002
10163
  }
10003
10164
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
10004
10165
  return {
10005
- path: path39,
10166
+ path: path40,
10006
10167
  score: 0,
10007
10168
  weight,
10008
10169
  hit: false,
10009
- message: `${path39} (invalid numeric value)`
10170
+ message: `${path40} (invalid numeric value)`
10010
10171
  };
10011
10172
  }
10012
10173
  const diff = Math.abs(candidateNum - expectedNum);
@@ -10019,61 +10180,61 @@ var FieldAccuracyEvaluator = class {
10019
10180
  }
10020
10181
  if (withinTolerance) {
10021
10182
  return {
10022
- path: path39,
10183
+ path: path40,
10023
10184
  score: 1,
10024
10185
  weight,
10025
10186
  hit: true,
10026
- message: `${path39} (within tolerance: diff=${diff.toFixed(2)})`
10187
+ message: `${path40} (within tolerance: diff=${diff.toFixed(2)})`
10027
10188
  };
10028
10189
  }
10029
10190
  return {
10030
- path: path39,
10191
+ path: path40,
10031
10192
  score: 0,
10032
10193
  weight,
10033
10194
  hit: false,
10034
- message: `${path39} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10195
+ message: `${path40} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
10035
10196
  };
10036
10197
  }
10037
10198
  /**
10038
10199
  * Date comparison with format normalization.
10039
10200
  */
10040
- compareDate(path39, candidateValue, expectedValue, fieldConfig, weight) {
10201
+ compareDate(path40, candidateValue, expectedValue, fieldConfig, weight) {
10041
10202
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
10042
10203
  const candidateDate = parseDate(String(candidateValue), formats);
10043
10204
  const expectedDate = parseDate(String(expectedValue), formats);
10044
10205
  if (candidateDate === null) {
10045
10206
  return {
10046
- path: path39,
10207
+ path: path40,
10047
10208
  score: 0,
10048
10209
  weight,
10049
10210
  hit: false,
10050
- message: `${path39} (unparseable candidate date)`
10211
+ message: `${path40} (unparseable candidate date)`
10051
10212
  };
10052
10213
  }
10053
10214
  if (expectedDate === null) {
10054
10215
  return {
10055
- path: path39,
10216
+ path: path40,
10056
10217
  score: 0,
10057
10218
  weight,
10058
10219
  hit: false,
10059
- message: `${path39} (unparseable expected date)`
10220
+ message: `${path40} (unparseable expected date)`
10060
10221
  };
10061
10222
  }
10062
10223
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
10063
10224
  return {
10064
- path: path39,
10225
+ path: path40,
10065
10226
  score: 1,
10066
10227
  weight,
10067
10228
  hit: true,
10068
- message: path39
10229
+ message: path40
10069
10230
  };
10070
10231
  }
10071
10232
  return {
10072
- path: path39,
10233
+ path: path40,
10073
10234
  score: 0,
10074
10235
  weight,
10075
10236
  hit: false,
10076
- message: `${path39} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10237
+ message: `${path40} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
10077
10238
  };
10078
10239
  }
10079
10240
  /**
@@ -10114,11 +10275,11 @@ var FieldAccuracyEvaluator = class {
10114
10275
  };
10115
10276
  }
10116
10277
  };
10117
- function resolvePath(obj, path39) {
10118
- if (!path39 || !obj) {
10278
+ function resolvePath(obj, path40) {
10279
+ if (!path40 || !obj) {
10119
10280
  return void 0;
10120
10281
  }
10121
- const parts = path39.split(/\.|\[|\]/).filter((p) => p.length > 0);
10282
+ const parts = path40.split(/\.|\[|\]/).filter((p) => p.length > 0);
10122
10283
  let current = obj;
10123
10284
  for (const part of parts) {
10124
10285
  if (current === null || current === void 0) {
@@ -10936,8 +11097,8 @@ var TokenUsageEvaluator = class {
10936
11097
  };
10937
11098
 
10938
11099
  // src/evaluation/evaluators/tool-trajectory.ts
10939
- function getNestedValue(obj, path39) {
10940
- const parts = path39.split(".");
11100
+ function getNestedValue(obj, path40) {
11101
+ const parts = path40.split(".");
10941
11102
  let current = obj;
10942
11103
  for (const part of parts) {
10943
11104
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -11401,13 +11562,78 @@ function runContainsAssertion(output, value) {
11401
11562
  misses: passed ? [] : [`Output does not contain "${value}"`]
11402
11563
  };
11403
11564
  }
11404
- function runRegexAssertion(output, pattern) {
11405
- const regex = new RegExp(pattern);
11565
+ function runContainsAnyAssertion(output, values) {
11566
+ const matched = values.filter((v) => output.includes(v));
11567
+ const passed = matched.length > 0;
11568
+ return {
11569
+ score: passed ? 1 : 0,
11570
+ hits: passed ? [`Output contains "${matched[0]}"`] : [],
11571
+ misses: passed ? [] : [`Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`]
11572
+ };
11573
+ }
11574
+ function runContainsAllAssertion(output, values) {
11575
+ const missing = values.filter((v) => !output.includes(v));
11576
+ const passed = missing.length === 0;
11577
+ return {
11578
+ score: passed ? 1 : 0,
11579
+ hits: passed ? [`Output contains all ${values.length} expected strings`] : [],
11580
+ misses: passed ? [] : [`Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`]
11581
+ };
11582
+ }
11583
+ function runIcontainsAssertion(output, value) {
11584
+ const passed = output.toLowerCase().includes(value.toLowerCase());
11585
+ return {
11586
+ score: passed ? 1 : 0,
11587
+ hits: passed ? [`Output contains "${value}" (case-insensitive)`] : [],
11588
+ misses: passed ? [] : [`Output does not contain "${value}" (case-insensitive)`]
11589
+ };
11590
+ }
11591
+ function runIcontainsAnyAssertion(output, values) {
11592
+ const lower = output.toLowerCase();
11593
+ const matched = values.filter((v) => lower.includes(v.toLowerCase()));
11594
+ const passed = matched.length > 0;
11595
+ return {
11596
+ score: passed ? 1 : 0,
11597
+ hits: passed ? [`Output contains "${matched[0]}" (case-insensitive)`] : [],
11598
+ misses: passed ? [] : [
11599
+ `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`
11600
+ ]
11601
+ };
11602
+ }
11603
+ function runIcontainsAllAssertion(output, values) {
11604
+ const lower = output.toLowerCase();
11605
+ const missing = values.filter((v) => !lower.includes(v.toLowerCase()));
11606
+ const passed = missing.length === 0;
11607
+ return {
11608
+ score: passed ? 1 : 0,
11609
+ hits: passed ? [`Output contains all ${values.length} expected strings (case-insensitive)`] : [],
11610
+ misses: passed ? [] : [`Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`]
11611
+ };
11612
+ }
11613
+ function runStartsWithAssertion(output, value) {
11614
+ const passed = output.trim().startsWith(value.trim());
11615
+ return {
11616
+ score: passed ? 1 : 0,
11617
+ hits: passed ? [`Output starts with "${value}"`] : [],
11618
+ misses: passed ? [] : [`Output does not start with "${value}"`]
11619
+ };
11620
+ }
11621
+ function runEndsWithAssertion(output, value) {
11622
+ const passed = output.trim().endsWith(value.trim());
11623
+ return {
11624
+ score: passed ? 1 : 0,
11625
+ hits: passed ? [`Output ends with "${value}"`] : [],
11626
+ misses: passed ? [] : [`Output does not end with "${value}"`]
11627
+ };
11628
+ }
11629
+ function runRegexAssertion(output, pattern, flags) {
11630
+ const regex = new RegExp(pattern, flags);
11406
11631
  const passed = regex.test(output);
11632
+ const flagsLabel = flags ? ` (flags: ${flags})` : "";
11407
11633
  return {
11408
11634
  score: passed ? 1 : 0,
11409
- hits: passed ? [`Output matches pattern /${pattern}/`] : [],
11410
- misses: passed ? [] : [`Output does not match pattern /${pattern}/`]
11635
+ hits: passed ? [`Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}`] : [],
11636
+ misses: passed ? [] : [`Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`]
11411
11637
  };
11412
11638
  }
11413
11639
  function runIsJsonAssertion(output) {
@@ -11433,9 +11659,9 @@ function runEqualsAssertion(output, value) {
11433
11659
  }
11434
11660
 
11435
11661
  // src/evaluation/orchestrator.ts
11436
- import { createHash, randomUUID as randomUUID7 } from "node:crypto";
11437
- import { mkdir as mkdir11 } from "node:fs/promises";
11438
- import path36 from "node:path";
11662
+ import { createHash as createHash2, randomUUID as randomUUID7 } from "node:crypto";
11663
+ import { mkdir as mkdir12 } from "node:fs/promises";
11664
+ import path37 from "node:path";
11439
11665
  import micromatch4 from "micromatch";
11440
11666
 
11441
11667
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
@@ -11825,13 +12051,13 @@ var containsFactory = (config) => {
11825
12051
  var regexFactory = (config) => {
11826
12052
  const c = config;
11827
12053
  return new DeterministicAssertionEvaluator("regex", (ctx) => {
11828
- const result = runRegexAssertion(ctx.candidate, c.value);
12054
+ const result = runRegexAssertion(ctx.candidate, c.value, c.flags);
11829
12055
  return {
11830
12056
  score: result.score,
11831
12057
  verdict: result.score === 1 ? "pass" : "fail",
11832
12058
  hits: result.hits,
11833
12059
  misses: result.misses,
11834
- reasoning: result.score === 1 ? `Output matches pattern /${c.value}/` : `Output does not match pattern /${c.value}/`,
12060
+ reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
11835
12061
  expectedAspectCount: 1
11836
12062
  };
11837
12063
  });
@@ -11863,9 +12089,107 @@ var equalsFactory = (config) => {
11863
12089
  };
11864
12090
  });
11865
12091
  };
12092
+ var containsAnyFactory = (config) => {
12093
+ const c = config;
12094
+ return new DeterministicAssertionEvaluator("contains_any", (ctx) => {
12095
+ const result = runContainsAnyAssertion(ctx.candidate, c.value);
12096
+ return {
12097
+ score: result.score,
12098
+ verdict: result.score === 1 ? "pass" : "fail",
12099
+ hits: result.hits,
12100
+ misses: result.misses,
12101
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12102
+ expectedAspectCount: 1
12103
+ };
12104
+ });
12105
+ };
12106
+ var containsAllFactory = (config) => {
12107
+ const c = config;
12108
+ return new DeterministicAssertionEvaluator("contains_all", (ctx) => {
12109
+ const result = runContainsAllAssertion(ctx.candidate, c.value);
12110
+ return {
12111
+ score: result.score,
12112
+ verdict: result.score === 1 ? "pass" : "fail",
12113
+ hits: result.hits,
12114
+ misses: result.misses,
12115
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12116
+ expectedAspectCount: 1
12117
+ };
12118
+ });
12119
+ };
12120
+ var icontainsFactory = (config) => {
12121
+ const c = config;
12122
+ return new DeterministicAssertionEvaluator("icontains", (ctx) => {
12123
+ const result = runIcontainsAssertion(ctx.candidate, c.value);
12124
+ return {
12125
+ score: result.score,
12126
+ verdict: result.score === 1 ? "pass" : "fail",
12127
+ hits: result.hits,
12128
+ misses: result.misses,
12129
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12130
+ expectedAspectCount: 1
12131
+ };
12132
+ });
12133
+ };
12134
+ var icontainsAnyFactory = (config) => {
12135
+ const c = config;
12136
+ return new DeterministicAssertionEvaluator("icontains_any", (ctx) => {
12137
+ const result = runIcontainsAnyAssertion(ctx.candidate, c.value);
12138
+ return {
12139
+ score: result.score,
12140
+ verdict: result.score === 1 ? "pass" : "fail",
12141
+ hits: result.hits,
12142
+ misses: result.misses,
12143
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12144
+ expectedAspectCount: 1
12145
+ };
12146
+ });
12147
+ };
12148
+ var icontainsAllFactory = (config) => {
12149
+ const c = config;
12150
+ return new DeterministicAssertionEvaluator("icontains_all", (ctx) => {
12151
+ const result = runIcontainsAllAssertion(ctx.candidate, c.value);
12152
+ return {
12153
+ score: result.score,
12154
+ verdict: result.score === 1 ? "pass" : "fail",
12155
+ hits: result.hits,
12156
+ misses: result.misses,
12157
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12158
+ expectedAspectCount: 1
12159
+ };
12160
+ });
12161
+ };
12162
+ var startsWithFactory = (config) => {
12163
+ const c = config;
12164
+ return new DeterministicAssertionEvaluator("starts_with", (ctx) => {
12165
+ const result = runStartsWithAssertion(ctx.candidate, c.value);
12166
+ return {
12167
+ score: result.score,
12168
+ verdict: result.score === 1 ? "pass" : "fail",
12169
+ hits: result.hits,
12170
+ misses: result.misses,
12171
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12172
+ expectedAspectCount: 1
12173
+ };
12174
+ });
12175
+ };
12176
+ var endsWithFactory = (config) => {
12177
+ const c = config;
12178
+ return new DeterministicAssertionEvaluator("ends_with", (ctx) => {
12179
+ const result = runEndsWithAssertion(ctx.candidate, c.value);
12180
+ return {
12181
+ score: result.score,
12182
+ verdict: result.score === 1 ? "pass" : "fail",
12183
+ hits: result.hits,
12184
+ misses: result.misses,
12185
+ reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
12186
+ expectedAspectCount: 1
12187
+ };
12188
+ });
12189
+ };
11866
12190
  function createBuiltinRegistry() {
11867
12191
  const registry = new EvaluatorRegistry();
11868
- registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
12192
+ registry.register("llm_judge", llmJudgeFactory).register("code", codeFactory).register("composite", compositeFactory).register("tool_trajectory", toolTrajectoryFactory).register("field_accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token_usage", tokenUsageFactory).register("execution_metrics", executionMetricsFactory).register("agent_judge", agentJudgeFactory).register("contains", containsFactory).register("contains_any", containsAnyFactory).register("contains_all", containsAllFactory).register("icontains", icontainsFactory).register("icontains_any", icontainsAnyFactory).register("icontains_all", icontainsAllFactory).register("starts_with", startsWithFactory).register("ends_with", endsWithFactory).register("regex", regexFactory).register("is_json", isJsonFactory).register("equals", equalsFactory);
11869
12193
  return registry;
11870
12194
  }
11871
12195
 
@@ -12209,18 +12533,198 @@ async function cleanupEvalWorkspaces(evalRunId, workspaceRoot) {
12209
12533
  }
12210
12534
  }
12211
12535
 
12536
+ // src/evaluation/workspace/repo-manager.ts
12537
+ import { execFile } from "node:child_process";
12538
+ import { createHash } from "node:crypto";
12539
+ import { existsSync as existsSync2 } from "node:fs";
12540
+ import { mkdir as mkdir11, rm as rm5, unlink, writeFile as writeFile7 } from "node:fs/promises";
12541
+ import os4 from "node:os";
12542
+ import path35 from "node:path";
12543
+ import { promisify as promisify5 } from "node:util";
12544
+ var execFileAsync = promisify5(execFile);
12545
+ var DEFAULT_CACHE_DIR = path35.join(os4.homedir(), ".agentv", "git-cache");
12546
+ var DEFAULT_TIMEOUT_MS2 = 3e5;
12547
+ var LOCK_TIMEOUT_MS = 6e4;
12548
+ function gitEnv() {
12549
+ const env = { ...process.env };
12550
+ for (const key of Object.keys(env)) {
12551
+ if (key.startsWith("GIT_") && key !== "GIT_SSH_COMMAND") {
12552
+ delete env[key];
12553
+ }
12554
+ }
12555
+ return {
12556
+ ...env,
12557
+ GIT_TERMINAL_PROMPT: "0",
12558
+ GIT_ASKPASS: "",
12559
+ GIT_SSH_COMMAND: "ssh -o BatchMode=yes"
12560
+ };
12561
+ }
12562
+ function cacheKey(source) {
12563
+ const raw = source.type === "git" ? source.url.toLowerCase().replace(/\.git$/, "") : source.path;
12564
+ return createHash("sha256").update(raw).digest("hex");
12565
+ }
12566
+ function getSourceUrl(source) {
12567
+ return source.type === "git" ? source.url : source.path;
12568
+ }
12569
+ async function git(args, opts) {
12570
+ const { stdout } = await execFileAsync("git", args, {
12571
+ cwd: opts?.cwd,
12572
+ timeout: opts?.timeout ?? DEFAULT_TIMEOUT_MS2,
12573
+ env: gitEnv(),
12574
+ maxBuffer: 50 * 1024 * 1024
12575
+ // 50MB
12576
+ });
12577
+ return stdout.trim();
12578
+ }
12579
+ async function acquireLock(lockPath) {
12580
+ const start = Date.now();
12581
+ while (Date.now() - start < LOCK_TIMEOUT_MS) {
12582
+ try {
12583
+ await writeFile7(lockPath, String(process.pid), { flag: "wx" });
12584
+ return;
12585
+ } catch (err) {
12586
+ if (err.code === "EEXIST") {
12587
+ await new Promise((r) => setTimeout(r, 200));
12588
+ continue;
12589
+ }
12590
+ throw err;
12591
+ }
12592
+ }
12593
+ throw new Error(`Timed out waiting for lock: ${lockPath}`);
12594
+ }
12595
+ async function releaseLock(lockPath) {
12596
+ try {
12597
+ await unlink(lockPath);
12598
+ } catch {
12599
+ }
12600
+ }
12601
+ var RepoManager = class {
12602
+ cacheDir;
12603
+ constructor(cacheDir) {
12604
+ this.cacheDir = cacheDir ?? DEFAULT_CACHE_DIR;
12605
+ }
12606
+ /**
12607
+ * Ensure a bare mirror cache exists for the given source.
12608
+ * Creates on first access, fetches updates on subsequent calls.
12609
+ * Returns the absolute path to the cache directory.
12610
+ */
12611
+ async ensureCache(source) {
12612
+ const key = cacheKey(source);
12613
+ const cachePath = path35.join(this.cacheDir, key);
12614
+ const lockPath = `${cachePath}.lock`;
12615
+ await mkdir11(this.cacheDir, { recursive: true });
12616
+ await acquireLock(lockPath);
12617
+ try {
12618
+ if (existsSync2(path35.join(cachePath, "HEAD"))) {
12619
+ await git(["fetch", "--prune"], { cwd: cachePath });
12620
+ } else {
12621
+ await git(["clone", "--mirror", "--bare", getSourceUrl(source), cachePath]);
12622
+ }
12623
+ } finally {
12624
+ await releaseLock(lockPath);
12625
+ }
12626
+ return cachePath;
12627
+ }
12628
+ /**
12629
+ * Clone a repo from cache into the workspace at the configured path.
12630
+ * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
12631
+ */
12632
+ async materialize(repo, workspacePath) {
12633
+ const targetDir = path35.join(workspacePath, repo.path);
12634
+ const cachePath = await this.ensureCache(repo.source);
12635
+ const cloneArgs = ["clone"];
12636
+ if (repo.clone?.depth) {
12637
+ cloneArgs.push("--depth", String(repo.clone.depth));
12638
+ }
12639
+ if (repo.clone?.filter) {
12640
+ cloneArgs.push("--filter", repo.clone.filter);
12641
+ }
12642
+ cloneArgs.push("--no-checkout");
12643
+ const cloneUrl = repo.clone?.depth || repo.clone?.filter ? `file://${cachePath}` : cachePath;
12644
+ cloneArgs.push(cloneUrl, targetDir);
12645
+ await git(cloneArgs);
12646
+ if (repo.clone?.sparse?.length) {
12647
+ await git(["sparse-checkout", "init", "--cone"], { cwd: targetDir });
12648
+ await git(["sparse-checkout", "set", ...repo.clone.sparse], { cwd: targetDir });
12649
+ }
12650
+ const ref = repo.checkout?.ref ?? "HEAD";
12651
+ const resolve = repo.checkout?.resolve ?? "remote";
12652
+ let resolvedSha;
12653
+ if (resolve === "remote" && repo.source.type === "git") {
12654
+ const url = getSourceUrl(repo.source);
12655
+ try {
12656
+ const lsOutput = await git(["ls-remote", url, ref]);
12657
+ const match = lsOutput.split(" ")[0];
12658
+ if (!match) {
12659
+ throw new Error(`Ref '${ref}' not found on remote ${url}`);
12660
+ }
12661
+ resolvedSha = match;
12662
+ } catch (err) {
12663
+ if (err instanceof Error && err.message.includes("not found")) throw err;
12664
+ resolvedSha = ref;
12665
+ }
12666
+ } else {
12667
+ resolvedSha = ref;
12668
+ }
12669
+ await git(["checkout", resolvedSha], { cwd: targetDir });
12670
+ const ancestor = repo.checkout?.ancestor ?? 0;
12671
+ if (ancestor > 0) {
12672
+ try {
12673
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
12674
+ await git(["checkout", ancestorSha], { cwd: targetDir });
12675
+ } catch {
12676
+ if (repo.clone?.depth) {
12677
+ await git(["fetch", "--deepen", String(ancestor)], { cwd: targetDir });
12678
+ const ancestorSha = await git(["rev-parse", `HEAD~${ancestor}`], { cwd: targetDir });
12679
+ await git(["checkout", ancestorSha], { cwd: targetDir });
12680
+ } else {
12681
+ throw new Error(
12682
+ `Cannot resolve ancestor ${ancestor} of ref '${ref}'. If using shallow clone, increase clone.depth to at least ${ancestor + 1}.`
12683
+ );
12684
+ }
12685
+ }
12686
+ }
12687
+ }
12688
+ /** Materialize all repos into the workspace. */
12689
+ async materializeAll(repos, workspacePath) {
12690
+ for (const repo of repos) {
12691
+ await this.materialize(repo, workspacePath);
12692
+ }
12693
+ }
12694
+ /** Reset repos in workspace to their checkout state. */
12695
+ async reset(repos, workspacePath, strategy) {
12696
+ if (strategy === "recreate") {
12697
+ for (const repo of repos) {
12698
+ const targetDir = path35.join(workspacePath, repo.path);
12699
+ await rm5(targetDir, { recursive: true, force: true });
12700
+ }
12701
+ await this.materializeAll(repos, workspacePath);
12702
+ return;
12703
+ }
12704
+ for (const repo of repos) {
12705
+ const targetDir = path35.join(workspacePath, repo.path);
12706
+ await git(["reset", "--hard", "HEAD"], { cwd: targetDir });
12707
+ await git(["clean", "-fd"], { cwd: targetDir });
12708
+ }
12709
+ }
12710
+ /** Remove the entire cache directory. */
12711
+ async cleanCache() {
12712
+ await rm5(this.cacheDir, { recursive: true, force: true });
12713
+ }
12714
+ };
12715
+
12212
12716
  // src/evaluation/workspace/resolve.ts
12213
12717
  import { readdir as readdir4, stat as stat6 } from "node:fs/promises";
12214
- import path35 from "node:path";
12718
+ import path36 from "node:path";
12215
12719
  async function resolveWorkspaceTemplate(templatePath) {
12216
12720
  if (!templatePath) {
12217
12721
  return void 0;
12218
12722
  }
12219
- const resolved = path35.resolve(templatePath);
12723
+ const resolved = path36.resolve(templatePath);
12220
12724
  const stats = await stat6(resolved);
12221
12725
  if (stats.isFile()) {
12222
12726
  return {
12223
- dir: path35.dirname(resolved),
12727
+ dir: path36.dirname(resolved),
12224
12728
  workspaceFile: resolved
12225
12729
  };
12226
12730
  }
@@ -12232,14 +12736,14 @@ async function resolveWorkspaceTemplate(templatePath) {
12232
12736
  if (workspaceFiles.length === 1) {
12233
12737
  return {
12234
12738
  dir: resolved,
12235
- workspaceFile: path35.join(resolved, workspaceFiles[0])
12739
+ workspaceFile: path36.join(resolved, workspaceFiles[0])
12236
12740
  };
12237
12741
  }
12238
12742
  if (workspaceFiles.length > 1) {
12239
12743
  const conventionFile = workspaceFiles.find((f) => f === "template.code-workspace");
12240
12744
  return {
12241
12745
  dir: resolved,
12242
- workspaceFile: conventionFile ? path35.join(resolved, conventionFile) : void 0
12746
+ workspaceFile: conventionFile ? path36.join(resolved, conventionFile) : void 0
12243
12747
  };
12244
12748
  }
12245
12749
  return { dir: resolved };
@@ -12361,6 +12865,11 @@ async function runEvaluation(options) {
12361
12865
  }
12362
12866
  return getOrCreateProvider(resolvedJudge);
12363
12867
  };
12868
+ if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) {
12869
+ throw new Error(
12870
+ `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target \u2014 agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure_base).`
12871
+ );
12872
+ }
12364
12873
  const targetResolver = (name) => {
12365
12874
  const resolved = resolveTargetByName(name);
12366
12875
  if (!resolved) {
@@ -12374,7 +12883,7 @@ async function runEvaluation(options) {
12374
12883
  ];
12375
12884
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
12376
12885
  const typeRegistry = createBuiltinRegistry();
12377
- const discoveryBaseDir = evalFilePath ? path36.dirname(path36.resolve(evalFilePath)) : process.cwd();
12886
+ const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
12378
12887
  await discoverAssertions(typeRegistry, discoveryBaseDir);
12379
12888
  const providerRegistry = createBuiltinProviderRegistry();
12380
12889
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -12429,7 +12938,8 @@ async function runEvaluation(options) {
12429
12938
  const resolvedTemplate = await resolveWorkspaceTemplate(rawTemplate);
12430
12939
  const workspaceTemplate = resolvedTemplate?.dir;
12431
12940
  const suiteWorkspaceFile = resolvedTemplate?.workspaceFile;
12432
- const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all);
12941
+ const isPerTestIsolation = suiteWorkspace?.isolation === "per_test";
12942
+ const hasSharedWorkspace = !!(workspaceTemplate || suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation);
12433
12943
  const requestedWorkers = options.maxConcurrency ?? target.workers ?? 1;
12434
12944
  const workers = hasSharedWorkspace ? 1 : requestedWorkers;
12435
12945
  if (hasSharedWorkspace && requestedWorkers > 1) {
@@ -12448,9 +12958,22 @@ async function runEvaluation(options) {
12448
12958
  const message = error instanceof Error ? error.message : String(error);
12449
12959
  throw new Error(`Failed to create shared workspace: ${message}`);
12450
12960
  }
12451
- } else if (suiteWorkspace?.before_all) {
12961
+ } else if (suiteWorkspace?.before_all || suiteWorkspace?.repos?.length && !isPerTestIsolation) {
12452
12962
  sharedWorkspacePath = getWorkspacePath(evalRunId, "shared");
12453
- await mkdir11(sharedWorkspacePath, { recursive: true });
12963
+ await mkdir12(sharedWorkspacePath, { recursive: true });
12964
+ }
12965
+ const repoManager = suiteWorkspace?.repos?.length ? new RepoManager() : void 0;
12966
+ if (repoManager && sharedWorkspacePath && suiteWorkspace?.repos && !isPerTestIsolation) {
12967
+ try {
12968
+ await repoManager.materializeAll(suiteWorkspace.repos, sharedWorkspacePath);
12969
+ } catch (error) {
12970
+ const message = error instanceof Error ? error.message : String(error);
12971
+ if (sharedWorkspacePath) {
12972
+ await cleanupWorkspace(sharedWorkspacePath).catch(() => {
12973
+ });
12974
+ }
12975
+ throw new Error(`Failed to materialize repos: ${message}`);
12976
+ }
12454
12977
  }
12455
12978
  if (sharedWorkspacePath && suiteWorkspace?.before_all) {
12456
12979
  const scriptContext = {
@@ -12541,7 +13064,8 @@ async function runEvaluation(options) {
12541
13064
  sharedBaselineCommit,
12542
13065
  suiteWorkspaceFile,
12543
13066
  streamCallbacks,
12544
- typeRegistry
13067
+ typeRegistry,
13068
+ repoManager
12545
13069
  };
12546
13070
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
12547
13071
  if (totalBudgetUsd !== void 0) {
@@ -12816,15 +13340,16 @@ async function runEvalCase(options) {
12816
13340
  sharedWorkspacePath,
12817
13341
  sharedBaselineCommit,
12818
13342
  suiteWorkspaceFile,
12819
- typeRegistry: providedTypeRegistry
13343
+ typeRegistry: providedTypeRegistry,
13344
+ repoManager
12820
13345
  } = options;
12821
13346
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
12822
13347
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
12823
13348
  const typeRegistry = providedTypeRegistry ?? createBuiltinRegistry();
12824
- const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
13349
+ const cacheKey2 = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
12825
13350
  let cachedResponse;
12826
- if (cacheKey && cache) {
12827
- cachedResponse = await cache.get(cacheKey);
13351
+ if (cacheKey2 && cache) {
13352
+ cachedResponse = await cache.get(cacheKey2);
12828
13353
  }
12829
13354
  const nowFn = now ?? (() => /* @__PURE__ */ new Date());
12830
13355
  let workspacePath = sharedWorkspacePath;
@@ -12853,9 +13378,25 @@ async function runEvalCase(options) {
12853
13378
  );
12854
13379
  }
12855
13380
  }
12856
- if (!workspacePath && evalCase.workspace?.before_all && evalRunId) {
13381
+ if (!workspacePath && (evalCase.workspace?.before_all || evalCase.workspace?.repos?.length) && evalRunId) {
12857
13382
  workspacePath = getWorkspacePath(evalRunId, evalCase.id);
12858
- await mkdir11(workspacePath, { recursive: true });
13383
+ await mkdir12(workspacePath, { recursive: true });
13384
+ }
13385
+ if (evalCase.workspace?.repos?.length && workspacePath) {
13386
+ const perCaseRepoManager = new RepoManager();
13387
+ try {
13388
+ await perCaseRepoManager.materializeAll(evalCase.workspace.repos, workspacePath);
13389
+ } catch (error) {
13390
+ const message = error instanceof Error ? error.message : String(error);
13391
+ return buildErrorResult(
13392
+ evalCase,
13393
+ target.name,
13394
+ nowFn(),
13395
+ new Error(`Failed to materialize repos: ${message}`),
13396
+ promptInputs,
13397
+ provider
13398
+ );
13399
+ }
12859
13400
  }
12860
13401
  if (workspacePath && evalCase.workspace?.before_all) {
12861
13402
  const scriptContext = {
@@ -12979,8 +13520,8 @@ async function runEvalCase(options) {
12979
13520
  }
12980
13521
  return errorResult;
12981
13522
  }
12982
- if (cacheKey && cache && !cachedResponse) {
12983
- await cache.set(cacheKey, providerResponse);
13523
+ if (cacheKey2 && cache && !cachedResponse) {
13524
+ await cache.set(cacheKey2, providerResponse);
12984
13525
  }
12985
13526
  const output = providerResponse.output;
12986
13527
  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
@@ -13008,6 +13549,16 @@ async function runEvalCase(options) {
13008
13549
  }
13009
13550
  }
13010
13551
  const providerError = extractProviderError(providerResponse);
13552
+ if (repoManager && workspacePath && evalCase.workspace?.reset?.after_each && evalCase.workspace.reset.strategy && evalCase.workspace.reset.strategy !== "none" && evalCase.workspace.repos) {
13553
+ try {
13554
+ await repoManager.reset(
13555
+ evalCase.workspace.repos,
13556
+ workspacePath,
13557
+ evalCase.workspace.reset.strategy
13558
+ );
13559
+ } catch {
13560
+ }
13561
+ }
13011
13562
  if (workspacePath && evalCase.workspace?.after_each) {
13012
13563
  const scriptContext = {
13013
13564
  workspacePath,
@@ -13372,7 +13923,7 @@ async function runEvaluatorList(options) {
13372
13923
  fileChanges,
13373
13924
  workspacePath
13374
13925
  };
13375
- const evalFileDir = evalCase.guideline_paths[0] ? path36.dirname(evalCase.guideline_paths[0]) : process.cwd();
13926
+ const evalFileDir = evalCase.guideline_paths[0] ? path37.dirname(evalCase.guideline_paths[0]) : process.cwd();
13376
13927
  const dispatchContext = {
13377
13928
  judgeProvider,
13378
13929
  targetResolver,
@@ -13462,8 +14013,9 @@ async function runEvaluatorList(options) {
13462
14013
  const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
13463
14014
  return entry.score.score < minScore;
13464
14015
  });
13465
- const aggregateScore = hasRequiredFailure ? 0 : scored.length > 0 ? computeWeightedMean(
13466
- scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
14016
+ const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
14017
+ const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
14018
+ scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
13467
14019
  ) : 0;
13468
14020
  const hits = scored.flatMap((entry) => entry.score.hits);
13469
14021
  const misses = scored.flatMap((entry) => entry.score.misses);
@@ -13603,7 +14155,7 @@ function extractProviderError(response) {
13603
14155
  return trimmed.length > 0 ? trimmed : void 0;
13604
14156
  }
13605
14157
  function createCacheKey(provider, target, evalCase, promptInputs) {
13606
- const hash = createHash("sha256");
14158
+ const hash = createHash2("sha256");
13607
14159
  hash.update(provider.id);
13608
14160
  hash.update(target.name);
13609
14161
  hash.update(evalCase.id);
@@ -13671,8 +14223,8 @@ function computeWeightedMean(entries) {
13671
14223
  }
13672
14224
 
13673
14225
  // src/evaluation/evaluate.ts
13674
- import { existsSync as existsSync2 } from "node:fs";
13675
- import path37 from "node:path";
14226
+ import { existsSync as existsSync3 } from "node:fs";
14227
+ import path38 from "node:path";
13676
14228
  async function evaluate(config) {
13677
14229
  const startTime = Date.now();
13678
14230
  if (config.tests && config.specFile) {
@@ -13694,13 +14246,13 @@ async function evaluate(config) {
13694
14246
  let evalCases;
13695
14247
  let testFilePath;
13696
14248
  if (config.specFile) {
13697
- testFilePath = path37.resolve(config.specFile);
14249
+ testFilePath = path38.resolve(config.specFile);
13698
14250
  evalCases = await loadTests(testFilePath, repoRoot, {
13699
14251
  verbose: config.verbose,
13700
14252
  filter: config.filter
13701
14253
  });
13702
14254
  } else {
13703
- testFilePath = path37.join(process.cwd(), "__programmatic__.yaml");
14255
+ testFilePath = path38.join(process.cwd(), "__programmatic__.yaml");
13704
14256
  evalCases = (config.tests ?? []).map((test) => {
13705
14257
  const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
13706
14258
  const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
@@ -13791,11 +14343,11 @@ function computeSummary(results, durationMs) {
13791
14343
  var TARGET_FILE_CANDIDATES = [".agentv/targets.yaml", ".agentv/targets.yml"];
13792
14344
  async function discoverDefaultTarget(repoRoot) {
13793
14345
  const cwd = process.cwd();
13794
- const chain = buildDirectoryChain(path37.join(cwd, "_placeholder"), repoRoot);
14346
+ const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
13795
14347
  for (const dir of chain) {
13796
14348
  for (const candidate of TARGET_FILE_CANDIDATES) {
13797
- const targetsPath = path37.join(dir, candidate);
13798
- if (!existsSync2(targetsPath)) continue;
14349
+ const targetsPath = path38.join(dir, candidate);
14350
+ if (!existsSync3(targetsPath)) continue;
13799
14351
  try {
13800
14352
  const definitions = await readTargetDefinitions(targetsPath);
13801
14353
  const defaultTarget = definitions.find((d) => d.name === "default");
@@ -13809,11 +14361,11 @@ async function discoverDefaultTarget(repoRoot) {
13809
14361
  async function loadEnvHierarchy(repoRoot) {
13810
14362
  const { readFileSync: readFileSync2 } = await import("node:fs");
13811
14363
  const cwd = process.cwd();
13812
- const chain = buildDirectoryChain(path37.join(cwd, "_placeholder"), repoRoot);
14364
+ const chain = buildDirectoryChain(path38.join(cwd, "_placeholder"), repoRoot);
13813
14365
  const envFiles = [];
13814
14366
  for (const dir of chain) {
13815
- const envPath = path37.join(dir, ".env");
13816
- if (existsSync2(envPath)) envFiles.push(envPath);
14367
+ const envPath = path38.join(dir, ".env");
14368
+ if (existsSync3(envPath)) envFiles.push(envPath);
13817
14369
  }
13818
14370
  for (let i = envFiles.length - 1; i >= 0; i--) {
13819
14371
  try {
@@ -13883,12 +14435,12 @@ var CONFIG_FILE_NAMES = [
13883
14435
  ".agentv/config.js"
13884
14436
  ];
13885
14437
  async function loadTsConfig(projectRoot) {
13886
- const { existsSync: existsSync3 } = await import("node:fs");
14438
+ const { existsSync: existsSync4 } = await import("node:fs");
13887
14439
  const { pathToFileURL } = await import("node:url");
13888
14440
  const { join: join2 } = await import("node:path");
13889
14441
  for (const fileName of CONFIG_FILE_NAMES) {
13890
14442
  const filePath = join2(projectRoot, fileName);
13891
- if (!existsSync3(filePath)) {
14443
+ if (!existsSync4(filePath)) {
13892
14444
  continue;
13893
14445
  }
13894
14446
  try {
@@ -13985,8 +14537,8 @@ function buildPrompt(criteria, question, referenceAnswer) {
13985
14537
  }
13986
14538
 
13987
14539
  // src/evaluation/cache/response-cache.ts
13988
- import { mkdir as mkdir12, readFile as readFile11, writeFile as writeFile7 } from "node:fs/promises";
13989
- import path38 from "node:path";
14540
+ import { mkdir as mkdir13, readFile as readFile11, writeFile as writeFile8 } from "node:fs/promises";
14541
+ import path39 from "node:path";
13990
14542
  var DEFAULT_CACHE_PATH = ".agentv/cache";
13991
14543
  var ResponseCache = class {
13992
14544
  cachePath;
@@ -14004,13 +14556,13 @@ var ResponseCache = class {
14004
14556
  }
14005
14557
  async set(key, value) {
14006
14558
  const filePath = this.keyToPath(key);
14007
- const dir = path38.dirname(filePath);
14008
- await mkdir12(dir, { recursive: true });
14009
- await writeFile7(filePath, JSON.stringify(value, null, 2), "utf8");
14559
+ const dir = path39.dirname(filePath);
14560
+ await mkdir13(dir, { recursive: true });
14561
+ await writeFile8(filePath, JSON.stringify(value, null, 2), "utf8");
14010
14562
  }
14011
14563
  keyToPath(key) {
14012
14564
  const prefix = key.slice(0, 2);
14013
- return path38.join(this.cachePath, prefix, `${key}.json`);
14565
+ return path39.join(this.cachePath, prefix, `${key}.json`);
14014
14566
  }
14015
14567
  };
14016
14568
  function shouldEnableCache(params) {
@@ -14483,6 +15035,7 @@ export {
14483
15035
  OtelTraceExporter,
14484
15036
  OtlpJsonFileExporter,
14485
15037
  ProviderRegistry,
15038
+ RepoManager,
14486
15039
  ResponseCache,
14487
15040
  SimpleTraceFileExporter,
14488
15041
  TEST_MESSAGE_ROLES,
@@ -14568,12 +15121,19 @@ export {
14568
15121
  resolveTargetDefinition,
14569
15122
  resolveWorkspaceTemplate,
14570
15123
  rubricEvaluationSchema,
15124
+ runContainsAllAssertion,
15125
+ runContainsAnyAssertion,
14571
15126
  runContainsAssertion,
15127
+ runEndsWithAssertion,
14572
15128
  runEqualsAssertion,
14573
15129
  runEvalCase,
14574
15130
  runEvaluation,
15131
+ runIcontainsAllAssertion,
15132
+ runIcontainsAnyAssertion,
15133
+ runIcontainsAssertion,
14575
15134
  runIsJsonAssertion,
14576
15135
  runRegexAssertion,
15136
+ runStartsWithAssertion,
14577
15137
  scoreToVerdict,
14578
15138
  shouldEnableCache,
14579
15139
  shouldSkipCacheForTemperature,